From f8ef0a5cf1b152cb5cdbedbc9aed55ad317b1360 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 24 Apr 2026 22:07:13 -0400 Subject: [PATCH] fix(deploy): redirect DOCKER_CONFIG out of $HOME so ProtectHome doesn't kill builds MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The api unit's ProtectHome=read-only made the user's HOME read-only inside the unit's namespace. docker compose --build then tried to write ~/.docker/buildx/activity/* and got EROFS — which we'd been misdiagnosing as a buildx wedge for the last few iterations. Real fix: set DOCKER_CONFIG and BUILDX_CONFIG in the unit's Environment= to a path inside ReadWritePaths. Hardening stays on, docker CLI writes to install_dir/.docker instead of /home/<user>/.docker. The wedge classifier now detects this case (count==0 + /home/ in the stderr path) and emits a recipe pointing at the env-var fix instead of the driver-rebuild path. Test added. Wiki gets the new branch first since it's the most common cause on systemd-managed installs. --- decnet/engine/deployer.py | 34 +++++++++++++++++++++++++++++++++- deploy/decnet-api.service.j2 | 7 +++++++ tests/fleet/test_deployer.py | 24 +++++++++++++++++++++++- 3 files changed, 63 insertions(+), 2 deletions(-) diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index bebe5d3d..0ac7fa32 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -177,7 +177,13 @@ def _format_subprocess_error(exc: BaseException) -> str: def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> str: """Compose a recovery recipe tailored to which side of the wedge fired. - Two failure modes share the 'read-only file system' symptom: + Three failure modes share the 'read-only file system' symptom: + + * **Sandboxed home** (path under ``/home/.../.docker``): the + service unit has ``ProtectHome=read-only`` and docker CLI is + trying to write its activity file in the user's HOME. 
Fix is + to redirect ``DOCKER_CONFIG`` / ``BUILDX_CONFIG`` to a path + inside ``ReadWritePaths``. * **Leaked mounts** (count > 0): buildkit accumulated bind mounts in /var/lib/docker/tmp from a prior failed build. Fix is to drop @@ -193,6 +199,32 @@ def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> s "Buildx is wedged — Docker's build driver can no longer write " "its activity file (spurious 'read-only file system' error)." ) + + # If the offending path is under /home/, leaked mounts are a red + # herring — the unit's namespace is what's blocking the write. + is_protecthome_case = ( + leaked_mounts == 0 + and "/home/" in original_stderr + and ".docker/buildx" in original_stderr + ) + if is_protecthome_case: + fix = ( + "Path is under /home but no mounts are leaked — the API " + "unit is running with ProtectHome=read-only and docker CLI " + "can't write its activity file inside the user's HOME.\n" + "Recovery (in the systemd unit):\n" + " Environment=DOCKER_CONFIG=/.docker\n" + " Environment=BUILDX_CONFIG=/.docker/buildx\n" + "Then: sudo systemctl daemon-reload && sudo systemctl restart decnet-api\n" + "(Already wired into deploy/decnet-api.service.j2 — re-run\n" + "`decnet init` to refresh the installed unit, then restart.)" + ) + tail = "See wiki: Troubleshooting → 'Buildx leaked mounts'." 
+ parts = [head, fix, tail] + if original_stderr: + parts.append(f"Original error:\n{original_stderr.strip()}") + return "\n\n".join(parts) + if leaked_mounts > 0: fix = ( f"Detected {leaked_mounts} leaked buildkit bind-mounts — " diff --git a/deploy/decnet-api.service.j2 b/deploy/decnet-api.service.j2 index 4a717c8d..51aa4b18 100644 --- a/deploy/decnet-api.service.j2 +++ b/deploy/decnet-api.service.j2 @@ -14,6 +14,13 @@ SupplementaryGroups=docker WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.api.log +# ProtectHome=read-only (below) makes the user's $HOME read-only inside +# the unit's namespace, which breaks `docker compose build` because the +# CLI writes ~/.docker/buildx/activity/. Redirect the docker CLI's +# config root into install_dir (already in ReadWritePaths) so the +# hardening stays on without crippling the build path. +Environment=DOCKER_CONFIG={{ install_dir }}/.docker +Environment=BUILDX_CONFIG={{ install_dir }}/.docker/buildx ExecStart={{ venv_dir }}/bin/decnet api StandardOutput=append:/var/log/decnet/decnet.api.log StandardError=append:/var/log/decnet/decnet.api.log diff --git a/tests/fleet/test_deployer.py b/tests/fleet/test_deployer.py index c70cbc2e..8caf98dc 100644 --- a/tests/fleet/test_deployer.py +++ b/tests/fleet/test_deployer.py @@ -163,6 +163,26 @@ class TestComposeWithRetry: # can see what compose actually said. 
assert "Original error" in ei.value.stderr + @patch("decnet.engine.deployer.subprocess.run") + def test_buildx_wedge_protecthome_branch(self, mock_run, monkeypatch): + """When stderr names a path under /home and no mounts are + leaked, the cause is systemd's ProtectHome — recipe should + point at DOCKER_CONFIG redirection, not driver rebuild.""" + from decnet.engine import deployer + monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0) + mock_run.return_value = MagicMock( + returncode=1, stdout="", + stderr=("failed to update builder last activity time: open " + "/home/anti/.docker/buildx/activity/.tmp-x: read-only file system"), + ) + with pytest.raises(subprocess.CalledProcessError) as ei: + deployer._compose_with_retry("up", "--build") + assert "ProtectHome=read-only" in ei.value.stderr + assert "DOCKER_CONFIG" in ei.value.stderr + assert "BUILDX_CONFIG" in ei.value.stderr + # Driver-rebuild recipe must NOT be the suggested fix here. + assert "buildx create --name decnet-builder" not in ei.value.stderr + @patch("decnet.engine.deployer.subprocess.run") def test_buildx_wedge_zero_mounts_uses_driver_rebuild_recipe(self, mock_run, monkeypatch): """Wedge signature with 0 leaked mounts means the buildx driver @@ -172,7 +192,9 @@ class TestComposeWithRetry: monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0) mock_run.return_value = MagicMock( returncode=1, stdout="", - stderr="failed to update builder last activity time: read-only file system", + # No /home/ path — driver-rebuild branch, not ProtectHome. + stderr="failed to update builder last activity time: open " + "/var/lib/decnet/.docker/buildx/activity/.tmp-x: read-only file system", ) with pytest.raises(subprocess.CalledProcessError) as ei: deployer._compose_with_retry("up", "--build")