fix(deploy): redirect DOCKER_CONFIG out of $HOME so ProtectHome doesn't kill builds
The api unit's ProtectHome=read-only made the user's HOME read-only inside the unit's namespace. `docker compose up --build` then tried to write ~/.docker/buildx/activity/* and got EROFS — which we'd been misdiagnosing as a buildx wedge for the last few iterations. Real fix: set DOCKER_CONFIG and BUILDX_CONFIG in the unit's Environment= to a path inside ReadWritePaths. Hardening stays on, and the docker CLI writes to install_dir/.docker instead of /home/<user>/.docker. The wedge classifier now detects this case (zero leaked mounts plus a /home/…/.docker/buildx path in the stderr) and emits a recipe pointing at the env-var fix instead of the driver-rebuild path. Test added. The wiki now lists the new branch first since it's the most common cause on systemd-managed installs.
This commit is contained in:
@@ -177,7 +177,13 @@ def _format_subprocess_error(exc: BaseException) -> str:
|
||||
def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> str:
|
||||
"""Compose a recovery recipe tailored to which side of the wedge fired.
|
||||
|
||||
Two failure modes share the 'read-only file system' symptom:
|
||||
Three failure modes share the 'read-only file system' symptom:
|
||||
|
||||
* **Sandboxed home** (path under ``/home/.../.docker``): the
|
||||
service unit has ``ProtectHome=read-only`` and docker CLI is
|
||||
trying to write its activity file in the user's HOME. Fix is
|
||||
to redirect ``DOCKER_CONFIG`` / ``BUILDX_CONFIG`` to a path
|
||||
inside ``ReadWritePaths``.
|
||||
|
||||
* **Leaked mounts** (count > 0): buildkit accumulated bind mounts
|
||||
in /var/lib/docker/tmp from a prior failed build. Fix is to drop
|
||||
@@ -193,6 +199,32 @@ def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> s
|
||||
"Buildx is wedged — Docker's build driver can no longer write "
|
||||
"its activity file (spurious 'read-only file system' error)."
|
||||
)
|
||||
|
||||
# If the offending path is under /home/, leaked mounts are a red
|
||||
# herring — the unit's namespace is what's blocking the write.
|
||||
is_protecthome_case = (
|
||||
leaked_mounts == 0
|
||||
and "/home/" in original_stderr
|
||||
and ".docker/buildx" in original_stderr
|
||||
)
|
||||
if is_protecthome_case:
|
||||
fix = (
|
||||
"Path is under /home but no mounts are leaked — the API "
|
||||
"unit is running with ProtectHome=read-only and docker CLI "
|
||||
"can't write its activity file inside the user's HOME.\n"
|
||||
"Recovery (in the systemd unit):\n"
|
||||
" Environment=DOCKER_CONFIG=<install_dir>/.docker\n"
|
||||
" Environment=BUILDX_CONFIG=<install_dir>/.docker/buildx\n"
|
||||
"Then: sudo systemctl daemon-reload && sudo systemctl restart decnet-api\n"
|
||||
"(Already wired into deploy/decnet-api.service.j2 — re-run\n"
|
||||
"`decnet init` to refresh the installed unit, then restart.)"
|
||||
)
|
||||
tail = "See wiki: Troubleshooting → 'Buildx leaked mounts'."
|
||||
parts = [head, fix, tail]
|
||||
if original_stderr:
|
||||
parts.append(f"Original error:\n{original_stderr.strip()}")
|
||||
return "\n\n".join(parts)
|
||||
|
||||
if leaked_mounts > 0:
|
||||
fix = (
|
||||
f"Detected {leaked_mounts} leaked buildkit bind-mounts — "
|
||||
|
||||
@@ -14,6 +14,13 @@ SupplementaryGroups=docker
|
||||
WorkingDirectory={{ install_dir }}
|
||||
EnvironmentFile=-{{ install_dir }}/.env.local
|
||||
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.api.log
|
||||
# ProtectHome=read-only (below) makes the user's $HOME read-only inside
|
||||
# the unit's namespace, which breaks `docker compose build` because the
|
||||
# CLI writes ~/.docker/buildx/activity/. Redirect the docker CLI's
|
||||
# config root into install_dir (already in ReadWritePaths) so the
|
||||
# hardening stays on without crippling the build path.
|
||||
Environment=DOCKER_CONFIG={{ install_dir }}/.docker
|
||||
Environment=BUILDX_CONFIG={{ install_dir }}/.docker/buildx
|
||||
ExecStart={{ venv_dir }}/bin/decnet api
|
||||
StandardOutput=append:/var/log/decnet/decnet.api.log
|
||||
StandardError=append:/var/log/decnet/decnet.api.log
|
||||
|
||||
@@ -163,6 +163,26 @@ class TestComposeWithRetry:
|
||||
# can see what compose actually said.
|
||||
assert "Original error" in ei.value.stderr
|
||||
|
||||
@patch("decnet.engine.deployer.subprocess.run")
|
||||
def test_buildx_wedge_protecthome_branch(self, mock_run, monkeypatch):
|
||||
"""When stderr names a path under /home and no mounts are
|
||||
leaked, the cause is systemd's ProtectHome — recipe should
|
||||
point at DOCKER_CONFIG redirection, not driver rebuild."""
|
||||
from decnet.engine import deployer
|
||||
monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
|
||||
mock_run.return_value = MagicMock(
|
||||
returncode=1, stdout="",
|
||||
stderr=("failed to update builder last activity time: open "
|
||||
"/home/anti/.docker/buildx/activity/.tmp-x: read-only file system"),
|
||||
)
|
||||
with pytest.raises(subprocess.CalledProcessError) as ei:
|
||||
deployer._compose_with_retry("up", "--build")
|
||||
assert "ProtectHome=read-only" in ei.value.stderr
|
||||
assert "DOCKER_CONFIG" in ei.value.stderr
|
||||
assert "BUILDX_CONFIG" in ei.value.stderr
|
||||
# Driver-rebuild recipe must NOT be the suggested fix here.
|
||||
assert "buildx create --name decnet-builder" not in ei.value.stderr
|
||||
|
||||
@patch("decnet.engine.deployer.subprocess.run")
|
||||
def test_buildx_wedge_zero_mounts_uses_driver_rebuild_recipe(self, mock_run, monkeypatch):
|
||||
"""Wedge signature with 0 leaked mounts means the buildx driver
|
||||
@@ -172,7 +192,9 @@ class TestComposeWithRetry:
|
||||
monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
|
||||
mock_run.return_value = MagicMock(
|
||||
returncode=1, stdout="",
|
||||
stderr="failed to update builder last activity time: read-only file system",
|
||||
# No /home/ path — driver-rebuild branch, not ProtectHome.
|
||||
stderr="failed to update builder last activity time: open "
|
||||
"/var/lib/decnet/.docker/buildx/activity/.tmp-x: read-only file system",
|
||||
)
|
||||
with pytest.raises(subprocess.CalledProcessError) as ei:
|
||||
deployer._compose_with_retry("up", "--build")
|
||||
|
||||
Reference in New Issue
Block a user