fix(deploy): redirect DOCKER_CONFIG out of $HOME so ProtectHome doesn't kill builds
The api unit's ProtectHome=read-only made the user's HOME read-only inside the unit's mount namespace. `docker compose up --build` then tried to write ~/.docker/buildx/activity/* and got EROFS — which we'd been misdiagnosing as a buildx wedge for the last few iterations. Real fix: set DOCKER_CONFIG and BUILDX_CONFIG in the unit's Environment= to a path inside ReadWritePaths. Hardening stays on; the docker CLI now writes to install_dir/.docker instead of /home/<user>/.docker. The wedge classifier now detects this case (leaked-mount count == 0, plus both /home/ and .docker/buildx in the stderr) and emits a recipe pointing at the env-var fix instead of the driver-rebuild path. Test added. Wiki gets the new branch first since it's the most common cause on systemd-managed installs.
This commit is contained in:
@@ -177,7 +177,13 @@ def _format_subprocess_error(exc: BaseException) -> str:
|
|||||||
def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> str:
|
def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> str:
|
||||||
"""Compose a recovery recipe tailored to which side of the wedge fired.
|
"""Compose a recovery recipe tailored to which side of the wedge fired.
|
||||||
|
|
||||||
Two failure modes share the 'read-only file system' symptom:
|
Three failure modes share the 'read-only file system' symptom:
|
||||||
|
|
||||||
|
* **Sandboxed home** (path under ``/home/.../.docker``): the
|
||||||
|
service unit has ``ProtectHome=read-only`` and docker CLI is
|
||||||
|
trying to write its activity file in the user's HOME. Fix is
|
||||||
|
to redirect ``DOCKER_CONFIG`` / ``BUILDX_CONFIG`` to a path
|
||||||
|
inside ``ReadWritePaths``.
|
||||||
|
|
||||||
* **Leaked mounts** (count > 0): buildkit accumulated bind mounts
|
* **Leaked mounts** (count > 0): buildkit accumulated bind mounts
|
||||||
in /var/lib/docker/tmp from a prior failed build. Fix is to drop
|
in /var/lib/docker/tmp from a prior failed build. Fix is to drop
|
||||||
@@ -193,6 +199,32 @@ def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> s
|
|||||||
"Buildx is wedged — Docker's build driver can no longer write "
|
"Buildx is wedged — Docker's build driver can no longer write "
|
||||||
"its activity file (spurious 'read-only file system' error)."
|
"its activity file (spurious 'read-only file system' error)."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# If the offending path is under /home/, leaked mounts are a red
|
||||||
|
# herring — the unit's namespace is what's blocking the write.
|
||||||
|
is_protecthome_case = (
|
||||||
|
leaked_mounts == 0
|
||||||
|
and "/home/" in original_stderr
|
||||||
|
and ".docker/buildx" in original_stderr
|
||||||
|
)
|
||||||
|
if is_protecthome_case:
|
||||||
|
fix = (
|
||||||
|
"Path is under /home but no mounts are leaked — the API "
|
||||||
|
"unit is running with ProtectHome=read-only and docker CLI "
|
||||||
|
"can't write its activity file inside the user's HOME.\n"
|
||||||
|
"Recovery (in the systemd unit):\n"
|
||||||
|
" Environment=DOCKER_CONFIG=<install_dir>/.docker\n"
|
||||||
|
" Environment=BUILDX_CONFIG=<install_dir>/.docker/buildx\n"
|
||||||
|
"Then: sudo systemctl daemon-reload && sudo systemctl restart decnet-api\n"
|
||||||
|
"(Already wired into deploy/decnet-api.service.j2 — re-run\n"
|
||||||
|
"`decnet init` to refresh the installed unit, then restart.)"
|
||||||
|
)
|
||||||
|
tail = "See wiki: Troubleshooting → 'Buildx leaked mounts'."
|
||||||
|
parts = [head, fix, tail]
|
||||||
|
if original_stderr:
|
||||||
|
parts.append(f"Original error:\n{original_stderr.strip()}")
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
if leaked_mounts > 0:
|
if leaked_mounts > 0:
|
||||||
fix = (
|
fix = (
|
||||||
f"Detected {leaked_mounts} leaked buildkit bind-mounts — "
|
f"Detected {leaked_mounts} leaked buildkit bind-mounts — "
|
||||||
|
|||||||
@@ -14,6 +14,13 @@ SupplementaryGroups=docker
|
|||||||
WorkingDirectory={{ install_dir }}
|
WorkingDirectory={{ install_dir }}
|
||||||
EnvironmentFile=-{{ install_dir }}/.env.local
|
EnvironmentFile=-{{ install_dir }}/.env.local
|
||||||
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.api.log
|
Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.api.log
|
||||||
|
# ProtectHome=read-only (below) makes the user's $HOME read-only inside
|
||||||
|
# the unit's namespace, which breaks `docker compose build` because the
|
||||||
|
# CLI writes ~/.docker/buildx/activity/. Redirect the docker CLI's
|
||||||
|
# config root into install_dir (already in ReadWritePaths) so the
|
||||||
|
# hardening stays on without crippling the build path.
|
||||||
|
Environment=DOCKER_CONFIG={{ install_dir }}/.docker
|
||||||
|
Environment=BUILDX_CONFIG={{ install_dir }}/.docker/buildx
|
||||||
ExecStart={{ venv_dir }}/bin/decnet api
|
ExecStart={{ venv_dir }}/bin/decnet api
|
||||||
StandardOutput=append:/var/log/decnet/decnet.api.log
|
StandardOutput=append:/var/log/decnet/decnet.api.log
|
||||||
StandardError=append:/var/log/decnet/decnet.api.log
|
StandardError=append:/var/log/decnet/decnet.api.log
|
||||||
|
|||||||
@@ -163,6 +163,26 @@ class TestComposeWithRetry:
|
|||||||
# can see what compose actually said.
|
# can see what compose actually said.
|
||||||
assert "Original error" in ei.value.stderr
|
assert "Original error" in ei.value.stderr
|
||||||
|
|
||||||
|
@patch("decnet.engine.deployer.subprocess.run")
|
||||||
|
def test_buildx_wedge_protecthome_branch(self, mock_run, monkeypatch):
|
||||||
|
"""When stderr names a path under /home and no mounts are
|
||||||
|
leaked, the cause is systemd's ProtectHome — recipe should
|
||||||
|
point at DOCKER_CONFIG redirection, not driver rebuild."""
|
||||||
|
from decnet.engine import deployer
|
||||||
|
monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
|
||||||
|
mock_run.return_value = MagicMock(
|
||||||
|
returncode=1, stdout="",
|
||||||
|
stderr=("failed to update builder last activity time: open "
|
||||||
|
"/home/anti/.docker/buildx/activity/.tmp-x: read-only file system"),
|
||||||
|
)
|
||||||
|
with pytest.raises(subprocess.CalledProcessError) as ei:
|
||||||
|
deployer._compose_with_retry("up", "--build")
|
||||||
|
assert "ProtectHome=read-only" in ei.value.stderr
|
||||||
|
assert "DOCKER_CONFIG" in ei.value.stderr
|
||||||
|
assert "BUILDX_CONFIG" in ei.value.stderr
|
||||||
|
# Driver-rebuild recipe must NOT be the suggested fix here.
|
||||||
|
assert "buildx create --name decnet-builder" not in ei.value.stderr
|
||||||
|
|
||||||
@patch("decnet.engine.deployer.subprocess.run")
|
@patch("decnet.engine.deployer.subprocess.run")
|
||||||
def test_buildx_wedge_zero_mounts_uses_driver_rebuild_recipe(self, mock_run, monkeypatch):
|
def test_buildx_wedge_zero_mounts_uses_driver_rebuild_recipe(self, mock_run, monkeypatch):
|
||||||
"""Wedge signature with 0 leaked mounts means the buildx driver
|
"""Wedge signature with 0 leaked mounts means the buildx driver
|
||||||
@@ -172,7 +192,9 @@ class TestComposeWithRetry:
|
|||||||
monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
|
monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
|
||||||
mock_run.return_value = MagicMock(
|
mock_run.return_value = MagicMock(
|
||||||
returncode=1, stdout="",
|
returncode=1, stdout="",
|
||||||
stderr="failed to update builder last activity time: read-only file system",
|
# No /home/ path — driver-rebuild branch, not ProtectHome.
|
||||||
|
stderr="failed to update builder last activity time: open "
|
||||||
|
"/var/lib/decnet/.docker/buildx/activity/.tmp-x: read-only file system",
|
||||||
)
|
)
|
||||||
with pytest.raises(subprocess.CalledProcessError) as ei:
|
with pytest.raises(subprocess.CalledProcessError) as ei:
|
||||||
deployer._compose_with_retry("up", "--build")
|
deployer._compose_with_retry("up", "--build")
|
||||||
|
|||||||
Reference in New Issue
Block a user