fix(engine): detect wedged buildx + surface recovery hint on deploy

When Docker's buildx leaks bind-mounts from a failed build it starts
reporting 'read-only file system' on its own activity file, even
though nothing is actually read-only. The user's host had 20+
leaked mounts before we noticed — each retry compounds the leak.

_compose_with_retry now:
 * Pre-flight counts /var/lib/docker/tmp/buildkit-mount* entries in
   /proc/self/mounts; if >= 10 and the command is a build, refuses
   to start and returns a clean recovery recipe instead of retrying.
 * On mid-build failures that match the wedge signature
   ('failed to update builder last activity time' or the activity-dir
   path in stderr), short-circuits the retry loop with the same
   recipe. This covers the first occurrence, which the pre-flight
   cannot anticipate; the pre-flight then catches repeat attempts.

Recipe points at 'docker buildx prune -af && sudo systemctl restart
docker', which is what actually clears the leaked mounts.

Tests cover all three paths: wedge preflight blocks builds, non-build
commands (down/stop) ignore the preflight, mid-build signature
detection kills the retry loop. A new autouse fixture stubs the
wedge-detector to 0 so dev-host state doesn't poison the mocked
subprocess tests.

Wiki companion commit adds Troubleshooting → 'Buildx leaked mounts'.
This commit is contained in:
2026-04-24 19:25:45 -04:00
parent a8356407c5
commit 86b9decf80
2 changed files with 121 additions and 0 deletions

View File

@@ -59,6 +59,17 @@ class TestCompose:
# ── _compose_with_retry ───────────────────────────────────────────────────────
@pytest.fixture(autouse=True)
def _no_leaked_buildkit_mounts(monkeypatch):
    """Pin the leaked-buildkit-mount count to zero.

    Applied automatically (autouse) so that mounts present on the machine
    running the suite cannot influence the mocked-subprocess tests. Tests
    that target the preflight itself override the same attribute.
    """

    def _zero_mounts():
        return 0

    monkeypatch.setattr(
        "decnet.engine.deployer._count_leaked_buildkit_mounts",
        _zero_mounts,
    )
class TestComposeWithRetry:
@patch("decnet.engine.deployer.subprocess.run")
def test_success_first_try(self, mock_run):
@@ -106,6 +117,46 @@ class TestComposeWithRetry:
captured = capsys.readouterr()
assert "done" in captured.out
@patch("decnet.engine.deployer.subprocess.run")
def test_buildx_preflight_blocks_when_wedged(self, mock_run, monkeypatch):
    """A build command is rejected up front when the leaked-mount count
    is high: no subprocess is launched and the raised error's stderr
    carries the wedge notice and the prune hint."""
    from decnet.engine import deployer

    def _many_leaked_mounts():
        return 42

    monkeypatch.setattr(
        deployer, "_count_leaked_buildkit_mounts", _many_leaked_mounts
    )

    with pytest.raises(subprocess.CalledProcessError) as excinfo:
        deployer._compose_with_retry("up", "--build", "-d")

    # The preflight must fail before any compose process is spawned.
    mock_run.assert_not_called()

    stderr_text = excinfo.value.stderr
    assert "Buildx is wedged" in stderr_text
    assert "docker buildx prune" in stderr_text
@patch("decnet.engine.deployer.subprocess.run")
def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch):
    """Commands that do not build (here: ``down``) must still run even
    when the leaked-mount count is far above the blocking level."""
    from decnet.engine import deployer

    def _huge_leak_count():
        return 999

    monkeypatch.setattr(
        deployer, "_count_leaked_buildkit_mounts", _huge_leak_count
    )

    successful_run = MagicMock(returncode=0, stdout="", stderr="")
    mock_run.return_value = successful_run

    # Must not raise: the preflight only applies to build commands.
    deployer._compose_with_retry("down")

    mock_run.assert_called_once()
@patch("decnet.engine.deployer.time.sleep")
@patch("decnet.engine.deployer.subprocess.run")
def test_buildx_wedge_mid_build_short_circuits_retries(self, mock_run, mock_sleep):
    """A build failure whose stderr carries the wedge signature must end
    the retry loop at once and report the wedge in the raised error."""
    from decnet.engine import deployer

    wedge_stderr = (
        "failed to update builder last activity time: "
        "open /home/x/.docker/buildx/activity/.tmp-default: read-only file system"
    )
    failed_run = MagicMock(returncode=1, stdout="", stderr=wedge_stderr)
    mock_run.return_value = failed_run

    with pytest.raises(subprocess.CalledProcessError) as excinfo:
        deployer._compose_with_retry("up", "--build", retries=5)

    # Exactly one attempt and no back-off sleep, despite retries=5.
    mock_run.assert_called_once()
    mock_sleep.assert_not_called()
    assert "Buildx is wedged" in excinfo.value.stderr
# ── _sync_logging_helper ─────────────────────────────────────────────────────