From 86b9decf8010234b5c1c647f570f379b7f04b78c Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 24 Apr 2026 19:25:45 -0400 Subject: [PATCH] fix(engine): detect wedged buildx + surface recovery hint on deploy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Docker's buildx leaks bind-mounts from a failed build it starts reporting 'read-only file system' on its own activity file, even though nothing is actually read-only. The user's host had 20+ leaked mounts before we noticed — each retry compounds the leak. _compose_with_retry now: * Pre-flight counts /var/lib/docker/tmp/buildkit-mount* entries in /proc/self/mounts; if >= 10 and the command is a build, refuses to start and raises CalledProcessError carrying a clean recovery recipe in stderr instead of attempting the build. * On mid-build failures that match the wedge signature ('failed to update builder last activity time' or the activity-dir path in stderr), short-circuits the retry loop with the same recipe. This covers the first occurrence without relying on the pre-flight; the pre-flight catches repeat attempts. Recipe points at 'docker buildx prune -af && sudo systemctl restart docker', which is what actually clears the leaked mounts. Tests cover all three paths: wedge preflight blocks builds, non-build commands (down/stop) ignore the preflight, mid-build signature detection kills the retry loop. A new autouse fixture stubs the wedge-detector to 0 so dev-host state doesn't poison the mocked subprocess tests. Wiki companion commit adds Troubleshooting → 'Buildx leaked mounts'. 
--- decnet/engine/deployer.py | 70 ++++++++++++++++++++++++++++++++++++ tests/fleet/test_deployer.py | 51 ++++++++++++++++++++++++++ 2 files changed, 121 insertions(+) diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index 6fec4119..c88b30a2 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -132,6 +132,52 @@ _PERMANENT_ERRORS = ( "repository does not exist", ) +# Signatures of a wedged buildx: the daemon has leaked bind mounts and +# can no longer update its activity file, which surfaces as "read-only +# file system" even though nothing is actually read-only. Retrying the +# compose build just accumulates more leaked mounts — we bail early +# with a clear recovery recipe. +_BUILDX_WEDGE_PATTERNS = ( + "failed to update builder last activity time", + ".docker/buildx/activity/", +) + +# Count above which we consider buildx's bind-mount table pathological. +# A healthy daemon has 0; a couple is transient during a build. Past +# 10 you're seeing accumulation from a previous failed run. +_BUILDKIT_MOUNT_THRESHOLD = 10 + + +def _count_leaked_buildkit_mounts() -> int: + """How many orphaned buildkit bind-mounts is the daemon holding? + + Best-effort: reads /proc/self/mounts and greps for the known + buildkit tmp pattern. Returns 0 if the file can't be read so we + never block a deploy over our own diagnostic. + """ + try: + with open("/proc/self/mounts", "r", encoding="utf-8") as fh: + return sum(1 for line in fh if "/var/lib/docker/tmp/buildkit-mount" in line) + except OSError: + return 0 + + +def _buildx_recovery_hint(extra: str = "") -> str: + head = ( + "Buildx is wedged — Docker's build driver has leaked bind " + "mounts from a previous failed run and can no longer write " + "its activity file. This surfaces as a spurious " + "'read-only file system' error." + ) + fix = ( + "Recovery:\n" + " 1. docker buildx prune -af\n" + " 2. sudo systemctl restart docker\n" + " 3. 
Retry the deploy.\n" + "See wiki: Troubleshooting → 'Buildx leaked mounts'." + ) + return f"{head}\n\n{fix}{(' ' + extra) if extra else ''}" + @_traced("engine.compose_with_retry") def _compose_with_retry( @@ -150,6 +196,21 @@ def _compose_with_retry( # "project name must not be empty". cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args] merged = {**os.environ, **(env or {})} + + # Preflight: if buildx already looks wedged before the first attempt, + # refuse to start — retrying just leaks more mounts. Only applies to + # build-bearing invocations ("up --build", "build"); "down" etc. are + # unaffected by buildx state. + is_build_cmd = any(a in args for a in ("--build", "build")) + if is_build_cmd: + leaked = _count_leaked_buildkit_mounts() + if leaked >= _BUILDKIT_MOUNT_THRESHOLD: + hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)") + log.error("preflight: buildx wedge detected (%d mounts) — refusing to deploy", leaked) + raise subprocess.CalledProcessError( + returncode=1, cmd=cmd, output="", stderr=hint, + ) + for attempt in range(1, retries + 1): result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603 if result.returncode == 0: @@ -163,6 +224,15 @@ def _compose_with_retry( if any(pat in stderr_lower for pat in _PERMANENT_ERRORS): console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}") raise last_exc + if any(pat in stderr_lower for pat in _BUILDX_WEDGE_PATTERNS): + leaked = _count_leaked_buildkit_mounts() + hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)") + console.print(f"[red]{hint}[/]") + log.error("buildx wedge detected mid-build (%d mounts) — not retrying", leaked) + raise subprocess.CalledProcessError( + returncode=result.returncode, cmd=cmd, + output=result.stdout, stderr=hint, + ) if attempt < retries: console.print( f"[yellow]docker compose {' '.join(args)} failed " diff --git a/tests/fleet/test_deployer.py 
b/tests/fleet/test_deployer.py index bf244310..7b182b5d 100644 --- a/tests/fleet/test_deployer.py +++ b/tests/fleet/test_deployer.py @@ -59,6 +59,17 @@ class TestCompose: # ── _compose_with_retry ─────────────────────────────────────────────────────── +@pytest.fixture(autouse=True) +def _no_leaked_buildkit_mounts(monkeypatch): + """Stub out the wedge-detector so dev-host state doesn't bleed into + the mocked-subprocess tests below. Tests that exercise the preflight + itself patch this function explicitly.""" + monkeypatch.setattr( + "decnet.engine.deployer._count_leaked_buildkit_mounts", + lambda: 0, + ) + + class TestComposeWithRetry: @patch("decnet.engine.deployer.subprocess.run") def test_success_first_try(self, mock_run): @@ -106,6 +117,46 @@ class TestComposeWithRetry: captured = capsys.readouterr() assert "done" in captured.out + @patch("decnet.engine.deployer.subprocess.run") + def test_buildx_preflight_blocks_when_wedged(self, mock_run, monkeypatch): + """Pre-flight: refuse to run a build command when buildx already + shows pathological mount leakage — retrying would only leak more.""" + from decnet.engine import deployer + monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 42) + with pytest.raises(subprocess.CalledProcessError) as ei: + deployer._compose_with_retry("up", "--build", "-d") + mock_run.assert_not_called() + assert "Buildx is wedged" in ei.value.stderr + assert "docker buildx prune" in ei.value.stderr + + @patch("decnet.engine.deployer.subprocess.run") + def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch): + """down/stop/etc. 
don't go through buildx — the preflight must + not block them even if mounts are leaked.""" + from decnet.engine import deployer + monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 999) + mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="") + deployer._compose_with_retry("down") # must not raise + mock_run.assert_called_once() + + @patch("decnet.engine.deployer.time.sleep") + @patch("decnet.engine.deployer.subprocess.run") + def test_buildx_wedge_mid_build_short_circuits_retries(self, mock_run, mock_sleep): + """If a build fails with the wedge signature, skip remaining + retries and surface the recovery hint in stderr.""" + from decnet.engine.deployer import _compose_with_retry + fail = MagicMock( + returncode=1, stdout="", + stderr="failed to update builder last activity time: " + "open /home/x/.docker/buildx/activity/.tmp-default: read-only file system", + ) + mock_run.return_value = fail + with pytest.raises(subprocess.CalledProcessError) as ei: + _compose_with_retry("up", "--build", retries=5) + assert mock_run.call_count == 1 # no retry + mock_sleep.assert_not_called() + assert "Buildx is wedged" in ei.value.stderr + # ── _sync_logging_helper ─────────────────────────────────────────────────────