fix(engine): detect wedged buildx + surface recovery hint on deploy

When Docker's buildx leaks bind-mounts from a failed build it starts reporting 'read-only file system' on its own activity file, even though nothing is actually read-only. The user's host had 20+ leaked mounts before we noticed — each retry compounds the leak.

_compose_with_retry now:

* Pre-flight: counts /var/lib/docker/tmp/buildkit-mount* entries in /proc/self/mounts; if there are >= 10 and the command is a build, it refuses to start and raises CalledProcessError carrying a clean recovery recipe in stderr instead of retrying.
* Mid-build: on failures that match the wedge signature ('failed to update builder last activity time' or the activity-dir path in stderr), it short-circuits the retry loop with the same recipe. This catches the first occurrence without relying on the pre-flight; the pre-flight catches repeat attempts.

The recipe points at 'docker buildx prune -af && sudo systemctl restart docker', which is what actually clears the leaked mounts.

Tests cover all three paths: wedge preflight blocks builds, non-build commands (down/stop) ignore the preflight, and mid-build signature detection kills the retry loop. A new autouse fixture stubs the wedge-detector to 0 so dev-host state doesn't poison the mocked subprocess tests.

Wiki companion commit adds Troubleshooting → 'Buildx leaked mounts'.
2026-04-24 19:25:45 -04:00
parent a8356407c5
commit 86b9decf80
2 changed files with 121 additions and 0 deletions
--- a/decnet/engine/deployer.py
+++ b/decnet/engine/deployer.py
@@ -132,6 +132,52 @@ _PERMANENT_ERRORS = (
    "repository does not exist",
 )

+# Signatures of a wedged buildx: the daemon has leaked bind mounts and
+# can no longer update its activity file, which surfaces as "read-only
+# file system" even though nothing is actually read-only. Retrying the
+# build only leaks more mounts, so we bail early with a recovery recipe.
+# Entries are matched as substrings of `stderr_lower`; keep them lowercase.
+_BUILDX_WEDGE_PATTERNS = (
+    "failed to update builder last activity time",
+    ".docker/buildx/activity/",
+)
+
+# Count at or above which buildx's bind-mount table is considered
+# pathological (the preflight compares with >=). A healthy daemon has 0
+# and a couple are transient during a build; 10+ is prior-run leftovers.
+_BUILDKIT_MOUNT_THRESHOLD = 10
+
+
+def _count_leaked_buildkit_mounts() -> int:
+    """Count the buildkit bind-mounts listed in /proc/self/mounts.
+
+    Best-effort: a line counts when it contains the known buildkit tmp
+    path. Returns 0 if the file can't be read, and undecodable bytes are
+    replaced rather than raised, so we never block a deploy over this.
+    """
+    try:
+        with open("/proc/self/mounts", "r", encoding="utf-8", errors="replace") as fh:
+            return sum(1 for line in fh if "/var/lib/docker/tmp/buildkit-mount" in line)
+    except OSError:
+        return 0
+
+
+def _buildx_recovery_hint(extra: str = "") -> str:
+    """Describe the buildx wedge and its recovery steps, then *extra*."""
+    explanation = (
+        "Buildx is wedged — Docker's build driver has leaked bind "
+        "mounts from a previous failed run and can no longer write "
+        "its activity file. This surfaces as a spurious "
+        "'read-only file system' error."
+    )
+    steps = "\n".join((
+        "Recovery:", "  1. docker buildx prune -af",
+        "  2. sudo systemctl restart docker", "  3. Retry the deploy.",
+        "See wiki: Troubleshooting → 'Buildx leaked mounts'.",
+    ))
+    suffix = f" {extra}" if extra else ""
+    return explanation + "\n\n" + steps + suffix
+

@_traced("engine.compose_with_retry")
 def _compose_with_retry(
@@ -150,6 +196,21 @@ def _compose_with_retry(
    # "project name must not be empty".
    cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
    merged = {**os.environ, **(env or {})}
+
+    # Preflight: if buildx already looks wedged before the first attempt,
+    # refuse to start — retrying just leaks more mounts. Only applies to
+    # build-bearing invocations ("up --build", "build"); "down" etc. are
+    # unaffected by buildx state.
+    is_build_cmd = any(a in args for a in ("--build", "build"))
+    if is_build_cmd:
+        leaked = _count_leaked_buildkit_mounts()
+        if leaked >= _BUILDKIT_MOUNT_THRESHOLD:
+            hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)")
+            log.error("preflight: buildx wedge detected (%d mounts) — refusing to deploy", leaked)
+            raise subprocess.CalledProcessError(
+                returncode=1, cmd=cmd, output="", stderr=hint,
+            )
+
    for attempt in range(1, retries + 1):
        result = subprocess.run(cmd, capture_output=True, text=True, env=merged)  # nosec B603
        if result.returncode == 0:
@@ -163,6 +224,15 @@ def _compose_with_retry(
        if any(pat in stderr_lower for pat in _PERMANENT_ERRORS):
            console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}")
            raise last_exc
+        if any(pat in stderr_lower for pat in _BUILDX_WEDGE_PATTERNS):
+            leaked = _count_leaked_buildkit_mounts()
+            hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)")
+            console.print(f"[red]{hint}[/]")
+            log.error("buildx wedge detected mid-build (%d mounts) — not retrying", leaked)
+            raise subprocess.CalledProcessError(
+                returncode=result.returncode, cmd=cmd,
+                output=result.stdout, stderr=hint,
+            )
        if attempt < retries:
            console.print(
                f"[yellow]docker compose {' '.join(args)} failed "
--- a/tests/fleet/test_deployer.py
+++ b/tests/fleet/test_deployer.py
@@ -59,6 +59,17 @@ class TestCompose:

 # ── _compose_with_retry ───────────────────────────────────────────────────────

+@pytest.fixture(autouse=True)
+def _no_leaked_buildkit_mounts(monkeypatch):
+    """Pin the leaked-mount count to 0 so the host's real mount table
+    cannot trip the preflight in the mocked-subprocess tests. Tests that
+    exercise the preflight itself re-patch the function explicitly."""
+    def _zero_mounts():
+        return 0
+
+    monkeypatch.setattr("decnet.engine.deployer._count_leaked_buildkit_mounts", _zero_mounts)
+
+
 class TestComposeWithRetry:
    @patch("decnet.engine.deployer.subprocess.run")
    def test_success_first_try(self, mock_run):
@@ -106,6 +117,46 @@ class TestComposeWithRetry:
        captured = capsys.readouterr()
        assert "done" in captured.out

+    @patch("decnet.engine.deployer.subprocess.run")
+    def test_buildx_preflight_blocks_when_wedged(self, mock_run, monkeypatch):
+        """A build command must be rejected before docker is ever invoked
+        when the leaked-mount count is well past the threshold."""
+        from decnet.engine import deployer
+        monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 42)
+        with pytest.raises(subprocess.CalledProcessError) as excinfo:
+            deployer._compose_with_retry("up", "--build", "-d")
+        mock_run.assert_not_called()
+        for needle in ("Buildx is wedged", "docker buildx prune"):
+            assert needle in excinfo.value.stderr
+
+    @patch("decnet.engine.deployer.subprocess.run")
+    def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch):
+        """A non-build command such as "down" must still reach docker
+        even when the reported leaked-mount count is huge."""
+        from decnet.engine import deployer
+        mock_run.return_value = MagicMock(returncode=0, stdout="", stderr="")
+        monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 999)
+        deployer._compose_with_retry("down")  # must not raise
+        mock_run.assert_called_once()
+
+    @patch("decnet.engine.deployer.time.sleep")
+    @patch("decnet.engine.deployer.subprocess.run")
+    def test_buildx_wedge_mid_build_short_circuits_retries(self, mock_run, mock_sleep):
+        """A failure whose stderr carries the wedge signature must end
+        the retry loop at once and put the recovery hint in stderr."""
+        from decnet.engine.deployer import _compose_with_retry
+        wedge_stderr = (
+            "failed to update builder last activity time: "
+            "open /home/x/.docker/buildx/activity/.tmp-default: read-only file system"
+        )
+        # Every attempt would fail the same way; only the first may run.
+        mock_run.return_value = MagicMock(returncode=1, stdout="", stderr=wedge_stderr)
+        with pytest.raises(subprocess.CalledProcessError) as excinfo:
+            _compose_with_retry("up", "--build", retries=5)
+        assert "Buildx is wedged" in excinfo.value.stderr
+        mock_sleep.assert_not_called()
+        assert mock_run.call_count == 1  # no retry
+

 # ── _sync_logging_helper ─────────────────────────────────────────────────────