fix(engine): detect wedged buildx + surface recovery hint on deploy
When Docker's buildx leaks bind-mounts from a failed build it starts
reporting 'read-only file system' on its own activity file, even
though nothing is actually read-only. The user's host had 20+
leaked mounts before we noticed — each retry compounds the leak.
_compose_with_retry now:
* Pre-flight counts /var/lib/docker/tmp/buildkit-mount* entries in
/proc/self/mounts; if >= 10 and the command is a build, refuses
to start and returns a clean recovery recipe instead of retrying.
* On mid-build failures that match the wedge signature
('failed to update builder last activity time' or the activity-dir
path in stderr), short-circuits the retry loop with the same
recipe. This handles the first occurrence without relying on the
pre-flight; the pre-flight then catches repeat attempts.
Recipe points at 'docker buildx prune -af && sudo systemctl restart
docker', which is what actually clears the leaked mounts.
Tests cover all three paths: wedge preflight blocks builds, non-build
commands (down/stop) ignore the preflight, mid-build signature
detection kills the retry loop. A new autouse fixture stubs the
wedge-detector to 0 so dev-host state doesn't poison the mocked
subprocess tests.
Wiki companion commit adds Troubleshooting → 'Buildx leaked mounts'.
This commit is contained in:
@@ -132,6 +132,52 @@ _PERMANENT_ERRORS = (
|
||||
"repository does not exist",
|
||||
)
|
||||
|
||||
# stderr fragments that identify a wedged buildx. In that state the daemon
# has leaked bind mounts and can no longer update its activity file; the
# failure is reported as "read-only file system" although nothing is
# actually read-only. Re-running the compose build only accumulates more
# leaked mounts, so a match means: bail out early and show the recovery
# recipe. These are checked as substrings of `stderr_lower` in
# `_compose_with_retry` (presumably lowercased stderr), so keep every
# entry lowercase.
_BUILDX_WEDGE_PATTERNS = (
    "failed to update builder last activity time",
    ".docker/buildx/activity/",
)

# Leaked-mount count at which buildx's bind-mount table is treated as
# pathological. A healthy daemon has 0 and a couple can appear transiently
# during a build; 10 or more indicates accumulation from a previous failed
# run.
_BUILDKIT_MOUNT_THRESHOLD = 10
|
||||
|
||||
|
||||
def _count_leaked_buildkit_mounts(mounts_path: str = "/proc/self/mounts") -> int:
    """Count buildkit bind-mounts currently listed in the mount table.

    Best-effort diagnostic: scans ``mounts_path`` line by line and counts
    the entries that mention the known buildkit tmp-mount prefix. It must
    never block a deploy by itself failing, so every read problem degrades
    to a count of 0.

    Args:
        mounts_path: Mount table to scan. Defaults to ``/proc/self/mounts``;
            overridable so the parser can be exercised against a fixture.

    Returns:
        The number of matching lines, or 0 if the file cannot be opened
        or read.
    """
    needle = "/var/lib/docker/tmp/buildkit-mount"
    try:
        # errors="replace": mount points elsewhere on the host may contain
        # bytes that are not valid UTF-8. A strict decode would raise
        # UnicodeDecodeError (a ValueError, not an OSError) and crash the
        # deploy over an unrelated mount. The needle is pure ASCII, so
        # replacement characters cannot create or hide a match.
        with open(mounts_path, "r", encoding="utf-8", errors="replace") as fh:
            return sum(1 for line in fh if needle in line)
    except OSError:
        return 0
|
||||
|
||||
|
||||
def _buildx_recovery_hint(extra: str = "") -> str:
    """Compose the user-facing message shown when buildx is wedged.

    The message is an explanation paragraph, a blank line, then a numbered
    recovery recipe ending in a wiki pointer.

    Args:
        extra: Optional trailing note. When non-empty it is appended to
            the end of the message after a single space.

    Returns:
        The assembled multi-line hint.
    """
    explanation = "".join(
        [
            "Buildx is wedged — Docker's build driver has leaked bind ",
            "mounts from a previous failed run and can no longer write ",
            "its activity file. This surfaces as a spurious ",
            "'read-only file system' error.",
        ]
    )
    recipe = "\n".join(
        [
            "Recovery:",
            " 1. docker buildx prune -af",
            " 2. sudo systemctl restart docker",
            " 3. Retry the deploy.",
            "See wiki: Troubleshooting → 'Buildx leaked mounts'.",
        ]
    )
    message = explanation + "\n\n" + recipe
    if extra:
        message += " " + extra
    return message
|
||||
|
||||
|
||||
@_traced("engine.compose_with_retry")
|
||||
def _compose_with_retry(
|
||||
@@ -150,6 +196,21 @@ def _compose_with_retry(
|
||||
# "project name must not be empty".
|
||||
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
|
||||
merged = {**os.environ, **(env or {})}
|
||||
|
||||
# Preflight: if buildx already looks wedged before the first attempt,
|
||||
# refuse to start — retrying just leaks more mounts. Only applies to
|
||||
# build-bearing invocations ("up --build", "build"); "down" etc. are
|
||||
# unaffected by buildx state.
|
||||
is_build_cmd = any(a in args for a in ("--build", "build"))
|
||||
if is_build_cmd:
|
||||
leaked = _count_leaked_buildkit_mounts()
|
||||
if leaked >= _BUILDKIT_MOUNT_THRESHOLD:
|
||||
hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)")
|
||||
log.error("preflight: buildx wedge detected (%d mounts) — refusing to deploy", leaked)
|
||||
raise subprocess.CalledProcessError(
|
||||
returncode=1, cmd=cmd, output="", stderr=hint,
|
||||
)
|
||||
|
||||
for attempt in range(1, retries + 1):
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603
|
||||
if result.returncode == 0:
|
||||
@@ -163,6 +224,15 @@ def _compose_with_retry(
|
||||
if any(pat in stderr_lower for pat in _PERMANENT_ERRORS):
|
||||
console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}")
|
||||
raise last_exc
|
||||
if any(pat in stderr_lower for pat in _BUILDX_WEDGE_PATTERNS):
|
||||
leaked = _count_leaked_buildkit_mounts()
|
||||
hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)")
|
||||
console.print(f"[red]{hint}[/]")
|
||||
log.error("buildx wedge detected mid-build (%d mounts) — not retrying", leaked)
|
||||
raise subprocess.CalledProcessError(
|
||||
returncode=result.returncode, cmd=cmd,
|
||||
output=result.stdout, stderr=hint,
|
||||
)
|
||||
if attempt < retries:
|
||||
console.print(
|
||||
f"[yellow]docker compose {' '.join(args)} failed "
|
||||
|
||||
@@ -59,6 +59,17 @@ class TestCompose:
|
||||
|
||||
# ── _compose_with_retry ───────────────────────────────────────────────────────
|
||||
|
||||
@pytest.fixture(autouse=True)
def _no_leaked_buildkit_mounts(monkeypatch):
    """Make the wedge detector report zero leaked mounts by default.

    Autouse, so the real mount table of the machine running the tests
    cannot leak into the mocked-subprocess tests. Tests that exercise the
    preflight itself override this patch with their own value.
    """

    def _healthy_daemon():
        return 0

    detector_path = "decnet.engine.deployer._count_leaked_buildkit_mounts"
    monkeypatch.setattr(detector_path, _healthy_daemon)
|
||||
|
||||
|
||||
class TestComposeWithRetry:
|
||||
@patch("decnet.engine.deployer.subprocess.run")
|
||||
def test_success_first_try(self, mock_run):
|
||||
@@ -106,6 +117,46 @@ class TestComposeWithRetry:
|
||||
captured = capsys.readouterr()
|
||||
assert "done" in captured.out
|
||||
|
||||
@patch("decnet.engine.deployer.subprocess.run")
def test_buildx_preflight_blocks_when_wedged(self, mock_run, monkeypatch):
    """Preflight refuses a build command when the leaked-mount count is
    pathological: no subprocess is started and the raised error carries
    the recovery hint in its stderr."""
    from decnet.engine import deployer

    def _wedged_count():
        return 42

    monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", _wedged_count)

    with pytest.raises(subprocess.CalledProcessError) as excinfo:
        deployer._compose_with_retry("up", "--build", "-d")

    # The compose command must never have been launched.
    mock_run.assert_not_called()

    hint = excinfo.value.stderr
    for fragment in ("Buildx is wedged", "docker buildx prune"):
        assert fragment in hint
|
||||
|
||||
@patch("decnet.engine.deployer.subprocess.run")
def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch):
    """Commands without a build step (here: "down") must run even when
    the detector reports a large number of leaked mounts."""
    from decnet.engine import deployer

    def _heavily_leaked():
        return 999

    monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", _heavily_leaked)

    succeeded = MagicMock(returncode=0, stdout="", stderr="")
    mock_run.return_value = succeeded

    # Must complete without raising despite the reported leak.
    deployer._compose_with_retry("down")

    mock_run.assert_called_once()
|
||||
|
||||
@patch("decnet.engine.deployer.time.sleep")
@patch("decnet.engine.deployer.subprocess.run")
def test_buildx_wedge_mid_build_short_circuits_retries(self, mock_run, mock_sleep):
    """A build failing with the wedge signature is attempted exactly once:
    no further retries, no back-off sleep, and the recovery hint ends up
    in the raised error's stderr."""
    from decnet.engine.deployer import _compose_with_retry

    wedge_stderr = (
        "failed to update builder last activity time: "
        "open /home/x/.docker/buildx/activity/.tmp-default: read-only file system"
    )
    wedged_result = MagicMock(returncode=1, stdout="", stderr=wedge_stderr)
    mock_run.return_value = wedged_result

    with pytest.raises(subprocess.CalledProcessError) as excinfo:
        _compose_with_retry("up", "--build", retries=5)

    # Five retries were allowed, but only the first attempt may have run.
    assert mock_run.call_count == 1
    mock_sleep.assert_not_called()
    assert "Buildx is wedged" in excinfo.value.stderr
|
||||
|
||||
|
||||
# ── _sync_logging_helper ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
Reference in New Issue
Block a user