fix(engine/buildx): branch recovery recipe on leaked-mount count
The hint was one-size-fits-all: it pointed at prune+restart even
when zero mounts were leaked. The classifier could also fire as a
false positive, because it matched any stderr containing the
activity-dir path.
Two changes:
1. Tighten the wedge classifier. Both the buildx-specific phrase
('failed to update builder last activity time') AND the EROFS
marker ('read-only file system') must appear in stderr. Either
alone is now treated as a normal transient error and retried.
2. Branch the recipe on _count_leaked_buildkit_mounts():
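* count > 0 → stop the daemon, kill buildkitd/containerd-shim,
lazily unmount each leaked mount (umount -l), start the daemon,
then recreate the builder
(prune+restart alone doesn't evict held mounts)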
* count == 0 → rebuild the buildx driver (rm builder state,
buildx create --use, inspect --bootstrap)
Original compose stderr is now preserved in the hint as
'Original error: ...' so the user sees both the recipe and what
compose actually said.
Tests cover both branches plus a negative case (unrelated EROFS).
This commit is contained in:
@@ -132,15 +132,12 @@ _PERMANENT_ERRORS = (
|
|||||||
"repository does not exist",
|
"repository does not exist",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Signatures of a wedged buildx: the daemon has leaked bind mounts and
|
# Signature of a wedged buildx. The phrase is what buildx itself emits
|
||||||
# can no longer update its activity file, which surfaces as "read-only
|
# when its activity-file write fails. Pairing it with "read-only file
|
||||||
# file system" even though nothing is actually read-only. Retrying the
|
# system" avoids false-positives on stderr that merely mentions the
|
||||||
# compose build just accumulates more leaked mounts — we bail early
|
# activity dir path for unrelated reasons.
|
||||||
# with a clear recovery recipe.
|
_BUILDX_WEDGE_SIGNATURE = "failed to update builder last activity time"
|
||||||
_BUILDX_WEDGE_PATTERNS = (
|
_BUILDX_EROFS_SIGNATURE = "read-only file system"
|
||||||
"failed to update builder last activity time",
|
|
||||||
".docker/buildx/activity/",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Count above which we consider buildx's bind-mount table pathological.
|
# Count above which we consider buildx's bind-mount table pathological.
|
||||||
# A healthy daemon has 0; a couple is transient during a build. Past
|
# A healthy daemon has 0; a couple is transient during a build. Past
|
||||||
@@ -177,21 +174,52 @@ def _format_subprocess_error(exc: BaseException) -> str:
|
|||||||
return str(exc)
|
return str(exc)
|
||||||
|
|
||||||
|
|
||||||
def _buildx_recovery_hint(extra: str = "") -> str:
|
def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> str:
|
||||||
|
"""Compose a recovery recipe tailored to which side of the wedge fired.
|
||||||
|
|
||||||
|
Two failure modes share the 'read-only file system' symptom:
|
||||||
|
|
||||||
|
* **Leaked mounts** (count > 0): buildkit accumulated bind mounts
|
||||||
|
in /var/lib/docker/tmp from a prior failed build. Fix is to drop
|
||||||
|
the mounts by stopping Docker, unmounting them explicitly, and
|
||||||
|
starting clean — ``prune -af && systemctl restart`` alone does
|
||||||
|
not evict already-held mounts.
|
||||||
|
|
||||||
|
* **Driver corruption** (count == 0): the buildx driver's own
|
||||||
|
state is inconsistent (activity dir permissions, stale instance
|
||||||
|
pointer, etc.). Fix is to rebuild the default builder.
|
||||||
|
"""
|
||||||
head = (
|
head = (
|
||||||
"Buildx is wedged — Docker's build driver has leaked bind "
|
"Buildx is wedged — Docker's build driver can no longer write "
|
||||||
"mounts from a previous failed run and can no longer write "
|
"its activity file (spurious 'read-only file system' error)."
|
||||||
"its activity file. This surfaces as a spurious "
|
|
||||||
"'read-only file system' error."
|
|
||||||
)
|
)
|
||||||
fix = (
|
if leaked_mounts > 0:
|
||||||
"Recovery:\n"
|
fix = (
|
||||||
" 1. docker buildx prune -af\n"
|
f"Detected {leaked_mounts} leaked buildkit bind-mounts — "
|
||||||
" 2. sudo systemctl restart docker\n"
|
"prune+restart alone won't evict them.\n"
|
||||||
" 3. Retry the deploy.\n"
|
"Recovery:\n"
|
||||||
"See wiki: Troubleshooting → 'Buildx leaked mounts'."
|
" 1. sudo systemctl stop docker.socket docker.service\n"
|
||||||
)
|
" 2. sudo pkill -9 -f buildkitd; sudo pkill -9 -f containerd-shim\n"
|
||||||
return f"{head}\n\n{fix}{(' ' + extra) if extra else ''}"
|
" 3. for m in $(mount | awk '$3 ~ /buildkit-mount/ {print $3}'); do sudo umount -l \"$m\"; done\n"
|
||||||
|
" 4. rm -rf ~/.docker/buildx/activity\n"
|
||||||
|
" 5. sudo systemctl start docker\n"
|
||||||
|
" 6. docker buildx create --use --name default && docker buildx inspect --bootstrap"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
fix = (
|
||||||
|
"No leaked mounts (count=0) — the buildx driver state "
|
||||||
|
"itself is inconsistent.\n"
|
||||||
|
"Recovery:\n"
|
||||||
|
" 1. docker buildx rm default 2>/dev/null\n"
|
||||||
|
" 2. rm -rf ~/.docker/buildx/activity ~/.docker/buildx/instances/default\n"
|
||||||
|
" 3. docker buildx create --use --name default\n"
|
||||||
|
" 4. docker buildx inspect --bootstrap"
|
||||||
|
)
|
||||||
|
tail = "See wiki: Troubleshooting → 'Buildx leaked mounts'."
|
||||||
|
parts = [head, fix, tail]
|
||||||
|
if original_stderr:
|
||||||
|
parts.append(f"Original error:\n{original_stderr.strip()}")
|
||||||
|
return "\n\n".join(parts)
|
||||||
|
|
||||||
|
|
||||||
@_traced("engine.compose_with_retry")
|
@_traced("engine.compose_with_retry")
|
||||||
@@ -220,7 +248,7 @@ def _compose_with_retry(
|
|||||||
if is_build_cmd:
|
if is_build_cmd:
|
||||||
leaked = _count_leaked_buildkit_mounts()
|
leaked = _count_leaked_buildkit_mounts()
|
||||||
if leaked >= _BUILDKIT_MOUNT_THRESHOLD:
|
if leaked >= _BUILDKIT_MOUNT_THRESHOLD:
|
||||||
hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)")
|
hint = _buildx_recovery_hint(leaked_mounts=leaked)
|
||||||
log.error("preflight: buildx wedge detected (%d mounts) — refusing to deploy", leaked)
|
log.error("preflight: buildx wedge detected (%d mounts) — refusing to deploy", leaked)
|
||||||
raise subprocess.CalledProcessError(
|
raise subprocess.CalledProcessError(
|
||||||
returncode=1, cmd=cmd, output="", stderr=hint,
|
returncode=1, cmd=cmd, output="", stderr=hint,
|
||||||
@@ -239,9 +267,18 @@ def _compose_with_retry(
|
|||||||
if any(pat in stderr_lower for pat in _PERMANENT_ERRORS):
|
if any(pat in stderr_lower for pat in _PERMANENT_ERRORS):
|
||||||
console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}")
|
console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}")
|
||||||
raise last_exc
|
raise last_exc
|
||||||
if any(pat in stderr_lower for pat in _BUILDX_WEDGE_PATTERNS):
|
# Wedge match needs BOTH the buildx-specific phrase AND the
|
||||||
|
# EROFS marker — otherwise unrelated stderr that mentions the
|
||||||
|
# activity dir is misclassified as a wedge.
|
||||||
|
if (
|
||||||
|
_BUILDX_WEDGE_SIGNATURE in stderr_lower
|
||||||
|
and _BUILDX_EROFS_SIGNATURE in stderr_lower
|
||||||
|
):
|
||||||
leaked = _count_leaked_buildkit_mounts()
|
leaked = _count_leaked_buildkit_mounts()
|
||||||
hint = _buildx_recovery_hint(f"(Detected {leaked} leaked buildkit mounts.)")
|
hint = _buildx_recovery_hint(
|
||||||
|
leaked_mounts=leaked,
|
||||||
|
original_stderr=result.stderr or "",
|
||||||
|
)
|
||||||
console.print(f"[red]{hint}[/]")
|
console.print(f"[red]{hint}[/]")
|
||||||
log.error("buildx wedge detected mid-build (%d mounts) — not retrying", leaked)
|
log.error("buildx wedge detected mid-build (%d mounts) — not retrying", leaked)
|
||||||
raise subprocess.CalledProcessError(
|
raise subprocess.CalledProcessError(
|
||||||
|
|||||||
@@ -127,7 +127,10 @@ class TestComposeWithRetry:
|
|||||||
deployer._compose_with_retry("up", "--build", "-d")
|
deployer._compose_with_retry("up", "--build", "-d")
|
||||||
mock_run.assert_not_called()
|
mock_run.assert_not_called()
|
||||||
assert "Buildx is wedged" in ei.value.stderr
|
assert "Buildx is wedged" in ei.value.stderr
|
||||||
assert "docker buildx prune" in ei.value.stderr
|
# leaked>0 recipe centres on unmount + daemon stop, since
|
||||||
|
# prune+restart alone doesn't evict already-held mounts.
|
||||||
|
assert "umount -l" in ei.value.stderr
|
||||||
|
assert "Detected 42 leaked" in ei.value.stderr
|
||||||
|
|
||||||
@patch("decnet.engine.deployer.subprocess.run")
|
@patch("decnet.engine.deployer.subprocess.run")
|
||||||
def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch):
|
def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch):
|
||||||
@@ -156,6 +159,44 @@ class TestComposeWithRetry:
|
|||||||
assert mock_run.call_count == 1 # no retry
|
assert mock_run.call_count == 1 # no retry
|
||||||
mock_sleep.assert_not_called()
|
mock_sleep.assert_not_called()
|
||||||
assert "Buildx is wedged" in ei.value.stderr
|
assert "Buildx is wedged" in ei.value.stderr
|
||||||
|
# Original stderr is preserved alongside the hint so the user
|
||||||
|
# can see what compose actually said.
|
||||||
|
assert "Original error" in ei.value.stderr
|
||||||
|
|
||||||
|
@patch("decnet.engine.deployer.subprocess.run")
|
||||||
|
def test_buildx_wedge_zero_mounts_uses_driver_rebuild_recipe(self, mock_run, monkeypatch):
|
||||||
|
"""Wedge signature with 0 leaked mounts means the buildx driver
|
||||||
|
itself is corrupt — recipe should suggest rebuilding it, not
|
||||||
|
unmounting nothing."""
|
||||||
|
from decnet.engine import deployer
|
||||||
|
monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
|
||||||
|
mock_run.return_value = MagicMock(
|
||||||
|
returncode=1, stdout="",
|
||||||
|
stderr="failed to update builder last activity time: read-only file system",
|
||||||
|
)
|
||||||
|
with pytest.raises(subprocess.CalledProcessError) as ei:
|
||||||
|
deployer._compose_with_retry("up", "--build")
|
||||||
|
assert "buildx create --use" in ei.value.stderr
|
||||||
|
assert "umount" not in ei.value.stderr
|
||||||
|
assert "No leaked mounts (count=0)" in ei.value.stderr
|
||||||
|
|
||||||
|
@patch("decnet.engine.deployer.time.sleep")
|
||||||
|
@patch("decnet.engine.deployer.subprocess.run")
|
||||||
|
def test_unrelated_erofs_does_not_match_wedge(self, mock_run, mock_sleep):
|
||||||
|
"""Stderr containing 'read-only file system' alone (no buildx
|
||||||
|
activity-time phrase) must NOT be classified as a wedge — that
|
||||||
|
was the false-positive that misled the user."""
|
||||||
|
from decnet.engine.deployer import _compose_with_retry
|
||||||
|
fail = MagicMock(
|
||||||
|
returncode=1, stdout="",
|
||||||
|
stderr="open /etc/foo/bar: read-only file system", # not buildx
|
||||||
|
)
|
||||||
|
mock_run.return_value = fail
|
||||||
|
with pytest.raises(subprocess.CalledProcessError) as ei:
|
||||||
|
_compose_with_retry("up", "--build", retries=2)
|
||||||
|
assert "Buildx is wedged" not in (ei.value.stderr or "")
|
||||||
|
# Treated as a normal transient error → retried until exhausted.
|
||||||
|
assert mock_run.call_count == 2
|
||||||
|
|
||||||
|
|
||||||
# ── _sync_logging_helper ─────────────────────────────────────────────────────
|
# ── _sync_logging_helper ─────────────────────────────────────────────────────
|
||||||
|
|||||||
Reference in New Issue
Block a user