fix(engine/buildx): branch recovery recipe on leaked-mount count

The hint was one-size-fits-all and pointed at prune+restart even when zero mounts were leaked — a false positive caused by matching any stderr containing the activity-dir path.

Two changes:

1. Tighten the wedge classifier. Both the buildx-specific phrase ('failed to update builder last activity time') AND the EROFS marker ('read-only file system') must appear in stderr. Either alone is now treated as a normal transient error and retried.

2. Branch the recipe on _count_leaked_buildkit_mounts():
   * count > 0 → unmount loop + daemon stop + umount -l (prune+restart alone doesn't evict held mounts)
   * count == 0 → rebuild the buildx driver (rm builder state, buildx create --use, inspect --bootstrap)

Original compose stderr is now preserved in the hint as 'Original error: ...' so the user sees both the recipe and what compose actually said.

Tests cover both branches plus a negative case (unrelated EROFS).
2026-04-24 21:58:09 -04:00
parent 05d225ae38
commit 40a31d8bc7
2 changed files with 104 additions and 26 deletions
--- a/tests/fleet/test_deployer.py
+++ b/tests/fleet/test_deployer.py
@@ -127,7 +127,10 @@ class TestComposeWithRetry:
            deployer._compose_with_retry("up", "--build", "-d")
        mock_run.assert_not_called()
        assert "Buildx is wedged" in ei.value.stderr
-        assert "docker buildx prune" in ei.value.stderr
+        # leaked>0 recipe centres on unmount + daemon stop, since
+        # prune+restart alone doesn't evict already-held mounts.
+        assert "umount -l" in ei.value.stderr
+        assert "Detected 42 leaked" in ei.value.stderr

    @patch("decnet.engine.deployer.subprocess.run")
    def test_buildx_preflight_skipped_for_non_build_cmds(self, mock_run, monkeypatch):
@@ -156,6 +159,44 @@ class TestComposeWithRetry:
        assert mock_run.call_count == 1  # no retry
        mock_sleep.assert_not_called()
        assert "Buildx is wedged" in ei.value.stderr
+        # Original stderr is preserved alongside the hint so the user
+        # can see what compose actually said.
+        assert "Original error" in ei.value.stderr
+
+    @patch("decnet.engine.deployer.subprocess.run")
+    def test_buildx_wedge_zero_mounts_uses_driver_rebuild_recipe(self, mock_run, monkeypatch):
+        """Wedge signature with 0 leaked mounts means the buildx driver
+        itself is corrupt — recipe should suggest rebuilding it, not
+        unmounting nothing."""
+        from decnet.engine import deployer
+        monkeypatch.setattr(deployer, "_count_leaked_buildkit_mounts", lambda: 0)
+        mock_run.return_value = MagicMock(
+            returncode=1, stdout="",
+            stderr="failed to update builder last activity time: read-only file system",
+        )
+        with pytest.raises(subprocess.CalledProcessError) as ei:
+            deployer._compose_with_retry("up", "--build")
+        assert "buildx create --use" in ei.value.stderr
+        assert "umount" not in ei.value.stderr
+        assert "No leaked mounts (count=0)" in ei.value.stderr
+
+    @patch("decnet.engine.deployer.time.sleep")
+    @patch("decnet.engine.deployer.subprocess.run")
+    def test_unrelated_erofs_does_not_match_wedge(self, mock_run, mock_sleep):
+        """Stderr containing 'read-only file system' alone (no buildx
+        activity-time phrase) must NOT be classified as a wedge — that
+        was the false-positive that misled the user."""
+        from decnet.engine.deployer import _compose_with_retry
+        fail = MagicMock(
+            returncode=1, stdout="",
+            stderr="open /etc/foo/bar: read-only file system",  # not buildx
+        )
+        mock_run.return_value = fail
+        with pytest.raises(subprocess.CalledProcessError) as ei:
+            _compose_with_retry("up", "--build", retries=2)
+        assert "Buildx is wedged" not in (ei.value.stderr or "")
+        # Treated as a normal transient error → retried until exhausted.
+        assert mock_run.call_count == 2


 # ── _sync_logging_helper ─────────────────────────────────────────────────────