From 57e527534ccb34afce103ad7be8eb74d69e3b47d Mon Sep 17 00:00:00 2001 From: anti Date: Wed, 29 Apr 2026 10:59:04 -0400 Subject: [PATCH] fix(mutator): auto-fall-back to legacy builder when buildx wedges live decky add MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit apply_add_decky's compose-up was hard-failing whenever the operator's ~/.docker/buildx/activity/ landed on a read-only mount — the wedge detection in _compose_with_retry correctly refuses to retry (would just leak more mounts), but for live materialisation we don't want a wedged buildx state to abort an admin's mutation. ANTI hit it on adding decky-a977: 'failed to update builder last activity time: ... read-only file system → buildx wedge detected → returned non-zero'. _compose_up_with_buildkit_fallback wraps _compose_with_retry: on any exception whose stderr or message contains both wedge signatures (_BUILDX_WEDGE_SIGNATURE and _BUILDX_EROFS_SIGNATURE), it logs a warning with the manual recovery steps and retries once with DOCKER_BUILDKIT=0 set. The legacy non-buildx builder doesn't use the activity dir and isn't affected. Wired into the two paths that pass --build: * _materialise_decky_spawn (apply_add_decky) * _materialise_decky_services_diff (apply_update_decky service add) _materialise_decky_recreate_base doesn't build — it just recreates a container from an existing image — so it's not affected. Operator-facing log message points at the manual fix (rm -rf ~/.docker/buildx/activity, then docker buildx create) so they can recover at their leisure; we don't ATTEMPT the recovery because the activity dir might be RO for a reason (zfs/btrfs snapshot, etc.) that an automated rm would be wrong to fight. 
--- decnet/mutator/ops.py | 92 ++++++++++++++++++----- tests/mutator/test_ops_materialisation.py | 49 ++++++++++++ 2 files changed, 123 insertions(+), 18 deletions(-) diff --git a/decnet/mutator/ops.py b/decnet/mutator/ops.py index e68edf8a..ab127849 100644 --- a/decnet/mutator/ops.py +++ b/decnet/mutator/ops.py @@ -186,6 +186,69 @@ async def _materialise_lan_change( ) +def _is_buildx_wedge(exc: BaseException) -> bool: + """True when *exc* looks like the buildx EROFS wedge. + + We consult both the structured CalledProcessError.stderr and the + str(exc) form because ``_compose_with_retry`` raises a synthetic + CalledProcessError whose ``stderr`` contains the recovery hint + (which preserves the wedge signatures verbatim). + """ + from decnet.engine.deployer import ( + _BUILDX_EROFS_SIGNATURE, _BUILDX_WEDGE_SIGNATURE, + ) + stderr = "" + if hasattr(exc, "stderr") and exc.stderr: + stderr = str(exc.stderr) + haystack = (stderr + " " + str(exc)).lower() + return ( + _BUILDX_WEDGE_SIGNATURE in haystack + and _BUILDX_EROFS_SIGNATURE in haystack + ) + + +async def _compose_up_with_buildkit_fallback( + *args: str, compose_file, label: str, +) -> None: + """Run ``compose up`` and auto-fall-back to the legacy builder on wedge. + + The buildx activity dir occasionally lands on a read-only mount — + happens enough on operator dev boxes that we don't want a single + wedge to abort a live decky-add. When _compose_with_retry raises + with the EROFS-wedge signatures, we retry once with + ``DOCKER_BUILDKIT=0`` set. The legacy (non-buildx) builder doesn't + use the activity dir and isn't affected. + + *label* is a human-readable identifier used only in log lines so an + operator can grep the fall-back back to the originating op. 
+ """ + import anyio + from decnet.engine.deployer import _compose_with_retry + try: + await anyio.to_thread.run_sync( + lambda: _compose_with_retry(*args, compose_file=compose_file), + ) + return + except Exception as exc: # noqa: BLE001 + if not _is_buildx_wedge(exc): + raise + _log.warning( + "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 " + "(legacy builder). Recover the buildx state at your leisure: " + "rm -rf ~/.docker/buildx/activity && " + "docker buildx create --name decnet-builder --use --bootstrap", + label, + ) + # Outside the except so the second attempt's traceback isn't + # nested under the first failure if it also blows up. + await anyio.to_thread.run_sync( + lambda: _compose_with_retry( + *args, compose_file=compose_file, + env={"DOCKER_BUILDKIT": "0"}, + ), + ) + + def _decky_targets(decky_name: str, services: list[str]) -> list[str]: """Compose service names for one decky: base + each per-decky service. @@ -273,20 +336,15 @@ async def _materialise_decky_spawn( """ if await _live_topology_or_none(repo, topology_id) is None: return - import anyio - from decnet.engine.deployer import ( - _compose_with_retry, - _topology_compose_path, - ) + from decnet.engine.deployer import _topology_compose_path await _rerender_compose(repo, topology_id) targets = _decky_targets(decky_name, services) compose_path = _topology_compose_path(topology_id) try: - await anyio.to_thread.run_sync( - lambda: _compose_with_retry( - "up", "-d", "--no-deps", "--build", *targets, - compose_file=compose_path, - ), + await _compose_up_with_buildkit_fallback( + "up", "-d", "--no-deps", "--build", *targets, + compose_file=compose_path, + label=f"live add_decky topology={topology_id} decky={decky_name}", ) except Exception as exc: # noqa: BLE001 _log.error( @@ -429,19 +487,17 @@ async def _materialise_decky_services_diff( if await _live_topology_or_none(repo, topology_id) is None: return import anyio - from decnet.engine.deployer import ( - _compose, 
_compose_with_retry, _topology_compose_path, - ) + from decnet.engine.deployer import _compose, _topology_compose_path + await _rerender_compose(repo, topology_id) compose_path = _topology_compose_path(topology_id) add_targets = _decky_targets(decky_name, list(added))[1:] # drop the base if add_targets: try: - await anyio.to_thread.run_sync( - lambda: _compose_with_retry( - "up", "-d", "--no-deps", "--build", *add_targets, - compose_file=compose_path, - ), + await _compose_up_with_buildkit_fallback( + "up", "-d", "--no-deps", "--build", *add_targets, + compose_file=compose_path, + label=f"live update_decky add topology={topology_id} decky={decky_name}", ) except Exception as exc: # noqa: BLE001 _log.error( diff --git a/tests/mutator/test_ops_materialisation.py b/tests/mutator/test_ops_materialisation.py index 9a2a3c37..948b5354 100644 --- a/tests/mutator/test_ops_materialisation.py +++ b/tests/mutator/test_ops_materialisation.py @@ -298,6 +298,55 @@ async def test_update_decky_forwards_l3_flip_with_force_recreates_base( assert found, "expected force-recreate up against the base" +@pytest.mark.anyio +async def test_add_decky_falls_back_to_legacy_builder_on_buildx_wedge( + repo, monkeypatch, tmp_path, +): + """When the first compose up hits the buildx EROFS wedge, the + helper retries once with DOCKER_BUILDKIT=0 (legacy builder).""" + import subprocess + from decnet.engine import deployer as _deployer + from decnet.topology import compose as _compose_mod + + tid = await _make_active(repo) + lans = await repo.list_lans_for_topology(tid) + home_lan = lans[0]["name"] + + calls: list[dict] = [] + + def fake_compose_with_retry(*args, compose_file=None, env=None, **_): + calls.append({"args": args, "env": env}) + # First call: simulate the wedge. Second: succeed. 
+ if len(calls) == 1: + raise subprocess.CalledProcessError( + returncode=1, + cmd=["docker", "compose", *args], + output="", + stderr=( + "failed to update builder last activity time: open " + "/home/anti/.docker/buildx/activity/.tmp-X: " + "read-only file system" + ), + ) + + monkeypatch.setattr(_deployer, "_compose_with_retry", fake_compose_with_retry) + monkeypatch.setattr(_deployer, "_compose", lambda *a, **k: None) + monkeypatch.setattr( + _deployer, "_topology_compose_path", + lambda topo_id: tmp_path / f"compose-{topo_id[:8]}.yml", + ) + monkeypatch.setattr(_compose_mod, "write_topology_compose", lambda *a, **k: None) + + await apply_add_decky(repo, tid, { + "name": "wedgey", "lan": home_lan, "services": ["ssh"], + }) + + # Two attempts total; the second carries DOCKER_BUILDKIT=0. + assert len(calls) == 2 + assert calls[0]["env"] is None + assert calls[1]["env"] == {"DOCKER_BUILDKIT": "0"} + + @pytest.mark.anyio async def test_update_decky_refuses_gateway_promotion_on_non_dmz_lan( repo, stubs,