fix(mutator): auto-fall-back to legacy builder when buildx wedges live decky add

apply_add_decky's compose-up was hard-failing whenever the operator's
~/.docker/buildx/activity/ landed on a read-only mount — the wedge
detection in _compose_with_retry correctly refuses to retry (would
just leak more mounts), but for live materialisation we don't want a
wedged buildx state to abort an admin's mutation.  ANTI hit it while
adding decky-a977: 'failed to update builder last activity time: ...
read-only file system' → buildx wedge detected → returned non-zero.

_compose_up_with_buildkit_fallback wraps _compose_with_retry: on a
CalledProcessError whose stderr matches both wedge signatures
(_BUILDX_WEDGE_SIGNATURE + _BUILDX_EROFS_SIGNATURE), it logs a
warning with the manual recovery steps and retries once with
DOCKER_BUILDKIT=0 set.  The legacy non-buildx builder doesn't use
the activity dir and isn't affected.

Wired into the two paths that pass --build:
* _materialise_decky_spawn (apply_add_decky)
* _materialise_decky_services_diff (apply_update_decky service add)

_materialise_decky_recreate_base doesn't build — it just recreates a
container from an existing image — so it's not affected.

Operator-facing log message points at the manual fix
(rm -rf ~/.docker/buildx/activity + docker buildx create) so they
can recover at their leisure; we don't ATTEMPT the recovery because
the activity dir might be RO for a reason (zfs/btrfs snapshot, etc.)
that an automated rm would be wrong to fight.
This commit is contained in:
2026-04-29 10:59:04 -04:00
parent 892219ec87
commit 57e527534c
2 changed files with 123 additions and 18 deletions

View File

@@ -186,6 +186,69 @@ async def _materialise_lan_change(
)
def _is_buildx_wedge(exc: BaseException) -> bool:
    """Return True when *exc* looks like the buildx EROFS wedge.

    Both the structured ``stderr`` attribute (when present and non-empty)
    and the ``str(exc)`` form are searched, because ``_compose_with_retry``
    raises a synthetic CalledProcessError whose ``stderr`` contains the
    recovery hint (which preserves the wedge signatures verbatim).

    The match is case-insensitive and requires *both* signatures to be
    present; either one alone is not treated as a wedge.

    Args:
        exc: The exception raised by the compose invocation.

    Returns:
        ``True`` if both wedge signatures occur in the exception text,
        ``False`` otherwise.
    """
    from decnet.engine.deployer import (
        _BUILDX_EROFS_SIGNATURE,
        _BUILDX_WEDGE_SIGNATURE,
    )

    raw_stderr = getattr(exc, "stderr", None) or ""
    if isinstance(raw_stderr, (bytes, bytearray)):
        # subprocess captures bytes unless text mode was requested; str()
        # on bytes would give the ``b'...'`` repr with escaped characters
        # instead of the actual message text.
        stderr = bytes(raw_stderr).decode("utf-8", errors="replace")
    else:
        stderr = str(raw_stderr)

    haystack = f"{stderr} {exc}".lower()
    # The haystack is lower-cased, so fold the signatures the same way;
    # otherwise a signature with any upper-case character could never match.
    return all(
        signature.lower() in haystack
        for signature in (_BUILDX_WEDGE_SIGNATURE, _BUILDX_EROFS_SIGNATURE)
    )
async def _compose_up_with_buildkit_fallback(
    *args: str, compose_file, label: str,
) -> None:
    """Run a compose command, retrying once on the legacy builder if buildx is wedged.

    The command is first run through ``_compose_with_retry`` on a worker
    thread.  If that raises and ``_is_buildx_wedge`` recognises the
    exception, a warning carrying the manual recovery steps is logged and
    the same command is run one more time with ``env={"DOCKER_BUILDKIT": "0"}``.
    Any other exception from the first attempt propagates unchanged, as
    does any exception from the second attempt.

    Args:
        *args: Compose CLI arguments, forwarded verbatim to both attempts.
        compose_file: Compose file forwarded to ``_compose_with_retry``.
        label: Human-readable identifier used only in the warning log line
            so an operator can trace the fall-back to the originating op.
    """
    import anyio

    from decnet.engine.deployer import _compose_with_retry

    def _first_attempt() -> None:
        _compose_with_retry(*args, compose_file=compose_file)

    def _legacy_builder_attempt() -> None:
        _compose_with_retry(
            *args,
            compose_file=compose_file,
            env={"DOCKER_BUILDKIT": "0"},
        )

    try:
        await anyio.to_thread.run_sync(_first_attempt)
    except Exception as first_failure:  # noqa: BLE001
        if not _is_buildx_wedge(first_failure):
            raise
    else:
        # First attempt went through; nothing to fall back from.
        return

    _log.warning(
        "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 "
        "(legacy builder). Recover the buildx state at your leisure: "
        "rm -rf ~/.docker/buildx/activity && "
        "docker buildx create --name decnet-builder --use --bootstrap",
        label,
    )
    # Deliberately run after the except block has been left, so that a
    # failure of the retry is not reported as nested under the first one.
    await anyio.to_thread.run_sync(_legacy_builder_attempt)
def _decky_targets(decky_name: str, services: list[str]) -> list[str]:
"""Compose service names for one decky: base + each per-decky service.
@@ -273,20 +336,15 @@ async def _materialise_decky_spawn(
"""
if await _live_topology_or_none(repo, topology_id) is None:
return
import anyio
from decnet.engine.deployer import (
_compose_with_retry,
_topology_compose_path,
)
from decnet.engine.deployer import _topology_compose_path
await _rerender_compose(repo, topology_id)
targets = _decky_targets(decky_name, services)
compose_path = _topology_compose_path(topology_id)
try:
await anyio.to_thread.run_sync(
lambda: _compose_with_retry(
"up", "-d", "--no-deps", "--build", *targets,
compose_file=compose_path,
),
await _compose_up_with_buildkit_fallback(
"up", "-d", "--no-deps", "--build", *targets,
compose_file=compose_path,
label=f"live add_decky topology={topology_id} decky={decky_name}",
)
except Exception as exc: # noqa: BLE001
_log.error(
@@ -429,19 +487,17 @@ async def _materialise_decky_services_diff(
if await _live_topology_or_none(repo, topology_id) is None:
return
import anyio
from decnet.engine.deployer import (
_compose, _compose_with_retry, _topology_compose_path,
)
from decnet.engine.deployer import _compose, _topology_compose_path
await _rerender_compose(repo, topology_id)
compose_path = _topology_compose_path(topology_id)
add_targets = _decky_targets(decky_name, list(added))[1:] # drop the base
if add_targets:
try:
await anyio.to_thread.run_sync(
lambda: _compose_with_retry(
"up", "-d", "--no-deps", "--build", *add_targets,
compose_file=compose_path,
),
await _compose_up_with_buildkit_fallback(
"up", "-d", "--no-deps", "--build", *add_targets,
compose_file=compose_path,
label=f"live update_decky add topology={topology_id} decky={decky_name}",
)
except Exception as exc: # noqa: BLE001
_log.error(

View File

@@ -298,6 +298,55 @@ async def test_update_decky_forwards_l3_flip_with_force_recreates_base(
assert found, "expected force-recreate up against the base"
@pytest.mark.anyio
async def test_add_decky_falls_back_to_legacy_builder_on_buildx_wedge(
    repo, monkeypatch, tmp_path,
):
    """A buildx EROFS wedge on the first compose up triggers exactly one
    retry, and that retry carries DOCKER_BUILDKIT=0 (legacy builder)."""
    import subprocess

    from decnet.engine import deployer as _deployer
    from decnet.topology import compose as _compose_mod

    # stderr text the stand-in reports on its first invocation.
    wedge_stderr = (
        "failed to update builder last activity time: open "
        "/home/anti/.docker/buildx/activity/.tmp-X: "
        "read-only file system"
    )

    topology_id = await _make_active(repo)
    topology_lans = await repo.list_lans_for_topology(topology_id)
    home_lan = topology_lans[0]["name"]

    attempts: list[dict] = []

    def fake_compose_with_retry(*args, compose_file=None, env=None, **_):
        attempts.append({"args": args, "env": env})
        if len(attempts) != 1:
            # Every attempt after the first succeeds.
            return
        # First attempt: simulate the wedge.
        raise subprocess.CalledProcessError(
            returncode=1,
            cmd=["docker", "compose", *args],
            output="",
            stderr=wedge_stderr,
        )

    # Patches are applied in the same order as listed.
    stand_ins = (
        (_deployer, "_compose_with_retry", fake_compose_with_retry),
        (_deployer, "_compose", lambda *a, **k: None),
        (
            _deployer,
            "_topology_compose_path",
            lambda topo_id: tmp_path / f"compose-{topo_id[:8]}.yml",
        ),
        (_compose_mod, "write_topology_compose", lambda *a, **k: None),
    )
    for module, attr_name, replacement in stand_ins:
        monkeypatch.setattr(module, attr_name, replacement)

    new_decky = {"name": "wedgey", "lan": home_lan, "services": ["ssh"]}
    await apply_add_decky(repo, topology_id, new_decky)

    # Exactly two attempts: the wedged one, then the legacy-builder retry.
    assert len(attempts) == 2
    wedged, retried = attempts
    assert wedged["env"] is None
    assert retried["env"] == {"DOCKER_BUILDKIT": "0"}
@pytest.mark.anyio
async def test_update_decky_refuses_gateway_promotion_on_non_dmz_lan(
repo, stubs,