fix(mutator): auto-fall-back to legacy builder when buildx wedges live decky add
apply_add_decky's compose-up was hard-failing whenever the operator's ~/.docker/buildx/activity/ landed on a read-only mount. The wedge detection in _compose_with_retry correctly refuses to retry (retrying would just leak more mounts), but for live materialisation we don't want a wedged buildx state to abort an admin's mutation. ANTI hit it when adding decky-a977: 'failed to update builder last activity time: ... read-only file system → buildx wedge detected → returned non-zero'.

_compose_up_with_buildkit_fallback wraps _compose_with_retry: on a CalledProcessError whose stderr matches both wedge signatures (_BUILDX_WEDGE_SIGNATURE + _BUILDX_EROFS_SIGNATURE), it logs a warning with the manual recovery steps and retries once with DOCKER_BUILDKIT=0 set. The legacy non-buildx builder doesn't use the activity dir and isn't affected.

Wired into the two paths that pass --build:
* _materialise_decky_spawn (apply_add_decky)
* _materialise_decky_services_diff (apply_update_decky service add)

_materialise_decky_recreate_base doesn't build — it just recreates a container from an existing image — so it's not affected.

The operator-facing log message points at the manual fix (rm -rf ~/.docker/buildx/activity + docker buildx create) so they can recover at their leisure; we don't attempt the recovery automatically because the activity dir might be read-only for a reason (zfs/btrfs snapshot, etc.) that an automated rm would be wrong to fight.
This commit is contained in:
@@ -186,6 +186,69 @@ async def _materialise_lan_change(
|
||||
)
|
||||
|
||||
|
||||
def _is_buildx_wedge(exc: BaseException) -> bool:
    """Return True when *exc* looks like the buildx EROFS wedge.

    Both the structured ``stderr`` attribute (when the exception has one,
    e.g. ``CalledProcessError``) and the ``str(exc)`` form are searched,
    because ``_compose_with_retry`` raises a synthetic CalledProcessError
    whose ``stderr`` contains the recovery hint (which preserves the wedge
    signatures verbatim).

    Matching is case-insensitive on both sides: the haystack *and* the two
    signature constants are lower-cased, so a signature that contains an
    uppercase character still matches. A ``stderr`` captured as bytes is
    decoded instead of being passed through ``str()``, which would search
    the ``b'...'`` repr rather than the actual text.

    Args:
        exc: The exception raised by the compose invocation.

    Returns:
        True only when both ``_BUILDX_WEDGE_SIGNATURE`` and
        ``_BUILDX_EROFS_SIGNATURE`` occur in the combined text.
    """
    # Imported lazily, matching the other deployer imports in this module.
    from decnet.engine.deployer import (
        _BUILDX_EROFS_SIGNATURE,
        _BUILDX_WEDGE_SIGNATURE,
    )

    raw_stderr = getattr(exc, "stderr", None)
    if isinstance(raw_stderr, (bytes, bytearray)):
        stderr = bytes(raw_stderr).decode("utf-8", errors="replace")
    elif raw_stderr:
        stderr = str(raw_stderr)
    else:
        stderr = ""

    haystack = (stderr + " " + str(exc)).lower()
    return all(
        signature.lower() in haystack
        for signature in (_BUILDX_WEDGE_SIGNATURE, _BUILDX_EROFS_SIGNATURE)
    )
|
||||
|
||||
|
||||
async def _compose_up_with_buildkit_fallback(
    *args: str, compose_file, label: str,
) -> None:
    """Run ``compose up``; on a buildx wedge, retry once on the legacy builder.

    The first attempt calls ``_compose_with_retry(*args,
    compose_file=compose_file)`` in a worker thread. If it raises and
    ``_is_buildx_wedge`` recognises the failure, a warning carrying the
    manual recovery commands is logged and the same command is issued one
    more time with ``env={"DOCKER_BUILDKIT": "0"}``. Any other failure is
    re-raised untouched, and a failure of the second attempt propagates to
    the caller.

    *label* is a human-readable identifier used only in the log line so an
    operator can trace the fall-back to the originating operation.
    """
    import anyio
    from decnet.engine.deployer import _compose_with_retry

    def _run_compose(**extra):
        # Single call site for both attempts; *extra* carries the env
        # override on the fallback and is empty on the first attempt.
        return _compose_with_retry(*args, compose_file=compose_file, **extra)

    wedged = False
    try:
        await anyio.to_thread.run_sync(lambda: _run_compose())
    except Exception as exc:  # noqa: BLE001
        if not _is_buildx_wedge(exc):
            raise
        wedged = True

    if not wedged:
        return

    _log.warning(
        "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 "
        "(legacy builder). Recover the buildx state at your leisure: "
        "rm -rf ~/.docker/buildx/activity && "
        "docker buildx create --name decnet-builder --use --bootstrap",
        label,
    )

    # The retry runs after the except block has been left, so a second
    # failure is not reported as nested under the first one.
    await anyio.to_thread.run_sync(
        lambda: _run_compose(env={"DOCKER_BUILDKIT": "0"}),
    )
|
||||
|
||||
|
||||
def _decky_targets(decky_name: str, services: list[str]) -> list[str]:
|
||||
"""Compose service names for one decky: base + each per-decky service.
|
||||
|
||||
@@ -273,20 +336,15 @@ async def _materialise_decky_spawn(
|
||||
"""
|
||||
if await _live_topology_or_none(repo, topology_id) is None:
|
||||
return
|
||||
import anyio
|
||||
from decnet.engine.deployer import (
|
||||
_compose_with_retry,
|
||||
_topology_compose_path,
|
||||
)
|
||||
from decnet.engine.deployer import _topology_compose_path
|
||||
await _rerender_compose(repo, topology_id)
|
||||
targets = _decky_targets(decky_name, services)
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
try:
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry(
|
||||
"up", "-d", "--no-deps", "--build", *targets,
|
||||
compose_file=compose_path,
|
||||
),
|
||||
await _compose_up_with_buildkit_fallback(
|
||||
"up", "-d", "--no-deps", "--build", *targets,
|
||||
compose_file=compose_path,
|
||||
label=f"live add_decky topology={topology_id} decky={decky_name}",
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_log.error(
|
||||
@@ -429,19 +487,17 @@ async def _materialise_decky_services_diff(
|
||||
if await _live_topology_or_none(repo, topology_id) is None:
|
||||
return
|
||||
import anyio
|
||||
from decnet.engine.deployer import (
|
||||
_compose, _compose_with_retry, _topology_compose_path,
|
||||
)
|
||||
from decnet.engine.deployer import _compose, _topology_compose_path
|
||||
|
||||
await _rerender_compose(repo, topology_id)
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
add_targets = _decky_targets(decky_name, list(added))[1:] # drop the base
|
||||
if add_targets:
|
||||
try:
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry(
|
||||
"up", "-d", "--no-deps", "--build", *add_targets,
|
||||
compose_file=compose_path,
|
||||
),
|
||||
await _compose_up_with_buildkit_fallback(
|
||||
"up", "-d", "--no-deps", "--build", *add_targets,
|
||||
compose_file=compose_path,
|
||||
label=f"live update_decky add topology={topology_id} decky={decky_name}",
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_log.error(
|
||||
|
||||
@@ -298,6 +298,55 @@ async def test_update_decky_forwards_l3_flip_with_force_recreates_base(
|
||||
assert found, "expected force-recreate up against the base"
|
||||
|
||||
|
||||
@pytest.mark.anyio
async def test_add_decky_falls_back_to_legacy_builder_on_buildx_wedge(
    repo, monkeypatch, tmp_path,
):
    """When the first compose up hits the buildx EROFS wedge, the
    helper retries once with DOCKER_BUILDKIT=0 (legacy builder).

    Besides the env override, the retry must replay the very same compose
    command (arguments and compose file) as the failed first attempt.
    """
    import subprocess
    from decnet.engine import deployer as _deployer
    from decnet.topology import compose as _compose_mod

    tid = await _make_active(repo)
    lans = await repo.list_lans_for_topology(tid)
    home_lan = lans[0]["name"]

    # One record per _compose_with_retry invocation, in call order.
    calls: list[dict] = []

    def fake_compose_with_retry(*args, compose_file=None, env=None, **_):
        calls.append({"args": args, "compose_file": compose_file, "env": env})
        # First call: simulate the wedge. Second: succeed.
        if len(calls) == 1:
            raise subprocess.CalledProcessError(
                returncode=1,
                cmd=["docker", "compose", *args],
                output="",
                stderr=(
                    "failed to update builder last activity time: open "
                    "/home/anti/.docker/buildx/activity/.tmp-X: "
                    "read-only file system"
                ),
            )

    monkeypatch.setattr(_deployer, "_compose_with_retry", fake_compose_with_retry)
    monkeypatch.setattr(_deployer, "_compose", lambda *a, **k: None)
    monkeypatch.setattr(
        _deployer, "_topology_compose_path",
        lambda topo_id: tmp_path / f"compose-{topo_id[:8]}.yml",
    )
    monkeypatch.setattr(_compose_mod, "write_topology_compose", lambda *a, **k: None)

    await apply_add_decky(repo, tid, {
        "name": "wedgey", "lan": home_lan, "services": ["ssh"],
    })

    # Two attempts total; the second carries DOCKER_BUILDKIT=0.
    assert len(calls) == 2
    assert calls[0]["env"] is None
    assert calls[1]["env"] == {"DOCKER_BUILDKIT": "0"}

    # The fallback must re-issue the same build command, not a reduced one.
    assert "--build" in calls[0]["args"]
    assert calls[1]["args"] == calls[0]["args"]
    assert calls[1]["compose_file"] == calls[0]["compose_file"]
|
||||
|
||||
|
||||
@pytest.mark.anyio
|
||||
async def test_update_decky_refuses_gateway_promotion_on_non_dmz_lan(
|
||||
repo, stubs,
|
||||
|
||||
Reference in New Issue
Block a user