fix(mutator): auto-fall-back to legacy builder when buildx wedges live decky add
apply_add_decky's compose-up was hard-failing whenever the operator's ~/.docker/buildx/activity/ landed on a read-only mount. The wedge detection in _compose_with_retry correctly refuses to retry (it would just leak more mounts), but for live materialisation we don't want a wedged buildx state to abort an admin's mutation. ANTI hit it on adding decky-a977: 'failed to update builder last activity time: ... read-only file system → buildx wedge detected → returned non-zero'.

_compose_up_with_buildkit_fallback wraps _compose_with_retry: on a CalledProcessError whose stderr matches both wedge signatures (_BUILDX_WEDGE_SIGNATURE + _BUILDX_EROFS_SIGNATURE), it logs a warning with the manual recovery steps and retries once with DOCKER_BUILDKIT=0 set. The legacy non-buildx builder doesn't use the activity dir and isn't affected.

Wired into the two paths that pass --build:

* _materialise_decky_spawn (apply_add_decky)
* _materialise_decky_services_diff (apply_update_decky service add)

_materialise_decky_recreate_base doesn't build — it just recreates a container from an existing image — so it's not affected.

The operator-facing log message points at the manual fix (rm -rf ~/.docker/buildx/activity + docker buildx create) so they can recover at their leisure; we don't ATTEMPT the recovery because the activity dir might be RO for a reason (zfs/btrfs snapshot, etc.) that an automated rm would be wrong to fight.
This commit is contained in:
@@ -186,6 +186,69 @@ async def _materialise_lan_change(
|
||||
)
|
||||
|
||||
|
||||
def _is_buildx_wedge(exc: BaseException) -> bool:
    """True when *exc* looks like the buildx EROFS wedge.

    We consult both the structured CalledProcessError.stderr and the
    str(exc) form because ``_compose_with_retry`` raises a synthetic
    CalledProcessError whose ``stderr`` contains the recovery hint
    (which preserves the wedge signatures verbatim).

    The match is case-insensitive on both sides and requires *both*
    signatures to be present. A ``stderr`` captured as bytes is decoded
    before searching rather than being searched in its ``b'...'`` repr.
    """
    from decnet.engine.deployer import (
        _BUILDX_EROFS_SIGNATURE, _BUILDX_WEDGE_SIGNATURE,
    )

    raw_stderr = getattr(exc, "stderr", None)
    if not raw_stderr:
        stderr = ""
    elif isinstance(raw_stderr, (bytes, bytearray)):
        # subprocess captures bytes unless text mode was requested;
        # str(bytes) would yield the escaped repr, not the text itself.
        stderr = bytes(raw_stderr).decode("utf-8", errors="replace")
    else:
        stderr = str(raw_stderr)

    haystack = (stderr + " " + str(exc)).lower()
    # The haystack is lowercased, so the needles must be too; otherwise
    # a signature containing any uppercase character could never match.
    return all(
        signature.lower() in haystack
        for signature in (_BUILDX_WEDGE_SIGNATURE, _BUILDX_EROFS_SIGNATURE)
    )
|
||||
|
||||
|
||||
async def _compose_up_with_buildkit_fallback(
    *args: str, compose_file, label: str,
) -> None:
    """Run ``compose up``; on a buildx wedge, retry once with the legacy builder.

    The first attempt hands *args* and *compose_file* to
    ``_compose_with_retry`` on a worker thread. If that raises and
    ``_is_buildx_wedge`` recognises the failure, a warning carrying the
    manual recovery commands is logged and the same call is made one
    more time with ``env={"DOCKER_BUILDKIT": "0"}``. Any other failure
    is re-raised untouched, as is a failure of the second attempt.

    *label* is only interpolated into the warning so an operator can
    grep the fall-back back to the originating operation.
    """
    import anyio
    from decnet.engine.deployer import _compose_with_retry

    def _default_builder_run():
        return _compose_with_retry(*args, compose_file=compose_file)

    def _legacy_builder_run():
        return _compose_with_retry(
            *args,
            compose_file=compose_file,
            env={"DOCKER_BUILDKIT": "0"},
        )

    try:
        await anyio.to_thread.run_sync(_default_builder_run)
    except Exception as first_failure:  # noqa: BLE001
        if not _is_buildx_wedge(first_failure):
            raise
        _log.warning(
            "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 "
            "(legacy builder). Recover the buildx state at your leisure: "
            "rm -rf ~/.docker/buildx/activity && "
            "docker buildx create --name decnet-builder --use --bootstrap",
            label,
        )
    else:
        # First attempt succeeded; nothing to fall back from.
        return

    # Deliberately run after the except block has finished, so that a
    # failure here is not chained under the first attempt's traceback.
    await anyio.to_thread.run_sync(_legacy_builder_run)
|
||||
|
||||
|
||||
def _decky_targets(decky_name: str, services: list[str]) -> list[str]:
|
||||
"""Compose service names for one decky: base + each per-decky service.
|
||||
|
||||
@@ -273,20 +336,15 @@ async def _materialise_decky_spawn(
|
||||
"""
|
||||
if await _live_topology_or_none(repo, topology_id) is None:
|
||||
return
|
||||
import anyio
|
||||
from decnet.engine.deployer import (
|
||||
_compose_with_retry,
|
||||
_topology_compose_path,
|
||||
)
|
||||
from decnet.engine.deployer import _topology_compose_path
|
||||
await _rerender_compose(repo, topology_id)
|
||||
targets = _decky_targets(decky_name, services)
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
try:
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry(
|
||||
"up", "-d", "--no-deps", "--build", *targets,
|
||||
compose_file=compose_path,
|
||||
),
|
||||
await _compose_up_with_buildkit_fallback(
|
||||
"up", "-d", "--no-deps", "--build", *targets,
|
||||
compose_file=compose_path,
|
||||
label=f"live add_decky topology={topology_id} decky={decky_name}",
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_log.error(
|
||||
@@ -429,19 +487,17 @@ async def _materialise_decky_services_diff(
|
||||
if await _live_topology_or_none(repo, topology_id) is None:
|
||||
return
|
||||
import anyio
|
||||
from decnet.engine.deployer import (
|
||||
_compose, _compose_with_retry, _topology_compose_path,
|
||||
)
|
||||
from decnet.engine.deployer import _compose, _topology_compose_path
|
||||
|
||||
await _rerender_compose(repo, topology_id)
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
add_targets = _decky_targets(decky_name, list(added))[1:] # drop the base
|
||||
if add_targets:
|
||||
try:
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry(
|
||||
"up", "-d", "--no-deps", "--build", *add_targets,
|
||||
compose_file=compose_path,
|
||||
),
|
||||
await _compose_up_with_buildkit_fallback(
|
||||
"up", "-d", "--no-deps", "--build", *add_targets,
|
||||
compose_file=compose_path,
|
||||
label=f"live update_decky add topology={topology_id} decky={decky_name}",
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_log.error(
|
||||
|
||||
Reference in New Issue
Block a user