fix(engine): surface CalledProcessError.stderr in deploy-failure log + status reason

str(CalledProcessError) is just 'Command ... returned non-zero exit
status N' — the stderr (where the buildx recovery hint lives) was
being silently dropped from both the deploy log line and the
persisted 'failed' status reason.

New _format_subprocess_error helper appends .stderr when the
exception is a CalledProcessError. Applied to transition_status
reason and the background-deploy log message so operators and the
UI see the real failure, not just the exit code.

This is what makes the buildx preflight hint from 86b9dec actually
reach the user.
This commit is contained in:
2026-04-24 19:31:37 -04:00
parent 86b9decf80
commit 05d225ae38
2 changed files with 24 additions and 3 deletions

View File

@@ -162,6 +162,21 @@ def _count_leaked_buildkit_mounts() -> int:
return 0
def _format_subprocess_error(exc: BaseException) -> str:
    """Stringify an exception so a subprocess's stderr actually shows up.

    The default ``str(CalledProcessError)`` is just 'Command ... returned
    non-zero exit status N', which drops the stderr we carefully stuff
    our buildx recovery hint into. Status reasons and deploy-failure
    log lines were losing the payload — surface it here instead.

    Args:
        exc: Any exception. Only ``subprocess.CalledProcessError`` gets
            special treatment; everything else is passed through ``str``.

    Returns:
        ``"<str(exc)>: <stderr>"`` when *exc* is a ``CalledProcessError``
        whose stderr is non-blank after stripping, otherwise ``str(exc)``.
    """
    if isinstance(exc, subprocess.CalledProcessError):
        stderr = exc.stderr or ""
        if isinstance(stderr, (bytes, bytearray)):
            # stderr is bytes unless the process ran in text mode; decode it
            # so the message carries the real text, not a "b'...'" repr.
            # errors="replace" keeps this formatter from raising on
            # undecodable output while we are already handling a failure.
            stderr = stderr.decode("utf-8", errors="replace")
        stderr = stderr.strip()
        if stderr:
            return f"{exc}: {stderr}"
    return str(exc)
def _buildx_recovery_hint(extra: str = "") -> str:
head = (
"Buildx is wedged — Docker's build driver has leaked bind "
@@ -505,7 +520,8 @@ async def _deploy_on_agent(repo, topology_id: str, hydrated: dict) -> None:
topology_id, host.get("name"), exc,
)
await transition_status(
repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
repo, topology_id, TopologyStatus.FAILED,
reason=_format_subprocess_error(exc),
)
raise
@@ -691,7 +707,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
except OSError: # pragma: no cover
pass
await transition_status(
repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
repo, topology_id, TopologyStatus.FAILED,
reason=_format_subprocess_error(exc),
)
raise

View File

@@ -34,7 +34,11 @@ async def _run_deploy(topology_id: str) -> None:
except asyncio.CancelledError: # pragma: no cover — shutdown
raise
except Exception as exc: # noqa: BLE001
log.error("background deploy of %s failed: %s", topology_id, exc)
from decnet.engine.deployer import _format_subprocess_error
log.error(
"background deploy of %s failed: %s",
topology_id, _format_subprocess_error(exc),
)
@router.post(