From 05d225ae3801389811cdd1e461d49d13286059de Mon Sep 17 00:00:00 2001
From: anti
Date: Fri, 24 Apr 2026 19:31:37 -0400
Subject: [PATCH] fix(engine): surface CalledProcessError.stderr in
 deploy-failure log + status reason
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

str(CalledProcessError) is just 'Command ... returned non-zero exit
status N' — the stderr (where the buildx recovery hint lives) was being
silently dropped from both the deploy log line and the persisted
'failed' status reason. New _format_subprocess_error helper appends
.stderr when the exception is a CalledProcessError. Applied to
transition_status reason and the background-deploy log message so
operators and the UI see the real failure, not just the exit code. This
is what makes the buildx preflight hint from 86b9dec actually reach the
user.
---
 decnet/engine/deployer.py | 21 +++++++++++++++++--
 .../router/topology/api_deploy_topology.py | 6 +++++-
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py
index c88b30a2..89569cb0 100644
--- a/decnet/engine/deployer.py
+++ b/decnet/engine/deployer.py
@@ -162,6 +162,21 @@ def _count_leaked_buildkit_mounts() -> int:
         return 0
 
 
+def _format_subprocess_error(exc: BaseException) -> str:
+    """Stringify CalledProcessError so stderr actually shows up.
+
+    The default str(CalledProcessError) is just 'Command ... returned
+    non-zero exit status N', which drops the stderr we carefully stuff
+    our buildx recovery hint into. Status reasons and deploy-failure
+    log lines were losing the payload — surface it here instead.
+    """
+    if isinstance(exc, subprocess.CalledProcessError):
+        stderr = (exc.stderr or "").strip()
+        if stderr:
+            return f"{exc}: {stderr}"
+    return str(exc)
+
+
 def _buildx_recovery_hint(extra: str = "") -> str:
     head = (
         "Buildx is wedged — Docker's build driver has leaked bind "
@@ -505,7 +520,8 @@ async def _deploy_on_agent(repo, topology_id: str, hydrated: dict) -> None:
             topology_id, host.get("name"), exc,
         )
         await transition_status(
-            repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
+            repo, topology_id, TopologyStatus.FAILED,
+            reason=_format_subprocess_error(exc),
         )
         raise
 
@@ -691,7 +707,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
         except OSError:  # pragma: no cover
             pass
         await transition_status(
-            repo, topology_id, TopologyStatus.FAILED, reason=str(exc)
+            repo, topology_id, TopologyStatus.FAILED,
+            reason=_format_subprocess_error(exc),
         )
         raise
 
diff --git a/decnet/web/router/topology/api_deploy_topology.py b/decnet/web/router/topology/api_deploy_topology.py
index c5b764b1..3256c015 100644
--- a/decnet/web/router/topology/api_deploy_topology.py
+++ b/decnet/web/router/topology/api_deploy_topology.py
@@ -34,7 +34,11 @@ async def _run_deploy(topology_id: str) -> None:
     except asyncio.CancelledError:  # pragma: no cover — shutdown
         raise
     except Exception as exc:  # noqa: BLE001
-        log.error("background deploy of %s failed: %s", topology_id, exc)
+        from decnet.engine.deployer import _format_subprocess_error
+        log.error(
+            "background deploy of %s failed: %s",
+            topology_id, _format_subprocess_error(exc),
+        )
 
 
 @router.post(