Files
DECNET/decnet/web/router/swarm_mgmt/api_decommission_host.py
anti 14250cacad feat(swarm): self-destruct agent on decommission
Decommissioning a worker from the dashboard (or swarm controller) now
asks the agent to wipe its own install before the master forgets it.
The agent stops decky containers + every decnet-* systemd unit, then
deletes /opt/decnet*, /etc/systemd/system/decnet-*, /var/lib/decnet/*,
and /usr/local/bin/decnet*. Logs under /var/log are preserved.

The reaper runs as a detached /tmp script (start_new_session=True) so
it survives the agent process being killed. Self-destruct dispatch is
best-effort — a dead worker doesn't block master-side cleanup.
2026-04-19 20:47:09 -04:00

66 lines
2.1 KiB
Python

"""DELETE /swarm/hosts/{uuid} — decommission a worker from the dashboard.
Also instructs the worker agent to stop all DECNET services and delete
its install footprint (keeping logs). Agent self-destruct failure does
not block decommission — the master-side cleanup always runs so a dead
worker can still be removed from the dashboard.
"""
from __future__ import annotations
import pathlib
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm.decommission")
router = APIRouter()
@router.delete(
"/hosts/{uuid}",
status_code=status.HTTP_204_NO_CONTENT,
tags=["Swarm Management"],
)
async def decommission_host(
uuid: str,
admin: dict = Depends(require_admin),
repo: BaseRepository = Depends(get_repo),
) -> None:
row = await repo.get_swarm_host_by_uuid(uuid)
if row is None:
raise HTTPException(status_code=404, detail="host not found")
# Ask the worker to wipe its own install (keeps logs). The agent
# schedules the reaper as a detached process and returns immediately,
# so this call is fast when the worker is reachable. A dead worker
# shouldn't block the operator from cleaning up the dashboard entry,
# hence best-effort with a log and continue.
try:
async with AgentClient(host=row) as agent:
await agent.self_destruct()
except Exception:
log.exception(
"decommission: self-destruct dispatch failed host=%s"
"proceeding with master-side cleanup anyway",
row.get("name"),
)
await repo.delete_decky_shards_for_host(uuid)
await repo.delete_swarm_host(uuid)
bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
if bundle_dir.is_dir():
for child in bundle_dir.iterdir():
try:
child.unlink()
except OSError:
pass
try:
bundle_dir.rmdir()
except OSError:
pass