Files
DECNET/decnet/agent/app.py
anti 14250cacad feat(swarm): self-destruct agent on decommission
Decommissioning a worker from the dashboard (or swarm controller) now
asks the agent to wipe its own install before the master forgets it.
The agent stops decky containers + every decnet-* systemd unit, then
deletes /opt/decnet*, /etc/systemd/system/decnet-*, /var/lib/decnet/*,
and /usr/local/bin/decnet*. Logs under /var/log are preserved.

The reaper runs as a detached /tmp script (start_new_session=True) so
it survives the agent process being killed. Self-destruct dispatch is
best-effort — a dead worker doesn't block master-side cleanup.
2026-04-19 20:47:09 -04:00

115 lines
3.7 KiB
Python

"""Worker-side FastAPI app.
Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started
with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any
client that cannot prove a cert signed by the DECNET CA is rejected before
reaching a handler. Once past the TLS handshake, all peers are trusted
equally (the only entity holding a CA-signed cert is the master
controller).
Endpoints mirror the existing unihost CLI verbs:
* ``POST /deploy`` — body: serialized ``DecnetConfig``
* ``POST /teardown`` — body: optional ``{"decky_id": "..."}``
* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}``
* ``GET /status`` — deployment snapshot
* ``GET /health`` — liveness probe, does NOT require mTLS? No — mTLS
still required; master pings it with its cert.
"""
from __future__ import annotations
from typing import Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from decnet.agent import executor as _exec
from decnet.config import DecnetConfig
from decnet.logging import get_logger
log = get_logger("agent.app")
app = FastAPI(
title="DECNET SWARM Agent",
version="0.1.0",
docs_url=None, # no interactive docs on worker — narrow attack surface
redoc_url=None,
openapi_url=None,
)
# ------------------------------------------------------------------ schemas
class DeployRequest(BaseModel):
config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker")
dry_run: bool = False
no_cache: bool = False
class TeardownRequest(BaseModel):
decky_id: Optional[str] = None
class MutateRequest(BaseModel):
decky_id: str
services: list[str]
# ------------------------------------------------------------------ routes
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.get("/status")
async def status() -> dict:
return await _exec.status()
@app.post("/deploy")
async def deploy(req: DeployRequest) -> dict:
try:
await _exec.deploy(req.config, dry_run=req.dry_run, no_cache=req.no_cache)
except Exception as exc:
log.exception("agent.deploy failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "deployed", "deckies": len(req.config.deckies)}
@app.post("/teardown")
async def teardown(req: TeardownRequest) -> dict:
try:
await _exec.teardown(req.decky_id)
except Exception as exc:
log.exception("agent.teardown failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "torn_down", "decky_id": req.decky_id}
@app.post("/self-destruct")
async def self_destruct() -> dict:
"""Stop all DECNET services on this worker and delete the install
footprint. Called by the master during decommission. Logs under
/var/log/decnet* are preserved. Fire-and-forget — returns 202 before
the reaper starts deleting files."""
try:
await _exec.self_destruct()
except Exception as exc:
log.exception("agent.self_destruct failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "self_destruct_scheduled"}
@app.post("/mutate")
async def mutate(req: MutateRequest) -> dict:
# Service rotation is routed through the deployer's existing mutate path
# by the master (worker-side mutate is a redeploy of a single decky with
# the new service set). For v1 we accept the request and ask the master
# to send a full /deploy with the updated DecnetConfig — simpler and
# avoids duplicating mutation logic on the worker.
raise HTTPException(
status_code=501,
detail="Per-decky mutate is performed via /deploy with updated services",
)