Files
DECNET/decnet/agent/app.py

145 lines
4.5 KiB
Python

"""Worker-side FastAPI app.
Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started
with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any
client that cannot prove a cert signed by the DECNET CA is rejected before
reaching a handler. Once past the TLS handshake, all peers are trusted
equally (the only entity holding a CA-signed cert is the master
controller).
Endpoints mirror the existing unihost CLI verbs:
* ``POST /deploy`` — body: serialized ``DecnetConfig``
* ``POST /teardown`` — body: optional ``{"decky_id": "..."}``
* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}``
* ``GET /status`` — deployment snapshot
* ``GET /health`` — liveness probe, does NOT require mTLS? No — mTLS
still required; master pings it with its cert.
"""
from __future__ import annotations
from contextlib import asynccontextmanager
from typing import Optional
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from decnet.agent import executor as _exec
from decnet.agent import heartbeat as _heartbeat
from decnet.config import DecnetConfig
from decnet.logging import get_logger
log = get_logger("agent.app")
@asynccontextmanager
async def _lifespan(app: FastAPI):
# Best-effort: if identity/bundle plumbing isn't configured (e.g. dev
# runs or non-enrolled hosts), heartbeat.start() is a silent no-op.
_heartbeat.start()
try:
yield
finally:
await _heartbeat.stop()
app = FastAPI(
title="DECNET SWARM Agent",
version="0.1.0",
docs_url=None, # no interactive docs on worker — narrow attack surface
redoc_url=None,
openapi_url=None,
lifespan=_lifespan,
responses={
400: {"description": "Malformed request body"},
500: {"description": "Executor error"},
},
)
# ------------------------------------------------------------------ schemas
class DeployRequest(BaseModel):
config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker")
dry_run: bool = False
no_cache: bool = False
class TeardownRequest(BaseModel):
decky_id: Optional[str] = None
class MutateRequest(BaseModel):
decky_id: str
services: list[str]
# ------------------------------------------------------------------ routes
@app.get("/health")
async def health() -> dict[str, str]:
return {"status": "ok"}
@app.get("/status")
async def status() -> dict:
return await _exec.status()
@app.post(
"/deploy",
responses={500: {"description": "Deployer raised an exception materialising the config"}},
)
async def deploy(req: DeployRequest) -> dict:
try:
await _exec.deploy(req.config, dry_run=req.dry_run, no_cache=req.no_cache)
except Exception as exc:
log.exception("agent.deploy failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "deployed", "deckies": len(req.config.deckies)}
@app.post(
"/teardown",
responses={500: {"description": "Teardown raised an exception"}},
)
async def teardown(req: TeardownRequest) -> dict:
try:
await _exec.teardown(req.decky_id)
except Exception as exc:
log.exception("agent.teardown failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "torn_down", "decky_id": req.decky_id}
@app.post(
"/self-destruct",
responses={500: {"description": "Reaper could not be scheduled"}},
)
async def self_destruct() -> dict:
"""Stop all DECNET services on this worker and delete the install
footprint. Called by the master during decommission. Logs under
/var/log/decnet* are preserved. Fire-and-forget — returns 202 before
the reaper starts deleting files."""
try:
await _exec.self_destruct()
except Exception as exc:
log.exception("agent.self_destruct failed")
raise HTTPException(status_code=500, detail=str(exc)) from exc
return {"status": "self_destruct_scheduled"}
@app.post(
"/mutate",
responses={501: {"description": "Worker-side mutate not yet implemented"}},
)
async def mutate(req: MutateRequest) -> dict:
# TODO: implement worker-side mutate. Currently the master performs
# mutation by re-sending a full /deploy with the updated DecnetConfig;
# this avoids duplicating mutation logic on the worker for v1. When
# ready, replace the 501 with a real redeploy-of-a-single-decky path.
raise HTTPException(
status_code=501,
detail="Per-decky mutate is performed via /deploy with updated services",
)