If the agent was started outside the updater (manually, during dev, or from a prior systemd unit), there is no agent.pid for _stop_agent to target, so a successful code install leaves the old in-memory agent process still serving requests. Scan /proc for any decnet agent command and SIGTERM all matches so restart is reliable regardless of how the agent was originally launched.
101 lines
3.2 KiB
Python
101 lines
3.2 KiB
Python
"""Worker-side FastAPI app.
|
|
|
|
Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started
|
|
with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any
|
|
client that cannot prove a cert signed by the DECNET CA is rejected before
|
|
reaching a handler. Once past the TLS handshake, all peers are trusted
|
|
equally (the only entity holding a CA-signed cert is the master
|
|
controller).
|
|
|
|
Endpoints mirror the existing unihost CLI verbs:
|
|
|
|
* ``POST /deploy`` — body: serialized ``DecnetConfig``
|
|
* ``POST /teardown`` — body: optional ``{"decky_id": "..."}``
|
|
* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}``
|
|
* ``GET /status`` — deployment snapshot
|
|
* ``GET /health`` — liveness probe, does NOT require mTLS? No — mTLS
|
|
still required; master pings it with its cert.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
from typing import Optional
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from pydantic import BaseModel, Field
|
|
|
|
from decnet.agent import executor as _exec
|
|
from decnet.config import DecnetConfig
|
|
from decnet.logging import get_logger
|
|
|
|
log = get_logger("agent.app")
|
|
|
|
app = FastAPI(
|
|
title="DECNET SWARM Agent",
|
|
version="0.1.0",
|
|
docs_url=None, # no interactive docs on worker — narrow attack surface
|
|
redoc_url=None,
|
|
openapi_url=None,
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ schemas
|
|
|
|
class DeployRequest(BaseModel):
|
|
config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker")
|
|
dry_run: bool = False
|
|
no_cache: bool = False
|
|
|
|
|
|
class TeardownRequest(BaseModel):
|
|
decky_id: Optional[str] = None
|
|
|
|
|
|
class MutateRequest(BaseModel):
|
|
decky_id: str
|
|
services: list[str]
|
|
|
|
|
|
# ------------------------------------------------------------------ routes
|
|
|
|
@app.get("/health")
|
|
async def health() -> dict[str, str]:
|
|
return {"status": "ok", "marker": "push-test-2"}
|
|
|
|
|
|
@app.get("/status")
|
|
async def status() -> dict:
|
|
return await _exec.status()
|
|
|
|
|
|
@app.post("/deploy")
|
|
async def deploy(req: DeployRequest) -> dict:
|
|
try:
|
|
await _exec.deploy(req.config, dry_run=req.dry_run, no_cache=req.no_cache)
|
|
except Exception as exc:
|
|
log.exception("agent.deploy failed")
|
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
return {"status": "deployed", "deckies": len(req.config.deckies)}
|
|
|
|
|
|
@app.post("/teardown")
|
|
async def teardown(req: TeardownRequest) -> dict:
|
|
try:
|
|
await _exec.teardown(req.decky_id)
|
|
except Exception as exc:
|
|
log.exception("agent.teardown failed")
|
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
return {"status": "torn_down", "decky_id": req.decky_id}
|
|
|
|
|
|
@app.post("/mutate")
|
|
async def mutate(req: MutateRequest) -> dict:
|
|
# Service rotation is routed through the deployer's existing mutate path
|
|
# by the master (worker-side mutate is a redeploy of a single decky with
|
|
# the new service set). For v1 we accept the request and ask the master
|
|
# to send a full /deploy with the updated DecnetConfig — simpler and
|
|
# avoids duplicating mutation logic on the worker.
|
|
raise HTTPException(
|
|
status_code=501,
|
|
detail="Per-decky mutate is performed via /deploy with updated services",
|
|
)
|