Replaces LICENSE (GPLv3 -> AGPLv3) and prepends `SPDX-License-Identifier: AGPL-3.0-or-later` to every source file across decnet/, decnet_web/, tests/, scripts/, and tools/. Rationale: closes the GPLv3 ASP loophole so any party operating a modified DECNET as a network service must offer their modified source. Personal copyright (Samuel Paschuan) + inbound=outbound contributions make a future unilateral relicense infeasible. - LICENSE: full AGPL-3.0 text (gnu.org/licenses/agpl-3.0.txt) - COPYRIGHT: project copyright notice - tools/add_spdx_headers.py: idempotent header injector (shebang- and PEP 263-aware) Touches 1565 source files (.py, .ts, .tsx, .js, .jsx, .css, .sh). No behavior change; comments only.
367 lines
12 KiB
Python
367 lines
12 KiB
Python
# SPDX-License-Identifier: AGPL-3.0-or-later
|
|
"""Worker-side FastAPI app.
|
|
|
|
Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started
|
|
with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any
|
|
client that cannot prove a cert signed by the DECNET CA is rejected before
|
|
reaching a handler. Once past the TLS handshake, all peers are trusted
|
|
equally (the only entity holding a CA-signed cert is the master
|
|
controller).
|
|
|
|
Endpoints mirror the existing unihost CLI verbs:
|
|
|
|
* ``POST /deploy`` — body: serialized ``DecnetConfig``
|
|
* ``POST /teardown`` — body: optional ``{"decky_id": "..."}``
|
|
* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}``
|
|
* ``GET /status`` — deployment snapshot
|
|
* ``GET /health`` — liveness probe, does NOT require mTLS? No — mTLS
|
|
still required; master pings it with its cert.
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import asyncio
|
|
import os
|
|
import pathlib
|
|
from contextlib import asynccontextmanager
|
|
from typing import Any, Optional
|
|
|
|
from fastapi import FastAPI, HTTPException
|
|
from fastapi.responses import JSONResponse
|
|
from pydantic import BaseModel, Field
|
|
|
|
import contextlib
|
|
|
|
from decnet.agent import executor as _exec
|
|
from decnet.agent import heartbeat as _heartbeat
|
|
from decnet.agent import topology_ops as _topology_ops
|
|
from decnet.bus.factory import get_bus
|
|
from decnet.bus.publish import run_health_heartbeat
|
|
from decnet.swarm.pki import DEFAULT_AGENT_DIR
|
|
from decnet.agent.topology_store import AlreadyApplied, TopologyStore
|
|
from decnet.config import DecnetConfig
|
|
from decnet.logging import get_logger
|
|
from decnet.topology.validate import ValidationError
|
|
|
|
log = get_logger("agent.app")
|
|
|
|
|
|
def _resolve_agent_dir() -> pathlib.Path:
|
|
env = os.environ.get("DECNET_AGENT_DIR")
|
|
if env:
|
|
return pathlib.Path(env)
|
|
system = pathlib.Path("/etc/decnet/agent")
|
|
if system.exists():
|
|
return system
|
|
return DEFAULT_AGENT_DIR
|
|
|
|
|
|
# Module-level singleton. Created lazily on first use so tests can
|
|
# monkeypatch DECNET_AGENT_DIR before the store binds to a path.
|
|
_topology_store: Optional[TopologyStore] = None
|
|
|
|
|
|
def _store() -> TopologyStore:
|
|
global _topology_store
|
|
if _topology_store is None:
|
|
_topology_store = TopologyStore(_resolve_agent_dir() / "topology.db")
|
|
return _topology_store
|
|
|
|
|
|
_collector_task: Optional[asyncio.Task] = None
|
|
|
|
|
|
def _ensure_collector_started() -> None:
|
|
"""Spawn the log collector on demand — called from /topology/apply
|
|
after a successful materialise. We must NOT start this in the
|
|
lifespan hook: the agent's boot invariant is "never touch docker
|
|
until master tells us to" (see tests/swarm/test_agent_no_auto_restore.py).
|
|
|
|
The collector watches ``decnet.topology.service=true`` labels via
|
|
docker events, writing RFC 5424 lines to ``DECNET_AGENT_LOG_FILE``
|
|
which the forwarder ships to the master over syslog-TLS. Idempotent:
|
|
subsequent calls while the task is still running are no-ops.
|
|
"""
|
|
global _collector_task
|
|
if _collector_task is not None and not _collector_task.done():
|
|
return
|
|
from decnet.env import DECNET_AGENT_LOG_FILE
|
|
|
|
try:
|
|
from decnet.collector.worker import log_collector_worker
|
|
except Exception: # noqa: BLE001 — docker may be unavailable on dev
|
|
log.warning(
|
|
"agent log collector not starting — collector worker import failed",
|
|
exc_info=True,
|
|
)
|
|
return
|
|
_collector_task = asyncio.create_task(
|
|
log_collector_worker(DECNET_AGENT_LOG_FILE),
|
|
name="agent-log-collector",
|
|
)
|
|
log.info("agent log collector started log_file=%s", DECNET_AGENT_LOG_FILE)
|
|
|
|
|
|
_bus_heartbeat_task: Optional[asyncio.Task] = None
|
|
|
|
|
|
@asynccontextmanager
|
|
async def _lifespan(app: FastAPI):
|
|
# Best-effort: if identity/bundle plumbing isn't configured (e.g. dev
|
|
# runs or non-enrolled hosts), heartbeat.start() is a silent no-op.
|
|
_heartbeat.start()
|
|
|
|
# Host-local bus heartbeat (system.agent.health). Separate channel
|
|
# from the mTLS master-facing heartbeat above; this one lets peers on
|
|
# the same host (dashboard, updater) see the agent is alive without
|
|
# hitting its HTTPS endpoint. Bus-disabled path is a no-op loop.
|
|
bus = None
|
|
try:
|
|
bus = get_bus(client_name="agent")
|
|
await bus.connect()
|
|
except Exception as exc: # noqa: BLE001
|
|
log.warning("agent: bus unavailable, skipping health heartbeat: %s", exc)
|
|
bus = None
|
|
|
|
global _bus_heartbeat_task
|
|
_bus_heartbeat_task = asyncio.create_task(
|
|
run_health_heartbeat(bus, "agent"),
|
|
name="agent-bus-heartbeat",
|
|
)
|
|
|
|
try:
|
|
yield
|
|
finally:
|
|
await _heartbeat.stop()
|
|
if _bus_heartbeat_task is not None:
|
|
_bus_heartbeat_task.cancel()
|
|
with contextlib.suppress(asyncio.CancelledError, Exception):
|
|
await _bus_heartbeat_task
|
|
_bus_heartbeat_task = None
|
|
if bus is not None:
|
|
with contextlib.suppress(Exception):
|
|
await bus.close()
|
|
global _collector_task
|
|
if _collector_task is not None and not _collector_task.done():
|
|
_collector_task.cancel()
|
|
try:
|
|
await _collector_task
|
|
except (asyncio.CancelledError, Exception): # noqa: BLE001
|
|
pass
|
|
_collector_task = None
|
|
global _topology_store
|
|
if _topology_store is not None:
|
|
_topology_store.close()
|
|
_topology_store = None
|
|
|
|
|
|
app = FastAPI(
|
|
title="DECNET SWARM Agent",
|
|
version="0.1.0",
|
|
docs_url=None, # no interactive docs on worker — narrow attack surface
|
|
redoc_url=None,
|
|
openapi_url=None,
|
|
lifespan=_lifespan,
|
|
responses={
|
|
400: {"description": "Malformed request body"},
|
|
500: {"description": "Executor error"},
|
|
},
|
|
)
|
|
|
|
|
|
# ------------------------------------------------------------------ schemas
|
|
|
|
class DeployRequest(BaseModel):
|
|
config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker")
|
|
dry_run: bool = False
|
|
no_cache: bool = False
|
|
|
|
|
|
class TeardownRequest(BaseModel):
|
|
decky_id: Optional[str] = None
|
|
|
|
|
|
class MutateRequest(BaseModel):
|
|
decky_id: str
|
|
services: list[str]
|
|
dry_run: bool = False
|
|
|
|
|
|
# ------------------------------------------------------------------ routes
|
|
|
|
@app.get("/health")
|
|
async def health() -> dict[str, str]:
|
|
return {"status": "ok"}
|
|
|
|
|
|
@app.get("/status")
|
|
async def status() -> dict:
|
|
return await _exec.status()
|
|
|
|
|
|
@app.post(
|
|
"/deploy",
|
|
status_code=202,
|
|
responses={202: {"description": "Deploy accepted; runs in background; lifecycle deltas pushed via heartbeat"}},
|
|
)
|
|
async def deploy(req: DeployRequest) -> dict:
|
|
"""Spawn the deploy in the background and return 202 immediately.
|
|
|
|
The master tracks per-decky completion via lifecycle deltas pushed on
|
|
the next heartbeat (one immediate push on completion, plus the
|
|
scheduled 30 s ticks as a fallback). Holding the request open across
|
|
a multi-minute compose build was the previous source of the wizard
|
|
API-hang."""
|
|
asyncio.create_task(
|
|
_exec.deploy_async(req.config, dry_run=req.dry_run, no_cache=req.no_cache),
|
|
name=f"deploy-{id(req)}",
|
|
)
|
|
return {"status": "accepted", "deckies": [d.name for d in req.config.deckies]}
|
|
|
|
|
|
@app.post(
|
|
"/teardown",
|
|
responses={500: {"description": "Teardown raised an exception"}},
|
|
)
|
|
async def teardown(req: TeardownRequest) -> dict:
|
|
try:
|
|
await _exec.teardown(req.decky_id)
|
|
except Exception as exc:
|
|
log.exception("agent.teardown failed")
|
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
return {"status": "torn_down", "decky_id": req.decky_id}
|
|
|
|
|
|
@app.post(
|
|
"/self-destruct",
|
|
responses={500: {"description": "Reaper could not be scheduled"}},
|
|
)
|
|
async def self_destruct() -> dict:
|
|
"""Stop all DECNET services on this worker and delete the install
|
|
footprint. Called by the master during decommission. Logs under
|
|
/var/log/decnet* are preserved. Fire-and-forget — returns 202 before
|
|
the reaper starts deleting files."""
|
|
try:
|
|
await _exec.self_destruct()
|
|
except Exception as exc:
|
|
log.exception("agent.self_destruct failed")
|
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
return {"status": "self_destruct_scheduled"}
|
|
|
|
|
|
# ------------------------------------------------------- topology endpoints
|
|
|
|
|
|
class ApplyTopologyRequest(BaseModel):
|
|
hydrated: dict[str, Any] = Field(
|
|
..., description="Hydrated topology dict from master.persistence.hydrate()"
|
|
)
|
|
version_hash: str = Field(
|
|
..., description="Master's canonical_hash(hydrated); must match ours"
|
|
)
|
|
|
|
|
|
class TeardownTopologyRequest(BaseModel):
|
|
topology_id: str = Field(..., description="Topology UUID to dismantle")
|
|
|
|
|
|
@app.post(
|
|
"/topology/apply",
|
|
responses={
|
|
400: {"description": "Malformed hydrated topology or hash mismatch"},
|
|
409: {"description": "A different topology is already applied"},
|
|
500: {"description": "Docker or compose raised while applying"},
|
|
},
|
|
)
|
|
async def topology_apply(req: ApplyTopologyRequest) -> dict:
|
|
store = _store()
|
|
try:
|
|
await _topology_ops.apply(req.hydrated, req.version_hash, store)
|
|
except _topology_ops.HashMismatch as exc:
|
|
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
|
except ValidationError as exc:
|
|
raise HTTPException(status_code=400, detail=str(exc)) from exc
|
|
except AlreadyApplied as exc:
|
|
raise HTTPException(status_code=409, detail=str(exc)) from exc
|
|
except Exception as exc:
|
|
log.exception("agent.topology_apply failed")
|
|
topology_id = (req.hydrated.get("topology") or {}).get("id")
|
|
if topology_id:
|
|
try:
|
|
store.record_error(
|
|
str(topology_id), str(exc)[:500], hydrated=req.hydrated,
|
|
)
|
|
except Exception: # noqa: BLE001 — don't mask original failure
|
|
log.exception("failed to record apply error")
|
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
_ensure_collector_started()
|
|
return {"status": "applied", "version_hash": req.version_hash}
|
|
|
|
|
|
@app.post(
|
|
"/topology/teardown",
|
|
responses={500: {"description": "Docker or compose raised while tearing down"}},
|
|
)
|
|
async def topology_teardown(req: TeardownTopologyRequest) -> dict:
|
|
try:
|
|
await _topology_ops.teardown(req.topology_id, _store())
|
|
except Exception as exc:
|
|
log.exception("agent.topology_teardown failed")
|
|
raise HTTPException(status_code=500, detail=str(exc)) from exc
|
|
return {"status": "torn_down", "topology_id": req.topology_id}
|
|
|
|
|
|
@app.get("/topology/state")
|
|
async def topology_state() -> dict:
|
|
return _topology_ops.state(_store())
|
|
|
|
|
|
@app.post(
|
|
"/mutate",
|
|
status_code=202,
|
|
responses={
|
|
202: {"description": "Mutate accepted; runs in background; lifecycle delta pushed via heartbeat"},
|
|
404: {"description": "No active deployment, or unknown decky_id (dry_run validation only)"},
|
|
},
|
|
)
|
|
async def mutate(req: MutateRequest) -> Any:
|
|
"""Spawn the mutate in the background and return 202 immediately.
|
|
|
|
Master tracks completion via a lifecycle delta pushed on the next
|
|
heartbeat (immediate push on completion). ``dry_run`` is still
|
|
synchronous — it validates against the worker's current state and
|
|
returns the would-be services without spawning a task or touching
|
|
docker, so the wizard's preview path stays cheap."""
|
|
if req.dry_run:
|
|
from decnet.config import load_state
|
|
state = load_state()
|
|
if state is None:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail="no active deployment on this worker",
|
|
)
|
|
cfg, _ = state
|
|
decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
|
|
if decky is None:
|
|
raise HTTPException(
|
|
status_code=404,
|
|
detail=f"decky {req.decky_id!r} not found in worker state",
|
|
)
|
|
return JSONResponse(
|
|
status_code=200,
|
|
content={
|
|
"status": "dry_run",
|
|
"decky_id": req.decky_id,
|
|
"services": list(req.services),
|
|
},
|
|
)
|
|
|
|
asyncio.create_task(
|
|
_exec.mutate_async(req.decky_id, list(req.services)),
|
|
name=f"mutate-{req.decky_id}",
|
|
)
|
|
return {
|
|
"status": "accepted",
|
|
"decky_id": req.decky_id,
|
|
"services": list(req.services),
|
|
}
|