feat(updater): remote self-update daemon with auto-rollback

Adds a separate `decnet updater` daemon on each worker that owns the
agent's release directory and installs tarball pushes from the master
over mTLS. A normal `/update` never touches the updater itself, so the
updater is always a known-good rescuer if a bad agent push breaks
/health — the rotation is reversed and the agent restarted against the
previous release. `POST /update-self` handles updater upgrades
explicitly (no auto-rollback).

- decnet/updater/: executor, FastAPI app, uvicorn launcher
- decnet/swarm/updater_client.py, tar_tree.py: master-side push
- cli: `decnet updater`, `decnet swarm update [--host|--all]
  [--include-self] [--dry-run]`, `--updater` on `swarm enroll`
- enrollment API issues a second cert (CN=updater@<host>) signed by the
  same CA; SwarmHost records updater_cert_fingerprint
- tests: executor, app, CLI, tar tree, enroll-with-updater (37 new)
- wiki: Remote-Updates page + sidebar + SWARM-Mode cross-link
This commit is contained in:
2026-04-18 21:40:21 -04:00
parent 8914c27220
commit 7765b36c50
16 changed files with 1814 additions and 4 deletions

View File

@@ -118,6 +118,10 @@ class SwarmHost(SQLModel, table=True):
# Timestamp of the last successful agent /health probe (defaults to ``None``)
last_heartbeat: Optional[datetime] = Field(default=None)
client_cert_fingerprint: str # SHA-256 hex of worker's issued client cert
# SHA-256 hex of the updater-identity cert, if the host was enrolled
# with ``--updater`` / ``issue_updater_bundle``. ``None`` for hosts
# that only have an agent identity.
updater_cert_fingerprint: Optional[str] = Field(default=None)
# Directory on the master where the per-worker cert bundle lives
cert_bundle_path: str
enrolled_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
@@ -281,6 +285,17 @@ class SwarmEnrollRequest(BaseModel):
description="Extra SANs (IPs / hostnames) to embed in the worker cert",
)
notes: Optional[str] = None
issue_updater_bundle: bool = PydanticField(
default=False,
description="If true, also issue an updater cert (CN=updater@<name>) for the remote self-updater",
)
class SwarmUpdaterBundle(BaseModel):
"""Subset of SwarmEnrolledBundle for the updater identity.

Carried in ``SwarmEnrolledBundle.updater``; the enroll route only builds
it when the request sets ``issue_updater_bundle``, otherwise that field
stays ``None``.
"""
# SHA-256 fingerprint of the issued updater certificate
fingerprint: str
# PEM text of the updater certificate
updater_cert_pem: str
# PEM text of the updater private key
updater_key_pem: str
class SwarmEnrolledBundle(BaseModel):
@@ -293,6 +308,7 @@ class SwarmEnrolledBundle(BaseModel):
ca_cert_pem: str
worker_cert_pem: str
worker_key_pem: str
updater: Optional[SwarmUpdaterBundle] = None
class SwarmHostView(BaseModel):
@@ -303,6 +319,7 @@ class SwarmHostView(BaseModel):
status: str
last_heartbeat: Optional[datetime] = None
client_cert_fingerprint: str
updater_cert_fingerprint: Optional[str] = None
enrolled_at: datetime
notes: Optional[str] = None

View File

@@ -12,13 +12,14 @@ from __future__ import annotations
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
router = APIRouter()
@@ -46,6 +47,26 @@ async def api_enroll_host(
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
pki.write_worker_bundle(issued, bundle_dir)
updater_view: Optional[SwarmUpdaterBundle] = None
updater_fp: Optional[str] = None
if req.issue_updater_bundle:
updater_cn = f"updater@{req.name}"
updater_sans = list({*sans, updater_cn, "127.0.0.1"})
updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
# Persist alongside the worker bundle for replay.
updater_dir = bundle_dir / "updater"
updater_dir.mkdir(parents=True, exist_ok=True)
(updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
(updater_dir / "updater.key").write_bytes(updater_issued.key_pem)
import os as _os
_os.chmod(updater_dir / "updater.key", 0o600)
updater_fp = updater_issued.fingerprint_sha256
updater_view = SwarmUpdaterBundle(
fingerprint=updater_fp,
updater_cert_pem=updater_issued.cert_pem.decode(),
updater_key_pem=updater_issued.key_pem.decode(),
)
host_uuid = str(_uuid.uuid4())
await repo.add_swarm_host(
{
@@ -55,6 +76,7 @@ async def api_enroll_host(
"agent_port": req.agent_port,
"status": "enrolled",
"client_cert_fingerprint": issued.fingerprint_sha256,
"updater_cert_fingerprint": updater_fp,
"cert_bundle_path": str(bundle_dir),
"enrolled_at": datetime.now(timezone.utc),
"notes": req.notes,
@@ -69,4 +91,5 @@ async def api_enroll_host(
ca_cert_pem=issued.ca_cert_pem.decode(),
worker_cert_pem=issued.cert_pem.decode(),
worker_key_pem=issued.key_pem.decode(),
updater=updater_view,
)