merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -5,14 +5,65 @@ from .auth.api_change_pass import router as change_pass_router
from .logs.api_get_logs import router as logs_router
from .logs.api_get_histogram import router as histogram_router
from .bounty.api_get_bounties import router as bounty_router
from .credentials.api_get_credentials import router as credentials_router
from .credential_reuse.api_get_credential_reuse import router as credential_reuse_router
from .stats.api_get_stats import router as stats_router
from .fleet.api_get_deckies import router as get_deckies_router
from .fleet.api_mutate_decky import router as mutate_decky_router
from .fleet.api_mutate_interval import router as mutate_interval_router
from .fleet.api_deploy_deckies import router as deploy_deckies_router
from .stream.api_stream_events import router as stream_router
from .attackers.api_get_attackers import router as attackers_router
from .attackers.api_get_attacker_detail import router as attacker_detail_router
from .attackers.api_get_attacker_commands import router as attacker_commands_router
from .attackers.api_get_attacker_artifacts import router as attacker_artifacts_router
from .attackers.api_get_attacker_transcripts import router as attacker_transcripts_router
from .attackers.api_get_attacker_smtp_targets import router as attacker_smtp_targets_router
from .attackers.api_get_attacker_mail import router as attacker_mail_router
from .attackers.api_get_attacker_intel import router as attacker_intel_router
from .identities.api_list_identities import router as identities_list_router
from .identities.api_get_identity_detail import router as identity_detail_router
from .identities.api_list_identity_observations import router as identity_observations_router
from .identities.api_events import router as identity_events_router
from .campaigns.api_list_campaigns import router as campaigns_list_router
from .campaigns.api_get_campaign_detail import router as campaign_detail_router
from .campaigns.api_list_campaign_identities import router as campaign_identities_router
from .campaigns.api_events import router as campaign_events_router
from .orchestrator.api_list_events import router as orchestrator_list_router
from .orchestrator.api_events import router as orchestrator_events_router
from .realism.api_config import router as realism_config_router
from .realism.api_personas import router as realism_personas_router
from .realism.api_synthetic_files import router as realism_synthetic_files_router
from .transcripts import transcripts_router
from .config.api_get_config import router as config_get_router
from .config.api_update_config import router as config_update_router
from .config.api_manage_users import router as config_users_router
from .config.api_reinit import router as config_reinit_router
from .health.api_get_health import router as health_router
from .workers.api_list_workers import router as workers_list_router
from .workers.api_control_worker import router as workers_control_router
from .workers.api_start_worker import router as workers_start_router
from .workers.api_start_all_workers import router as workers_start_all_router
from .artifacts.api_get_artifact import router as artifacts_router
from .swarm_updates import swarm_updates_router
from .swarm_mgmt import swarm_mgmt_router
from .system import system_router
from .topology import topology_router
from .canary import canary_router
from .webhooks import webhooks_router
api_router = APIRouter()
api_router = APIRouter(
# Every route under /api/v1 is auth-guarded (either by an explicit
# require_* Depends or by the global auth middleware). Document the shared
# error responses (400/401/403/404/409) here so the OpenAPI schema reflects
# reality for contract tests.
responses={
400: {"description": "Malformed request body"},
401: {"description": "Missing or invalid credentials"},
403: {"description": "Authenticated but not authorized"},
404: {"description": "Referenced resource does not exist"},
409: {"description": "Conflict with existing resource"},
},
)
# Authentication
api_router.include_router(login_router)
@@ -25,12 +76,86 @@ api_router.include_router(histogram_router)
# Bounty Vault
api_router.include_router(bounty_router)
# Credentials (deduped attacker auth attempts)
api_router.include_router(credentials_router)
# Credential reuse findings (cross-decky/cross-service same-secret hits)
api_router.include_router(credential_reuse_router)
# Fleet Management
api_router.include_router(get_deckies_router)
api_router.include_router(mutate_decky_router)
api_router.include_router(mutate_interval_router)
api_router.include_router(deploy_deckies_router)
# Attacker Profiles
api_router.include_router(attackers_router)
api_router.include_router(attacker_detail_router)
api_router.include_router(attacker_commands_router)
api_router.include_router(attacker_artifacts_router)
api_router.include_router(attacker_transcripts_router)
api_router.include_router(attacker_smtp_targets_router)
api_router.include_router(attacker_mail_router)
api_router.include_router(attacker_intel_router)
# Identity Resolution (read-only; populated by the clusterer worker —
# see development/IDENTITY_RESOLUTION.md). Empty until the clusterer
# ships; the API surface lands first so frontend + downstream work
# can target a stable shape.
api_router.include_router(identities_list_router)
api_router.include_router(identity_detail_router)
api_router.include_router(identity_observations_router)
api_router.include_router(identity_events_router)
api_router.include_router(campaigns_list_router)
api_router.include_router(campaign_detail_router)
api_router.include_router(campaign_identities_router)
api_router.include_router(campaign_events_router)
api_router.include_router(orchestrator_list_router)
api_router.include_router(orchestrator_events_router)
# Realism — global persona pool CRUD for the dashboard's
# "Persona Generation" page. The orchestrator reads from the same
# on-disk JSON file directly (see decnet.realism.personas_pool).
api_router.include_router(realism_personas_router)
api_router.include_router(realism_synthetic_files_router)
api_router.include_router(realism_config_router)
# Observability
api_router.include_router(stats_router)
api_router.include_router(stream_router)
api_router.include_router(health_router)
api_router.include_router(workers_list_router)
api_router.include_router(workers_control_router)
api_router.include_router(workers_start_router)
api_router.include_router(workers_start_all_router)
# Configuration
api_router.include_router(config_get_router)
api_router.include_router(config_update_router)
api_router.include_router(config_users_router)
api_router.include_router(config_reinit_router)
# Artifacts (captured attacker file drops)
api_router.include_router(artifacts_router)
# Transcripts (PTY session recordings, paged asciinema events)
api_router.include_router(transcripts_router)
# Remote Updates (dashboard → worker updater daemons)
api_router.include_router(swarm_updates_router)
# Swarm Management (dashboard: hosts, deckies, agent enrollment bundles)
api_router.include_router(swarm_mgmt_router)
# System info (deployment-mode auto-detection, etc.)
api_router.include_router(system_router)
# MazeNET Topologies (nested topology CRUD + mutation queue)
api_router.include_router(topology_router)
# Canary tokens — operator-facing CRUD (worker hosts the
# attacker-facing surface separately via `decnet canary`).
api_router.include_router(canary_router)
# External webhook subscriptions (SIEM/SOAR egress)
api_router.include_router(webhooks_router)

View File

View File

@@ -0,0 +1,95 @@
"""
Artifact download endpoint.
SSH and SMTP deckies farm attacker file drops into a host-mounted quarantine:
/var/lib/decnet/artifacts/{decky}/{service}/{stored_as} (service is ssh or smtp)
The capture event already flows through the normal log pipeline (one
RFC 5424 line per capture, see templates/ssh/emit_capture.py), so metadata
is served via /logs. This endpoint exists only to retrieve the raw bytes —
admin-gated because the payloads are attacker-controlled content.
"""
from __future__ import annotations
import os
import re
from pathlib import Path
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import FileResponse
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin
router = APIRouter()
# Override via env for tests; the prod path matches the bind mount declared in
# decnet/services/ssh.py and decnet/services/smtp.py.
ARTIFACTS_ROOT = Path(os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"))
# decky names come from the deployer — lowercase alnum plus hyphens.
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
# Services that own an artifacts subdir. Kept explicit so a caller can't
# pivot into arbitrary subpaths via the query string.
_ALLOWED_SERVICES = {"ssh", "smtp"}
# stored_as is assembled by the capturing template as:
# ${ts}_${sha:0:12}_${base}
# where ts is ISO-8601 UTC (e.g. 2026-04-18T02:22:56Z), sha is 12 hex chars,
# and base is the original filename's basename. Keep the filename charset
# tight but allow common punctuation dropped files actually use.
_STORED_AS_RE = re.compile(
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$"
)
def _resolve_artifact_path(decky: str, stored_as: str, service: str) -> Path:
    """Turn a (decky, service, stored_as) triple into a path under ARTIFACTS_ROOT.

    Each component is checked against its allow-list or regex first; the
    joined path is then resolved and must still sit inside the resolved
    artifacts root.

    Raises:
        HTTPException: status 400 when a component is rejected or when the
            resolved path ends up outside the artifacts root.
    """
    # Guard clauses run in a fixed order (service, decky, stored_as) so the
    # first failing component decides the error detail.
    if service not in _ALLOWED_SERVICES:
        raise HTTPException(status_code=400, detail="invalid service")
    if _DECKY_RE.fullmatch(decky) is None:
        raise HTTPException(status_code=400, detail="invalid decky name")
    if _STORED_AS_RE.fullmatch(stored_as) is None:
        raise HTTPException(status_code=400, detail="invalid stored_as")

    base = ARTIFACTS_ROOT.resolve()
    target = base.joinpath(decky, service, stored_as).resolve()

    # Defence-in-depth: the regexes already reject `..`, but resolving may
    # follow a symlink, so re-check containment on the resolved path.
    stays_inside = target == base or base in target.parents
    if not stays_inside:
        raise HTTPException(status_code=400, detail="path escapes artifacts root")
    return target
@router.get(
    "/artifacts/{decky}/{stored_as}",
    tags=["Artifacts"],
    responses={
        400: {"description": "Invalid decky, service, or stored_as parameter"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Artifact not found"},
    },
)
@_traced("api.get_artifact")
async def get_artifact(
    decky: str,
    stored_as: str,
    service: str = Query("ssh", pattern=r"^[a-z]{1,16}$"),
    admin: dict = Depends(require_admin),
) -> FileResponse:
    # Serve the raw bytes of one captured artifact as an opaque download.
    # Input validation and the root-containment check live in
    # _resolve_artifact_path, which raises 400 on any violation.
    artifact = _resolve_artifact_path(decky, stored_as, service)
    if not artifact.is_file():
        raise HTTPException(status_code=404, detail="artifact not found")

    # Force "save as" handling and forbid MIME sniffing: the payload is
    # attacker-controlled content.
    download_headers = {
        "Content-Disposition": f'attachment; filename="{stored_as}"',
        "X-Content-Type-Options": "nosniff",
    }
    return FileResponse(
        path=str(artifact),
        media_type="application/octet-stream",
        filename=stored_as,
        headers=download_headers,
    )

View File

View File

@@ -0,0 +1,34 @@
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@router.get(
    "/attackers/{uuid}/artifacts",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_artifacts")
async def get_attacker_artifacts(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List captured file-drop artifacts for an attacker (newest first).
    Each entry is a `file_captured` log row — the frontend renders the
    badge/drawer using the same `fields` payload as /logs.
    """
    # Existence check first: an unknown UUID is a 404, not an empty list.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    artifacts = await repo.get_attacker_artifacts(uuid)
    # No pagination on this endpoint: total is simply the row count returned.
    return {"total": len(artifacts), "data": artifacts}

View File

@@ -0,0 +1,42 @@
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@router.get(
    "/attackers/{uuid}/commands",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
        422: {"description": "Query parameter validation error (limit/offset out of range or invalid)"},
    },
)
@_traced("api.get_attacker_commands")
async def get_attacker_commands(
    uuid: str,
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0, le=2147483647),
    service: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve paginated commands for an attacker profile."""
    # Unknown attacker -> 404 before any command lookup is attempted.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    # Placeholder strings that stand for "no filter" collapse to None so
    # the repository sees a real absence of a service filter.
    blank_markers = (None, "null", "NULL", "undefined", "")
    service_filter = None if service in blank_markers else service

    page = await repo.get_attacker_commands(
        uuid=uuid, limit=limit, offset=offset, service=service_filter,
    )
    return {
        "total": page["total"],
        "limit": limit,
        "offset": offset,
        "data": page["data"],
    }

View File

@@ -0,0 +1,44 @@
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.correlation.event_kinds import bucket_services
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@router.get(
    "/attackers/{uuid}",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_detail")
async def get_attacker_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve a single attacker profile by UUID (with behavior block)."""
    profile = await repo.get_attacker_by_uuid(uuid)
    if not profile:
        raise HTTPException(status_code=404, detail="Attacker not found")

    # The profile row is enriched in place, one repository call per block.
    profile["behavior"] = await repo.get_attacker_behavior(uuid)

    # Scanned vs. interacted-with: derived on every request from the log
    # stream rather than stored. The DISTINCT is bounded by service ×
    # event_type cardinality, so it stays cheap, and classifier changes
    # show up immediately without waiting for a profiler re-tick.
    activity_pairs = await repo.get_attacker_service_activity(uuid)
    profile["service_activity"] = bucket_services(activity_pairs)

    # Attribution leaks (XFF / Forwarded / X-Real-IP mismatches captured by
    # the HTTP bounty extractor). Only 10 entries are returned so a rotation
    # attack with 100s of forged XFF values cannot flood the UI;
    # `ip_leaks_total` carries the unbounded count so the UI can render a
    # ROTATION DETECTED badge once the count crosses a threshold.
    profile["ip_leaks"] = await repo.get_attacker_ip_leaks(uuid, limit=10)
    profile["ip_leaks_total"] = await repo.count_attacker_ip_leaks(uuid)
    return profile

View File

@@ -0,0 +1,38 @@
"""GET /api/v1/attackers/{uuid}/intel — latest threat-intel row for an attacker."""
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/attackers/{uuid}/intel",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No intel cached for this attacker"},
    },
)
@_traced("api.get_attacker_intel")
async def get_attacker_intel(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the most recent cached threat-intel verdict for an attacker.
    The row is populated out-of-band by the ``decnet enrich`` worker
    (typically within seconds of first observation, sub-second when the
    bus is healthy). 404 means either the worker has not run yet or the
    UUID does not correspond to an attacker DECNET has seen.
    """
    # A single lookup: there is no separate attacker-existence check here,
    # so "unknown attacker" and "not enriched yet" both surface as 404.
    intel = await repo.get_attacker_intel_by_uuid(uuid)
    if intel:
        return intel
    raise HTTPException(
        status_code=404, detail="No intel cached for this attacker",
    )

View File

@@ -0,0 +1,37 @@
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
router = APIRouter()
@router.get(
    "/attackers/{uuid}/mail",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_mail")
async def get_attacker_mail(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> dict[str, Any]:
    """List stored messages this attacker relayed via the SMTP honeypots.
    Each entry is a ``message_stored`` log row — headers + attachment
    manifest live in ``fields``; the raw .eml bytes are fetched via
    ``/artifacts/{decky}/{stored_as}?service=smtp`` (also admin-gated).
    Admin-only because message bodies are attacker-controlled content
    and may include phishing kits / malware droppers.
    """
    # Existence check first: an unknown UUID is a 404, not an empty list.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    messages = await repo.get_attacker_stored_mail(uuid)
    # No pagination on this endpoint: total is simply the row count returned.
    return {"total": len(messages), "data": messages}

View File

@@ -0,0 +1,36 @@
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@router.get(
    "/attackers/{uuid}/smtp-targets",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_smtp_targets")
async def get_attacker_smtp_targets(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List victim domains this attacker targeted via the SMTP honeypots.
    Rows are ordered by most-recent activity. Each row is one
    (attacker, domain) pair with a running count + first/last seen — no
    local-parts (user names) are ever stored, so this is safe to show
    to any viewer role.
    """
    # Existence check first: an unknown UUID is a 404, not an empty list.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    targets = await repo.list_smtp_targets(uuid)
    # No pagination on this endpoint: total is simply the row count returned.
    return {"total": len(targets), "data": targets}

View File

@@ -0,0 +1,34 @@
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@router.get(
    "/attackers/{uuid}/transcripts",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_transcripts")
async def get_attacker_transcripts(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List PTY session recordings for an attacker (newest first).
    Each entry is a `session_recorded` log row — the frontend lists them
    in the AttackerDetail Sessions tab and opens SessionDrawer on click.
    """
    # Existence check first: an unknown UUID is a 404, not an empty list.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    recordings = await repo.get_attacker_transcripts(uuid)
    # No pagination on this endpoint: total is simply the row count returned.
    return {"total": len(recordings), "data": recordings}

View File

@@ -0,0 +1,83 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import AttackersResponse
router = APIRouter()
# Same pattern as /logs — cache the unfiltered total count; filtered
# counts go straight to the DB.
_TOTAL_TTL = 2.0
_total_cache: tuple[Optional[int], float] = (None, 0.0)
_total_lock: Optional[asyncio.Lock] = None
def _reset_total_cache() -> None:
    """Forget the cached attacker total and discard the lock guarding it.

    Both module globals return to their initial state; the lock is created
    again lazily by the next cached lookup.
    """
    global _total_cache, _total_lock
    _total_lock = None
    _total_cache = (None, 0.0)
async def _get_total_attackers_cached() -> int:
    """Return the unfiltered attacker count, reusing it for up to _TOTAL_TTL seconds.

    The cache is consulted once without the lock and once more after the
    lock is acquired, so callers that queued behind a refresh reuse its
    result instead of issuing their own count query.
    """
    global _total_cache, _total_lock

    def _fresh_total() -> Optional[int]:
        # A cached value counts only while it is younger than the TTL.
        cached, stamp = _total_cache
        if cached is not None and time.monotonic() - stamp < _TOTAL_TTL:
            return cached
        return None

    hit = _fresh_total()
    if hit is not None:
        return hit

    # The lock is created on first use rather than at import time.
    if _total_lock is None:
        _total_lock = asyncio.Lock()

    async with _total_lock:
        hit = _fresh_total()
        if hit is not None:
            return hit
        total = await repo.get_total_attackers()
        _total_cache = (total, time.monotonic())
        return total
@router.get(
    "/attackers",
    response_model=AttackersResponse,
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.get_attackers")
async def get_attackers(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    search: Optional[str] = None,
    sort_by: str = Query("recent", pattern="^(recent|active|traversals)$"),
    service: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve paginated attacker profiles."""
    def _blank_to_none(raw: Optional[str]) -> Optional[str]:
        # Placeholder strings that stand for "no filter" collapse to None.
        return None if raw in (None, "null", "NULL", "undefined", "") else raw

    search_term = _blank_to_none(search)
    service_filter = _blank_to_none(service)

    page = await repo.get_attackers(
        limit=limit,
        offset=offset,
        search=search_term,
        sort_by=sort_by,
        service=service_filter,
    )

    # Only the unfiltered total goes through the short-lived cache;
    # filtered totals are always counted by the repository.
    unfiltered = search_term is None and service_filter is None
    if unfiltered:
        total = await _get_total_attackers_cached()
    else:
        total = await repo.get_total_attackers(search=search_term, service=service_filter)

    # One bulk behavior lookup for the whole page instead of one per row.
    page_ips = {row["ip"] for row in page if row.get("ip")}
    behaviors = await repo.get_behaviors_for_ips(page_ips) if page_ips else {}
    for row in page:
        row["behavior"] = behaviors.get(row.get("ip"))

    return {"total": total, "limit": limit, "offset": offset, "data": page}

View File

@@ -2,9 +2,10 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.web.auth import get_password_hash, verify_password
from decnet.web.dependencies import get_current_user_unchecked, repo
from decnet.web.db.models import ChangePasswordRequest
from decnet.telemetry import traced as _traced
from decnet.web.auth import ahash_password, averify_password
from decnet.web.dependencies import get_current_user_unchecked, invalidate_user_cache, repo
from decnet.web.db.models import ChangePasswordRequest, MessageResponse
router = APIRouter()
@@ -12,20 +13,23 @@ router = APIRouter()
@router.post(
"/auth/change-password",
tags=["Authentication"],
response_model=MessageResponse,
responses={
400: {"description": "Bad Request (e.g. malformed JSON)"},
401: {"description": "Could not validate credentials"},
422: {"description": "Validation error"}
},
)
@_traced("api.change_password")
async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user)
if not _user or not verify_password(request.old_password, _user["password_hash"]):
if not _user or not await averify_password(request.old_password, _user["password_hash"]):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Incorrect old password",
)
_new_hash: str = get_password_hash(request.new_password)
_new_hash: str = await ahash_password(request.new_password)
await repo.update_user_password(current_user, _new_hash, must_change_password=False)
invalidate_user_cache(current_user)
return {"message": "Password updated successfully"}

View File

@@ -1,19 +1,32 @@
from datetime import timedelta
from typing import Any, Optional
from fastapi import APIRouter, HTTPException, status
from fastapi import APIRouter, HTTPException, Request, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import (
ACCESS_TOKEN_EXPIRE_MINUTES,
averify_password,
create_access_token,
verify_password,
)
from decnet.web.dependencies import repo
from decnet.web.dependencies import get_user_by_username_cached
from decnet.web.db.models import LoginRequest, Token
from decnet.web.limiter import limiter, login_ip_key, login_username_key
router = APIRouter()
# Two independent buckets, tripping either → 429:
#
# - per-IP (login_ip_key): catches a botnet hitting one account.
# - per-user (login_username_key): catches distributed credential
# stuffing against one account.
#
# Limits: 10 attempts per 5 minutes per bucket. Buckets are process-local
# (memory://); see decnet/web/limiter.py for the rationale. Buckets do
# NOT reset on successful login — a legitimate user tripping the limit
# via fat-fingering will need to wait the window out. 10 tries is
# generous; a rolling window naturally drains.
@router.post(
"/auth/login",
response_model=Token,
@@ -21,12 +34,16 @@ router = APIRouter()
responses={
400: {"description": "Bad Request (e.g. malformed JSON)"},
401: {"description": "Incorrect username or password"},
422: {"description": "Validation error"}
422: {"description": "Validation error"},
429: {"description": "Too many login attempts — retry after the window resets"},
},
)
async def login(request: LoginRequest) -> dict[str, Any]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username)
if not _user or not verify_password(request.password, _user["password_hash"]):
@limiter.limit("10/5 minutes", key_func=login_ip_key)
@limiter.limit("10/5 minutes", key_func=login_username_key)
@_traced("api.login")
async def login(request: Request, payload: LoginRequest) -> dict[str, Any]:
_user: Optional[dict[str, Any]] = await get_user_by_username_cached(payload.username)
if not _user or not await averify_password(payload.password, _user["password_hash"]):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Incorrect username or password",
@@ -40,6 +57,6 @@ async def login(request: LoginRequest) -> dict[str, Any]:
)
return {
"access_token": _access_token,
"token_type": "bearer", # nosec B105
"token_type": "bearer", # nosec B105 — OAuth2 token type, not a password
"must_change_password": bool(_user.get("must_change_password", False))
}

View File

@@ -1,21 +1,62 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.web.dependencies import get_current_user, repo
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import BountyResponse
router = APIRouter()
# Cache the unfiltered default page — the UI/locust hit this constantly
# with no params. Filtered requests (bounty_type/search) bypass: rare
# and staleness matters for search.
_BOUNTY_TTL = 5.0
_DEFAULT_LIMIT = 50
_DEFAULT_OFFSET = 0
_bounty_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
_bounty_lock: Optional[asyncio.Lock] = None
def _reset_bounty_cache() -> None:
    """Forget the cached default bounty page and discard the lock guarding it.

    Both module globals return to their initial state; the lock is created
    again lazily by the next cached lookup.
    """
    global _bounty_cache, _bounty_lock
    _bounty_lock = None
    _bounty_cache = (None, 0.0)
async def _get_bounty_default_cached() -> dict[str, Any]:
    """Return the unfiltered default bounty page, reusing it for up to _BOUNTY_TTL seconds.

    The cache is consulted once without the lock and once more after the
    lock is acquired, so callers that queued behind a refresh reuse its
    result. The same dict object is handed to every caller within the TTL.
    """
    global _bounty_cache, _bounty_lock

    def _fresh_page() -> Optional[dict[str, Any]]:
        # A cached page counts only while it is younger than the TTL.
        cached, stamp = _bounty_cache
        if cached is not None and time.monotonic() - stamp < _BOUNTY_TTL:
            return cached
        return None

    hit = _fresh_page()
    if hit is not None:
        return hit

    # The lock is created on first use rather than at import time.
    if _bounty_lock is None:
        _bounty_lock = asyncio.Lock()

    async with _bounty_lock:
        hit = _fresh_page()
        if hit is not None:
            return hit
        rows = await repo.get_bounties(
            limit=_DEFAULT_LIMIT, offset=_DEFAULT_OFFSET, bounty_type=None, search=None,
        )
        count = await repo.get_total_bounties(bounty_type=None, search=None)
        page = {
            "total": count,
            "limit": _DEFAULT_LIMIT,
            "offset": _DEFAULT_OFFSET,
            "data": rows,
        }
        _bounty_cache = (page, time.monotonic())
        return page
@router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
@_traced("api.get_bounties")
async def get_bounties(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),
bounty_type: Optional[str] = None,
search: Optional[str] = None,
current_user: str = Depends(get_current_user)
user: dict = Depends(require_viewer)
) -> dict[str, Any]:
"""Retrieve collected bounties (harvested credentials, payloads, etc.)."""
def _norm(v: Optional[str]) -> Optional[str]:
@@ -26,6 +67,9 @@ async def get_bounties(
bt = _norm(bounty_type)
s = _norm(search)
if bt is None and s is None and limit == _DEFAULT_LIMIT and offset == _DEFAULT_OFFSET:
return await _get_bounty_default_cached()
_data = await repo.get_bounties(limit=limit, offset=offset, bounty_type=bt, search=s)
_total = await repo.get_total_bounties(bounty_type=bt, search=s)
return {

View File

View File

@@ -0,0 +1,123 @@
"""SSE stream of campaign events — one connection per viewer.
Subscribes to ``campaign.>`` on the bus for the duration of the
request and forwards each matching event as a Server-Sent Event.
Emits a one-shot snapshot on connect (current paginated campaign
list).
Mirror of :mod:`decnet.web.router.identities.api_events`. Auth: JWT
via ``?token=`` query param + ``require_stream_viewer`` role.
"""
from __future__ import annotations
import asyncio
from typing import AsyncGenerator
import orjson
from fastapi import APIRouter, Depends, Request
from fastapi.responses import StreamingResponse
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_stream_viewer
from decnet.web.sse_limits import sse_connection_slot
log = get_logger("api.campaigns.events")
router = APIRouter()
_KEEPALIVE_SECS = 15.0
_SNAPSHOT_LIMIT = 50
def _format_sse(event_name: str, data: dict) -> str:
    """Render one Server-Sent Events frame.

    The frame is an ``event:`` line carrying *event_name*, a ``data:`` line
    carrying *data* serialized with orjson, and the blank line that ends it.
    """
    payload = orjson.dumps(data).decode()
    return f"event: {event_name}\ndata: {payload}\n\n"
@router.get(
    "/campaigns/events",
    tags=["Campaign Clustering"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of campaign-clustering events",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.campaigns.events")
async def api_campaigns_events(
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    # Event types: snapshot, formed, identity.assigned, merged, unmerged.
    #
    # The snapshot is fetched before the StreamingResponse is built, so a
    # repository failure here raises out of the handler before any bytes
    # have been streamed.
    snapshot = await repo.list_campaigns(limit=_SNAPSHOT_LIMIT, offset=0)

    async def generator() -> AsyncGenerator[str, None]:
        # The slot is held for the whole lifetime of the stream.
        # NOTE(review): presumably this is what enforces the per-user cap
        # documented as 429 above — confirm in decnet.web.sse_limits.
        async with sse_connection_slot(user["uuid"]):
            # A line starting with ":" is an SSE comment; it is sent first,
            # ahead of the snapshot event.
            yield ": keepalive\n\n"
            yield _format_sse("snapshot", {"campaigns": snapshot})
            bus = await get_app_bus()
            if bus is None:
                # No bus to subscribe to: keep the connection open with
                # periodic comment frames until the client disconnects.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        # Cancellation during the sleep ends the loop quietly.
                        break
                    yield ": keepalive\n\n"
                return
            sub = bus.subscribe(f"{_topics.CAMPAIGN}.>")
            try:
                async with sub:
                    sub_iter = sub.__aiter__()
                    while True:
                        # Disconnects are polled once per iteration, i.e. at
                        # least every _KEEPALIVE_SECS even when the bus is idle.
                        if await request.is_disconnected():
                            break
                        # Wait for the next bus event, but no longer than
                        # the keepalive interval.
                        next_task = asyncio.ensure_future(sub_iter.__anext__())
                        try:
                            event = await asyncio.wait_for(
                                next_task, timeout=_KEEPALIVE_SECS,
                            )
                        except asyncio.TimeoutError:
                            # asyncio.wait_for already cancels the awaited
                            # task on timeout; the explicit cancel below is
                            # a redundant safety net.
                            # NOTE(review): whether cancelling __anext__ can
                            # drop an in-flight event or close the
                            # subscription depends on the bus implementation
                            # — confirm.
                            next_task.cancel()
                            yield ": keepalive\n\n"
                            continue
                        except StopAsyncIteration:
                            # The subscription ended; finish the stream.
                            break
                        yield _format_sse(
                            _sse_name_for(event.topic),
                            {
                                "topic": event.topic,
                                "type": event.type,
                                "ts": event.ts,
                                "payload": event.payload,
                            },
                        )
            except asyncio.CancelledError:
                # Cancellation is swallowed here so the generator finishes
                # without propagating the error.
                pass
            except Exception:
                # Last-resort boundary: log with traceback and tell the
                # client with an "error" event before the stream ends.
                log.exception("campaign events stream crashed")
                yield _format_sse("error", {"message": "Stream interrupted"})

    return StreamingResponse(
        generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            # NOTE(review): presumably disables response buffering in an
            # nginx-style reverse proxy — confirm against the deployment.
            "X-Accel-Buffering": "no",
        },
    )
def _sse_name_for(topic: str) -> str:
    """Derive the SSE event name from a bus topic.

    Drops the leading ``<_topics.CAMPAIGN>.`` prefix, e.g.
    ``campaign.formed`` → ``formed`` and
    ``campaign.identity.assigned`` → ``identity.assigned``. A topic
    without that prefix is returned unchanged.
    """
    prefix = f"{_topics.CAMPAIGN}."
    if not topic.startswith(prefix):
        return topic
    return topic[len(prefix):]

View File

@@ -0,0 +1,40 @@
"""GET /api/v1/campaigns/{uuid} — single campaign row.
Soft-merge handling: if the requested UUID has merged_into_uuid set,
the repository follows the chain and returns the winner. Mirror of
:mod:`decnet.web.router.identities.api_get_identity_detail`.
"""
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/campaigns/{uuid}",
    tags=["Campaign Clustering"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Campaign not found"},
    },
)
@_traced("api.get_campaign_detail")
async def get_campaign_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return one campaign row plus a live identity count.

    Responds 404 when the repository returns nothing for ``uuid``.
    """
    row = await repo.get_campaign_by_uuid(uuid)
    if not row:
        raise HTTPException(status_code=404, detail="Campaign not found")

    # The CampaignDetail page shows this aggregate. It is counted off the
    # FK, keyed by the uuid of the row actually returned, instead of
    # trusting the denormalized identity_count, so it is always live.
    live_count = await repo.count_identities_for_campaign(row["uuid"])
    row["identity_count_live"] = live_count
    return row

View File

@@ -0,0 +1,41 @@
"""GET /api/v1/campaigns/{uuid}/identities — identities for a campaign.
Returns the ``AttackerIdentity`` rows whose ``campaign_id`` FK points
at this campaign. Mirror of
:mod:`decnet.web.router.identities.api_list_identity_observations`.
"""
from typing import Any
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/campaigns/{uuid}/identities",
    tags=["Campaign Clustering"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Campaign not found"},
    },
)
@_traced("api.list_campaign_identities")
async def list_campaign_identities(
    uuid: str,
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return one page of identities attached to a campaign.

    Responds 404 when the repository returns nothing for ``uuid``. The
    page and the total are both keyed by the uuid of the campaign row the
    repository returned.
    """
    row = await repo.get_campaign_by_uuid(uuid)
    if not row:
        raise HTTPException(status_code=404, detail="Campaign not found")

    resolved = row["uuid"]
    page = await repo.list_identities_for_campaign(
        resolved, limit=limit, offset=offset
    )
    count = await repo.count_identities_for_campaign(resolved)
    return {"total": count, "limit": limit, "offset": offset, "data": page}

View File

@@ -0,0 +1,35 @@
"""GET /api/v1/campaigns — paginated list of campaigns.
Mirror of :mod:`decnet.web.router.identities.api_list_identities` for
the campaign layer. Returns an empty list while the campaign clusterer
hasn't run yet (the campaigns table ships empty).
"""
from typing import Any
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/campaigns",
    tags=["Campaign Clustering"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_campaigns")
async def list_campaigns(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paginated campaign list, newest-updated first.

    Returns the requested page under ``data`` together with the overall
    campaign count and the echoed ``limit`` / ``offset``.
    """
    page = await repo.list_campaigns(limit=limit, offset=offset)
    count = await repo.count_campaigns()
    return {"total": count, "limit": limit, "offset": offset, "data": page}

View File

@@ -0,0 +1,23 @@
"""Canary tokens — operator-facing CRUD.
Mounted under ``/api/v1/canary``. Covers:
* ``POST /blobs`` — upload an artifact (multipart);
``GET /blobs``, ``DELETE /blobs/{id}`` — listing + cleanup
* ``POST /tokens`` — generate + plant a token on a target decky;
``GET /tokens``, ``GET /tokens/{id}``, ``DELETE /tokens/{id}``
— listing + detail + revoke
* ``GET /tokens/{id}/preview`` — instrumented bytes for sanity-check
* ``GET /tokens/{id}/triggers`` — paged callback log
The ``decnet canary`` worker runs the ATTACKER-facing surface (HTTP
slug + DNS); this module is the OPERATOR-facing surface only.
"""
from fastapi import APIRouter
from .api_blobs import router as blobs_router
from .api_tokens import router as tokens_router
# Aggregate router: blob routes first, then token routes, all under /canary.
canary_router = APIRouter(prefix="/canary")
for _sub_router in (blobs_router, tokens_router):
    canary_router.include_router(_sub_router)

View File

@@ -0,0 +1,172 @@
"""Operator-uploaded canary blob CRUD.
Three endpoints:
* ``POST /blobs`` — multipart upload; sniffs MIME from the magic
bytes (no python-magic dependency), persists to disk under the
sha256 hash, returns the (possibly pre-existing) row.
* ``GET /blobs`` — list all blobs with their live token reference
count.
* ``DELETE /blobs/{uuid}`` — refcount-aware delete; returns 409 if
any token still references the blob.
Admin-gated: blobs are operator-supplied content that may carry
sensitive material (real-looking financial reports, etc.); listing
them and deleting them is an admin operation. Reading them via the
preview path is also admin-gated.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
from decnet.canary import storage
from decnet.logging import get_logger
from decnet.web.db.models import (
CanaryBlobResponse,
CanaryBlobsResponse,
MessageResponse,
)
from decnet.web.dependencies import repo, require_admin
log = get_logger("api.canary.blobs")
router = APIRouter(prefix="/blobs", tags=["Canary"])
# --- MIME sniffing (stdlib-only, replaces python-magic) -------------------
#
# The DOCX/XLSX/PDF/PNG/JPEG/GIF/HTML/JSON/YAML space covers everything
# our instrumenters know how to mutate. Anything else falls through to
# ``application/octet-stream`` and the API routes the token to the
# ``passthrough`` instrumenter.
_MAGIC_TABLE: tuple[tuple[bytes, str], ...] = (
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"%PDF-", "application/pdf"),
    # OOXML (DOCX/XLSX) starts with the zip local-file signature but so do
    # plain zips. We disambiguate by Content_Types entry below.
    (b"<!DOCTYPE", "text/html"),
    (b"<html", "text/html"),
    (b"<HTML", "text/html"),
    (b"<?xml", "application/xml"),
)


def _sniff_mime(filename: str, head: bytes) -> str:
    """Guess a MIME type from the first bytes of an upload.

    Args:
        filename: Original upload name; only its extension is consulted,
            and only to refine a plain-text verdict (json/yaml/toml).
        head: Leading bytes of the upload. May be a truncated prefix of a
            longer file.

    Returns:
        The MIME type of the first matching ``_MAGIC_TABLE`` marker, an
        OOXML or zip type for zip containers, a text type when ``head`` is
        printable UTF-8, otherwise ``application/octet-stream``.
    """
    import codecs  # function-scope: the module does not import it at top level

    for marker, mime in _MAGIC_TABLE:
        if head.startswith(marker):
            return mime
        # HTML markers are case-insensitive; ``<!doctype html>`` is the
        # usual HTML5 spelling and must not fall through to text/plain.
        if mime == "text/html" and head[: len(marker)].lower() == marker.lower():
            return mime

    if head[:4] == b"PK\x03\x04":
        # OOXML alias detection: peek for the document-specific Override
        # in [Content_Types].xml. We only need to look at the first
        # block; the central directory comes later.
        if b"wordprocessingml" in head:
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        if b"spreadsheetml" in head:
            return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return "application/zip"

    # Plaintext heuristic: if the head decodes as printable utf-8 we
    # call it text/plain — that's good enough to route to the plain
    # instrumenter, which also handles json/yaml/toml.
    try:
        # ``head`` may be cut in the middle of a multi-byte character.
        # With ``final=False`` the incremental decoder tolerates an
        # incomplete trailing sequence but still rejects invalid bytes.
        codecs.getincrementaldecoder("utf-8")().decode(head, final=False)
    except UnicodeDecodeError:
        return "application/octet-stream"

    # Reject control characters other than tab / LF / CR in the first 128 bytes.
    if not all(b in (0x09, 0x0A, 0x0D) or b >= 0x20 for b in head[:128]):
        return "application/octet-stream"

    lowered_name = filename.lower()
    if lowered_name.endswith(".json"):
        return "application/json"
    if lowered_name.endswith((".yaml", ".yml")):
        return "application/yaml"
    if lowered_name.endswith(".toml"):
        return "application/toml"
    return "text/plain"
def _row_to_response(row: dict[str, Any]) -> CanaryBlobResponse:
    """Build a ``CanaryBlobResponse`` from a blob row, keys as keyword arguments."""
    response = CanaryBlobResponse(**row)
    return response
@router.post(
    "",
    response_model=CanaryBlobResponse,
    status_code=201,
    responses={
        400: {"description": "Empty file or unreadable upload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_upload_blob(
    file: UploadFile = File(...),
    admin: dict = Depends(require_admin),
) -> CanaryBlobResponse:
    """Store an uploaded artifact and return its blob row.

    The whole upload is read into memory, its MIME type is sniffed from
    the first 1024 bytes, the bytes are handed to ``storage.write_blob``
    and the row is upserted. Responds 400 for an empty upload.
    """
    payload = await file.read()
    if not payload:
        raise HTTPException(status_code=400, detail="uploaded file is empty")

    upload_name = file.filename
    mime = _sniff_mime(upload_name or "", payload[:1024])
    digest, _path, byte_count = storage.write_blob(payload)

    record = {
        "sha256": digest,
        "filename": upload_name or "(unnamed)",
        "content_type": mime,
        "size_bytes": byte_count,
        "uploaded_by": admin.get("uuid", "unknown"),
        "uploaded_at": datetime.now(timezone.utc),
    }
    stored = await repo.upsert_canary_blob(record)
    # Supply a zero token count when the returned row carries none.
    stored.setdefault("token_count", 0)
    return _row_to_response(stored)
@router.get(
    "",
    response_model=CanaryBlobsResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_blobs(
    admin: dict = Depends(require_admin),
) -> CanaryBlobsResponse:
    """List every canary blob; ``total`` is the number of rows returned."""
    stored = await repo.list_canary_blobs()
    converted = list(map(_row_to_response, stored))
    return CanaryBlobsResponse(blobs=converted, total=len(stored))
@router.delete(
    "/{uuid}",
    response_model=MessageResponse,
    responses={
        404: {"description": "Blob not found"},
        409: {"description": "Blob still referenced by a token"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_delete_blob(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> MessageResponse:
    """Delete a blob row and, best-effort, its bytes on disk.

    Responds 404 when the blob does not exist and 409 when the repository
    declines the delete (the blob is still referenced by a token).
    """
    existing = await repo.get_canary_blob(uuid)
    if existing is None:
        raise HTTPException(status_code=404, detail="blob not found")
    deleted = await repo.delete_canary_blob(uuid)
    if not deleted:
        raise HTTPException(
            status_code=409,
            detail="blob is still referenced by one or more tokens",
        )
    # DB row is gone; best-effort unlink the bytes on disk. A failure
    # here leaves a recoverable orphan, never a dangling DB ref — so a
    # filesystem error is logged instead of failing a delete that has
    # already happened.
    sha = existing["sha256"]
    try:
        storage.unlink_blob(sha)
    except OSError:
        log.exception(f"failed to unlink canary blob bytes sha256={sha}")
    return MessageResponse(message="ok")

View File

@@ -0,0 +1,318 @@
"""Operator-facing canary token CRUD.
Every body-bearing route documents the 400 error per
:mod:`feedback_schemathesis_400`. Auth deps:
* writes (POST, DELETE) and ``GET /{uuid}/preview`` → :func:`require_admin`
* other reads (list, detail, triggers) → :func:`require_viewer`
The router resolves blobs / instrumenters / generators here, builds
the :class:`CanaryArtifact`, and hands it to the planter. The
worker is a separate process; it doesn't see this code path.
"""
from __future__ import annotations
from secrets import token_urlsafe
from typing import Any
from uuid import uuid4
from fastapi import APIRouter, Depends, HTTPException, Query, Response
from decnet.canary import (
CanaryContext,
get_generator,
get_instrumenter,
pick_instrumenter_for_mime,
storage,
)
from decnet.canary.base import InstrumenterRejectedError
from decnet.canary.factory import KNOWN_GENERATORS
from decnet.canary.paths import normalize_placement
from decnet.canary import planter
from decnet.logging import get_logger
from decnet.web.db.models import (
CanaryTokenCreateRequest,
CanaryTokenResponse,
CanaryTokensResponse,
CanaryTriggerResponse,
CanaryTriggersResponse,
MessageResponse,
)
from decnet.web.dependencies import repo, require_admin, require_viewer
log = get_logger("api.canary.tokens")
router = APIRouter(prefix="/tokens", tags=["Canary"])
def _http_base() -> str:
    """Return the canary HTTP callback base URL without trailing slashes.

    Read from ``DECNET_CANARY_HTTP_BASE``; defaults to
    ``http://localhost:8088`` when the variable is unset.
    """
    import os

    configured = os.getenv("DECNET_CANARY_HTTP_BASE", "http://localhost:8088")
    return configured.rstrip("/")
def _dns_zone() -> str:
    """Return the canary DNS zone, lower-cased, without leading/trailing dots.

    Read from ``DECNET_CANARY_DNS_ZONE``; an unset variable yields ``""``.
    """
    import os

    zone = os.getenv("DECNET_CANARY_DNS_ZONE", "")
    trimmed = zone.strip(".")
    return trimmed.lower()
def _row_to_response(row: dict[str, Any]) -> CanaryTokenResponse:
    """Build a ``CanaryTokenResponse`` from a token row, keys as keyword arguments."""
    response = CanaryTokenResponse(**row)
    return response
def _trigger_row_to_response(row: dict[str, Any]) -> CanaryTriggerResponse:
    """Build a ``CanaryTriggerResponse`` from a trigger row.

    ``raw_headers`` is replaced by a decoded ``headers`` mapping. A JSON
    string is parsed, a non-string value is used as-is, and anything that
    fails to parse or is not a dict becomes ``{}``.
    """
    import json

    raw = row.get("raw_headers") or "{}"
    if isinstance(raw, str):
        try:
            parsed = json.loads(raw)
        except (ValueError, TypeError):
            parsed = {}
    else:
        parsed = raw
    headers = parsed if isinstance(parsed, dict) else {}

    fields = {key: value for key, value in row.items() if key != "raw_headers"}
    fields["headers"] = headers
    return CanaryTriggerResponse(**fields)
# ---------------------------------------------------------- create
@router.post(
    "",
    response_model=CanaryTokenResponse,
    status_code=201,
    responses={
        400: {"description": "Invalid token request (missing/conflicting fields, bad path, instrumenter rejection)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Referenced blob not found"},
        # Raised below when the blob row exists but its bytes are gone;
        # documented so the OpenAPI schema covers every status we return.
        410: {"description": "Referenced blob bytes missing on disk"},
    },
)
async def api_create_token(
    req: CanaryTokenCreateRequest,
    admin: dict = Depends(require_admin),
) -> CanaryTokenResponse:
    """Create a canary token and plant it on the target decky.

    The request names either a built-in ``generator`` or an uploaded
    ``blob_uuid`` (exactly one). The artifact is produced with a fresh
    callback slug, the token row is stored, and the artifact is handed to
    the planter.

    Raises:
        HTTPException: 400 for conflicting/missing source fields, an
            invalid placement path, an unknown generator, or an
            instrumenter rejection; 404 when the blob row does not exist;
            410 when the blob row exists but its bytes are missing on disk.
    """
    # Exactly one of blob_uuid / generator must be set.
    if bool(req.blob_uuid) == bool(req.generator):
        raise HTTPException(
            status_code=400,
            detail="provide exactly one of blob_uuid or generator",
        )
    try:
        placement_path = normalize_placement(req.placement_path)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    slug = token_urlsafe(16)
    ctx = CanaryContext(
        callback_token=slug, http_base=_http_base(), dns_zone=_dns_zone(),
    )

    if req.generator:
        if req.generator not in KNOWN_GENERATORS:
            raise HTTPException(
                status_code=400,
                detail=f"unknown generator: {req.generator!r}",
            )
        generator = get_generator(req.generator)
        artifact = generator.generate(ctx)
        instrumenter_name = None
    else:
        # Upload-driven token.
        blob = await repo.get_canary_blob(req.blob_uuid)
        if blob is None:
            raise HTTPException(status_code=404, detail="blob not found")
        try:
            blob_bytes = storage.read_blob(blob["sha256"])
        except FileNotFoundError as e:
            raise HTTPException(
                status_code=410,
                detail="blob bytes missing on disk; please re-upload",
            ) from e
        instrumenter_name = pick_instrumenter_for_mime(blob["content_type"])
        ins = get_instrumenter(instrumenter_name)
        try:
            artifact = ins.instrument(blob_bytes, ctx, target_path=placement_path)
        except InstrumenterRejectedError as e:
            raise HTTPException(status_code=400, detail=str(e)) from e

    # The normalized placement path always wins over whatever the
    # generator / instrumenter put on the artifact.
    artifact.path = placement_path

    token_uuid = str(uuid4())
    # NOTE(review): the row is stored with state "planted" before
    # planter.plant() runs — confirm the planter corrects the state if
    # planting fails.
    await repo.create_canary_token({
        "uuid": token_uuid,
        "kind": req.kind,
        "decky_name": req.decky_name,
        "blob_uuid": req.blob_uuid,
        "instrumenter": instrumenter_name,
        "generator": req.generator,
        "placement_path": placement_path,
        "callback_token": slug,
        "secret_seed": slug,
        "created_by": admin.get("uuid", "unknown"),
        "state": "planted",
    })
    await planter.plant(req.decky_name, artifact, token_uuid=token_uuid, repo=repo)
    row = await repo.get_canary_token(token_uuid)
    return _row_to_response(row)
# ---------------------------------------------------------- list / detail
@router.get(
    "",
    response_model=CanaryTokensResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_tokens(
    decky_name: str | None = Query(default=None),
    state: str | None = Query(default=None),
    kind: str | None = Query(default=None),
    viewer: dict = Depends(require_viewer),
) -> CanaryTokensResponse:
    """List canary tokens, optionally filtered by decky, state and kind.

    ``total`` is the number of rows returned.
    """
    filters = {"decky_name": decky_name, "state": state, "kind": kind}
    matches = await repo.list_canary_tokens(**filters)
    converted = list(map(_row_to_response, matches))
    return CanaryTokensResponse(tokens=converted, total=len(matches))
@router.get(
    "/{uuid}",
    response_model=CanaryTokenResponse,
    responses={
        404: {"description": "Token not found"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_get_token(
    uuid: str,
    viewer: dict = Depends(require_viewer),
) -> CanaryTokenResponse:
    """Return one canary token by uuid, or respond 404."""
    found = await repo.get_canary_token(uuid)
    if found is not None:
        return _row_to_response(found)
    raise HTTPException(status_code=404, detail="token not found")
# ---------------------------------------------------------- preview
@router.get(
    "/{uuid}/preview",
    response_class=Response,
    responses={
        200: {"description": "Instrumented bytes (raw)"},
        404: {"description": "Token not found"},
        409: {"description": "Token has no preview-able bytes (passive aws_creds, etc.)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_preview_token(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> Response:
    """Return the instrumented bytes the planter dropped on the decky.

    The bytes are rebuilt on demand from the stored row: the generator is
    re-run, or the blob is re-instrumented, using the row's
    ``callback_token``. Rendered bytes are not stored server-side, so
    this lets operators diff-check what was written without
    ``docker exec``-ing into the container.

    Responds 404 for an unknown token and 409 whenever the bytes cannot
    be rebuilt (blob deleted, bytes missing, instrumenter rejection, or
    neither generator nor instrumenter recorded).
    """
    token = await repo.get_canary_token(uuid)
    if token is None:
        raise HTTPException(status_code=404, detail="token not found")

    ctx = CanaryContext(
        callback_token=token["callback_token"],
        http_base=_http_base(),
        dns_zone=_dns_zone(),
    )

    # Generator-driven tokens: simply re-run the generator.
    if token["generator"]:
        artifact = get_generator(token["generator"]).generate(ctx)
        return Response(content=artifact.content, media_type="application/octet-stream")

    if not (token["blob_uuid"] and token["instrumenter"]):
        raise HTTPException(
            status_code=409,
            detail="token has neither generator nor instrumenter — nothing to preview",
        )

    # Upload-driven tokens: re-instrument the stored blob.
    blob = await repo.get_canary_blob(token["blob_uuid"])
    if blob is None:
        raise HTTPException(
            status_code=409,
            detail="blob has been deleted; preview unavailable",
        )
    try:
        raw_bytes = storage.read_blob(blob["sha256"])
    except FileNotFoundError as e:
        raise HTTPException(
            status_code=409,
            detail="blob bytes missing on disk",
        ) from e

    instrumenter = get_instrumenter(token["instrumenter"])
    try:
        artifact = instrumenter.instrument(
            raw_bytes, ctx, target_path=token["placement_path"],
        )
    except InstrumenterRejectedError as e:
        raise HTTPException(status_code=409, detail=str(e)) from e
    return Response(content=artifact.content, media_type="application/octet-stream")
# ---------------------------------------------------------- triggers
@router.get(
    "/{uuid}/triggers",
    response_model=CanaryTriggersResponse,
    responses={
        404: {"description": "Token not found"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_triggers(
    uuid: str,
    limit: int = Query(default=100, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
    viewer: dict = Depends(require_viewer),
) -> CanaryTriggersResponse:
    """Return one page of the callback log for a token, or respond 404."""
    token = await repo.get_canary_token(uuid)
    if token is None:
        raise HTTPException(status_code=404, detail="token not found")

    page = await repo.list_canary_triggers(uuid, limit=limit, offset=offset)
    converted = list(map(_trigger_row_to_response, page))
    # NOTE(review): ``total`` is the size of this page, not the number of
    # triggers recorded for the token — confirm that is what clients expect.
    return CanaryTriggersResponse(triggers=converted, total=len(page))
# ---------------------------------------------------------- revoke
@router.delete(
    "/{uuid}",
    response_model=MessageResponse,
    responses={
        404: {"description": "Token not found"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_revoke_token(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> MessageResponse:
    """Revoke a token through the planter, or respond 404 if it is unknown."""
    token = await repo.get_canary_token(uuid)
    if token is None:
        raise HTTPException(status_code=404, detail="token not found")

    decky = token["decky_name"]
    path = token["placement_path"]
    await planter.revoke(decky, path, token_uuid=uuid, repo=repo)
    return MessageResponse(message="ok")

View File

View File

@@ -0,0 +1,124 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends
from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import UserResponse
router = APIRouter()
_DEFAULT_DEPLOYMENT_LIMIT = 10
_DEFAULT_MUTATION_INTERVAL = "30m"
# Cache config_limits / config_globals reads — these change on rare admin
# writes but get polled constantly by the UI and locust.
_STATE_TTL = 5.0
_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
_state_locks: dict[str, asyncio.Lock] = {}
# Admin branch fetched repo.list_users() on every /config call — cache 5s,
# invalidate on user create/update/delete so the admin UI stays consistent.
_USERS_TTL = 5.0
_users_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
_users_lock: Optional[asyncio.Lock] = None
def _reset_state_cache() -> None:
    """Reset cached config state — used by tests.

    Empties the state cache and its locks, and resets the users cache
    and its lock to their initial values.
    """
    global _users_cache, _users_lock
    _users_cache, _users_lock = (None, 0.0), None
    # Drop any locks bound to the previous event loop — reusing one from
    # a dead loop deadlocks the next test.
    for container in (_state_cache, _state_locks):
        container.clear()
def invalidate_list_users_cache() -> None:
    """Discard the cached user list so the next read goes to the repository."""
    global _users_cache
    empty_entry = (None, 0.0)
    _users_cache = empty_entry
async def _get_list_users_cached() -> list[dict[str, Any]]:
    """Return ``repo.list_users()``, cached for ``_USERS_TTL`` seconds.

    The cache is checked once without the lock and once more after
    acquiring it, so callers that queued on the lock reuse the value the
    first one stored.
    """
    global _users_cache, _users_lock

    def _fresh() -> Optional[list[dict[str, Any]]]:
        # The cached list if present and younger than the TTL, else None.
        cached, stamp = _users_cache
        if cached is None or time.monotonic() - stamp >= _USERS_TTL:
            return None
        return cached

    hit = _fresh()
    if hit is not None:
        return hit

    # The lock is created lazily, on first use.
    if _users_lock is None:
        _users_lock = asyncio.Lock()
    async with _users_lock:
        hit = _fresh()
        if hit is not None:
            return hit
        users = await repo.list_users()
        _users_cache = (users, time.monotonic())
        return users
async def _get_state_cached(name: str) -> Optional[dict[str, Any]]:
    """Return ``repo.get_state(name)``, cached for ``_STATE_TTL`` seconds.

    A ``None`` state is cached like any other value. Each ``name`` has
    its own lock; the cache is re-checked after acquiring it.
    """

    def _fresh_entry() -> Optional[tuple[Optional[dict[str, Any]], float]]:
        # The (value, timestamp) entry if present and within the TTL, else None.
        cached = _state_cache.get(name)
        if cached is None:
            return None
        _, stamp = cached
        return cached if time.monotonic() - stamp < _STATE_TTL else None

    hit = _fresh_entry()
    if hit is not None:
        return hit[0]

    lock = _state_locks.setdefault(name, asyncio.Lock())
    async with lock:
        hit = _fresh_entry()
        if hit is not None:
            return hit[0]
        fetched = await repo.get_state(name)
        _state_cache[name] = (fetched, time.monotonic())
        return fetched
@router.get(
    "/config",
    tags=["Configuration"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.get_config")
async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
    """Return the caller's role and the effective configuration values.

    Admins additionally receive the user list. ``developer_mode`` is
    included only when ``DECNET_DEVELOPER`` is set.
    """
    # A missing or empty state falls back to the module defaults.
    limits = (await _get_state_cached("config_limits")) or {}
    global_settings = (await _get_state_cached("config_globals")) or {}

    role = user["role"]
    config = {
        "role": role,
        "deployment_limit": limits.get(
            "deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT
        ),
        "global_mutation_interval": global_settings.get(
            "global_mutation_interval", _DEFAULT_MUTATION_INTERVAL
        ),
    }

    if role == "admin":
        # Only these four fields of each user row are passed to UserResponse.
        config["users"] = [
            UserResponse(
                uuid=entry["uuid"],
                username=entry["username"],
                role=entry["role"],
                must_change_password=entry["must_change_password"],
            ).model_dump()
            for entry in await _get_list_users_cached()
        ]
    if DECNET_DEVELOPER:
        config["developer_mode"] = True
    return config

View File

@@ -0,0 +1,144 @@
import uuid as _uuid
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.auth import ahash_password
from decnet.web.dependencies import require_admin, invalidate_user_cache, repo
from decnet.web.router.config.api_get_config import invalidate_list_users_cache
from decnet.web.db.models import (
CreateUserRequest,
MessageResponse,
ResetUserPasswordRequest,
UpdateUserRoleRequest,
UserResponse,
)
router = APIRouter()
@router.post(
    "/config/users",
    tags=["Configuration"],
    response_model=UserResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        409: {"description": "Username already exists"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.create_user")
async def api_create_user(
    req: CreateUserRequest,
    admin: dict = Depends(require_admin),
) -> UserResponse:
    """Create a user who must change their password.

    Responds 409 when the username is already taken. The cached user
    list is invalidated after the insert.
    """
    if await repo.get_user_by_username(req.username):
        raise HTTPException(status_code=409, detail="Username already exists")

    new_uuid = str(_uuid.uuid4())
    hashed = await ahash_password(req.password)
    record = {
        "uuid": new_uuid,
        "username": req.username,
        "password_hash": hashed,
        "role": req.role,
        "must_change_password": True,  # nosec B105 — not a password
    }
    await repo.create_user(record)
    invalidate_list_users_cache()

    return UserResponse(
        uuid=new_uuid,
        username=req.username,
        role=req.role,
        must_change_password=True,
    )
@router.delete(
    "/config/users/{user_uuid}",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required / cannot delete self"},
        404: {"description": "User not found"},
    },
)
@_traced("api.delete_user")
async def api_delete_user(
    user_uuid: str,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    """Delete a user other than the caller.

    Responds 403 when an admin targets their own account and 404 when
    the repository reports nothing was deleted. Both user caches are
    invalidated on success.
    """
    is_self = user_uuid == admin["uuid"]
    if is_self:
        raise HTTPException(status_code=403, detail="Cannot delete your own account")

    removed = await repo.delete_user(user_uuid)
    if not removed:
        raise HTTPException(status_code=404, detail="User not found")

    invalidate_user_cache(user_uuid)
    invalidate_list_users_cache()
    return {"message": "User deleted"}
@router.put(
    "/config/users/{user_uuid}/role",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required / cannot change own role"},
        404: {"description": "User not found"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.update_user_role")
async def api_update_user_role(
    user_uuid: str,
    req: UpdateUserRoleRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    """Change the role of a user other than the caller.

    Responds 403 when an admin targets their own account and 404 for an
    unknown user. Both user caches are invalidated on success.
    """
    is_self = user_uuid == admin["uuid"]
    if is_self:
        raise HTTPException(status_code=403, detail="Cannot change your own role")

    existing = await repo.get_user_by_uuid(user_uuid)
    if not existing:
        raise HTTPException(status_code=404, detail="User not found")

    await repo.update_user_role(user_uuid, req.role)
    invalidate_user_cache(user_uuid)
    invalidate_list_users_cache()
    return {"message": "User role updated"}
@router.put(
    "/config/users/{user_uuid}/reset-password",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "User not found"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.reset_user_password")
async def api_reset_user_password(
    user_uuid: str,
    req: ResetUserPasswordRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    """Set a new password for a user and force a change afterwards.

    Responds 404 for an unknown user. Both user caches are invalidated
    on success.
    """
    existing = await repo.get_user_by_uuid(user_uuid)
    if not existing:
        raise HTTPException(status_code=404, detail="User not found")

    hashed = await ahash_password(req.new_password)
    await repo.update_user_password(
        user_uuid,
        hashed,
        must_change_password=True,
    )
    invalidate_user_cache(user_uuid)
    invalidate_list_users_cache()
    return {"message": "Password reset successfully"}

View File

@@ -0,0 +1,29 @@
from fastapi import APIRouter, Depends, HTTPException
from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.db.models import PurgeResponse
from decnet.web.dependencies import require_admin, repo
router = APIRouter()
@router.delete(
    "/config/reinit",
    tags=["Configuration"],
    response_model=PurgeResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required or developer mode not enabled"},
    },
)
@_traced("api.reinit")
async def api_reinit(admin: dict = Depends(require_admin)) -> dict:
    """Purge logs and bounties; allowed only when developer mode is on.

    Responds 403 unless ``DECNET_DEVELOPER`` is set. Returns the counts
    reported by the repository under ``deleted``.
    """
    if DECNET_DEVELOPER:
        deleted = await repo.purge_logs_and_bounties()
        return {"message": "Data purged", "deleted": deleted}
    raise HTTPException(status_code=403, detail="Developer mode is not enabled")

View File

@@ -0,0 +1,50 @@
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest, MessageResponse
router = APIRouter()
@router.put(
    "/config/deployment-limit",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.update_deployment_limit")
async def api_update_deployment_limit(
    req: DeploymentLimitRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    """Persist a new deployment limit under the ``config_limits`` state.

    The cached ``config_limits`` entry that ``GET /config`` reads from is
    evicted after the write, so the new value shows up immediately
    instead of after the cache TTL.
    """
    # Function-scope import keeps this handler's module free of an
    # import-time dependency on the /config read module.
    from decnet.web.router.config.api_get_config import _state_cache

    await repo.set_state("config_limits", {"deployment_limit": req.deployment_limit})
    _state_cache.pop("config_limits", None)
    return {"message": "Deployment limit updated"}
@router.put(
    "/config/global-mutation-interval",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.update_global_mutation_interval")
async def api_update_global_mutation_interval(
    req: GlobalMutationIntervalRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    """Persist a new global mutation interval under ``config_globals``.

    The cached ``config_globals`` entry that ``GET /config`` reads from
    is evicted after the write, so the new value shows up immediately
    instead of after the cache TTL.
    """
    # Function-scope import keeps this handler's module free of an
    # import-time dependency on the /config read module.
    from decnet.web.router.config.api_get_config import _state_cache

    await repo.set_state(
        "config_globals",
        {"global_mutation_interval": req.global_mutation_interval},
    )
    _state_cache.pop("config_globals", None)
    return {"message": "Global mutation interval updated"}

View File

@@ -0,0 +1,74 @@
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import CredentialReuseResponse
router = APIRouter()
@router.get(
    "/credential-reuse",
    response_model=CredentialReuseResponse,
    tags=["Credentials"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_credential_reuse")
async def list_credential_reuse(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    min_target_count: int = Query(2, ge=2, le=2147483647),
    secret_kind: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paged list of credential-reuse findings ordered by target_count desc.

    Each row collapses every Credential capture sharing the same secret
    + principal across distinct (decky, service) pairs into a single
    finding with the union of attacker UUIDs/IPs and reach.

    A ``secret_kind`` of ``""``, ``"null"``, ``"NULL"`` or
    ``"undefined"`` is treated as no filter.
    """
    # Placeholder strings are treated the same as an absent filter.
    blank_values = (None, "null", "NULL", "undefined", "")
    kind_filter = None if secret_kind in blank_values else secret_kind

    count, findings = await repo.list_credential_reuses(
        limit=limit,
        offset=offset,
        min_target_count=min_target_count,
        secret_kind=kind_filter,
    )
    return {"total": count, "limit": limit, "offset": offset, "data": findings}
@router.get(
    "/credential-reuse/{reuse_id}",
    tags=["Credentials"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "CredentialReuse row not found"},
    },
)
@_traced("api.get_credential_reuse")
async def get_credential_reuse(
    reuse_id: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """One credential-reuse finding by UUID, or 404."""
    finding = await repo.get_credential_reuse_by_id(reuse_id)
    if finding is not None:
        return finding
    raise HTTPException(status_code=404, detail="credential_reuse not found")

View File

@@ -0,0 +1,103 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import CredentialsResponse
router = APIRouter()
# Mirror the Bounty cache pattern: the dashboard hits the unfiltered
# default page constantly. Filtered requests bypass — staleness matters
# when an operator is searching for a specific principal/IP.
_CRED_TTL = 5.0
_DEFAULT_LIMIT = 50
_DEFAULT_OFFSET = 0
_cred_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
_cred_lock: Optional[asyncio.Lock] = None
def _reset_credentials_cache() -> None:
    """Reset the default-page cache and its lock to their initial values."""
    global _cred_cache, _cred_lock
    _cred_cache, _cred_lock = (None, 0.0), None
async def _get_credentials_default_cached() -> dict[str, Any]:
    """Return the unfiltered default credentials page, cached for ``_CRED_TTL`` seconds.

    A fresh cache entry is returned without touching the lock. Otherwise
    callers serialise on ``_cred_lock`` and re-check the cache once inside
    it, so requests that queued behind a refresh reuse its result instead
    of querying the repository again.
    """
    global _cred_cache, _cred_lock

    page, stamp = _cred_cache
    if page is not None and time.monotonic() - stamp < _CRED_TTL:
        return page

    # The lock starts out as None and is created on first use.
    if _cred_lock is None:
        _cred_lock = asyncio.Lock()

    async with _cred_lock:
        page, stamp = _cred_cache
        if page is None or time.monotonic() - stamp >= _CRED_TTL:
            rows = await repo.get_credentials(
                limit=_DEFAULT_LIMIT,
                offset=_DEFAULT_OFFSET,
                search=None,
                service=None,
                attacker_ip=None,
            )
            count = await repo.get_total_credentials(
                search=None,
                service=None,
                attacker_ip=None,
            )
            page = {
                "total": count,
                "limit": _DEFAULT_LIMIT,
                "offset": _DEFAULT_OFFSET,
                "data": rows,
            }
            # Stamp only after both queries have completed.
            _cred_cache = (page, time.monotonic())
        return page
@router.get(
    "/credentials",
    response_model=CredentialsResponse,
    tags=["Credentials"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.get_credentials")
async def get_credentials(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    search: Optional[str] = None,
    service: Optional[str] = None,
    attacker_ip: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve captured credentials (deduped by attacker/decky/service/secret)."""

    # Placeholder strings are treated exactly like an absent filter.
    def _norm(v: Optional[str]) -> Optional[str]:
        return None if v in (None, "null", "NULL", "undefined", "") else v

    s, svc, aip = _norm(search), _norm(service), _norm(attacker_ip)

    # Only the unfiltered request for the default page is served from the
    # short-lived cache; anything else goes straight to the repository.
    unfiltered = s is None and svc is None and aip is None
    default_page = limit == _DEFAULT_LIMIT and offset == _DEFAULT_OFFSET
    if unfiltered and default_page:
        return await _get_credentials_default_cached()

    rows = await repo.get_credentials(
        limit=limit,
        offset=offset,
        search=s,
        service=svc,
        attacker_ip=aip,
    )
    count = await repo.get_total_credentials(
        search=s,
        service=svc,
        attacker_ip=aip,
    )
    return {
        "total": count,
        "limit": limit,
        "offset": offset,
        "data": rows,
    }

View File

@@ -1,14 +1,18 @@
import logging
import os
from fastapi import APIRouter, Depends, HTTPException
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT, log
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
from decnet.engine import deploy as _deploy
from decnet.ini_loader import load_ini_from_string
from decnet.network import detect_interface, detect_subnet, get_host_ip
from decnet.web.dependencies import get_current_user, repo
from decnet.web.db.models import DeployIniRequest
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeployIniRequest, DeployResponse
from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config
log = get_logger("api")
router = APIRouter()
@@ -16,15 +20,19 @@ router = APIRouter()
@router.post(
"/deckies/deploy",
tags=["Fleet Management"],
response_model=DeployResponse,
responses={
400: {"description": "Bad Request (e.g. malformed JSON)"},
401: {"description": "Could not validate credentials"},
403: {"description": "Insufficient permissions"},
409: {"description": "Configuration conflict (e.g. invalid IP allocation or network mismatch)"},
422: {"description": "Invalid INI config or schema validation error"},
500: {"description": "Deployment failed"}
500: {"description": "Deployment failed"},
502: {"description": "Partial swarm deploy failure — one or more worker hosts returned an error"},
}
)
async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]:
@_traced("api.deploy_deckies")
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
from decnet.fleet import build_deckies_from_ini
try:
@@ -38,16 +46,20 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
state_dict = await repo.get_state("deployment")
ingest_log_file = os.environ.get("DECNET_INGEST_LOG_FILE")
config: DecnetConfig | None = None
if state_dict:
config = DecnetConfig(**state_dict["config"])
subnet_cidr = ini.subnet or config.subnet
gateway = ini.gateway or config.gateway
host_ip = get_host_ip(config.interface)
iface = config.interface
host_ip = get_host_ip(iface)
# Always sync config log_file with current API ingestion target
if ingest_log_file:
config.log_file = ingest_log_file
else:
# If no state exists, we need to infer network details from the INI or the host.
# No state yet — infer network details from the INI or the host. We
# defer instantiating DecnetConfig until after build_deckies_from_ini
# because DecnetConfig.deckies has min_length=1.
try:
iface = ini.interface or detect_interface()
subnet_cidr, gateway = ini.subnet, ini.gateway
@@ -62,16 +74,6 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
detail=f"Network configuration conflict: {e}. "
"Add a [general] section with interface=, net=, and gw= to the INI."
)
config = DecnetConfig(
mode="unihost",
interface=iface,
subnet=subnet_cidr,
gateway=gateway,
deckies=[],
log_file=ingest_log_file,
ipvlan=False,
mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL
)
try:
new_decky_configs = build_deckies_from_ini(
@@ -81,26 +83,99 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
log.debug("deploy: build_deckies_from_ini rejected input: %s", e)
raise HTTPException(status_code=409, detail=str(e))
# Merge deckies
existing_deckies_map = {d.name: d for d in config.deckies}
for new_decky in new_decky_configs:
existing_deckies_map[new_decky.name] = new_decky
if config is None:
config = DecnetConfig(
mode="unihost",
interface=iface,
subnet=subnet_cidr,
gateway=gateway,
deckies=new_decky_configs,
log_file=ingest_log_file,
ipvlan=False,
mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL,
)
config.deckies = list(existing_deckies_map.values())
# The INI is the source of truth for *which* deckies exist this deploy.
# The old "merge with prior state" behaviour meant submitting `[decky1]`
# after a 3-decky run silently redeployed decky2/decky3 too — and then
# collided on their stale IPs ("Address already in use"). Full replace
# matches what the operator sees in the submitted config.
config.deckies = list(new_decky_configs)
# We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`.
limits_state = await repo.get_state("config_limits")
deployment_limit = limits_state.get("deployment_limit", 10) if limits_state else 10
if len(config.deckies) > deployment_limit:
raise HTTPException(
status_code=409,
detail=f"Deployment would result in {len(config.deckies)} deckies, "
f"exceeding the configured limit of {deployment_limit}",
)
# Auto-mode: if we're a master with at least one enrolled/active SWARM
# host, shard the deckies across those workers instead of spawning docker
# containers on the master itself. Round-robin assignment over deckies
# that don't already carry a host_uuid (state from a prior swarm deploy
# keeps its original assignment).
swarm_hosts: list[dict] = []
if os.environ.get("DECNET_MODE", "master").lower() == "master":
swarm_hosts = [
h for h in await repo.list_swarm_hosts()
if h.get("status") in ("active", "enrolled") and h.get("address")
]
if swarm_hosts:
# Carry-over from a prior deployment may reference a host_uuid that's
# since been decommissioned / re-enrolled at a new uuid. Drop any
# assignment that isn't in the currently-reachable set, then round-
# robin-fill the blanks — otherwise dispatch 404s on a dead uuid.
live_uuids = {h["uuid"] for h in swarm_hosts}
for d in config.deckies:
if d.host_uuid and d.host_uuid not in live_uuids:
d.host_uuid = None
unassigned = [d for d in config.deckies if not d.host_uuid]
for i, d in enumerate(unassigned):
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
config = config.model_copy(update={"mode": "swarm"})
try:
result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False)
except HTTPException:
raise
except Exception as e:
log.exception("swarm-auto deploy dispatch failed: %s", e)
raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.")
await repo.set_state("deployment", {
"config": config.model_dump(),
"compose_path": state_dict["compose_path"] if state_dict else "",
})
failed = [r for r in result.results if not r.ok]
if failed:
detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed)
raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}")
return {
"message": f"Deckies deployed across {len(result.results)} swarm host(s)",
"mode": "swarm",
}
# Unihost path — docker-compose on the master itself.
# NB: the JSON state file (decnet-state.json) and fleet_deckies DB rows
# are both written *inside* _deploy(config) — engine.deployer is the
# single shared sink for every fleet-creation path (CLI deploy, this
# unihost API path, and per-worker SWARM agent deploys). Do not
# duplicate save_state / fleet upserts here.
try:
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
_deploy(config)
# Persist new state to DB
new_state_payload = {
"config": config.model_dump(),
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
}
await repo.set_state("deployment", new_state_payload)
except Exception as e:
logging.getLogger("decnet.web.api").exception("Deployment failed: %s", e)
log.exception("Deployment failed: %s", e)
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
return {"message": "Deckies deployed successfully"}
return {"message": "Deckies deployed successfully", "mode": "unihost"}

View File

@@ -1,13 +1,48 @@
from typing import Any
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends
from decnet.web.dependencies import get_current_user, repo
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
# /deckies is full fleet inventory — polled by the UI and under locust.
# Fleet state changes on deploy/teardown (seconds to minutes); a 5s window
# collapses the read storm into one DB hit.
_DECKIES_TTL = 5.0  # seconds; compared against time.monotonic() deltas
# (inventory, time.monotonic() stamp taken when it was stored); a None
# inventory means nothing has been cached yet.
_deckies_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
# Starts as None; _get_deckies_cached creates the lock on first use.
_deckies_lock: Optional[asyncio.Lock] = None
def _reset_deckies_cache() -> None:
    """Drop the cached fleet inventory and forget the lock.

    Both module globals go back to their import-time values, so the next
    cached lookup refetches and creates a new lock.
    """
    global _deckies_cache, _deckies_lock
    _deckies_cache, _deckies_lock = (None, 0.0), None
async def _get_deckies_cached() -> list[dict[str, Any]]:
    """Return the fleet inventory, querying the repository at most once per TTL window.

    A fresh cache entry is returned without touching the lock. Otherwise
    callers serialise on ``_deckies_lock`` and re-check the cache inside
    it, so requests that queued behind a refresh reuse its result.
    """
    global _deckies_cache, _deckies_lock

    inventory, stamp = _deckies_cache
    if inventory is not None and time.monotonic() - stamp < _DECKIES_TTL:
        return inventory

    # The lock starts out as None and is created on first use.
    if _deckies_lock is None:
        _deckies_lock = asyncio.Lock()

    async with _deckies_lock:
        inventory, stamp = _deckies_cache
        if inventory is None or time.monotonic() - stamp >= _DECKIES_TTL:
            inventory = await repo.get_deckies()
            _deckies_cache = (inventory, time.monotonic())
        return inventory
@router.get("/deckies", tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
async def get_deckies(current_user: str = Depends(get_current_user)) -> list[dict[str, Any]]:
return await repo.get_deckies()
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
@_traced("api.get_deckies")
async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]:
return await _get_deckies_cached()

View File

@@ -1,8 +1,10 @@
import os
from fastapi import APIRouter, Depends, HTTPException, Path
from decnet.telemetry import traced as _traced
from decnet.mutator import mutate_decky
from decnet.web.dependencies import get_current_user, repo
from decnet.web.db.models import MessageResponse
from decnet.web.dependencies import require_admin, repo
router = APIRouter()
@@ -10,11 +12,18 @@ router = APIRouter()
@router.post(
"/deckies/{decky_name}/mutate",
tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 404: {"description": "Decky not found"}}
response_model=MessageResponse,
responses={
401: {"description": "Could not validate credentials"},
403: {"description": "Insufficient permissions"},
404: {"description": "Decky not found"},
422: {"description": "Path parameter validation error (decky_name must match ^[a-z0-9\\-]{1,64}$)"},
}
)
@_traced("api.mutate_decky")
async def api_mutate_decky(
decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"),
current_user: str = Depends(get_current_user),
admin: dict = Depends(require_admin),
) -> dict[str, str]:
if os.environ.get("DECNET_CONTRACT_TEST") == "true":
return {"message": f"Successfully mutated {decky_name} (Contract Test Mock)"}

View File

@@ -1,8 +1,9 @@
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.config import DecnetConfig
from decnet.web.dependencies import get_current_user, repo
from decnet.web.db.models import MutateIntervalRequest
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import MessageResponse, MutateIntervalRequest
router = APIRouter()
@@ -16,14 +17,17 @@ def _parse_duration(s: str) -> int:
@router.put("/deckies/{decky_name}/mutate-interval", tags=["Fleet Management"],
response_model=MessageResponse,
responses={
400: {"description": "Bad Request (e.g. malformed JSON)"},
401: {"description": "Could not validate credentials"},
403: {"description": "Insufficient permissions"},
404: {"description": "No active deployment or decky not found"},
422: {"description": "Validation error"}
},
)
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]:
@_traced("api.update_mutate_interval")
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
state_dict = await repo.get_state("deployment")
if not state_dict:
raise HTTPException(status_code=404, detail="No active deployment")

View File

View File

@@ -0,0 +1,151 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends
from fastapi.responses import ORJSONResponse
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import HealthResponse, ComponentHealth
router = APIRouter()
# Component names whose failure makes get_health report "unhealthy" (HTTP 503);
# a failing component outside this set only downgrades the result to "degraded".
_CRITICAL_SERVICES = {"database", "docker", "ingestion_worker"}
# Cache Docker client and health result to avoid hammering the Docker socket
_docker_client: Optional[Any] = None
_docker_healthy: bool = False
_docker_detail: str = ""  # str() of the last probe exception; empty after a successful ping
_docker_last_check: float = 0.0  # time.monotonic() reading taken just before the last probe
_DOCKER_CHECK_INTERVAL = 5.0  # seconds between actual Docker pings
# Cache DB liveness result — under load, every request was hitting
# repo.get_total_logs() and filling the aiosqlite queue.
_db_component: Optional[ComponentHealth] = None
_db_last_check: float = 0.0  # time.monotonic() reading taken after the last DB probe finished
# Lazy-init — an asyncio.Lock bound to a dead event loop deadlocks any
# later test running under a fresh loop. Create on first use.
_db_lock: Optional[asyncio.Lock] = None
_DB_CHECK_INTERVAL = 1.0  # seconds
def _reset_docker_cache() -> None:
    """Restore the cached Docker probe state to its import-time values (test helper)."""
    global _docker_client, _docker_healthy, _docker_detail, _docker_last_check
    _docker_client, _docker_healthy = None, False
    _docker_detail, _docker_last_check = "", 0.0
def _reset_db_cache() -> None:
    """Restore the cached DB liveness state, including its lock, to import-time values (test helper)."""
    global _db_component, _db_last_check, _db_lock
    _db_component, _db_last_check, _db_lock = None, 0.0, None
async def _check_database_cached() -> ComponentHealth:
    """Return database liveness, probing at most once per ``_DB_CHECK_INTERVAL`` seconds.

    The probe is ``repo.get_total_logs()``: success is recorded as ``ok``,
    any exception as ``failing`` with the exception text as detail. A fresh
    cached result is returned without taking the lock; otherwise callers
    serialise on ``_db_lock`` and re-check inside it.
    """
    global _db_component, _db_last_check, _db_lock

    if (
        _db_component is not None
        and time.monotonic() - _db_last_check < _DB_CHECK_INTERVAL
    ):
        return _db_component

    # The lock starts out as None and is created on first use.
    if _db_lock is None:
        _db_lock = asyncio.Lock()

    async with _db_lock:
        expired = (
            _db_component is None
            or time.monotonic() - _db_last_check >= _DB_CHECK_INTERVAL
        )
        if expired:
            try:
                await repo.get_total_logs()
                _db_component = ComponentHealth(status="ok")
            except Exception as exc:
                _db_component = ComponentHealth(status="failing", detail=str(exc))
            # Failures are cached too, for the same interval as successes.
            _db_last_check = time.monotonic()
        return _db_component
@router.get(
    "/health",
    response_model=HealthResponse,
    tags=["Observability"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        503: {"model": HealthResponse, "description": "System unhealthy"},
    },
)
@_traced("api.get_health")
async def get_health(user: dict = Depends(require_viewer)) -> Any:
    # Collects per-component liveness (database, background workers, Docker
    # daemon) into one HealthResponse. Answers 503 when a critical component
    # is failing and 200 otherwise.
    global _docker_client, _docker_healthy, _docker_detail, _docker_last_check

    # 1. Database (cached — avoids a DB round-trip per request)
    report: dict[str, ComponentHealth] = {
        "database": await _check_database_cached(),
    }

    # 2. Background workers
    from decnet.web.api import get_background_tasks

    for worker, handle in get_background_tasks().items():
        if handle is None:
            report[worker] = ComponentHealth(status="failing", detail="not started")
            continue
        if not handle.done():
            report[worker] = ComponentHealth(status="ok")
            continue
        # A finished worker task counts as failing, whatever the reason.
        if handle.cancelled():
            reason = "cancelled"
        else:
            err = handle.exception()
            reason = f"exited: {err}" if err else "exited unexpectedly"
        report[worker] = ComponentHealth(status="failing", detail=reason)

    # 3. Docker daemon (cached — avoids creating a new client per request)
    probe_started = time.monotonic()
    if probe_started - _docker_last_check > _DOCKER_CHECK_INTERVAL:
        try:
            import docker

            if _docker_client is None:
                _docker_client = await asyncio.to_thread(docker.from_env)
            await asyncio.to_thread(_docker_client.ping)
            _docker_healthy = True
            _docker_detail = ""
        except Exception as exc:
            # Drop the client so the next probe builds a new one.
            _docker_client = None
            _docker_healthy = False
            _docker_detail = str(exc)
        _docker_last_check = probe_started

    report["docker"] = (
        ComponentHealth(status="ok")
        if _docker_healthy
        else ComponentHealth(status="failing", detail=_docker_detail)
    )

    # Overall status tiers:
    #   healthy   — every component ok
    #   degraded  — only non-critical components failing (service usable,
    #               falls back to cache or skips non-essential work)
    #   unhealthy — a critical component (db, docker, ingestion) failing;
    #               survival depends on caches
    failing = {name for name, comp in report.items() if comp.status == "failing"}
    if failing & _CRITICAL_SERVICES:
        overall = "unhealthy"
    elif failing:
        overall = "degraded"
    else:
        overall = "healthy"

    body = HealthResponse(status=overall, components=report)
    return ORJSONResponse(
        content=body.model_dump(),
        status_code=503 if overall == "unhealthy" else 200,
    )

View File

View File

@@ -0,0 +1,143 @@
"""SSE stream of identity-resolution events — one connection per viewer.
Subscribes to ``identity.>`` on the :class:`~decnet.bus.base.BaseBus` for
the duration of the request and forwards each matching bus event as a
Server-Sent Event to the browser. Emits a one-shot snapshot on connect
(current paginated identity list) so the client doesn't need a separate
fetch to initialise.
Authorization mirrors :mod:`decnet.web.router.topology.api_events` — a
JWT passed via the ``?token=`` query parameter (EventSource can't set
arbitrary headers) + ``require_stream_viewer`` role gate.
The endpoint is broadly scoped (every identity event, not per-uuid)
because both ``AttackerDetail`` and ``IdentityDetail`` need the same
firehose: a bare ``AttackerDetail`` watches for ``identity.formed``
events that finally bind its ``identity_id``, and ``IdentityDetail``
watches for ``observation.linked`` / ``merged`` / ``unmerged`` against
the identity it's rendering. A per-uuid filter would force the client
to know its identity before subscribing, which it doesn't always.
"""
from __future__ import annotations
import asyncio
from typing import AsyncGenerator
import orjson
from fastapi import APIRouter, Depends, Request
from fastapi.responses import StreamingResponse
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_stream_viewer
from decnet.web.sse_limits import sse_connection_slot
log = get_logger("api.identities.events")
router = APIRouter()
# Longest the stream waits for a bus event (or sleeps, when there is no bus)
# before emitting a ": keepalive" comment.
_KEEPALIVE_SECS = 15.0
# Page size of the identity list sent as the one-shot "snapshot" event on connect.
_SNAPSHOT_LIMIT = 50
def _format_sse(event_name: str, data: dict) -> str:
    """Render one Server-Sent Event frame: an ``event:`` line, a ``data:`` line
    holding *data* serialised with orjson, and the terminating blank line."""
    return (
        f"event: {event_name}\n"
        f"data: {orjson.dumps(data).decode()}\n"
        "\n"
    )
@router.get(
    "/identities/events",
    tags=["Identity Resolution"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of identity-resolution events",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.identities.events")
async def api_identities_events(
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    # Event types emitted: snapshot, formed, observation.linked,
    # merged, unmerged. All wrap bus events whose payload is also
    # reachable via viewer-gated REST (GET /identities/*).
    # The snapshot is fetched once, before streaming starts; the generator
    # below closes over it.
    snapshot = await repo.list_identities(limit=_SNAPSHOT_LIMIT, offset=0)

    async def generator() -> AsyncGenerator[str, None]:
        # The slot is held for the whole lifetime of the stream.
        # NOTE(review): presumably this enforces the per-user cap advertised
        # as 429 above — confirm in decnet.web.sse_limits.
        async with sse_connection_slot(user["uuid"]):
            # Lines starting with ":" are SSE comments; this one goes out
            # immediately, ahead of the snapshot.
            yield ": keepalive\n\n"
            yield _format_sse("snapshot", {"identities": snapshot})
            bus = await get_app_bus()
            if bus is None:
                # Bus disabled / unreachable — keep the connection
                # alive so the client doesn't reconnect-storm; it can
                # re-poll the REST API on its own timer.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        break
                    yield ": keepalive\n\n"
                return
            # Subscribe to every topic under the identity prefix.
            sub = bus.subscribe(f"{_topics.IDENTITY}.>")
            try:
                async with sub:
                    sub_iter = sub.__aiter__()
                    while True:
                        if await request.is_disconnected():
                            break
                        # Wait for the next bus event, but wake up after
                        # _KEEPALIVE_SECS to emit a keepalive and re-check
                        # for a client disconnect.
                        next_task = asyncio.ensure_future(sub_iter.__anext__())
                        try:
                            event = await asyncio.wait_for(
                                next_task, timeout=_KEEPALIVE_SECS,
                            )
                        except asyncio.TimeoutError:
                            # NOTE(review): every timeout cancels an in-flight
                            # __anext__() — verify the subscription iterator
                            # tolerates that without dropping an event or
                            # terminating.
                            next_task.cancel()
                            yield ": keepalive\n\n"
                            continue
                        except StopAsyncIteration:
                            # Subscription exhausted; end the stream.
                            break
                        yield _format_sse(
                            _sse_name_for(event.topic),
                            {
                                "topic": event.topic,
                                "type": event.type,
                                "ts": event.ts,
                                "payload": event.payload,
                            },
                        )
            except asyncio.CancelledError:
                # Cancelled mid-stream; finish the generator without logging.
                pass
            except Exception:
                # Log the failure server-side and send the client a generic
                # "error" event as the last frame.
                log.exception("identity events stream crashed")
                yield _format_sse("error", {"message": "Stream interrupted"})

    return StreamingResponse(
        generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
def _sse_name_for(topic: str) -> str:
    """Map a bus topic to the SSE ``event:`` name sent to the browser.

    The leading ``<_topics.IDENTITY>.`` segment is stripped and the rest is
    kept verbatim, dots included, so the frontend can switch on a stable
    name:

    ``identity.formed`` → ``formed``
    ``identity.observation.linked`` → ``observation.linked``

    A topic without that prefix is returned unchanged.
    """
    return topic.removeprefix(f"{_topics.IDENTITY}.")

View File

@@ -0,0 +1,44 @@
"""GET /api/v1/identities/{uuid} — single identity row.
Soft-merge handling: if the requested UUID has merged_into_uuid set,
the repository follows the chain and returns the winner. Callers always
receive the canonical identity for any UUID that has ever been part of
the merge tree.
Returns 404 against an empty/unknown UUID — expected response while the
clusterer hasn't run yet.
"""
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/identities/{uuid}",
    tags=["Identity Resolution"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Identity not found"},
    },
)
@_traced("api.get_identity_detail")
async def get_identity_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    # Look up one identity row; a falsy result is reported as 404.
    record = await repo.get_identity_by_uuid(uuid)
    if not record:
        raise HTTPException(status_code=404, detail="Identity not found")

    # Cheap aggregates the IdentityDetail page surfaces. Counted off the
    # FK rather than maintained in observation_count so the answer is
    # always live (the denormalized field can lag the clusterer briefly).
    # The count is keyed on the uuid of the returned row, not the requested one.
    live_count = await repo.count_observations_for_identity(record["uuid"])
    record["observation_count_live"] = live_count
    return record

View File

@@ -0,0 +1,35 @@
"""GET /api/v1/identities — paginated list of resolved identities.
Returns an empty list while the clusterer hasn't run yet (the
identities table ships empty in the schema-only PR). See
development/IDENTITY_RESOLUTION.md.
"""
from typing import Any
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/identities",
    tags=["Identity Resolution"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_identities")
async def list_identities(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paginated identity list, newest-updated first."""
    # The page and the total come from two separate repository calls.
    page = await repo.list_identities(limit=limit, offset=offset)
    count = await repo.count_identities()
    return dict(total=count, limit=limit, offset=offset, data=page)

View File

@@ -0,0 +1,48 @@
"""GET /api/v1/identities/{uuid}/observations — observations for an identity.
Returns the per-IP ``Attacker`` rows whose ``identity_id`` FK points at
this identity. The shape mirrors ``AttackersResponse`` so the frontend
can reuse the same row component as the main attackers list.
Empty result while the clusterer hasn't linked any observations yet.
"""
from typing import Any
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/identities/{uuid}/observations",
    tags=["Identity Resolution"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Identity not found"},
    },
)
@_traced("api.list_identity_observations")
async def list_identity_observations(
    uuid: str,
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    # 404 if the identity itself doesn't exist. Otherwise return the
    # observations linked to it (which may be empty — a freshly-formed
    # identity briefly has no observations yet from the FK side).
    resolved = await repo.get_identity_by_uuid(uuid)
    if not resolved:
        raise HTTPException(status_code=404, detail="Identity not found")

    # If the requested uuid was merged, return observations under the
    # winner's uuid (which is what get_identity_by_uuid resolves to).
    winner_uuid = resolved["uuid"]
    rows = await repo.list_observations_for_identity(
        winner_uuid,
        limit=limit,
        offset=offset,
    )
    count = await repo.count_observations_for_identity(winner_uuid)
    return dict(total=count, limit=limit, offset=offset, data=rows)

View File

@@ -1,20 +1,58 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.web.dependencies import get_current_user, repo
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
# /logs/histogram aggregates over the full logs table — expensive and
# polled constantly by the UI. Cache only the unfiltered default call
# (which is what the UI and locust hit); any filter bypasses.
_HISTOGRAM_TTL = 5.0  # seconds; compared against time.monotonic() deltas
# Bucket width (minutes) of the cached call; get_logs_histogram declares the
# same number as the default of its interval_minutes query parameter.
_DEFAULT_INTERVAL = 15
# (buckets, time.monotonic() stamp taken when they were stored); None buckets
# means nothing has been cached yet.
_histogram_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
# Starts as None; _get_histogram_cached creates the lock on first use.
_histogram_lock: Optional[asyncio.Lock] = None
def _reset_histogram_cache() -> None:
    """Drop the cached default histogram and forget the lock.

    Both module globals go back to their import-time values, so the next
    cached lookup refetches and creates a new lock.
    """
    global _histogram_cache, _histogram_lock
    _histogram_cache, _histogram_lock = (None, 0.0), None
async def _get_histogram_cached() -> list[dict[str, Any]]:
    """Return the unfiltered default-interval log histogram, cached for ``_HISTOGRAM_TTL`` seconds.

    A fresh cache entry is returned without touching the lock. Otherwise
    callers serialise on ``_histogram_lock`` and re-check the cache inside
    it, so requests that queued behind a refresh reuse its result.
    """
    global _histogram_cache, _histogram_lock

    buckets, stamp = _histogram_cache
    if buckets is not None and time.monotonic() - stamp < _HISTOGRAM_TTL:
        return buckets

    # The lock starts out as None and is created on first use.
    if _histogram_lock is None:
        _histogram_lock = asyncio.Lock()

    async with _histogram_lock:
        buckets, stamp = _histogram_cache
        if buckets is None or time.monotonic() - stamp >= _HISTOGRAM_TTL:
            buckets = await repo.get_log_histogram(
                search=None,
                start_time=None,
                end_time=None,
                interval_minutes=_DEFAULT_INTERVAL,
            )
            _histogram_cache = (buckets, time.monotonic())
        return buckets
@router.get("/logs/histogram", tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
@_traced("api.get_logs_histogram")
async def get_logs_histogram(
search: Optional[str] = None,
start_time: Optional[str] = Query(None),
end_time: Optional[str] = Query(None),
interval_minutes: int = Query(15, ge=1),
current_user: str = Depends(get_current_user)
user: dict = Depends(require_viewer)
) -> list[dict[str, Any]]:
def _norm(v: Optional[str]) -> Optional[str]:
if v in (None, "null", "NULL", "undefined", ""):
@@ -25,4 +63,6 @@ async def get_logs_histogram(
st = _norm(start_time)
et = _norm(end_time)
if s is None and st is None and et is None and interval_minutes == _DEFAULT_INTERVAL:
return await _get_histogram_cached()
return await repo.get_log_histogram(search=s, start_time=st, end_time=et, interval_minutes=interval_minutes)

View File

@@ -1,22 +1,57 @@
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.web.dependencies import get_current_user, repo
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import LogsResponse
router = APIRouter()
# Cache the unfiltered total-logs count. Filtered counts bypass the cache
# (rare, freshness matters for search). SELECT count(*) FROM logs is a
# full scan and gets hammered by paginating clients.
_TOTAL_TTL = 2.0  # seconds; compared against time.monotonic() deltas
# (count, time.monotonic() stamp taken when it was stored); a None count means
# nothing has been cached yet (a count of 0 is a valid cached value).
_total_cache: tuple[Optional[int], float] = (None, 0.0)
# Starts as None; _get_total_logs_cached creates the lock on first use.
_total_lock: Optional[asyncio.Lock] = None
def _reset_total_cache() -> None:
    """Drop the cached unfiltered log count and forget the lock.

    Both module globals go back to their import-time values, so the next
    cached lookup refetches and creates a new lock.
    """
    global _total_cache, _total_lock
    _total_cache, _total_lock = (None, 0.0), None
async def _get_total_logs_cached() -> int:
    """Return the unfiltered log count, querying the repository at most once per ``_TOTAL_TTL`` seconds.

    A fresh cache entry is returned without touching the lock. Otherwise
    callers serialise on ``_total_lock`` and re-check the cache inside it,
    so requests that queued behind a refresh reuse its result.
    """
    global _total_cache, _total_lock

    count, stamp = _total_cache
    # Compare against None explicitly: a cached count of 0 is still a hit.
    if count is not None and time.monotonic() - stamp < _TOTAL_TTL:
        return count

    # The lock starts out as None and is created on first use.
    if _total_lock is None:
        _total_lock = asyncio.Lock()

    async with _total_lock:
        count, stamp = _total_cache
        if count is None or time.monotonic() - stamp >= _TOTAL_TTL:
            count = await repo.get_total_logs()
            _total_cache = (count, time.monotonic())
        return count
@router.get("/logs", response_model=LogsResponse, tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}})
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}})
@_traced("api.get_logs")
async def get_logs(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),
search: Optional[str] = Query(None, max_length=512),
start_time: Optional[str] = Query(None),
end_time: Optional[str] = Query(None),
current_user: str = Depends(get_current_user)
user: dict = Depends(require_viewer)
) -> dict[str, Any]:
def _norm(v: Optional[str]) -> Optional[str]:
if v in (None, "null", "NULL", "undefined", ""):
@@ -28,7 +63,10 @@ async def get_logs(
et = _norm(end_time)
_logs: list[dict[str, Any]] = await repo.get_logs(limit=limit, offset=offset, search=s, start_time=st, end_time=et)
_total: int = await repo.get_total_logs(search=s, start_time=st, end_time=et)
if s is None and st is None and et is None:
_total: int = await _get_total_logs_cached()
else:
_total = await repo.get_total_logs(search=s, start_time=st, end_time=et)
return {
"total": _total,
"limit": limit,

View File

@@ -0,0 +1,123 @@
"""SSE stream of orchestrator events.
Subscribes to ``orchestrator.>`` for the duration of the request and
forwards each event as a Server-Sent Event. Emits a one-shot snapshot
on connect (latest 50 rows).
Mirror of :mod:`decnet.web.router.campaigns.api_events`.
"""
from __future__ import annotations
import asyncio
from typing import AsyncGenerator
import orjson
from fastapi import APIRouter, Depends, Request
from fastapi.responses import StreamingResponse
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_stream_viewer
from decnet.web.sse_limits import sse_connection_slot
log = get_logger("api.orchestrator.events")
router = APIRouter()
_KEEPALIVE_SECS = 15.0
_SNAPSHOT_LIMIT = 50
def _format_sse(event_name: str, data: dict) -> str:
    """Render one Server-Sent Event frame.

    The frame is an ``event:`` line carrying *event_name*, a ``data:``
    line carrying *data* serialised with ``orjson``, and the blank line
    that terminates the frame.
    """
    encoded = orjson.dumps(data).decode()
    return f"event: {event_name}\ndata: {encoded}\n\n"
@router.get(
    "/orchestrator/events/stream",
    tags=["Orchestrator"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of orchestrator events",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.orchestrator.events")
async def api_orchestrator_events(
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    """Stream orchestrator events to the client as Server-Sent Events.

    The response body is produced by an inner async generator that:

    1. emits a ``: keepalive`` comment frame,
    2. emits one ``snapshot`` event holding up to ``_SNAPSHOT_LIMIT`` rows
       from ``repo.list_orchestrator_events``,
    3. forwards each bus event from ``<ORCHESTRATOR>.>`` as its own SSE
       event, emitting a keepalive whenever ``_KEEPALIVE_SECS`` pass
       without one.

    When ``get_app_bus()`` returns ``None`` the generator only emits
    keepalives until the client disconnects.
    """
    # The snapshot is fetched here, in the request coroutine, before the
    # generator body (and therefore before the SSE slot) is entered.
    snapshot = await repo.list_orchestrator_events(
        limit=_SNAPSHOT_LIMIT, offset=0,
    )

    async def generator() -> AsyncGenerator[str, None]:
        # Everything below runs while holding this user's SSE slot.
        async with sse_connection_slot(user["uuid"]):
            yield ": keepalive\n\n"
            yield _format_sse("snapshot", {"events": snapshot})
            bus = await get_app_bus()
            if bus is None:
                # No bus: nothing to forward, so just keep the connection
                # alive until the client goes away.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        break
                    yield ": keepalive\n\n"
                return
            sub = bus.subscribe(f"{_topics.ORCHESTRATOR}.>")
            try:
                async with sub:
                    sub_iter = sub.__aiter__()
                    while True:
                        if await request.is_disconnected():
                            break
                        # Wrap the next-event fetch in a task so it can be
                        # bounded by the keepalive timeout.
                        next_task = asyncio.ensure_future(sub_iter.__anext__())
                        try:
                            event = await asyncio.wait_for(
                                next_task, timeout=_KEEPALIVE_SECS,
                            )
                        except asyncio.TimeoutError:
                            # NOTE(review): the pending ``__anext__`` is
                            # cancelled here. If the subscription iterator
                            # is an async generator, cancellation would
                            # finish it and the next loop pass would end
                            # the stream — confirm the bus subscription
                            # tolerates cancelled ``__anext__`` calls.
                            next_task.cancel()
                            yield ": keepalive\n\n"
                            continue
                        except StopAsyncIteration:
                            # Subscription exhausted: end the stream.
                            break
                        yield _format_sse(
                            _sse_name_for(event.topic),
                            {
                                "topic": event.topic,
                                "type": event.type,
                                "ts": event.ts,
                                "payload": event.payload,
                            },
                        )
            except asyncio.CancelledError:
                # Cancellation ends the stream quietly.
                pass
            except Exception:
                # Any other failure is logged and reported to the client as
                # a generic ``error`` event; no exception detail is sent.
                log.exception("orchestrator events stream crashed")
                yield _format_sse("error", {"message": "Stream interrupted"})

    return StreamingResponse(
        generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
def _sse_name_for(topic: str) -> str:
    """Derive the SSE event name from a bus topic.

    For a dotted topic whose first segment equals
    ``_topics.ORCHESTRATOR`` the second segment is returned
    (``<root>.traffic.<uuid>`` → ``traffic``). Any other topic is
    returned unchanged.
    """
    root, dot, remainder = topic.partition(".")
    if dot and root == _topics.ORCHESTRATOR:
        # Second segment only; anything after the next dot is dropped.
        return remainder.split(".", 1)[0]
    return topic

View File

@@ -0,0 +1,87 @@
"""GET /api/v1/orchestrator/events — paginated orchestrator activity.
Two underlying tables back this endpoint:
* ``orchestrator_events`` — SSH traffic + file ops (kind = ``traffic``, ``file``)
* ``orchestrator_emails`` — emailgen-generated EMLs (kind = ``email``)
When the caller filters ``kind=email`` we dispatch to the emails table
and adapt rows into the same wire shape the dashboard already renders.
The mapping is:
* ``action`` ← email subject
* ``src_decky_uuid`` ← sender_email
* ``dst_decky_uuid`` ← recipient_email
* ``protocol`` ← ``"smtp"``
* email-specific fields (``thread_id``, ``language``, ``mail_decky_uuid``,
``message_id``, ``in_reply_to``) ride along as top-level keys for the
inspector / future per-email views; the existing event renderer
ignores anything it doesn't recognise.
Mirrors :mod:`decnet.web.router.campaigns.api_list_campaigns`. The
orchestrator + emailgen workers are the sole writers; this surface is
read-only.
"""
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
def _adapt_email_row(e: dict[str, Any]) -> dict[str, Any]:
    """Project an email row onto the event wire shape.

    The generic event fields come first (``kind`` fixed to ``"email"``,
    ``protocol`` to ``"smtp"``, subject as ``action``, sender/recipient
    as the src/dst identifiers), followed by the email-only fields copied
    through under their own names. Missing keys become ``None``, except
    ``action`` (``""``), ``payload`` (``"{}"``) and ``success``
    (``False``).
    """
    adapted: dict[str, Any] = {
        "uuid": e.get("uuid"),
        "ts": e.get("ts"),
        "kind": "email",
        "protocol": "smtp",
        "action": e.get("subject", ""),
        "src_decky_uuid": e.get("sender_email"),
        "dst_decky_uuid": e.get("recipient_email"),
        "success": bool(e.get("success")),
        "payload": e.get("payload", "{}"),
    }
    # Email-specific extras (renderer keys off ``kind == 'email'``).
    passthrough = (
        "subject",
        "sender_email",
        "recipient_email",
        "language",
        "thread_id",
        "mail_decky_uuid",
        "message_id",
        "in_reply_to",
    )
    for field in passthrough:
        adapted[field] = e.get(field)
    return adapted
@router.get(
    "/orchestrator/events",
    tags=["Orchestrator"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_orchestrator_events")
async def list_orchestrator_events(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    kind: Optional[str] = Query(None, pattern="^(traffic|file|email)$"),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return one page of orchestrator activity.

    ``kind=email`` is served from the email listing/count calls, with
    each row reshaped by ``_adapt_email_row``. Every other value
    (including no filter) is served from the event listing/count calls,
    with ``kind`` passed through as the filter.

    Returns a dict with ``total``, ``limit``, ``offset`` and ``data``.
    """
    if kind != "email":
        rows = await repo.list_orchestrator_events(
            limit=limit, offset=offset, kind=kind,
        )
        count = await repo.count_orchestrator_events(kind=kind)
        return {"total": count, "limit": limit, "offset": offset, "data": rows}

    email_rows = await repo.list_orchestrator_emails(limit=limit, offset=offset)
    count = await repo.count_orchestrator_emails()
    adapted = [_adapt_email_row(item) for item in email_rows]
    return {"total": count, "limit": limit, "offset": offset, "data": adapted}

View File

View File

@@ -0,0 +1,115 @@
"""GET/PUT ``/api/v1/realism/config`` — operator-tunable realism knobs.
Today only the planner's content-class weights + canary probability
are exposed. The wire shape mirrors what
:func:`decnet.realism.planner.current_payload` produces and
:func:`decnet.realism.planner.apply_payload` consumes.
Reads accept viewer; writes are admin (writes mutate sampling
behaviour across the whole orchestrator fleet, same trust level as
the persona-pool surface).
The orchestrator worker periodically re-loads from the
``realism_config`` table; the API process applies overrides locally
on PUT so the GET-after-PUT round-trip reflects the change without
waiting for the orchestrator's next refresh tick.
"""
from __future__ import annotations
import json
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.realism import planner
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_admin, require_viewer
router = APIRouter()
log = get_logger("api.realism.config")
_CONFIG_KEY = "weights"
@router.get(
    "/realism/config",
    tags=["Realism"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.get_config")
async def get_config(
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the planner config after re-applying the persisted row.

    On every call the row stored under ``_CONFIG_KEY`` is read from the
    repository. If it exists and its ``value`` decodes to a JSON object,
    that object is applied to the in-process planner via
    ``planner.apply_payload``. The response is always
    ``planner.current_payload()``.

    If decoding or applying the stored value raises ``ValueError``
    (``json.JSONDecodeError`` is a subclass), a warning is logged and
    whatever the planner currently holds is returned.
    """
    # Hydrate from the DB on each GET — there is no "first call only"
    # guard — so the response follows the persisted row rather than just
    # this process's in-memory state or the baked-in defaults.
    row = await repo.get_realism_config(_CONFIG_KEY)
    if row is not None:
        try:
            # A missing/empty value is treated as an empty object.
            stored = json.loads(row.get("value") or "{}")
            # Non-object JSON (list, number, ...) is silently ignored.
            if isinstance(stored, dict):
                planner.apply_payload(stored)
        except (json.JSONDecodeError, ValueError) as exc:
            log.warning(
                "api.realism.get_config: stored payload invalid, "
                "serving defaults: %s", exc,
            )
    return planner.current_payload()
@router.put(
    "/realism/config",
    tags=["Realism"],
    responses={
        400: {"description": "Invalid config payload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.put_config")
async def put_config(
    body: dict[str, Any],
    user: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Apply a planner config payload and persist the resulting state.

    The body is handed to ``planner.apply_payload``; a ``ValueError``
    from it is reported as HTTP 400 with the error text as the detail.
    On success the planner's full current payload (not the raw request
    body) is JSON-encoded and stored under ``_CONFIG_KEY``, an audit
    line is logged, and that same payload is returned.

    Raises:
        HTTPException: 400 if *body* is not a dict or the planner
            rejects it.
    """
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="body must be an object")

    try:
        planner.apply_payload(body)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Persist the planner's own view after applying the body, so the
    # stored row matches the in-memory state.
    merged = planner.current_payload()
    encoded = json.dumps(merged)
    await repo.set_realism_config(_CONFIG_KEY, encoded)

    actor = user.get("username", user.get("uuid"))
    log.info(
        "api.realism.put_config user=%s canary_probability=%.4f",
        actor,
        merged["canary_probability"],
    )
    return merged

View File

@@ -0,0 +1,143 @@
"""GET/PUT ``/api/v1/realism/personas`` — global persona pool CRUD.
The "global pool" is a JSON file consumed by the realism content
engine for fleet (MACVLAN/IPVLAN) and SWARM-shard deckies — see
:mod:`decnet.realism.personas_pool`. MazeNET topology deckies use
``Topology.email_personas`` instead and are configured per-topology
elsewhere.
This endpoint is the API surface behind the dashboard's "Persona
Generation" page. Reads accept admin or viewer; writes are admin-only
because the persistence target is a config file the worker reads on
its hot path.
Concurrency: last-write-wins. The pool is operator-curated and small
(<50 entries typically); the cost of a stronger model isn't justified.
"""
from __future__ import annotations
import json
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.realism import personas_pool as global_pool
from decnet.realism.personas import EmailPersona, parse_personas
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, require_viewer
from decnet.web.db.models.common import MessageResponse # noqa: F401 - response shape
router = APIRouter()
log = get_logger("api.realism.personas")
def _serialize(personas: list[EmailPersona]) -> list[dict[str, Any]]:
    """Dump each persona model to a plain dict for the JSON body.

    ``exclude_none=False`` is passed explicitly to every ``model_dump``
    call. Order follows the input list.
    """
    return [persona.model_dump(exclude_none=False) for persona in personas]
@router.get(
    "/realism/personas",
    tags=["Emailgen"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.list_personas")
async def list_personas(
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the global persona pool and the path it was resolved to.

    Response keys: ``path`` (stringified ``global_pool.resolve_path()``)
    and ``personas`` (the loaded pool, serialised to plain dicts).
    """
    # Drop the in-process cache first so the load below does not serve
    # a cached copy of the pool.
    global_pool.reset_cache()
    pool = global_pool.load()
    pool_path = global_pool.resolve_path()
    return {"path": str(pool_path), "personas": _serialize(pool)}
@router.put(
    "/realism/personas",
    tags=["Emailgen"],
    responses={
        400: {"description": "Invalid persona payload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.replace_personas")
async def replace_personas(
    body: dict[str, Any],
    user: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Overwrite the global persona pool file with the submitted list.

    Expected body: ``{"personas": [...]}``. The list is passed through
    ``parse_personas`` and whatever it returns is what gets written as
    indented UTF-8 JSON to ``global_pool.resolve_path()``; the parent
    directory is created if missing. The in-process pool cache is reset
    after a successful write.

    Returns:
        ``{"path": <destination>, "personas": <written entries>}``.

    Raises:
        HTTPException: 400 when ``personas`` is not a list, or when a
            non-empty list yields no parsed personas; 500 when the file
            cannot be written (``OSError``).
    """
    submitted = body.get("personas")
    if not isinstance(submitted, list):
        raise HTTPException(
            status_code=400,
            detail="body.personas must be a list",
        )

    accepted = parse_personas(submitted)
    # A non-empty submission where nothing parsed is rejected instead of
    # writing an empty pool. An explicitly empty list is still allowed.
    if submitted and not accepted:
        raise HTTPException(
            status_code=400,
            detail=(
                "All persona entries failed validation. Required fields: "
                "name, email (user@host.tld), role, tone, mannerisms."
            ),
        )

    dest = global_pool.resolve_path()
    try:
        dest.parent.mkdir(parents=True, exist_ok=True)
        document = json.dumps(_serialize(accepted), indent=2, ensure_ascii=False)
        dest.write_text(document, encoding="utf-8")
    except OSError as exc:
        # Filesystem failures become a 500 with an actionable hint
        # instead of an unhandled traceback.
        log.warning(
            "api.realism.replace_personas write failed path=%s err=%s",
            dest, exc,
        )
        hint = (
            f"Could not write persona pool at {dest}: {exc.strerror or exc}. "
            f"Set DECNET_EMAILGEN_PERSONAS to a writable path "
            f"(e.g. ~/.decnet/email_personas.json) and restart the API."
        )
        raise HTTPException(status_code=500, detail=hint) from exc

    global_pool.reset_cache()
    actor = user.get("username", user.get("uuid"))
    log.info(
        "api.realism.replace_personas user=%s wrote=%d path=%s",
        actor, len(accepted), dest,
    )
    return {"path": str(dest), "personas": _serialize(accepted)}

View File

@@ -0,0 +1,99 @@
"""GET ``/api/v1/realism/synthetic-files`` — browse planted realism files.
The orchestrator's realism worker grows synthetic files on each decky
(notes, TODOs, drafts, scripts, log lines, canary artifacts). The
:class:`~decnet.web.db.models.realism.SyntheticFile` table is the
canonical record of what's been planted where; this endpoint lets
operators inspect the lineage without ssh'ing into a decky.
Read-only. No writes — the orchestrator is the sole writer; the
dashboard is an observation surface only.
The body preview (``last_body``) is repo-clipped at 64 KB
(:data:`SYNTHETIC_FILE_BODY_LIMIT`); when the original was larger the
detail response carries ``truncated: true`` so the operator knows what
they're looking at.
"""
from __future__ import annotations
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.db.models.realism import SYNTHETIC_FILE_BODY_LIMIT
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/realism/synthetic-files",
    tags=["Realism"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.realism.list_synthetic_files")
async def list_synthetic_files(
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0, le=2147483647),
    decky_uuid: Optional[str] = Query(None, max_length=64),
    persona: Optional[str] = Query(None, max_length=128),
    content_class: Optional[str] = Query(None, max_length=32),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return one page of synthetic-file rows without their bodies.

    The optional ``decky_uuid``, ``persona`` and ``content_class``
    filters are forwarded unchanged to both the listing and the count
    call. ``last_body`` is removed from every row before returning.

    Returns a dict with ``total``, ``limit``, ``offset`` and ``data``.
    """
    filters = {
        "decky_uuid": decky_uuid,
        "persona": persona,
        "content_class": content_class,
    }
    page = await repo.list_synthetic_files(**filters, limit=limit, offset=offset)
    matching = await repo.count_synthetic_files(**filters)

    # Strip the body preview from list rows to keep the response small.
    for entry in page:
        entry.pop("last_body", None)

    return {"total": matching, "limit": limit, "offset": offset, "data": page}
@router.get(
    "/realism/synthetic-files/{uuid}",
    tags=["Realism"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Synthetic file not found"},
    },
)
@_traced("api.realism.get_synthetic_file")
async def get_synthetic_file(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return a single synthetic-file row, body preview included.

    A ``truncated`` flag is added to the row: true when the length of
    ``last_body`` (missing/empty counts as zero) is at least
    ``SYNTHETIC_FILE_BODY_LIMIT``.

    Raises:
        HTTPException: 404 when no row exists for *uuid*.
    """
    record = await repo.get_synthetic_file(uuid)
    if record is None:
        raise HTTPException(status_code=404, detail="synthetic file not found")

    preview = record.get("last_body") or ""
    hit_cap = len(preview) >= SYNTHETIC_FILE_BODY_LIMIT
    record["truncated"] = hit_cap
    return record

View File

@@ -1,14 +1,50 @@
from typing import Any
import asyncio
import time
from typing import Any, Optional
from fastapi import APIRouter, Depends
from decnet.web.dependencies import get_current_user, repo
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import StatsResponse
router = APIRouter()
# /stats is aggregate telemetry polled constantly by the UI and locust.
# A 5s window collapses thousands of concurrent calls — each of which
# runs SELECT count(*) FROM logs + SELECT count(DISTINCT attacker_ip) —
# into one DB hit per window.
_STATS_TTL = 5.0
_stats_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
_stats_lock: Optional[asyncio.Lock] = None
def _reset_stats_cache() -> None:
    """Return the stats cache to its empty state.

    Clears the cached ``(value, timestamp)`` pair and drops the lock
    object, so the next cached lookup creates a new lock and goes back
    to the repository.
    """
    global _stats_cache, _stats_lock
    _stats_cache, _stats_lock = (None, 0.0), None
async def _get_stats_cached() -> dict[str, Any]:
    """Return the stats summary, served from a short-lived cache.

    A cached summary younger than ``_STATS_TTL`` seconds (monotonic
    clock) is returned directly. Otherwise the module lock is taken, the
    cache is re-checked, and only if it is still stale is
    ``repo.get_stats_summary()`` awaited and stored.
    """
    global _stats_cache, _stats_lock

    summary, stamp = _stats_cache
    if summary is not None and time.monotonic() - stamp < _STATS_TTL:
        return summary

    # The lock is created on first use rather than at import time.
    if _stats_lock is None:
        _stats_lock = asyncio.Lock()

    async with _stats_lock:
        # Re-check under the lock: a previous holder may have refreshed.
        summary, stamp = _stats_cache
        if summary is not None and time.monotonic() - stamp < _STATS_TTL:
            return summary
        fresh = await repo.get_stats_summary()
        _stats_cache = (fresh, time.monotonic())
        return fresh
@router.get("/stats", response_model=StatsResponse, tags=["Observability"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
async def get_stats(current_user: str = Depends(get_current_user)) -> dict[str, Any]:
return await repo.get_stats_summary()
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
@_traced("api.get_stats")
async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]:
return await _get_stats_cached()

View File

@@ -1,19 +1,50 @@
import json
import asyncio
import logging
import orjson
from typing import AsyncGenerator, Optional
from fastapi import APIRouter, Depends, Query, Request
from fastapi.responses import StreamingResponse
from decnet.env import DECNET_DEVELOPER
from decnet.web.dependencies import get_stream_user, repo
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
from decnet.web.dependencies import require_stream_viewer, repo
from decnet.web.sse_limits import sse_connection_slot
log = logging.getLogger(__name__)
log = get_logger("api")
router = APIRouter()
def _build_trace_links(logs: list[dict]) -> list:
    """Turn the trace/span ids stored on log rows into OTEL span links.

    Rows are skipped when either id is missing/empty, when the trace id
    is the literal ``"0"``, or when an id does not parse as hexadecimal.
    If ``opentelemetry`` cannot be imported, an empty list is returned.
    """
    try:
        from opentelemetry.trace import Link, SpanContext, TraceFlags
    except ImportError:
        return []

    collected: list[Link] = []
    for row in logs:
        trace_hex, span_hex = row.get("trace_id"), row.get("span_id")
        if not (trace_hex and span_hex) or trace_hex == "0":
            continue
        try:
            context = SpanContext(
                trace_id=int(trace_hex, 16),
                span_id=int(span_hex, 16),
                is_remote=True,
                trace_flags=TraceFlags(TraceFlags.SAMPLED),
            )
            collected.append(Link(context))
        except (ValueError, TypeError):
            # Malformed ids: drop this row's link, keep the rest.
            continue
    return collected
@router.get("/stream", tags=["Observability"],
responses={
200: {
@@ -21,9 +52,12 @@ router = APIRouter()
"description": "Real-time Server-Sent Events (SSE) stream"
},
401: {"description": "Could not validate credentials"},
422: {"description": "Validation error"}
403: {"description": "Insufficient permissions"},
422: {"description": "Validation error"},
429: {"description": "Per-user SSE connection cap reached"},
},
)
@_traced("api.stream_events")
async def stream_events(
request: Request,
last_event_id: int = Query(0, alias="lastEventId"),
@@ -31,63 +65,87 @@ async def stream_events(
start_time: Optional[str] = None,
end_time: Optional[str] = None,
max_output: Optional[int] = Query(None, alias="maxOutput"),
current_user: str = Depends(get_stream_user)
user: dict = Depends(require_stream_viewer)
) -> StreamingResponse:
# Event types emitted on this stream: logs, stats, histogram.
# All three are viewer-safe — same data is reachable via /logs and
# /stats (viewer-gated REST). Adding a new event family here
# requires a threat-model review for F6/I (role leakage).
async def event_generator() -> AsyncGenerator[str, None]:
last_id = last_event_id
stats_interval_sec = 10
loops_since_stats = 0
emitted_chunks = 0
try:
if last_id == 0:
last_id = await repo.get_max_log_id()
# Emit initial snapshot immediately so the client never needs to poll /stats
stats = await repo.get_stats_summary()
yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n"
histogram = await repo.get_log_histogram(
search=search, start_time=start_time,
end_time=end_time, interval_minutes=15,
async with sse_connection_slot(user["uuid"]):
# Prefetch the initial snapshot before the first yield.
# With asyncmy (pure async TCP I/O), a DB await AFTER the first
# yield races with the HTTP write callback; running DB reads
# here (pre-yield, normal coroutine context) avoids that.
# aiosqlite is immune because SQLite runs on a worker thread.
_start_id = last_event_id if last_event_id != 0 else await repo.get_max_log_id()
_initial_stats = await repo.get_stats_summary()
_initial_histogram = await repo.get_log_histogram(
search=search, start_time=start_time, end_time=end_time, interval_minutes=15,
)
yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n"
last_id = _start_id
stats_interval_sec = 10
loops_since_stats = 0
emitted_chunks = 0
try:
yield ": keepalive\n\n" # flush headers immediately
while True:
if DECNET_DEVELOPER and max_output is not None:
emitted_chunks += 1
if emitted_chunks > max_output:
log.debug("Developer mode: max_output reached (%d), closing stream", max_output)
# Emit pre-fetched initial snapshot — no DB calls in generator until the loop
yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': _initial_stats}).decode()}\n\n"
yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': _initial_histogram}).decode()}\n\n"
while True:
if DECNET_DEVELOPER and max_output is not None:
emitted_chunks += 1
if emitted_chunks > max_output:
log.debug("Developer mode: max_output reached (%d), closing stream", max_output)
break
if await request.is_disconnected():
break
if await request.is_disconnected():
break
new_logs = await repo.get_logs_after_id(
last_id, limit=50, search=search,
start_time=start_time, end_time=end_time,
)
if new_logs:
last_id = max(entry["id"] for entry in new_logs)
yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
loops_since_stats = stats_interval_sec
if loops_since_stats >= stats_interval_sec:
stats = await repo.get_stats_summary()
yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n"
histogram = await repo.get_log_histogram(
search=search, start_time=start_time,
end_time=end_time, interval_minutes=15,
new_logs = await repo.get_logs_after_id(
last_id, limit=50, search=search,
start_time=start_time, end_time=end_time,
)
yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n"
loops_since_stats = 0
if new_logs:
last_id = max(entry["id"] for entry in new_logs)
# Create a span linking back to the ingestion traces
# stored in each log row, closing the pipeline gap.
_links = _build_trace_links(new_logs)
_tracer = _get_tracer("sse")
with _tracer.start_as_current_span(
"sse.emit_logs", links=_links,
attributes={"log_count": len(new_logs)},
):
yield f"event: message\ndata: {orjson.dumps({'type': 'logs', 'data': new_logs}).decode()}\n\n"
loops_since_stats = stats_interval_sec
loops_since_stats += 1
if loops_since_stats >= stats_interval_sec:
stats = await repo.get_stats_summary()
yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': stats}).decode()}\n\n"
histogram = await repo.get_log_histogram(
search=search, start_time=start_time,
end_time=end_time, interval_minutes=15,
)
yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': histogram}).decode()}\n\n"
loops_since_stats = 0
await asyncio.sleep(1)
except asyncio.CancelledError:
pass
except Exception:
log.exception("SSE stream error for user %s", last_event_id)
yield f"event: error\ndata: {json.dumps({'type': 'error', 'message': 'Stream interrupted'})}\n\n"
loops_since_stats += 1
return StreamingResponse(event_generator(), media_type="text/event-stream")
await asyncio.sleep(1)
except asyncio.CancelledError:
pass
except Exception:
log.exception("SSE stream error for user %s", last_event_id)
yield f"event: error\ndata: {orjson.dumps({'type': 'error', 'message': 'Stream interrupted'}).decode()}\n\n"
return StreamingResponse(
event_generator(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"X-Accel-Buffering": "no",
},
)

View File

@@ -0,0 +1,47 @@
"""Swarm controller routers.
One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
process from the main DECNET API so swarm failures cannot cascade into
log ingestion / dashboard serving.
"""
from fastapi import APIRouter
from .api_enroll_host import router as enroll_host_router
from .api_list_hosts import router as list_hosts_router
from .api_get_host import router as get_host_router
from .api_decommission_host import router as decommission_host_router
from .api_deploy_swarm import router as deploy_swarm_router
from .api_teardown_swarm import router as teardown_swarm_router
from .api_get_swarm_health import router as get_swarm_health_router
from .api_check_hosts import router as check_hosts_router
from .api_heartbeat import router as heartbeat_router
from .api_list_deckies import router as list_deckies_router
# Aggregate router: every per-endpoint router imported above is attached
# below, so all of their routes are served under the ``/swarm`` prefix.
swarm_router = APIRouter(
    prefix="/swarm",
    # Error responses that every swarm route can surface. Route-level
    # `responses=` entries still override/extend these for route-specific
    # codes (e.g. 409 on /enroll).
    responses={
        400: {"description": "Malformed request"},
        403: {"description": "Peer cert missing or fingerprint mismatch"},
        404: {"description": "Referenced host does not exist"},
    },
)
# Hosts: enroll, list, fetch one, decommission.
swarm_router.include_router(enroll_host_router)
swarm_router.include_router(list_hosts_router)
swarm_router.include_router(get_host_router)
swarm_router.include_router(decommission_host_router)
# Deployments: deploy, teardown, list deckies.
swarm_router.include_router(deploy_swarm_router)
swarm_router.include_router(teardown_swarm_router)
swarm_router.include_router(list_deckies_router)
# Health: aggregate health, active host check, heartbeat.
swarm_router.include_router(get_swarm_health_router)
swarm_router.include_router(check_hosts_router)
swarm_router.include_router(heartbeat_router)

View File

@@ -0,0 +1,61 @@
"""POST /swarm/check — active mTLS probe of every enrolled worker.
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
on the outcome of the probe.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
log = get_logger("swarm.check")
router = APIRouter()
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
    repo: BaseRepository = Depends(get_repo),
) -> SwarmCheckResponse:
    """Probe every enrolled swarm host concurrently and record the outcome.

    For each host returned by ``repo.list_swarm_hosts()`` the agent's
    ``health()`` call is awaited. On success the host is updated to
    ``status="active"`` with a fresh UTC ``last_heartbeat``; on any
    exception it is updated to ``status="unreachable"`` and the
    exception text is returned as the result's ``detail``.

    Returns:
        A ``SwarmCheckResponse`` with one ``SwarmHostHealth`` per host.
    """
    hosts = await repo.list_swarm_hosts()

    async def _probe(host: dict[str, Any]) -> SwarmHostHealth:
        # NOTE(review): the ``try`` covers the repository update as well
        # as the network probe, so a failed DB write on the success path
        # is also reported as "unreachable" — confirm that is intended.
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.health()
            await repo.update_swarm_host(
                host["uuid"],
                {
                    "status": "active",
                    "last_heartbeat": datetime.now(timezone.utc),
                },
            )
            return SwarmHostHealth(
                host_uuid=host["uuid"],
                name=host["name"],
                address=host["address"],
                reachable=True,
                detail=body,
            )
        except Exception as exc:
            log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
            # An exception raised by this update is not caught here and
            # propagates out of the ``gather`` below.
            await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
            return SwarmHostHealth(
                host_uuid=host["uuid"],
                name=host["name"],
                address=host["address"],
                reachable=False,
                detail=str(exc),
            )

    # Probes run concurrently; results come back in ``hosts`` order.
    results = await asyncio.gather(*(_probe(h) for h in hosts))
    return SwarmCheckResponse(results=list(results))

View File

@@ -0,0 +1,63 @@
"""DELETE /swarm/hosts/{uuid} — decommission a worker.
Removes the DeckyShard rows bound to the host (portable cascade — MySQL
and SQLite both honor it via the repo layer), deletes the SwarmHost row,
and best-effort-cleans the per-worker bundle directory on the master.
Also asks the worker agent to wipe its own install (keeping logs). A
dead/unreachable worker does not block master-side cleanup.
"""
from __future__ import annotations
import pathlib
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.decommission")
router = APIRouter()
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_decommission_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission the swarm host identified by *uuid*.

    Steps, in order:

    1. Look the host up; respond 404 if it is not enrolled.
    2. Ask the worker agent to self-destruct. Any failure is logged and
       ignored so an unreachable worker cannot block cleanup.
    3. Delete the host's decky shards, then the host row.
    4. Best-effort removal of the host's bundle directory: unlink each
       direct child, then remove the directory itself.

    Raises:
        HTTPException: 404 when no host with this UUID exists.
    """
    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        # Deliberately broad: master-side cleanup must proceed whatever
        # goes wrong while contacting the worker.
        log.exception(
            "decommission: self-destruct dispatch failed host=%s; "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # Best-effort bundle cleanup; if the dir was moved manually, don't fail.
    # A missing/empty path must be skipped explicitly: ``pathlib.Path("")``
    # is the current directory, so falling through would unlink files in
    # the API process's working directory.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        return
    bundle_dir = pathlib.Path(bundle_path)
    if not bundle_dir.is_dir():
        return

    for child in bundle_dir.iterdir():
        try:
            child.unlink()
        except OSError:
            # Best effort: leave whatever cannot be removed.
            pass
    try:
        bundle_dir.rmdir()
    except OSError:
        # Directory not empty or not removable; not fatal.
        pass

View File

@@ -0,0 +1,155 @@
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
Per worker we build a filtered copy containing only the deckies assigned
to that worker (via ``host_uuid``), then POST it to the worker agent.
The caller is expected to have already set ``host_uuid`` on every decky;
if any decky arrives without one, we fail fast. Auto-sharding lives in
the CLI layer, not here.
"""
from __future__ import annotations
import asyncio
import json
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.config import DecnetConfig, DeckyConfig
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import (
SwarmDeployRequest,
SwarmDeployResponse,
SwarmHostResult,
)
log = get_logger("swarm.deploy")
router = APIRouter()
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
    """Group the deckies of *config* by their ``host_uuid``.

    Returns a mapping ``host_uuid -> [decky, ...]`` that keeps the deckies
    in their original relative order.

    Raises:
        HTTPException: 400 as soon as a decky with a falsy ``host_uuid``
            is encountered.
    """
    grouped: dict[str, list[DeckyConfig]] = {}
    for decky in config.deckies:
        target = decky.host_uuid
        if target:
            grouped.setdefault(target, []).append(decky)
            continue
        # No host assigned: this layer never auto-shards, so fail fast.
        raise HTTPException(
            status_code=400,
            detail=f"decky '{decky.name}' has no host_uuid — caller must shard before dispatch",
        )
    return grouped
def _worker_config(
    base: DecnetConfig,
    shard: list[DeckyConfig],
    host: dict[str, Any],
) -> DecnetConfig:
    """Return a copy of *base* restricted to *shard* for one worker.

    The copy is produced with ``base.model_copy(update=...)``; ``deckies``
    is always replaced by *shard*.
    """
    overrides: dict[str, Any] = {"deckies": shard}
    # Per-host driver opt-in (Wi-Fi-bridged VMs can't use macvlan — see
    # SwarmHost.use_ipvlan). The flag is only ever forced *on*: when the
    # host does not ask for ipvlan, whatever the deploy-level config says
    # is left untouched, so an operator-selected ipvlan is never downgraded.
    wants_ipvlan = host.get("use_ipvlan")
    if wants_ipvlan:
        overrides.update(ipvlan=True)
    return base.model_copy(update=overrides)
async def dispatch_decnet_config(
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool = False,
    no_cache: bool = False,
) -> SwarmDeployResponse:
    """Shard ``config`` by ``host_uuid`` and dispatch to each worker in parallel.

    Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm
    branch of POST /deckies/deploy.

    Every referenced host is resolved before anything is dispatched, so an
    unknown ``host_uuid`` raises a 404 ``HTTPException`` without contacting
    any worker. Per-host failures do not raise: they are reported as
    ``ok=False`` entries in the returned response.
    """
    buckets = _shard_by_host(config)

    hosts: dict[str, dict[str, Any]] = {}
    for wanted_uuid in buckets:
        found = await repo.get_swarm_host_by_uuid(wanted_uuid)
        if found is None:
            raise HTTPException(status_code=404, detail=f"unknown host_uuid: {wanted_uuid}")
        hosts[wanted_uuid] = found

    def _shard_row(
        decky: DeckyConfig,
        host_uuid: str,
        state: str,
        last_error: Any,
    ) -> dict[str, Any]:
        # One DeckyShard upsert payload; the timestamp is taken per row.
        return {
            "decky_name": decky.name,
            "host_uuid": host_uuid,
            "services": json.dumps(decky.services),
            "decky_config": decky.model_dump_json(),
            "decky_ip": decky.ip,
            "state": state,
            "last_error": last_error,
            "updated_at": datetime.now(timezone.utc),
        }

    async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
        host = hosts[host_uuid]
        cfg = _worker_config(config, shard, host)
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
            deployed_state = "pending" if dry_run else "running"
            for decky in shard:
                await repo.upsert_decky_shard(
                    _shard_row(decky, host_uuid, deployed_state, None)
                )
            await repo.update_swarm_host(host_uuid, {"status": "active"})
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.deploy dispatch failed host=%s", host["name"])
            # Compose-up is partial-success-friendly: one decky failing to
            # build doesn't roll back the ones that already came up. Ask the
            # agent which containers actually exist before painting the whole
            # shard red — otherwise decky1 and decky2 look "failed" even
            # though they're live on the worker.
            runtime: dict[str, Any] = {}
            try:
                async with AgentClient(host=host) as probe:
                    snapshot = await probe.status()
                    runtime = snapshot.get("runtime") or {}
            except Exception:
                log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
            for decky in shard:
                decky_runtime = runtime.get(decky.name) or {}
                if bool(decky_runtime.get("running")):
                    row = _shard_row(decky, host_uuid, "running", None)
                else:
                    row = _shard_row(decky, host_uuid, "failed", str(exc)[:512])
                await repo.upsert_decky_shard(row)
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))

    pending = [_dispatch(uuid_, shard) for uuid_, shard in buckets.items()]
    results = await asyncio.gather(*pending)
    return SwarmDeployResponse(results=list(results))
@router.post(
    "/deploy",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Deployment mode must be 'swarm'"},
        404: {"description": "A referenced host_uuid is not enrolled"},
    },
)
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    # Thin HTTP wrapper: only swarm-mode configs are accepted; the actual
    # sharding and dispatch is delegated to dispatch_decnet_config.
    if req.config.mode == "swarm":
        return await dispatch_decnet_config(
            req.config,
            repo,
            dry_run=req.dry_run,
            no_cache=req.no_cache,
        )
    raise HTTPException(status_code=400, detail="mode must be 'swarm'")

View File

@@ -0,0 +1,100 @@
"""POST /swarm/enroll — issue a worker cert bundle and register the host.
Enrollment is master-driven: the controller holds the CA private key,
generates a fresh worker keypair + CA-signed cert, and returns the full
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
is outside this process's trust boundary.
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
bootstrap endpoint, so nothing to attack before the worker is enrolled.
"""
from __future__ import annotations
import uuid as _uuid
from datetime import datetime, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
router = APIRouter()
@router.post(
    "/enroll",
    response_model=SwarmEnrolledBundle,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Hosts"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_enroll_host(
    req: SwarmEnrollRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmEnrolledBundle:
    # Enroll a new worker: issue its cert bundle, persist it on the master,
    # register the SwarmHost row, and return the bundle to the caller.
    # Responds 409 when a host with the requested name already exists.
    # (Comments instead of a docstring keep the OpenAPI description as-is.)
    import os as _os

    existing = await repo.get_swarm_host_by_name(req.name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")

    ca = pki.ensure_ca()
    # The set de-duplicates SANs when address/name are already listed.
    sans = list({*req.sans, req.address, req.name})
    issued = pki.issue_worker_cert(ca, req.name, sans)

    # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
    # can replay it if the operator loses the original delivery.
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_view: Optional[SwarmUpdaterBundle] = None
    updater_fp: Optional[str] = None
    if req.issue_updater_bundle:
        updater_cn = f"updater@{req.name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
        # Persist alongside the worker bundle for replay.
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
        # Create the private key with mode 0600 from the start. Writing it
        # with default permissions and chmod-ing afterwards would leave a
        # window in which the key is readable under a permissive umask.
        key_path = updater_dir / "updater.key"
        key_fd = _os.open(
            key_path,
            _os.O_WRONLY | _os.O_CREAT | _os.O_TRUNC,
            0o600,
        )
        with _os.fdopen(key_fd, "wb") as key_file:
            key_file.write(updater_issued.key_pem)
        # The creation mode is ignored when the file already existed, so
        # tighten an older, wider-permission file explicitly.
        _os.chmod(key_path, 0o600)
        updater_fp = updater_issued.fingerprint_sha256
        updater_view = SwarmUpdaterBundle(
            fingerprint=updater_fp,
            updater_cert_pem=updater_issued.cert_pem.decode(),
            updater_key_pem=updater_issued.key_pem.decode(),
        )

    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.name,
            "address": req.address,
            "agent_port": req.agent_port,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": req.notes,
        }
    )
    return SwarmEnrolledBundle(
        host_uuid=host_uuid,
        name=req.name,
        address=req.address,
        agent_port=req.agent_port,
        fingerprint=issued.fingerprint_sha256,
        ca_cert_pem=issued.ca_cert_pem.decode(),
        worker_cert_pem=issued.cert_pem.decode(),
        worker_key_pem=issued.key_pem.decode(),
        updater=updater_view,
    )

View File

@@ -0,0 +1,26 @@
"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@router.get(
    "/hosts/{uuid}",
    response_model=SwarmHostView,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_get_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
    # Look the worker up by UUID; the stored row's keys are passed straight
    # to SwarmHostView as keyword arguments. Unknown UUID -> 404.
    host_row = await repo.get_swarm_host_by_uuid(uuid)
    if host_row is not None:
        return SwarmHostView(**host_row)
    raise HTTPException(status_code=404, detail="host not found")

View File

@@ -0,0 +1,11 @@
"""GET /swarm/health — controller liveness (no I/O)."""
from __future__ import annotations
from fastapi import APIRouter
router = APIRouter()
@router.get("/health", tags=["Swarm Health"])
async def api_get_swarm_health() -> dict[str, str]:
    # Static liveness payload: nothing is read or contacted to build it.
    payload: dict[str, str] = {
        "status": "ok",
        "role": "swarm-controller",
    }
    return payload

View File

@@ -0,0 +1,212 @@
"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh.
Workers call this every ~30 s with the output of ``executor.status()``.
The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each
``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived
state so the dashboard stays current without a master-pull probe.
Security: CA-signed mTLS is necessary but not sufficient — a
decommissioned worker's still-valid cert must not resurrect ghost
shards. We pin the presented peer cert's SHA-256 to the
``client_cert_fingerprint`` stored for the claimed ``host_uuid``.
Missing or mismatching peer cert → 403; an unknown ``host_uuid`` (for
example a decommissioned host whose row was deleted) → 404.
"""
from __future__ import annotations
import hashlib
import json
from datetime import datetime, timezone
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel
from decnet.config import DeckyConfig
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
log = get_logger("swarm.heartbeat")
router = APIRouter()
class HeartbeatRequest(BaseModel):
    # Request body of POST /swarm/heartbeat.
    # (Documented with comments, not a docstring, so the generated schema
    # description is unchanged.)

    # UUID the calling worker claims to be; the handler pins the presented
    # peer certificate to the fingerprint stored for this host.
    host_uuid: str
    # Optional agent version string; not read by the handler in this module.
    agent_version: Optional[str] = None
    # Worker status snapshot. The handler reads the keys "deployed",
    # "runtime" (mapping decky name -> runtime state) and "deckies"
    # (list of DeckyConfig-shaped dicts).
    status: dict[str, Any]
    # Optional topology report; "topology_id" and "applied_version_hash"
    # are the keys consulted during reconciliation.
    topology: Optional[dict[str, Any]] = None
def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]:
    """Pull the peer cert's SHA-256 fingerprint from an ASGI scope.

    Two extraction paths are tried, in order:

    1. Primary: the first entry of
       ``scope["extensions"]["tls"]["client_cert_chain"]``. The ASGI TLS
       extension defines the entries as PEM-encoded *strings*, so a ``str``
       entry is converted to DER first; raw DER bytes are accepted as-is.
    2. Fallback: ``scope["transport"].get_extra_info("ssl_object")`` and
       its ``getpeercert(binary_form=True)``.

    Args:
        scope: The ASGI connection scope of the request.

    Returns:
        The lowercase hex SHA-256 of the DER-encoded peer certificate, or
        ``None`` when neither path yields certificate bytes. Extraction
        errors are swallowed on purpose so the caller can fail closed on
        ``None`` instead of surfacing a 500.
    """
    import ssl  # local import: only needed to normalise PEM chain entries

    peer_der: Optional[bytes] = None
    source = "none"
    try:
        chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain")
        if chain:
            # ``next(iter(...))`` also works for non-indexable iterables.
            leaf = next(iter(chain), None)
            if isinstance(leaf, str):
                # PEM text must be reduced to DER: hashing the PEM text
                # would raise (str is not bytes-like) and would not match a
                # DER-based fingerprint anyway. PEM_cert_to_DER_cert is
                # strict about the header position, hence the strip().
                leaf = ssl.PEM_cert_to_DER_cert(leaf.strip())
            if isinstance(leaf, (bytes, bytearray, memoryview)):
                peer_der = bytes(leaf)
                source = "primary"
    except Exception:
        # Malformed extension data: fall through to the transport path.
        peer_der = None
    if peer_der is None:
        transport = scope.get("transport")
        try:
            ssl_obj = transport.get_extra_info("ssl_object") if transport else None
            if ssl_obj is not None:
                peer_der = ssl_obj.getpeercert(binary_form=True)
                if peer_der:
                    source = "fallback"
        except Exception:
            peer_der = None
    if not peer_der:
        log.debug("heartbeat: peer cert extraction failed via none")
        return None
    log.debug("heartbeat: peer cert extraction succeeded via %s", source)
    # hexdigest() is already lowercase.
    return hashlib.sha256(peer_der).hexdigest()
async def _verify_peer_matches_host(
    request: Request, host_uuid: str, repo: BaseRepository
) -> dict[str, Any]:
    """Check that the request's peer cert is the one pinned for *host_uuid*.

    Returns:
        The stored host row on success.

    Raises:
        HTTPException: 404 when *host_uuid* is not enrolled; 403 when no
            peer certificate could be extracted, when the host has no
            stored fingerprint, or when the fingerprints differ.
    """
    host = await repo.get_swarm_host_by_uuid(host_uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="unknown host")

    presented = _extract_peer_fingerprint(request.scope)
    if presented is None:
        raise HTTPException(status_code=403, detail="peer cert unavailable")

    # Stored fingerprint is compared case-insensitively; an empty/missing
    # one can never match (fail closed).
    pinned = (host.get("client_cert_fingerprint") or "").lower()
    if pinned and presented == pinned:
        return host
    raise HTTPException(status_code=403, detail="cert fingerprint mismatch")
async def _reconcile_topology_report(
    repo: BaseRepository,
    host_uuid: str,
    reported: Optional[dict[str, Any]],
) -> None:
    """Flag ACTIVE topologies pinned to *host_uuid* that the agent is out of sync with.

    For every ACTIVE topology whose ``target_host_uuid`` is *host_uuid*
    and that is not already flagged, ``needs_resync`` is set when:

    - the agent reports this topology's id with a hash, and that hash
      differs from the canonical hash of the hydrated topology, OR
    - the agent reports a different topology, no hash, or nothing at all
      (fresh boot / wiped cache).

    A topology that fails to hydrate (error or ``None``) is skipped. The
    actual re-push is handled elsewhere (the mutator reconcile loop), so
    this function only sets the flag. Listing and flagging errors are
    logged and swallowed.
    """
    from decnet.topology.hashing import canonical_hash
    from decnet.topology.persistence import hydrate
    from decnet.topology.status import TopologyStatus

    try:
        active = await repo.list_topologies(status=TopologyStatus.ACTIVE)
    except Exception:
        log.exception("heartbeat: could not list active topologies")
        return

    pinned_here = [t for t in active if t.get("target_host_uuid") == host_uuid]
    if not pinned_here:
        return

    report = reported or {}
    reported_id = report.get("topology_id")
    reported_hash = report.get("applied_version_hash")

    for topo in pinned_here:
        tid = topo["id"]
        if topo.get("needs_resync"):
            # Already flagged; nothing more to do for this one.
            continue

        expected: Optional[str] = None
        if reported_id == tid and reported_hash:
            # The agent claims to run this topology: verify the hash.
            try:
                hydrated = await hydrate(repo, tid)
            except Exception:
                log.exception("heartbeat: hydrate failed tid=%s", tid)
                continue
            if hydrated is None:
                continue
            expected = canonical_hash(hydrated)
            if expected == reported_hash:
                continue

        # Either mismatch or agent reports no/other topology — flag it.
        try:
            await repo.set_topology_resync(tid, True)
            log.info(
                "heartbeat: flagged topology %s for resync (host=%s "
                "reported_id=%s reported_hash=%s expected=%s)",
                tid,
                host_uuid,
                reported_id,
                reported_hash,
                expected,
            )
        except Exception:
            log.exception("heartbeat: failed to flag resync tid=%s", tid)
@router.post(
    "/heartbeat",
    status_code=204,
    tags=["Swarm Health"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        403: {"description": "Peer cert missing, or its fingerprint does not match the host's pinned cert"},
        404: {"description": "host_uuid is not enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def heartbeat(
    req: HeartbeatRequest,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    # Handle one worker heartbeat:
    #   1. verify the peer cert against the claimed host (403/404 on failure);
    #   2. mark the host active and stamp last_heartbeat;
    #   3. reconcile the reported topology;
    #   4. when the worker reports a deployment, re-upsert one shard row
    #      per reported decky.
    await _verify_peer_matches_host(request, req.host_uuid, repo)

    seen_at = datetime.now(timezone.utc)
    await repo.update_swarm_host(
        req.host_uuid,
        {"status": "active", "last_heartbeat": seen_at},
    )
    await _reconcile_topology_report(repo, req.host_uuid, req.topology)

    snapshot = req.status or {}
    if not snapshot.get("deployed"):
        return

    runtime = snapshot.get("runtime") or {}
    reported_deckies = snapshot.get("deckies") or []
    for raw_decky in reported_deckies:
        try:
            decky = DeckyConfig(**raw_decky)
        except Exception:
            # One bad entry must not drop the rest of the snapshot.
            log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid)
            continue

        decky_runtime = runtime.get(decky.name) or {}
        state = "running" if bool(decky_runtime.get("running")) else "degraded"
        shard_row = {
            "decky_name": decky.name,
            "host_uuid": req.host_uuid,
            "services": json.dumps(decky.services),
            "decky_config": decky.model_dump_json(),
            "decky_ip": decky.ip,
            "state": state,
            "last_error": None,
            "last_seen": seen_at,
            "updated_at": seen_at,
        }
        await repo.upsert_decky_shard(shard_row)

View File

@@ -0,0 +1,55 @@
"""GET /swarm/deckies — list decky shards with their worker host's identity.
The DeckyShard table maps decky_name → host_uuid; users want to see which
deckies are running and *where*, so we enrich each shard with the owning
host's name/address/status from SwarmHost rather than making callers do
the join themselves.
"""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import DeckyShardView
router = APIRouter()
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Deckies"])
async def api_list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    # List decky shards (optionally restricted to one host via the repo and
    # to one state via an in-memory filter), each enriched with the owning
    # host's name/address/status. Shards whose host row is missing get
    # placeholder host fields instead of failing.
    shards = await repo.list_decky_shards(host_uuid)
    all_hosts = await repo.list_swarm_hosts()
    host_by_uuid = {h["uuid"]: h for h in all_hosts}

    def _as_view(shard: dict, owner: dict) -> DeckyShardView:
        return DeckyShardView(
            decky_name=shard["decky_name"],
            decky_ip=shard.get("decky_ip"),
            host_uuid=shard["host_uuid"],
            host_name=owner.get("name") or "<unknown>",
            host_address=owner.get("address") or "",
            host_status=owner.get("status") or "unknown",
            services=shard.get("services") or [],
            state=shard.get("state") or "pending",
            last_error=shard.get("last_error"),
            compose_hash=shard.get("compose_hash"),
            updated_at=shard["updated_at"],
            hostname=shard.get("hostname"),
            distro=shard.get("distro"),
            archetype=shard.get("archetype"),
            service_config=shard.get("service_config") or {},
            mutate_interval=shard.get("mutate_interval"),
            last_mutated=shard.get("last_mutated") or 0.0,
            last_seen=shard.get("last_seen"),
        )

    views: list[DeckyShardView] = []
    for shard in shards:
        # An empty/None ``state`` query means "no state filter".
        if not state or shard.get("state") == state:
            owner = host_by_uuid.get(shard["host_uuid"], {})
            views.append(_as_view(shard, owner))
    return views

View File

@@ -0,0 +1,21 @@
"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import SwarmHostView
router = APIRouter()
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    # ``host_status`` is forwarded to the repository unchanged; each
    # returned row is expanded into a SwarmHostView.
    views: list[SwarmHostView] = []
    for host_row in await repo.list_swarm_hosts(host_status):
        views.append(SwarmHostView(**host_row))
    return views

View File

@@ -0,0 +1,60 @@
"""POST /swarm/teardown — tear down one or all enrolled workers."""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
from decnet.web.db.models import (
SwarmDeployResponse,
SwarmHostResult,
SwarmTeardownRequest,
)
log = get_logger("swarm.teardown")
router = APIRouter()
@router.post(
    "/teardown",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        404: {"description": "A targeted host does not exist"},
        422: {"description": "Request body validation error"},
    },
)
async def api_teardown_swarm(
    req: SwarmTeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    # Tear down one host (when ``req.host_uuid`` is given; 404 if unknown)
    # or every enrolled host, in parallel. Per-host failures are reported
    # as ok=False results rather than raised.
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        single = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if single is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [single]

    async def _call(host: dict[str, Any]) -> SwarmHostResult:
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.teardown(req.decky_id)
            # Shard rows are only cleared for a whole-host teardown; a
            # single-decky teardown leaves the rows in place.
            whole_host = req.decky_id is None
            if whole_host:
                await repo.delete_decky_shards_for_host(host["uuid"])
            return SwarmHostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.teardown failed host=%s", host["name"])
            return SwarmHostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=False,
                detail=str(exc),
            )

    calls = [_call(target) for target in targets]
    results = await asyncio.gather(*calls)
    return SwarmDeployResponse(results=list(results))

View File

@@ -0,0 +1,26 @@
"""Swarm management endpoints for the React dashboard.
These are *not* the unauthenticated /swarm routes mounted on the separate
swarm-controller process (decnet/web/swarm_api.py on port 8770). These
live on the main web API, go through ``require_admin``, and are the
interface the dashboard uses to list hosts, decommission them, list
deckies across the fleet, and generate one-shot agent-enrollment
bundles.
Mounted under ``/api/v1/swarm`` by the main api router.
"""
from fastapi import APIRouter
from .api_list_hosts import router as list_hosts_router
from .api_decommission_host import router as decommission_host_router
from .api_list_deckies import router as list_deckies_router
from .api_enroll_bundle import router as enroll_bundle_router
from .api_teardown_host import router as teardown_host_router
# Aggregate router for the dashboard-facing swarm endpoints; every
# sub-router is mounted below the shared "/swarm" prefix, in this order.
swarm_mgmt_router = APIRouter(prefix="/swarm")
for _sub_router in (
    list_hosts_router,
    decommission_host_router,
    list_deckies_router,
    enroll_bundle_router,
    teardown_host_router,
):
    swarm_mgmt_router.include_router(_sub_router)
# Do not leave the loop variable behind in the package namespace.
del _sub_router

View File

@@ -0,0 +1,71 @@
"""DELETE /swarm/hosts/{uuid} — decommission a worker from the dashboard.
Also instructs the worker agent to stop all DECNET services and delete
its install footprint (keeping logs). Agent self-destruct failure does
not block decommission — the master-side cleanup always runs so a dead
worker can still be removed from the dashboard.
"""
from __future__ import annotations
import pathlib
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm.decommission")
router = APIRouter()
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Management"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Path parameter validation error"},
    },
)
async def decommission_host(
    uuid: str,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> None:
    # Admin-only decommission of the worker identified by ``uuid``:
    # best-effort agent self-destruct, then removal of the host's shard
    # rows, the host row, and the on-disk cert bundle. ``admin`` is unused
    # in the body; the dependency exists to enforce authorization.
    # (Comments instead of a docstring keep the OpenAPI description as-is.)
    import shutil  # local import: only needed for the bundle cleanup below

    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    # Ask the worker to wipe its own install (keeps logs). The agent
    # schedules the reaper as a detached process and returns immediately,
    # so this call is fast when the worker is reachable. A dead worker
    # shouldn't block the operator from cleaning up the dashboard entry,
    # hence best-effort with a log and continue.
    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        log.exception(
            "decommission: self-destruct dispatch failed host=%s — "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # Best-effort bundle cleanup. An empty/missing path must be skipped
    # explicitly: ``pathlib.Path("")`` is ``Path(".")``, which *is* a
    # directory, so the cleanup would otherwise delete files from the
    # process's current working directory.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        return
    bundle_dir = pathlib.Path(bundle_path)
    if bundle_dir.is_dir():
        # rmtree (rather than unlinking direct children) so nested content
        # such as an ``updater/`` key directory is removed as well.
        # ``ignore_errors`` keeps this best-effort.
        shutil.rmtree(bundle_dir, ignore_errors=True)

View File

@@ -0,0 +1,504 @@
"""Agent-enrollment bundles — the Wazuh-style one-liner flow.
Three endpoints:
POST /swarm/enroll-bundle — admin issues certs + builds payload
GET /swarm/enroll-bundle/{t}.sh — bootstrap script (idempotent until .tgz)
GET /swarm/enroll-bundle/{t}.tgz — tarball payload (one-shot; trips served)
The operator's paste is a single pipe ``curl -fsSL <.sh> | sudo bash``.
Under the hood the bootstrap curls the ``.tgz`` from the same token.
Both files are rendered + persisted on POST; the ``.tgz`` GET atomically
marks the token served, reads the bytes under the lock, and unlinks both
files so a sweeper cannot race it. Unclaimed tokens expire after 5 min.
We avoid the single-self-extracting-script pattern because ``bash`` run
via pipe has ``$0 == "bash"`` — there is no file on disk to ``tail`` for
the embedded payload. Two URLs, one paste.
"""
from __future__ import annotations
import asyncio
import io
import os
import pathlib
import secrets
import tarfile
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
from pydantic import BaseModel, Field
from decnet.logging import get_logger
from decnet.swarm import pki
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_mgmt.enroll_bundle")
router = APIRouter()
# Lifetime of an unclaimed bundle (the module docstring's "5 min").
# NOTE(review): presumably added to the creation time to obtain
# ``_Bundle.expires_at`` — the code doing so is not in this part of the file.
BUNDLE_TTL = timedelta(minutes=5)
# Directory holding rendered bundle files; overridable through the
# DECNET_ENROLL_BUNDLE_DIR environment variable.
BUNDLE_DIR = pathlib.Path(os.environ.get("DECNET_ENROLL_BUNDLE_DIR", "/tmp/decnet-enroll"))  # nosec B108 - short-lived 0600 bundle cache, env-overridable
# Seconds the sweeper sleeps between two cleanup passes.
SWEEP_INTERVAL_SECS = 30
# Include list — explicit set of paths that ship to the agent. An
# include list fails closed: anything new on the master (stray .env, dev
# venvs, data dumps, editor scratch dirs) cannot leak into the bundle
# just because we forgot to exclude it.
#
# What the agent actually needs:
# * pyproject.toml at the repo root, so ``pip install`` works against
# the bundle during enroll_bootstrap.sh.
# * the ``decnet/`` package, MINUS the master-only subtrees called out
# by _EXCLUDED_DECNET_SUBTREES — those never import on an agent host.
# Everything else the bootstrap needs (the INI, certs, systemd units) is
# synthesized in-memory by ``_build_tarball`` below — it never hits the
# filesystem walk.
# Top-level files shipped verbatim. Relative to the repo root.
# An entry that is not a regular file on disk is skipped, not an error.
_INCLUDED_ROOT_FILES: tuple[str, ...] = ("pyproject.toml",)
# Top-level directories walked into the bundle. Relative to the repo root.
# An entry that is not a directory on disk is skipped, not an error.
_INCLUDED_DIRS: tuple[str, ...] = ("decnet",)
# Subtrees of an included directory that must NOT ship. Paths are
# relative to the repo root, forward-slash separated, and are matched
# exactly against "<parent dir>/<dir name>" while walking, so each entry
# prunes that directory and everything below it.
# * ``decnet/web`` — FastAPI master app, unused by agents.
# * ``decnet/mutator`` — schedules respawns swarm-wide; master-only.
# * ``decnet/profiler`` — rebuilds profiles against the master DB.
_EXCLUDED_DECNET_SUBTREES: frozenset[str] = frozenset({
    "decnet/web",
    "decnet/mutator",
    "decnet/profiler",
})
# ---------------------------------------------------------------------------
# DTOs
# ---------------------------------------------------------------------------
class EnrollBundleRequest(BaseModel):
    # Request body of POST /swarm/enroll-bundle.
    # (Documented with comments, not a docstring, so the generated schema
    # description is unchanged.)

    # Non-empty, at most 253 characters; no character-set restriction is
    # applied here.
    master_host: str = Field(..., min_length=1, max_length=253,
                             description="IP/host the agent will reach back to")
    # Lowercase alphanumerics and hyphens, starting with an alphanumeric,
    # 1-63 characters. The endpoint rejects a name that is already
    # enrolled with 409.
    agent_name: str = Field(..., pattern=r"^[a-z0-9][a-z0-9-]{0,62}$",
                            description="Worker name (DNS-label safe)")
    with_updater: bool = Field(
        default=True,
        description="Include updater cert bundle and auto-start decnet updater on the agent",
    )
    use_ipvlan: bool = Field(
        default=False,
        description=(
            "Run deckies on this agent over IPvlan L2 instead of MACVLAN. "
            "Required when the agent is a VirtualBox/VMware guest bridged over Wi-Fi — "
            "Wi-Fi APs bind one MAC per station, so MACVLAN's extra container MACs "
            "rotate the VM's DHCP lease. Safe no-op on wired/bare-metal hosts."
        ),
    )
    # None (the default) means no services.ini is supplied.
    services_ini: Optional[str] = Field(
        default=None,
        description="Optional INI text shipped to the agent as /etc/decnet/services.ini",
    )
class EnrollBundleResponse(BaseModel):
    # Response body of POST /swarm/enroll-bundle.
    # NOTE(review): the field meanings below are inferred from the module
    # docstring; the code that fills them is outside this part of the file.

    # Bundle token — presumably the ``{t}`` of the .sh/.tgz download URLs.
    token: str
    # Presumably the one-line ``curl … | sudo bash`` paste for the operator.
    command: str
    # Presumably the moment the unclaimed bundle stops being served.
    expires_at: datetime
    # Presumably the UUID assigned to the newly enrolled worker.
    host_uuid: str
# ---------------------------------------------------------------------------
# In-memory registry
# ---------------------------------------------------------------------------
@dataclass
class _Bundle:
    # Registry entry for one issued enrollment bundle.

    # Rendered bootstrap script on disk; unlinked by the sweeper.
    sh_path: pathlib.Path
    # Rendered tarball payload on disk; unlinked by the sweeper.
    tgz_path: pathlib.Path
    # The sweeper drops the entry once this instant is reached
    # (compared against a UTC-aware ``datetime.now``).
    expires_at: datetime
    # UUID of the host this bundle belongs to.
    host_uuid: str
    # True once the bundle counts as consumed; a served entry is removed
    # by the sweeper regardless of its expiry.
    served: bool = False
# token -> bundle registry, kept in process memory only.
_BUNDLES: dict[str, _Bundle] = {}
# Held by the sweeper while it inspects and prunes ``_BUNDLES``.
_LOCK = asyncio.Lock()
# Handle of the background sweeper task; None until _ensure_sweeper()
# first starts it.
_SWEEPER_TASK: Optional[asyncio.Task] = None
async def _sweep_loop() -> None:
    """Periodically drop served/expired bundles and delete their files.

    Runs forever: sleep ``SWEEP_INTERVAL_SECS``, then, under ``_LOCK``,
    remove every registry entry that is served or past ``expires_at`` and
    unlink its ``.sh`` and ``.tgz`` files. Cancellation propagates; any
    other error is logged and the loop carries on with the next pass.
    """
    while True:
        try:
            await asyncio.sleep(SWEEP_INTERVAL_SECS)
            cutoff = datetime.now(timezone.utc)
            async with _LOCK:
                # Collect first: the dict must not change size while it
                # is being iterated.
                stale_tokens = []
                for token, bundle in _BUNDLES.items():
                    if bundle.served or bundle.expires_at <= cutoff:
                        stale_tokens.append(token)
                for token in stale_tokens:
                    bundle = _BUNDLES.pop(token)
                    for path in (bundle.sh_path, bundle.tgz_path):
                        try:
                            path.unlink()
                        except FileNotFoundError:
                            # Already gone: nothing left to clean up.
                            pass
                        except OSError as exc:
                            log.warning("enroll-bundle sweep unlink failed path=%s err=%s", path, exc)
        except asyncio.CancelledError:
            raise
        except Exception:  # noqa: BLE001
            log.exception("enroll-bundle sweeper iteration failed")
def _ensure_sweeper() -> None:
    """Start the background sweeper task unless one is still running.

    A task that has finished (for any reason) is replaced by a fresh one.
    """
    global _SWEEPER_TASK
    still_running = _SWEEPER_TASK is not None and not _SWEEPER_TASK.done()
    if still_running:
        return
    _SWEEPER_TASK = asyncio.create_task(_sweep_loop())
# ---------------------------------------------------------------------------
# Tarball construction
# ---------------------------------------------------------------------------
def _repo_root() -> pathlib.Path:
    """Return the repository root, derived from this module's location."""
    # decnet/web/router/swarm_mgmt/api_enroll_bundle.py -> 4 parents = repo root.
    this_file = pathlib.Path(__file__).resolve()
    return this_file.parents[4]
def _iter_included(root: pathlib.Path) -> "list[tuple[pathlib.Path, str]]":
    """Collect ``(full_path, arcname)`` for every file that ships to the agent.

    Sources are the files in :data:`_INCLUDED_ROOT_FILES` (when present)
    and everything below the directories in :data:`_INCLUDED_DIRS`, minus:

    - ``__pycache__`` directories and the subtrees listed in
      :data:`_EXCLUDED_DECNET_SUBTREES`, which are pruned at directory
      level so the walk never descends into them;
    - ``.pyc`` / ``.pyo`` files;
    - symlinked files.

    The result is sorted by arcname so the tarball order is deterministic.
    """
    collected: list[tuple[pathlib.Path, str]] = []

    # Top-level files.
    for rel in _INCLUDED_ROOT_FILES:
        candidate = root / rel
        if candidate.is_file():
            collected.append((candidate, rel))

    # Top-level dirs, pruned.
    for top in _INCLUDED_DIRS:
        base = root / top
        if not base.is_dir():
            continue
        for dirpath, dirnames, filenames in os.walk(base, topdown=True, followlinks=False):
            here = pathlib.Path(dirpath)
            rel_here = here.relative_to(root).as_posix()

            # Prune BEFORE descending: with topdown=True, os.walk only
            # honours in-place edits of ``dirnames``.
            kept_dirs = []
            for sub in dirnames:
                if sub == "__pycache__":
                    continue
                if f"{rel_here}/{sub}" in _EXCLUDED_DECNET_SUBTREES:
                    continue
                kept_dirs.append(sub)
            dirnames[:] = kept_dirs

            for fname in filenames:
                if fname.endswith((".pyc", ".pyo")):
                    continue
                full = here / fname
                if full.is_symlink():
                    continue
                collected.append((full, f"{rel_here}/{fname}"))

    # Deterministic tarball ordering (by arcname; the sort is stable).
    return sorted(collected, key=lambda pair: pair[1])
def _render_decnet_ini(
    master_host: str,
    host_uuid: str,
    use_ipvlan: bool = False,
    swarmctl_port: int = 8770,
) -> bytes:
    """Render the agent-mode ``decnet.ini`` and return it UTF-8 encoded.

    The values are interpolated verbatim (no escaping); every line,
    including the last, ends with a newline.
    """
    ipvlan_flag = "true" if use_ipvlan else "false"
    lines = [
        "; Generated by DECNET agent-enrollment bundle.",
        "[decnet]",
        "mode = agent",
        "disallow-master = true",
        "log-directory = /var/log/decnet",
        f"ipvlan = {ipvlan_flag}",
        "",  # blank line between the two sections
        "[agent]",
        f"master-host = {master_host}",
        f"swarmctl-port = {swarmctl_port}",
        "swarm-syslog-port = 6514",
        "agent-port = 8765",
        "agent-dir = /etc/decnet/agent",
        "updater-dir = /etc/decnet/updater",
        f"host-uuid = {host_uuid}",
    ]
    text = "\n".join(lines) + "\n"
    return text.encode()
def _add_bytes(tar: tarfile.TarFile, name: str, data: bytes, mode: int = 0o644) -> None:
    """Add an in-memory file called *name* with content *data* to *tar*.

    The member gets permission bits *mode* and the current time as mtime.
    """
    member = tarfile.TarInfo(name)
    member.mode = mode
    member.size = len(data)
    member.mtime = int(datetime.now(timezone.utc).timestamp())
    payload = io.BytesIO(data)
    tar.addfile(member, payload)
def _build_tarball(
    master_host: str,
    agent_name: str,
    host_uuid: str,
    issued: pki.IssuedCert,
    services_ini: Optional[str],
    updater_issued: Optional[pki.IssuedCert] = None,
    use_ipvlan: bool = False,
) -> bytes:
    """Build the gzipped enrollment tarball and return its bytes.

    Members, in order:
      - agent-required source (see :data:`_INCLUDED_DIRS` /
        :data:`_INCLUDED_ROOT_FILES`; master-only decnet/ subtrees
        pruned)
      - etc/decnet/decnet.ini (pre-baked for mode=agent)
      - etc/systemd/system/<unit>.service for each of ``_SYSTEMD_UNITS``
      - home/.decnet/agent/{ca.crt,worker.crt,worker.key}
      - home/.decnet/updater/{ca.crt,updater.crt,updater.key} (if updater_issued)
      - services.ini at root if provided (and non-empty)

    Private keys are stored with mode 0600, everything else synthesized
    here with the ``_add_bytes`` default.
    """
    # Synthesized members as (arcname, data, mode); None = default mode.
    synthesized: list = [
        (
            "etc/decnet/decnet.ini",
            _render_decnet_ini(master_host, host_uuid, use_ipvlan),
            None,
        ),
    ]
    for unit in _SYSTEMD_UNITS:
        synthesized.append((
            f"etc/systemd/system/{unit}.service",
            _render_systemd_unit(unit, agent_name, master_host),
            None,
        ))
    synthesized.append(("home/.decnet/agent/ca.crt", issued.ca_cert_pem, None))
    synthesized.append(("home/.decnet/agent/worker.crt", issued.cert_pem, None))
    synthesized.append(("home/.decnet/agent/worker.key", issued.key_pem, 0o600))
    if updater_issued is not None:
        synthesized.append(("home/.decnet/updater/ca.crt", updater_issued.ca_cert_pem, None))
        synthesized.append(("home/.decnet/updater/updater.crt", updater_issued.cert_pem, None))
        synthesized.append(("home/.decnet/updater/updater.key", updater_issued.key_pem, 0o600))
    if services_ini:
        synthesized.append(("services.ini", services_ini.encode(), None))

    out = io.BytesIO()
    with tarfile.open(fileobj=out, mode="w:gz") as tar:
        # Real files first, one member each (no recursion: the include
        # walk already enumerated individual files).
        for src_path, arcname in _iter_included(_repo_root()):
            tar.add(src_path, arcname=arcname, recursive=False)
        for arcname, data, mode in synthesized:
            if mode is None:
                _add_bytes(tar, arcname, data)
            else:
                _add_bytes(tar, arcname, data, mode=mode)
    return out.getvalue()
# Base names of the systemd units shipped in every bundle. Each name is
# rendered from ``templates/<name>.service.j2`` and stored in the tarball
# as ``etc/systemd/system/<name>.service``.
_SYSTEMD_UNITS = (
    "decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater",
    # Per-host microservices — activated by enroll_bootstrap.sh. The
    # profiler intentionally stays master-side: it rebuilds attacker
    # profiles against the master DB, which workers don't share.
    "decnet-collector", "decnet-prober", "decnet-sniffer",
)
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:
    """Render ``templates/<name>.service.j2`` and return it encoded as bytes.

    The template is located relative to this file and filled in by plain
    string replacement of the ``{{ agent_name }}`` and ``{{ master_host }}``
    placeholders (no Jinja engine is involved).
    """
    here = pathlib.Path(__file__).resolve()
    template_file = here.parents[1].parent / "templates" / f"{name}.service.j2"
    unit_text = template_file.read_text()
    unit_text = unit_text.replace("{{ agent_name }}", agent_name)
    unit_text = unit_text.replace("{{ master_host }}", master_host)
    return unit_text.encode()
def _render_bootstrap(
    agent_name: str,
    master_host: str,
    tarball_url: str,
    expires_at: datetime,
    with_updater: bool,
) -> bytes:
    """Render ``templates/enroll_bootstrap.sh.j2`` and return it as bytes.

    Placeholders are filled by plain string replacement, in a fixed order.
    Timestamps are written as ISO-8601 with microseconds dropped;
    *with_updater* is rendered as the literal ``true`` / ``false``.
    """
    here = pathlib.Path(__file__).resolve()
    template_file = here.parents[1].parent / "templates" / "enroll_bootstrap.sh.j2"
    script = template_file.read_text()
    generated_at = datetime.now(timezone.utc).replace(microsecond=0).isoformat()

    # Applied sequentially, so the order matches the original chained
    # ``.replace`` calls.
    substitutions = (
        ("{{ agent_name }}", agent_name),
        ("{{ master_host }}", master_host),
        ("{{ tarball_url }}", tarball_url),
        ("{{ generated_at }}", generated_at),
        ("{{ expires_at }}", expires_at.replace(microsecond=0).isoformat()),
        ("{{ with_updater }}", "true" if with_updater else "false"),
    )
    for placeholder, value in substitutions:
        script = script.replace(placeholder, value)
    return script.encode()
# ---------------------------------------------------------------------------
# Endpoints
# ---------------------------------------------------------------------------
@router.post(
    "/enroll-bundle",
    response_model=EnrollBundleResponse,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def create_enroll_bundle(
    req: EnrollBundleRequest,
    request: Request,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> EnrollBundleResponse:
    """Issue certs for a new worker and stage a downloadable enrollment bundle.

    Steps: reject duplicate agent names, issue the worker (and optionally
    updater) certificates, register the SwarmHost row with an empty address,
    build the tarball and bootstrap script, write both under ``BUNDLE_DIR``
    keyed by a random token, and record the bundle in ``_BUNDLES``.

    Returns:
        The token, the ``curl … | sudo bash`` command for the operator, the
        expiry time, and the new host UUID.

    Raises:
        HTTPException: 409 when a host named ``req.agent_name`` already exists.
    """
    import asyncio
    import uuid as _uuid

    existing = await repo.get_swarm_host_by_name(req.agent_name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled")

    # 1. Issue certs (reuses the same code as /swarm/enroll). The worker's own
    #    address is not known yet — the master learns it when the agent fetches
    #    the tarball (see get_payload), which also backfills the SwarmHost row.
    ca = pki.ensure_ca()
    sans = list({req.agent_name, req.master_host})
    issued = pki.issue_worker_cert(ca, req.agent_name, sans)
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_issued: Optional[pki.IssuedCert] = None
    updater_fp: Optional[str] = None
    if req.with_updater:
        updater_cn = f"updater@{req.agent_name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
        (updater_dir / "updater.key").write_bytes(updater_issued.key_pem)
        os.chmod(updater_dir / "updater.key", 0o600)
        updater_fp = updater_issued.fingerprint_sha256

    # 2. Register the host row so it shows up in SwarmHosts immediately.
    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.agent_name,
            "address": "",  # filled in when the agent fetches the .tgz (its source IP)
            "agent_port": 8765,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": "enrolled via UI bundle",
            "use_ipvlan": req.use_ipvlan,
        }
    )

    # 3. Render payload + bootstrap. Building the tarball walks the source
    #    tree and gzips it synchronously; run it in a worker thread so the
    #    event loop keeps serving other requests meanwhile.
    tarball = await asyncio.to_thread(
        _build_tarball,
        req.master_host, req.agent_name, host_uuid, issued, req.services_ini, updater_issued,
        use_ipvlan=req.use_ipvlan,
    )
    token = secrets.token_urlsafe(24)
    expires_at = datetime.now(timezone.utc) + BUNDLE_TTL

    BUNDLE_DIR.mkdir(parents=True, exist_ok=True, mode=0o700)
    sh_path = BUNDLE_DIR / f"{token}.sh"
    tgz_path = BUNDLE_DIR / f"{token}.tgz"

    # Build URLs against the operator-supplied master_host (reachable from the
    # new agent) rather than request.base_url, which reflects how the dashboard
    # user reached us — often 127.0.0.1 behind a proxy or loopback-bound API.
    scheme = request.url.scheme
    port = request.url.port
    netloc = req.master_host if port is None else f"{req.master_host}:{port}"
    base = f"{scheme}://{netloc}"
    tarball_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.tgz"
    bootstrap_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.sh"
    script = _render_bootstrap(req.agent_name, req.master_host, tarball_url, expires_at, req.with_updater)

    tgz_path.write_bytes(tarball)
    sh_path.write_bytes(script)
    # The tarball carries private keys; restrict both files to the owner.
    os.chmod(tgz_path, 0o600)
    os.chmod(sh_path, 0o600)

    async with _LOCK:
        _BUNDLES[token] = _Bundle(
            sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at, host_uuid=host_uuid,
        )
    _ensure_sweeper()

    # Only a token prefix is logged; the full token grants download access.
    log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8])
    return EnrollBundleResponse(
        token=token,
        command=f"curl -fsSL {bootstrap_url} | sudo bash",
        expires_at=expires_at,
        host_uuid=host_uuid,
    )
def _now() -> datetime:
# Indirection so tests can monkeypatch.
return datetime.now(timezone.utc)
async def _lookup_live(token: str) -> _Bundle:
    """Return the bundle registered for *token* if it can still be served.

    Raises:
        HTTPException: 404 when the token is unknown, the bundle was already
            served, or its expiry time has passed.
    """
    bundle = _BUNDLES.get(token)
    still_live = (
        bundle is not None
        and not bundle.served
        and bundle.expires_at > _now()
    )
    if not still_live:
        raise HTTPException(status_code=404, detail="bundle not found or expired")
    return bundle
@router.get(
    "/enroll-bundle/{token}.sh",
    tags=["Swarm Management"],
    include_in_schema=False,
)
async def get_bootstrap(token: str) -> Response:
    """Serve the bootstrap shell script for a live bundle token.

    Unlike the tarball endpoint, fetching the script does not mark the
    bundle as served. Responds 404 (via ``_lookup_live``) for unknown,
    served, or expired tokens.
    """
    async with _LOCK:
        bundle = await _lookup_live(token)
        script_bytes = bundle.sh_path.read_bytes()
    return Response(content=script_bytes, media_type="text/x-shellscript")
@router.get(
    "/enroll-bundle/{token}.tgz",
    tags=["Swarm Management"],
    include_in_schema=False,
)
async def get_payload(
    token: str,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> Response:
    """Serve the enrollment tarball once, then delete the staged files.

    The bundle is flagged as served before its bytes are read, so any later
    request for the same token gets a 404 from ``_lookup_live``. After the
    download, the requester's source IP is written to the SwarmHost row as
    the worker's address (best-effort; failures are only logged).
    """
    async with _LOCK:
        bundle = await _lookup_live(token)
        bundle.served = True
        payload = bundle.tgz_path.read_bytes()
        host_uuid = bundle.host_uuid
        # Both staged files are removed; one already being gone is fine.
        for staged in (bundle.sh_path, bundle.tgz_path):
            staged.unlink(missing_ok=True)

    # The agent's first connect-back — its source IP is the reachable address
    # the master will later use to probe it. Backfill the SwarmHost row here
    # so the operator sees the real address instead of an empty placeholder.
    peer = request.client
    client_host = peer.host if peer else ""
    if client_host:
        try:
            await repo.update_swarm_host(host_uuid, {"address": client_host})
        except Exception as e:  # noqa: BLE001
            log.warning("enroll-bundle could not backfill address host=%s err=%s", host_uuid, e)

    return Response(content=payload, media_type="application/gzip")

View File

@@ -0,0 +1,58 @@
"""GET /swarm/deckies — admin-gated list of decky shards across the fleet."""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends
from decnet.web.db.models import DeckyShardView
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
router = APIRouter()
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Management"])
async def list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    """List decky shards, each joined with its host's name, address and status.

    *host_uuid* is passed through to the repository query; *state*, when
    given, keeps only shards whose ``state`` equals it. Shards whose host is
    not in the host table are still returned, with placeholder host fields.
    """
    shards = await repo.list_decky_shards(host_uuid)

    hosts_by_uuid = {}
    for host_row in await repo.list_swarm_hosts():
        hosts_by_uuid[host_row["uuid"]] = host_row

    # Pre-heartbeat fallback — older rows without decky_config can still
    # surface their IP from the master's deploy state snapshot.
    deploy_state = await repo.get_state("deployment") or {}
    configured = (deploy_state.get("config") or {}).get("deckies") or []
    ip_by_name: dict[str, str] = {}
    for entry in configured:
        if entry.get("name"):
            ip_by_name[entry.get("name")] = entry.get("ip")

    views: list[DeckyShardView] = []
    for shard in shards:
        if not state or shard.get("state") == state:
            owner = hosts_by_uuid.get(shard["host_uuid"], {})
            view = DeckyShardView(
                decky_name=shard["decky_name"],
                decky_ip=shard.get("decky_ip") or ip_by_name.get(shard["decky_name"]),
                host_uuid=shard["host_uuid"],
                host_name=owner.get("name") or "<unknown>",
                host_address=owner.get("address") or "",
                host_status=owner.get("status") or "unknown",
                services=shard.get("services") or [],
                state=shard.get("state") or "pending",
                last_error=shard.get("last_error"),
                compose_hash=shard.get("compose_hash"),
                updated_at=shard["updated_at"],
                hostname=shard.get("hostname"),
                distro=shard.get("distro"),
                archetype=shard.get("archetype"),
                service_config=shard.get("service_config") or {},
                mutate_interval=shard.get("mutate_interval"),
                last_mutated=shard.get("last_mutated") or 0.0,
                last_seen=shard.get("last_seen"),
            )
            views.append(view)
    return views

View File

@@ -0,0 +1,60 @@
"""GET /swarm/hosts — admin-gated list of enrolled workers for the dashboard.
Fans out an ``AgentClient.health()`` probe to each host on every call and
updates ``status`` / ``last_heartbeat`` as a side effect. This mirrors how
``/swarm-updates/hosts`` probes the updater daemon — the SwarmHosts page
polls this endpoint, so probe-on-read is what drives heartbeat freshness
in the UI. No separate scheduler needed.
"""
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
from typing import Any, Optional
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.models import SwarmHostView
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_mgmt.list_hosts")
router = APIRouter()
async def _probe_and_update(
host: dict[str, Any], repo: BaseRepository
) -> dict[str, Any]:
"""Best-effort mTLS probe. Skips hosts with no address yet (pending first
connect-back) so we don't pollute the DB with 'unreachable' on fresh
enrollments that haven't fetched the tarball."""
if not host.get("address"):
return host
try:
async with AgentClient(host=host) as agent:
await agent.health()
patch = {"status": "active", "last_heartbeat": datetime.now(timezone.utc)}
except Exception as exc: # noqa: BLE001
log.debug("swarm/hosts probe unreachable host=%s err=%s", host.get("name"), exc)
patch = {"status": "unreachable"}
try:
await repo.update_swarm_host(host["uuid"], patch)
except Exception as exc: # noqa: BLE001
log.warning("swarm/hosts could not persist probe result host=%s err=%s", host.get("name"), exc)
return host
host.update(patch)
return host
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Management"])
async def list_hosts(
    host_status: Optional[str] = None,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """Return enrolled hosts after probing each of them concurrently.

    *host_status* is forwarded to the repository query. Every returned row
    goes through ``_probe_and_update`` before being converted to a view.
    """
    rows = await repo.list_swarm_hosts(host_status)
    probes = [_probe_and_update(row, repo) for row in rows]
    refreshed = await asyncio.gather(*probes)
    return [SwarmHostView(**row) for row in refreshed]

View File

@@ -0,0 +1,150 @@
"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker.
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
the agent tears down the entire host (all deckies + network); otherwise it
tears down that single decky.
Async-by-default: the endpoint returns 202 the moment the request is
accepted and runs the actual agent call + DB cleanup in a background task.
That lets the operator queue multiple teardowns in parallel without
blocking on slow docker-compose-down cycles on the worker.
"""
from __future__ import annotations
import asyncio
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel
from decnet.logging import get_logger
from decnet.swarm.client import AgentClient
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm.teardown")
router = APIRouter()
# Track spawned background tasks so (a) they're not GC'd mid-flight and
# (b) tests can drain them deterministically via ``await drain_pending()``.
_PENDING: "set[asyncio.Task]" = set()
def _spawn(coro) -> asyncio.Task:
task = asyncio.create_task(coro)
_PENDING.add(task)
task.add_done_callback(_PENDING.discard)
return task
async def drain_pending() -> None:
    """Wait until no teardown tasks remain outstanding. Used by tests.

    Loops because tasks may be added while a batch is being awaited.
    Exceptions raised by tasks are collected, not propagated.
    """
    while _PENDING:
        outstanding = tuple(_PENDING)
        await asyncio.gather(*outstanding, return_exceptions=True)
class TeardownHostRequest(BaseModel):
    """Request body for the host teardown endpoint."""

    # Name of a single decky to tear down; null/omitted targets the whole host.
    decky_id: Optional[str] = None
class TeardownHostResponse(BaseModel):
    """Acknowledgement returned when a teardown request has been queued."""

    host_uuid: str
    host_name: str
    # Echo of the requested decky, or None for a whole-host teardown.
    decky_id: Optional[str] = None
    accepted: bool
    detail: str
async def _mark_tearing_down(
repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
"""Flip affected shards to state='tearing_down' so the UI can show
progress immediately while the background task runs."""
shards = await repo.list_decky_shards(host_uuid)
for s in shards:
if decky_id and s.get("decky_name") != decky_id:
continue
await repo.upsert_decky_shard({
**s,
"state": "tearing_down",
"last_error": None,
})
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Run the remote teardown, then clean up or flag the shard rows.

    Never raises, because it runs as a detached background task. If the
    agent call fails, the affected shards are marked ``teardown_failed``
    with a truncated error message and are not deleted. If it succeeds, the
    shard rows are removed (one decky, or all shards for the host).
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # Reflect the failure on the shard(s) — don't delete on failure,
        # the operator needs to see what went wrong and retry.
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if not decky_id or shard.get("decky_name") == decky_id:
                    await repo.upsert_decky_shard({
                        **shard,
                        "state": "teardown_failed",
                        "last_error": str(exc)[:512],
                    })
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
        return

    # Agent call succeeded: drop the rows that described what was torn down.
    try:
        if not decky_id:
            await repo.delete_decky_shards_for_host(host["uuid"])
        else:
            await repo.delete_decky_shard(decky_id)
    except Exception:
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
@router.post(
    "/hosts/{uuid}/teardown",
    response_model=TeardownHostResponse,
    status_code=status.HTTP_202_ACCEPTED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Request body or path parameter validation error"},
    },
)
async def teardown_host(
    uuid: str,
    req: TeardownHostRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> TeardownHostResponse:
    """Queue a teardown for one decky or a whole host and return 202.

    The affected shards are flipped to ``tearing_down`` before the response
    is sent; the agent call and DB cleanup happen in a background task.

    Raises:
        HTTPException: 404 when no host with *uuid* exists.
    """
    host = await repo.get_swarm_host_by_uuid(uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="host not found")

    target_decky = req.decky_id
    await _mark_tearing_down(repo, uuid, target_decky)

    # Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the
    # task runs independently of this request's lifecycle — the operator
    # can queue another teardown the moment this one returns 202 without
    # waiting for any per-request cleanup phase.
    _spawn(_run_teardown(host, repo, target_decky))

    return TeardownHostResponse(
        host_uuid=uuid,
        host_name=host.get("name") or "",
        decky_id=target_decky,
        accepted=True,
        detail="teardown queued",
    )

View File

@@ -0,0 +1,23 @@
"""Remote Updates — master dashboard's surface for pushing code to workers.
These are *not* the swarm-controller's /swarm routes (those run on a
separate process, auth-free, internal-only). They live on the main web
API, go through ``require_admin``, and are the interface the React
dashboard calls to fan updates out to worker ``decnet updater`` daemons
via ``UpdaterClient``.
Mounted under ``/api/v1/swarm-updates`` by the main api router.
"""
from fastapi import APIRouter
from .api_list_host_releases import router as list_host_releases_router
from .api_push_update import router as push_update_router
from .api_push_update_self import router as push_update_self_router
from .api_rollback_host import router as rollback_host_router
# Aggregate router for the Remote Updates surface; each sub-router
# contributes the endpoints of one module. Inclusion order is preserved.
swarm_updates_router = APIRouter(prefix="/swarm-updates")
for _sub_router in (
    list_host_releases_router,
    push_update_router,
    push_update_self_router,
    rollback_host_router,
):
    swarm_updates_router.include_router(_sub_router)
del _sub_router  # keep the module namespace unchanged

View File

@@ -0,0 +1,86 @@
"""GET /swarm-updates/hosts — per-host updater health + release slots.
Fans out an ``UpdaterClient.health()`` probe to every enrolled host that
has an updater bundle. Each probe is isolated: a single unreachable host
never fails the whole list (that's normal partial-failure behaviour for
a fleet view).
"""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import HostReleaseInfo, HostReleasesResponse
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_updates.list")
router = APIRouter()
def _extract_shas(releases: list[dict[str, Any]]) -> tuple[str | None, str | None]:
"""Pick the (current, previous) SHA from the updater's releases list.
The updater reports releases as ``[{"slot": "active"|"prev", "sha": ...,
...}]`` in no guaranteed order, so pull by slot name rather than index.
"""
current = next((r.get("sha") for r in releases if r.get("slot") == "active"), None)
previous = next((r.get("sha") for r in releases if r.get("slot") == "prev"), None)
return current, previous
async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
    """Query one host's updater and summarise the result.

    Any exception from the client is turned into a ``reachable=False`` entry
    whose detail is ``"<ExceptionType>: <message>"``. On success the entry
    carries the agent status, the active/prev SHAs, and the raw release list.
    """

    def _entry(**fields: Any) -> HostReleaseInfo:
        # Identity fields are shared by the success and failure results.
        return HostReleaseInfo(
            host_uuid=host["uuid"],
            host_name=host["name"],
            address=host["address"],
            **fields,
        )

    try:
        async with UpdaterClient(host=host) as u:
            body = await u.health()
    except Exception as exc:  # noqa: BLE001
        return _entry(reachable=False, detail=f"{type(exc).__name__}: {exc}")

    releases = body.get("releases") or []
    current, previous = _extract_shas(releases)
    return _entry(
        reachable=True,
        agent_status=body.get("agent_status") or body.get("status"),
        current_sha=current,
        previous_sha=previous,
        releases=releases,
    )
@router.get(
    "/hosts",
    response_model=HostReleasesResponse,
    tags=["Swarm Updates"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_host_releases(
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> HostReleasesResponse:
    """Probe every update-capable host concurrently and report its releases.

    A host qualifies when it is not ``decommissioned`` and has an updater
    certificate fingerprint on record.
    """
    # Only hosts actually capable of receiving updates — decommissioned
    # hosts and agent-only enrollments are filtered out.
    targets = []
    for row in await repo.list_swarm_hosts():
        if row.get("status") == "decommissioned":
            continue
        if row.get("updater_cert_fingerprint"):
            targets.append(row)

    if not targets:
        return HostReleasesResponse(hosts=[])

    probes = [_probe_host(target) for target in targets]
    results = await asyncio.gather(*probes)
    return HostReleasesResponse(hosts=list(results))

View File

@@ -0,0 +1,163 @@
"""POST /swarm-updates/push — fan a tarball of the master's tree to workers.
Mirrors the ``decnet swarm update`` CLI flow: build the tarball once,
dispatch concurrently, collect per-host statuses. Returns HTTP 200 even
when individual hosts failed — the operator reads per-host ``status``.
"""
from __future__ import annotations
import asyncio
import pathlib
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_updates.push")
router = APIRouter()
def _master_tree_root() -> pathlib.Path:
    """Resolve the master's install tree to tar.

    Returns ``parents[4]`` of this file's resolved path. For a module under
    ``decnet/web/router/swarm_updates/`` that is the directory containing
    the ``decnet`` package, i.e. the repo root: ``parents[0]`` is
    ``swarm_updates``, then ``router``, ``web``, ``decnet``, and finally the
    root. Matches the layout shipped via ``pip install -e .`` and the dev
    checkout at ``~/Tools/DECNET``.
    """
    return pathlib.Path(__file__).resolve().parents[4]
def _classify_update(status_code: int) -> str:
if status_code == 200:
return "updated"
if status_code == 409:
return "rolled-back"
return "failed"
async def _resolve_targets(
repo: BaseRepository,
req: PushUpdateRequest,
) -> list[dict[str, Any]]:
if req.all == bool(req.host_uuids):
raise HTTPException(
status_code=400,
detail="Specify exactly one of host_uuids or all=true.",
)
rows = await repo.list_swarm_hosts()
rows = [r for r in rows if r.get("updater_cert_fingerprint")]
if req.all:
targets = [r for r in rows if r.get("status") != "decommissioned"]
else:
wanted = set(req.host_uuids or [])
targets = [r for r in rows if r["uuid"] in wanted]
missing = wanted - {r["uuid"] for r in targets}
if missing:
raise HTTPException(
status_code=404,
detail=f"Unknown or updater-less host(s): {sorted(missing)}",
)
if not targets:
raise HTTPException(
status_code=404,
detail="No targets: no enrolled hosts have an updater bundle.",
)
return targets
async def _push_one(
    host: dict[str, Any],
    tarball: bytes,
    sha: str,
    include_self: bool,
) -> PushUpdateResult:
    """Push the tarball to one host's updater and report the outcome.

    The agent update goes first. When *include_self* is set and the agent
    update returned 200, the updater itself is then updated. Every exception
    is converted into a result, so one bad host cannot fail the fan-out.
    """
    try:
        async with UpdaterClient(host=host) as u:
            r = await u.update(tarball, sha=sha)
            body = r.json() if r.content else {}
            body_is_dict = isinstance(body, dict)
            status = _classify_update(r.status_code)
            stderr = body.get("stderr") if body_is_dict else None

            if include_self and r.status_code == 200:
                # Agent first, updater second — a broken updater push must never
                # strand the fleet on an old agent.
                try:
                    self_resp = await u.update_self(tarball, sha=sha)
                    self_ok = self_resp.status_code in (200, 0)  # 0 = connection dropped (expected)
                except Exception as exc:  # noqa: BLE001
                    # Connection drop on update-self is expected and not an error.
                    if not _is_expected_connection_drop(exc):
                        return PushUpdateResult(
                            host_uuid=host["uuid"], host_name=host["name"],
                            status="self-failed", http_status=r.status_code, sha=sha,
                            detail=f"agent updated OK but self-update failed: {exc}",
                            stderr=stderr,
                        )
                    self_ok = True
                status = "self-updated" if self_ok else "self-failed"

            if body_is_dict:
                detail = body.get("error") or body.get("probe")
            else:
                detail = None
            return PushUpdateResult(
                host_uuid=host["uuid"], host_name=host["name"],
                status=status, http_status=r.status_code, sha=sha,
                detail=detail,
                stderr=stderr,
            )
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.push failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
            detail=f"{type(exc).__name__}: {exc}",
        )
def _is_expected_connection_drop(exc: BaseException) -> bool:
    """Tell whether *exc* is the connection drop expected from update-self.

    update-self re-execs the updater mid-response; httpx raises on the drop.
    """
    import httpx

    expected_drops = (
        httpx.RemoteProtocolError,
        httpx.ReadError,
        httpx.ConnectError,
    )
    return isinstance(exc, expected_drops)
@router.post(
    "/push",
    response_model=PushUpdateResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No matching target hosts or no updater-capable hosts enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_push_update(
    req: PushUpdateRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
    """Build one tarball of the master tree and push it to all targets.

    Targets are resolved first, so selector errors surface before any
    tarball work. Per-host outcomes are returned in ``results``.
    """
    targets = await _resolve_targets(repo, req)
    tree_root = _master_tree_root()

    # Both `detect_git_sha` (shells out) and `tar_working_tree` (walks the repo
    # + gzips a few MB) are synchronous CPU+I/O. Running them directly on the
    # event loop blocks every other request until the tarball is built — the
    # dashboard freezes on /swarm-updates push. Offload to a worker thread.
    sha = await asyncio.to_thread(detect_git_sha, tree_root)
    tarball = await asyncio.to_thread(tar_working_tree, tree_root, extra_excludes=req.exclude)
    tarball_size = len(tarball)

    log.info(
        "swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s",
        sha or "(not a git repo)", tarball_size, len(targets), req.include_self,
    )

    pushes = [_push_one(host, tarball, sha, req.include_self) for host in targets]
    results = await asyncio.gather(*pushes)
    return PushUpdateResponse(
        sha=sha,
        tarball_bytes=tarball_size,
        results=list(results),
    )

View File

@@ -0,0 +1,101 @@
"""POST /swarm-updates/push-self — push only to workers' /update-self.
Use case: the agent is fine but the updater itself needs an upgrade (e.g.
a fix to ``executor.py``). Uploading only ``/update-self`` avoids a
redundant agent restart on healthy workers.
No auto-rollback: the updater re-execs itself on success, so a broken
push leaves the worker on the old code — verified by polling ``/health``
after the request returns.
"""
from __future__ import annotations
import asyncio
from typing import Any
from fastapi import APIRouter, Depends
from decnet.logging import get_logger
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
from .api_push_update import _is_expected_connection_drop, _master_tree_root, _resolve_targets
log = get_logger("swarm_updates.push_self")
router = APIRouter()
async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> PushUpdateResult:
    """Push the tarball to one host's ``/update-self`` and report the outcome.

    A connection drop recognised by ``_is_expected_connection_drop`` counts
    as success. Any other exception becomes a ``self-failed`` result, as
    does any HTTP status other than 200.
    """
    try:
        async with UpdaterClient(host=host) as u:
            try:
                resp = await u.update_self(tarball, sha=sha)
                http_status = resp.status_code
                body = resp.json() if resp.content else {}
                succeeded = http_status == 200
                if isinstance(body, dict):
                    detail = body.get("error") or body.get("probe")
                    stderr = body.get("stderr")
                else:
                    detail = None
                    stderr = None
            except Exception as exc:  # noqa: BLE001
                # Connection drops during self-update are expected — the updater
                # re-execs itself mid-response.
                if not _is_expected_connection_drop(exc):
                    raise
                return PushUpdateResult(
                    host_uuid=host["uuid"], host_name=host["name"],
                    status="self-updated", sha=sha,
                    detail="updater re-exec dropped connection (expected)",
                )
        outcome = "self-updated" if succeeded else "self-failed"
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status=outcome,
            http_status=http_status, sha=sha,
            detail=detail, stderr=stderr,
        )
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="self-failed",
            detail=f"{type(exc).__name__}: {exc}",
        )
@router.post(
    "/push-self",
    response_model=PushUpdateResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No matching target hosts or no updater-capable hosts enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_push_update_self(
    req: PushUpdateRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
    """Build one tarball of the master tree and push it to updaters only.

    Uses the same target resolution as ``/push`` but calls each worker's
    ``/update-self`` exclusively. Per-host outcomes are returned in
    ``results``.
    """
    targets = await _resolve_targets(repo, req)
    tree_root = _master_tree_root()

    # Offload sync I/O (git shell-out + tar+gzip of the repo) so the event
    # loop stays responsive while the tarball is being built.
    sha = await asyncio.to_thread(detect_git_sha, tree_root)
    tarball = await asyncio.to_thread(tar_working_tree, tree_root, extra_excludes=req.exclude)
    tarball_size = len(tarball)

    log.info(
        "swarm_updates.push_self sha=%s tarball=%d hosts=%d",
        sha or "(not a git repo)", tarball_size, len(targets),
    )

    pushes = [_push_self_one(host, tarball, sha) for host in targets]
    results = await asyncio.gather(*pushes)
    return PushUpdateResponse(
        sha=sha,
        tarball_bytes=tarball_size,
        results=list(results),
    )

View File

@@ -0,0 +1,77 @@
"""POST /swarm-updates/rollback — manual rollback on a single host.
Calls the worker updater's ``/rollback`` which swaps the ``current``
symlink back to ``releases/prev``. Fails with 404 if the target has no
previous release slot.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.swarm.updater_client import UpdaterClient
from decnet.web.db.models import RollbackRequest, RollbackResponse
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo, require_admin
log = get_logger("swarm_updates.rollback")
router = APIRouter()
@router.post(
    "/rollback",
    response_model=RollbackResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or host has no updater bundle)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Unknown host, or no previous release slot on the worker"},
        422: {"description": "Request body validation error"},
    },
)
async def api_rollback_host(
    req: RollbackRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> RollbackResponse:
    """Ask one worker's updater to roll back to its previous release.

    Returns:
        ``status="rolled-back"`` on a 200 from the worker, or
        ``status="failed"`` for transport errors and other non-404 statuses.

    Raises:
        HTTPException: 404 when the host is unknown or the worker reports no
            previous release; 400 when the host has no updater certificate
            fingerprint on record.
    """
    host = await repo.get_swarm_host_by_uuid(req.host_uuid)
    if host is None:
        raise HTTPException(status_code=404, detail=f"Unknown host: {req.host_uuid}")
    if not host.get("updater_cert_fingerprint"):
        raise HTTPException(
            status_code=400,
            detail=f"Host '{host['name']}' has no updater bundle — nothing to roll back.",
        )

    try:
        async with UpdaterClient(host=host) as u:
            r = await u.rollback()
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
            detail=f"{type(exc).__name__}: {exc}",
        )

    body = r.json() if r.content else {}
    if r.status_code == 404:
        # No previous release — surface as 404 so the UI can render the
        # "nothing to roll back" state distinctly from a transport error.
        # Fall back to the default message whenever the worker supplied no
        # detail: an empty body parses to {} and would otherwise yield None.
        worker_detail = body.get("detail") if isinstance(body, dict) else None
        raise HTTPException(
            status_code=404,
            detail=worker_detail or "No previous release on worker.",
        )
    if r.status_code != 200:
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed", http_status=r.status_code,
            detail=(body.get("error") or body.get("detail")) if isinstance(body, dict) else None,
        )
    return RollbackResponse(
        host_uuid=host["uuid"], host_name=host["name"],
        status="rolled-back", http_status=r.status_code,
        detail=body.get("status") if isinstance(body, dict) else None,
    )

View File

@@ -0,0 +1,6 @@
from fastapi import APIRouter
from .api_deployment_mode import router as deployment_mode_router
# Aggregate router for the /system endpoints; every route included here is
# tagged "System".
system_router = APIRouter(
    prefix="/system",
    tags=["System"],
)
system_router.include_router(deployment_mode_router)

View File

@@ -0,0 +1,41 @@
"""GET /system/deployment-mode — tells the UI whether a deploy will shard
across SWARM workers or land on the master itself.
Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role
plus at least one reachable enrolled worker = swarm; otherwise unihost.
"""
from __future__ import annotations
import os
from fastapi import APIRouter, Depends
from pydantic import BaseModel
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
router = APIRouter()
class DeploymentModeResponse(BaseModel):
    """Response body of ``GET /system/deployment-mode``."""

    # "swarm" or "unihost"
    mode: str
    # "master" or "agent"
    role: str
    # Number of hosts counted as usable swarm targets (0 when not master).
    swarm_host_count: int
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
async def get_deployment_mode(
    repo: BaseRepository = Depends(get_repo),
) -> DeploymentModeResponse:
    """Report whether a deploy would shard across workers or stay local.

    The role comes from the ``DECNET_MODE`` environment variable (default
    ``master``, lower-cased). On a master, hosts with status ``active`` or
    ``enrolled`` and a non-empty address are counted; one or more means
    ``swarm``, otherwise ``unihost``. Non-master roles always report zero
    hosts.
    """
    role = os.environ.get("DECNET_MODE", "master").lower()

    if role == "master":
        usable = [
            row for row in await repo.list_swarm_hosts()
            if row.get("status") in ("active", "enrolled") and row.get("address")
        ]
        hosts = len(usable)
    else:
        hosts = 0

    mode = "swarm" if hosts > 0 else "unihost"
    return DeploymentModeResponse(
        mode=mode,
        role=role,
        swarm_host_count=hosts,
    )

View File

@@ -0,0 +1,55 @@
"""MazeNET topology REST endpoints (phase 3).
Thin FastAPI layer over the phase-2 topology machinery:
generate/validate/deploy/teardown, pending-only child CRUD, and the
live-mutation queue for active|degraded topologies.
Mounted at ``/api/v1/topologies`` by the main api router. Sub-routers
live one-per-file and are aggregated here.
"""
from fastapi import APIRouter
from .api_catalog import router as _catalog_router
from .api_create_topology import router as _create_router
from .api_create_blank_topology import router as _create_blank_router
from .api_decky_crud import router as _decky_router
from .api_delete_topology import router as _delete_router
from .api_deploy_topology import router as _deploy_router
from .api_edge_crud import router as _edge_router
from .api_events import router as _events_router
from .api_get_topology import router as _get_router
from .api_lan_crud import router as _lan_router
from .api_list_topologies import router as _list_router
from .api_mutations import router as _mutations_router
from .api_personas import router as _personas_router
from .api_reap_orphans import router as _reap_router
from .api_teardown_topology import router as _teardown_router
topology_router = APIRouter(prefix="/topologies", tags=["topologies"])

# Registration order matters. Routes are matched in the order they were
# included, so:
#   * the catalog router goes first — its literal paths (e.g. /services,
#     /next-subnet) would otherwise be shadowed by the `/{topology_id}`
#     path in api_get_topology;
#   * the personas router (literal suffix `/{id}/personas`) is registered
#     before the bare `/{id}` getter;
#   * the bare `/{id}` getter is the parameterized fallback and stays last.
_SUB_ROUTERS_IN_ORDER = (
    _catalog_router,
    _list_router,
    _create_blank_router,
    _create_router,
    _reap_router,
    _deploy_router,
    _teardown_router,
    _delete_router,
    _lan_router,
    _decky_router,
    _edge_router,
    _mutations_router,
    _events_router,
    _personas_router,
    _get_router,
)
for _sub_router in _SUB_ROUTERS_IN_ORDER:
    topology_router.include_router(_sub_router)
del _sub_router  # don't leave the loop variable behind as a module attribute

__all__ = ["topology_router"]

View File

@@ -0,0 +1,53 @@
"""Shared helpers for the Phase-3 child-CRUD routes."""
from __future__ import annotations
from typing import Any
from fastapi import HTTPException
from decnet.topology.status import (
TopologyNotEditable,
TopologyStatus,
VersionConflict,
)
from decnet.web.dependencies import repo
async def get_topology_or_404(topology_id: str) -> dict[str, Any]:
    """Fetch the topology row for *topology_id*.

    Returns:
        The row returned by ``repo.get_topology``.

    Raises:
        HTTPException: 404 when the repository returns ``None``.
    """
    row = await repo.get_topology(topology_id)
    if row is not None:
        return row
    raise HTTPException(status_code=404, detail="Topology not found")
async def assert_pending_or_409(topology_id: str) -> dict[str, Any]:
    """Return the topology row, insisting that it is still ``pending``.

    Mutation methods in the repo layer apply the same rule themselves, but
    the ``add_*`` helpers do not. Checking here gives every write route the
    same pre-condition before it performs any side effect.

    Raises:
        HTTPException: 404 if the topology does not exist; 409 if its
            status is anything other than ``TopologyStatus.PENDING``.
    """
    topo = await get_topology_or_404(topology_id)
    current_status = topo["status"]
    if current_status != TopologyStatus.PENDING:
        message = (
            f"Topology is {current_status!r}; free-form child edits are "
            f"pending-only. Use the mutation queue for active topologies."
        )
        raise HTTPException(status_code=409, detail=message)
    return topo
def map_repo_exception(exc: Exception) -> HTTPException:
    """Build (not raise) the HTTPException matching a repo-layer error.

    Mapping, checked in this order:
        TopologyNotEditable -> 409, detail is ``str(exc)``
        VersionConflict     -> 409, detail names expected/current version
        ValueError          -> 400, detail is ``str(exc)``
        anything else       -> 500, generic "Internal error"
    """
    if isinstance(exc, TopologyNotEditable):
        code, detail = 409, str(exc)
    elif isinstance(exc, VersionConflict):
        code = 409
        detail = f"Version conflict: expected {exc.expected}, current {exc.current}"
    elif isinstance(exc, ValueError):
        code, detail = 400, str(exc)
    else:
        code, detail = 500, "Internal error"
    return HTTPException(status_code=code, detail=detail)

View File

@@ -0,0 +1,66 @@
"""Shared validation for the ``mode`` / ``target_host_uuid`` pair.
Called by the two topology-create endpoints
(``api_create_topology``, ``api_create_blank_topology``). Kept as a
tiny module so the rules stay in one place when Step 6 grows the list
(e.g. when we start rejecting hosts that already own a topology).
"""
from __future__ import annotations
from typing import Any, Optional
from fastapi import HTTPException
# Hosts we're willing to route a new topology to. ``enrolled`` is fine
# because the agent process has certs and will answer mTLS calls as
# soon as it's up; ``active`` means we've seen a heartbeat recently.
# Any other status makes validate_target_host() reject the host with a 400.
_ROUTABLE_HOST_STATUSES: set[str] = {"enrolled", "active"}
async def validate_target_host(
    repo: Any,
    mode: str,
    target_host_uuid: Optional[str],
) -> None:
    """Check that ``mode`` and ``target_host_uuid`` form a valid pair.

    Returns ``None`` on success. Every rejection is an
    ``HTTPException(400)``:

    - ``mode == "unihost"`` and a ``target_host_uuid`` was supplied.
    - ``mode == "agent"`` and ``target_host_uuid`` is missing or empty.
    - ``mode == "agent"`` and the repo knows no host with that uuid.
    - ``mode == "agent"`` and the host's status is not one of
      ``_ROUTABLE_HOST_STATUSES`` (e.g. ``unreachable`` /
      ``decommissioned``).
    - ``mode`` is neither value — not expected in practice, since the
      request models constrain it with a pydantic pattern.
    """
    if mode == "unihost":
        if target_host_uuid is None:
            return
        raise HTTPException(
            status_code=400,
            detail="target_host_uuid is only valid when mode='agent'",
        )

    if mode != "agent":
        # Shouldn't happen — the pydantic pattern should have rejected it.
        raise HTTPException(status_code=400, detail=f"unknown mode {mode!r}")

    if not target_host_uuid:
        raise HTTPException(
            status_code=400,
            detail="mode='agent' requires target_host_uuid",
        )

    host = await repo.get_swarm_host_by_uuid(target_host_uuid)
    if host is None:
        raise HTTPException(
            status_code=400,
            detail=f"unknown swarm host {target_host_uuid!r}",
        )

    host_status = host.get("status")
    if host_status in _ROUTABLE_HOST_STATUSES:
        return
    raise HTTPException(
        status_code=400,
        detail=(
            f"swarm host {target_host_uuid!r} is "
            f"{host_status!r}; expected one of "
            f"{sorted(_ROUTABLE_HOST_STATUSES)}"
        ),
    )

View File

@@ -0,0 +1,140 @@
"""Read-only catalog endpoints — services, next-subnet, next-ip.
These wrap fleet/allocator helpers so the phase-4 canvas UI can lean
on the server for allocation instead of shipping the logic client-side.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.archetypes import all_archetypes
from decnet.fleet import all_service_names
from decnet.telemetry import traced as _traced
from decnet.topology.allocator import (
AllocatorExhausted,
IPAllocator,
SubnetAllocator,
reserved_subnets,
)
from decnet.web.db.models import (
ArchetypeCatalogResponse,
ArchetypeEntry,
NextIPResponse,
NextSubnetResponse,
ServiceCatalogResponse,
)
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/services",
    tags=["MazeNET Topologies"],
    response_model=ServiceCatalogResponse,
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.catalog.services")
async def api_list_services(
    _viewer: dict = Depends(require_viewer),
) -> ServiceCatalogResponse:
    # Viewer-gated, read-only: wraps whatever all_service_names() reports in
    # the catalog response model.
    service_names = all_service_names()
    return ServiceCatalogResponse(services=service_names)
@router.get(
    "/archetypes",
    tags=["MazeNET Topologies"],
    response_model=ArchetypeCatalogResponse,
    responses={
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.catalog.archetypes")
async def api_list_archetypes(
    _viewer: dict = Depends(require_viewer),
) -> ArchetypeCatalogResponse:
    # Viewer-gated, read-only: projects every registered archetype onto the
    # API entry model. `services` and `preferred_distros` are copied into
    # fresh lists for the response.
    entries = []
    for archetype in all_archetypes().values():
        entries.append(
            ArchetypeEntry(
                slug=archetype.slug,
                display_name=archetype.display_name,
                description=archetype.description,
                services=list(archetype.services),
                preferred_distros=list(archetype.preferred_distros),
                nmap_os=archetype.nmap_os,
            )
        )
    return ArchetypeCatalogResponse(archetypes=entries)
@router.get(
    "/next-subnet",
    tags=["MazeNET Topologies"],
    response_model=NextSubnetResponse,
    responses={
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "Allocator exhausted"},
    },
)
@_traced("api.topology.catalog.next_subnet")
async def api_next_subnet(
    base: str = Query(
        default="172.16.0.0/12",
        pattern=r"^\d{1,3}\.\d{1,3}(\.\d{1,3}\.\d{1,3}/\d{1,2})?$",
    ),
    _viewer: dict = Depends(require_viewer),
) -> NextSubnetResponse:
    # Suggests the next free subnet under `base`, skipping every subnet that
    # reserved_subnets(repo) reports. `base` must be either a two-octet
    # prefix ("a.b") or a full CIDR ("a.b.c.d/n"), per the query pattern.
    # An exhausted allocator is reported as 409.
    allocator = SubnetAllocator(
        base_prefix=base,
        reserved=await reserved_subnets(repo),
    )
    try:
        free_subnet = allocator.next_free()
    except AllocatorExhausted as exc:
        raise HTTPException(status_code=409, detail=str(exc))
    return NextSubnetResponse(subnet=free_subnet)
@router.get(
    "/{topology_id}/lans/{lan_id}/next-ip",
    tags=["MazeNET Topologies"],
    response_model=NextIPResponse,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or LAN not found"},
        409: {"description": "Allocator exhausted"},
    },
)
@_traced("api.topology.catalog.next_ip")
async def api_next_ip(
    topology_id: str,
    lan_id: str,
    _viewer: dict = Depends(require_viewer),
) -> NextIPResponse:
    # Suggests the next free IP inside one LAN of a topology.
    #
    # Flow: 404 unless both the topology and the LAN exist; seed an
    # IPAllocator for the LAN's subnet with every address the topology's
    # deckies already hold on that LAN (looked up by LAN *name* in each
    # decky's `decky_config["ips_by_lan"]`); return the allocator's next
    # free address, or 409 when the subnet is exhausted.
    if await repo.get_topology(topology_id) is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    lans = await repo.list_lans_for_topology(topology_id)
    lan = next((ln for ln in lans if ln["id"] == lan_id), None)
    if lan is None:
        raise HTTPException(status_code=404, detail="LAN not found")

    deckies = await repo.list_topology_deckies(topology_id)
    alloc = IPAllocator(subnet=lan["subnet"])
    for decky in deckies:
        # Both `decky_config` and `ips_by_lan` may be absent *or* explicitly
        # null; `.get(key, {})` only covers the absent case, so fall back
        # with `or {}` to avoid an AttributeError (HTTP 500) on null.
        ips_by_lan = (decky.get("decky_config") or {}).get("ips_by_lan") or {}
        taken = ips_by_lan.get(lan["name"])
        if not taken:
            continue
        try:
            alloc.reserve(taken)
        except ValueError:
            # Best-effort: an address the allocator rejects is skipped
            # rather than failing the whole suggestion.
            continue

    try:
        ip = alloc.next_free()
    except AllocatorExhausted as e:
        raise HTTPException(status_code=409, detail=str(e))
    return NextIPResponse(subnet=lan["subnet"], ip=ip)

View File

@@ -0,0 +1,123 @@
"""POST /topologies/blank — create an empty editable topology.
Produces a minimal ``pending`` topology seeded with exactly one DMZ LAN
and its mandatory host-gateway decky. Intended for the MazeNET editor
landing flow: unlike ``POST /topologies`` (which runs the generator),
this endpoint takes no generator parameters and skips the planner
entirely. The DMZ+gateway invariant is enforced server-side so the
editor never has to special-case a "no DMZ yet" state.
"""
from __future__ import annotations
import json
from fastapi import APIRouter, Depends, HTTPException, status
from pydantic import BaseModel, Field as PydanticField
from decnet.telemetry import traced as _traced
from decnet.topology.allocator import SubnetAllocator, reserved_subnets
from decnet.web.db.models import TopologySummary
from decnet.web.dependencies import repo, require_admin
from decnet.web.router.topology._target_host import validate_target_host
router = APIRouter()
class BlankTopologyRequest(BaseModel):
    """Body for POST /topologies/blank — name plus optional agent pinning."""
    # Topology name, 1–64 characters.
    name: str = PydanticField(..., min_length=1, max_length=64)
    # Either "unihost" (default) or "agent"; anything else fails validation.
    mode: str = PydanticField(default="unihost", pattern=r"^(unihost|agent)$")
    # Swarm host to pin to. The handler passes it to validate_target_host()
    # together with `mode` before any write happens.
    target_host_uuid: str | None = PydanticField(default=None)
@router.post(
    "/blank",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or invalid topology name"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "Name collision or subnet pool exhausted"},
    },
)
@_traced("api.topology.create_blank")
async def api_create_blank_topology(
    body: BlankTopologyRequest,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    # Creates a `pending` topology seeded with one DMZ LAN, one gateway
    # decky, and the membership edge joining them, then returns its summary.
    #
    # Failure mapping: invalid mode/host pairing -> 400 (from
    # validate_target_host); subnet allocation RuntimeError -> 409; any
    # exception from create_topology -> 409.

    # 0. Validate mode/host pairing before any writes.
    await validate_target_host(repo, body.mode, body.target_host_uuid)

    # 1. Pick the DMZ subnet *before* the first write. Allocation can fail
    #    (pool exhausted -> 409); doing it up front means that failure no
    #    longer leaves an orphan topology row with no LAN behind.
    try:
        allocator = SubnetAllocator(
            "10.0", reserved=await reserved_subnets(repo)
        )
        subnet = allocator.next_free()
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    # 2. Topology row
    # NOTE(review): every exception is reported as 409 here, not only a
    # duplicate name — consider narrowing once the repo's duplicate-name
    # exception type is confirmed.
    try:
        topology_id = await repo.create_topology(
            {
                "name": body.name,
                "mode": body.mode,
                "target_host_uuid": body.target_host_uuid,
                "status": "pending",
                "config_snapshot": json.dumps({"blank": True}),
            }
        )
    except Exception as exc:  # noqa: BLE001 — surface duplicate-name as 409
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    # 3. DMZ LAN on the subnet allocated in step 1
    lan_id = await repo.add_lan(
        {
            "topology_id": topology_id,
            "name": "dmz",
            "subnet": subnet,
            "is_dmz": True,
            "x": 40,
            "y": 40,
        }
    )

    # 4. DMZ-gateway decky — a normal multi-homed bridge decky.
    #    `forwards_l3=True` turns on net.ipv4.ip_forward + NET_ADMIN at
    #    compose time (see decnet/topology/compose.py). No host-mode,
    #    no MACVLAN — the gateway reaches the outside world via Docker
    #    port publishing (see composer port emission).
    decky_uuid = await repo.add_topology_decky(
        {
            "topology_id": topology_id,
            "name": "dmz-gateway",
            "services": ["ssh"],
            "decky_config": {
                "archetype": "deaddeck",
                "forwards_l3": True,
            },
            "state": "pending",
            "x": 20,
            "y": 60,
        }
    )

    # 5. Membership edge on the DMZ — is_bridge=True marks this decky
    #    as the topology's bridge gateway; forwards_l3 mirrors the decky
    #    config so the generator/compose paths stay consistent.
    await repo.add_topology_edge(
        {
            "topology_id": topology_id,
            "decky_uuid": decky_uuid,
            "lan_id": lan_id,
            "is_bridge": True,
            "forwards_l3": True,
        }
    )

    row = await repo.get_topology(topology_id)
    if row is None:  # pragma: no cover — create then vanish
        raise HTTPException(status_code=500, detail="topology insert vanished")
    return TopologySummary(**row)

View File

@@ -0,0 +1,77 @@
"""POST /topologies — generate and persist a new MazeNET topology."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, status
from sqlalchemy.exc import IntegrityError
from decnet.telemetry import traced as _traced
from decnet.topology.allocator import reserved_subnets
from decnet.topology.config import TopologyConfig
from decnet.topology.generator import generate
from decnet.topology.persistence import persist
from decnet.web.db.models import TopologyGenerateRequest, TopologySummary
from decnet.web.dependencies import repo, require_admin
from decnet.web.router.topology._target_host import validate_target_host
router = APIRouter()
@router.post(
    "/",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed or invalid generation parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "Duplicate topology name, or generator could not allocate subnets (exhausted pool)"},
    },
)
@_traced("api.topology.create")
async def api_create_topology(
    body: TopologyGenerateRequest,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    # Generates a topology plan from the request parameters, persists it,
    # and returns the stored summary.
    #
    # Failure mapping:
    #   invalid mode/host pairing (validate_target_host)  -> 400
    #   ValueError/TypeError building config or plan      -> 400
    #   RuntimeError from the generator                   -> 409
    #   IntegrityError on the topology name               -> 409
    #   any other IntegrityError                          -> re-raised
    await validate_target_host(repo, body.mode, body.target_host_uuid)

    try:
        config = TopologyConfig(
            name=body.name,
            mode=body.mode,
            depth=body.depth,
            branching_factor=body.branching_factor,
            deckies_per_lan_min=body.deckies_per_lan_min,
            deckies_per_lan_max=body.deckies_per_lan_max,
            bridge_forward_probability=body.bridge_forward_probability,
            cross_edge_probability=body.cross_edge_probability,
            services_explicit=body.services_explicit,
            randomize_services=body.randomize_services,
            seed=body.seed,
        )
    except (ValueError, TypeError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        plan = generate(config, reserved_subnets=await reserved_subnets(repo))
    except RuntimeError as exc:
        # Subnet allocator exhaustion or similar planner-level failure.
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except (ValueError, TypeError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        topology_id = await persist(repo, plan, target_host_uuid=body.target_host_uuid)
    except IntegrityError as exc:
        # Unique constraint on topologies.name is the only integrity
        # error the create path can realistically hit — inspecting the
        # constraint name keeps us from silently mapping unrelated
        # integrity failures to 409.
        msg = str(exc.orig) if exc.orig is not None else str(exc)
        if "ix_topologies_name" in msg or "topologies.name" in msg:
            raise HTTPException(
                status_code=409,
                detail=f"A topology named {body.name!r} already exists.",
            ) from exc
        raise

    row = await repo.get_topology(topology_id)
    # Same guard as the blank-create endpoint: without it a row deleted
    # between persist() and this read surfaces as an opaque TypeError from
    # `TopologySummary(**None)` instead of an explicit 500.
    if row is None:  # pragma: no cover — create then vanish
        raise HTTPException(status_code=500, detail="topology insert vanished")
    return TopologySummary(**row)

View File

@@ -0,0 +1,136 @@
"""Decky CRUD endpoints — pending-only child mutations.
POST /topologies/{id}/deckies
PATCH /topologies/{id}/deckies/{uuid}
DELETE /topologies/{id}/deckies/{uuid}
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Response, status
from decnet.telemetry import traced as _traced
from decnet.topology.status import (
TopologyNotEditable,
VersionConflict,
)
from decnet.web.db.models import DeckyCreateRequest, DeckyRow, DeckyUpdateRequest
from decnet.web.dependencies import repo, require_admin
from ._guards import assert_pending_or_409, map_repo_exception
router = APIRouter()
@router.post(
    "/{topology_id}/deckies",
    tags=["MazeNET Topologies"],
    response_model=DeckyRow,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or invalid decky fields"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.decky.create")
async def api_create_decky(
    topology_id: str,
    body: DeckyCreateRequest,
    _admin: dict = Depends(require_admin),
) -> DeckyRow:
    # Adds a decky to a pending topology and returns the stored row.
    # The topology must exist (404) and be pending (409). Repo-layer
    # TopologyNotEditable / VersionConflict / ValueError are translated by
    # map_repo_exception().
    await assert_pending_or_409(topology_id)

    new_decky = dict(
        topology_id=topology_id,
        name=body.name,
        services=body.services,
        decky_config=body.decky_config,
        x=body.x,
        y=body.y,
    )
    try:
        created_uuid = await repo.add_topology_decky(
            new_decky, expected_version=body.expected_version
        )
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    # Re-read so the response reflects what was actually stored.
    for candidate in await repo.list_topology_deckies(topology_id):
        if candidate["uuid"] == created_uuid:
            return DeckyRow(**candidate)
    raise HTTPException(status_code=500, detail="Decky insert vanished")  # pragma: no cover
@router.patch(
    "/{topology_id}/deckies/{decky_uuid}",
    tags=["MazeNET Topologies"],
    response_model=DeckyRow,
    responses={
        400: {"description": "Malformed body"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or decky not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.decky.update")
async def api_update_decky(
    topology_id: str,
    decky_uuid: str,
    body: DeckyUpdateRequest,
    _admin: dict = Depends(require_admin),
) -> DeckyRow:
    # Applies a partial update to one decky of a pending topology and
    # returns the updated row.
    #
    # Errors: topology missing -> 404; topology not pending -> 409; decky
    # not part of this topology -> 404; repo TopologyNotEditable /
    # VersionConflict -> 409 via map_repo_exception(); repo ValueError -> 404.
    await assert_pending_or_409(topology_id)

    # The repo update is keyed by decky uuid alone, so confirm the decky
    # belongs to *this* topology before mutating anything. Otherwise a
    # request routed through the wrong topology id could modify a foreign
    # decky and only then answer 404. Mirrors the check in the DELETE route.
    existing = await repo.list_topology_deckies(topology_id)
    if not any(r["uuid"] == decky_uuid for r in existing):
        raise HTTPException(status_code=404, detail="Decky not found")

    # Only fields the client actually sent; expected_version is the
    # optimistic-concurrency token, not a column to write.
    fields = body.model_dump(exclude_unset=True, exclude={"expected_version"})
    try:
        await repo.update_topology_decky(
            decky_uuid,
            fields,
            expected_version=body.expected_version,
            enforce_pending=True,
        )
    except (TopologyNotEditable, VersionConflict) as exc:
        raise map_repo_exception(exc) from exc
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    rows = await repo.list_topology_deckies(topology_id)
    row = next((r for r in rows if r["uuid"] == decky_uuid), None)
    if row is None:
        raise HTTPException(status_code=404, detail="Decky not found")
    return DeckyRow(**row)
@router.delete(
    "/{topology_id}/deckies/{decky_uuid}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Malformed path"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or decky not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.decky.delete")
async def api_delete_decky(
    topology_id: str,
    decky_uuid: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    # Removes one decky from a pending topology; answers 204 with no body.
    # The decky must be listed under this topology, otherwise 404. Repo
    # errors are translated by map_repo_exception().
    await assert_pending_or_409(topology_id)

    member_uuids = [d["uuid"] for d in await repo.list_topology_deckies(topology_id)]
    if decky_uuid not in member_uuids:
        raise HTTPException(status_code=404, detail="Decky not found")

    try:
        await repo.delete_topology_decky(decky_uuid)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc
    return Response(status_code=status.HTTP_204_NO_CONTENT)

View File

@@ -0,0 +1,51 @@
"""DELETE /topologies/{id} — cascade-delete a pending or torn-down topology."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Response, status
from decnet.telemetry import traced as _traced
from decnet.topology.status import TopologyStatus
from decnet.web.dependencies import repo, require_admin
router = APIRouter()
# Only allow delete when containers are guaranteed not to be running.
# ACTIVE / DEPLOYING / DEGRADED / TEARING_DOWN must teardown first.
# The delete handler answers 409 for any status outside this set.
_DELETABLE: frozenset[str] = frozenset(
    {TopologyStatus.PENDING, TopologyStatus.TORN_DOWN, TopologyStatus.FAILED}
)
@router.delete(
    "/{topology_id}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology has running resources; teardown first"},
    },
)
@_traced("api.topology.delete")
async def api_delete_topology(
    topology_id: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    # Cascade-deletes a topology whose status is in _DELETABLE.
    # 404 when the topology is unknown, 409 when its status is not
    # deletable, 204 (empty body) on success.
    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    current_status = topo["status"]
    if current_status not in _DELETABLE:
        refusal = (
            f"Topology is {current_status!r}; teardown to 'torn_down' "
            f"before delete."
        )
        raise HTTPException(status_code=409, detail=refusal)

    if await repo.delete_topology_cascade(topology_id):
        return Response(status_code=status.HTTP_204_NO_CONTENT)
    # Race: row vanished between the status check and the cascade.
    raise HTTPException(status_code=404, detail="Topology not found")

View File

@@ -0,0 +1,76 @@
"""POST /topologies/{id}/deploy — transition pending → deploying and fire
the background deploy.
The actual Docker work happens in a BackgroundTask so the HTTP caller
returns quickly with ``202 Accepted``. Status transitions
(``deploying`` → ``active`` | ``failed``) are written by
:func:`decnet.engine.deployer.deploy_topology` itself.
"""
from __future__ import annotations
import asyncio
import logging
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
from decnet.engine.deployer import deploy_topology
from decnet.telemetry import traced as _traced
from decnet.topology.status import TopologyStatus
from decnet.web.db.models import TopologySummary
from decnet.web.dependencies import repo, require_admin
log = logging.getLogger(__name__)
router = APIRouter()
async def _run_deploy(topology_id: str) -> None:
    """Run ``deploy_topology`` as a BackgroundTask body.

    Any ordinary exception is logged and swallowed so it does not escape
    into the task runner; cancellation is re-raised untouched. Recording
    the failed status is left to :func:`deploy_topology` and its own
    exception handler.
    """
    try:
        await deploy_topology(repo, topology_id)
    except asyncio.CancelledError:  # pragma: no cover — shutdown
        raise
    except Exception as err:  # noqa: BLE001
        # Imported lazily: the formatter is only needed on the failure path.
        from decnet.engine.deployer import _format_subprocess_error

        reason = _format_subprocess_error(err)
        log.error("background deploy of %s failed: %s", topology_id, reason)
@router.post(
    "/{topology_id}/deploy",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_202_ACCEPTED,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology is not in 'pending' status"},
    },
)
@_traced("api.topology.deploy")
async def api_deploy_topology(
    topology_id: str,
    background: BackgroundTasks,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    # Schedules a background deploy for a pending topology and answers 202
    # with the topology summary as it was read *before* the deploy starts.
    #
    # NOTE(review): this handler does not itself write a 'deploying' status;
    # presumably deploy_topology() performs that transition. Until it does,
    # a second request could also pass the pending check — confirm whether
    # deploy_topology() guards against a double deploy.
    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    current_status = topo["status"]
    if current_status != TopologyStatus.PENDING:
        refusal = (
            f"Topology is {current_status!r}; only 'pending' topologies "
            f"can be deployed."
        )
        raise HTTPException(status_code=409, detail=refusal)

    background.add_task(_run_deploy, topology_id)
    return TopologySummary(**topo)

View File

@@ -0,0 +1,110 @@
"""Edge CRUD endpoints — pending-only child mutations.
POST /topologies/{id}/edges
DELETE /topologies/{id}/edges/{edge_id}
Edges are the decky↔LAN membership table (bipartite). Creating an
edge attaches a decky to an additional LAN; deleting one detaches.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Response, status
from decnet.telemetry import traced as _traced
from decnet.topology.status import (
TopologyNotEditable,
VersionConflict,
)
from decnet.web.db.models import EdgeCreateRequest, EdgeRow
from decnet.web.dependencies import repo, require_admin
from ._guards import assert_pending_or_409, map_repo_exception
router = APIRouter()
@router.post(
    "/{topology_id}/edges",
    tags=["MazeNET Topologies"],
    response_model=EdgeRow,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or unknown decky/LAN"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.edge.create")
async def api_create_edge(
    topology_id: str,
    body: EdgeCreateRequest,
    _admin: dict = Depends(require_admin),
) -> EdgeRow:
    # Attaches a decky to a LAN inside a pending topology and returns the
    # stored edge row. Both endpoints of the edge must already belong to
    # this topology (400 otherwise). Repo errors are translated by
    # map_repo_exception().
    await assert_pending_or_409(topology_id)

    # Referential integrity: decky first, then LAN.
    decky_uuids = [d["uuid"] for d in await repo.list_topology_deckies(topology_id)]
    if body.decky_uuid not in decky_uuids:
        raise HTTPException(
            status_code=400,
            detail=f"decky {body.decky_uuid!r} not in topology {topology_id!r}",
        )
    lan_ids = [ln["id"] for ln in await repo.list_lans_for_topology(topology_id)]
    if body.lan_id not in lan_ids:
        raise HTTPException(
            status_code=400,
            detail=f"lan {body.lan_id!r} not in topology {topology_id!r}",
        )

    new_edge = dict(
        topology_id=topology_id,
        decky_uuid=body.decky_uuid,
        lan_id=body.lan_id,
        is_bridge=body.is_bridge,
        forwards_l3=body.forwards_l3,
    )
    try:
        created_id = await repo.add_topology_edge(
            new_edge, expected_version=body.expected_version
        )
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    # Re-read so the response reflects what was actually stored.
    for candidate in await repo.list_topology_edges(topology_id):
        if candidate["id"] == created_id:
            return EdgeRow(**candidate)
    raise HTTPException(status_code=500, detail="Edge insert vanished")  # pragma: no cover
@router.delete(
    "/{topology_id}/edges/{edge_id}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Malformed path"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or edge not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.edge.delete")
async def api_delete_edge(
    topology_id: str,
    edge_id: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    # Detaches a decky from a LAN by removing the membership edge; answers
    # 204 with no body. The edge must be listed under this topology,
    # otherwise 404. Repo errors are translated by map_repo_exception().
    await assert_pending_or_409(topology_id)

    edge_ids = [e["id"] for e in await repo.list_topology_edges(topology_id)]
    if edge_id not in edge_ids:
        raise HTTPException(status_code=404, detail="Edge not found")

    try:
        await repo.delete_topology_edge(edge_id)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc
    return Response(status_code=status.HTTP_204_NO_CONTENT)

View File

@@ -0,0 +1,157 @@
"""SSE stream of topology lifecycle events — one connection per editor.
Subscribes to ``topology.<id>.>`` on the :class:`~decnet.bus.base.BaseBus`
for the duration of the request and forwards each matching bus event as
a Server-Sent Event to the browser. Emits a one-shot snapshot on connect
(current status + any in-flight mutations) so the client doesn't need a
separate fetch to initialise the "pending" buffer.
Authorization matches :mod:`decnet.web.router.stream.api_stream_events`
— a JWT passed via the ``?token=`` query parameter (EventSource can't
set arbitrary headers) + ``require_stream_viewer`` role gate. The
per-topology 404 is enforced after auth so existence probes can't leak
a topology id to an unauthenticated caller.
"""
from __future__ import annotations
import asyncio
from typing import AsyncGenerator
import orjson
from fastapi import APIRouter, Depends, Request
from fastapi.responses import StreamingResponse
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_stream_viewer
from decnet.web.sse_limits import sse_connection_slot
from ._guards import get_topology_or_404
log = get_logger("api.topology.events")
router = APIRouter()
# Seconds the stream may sit idle before a ": keepalive" comment frame is sent.
_KEEPALIVE_SECS = 15.0
# Mutation states gathered into the connect-time snapshot's "in_flight" list.
_IN_FLIGHT_STATES = ("pending", "applying")
def _format_sse(event_name: str, data: dict) -> str:
    """Render one Server-Sent Events frame.

    The frame has an ``event:`` line carrying *event_name*, a ``data:``
    line carrying *data* serialized with orjson, and a terminating blank
    line.
    """
    encoded = orjson.dumps(data).decode()
    return f"event: {event_name}\ndata: {encoded}\n\n"
@router.get(
    "/{topology_id}/events",
    tags=["MazeNET Topologies"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of mutation and status events for one topology",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.topology.events")
async def api_topology_events(
    topology_id: str,
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    # Streams lifecycle events for one topology as Server-Sent Events.
    #
    # Event types emitted: snapshot, status, mutation.{enqueued,
    # applying,applied,failed}. All wrap bus events whose payload is
    # also reachable via viewer-gated REST (GET /topologies/{id},
    # GET /topologies/{id}/mutations). Adding a new event family here
    # requires a threat-model review for F6/I (role leakage).

    # Snapshot inputs are read up front, before the StreamingResponse is
    # built, so a missing topology surfaces as an ordinary 404 rather than
    # an error in the middle of an already-started stream.
    topo = await get_topology_or_404(topology_id)
    snapshot_status = topo["status"]
    in_flight: list[dict] = []
    for state in _IN_FLIGHT_STATES:
        in_flight.extend(await repo.list_topology_mutations(topology_id, state=state))

    async def generator() -> AsyncGenerator[str, None]:
        # NOTE(review): the connection slot is taken inside the generator,
        # i.e. once the body is being iterated — confirm how the documented
        # 429 is produced when the per-user cap is hit.
        async with sse_connection_slot(user["uuid"]):
            # Flush headers immediately so the browser's EventSource sees a
            # live connection before the first real event arrives.
            yield ": keepalive\n\n"
            # One-shot snapshot — pair the current topology status with any
            # mutations the mutator is still holding, so the client buffer
            # can render an accurate "already in flight" state.
            yield _format_sse("snapshot", {
                "topology_id": topology_id,
                "status": snapshot_status,
                "in_flight": in_flight,
            })
            bus = await get_app_bus()
            if bus is None:
                # Bus disabled (NullBus) or unreachable. The snapshot is
                # still useful; we idle on keepalives so the client stays
                # connected and will re-poll on its own timers.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        # Cancellation ends the idle loop quietly instead of
                        # propagating out of the generator.
                        break
                    yield ": keepalive\n\n"
                return
            # Trailing ">" subscribes to every topic under this topology's
            # prefix.
            sub = bus.subscribe(f"{_topics.TOPOLOGY}.{topology_id}.>")
            try:
                async with sub:
                    sub_iter = sub.__aiter__()
                    while True:
                        if await request.is_disconnected():
                            break
                        # Wait for the next bus event, but no longer than the
                        # keepalive interval, so an idle stream still emits
                        # frames and the disconnect check above is revisited.
                        next_task = asyncio.ensure_future(sub_iter.__anext__())
                        try:
                            event = await asyncio.wait_for(next_task, timeout=_KEEPALIVE_SECS)
                        except asyncio.TimeoutError:
                            # wait_for() already cancels the awaited task on
                            # timeout; the explicit cancel is a harmless
                            # belt-and-braces repeat.
                            next_task.cancel()
                            yield ": keepalive\n\n"
                            continue
                        except StopAsyncIteration:
                            # Subscription iterator is exhausted: end the stream.
                            break
                        # Map the bus event onto an SSE ``event:`` name that
                        # the frontend can switch on without parsing topics.
                        yield _format_sse(
                            _sse_name_for(event.topic),
                            {
                                "topic": event.topic,
                                "type": event.type,
                                "ts": event.ts,
                                "payload": event.payload,
                            },
                        )
            except asyncio.CancelledError:
                # Cancellation while streaming is absorbed; the generator
                # simply finishes.
                pass
            except Exception:
                # Anything else is logged with its traceback and reported to
                # the client as one final "error" frame.
                log.exception("topology events stream crashed topology_id=%s", topology_id)
                yield _format_sse("error", {"message": "Stream interrupted"})

    return StreamingResponse(
        generator(),
        media_type="text/event-stream",
        headers={
            # Ask intermediaries not to cache or buffer the event stream.
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
def _sse_name_for(topic: str) -> str:
"""Derive an SSE ``event:`` name from a bus topic.
``topology.<id>.mutation.applied`` → ``mutation.applied``
``topology.<id>.status`` → ``status``
Anything else is passed through unchanged so future topic families
don't silently collapse onto a generic bucket.
"""
parts = topic.split(".", 2)
return parts[2] if len(parts) >= 3 else topic

View File

@@ -0,0 +1,68 @@
"""GET /topologies/{id} and /topologies/{id}/status-events."""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.topology.persistence import hydrate
from decnet.web.db.models import (
DeckyRow,
EdgeRow,
LANRow,
TopologyDetail,
TopologyStatusEventRow,
TopologySummary,
)
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/{topology_id}",
    tags=["MazeNET Topologies"],
    response_model=TopologyDetail,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.get")
async def api_get_topology(
    topology_id: str,
    _viewer: dict = Depends(require_viewer),
) -> TopologyDetail:
    """Return one topology together with its LANs, deckies and edges.

    Raises:
        HTTPException: 404 when ``hydrate`` returns ``None`` for
            ``topology_id``.
    """
    graph = await hydrate(repo, topology_id)
    if graph is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    # Wrap each hydrated row dict in its response model, in the same
    # order the response lists them.
    summary = TopologySummary(**graph["topology"])
    lan_rows = [LANRow(**lan) for lan in graph["lans"]]
    decky_rows = [DeckyRow(**decky) for decky in graph["deckies"]]
    edge_rows = [EdgeRow(**edge) for edge in graph["edges"]]
    return TopologyDetail(
        topology=summary,
        lans=lan_rows,
        deckies=decky_rows,
        edges=edge_rows,
    )
@router.get(
    "/{topology_id}/status-events",
    tags=["MazeNET Topologies"],
    response_model=list[TopologyStatusEventRow],
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.status_events")
async def api_get_status_events(
    topology_id: str,
    limit: int = Query(default=100, ge=1, le=1000),
    _viewer: dict = Depends(require_viewer),
) -> list[TopologyStatusEventRow]:
    """List up to ``limit`` status events recorded for one topology.

    Raises:
        HTTPException: 404 when the repo has no topology with this id.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    events = await repo.list_topology_status_events(topology_id, limit=limit)
    result: list[TopologyStatusEventRow] = []
    for event in events:
        result.append(TopologyStatusEventRow(**event))
    return result

View File

@@ -0,0 +1,152 @@
"""LAN CRUD endpoints — pending-only child mutations.
POST /topologies/{id}/lans
PATCH /topologies/{id}/lans/{lan_id}
DELETE /topologies/{id}/lans/{lan_id}
"""
from __future__ import annotations
from fastapi import APIRouter, Depends, HTTPException, Response, status
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.topology.allocator import reserved_subnets
from decnet.topology.status import (
TopologyNotEditable,
VersionConflict,
)
from decnet.web.db.models import LANCreateRequest, LANRow, LANUpdateRequest
from decnet.web.dependencies import repo, require_admin
from ._guards import assert_pending_or_409, map_repo_exception
log = get_logger("api.topology.lan")
router = APIRouter()
@router.post(
    "/{topology_id}/lans",
    tags=["MazeNET Topologies"],
    response_model=LANRow,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or invalid LAN fields"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.lan.create")
async def api_create_lan(
    topology_id: str,
    body: LANCreateRequest,
    _admin: dict = Depends(require_admin),
) -> LANRow:
    """Create a LAN inside a topology and return the stored row.

    When the request carries no subnet, one is taken from a
    ``SubnetAllocator`` seeded with the subnets the repo reports as
    reserved.

    Raises:
        HTTPException: whatever ``assert_pending_or_409`` raises; the
            mapping of ``TopologyNotEditable`` / ``VersionConflict`` /
            ``ValueError`` produced by ``map_repo_exception``; 500 if the
            freshly inserted row cannot be read back.
    """
    await assert_pending_or_409(topology_id)

    if body.subnet is not None:
        subnet = body.subnet
    else:
        # No subnet supplied: allocate the next free one under "10.0",
        # skipping everything already reserved.
        from decnet.topology.allocator import SubnetAllocator

        taken = await reserved_subnets(repo)
        subnet = SubnetAllocator("10.0", reserved=taken).next_free()

    new_lan = {
        "topology_id": topology_id,
        "name": body.name,
        "subnet": subnet,
        "is_dmz": body.is_dmz,
        "x": body.x,
        "y": body.y,
    }
    try:
        lan_id = await repo.add_lan(new_lan, expected_version=body.expected_version)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    # Read the row back through the topology's LAN listing so the
    # response reflects what was actually stored.
    for candidate in await repo.list_lans_for_topology(topology_id):
        if candidate["id"] == lan_id:
            return LANRow(**candidate)
    # pragma: no cover — would mean insert vanished
    raise HTTPException(status_code=500, detail="LAN insert vanished")
@router.patch(
    "/{topology_id}/lans/{lan_id}",
    tags=["MazeNET Topologies"],
    response_model=LANRow,
    responses={
        400: {"description": "Malformed body or invalid LAN fields"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or LAN not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.lan.update")
async def api_update_lan(
    topology_id: str,
    lan_id: str,
    body: LANUpdateRequest,
    _admin: dict = Depends(require_admin),
) -> LANRow:
    """Apply a partial update to one LAN and return the stored row.

    Only fields explicitly set in the request body are forwarded to the
    repo; ``expected_version`` is passed separately for the version check.

    Raises:
        HTTPException: whatever ``assert_pending_or_409`` raises; 404 when
            ``lan_id`` is not one of this topology's LANs or the repo
            raises ``ValueError``; the mapping of ``TopologyNotEditable`` /
            ``VersionConflict`` produced by ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # Scope check BEFORE writing: ``repo.update_lan`` is keyed on lan_id
    # alone, so without this a request addressed to topology A could
    # modify a LAN that lives in topology B and only then be told 404.
    # Mirrors the membership check the DELETE handler performs.
    current = await repo.list_lans_for_topology(topology_id)
    if not any(r["id"] == lan_id for r in current):
        raise HTTPException(status_code=404, detail="LAN not found")

    fields = body.model_dump(exclude_unset=True, exclude={"expected_version"})
    try:
        await repo.update_lan(
            lan_id,
            fields,
            expected_version=body.expected_version,
            enforce_pending=True,
        )
    except (TopologyNotEditable, VersionConflict) as exc:
        raise map_repo_exception(exc) from exc
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    # Re-read so the response reflects the stored state after the update.
    rows = await repo.list_lans_for_topology(topology_id)
    row = next((r for r in rows if r["id"] == lan_id), None)
    if row is None:
        # The LAN disappeared between the update and the re-read.
        raise HTTPException(status_code=404, detail="LAN not found")
    return LANRow(**row)
@router.delete(
    "/{topology_id}/lans/{lan_id}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Cannot delete: LAN has orphan-risking deckies"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or LAN not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.lan.delete")
async def api_delete_lan(
    topology_id: str,
    lan_id: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    """Delete one LAN from a topology and answer with an empty 204.

    Raises:
        HTTPException: whatever ``assert_pending_or_409`` raises; 404 when
            ``lan_id`` is not among this topology's LANs; the mapping of
            ``TopologyNotEditable`` / ``VersionConflict`` / ``ValueError``
            produced by ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # The LAN must be listed under the topology named in the URL.
    lans = await repo.list_lans_for_topology(topology_id)
    if all(lan["id"] != lan_id for lan in lans):
        raise HTTPException(status_code=404, detail="LAN not found")

    try:
        await repo.delete_lan(lan_id)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc
    return Response(status_code=status.HTTP_204_NO_CONTENT)

View File

@@ -0,0 +1,39 @@
"""GET /topologies — paginated list of MazeNET topologies."""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.db.models import TopologyListResponse, TopologySummary
from decnet.web.dependencies import repo, require_viewer
router = APIRouter()
@router.get(
    "/",
    tags=["MazeNET Topologies"],
    response_model=TopologyListResponse,
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.list")
async def api_list_topologies(
    status: Optional[str] = Query(default=None, description="Filter by topology status"),
    limit: int = Query(default=50, ge=1, le=500),
    offset: int = Query(default=0, ge=0, le=2147483647),
    _viewer: dict = Depends(require_viewer),
) -> TopologyListResponse:
    """Return one page of topologies plus the total matching count.

    ``status`` is forwarded unchanged to both the count and the list
    query; ``limit`` and ``offset`` are echoed back in the response.
    """
    matching = await repo.count_topologies(status=status)
    page = await repo.list_topologies(status=status, limit=limit, offset=offset)
    summaries = [TopologySummary(**entry) for entry in page]
    return TopologyListResponse(
        total=matching,
        limit=limit,
        offset=offset,
        data=summaries,
    )

View File

@@ -0,0 +1,127 @@
"""Live-mutation queue endpoints — for active | degraded topologies.
POST /topologies/{id}/mutations enqueue one mutation op
GET /topologies/{id}/mutations list queued / applied / failed rows
The mutator worker claims pending rows via ``claim_next_mutation`` and
transitions them to ``applying`` → ``applied`` | ``failed``. The API
layer only stages rows and reports them back.
"""
from __future__ import annotations
from typing import Optional
from fastapi import APIRouter, Depends, HTTPException, Query, status
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.topology.status import (
TopologyStatus,
VersionConflict,
)
from decnet.web.db.models import (
MutationEnqueueRequest,
MutationEnqueueResponse,
MutationRow,
)
from decnet.web.dependencies import repo, require_admin, require_viewer
from ._guards import get_topology_or_404, map_repo_exception
_log = get_logger("api.topology.mutations")
router = APIRouter()
_MUTATABLE: frozenset[str] = frozenset(
{TopologyStatus.ACTIVE, TopologyStatus.DEGRADED}
)
@router.post(
    "/{topology_id}/mutations",
    tags=["MazeNET Topologies"],
    response_model=MutationEnqueueResponse,
    status_code=status.HTTP_202_ACCEPTED,
    responses={
        400: {"description": "Malformed body or unknown mutation op"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {
            "description": (
                "Topology is not active|degraded, or version conflict"
            )
        },
    },
)
@_traced("api.topology.mutation.enqueue")
async def api_enqueue_mutation(
    topology_id: str,
    body: MutationEnqueueRequest,
    _admin: dict = Depends(require_admin),
) -> MutationEnqueueResponse:
    """Stage one mutation for a topology and announce it on the bus.

    Raises:
        HTTPException: whatever ``get_topology_or_404`` raises; 409 when
            the topology's status is not in ``_MUTATABLE``; the mapping of
            ``VersionConflict`` produced by ``map_repo_exception``; 400
            when the repo raises ``ValueError``.
    """
    topo = await get_topology_or_404(topology_id)
    current_status = topo["status"]
    if current_status not in _MUTATABLE:
        refusal = (
            f"Topology is {current_status!r}; the mutation queue is "
            f"only open for 'active' or 'degraded' topologies. Use "
            f"child-CRUD endpoints while pending."
        )
        raise HTTPException(status_code=409, detail=refusal)

    try:
        mutation_id = await repo.enqueue_topology_mutation(
            topology_id,
            body.op,
            body.payload,
            expected_version=body.expected_version,
        )
    except VersionConflict as exc:
        raise map_repo_exception(exc) from exc
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Best-effort notification: the row is already stored, so a publish
    # failure is logged and otherwise ignored rather than failing the
    # request.
    bus = await get_app_bus()
    if bus is not None:
        try:
            topic = _topics.topology_mutation(topology_id, _topics.MUTATION_ENQUEUED)
            event = {"mutation_id": mutation_id, "op": body.op, "payload": body.payload}
            await bus.publish(topic, event, event_type=_topics.MUTATION_ENQUEUED)
        except Exception as exc:  # noqa: BLE001
            _log.warning("bus publish (enqueued) failed: %s", exc)

    return MutationEnqueueResponse(mutation_id=mutation_id, state="pending")
@router.get(
    "/{topology_id}/mutations",
    tags=["MazeNET Topologies"],
    response_model=list[MutationRow],
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.mutation.list")
async def api_list_mutations(
    topology_id: str,
    state: Optional[str] = Query(
        default=None,
        description="Filter by state: pending | applying | applied | failed",
    ),
    _viewer: dict = Depends(require_viewer),
) -> list[MutationRow]:
    """List a topology's mutation rows, optionally filtered by ``state``.

    Raises:
        HTTPException: whatever ``get_topology_or_404`` raises.
    """
    # Called only for its existence check; the returned row is unused.
    await get_topology_or_404(topology_id)

    stored = await repo.list_topology_mutations(topology_id, state=state)
    result: list[MutationRow] = []
    for entry in stored:
        result.append(MutationRow(**entry))
    return result

View File

@@ -0,0 +1,131 @@
"""GET/PUT ``/topologies/{id}/personas`` — per-topology email persona pool.
The global pool (``decnet/web/router/emailgen/api_personas.py``) drives
non-MazeNET fleet/SWARM-shard mail deckies. MazeNET topology mail
deckies use ``Topology.email_personas`` instead — one JSON-serialized
list per topology, parsed by the emailgen scheduler each tick.
This endpoint is the API surface behind the dashboard's per-topology
"Personas" editor. Reads accept admin or viewer; writes are admin-only.
Concurrency: last-write-wins. The list is operator-curated and small
(typically <20 entries); no need for optimistic versioning here.
"""
from __future__ import annotations
import json
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.realism.personas import EmailPersona, parse_personas
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import repo, require_admin, require_viewer
router = APIRouter()
log = get_logger("api.topology.personas")
def _serialize(personas: list[EmailPersona]) -> list[dict[str, Any]]:
return [p.model_dump(exclude_none=False) for p in personas]
@router.get(
    "/{topology_id}/personas",
    tags=["MazeNET Topologies"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.list_personas")
async def list_topology_personas(
    topology_id: str,
    _viewer: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return a topology's parsed persona list and its default language.

    ``language_default`` falls back to ``"en"`` when the topology row has
    no value for it; the same value is handed to ``parse_personas`` and
    echoed in the response so a client can show the fallback in use.

    Raises:
        HTTPException: 404 when the repo has no topology with this id.
    """
    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    fallback_language = topo.get("language_default") or "en"
    stored = topo.get("email_personas")
    personas = parse_personas(stored, language_default=fallback_language)

    response: dict[str, Any] = {
        "topology_id": topology_id,
        "topology_name": topo.get("name", ""),
        "language_default": fallback_language,
        "personas": _serialize(personas),
    }
    return response
@router.put(
    "/{topology_id}/personas",
    tags=["MazeNET Topologies"],
    responses={
        400: {"description": "Invalid persona payload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.replace_personas")
async def replace_topology_personas(
    topology_id: str,
    body: dict[str, Any],
    user: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Overwrite a topology's persona list with the one in the body.

    Expected body: ``{"personas": [<EmailPersona>, ...]}``.

    The submitted list is run through ``parse_personas`` and only what it
    returns is stored. A non-empty submission that parses to nothing is
    rejected with 400 instead of being stored as an empty list.

    Raises:
        HTTPException: 400 when ``personas`` is not a list or every entry
            failed validation; 404 when the topology does not exist or the
            repo reports that nothing was updated.
    """
    submitted = body.get("personas")
    if not isinstance(submitted, list):
        raise HTTPException(
            status_code=400, detail="body.personas must be a list",
        )

    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    fallback_language = topo.get("language_default") or "en"
    accepted = parse_personas(submitted, language_default=fallback_language)
    if submitted and not accepted:
        rejection = (
            "All persona entries failed validation. Required fields: "
            "name, email (user@host.tld), role, tone, mannerisms."
        )
        raise HTTPException(status_code=400, detail=rejection)

    serialized = _serialize(accepted)
    encoded = json.dumps(serialized, ensure_ascii=False)
    stored = await repo.set_topology_email_personas(topology_id, encoded)
    if not stored:
        # The row was there for the read above but the write touched
        # nothing — report it as missing.
        raise HTTPException(status_code=404, detail="Topology not found")

    actor = user.get("username", user.get("uuid"))
    log.info(
        "api.topology.replace_personas user=%s topology=%s wrote=%d",
        actor, topology_id, len(accepted),
    )
    return {
        "topology_id": topology_id,
        "topology_name": topo.get("name", ""),
        "language_default": fallback_language,
        "personas": serialized,
    }

View File

@@ -0,0 +1,48 @@
"""POST /topologies/reap-orphans — remove Docker resources for topology
ids the DB no longer knows about.
A topology row deleted outside the teardown flow (operator error,
crashed master, direct DB edit) leaves its containers and bridge
networks behind. The orphan networks keep their IPAM pools, so the
next deploy at the same subnet hits a 403 ``Pool overlaps`` from the
Docker daemon.
This endpoint walks the local Docker daemon, computes the set of
topology prefixes still known to the repo, and force-removes every
container + network whose prefix is orphaned. Resources belonging to
live topologies are never touched.
"""
from __future__ import annotations
from fastapi import APIRouter, Depends
from decnet.engine.reaper import reap_orphan_topology_resources
from decnet.telemetry import traced as _traced
from decnet.web.db.models import ReapReportResponse
from decnet.web.dependencies import repo, require_admin
router = APIRouter()
@router.post(
    "/reap-orphans",
    tags=["MazeNET Topologies"],
    response_model=ReapReportResponse,
    responses={
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.reap_orphans")
async def api_reap_orphans(
    _admin: dict = Depends(require_admin),
) -> dict:
    """Run the orphan reaper against the repo and return its report.

    The sweep itself is delegated to ``reap_orphan_topology_resources``;
    this handler only converts the resulting report to a dict, which is
    the form the declared response model is built from.
    """
    sweep_report = await reap_orphan_topology_resources(repo)
    as_dict = sweep_report.to_dict()
    return as_dict

View File

@@ -0,0 +1,79 @@
"""POST /topologies/{id}/teardown — transition an active/degraded/failed
topology to ``tearing_down`` and fire the background teardown.
Mirrors :mod:`api_deploy_topology`: the real Docker work runs in a
BackgroundTask, the caller returns ``202 Accepted``, and
:func:`decnet.engine.deployer.teardown_topology` writes the terminal
``torn_down`` status when it finishes.
"""
from __future__ import annotations
import asyncio
import logging
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
from decnet.engine.deployer import teardown_topology
from decnet.telemetry import traced as _traced
from decnet.topology.status import TopologyStatus
from decnet.web.db.models import TopologySummary
from decnet.web.dependencies import repo, require_admin
log = logging.getLogger(__name__)
router = APIRouter()
# Statuses that can legally transition to TEARING_DOWN (see
# decnet.topology.status._LEGAL).
_TEARDOWNABLE: frozenset[str] = frozenset(
{
TopologyStatus.ACTIVE,
TopologyStatus.DEGRADED,
TopologyStatus.FAILED,
TopologyStatus.DEPLOYING,
}
)
async def _run_teardown(topology_id: str) -> None:
    """Background-task wrapper around ``teardown_topology``.

    Cancellation is propagated untouched; any other failure is logged and
    swallowed, since there is no request left to report it to.
    """
    try:
        await teardown_topology(repo, topology_id)
    except asyncio.CancelledError:  # pragma: no cover — shutdown
        raise
    except Exception as err:  # noqa: BLE001
        log.error("background teardown of %s failed: %s", topology_id, err)
@router.post(
    "/{topology_id}/teardown",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_202_ACCEPTED,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology cannot be torn down from its current status"},
    },
)
@_traced("api.topology.teardown")
async def api_teardown_topology(
    topology_id: str,
    background: BackgroundTasks,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    """Schedule a background teardown and return the topology summary.

    The summary is built from the row as it was read at the start of the
    request, i.e. before the background task has run.

    Raises:
        HTTPException: 404 when the topology does not exist; 409 when its
            status is not in ``_TEARDOWNABLE``.
    """
    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    current_status = topo["status"]
    if current_status in _TEARDOWNABLE:
        background.add_task(_run_teardown, topology_id)
        return TopologySummary(**topo)

    refusal = (
        f"Topology is {current_status!r}; cannot teardown "
        f"(allowed from: {sorted(_TEARDOWNABLE)})."
    )
    raise HTTPException(status_code=409, detail=refusal)

View File

@@ -0,0 +1,6 @@
from fastapi import APIRouter
from .api_get_transcript import router as transcript_router
transcripts_router = APIRouter()
transcripts_router.include_router(transcript_router)

View File

@@ -0,0 +1,243 @@
"""
Paged asciinema v2 transcript endpoint.
Transcripts are stored as one JSONL day-shard per (decky, UTC day) under
/var/lib/decnet/artifacts/{decky}/{service}/transcripts/sessions-YYYY-MM-DD.jsonl
Each line carries a ``sid`` tag; multiple concurrent sessions interleave into
the same shard (O_APPEND + sub-PIPE_BUF writes keep lines atomic — see
decnet/templates/_shared/sessrec/sessrec.c for the guarantee).
Rather than scanning the whole shard on every request, the first hit for a
given (shard path, mtime) builds an in-memory index of ``sid → [byte offsets]``
by one pass. Subsequent paged reads pread() exact line slices in O(limit).
Index is bounded by the disk-free precheck (< 200 MB free → no recording)
and the 10 MB per-session cap.
"""
from __future__ import annotations
import json
import os
import re
from collections import OrderedDict
from pathlib import Path
from typing import Any
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
router = APIRouter()
ARTIFACTS_ROOT = Path(os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"))
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
_SID_RE = re.compile(r"^[a-f0-9-]{36}$")
_SERVICE_RE = re.compile(r"^(ssh|telnet)$")
# Shard filename is built by sessrec from UTC date — keep the charset tight
# so a forged shard_path in the Log row can't traverse.
_SHARD_BASENAME_RE = re.compile(r"^sessions-\d{4}-\d{2}-\d{2}\.jsonl$")
# (path, mtime_ns) → {sid: [(offset, length), ...]}
_INDEX_CACHE: "OrderedDict[tuple[str, int], dict[str, list[tuple[int, int]]]]" = OrderedDict()
_CACHE_MAX = 32
def _get_index(path: Path) -> tuple[dict[str, list[tuple[int, int]]], int]:
st = path.stat()
key = (str(path), st.st_mtime_ns)
if key in _INDEX_CACHE:
_INDEX_CACHE.move_to_end(key)
return _INDEX_CACHE[key], st.st_size
index: dict[str, list[tuple[int, int]]] = {}
with path.open("rb") as f:
offset = 0
for line in f:
length = len(line)
# Fast sid extract: look for `"sid":"<36 chars>"` prefix — every
# sessrec line starts with that field (see emit_*).
try:
m = re.search(rb'"sid"\s*:\s*"([a-f0-9-]{36})"', line)
except re.error:
m = None
if m:
sid = m.group(1).decode("ascii")
index.setdefault(sid, []).append((offset, length))
offset += length
_INDEX_CACHE[key] = index
_INDEX_CACHE.move_to_end(key)
while len(_INDEX_CACHE) > _CACHE_MAX:
_INDEX_CACHE.popitem(last=False)
return index, st.st_size
def _validate_names(decky: str, service: str) -> None:
    """Reject decky or service names that do not match their patterns.

    The decky name is checked first, then the service.

    Raises:
        HTTPException: 400 naming the first value that failed.
    """
    checks = (
        (_DECKY_RE, decky, "invalid decky name"),
        (_SERVICE_RE, service, "invalid service"),
    )
    for pattern, value, message in checks:
        if pattern.fullmatch(value) is None:
            raise HTTPException(status_code=400, detail=message)
def _resolve_shard(decky: str, service: str, shard_name: str) -> Path:
    """Build the resolved path of a named shard under the artifacts root.

    All three components are pattern-checked before any path is
    constructed, and the resolved result must still sit at or beneath the
    resolved artifacts root.

    Raises:
        HTTPException: 400 for an invalid decky, service or shard name, or
            when the resolved path falls outside the artifacts root.
    """
    _validate_names(decky, service)
    if _SHARD_BASENAME_RE.fullmatch(shard_name) is None:
        raise HTTPException(status_code=400, detail="invalid shard name")

    root = ARTIFACTS_ROOT.resolve()
    candidate = root.joinpath(decky, service, "transcripts", shard_name).resolve()
    contained = candidate == root or root in candidate.parents
    if not contained:
        raise HTTPException(status_code=400, detail="path escapes artifacts root")
    return candidate
def _find_shard_with_sid(decky: str, service: str, sid: str) -> Path | None:
    """Locate the shard in a decky's transcripts dir that indexes ``sid``.

    Used when a session's log row names no usable shard. Every directory
    entry whose name matches the shard pattern is indexed via
    ``_get_index`` and checked in descending name order — the date-stamped
    names make that newest day first — and the first shard whose index
    contains ``sid`` is returned.

    A directory that is missing, resolves outside the artifacts root, or
    cannot be read yields ``None`` rather than an exception; a single
    unreadable shard is skipped.

    Raises:
        HTTPException: 400 from ``_validate_names`` for a bad decky or
            service name.
    """
    _validate_names(decky, service)

    root = ARTIFACTS_ROOT.resolve()
    transcripts_dir = root.joinpath(decky, service, "transcripts").resolve()
    if root not in transcripts_dir.parents:
        return None

    # Stat/list failures mean "no transcript", not a server error.
    try:
        if not transcripts_dir.is_dir():
            return None
        entries = list(transcripts_dir.iterdir())
    except OSError:
        return None

    shards = [entry for entry in entries if _SHARD_BASENAME_RE.fullmatch(entry.name)]
    shards.sort(reverse=True)  # newest day first

    for shard in shards:
        try:
            index, _ = _get_index(shard)
        except OSError:
            continue
        if sid in index:
            return shard
    return None
@router.get(
    "/transcripts/{decky}/{sid}",
    tags=["Transcripts"],
    responses={
        400: {"description": "Invalid decky or sid parameter"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Transcript not found"},
    },
)
@_traced("api.get_transcript")
async def get_transcript(
    decky: str,
    sid: str,
    offset: int = Query(0, ge=0, le=2147483647),
    limit: int = Query(500, ge=1, le=5000),
    admin: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Return one page of recorded events for session ``sid`` on ``decky``.

    The shard is located either from ``fields.shard_path`` on the
    session's log row (basename only, re-validated) or, failing that, by
    scanning the decky's transcripts directory. Only the lines indexed for
    this sid are read.

    Args:
        decky: Decky name from the URL.
        sid: Session id from the URL.
        offset: Zero-based index of the first event to return.
        limit: Maximum number of events to return.
        admin: Authenticated admin (unused beyond the access check).

    Returns:
        A dict with ``sid``, ``service``, ``header``, ``events`` (each
        ``[t, ch, d]``), ``offset``, ``limit``, ``total``, ``has_more``
        and ``truncated``.

    Raises:
        HTTPException: 400 for a malformed decky/sid/service; 404 when the
            session is unknown, belongs to another decky, or no readable
            shard contains it.
    """
    if not _DECKY_RE.fullmatch(decky):
        raise HTTPException(status_code=400, detail="invalid decky name")
    if not _SID_RE.fullmatch(sid):
        raise HTTPException(status_code=400, detail="invalid sid")

    log = await repo.get_session_log(sid)
    if not log:
        raise HTTPException(status_code=404, detail="session not found")

    try:
        fields = json.loads(log.get("fields") or "{}")
    except (ValueError, TypeError):
        fields = {}
    if not isinstance(fields, dict):
        # Valid JSON but not an object — treat like a missing field set
        # instead of failing on ``.get`` below.
        fields = {}

    service = fields.get("service") or log.get("service")
    shard_path_field = fields.get("shard_path")
    # Only the basename is ever used; anything that is not a string is
    # treated as "no shard named".
    shard_name = Path(shard_path_field).name if isinstance(shard_path_field, str) else ""

    log_decky = log.get("decky") or fields.get("decky")
    if log_decky and log_decky != decky:
        raise HTTPException(status_code=404, detail="session not found")

    # Fast path: the log row names a shard we can validate and open
    # directly. Otherwise (no shard named, or the named one is not a
    # readable file) fall back to scanning the transcripts directory.
    path: Path | None = None
    if _SHARD_BASENAME_RE.fullmatch(shard_name):
        candidate = _resolve_shard(decky, service or "", shard_name)
        try:
            if candidate.is_file():
                path = candidate
        except OSError:
            path = None
    if path is None:
        path = _find_shard_with_sid(decky, service or "", sid)
    if path is None:
        raise HTTPException(status_code=404, detail="transcript not found")

    header: dict[str, Any] = {}
    events: list[list[Any]] = []
    truncated = False
    total_events = 0
    window_end = offset + limit

    # An unreadable or vanished shard is reported as "no transcript",
    # matching how the directory scan treats the same condition, rather
    # than surfacing as a 500.
    try:
        index, _size = _get_index(path)
        lines_meta = index.get(sid)
        if not lines_meta:
            raise HTTPException(status_code=404, detail="sid not present in shard")

        # Single pass over this sid's lines: pick up the header and the
        # truncation marker, count every event, and keep only the events
        # whose ordinal falls inside the requested window.
        with path.open("rb") as f:
            for off, ln in lines_meta:
                f.seek(off)
                raw = f.read(ln)
                try:
                    obj = json.loads(raw)
                except ValueError:
                    continue
                if not isinstance(obj, dict):
                    continue
                if "hdr" in obj:
                    header = obj["hdr"]
                    continue
                if obj.get("trunc"):
                    truncated = True
                    continue
                if offset <= total_events < window_end:
                    t = obj.get("t")
                    ch = obj.get("ch")
                    d = obj.get("d")
                    if t is not None and ch is not None and d is not None:
                        events.append([t, ch, d])
                total_events += 1
    except OSError as exc:
        raise HTTPException(status_code=404, detail="transcript not found") from exc

    return {
        "sid": sid,
        "service": service,
        "header": header,
        "events": events,
        "offset": offset,
        "limit": limit,
        "total": total_events,
        "has_more": window_end < total_events,
        "truncated": truncated,
    }

View File

@@ -0,0 +1,18 @@
"""Webhook subscription CRUD.
Admin-gated management of external-egress webhook subscriptions. The
actual delivery happens in the `decnet webhook` worker, which watches
the DB + bus and POSTs matching events out. This module is the API
surface operators use to configure destinations.
Mounted under `/api/v1/webhooks` by the main api router.
"""
from fastapi import APIRouter
from .api_manage_webhooks import router as manage_webhooks_router
from .api_test_webhook import router as test_webhook_router
webhooks_router = APIRouter(prefix="/webhooks")
webhooks_router.include_router(manage_webhooks_router)
webhooks_router.include_router(test_webhook_router)

View File

@@ -0,0 +1,231 @@
"""Webhook subscription CRUD — admin-gated."""
from __future__ import annotations
import json
import secrets
from datetime import datetime, timezone
from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.bus import topics as _topics
from decnet.bus.app import get_app_bus
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.web.db.models import (
MessageResponse,
WebhookCreateRequest,
WebhookCreateResponse,
WebhookResponse,
WebhookUpdateRequest,
)
from decnet.web.db.models.webhooks import _row_to_response_dict
from decnet.web.dependencies import repo, require_admin
from decnet.webhook.enums import merge_patterns
log = get_logger("api.webhooks")
router = APIRouter()
async def _notify_subscriptions_changed() -> None:
    """Announce on the bus that the webhook subscription set changed.

    Publishes an empty payload on ``WEBHOOK_SUBSCRIPTIONS_CHANGED`` with
    event type ``"changed"``. Does nothing when no bus is available, and
    logs instead of raising if obtaining the bus or publishing fails, so a
    bus problem never fails the calling CRUD request.
    """
    try:
        bus = await get_app_bus()
        if bus is not None:
            await bus.publish(
                _topics.WEBHOOK_SUBSCRIPTIONS_CHANGED,
                {},
                event_type="changed",
            )
    except Exception as exc:  # noqa: BLE001 — bus failures must not break CRUD
        log.warning("webhook subscriptions-changed publish failed: %s", exc)
def _row_to_response(row: dict[str, Any]) -> WebhookResponse:
    """Build a ``WebhookResponse`` from a stored subscription row."""
    response_fields = _row_to_response_dict(row)
    return WebhookResponse(**response_fields)
@router.post(
    "/",
    tags=["Webhooks"],
    response_model=WebhookCreateResponse,
    status_code=201,
    responses={
        400: {"description": "At least one of simple_events / topic_patterns required"},
        409: {"description": "Name already in use"},
    },
)
@_traced("api.webhook.create")
async def api_create_webhook(
    req: WebhookCreateRequest,
    admin: dict = Depends(require_admin),
) -> WebhookCreateResponse:
    """Create a webhook subscription and return it together with its secret.

    The request's ``simple_events`` and ``topic_patterns`` are combined by
    ``merge_patterns`` and stored as a JSON string. When the caller gives
    no secret, one is generated with ``secrets.token_urlsafe(32)``.

    Raises:
        HTTPException: 400 when the merged pattern list is empty; 409 when
            a subscription with this name already exists; 500 when the new
            row cannot be read back after creation.
    """
    merged = merge_patterns(req.simple_events, req.topic_patterns)
    if not merged:
        raise HTTPException(
            status_code=400,
            detail="Provide at least one simple_events entry or topic_patterns pattern.",
        )

    if await repo.get_webhook_subscription_by_name(req.name):
        raise HTTPException(status_code=409, detail="Webhook name already exists")

    # Fall back to a generated URL-safe secret when none was supplied.
    secret = req.secret or secrets.token_urlsafe(32)
    created_at = datetime.now(timezone.utc)
    await repo.create_webhook_subscription(
        {
            "name": req.name,
            "url": str(req.url),
            "secret": secret,
            "topic_patterns": json.dumps(merged),
            "enabled": req.enabled,
            "consecutive_failures": 0,
            "created_at": created_at,
            "updated_at": created_at,
        }
    )

    stored = await repo.get_webhook_subscription_by_name(req.name)
    if stored is None:
        # The create call just returned, so a missing row points at a
        # storage problem; surface it instead of hiding it.
        raise HTTPException(status_code=500, detail="Webhook created but not retrievable")

    await _notify_subscriptions_changed()
    return WebhookCreateResponse(
        **_row_to_response_dict(stored),
        secret=secret,
    )
# GET / — return every webhook subscription (requires the require_admin
# dependency), each converted with _row_to_response.
#
# NOTE: kept as comments rather than a docstring; FastAPI would publish a
# docstring as the OpenAPI operation description.
@router.get("/", tags=["Webhooks"], response_model=list[WebhookResponse])
@_traced("api.webhook.list")
async def api_list_webhooks(
    admin: dict = Depends(require_admin),
) -> list[WebhookResponse]:
    subscriptions = await repo.list_webhook_subscriptions()
    return list(map(_row_to_response, subscriptions))
# GET /{uuid} — return a single webhook subscription (requires the
# require_admin dependency); responds 404 when the lookup yields nothing.
#
# NOTE: kept as comments rather than a docstring; FastAPI would publish a
# docstring as the OpenAPI operation description.
@router.get(
    "/{uuid}",
    tags=["Webhooks"],
    response_model=WebhookResponse,
    responses={404: {"description": "Webhook not found"}},
)
@_traced("api.webhook.get")
async def api_get_webhook(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> WebhookResponse:
    subscription = await repo.get_webhook_subscription(uuid)
    if subscription:
        return _row_to_response(subscription)
    raise HTTPException(status_code=404, detail="Webhook not found")
# PATCH /{uuid} — partially update a webhook subscription (requires the
# require_admin dependency).
#
# Only fields present in the request (not None) are written. Responds 404
# when the subscription does not exist (before or after the write), 409 when
# the new name belongs to another subscription, and 400 when the supplied
# pattern lists merge to nothing. A request that changes nothing returns the
# current row without writing or notifying.
#
# NOTE: kept as comments rather than a docstring; FastAPI would publish a
# docstring as the OpenAPI operation description.
@router.patch(
    "/{uuid}",
    tags=["Webhooks"],
    response_model=WebhookResponse,
    responses={
        400: {"description": "Empty or invalid patch"},
        404: {"description": "Webhook not found"},
        409: {"description": "Name already in use"},
    },
)
@_traced("api.webhook.update")
async def api_update_webhook(
    uuid: str,
    req: WebhookUpdateRequest,
    admin: dict = Depends(require_admin),
) -> WebhookResponse:
    existing = await repo.get_webhook_subscription(uuid)
    if not existing:
        raise HTTPException(status_code=404, detail="Webhook not found")

    changes: dict[str, Any] = {}

    # A rename is only recorded when the name actually differs, and only
    # after checking that no *other* subscription already owns it.
    is_rename = req.name is not None and req.name != existing["name"]
    if is_rename:
        name_owner = await repo.get_webhook_subscription_by_name(req.name)
        if name_owner and name_owner["uuid"] != uuid:
            raise HTTPException(status_code=409, detail="Webhook name already exists")
        changes["name"] = req.name

    if req.url is not None:
        changes["url"] = str(req.url)

    if req.secret is not None:
        changes["secret"] = req.secret

    if req.enabled is not None:
        changes["enabled"] = req.enabled
        # Switching a currently-disabled webhook back on also clears the
        # auto-disable stamp and last error and zeroes the failure counter,
        # so delivery resumes from a clean state after a circuit trip. For a
        # webhook that was merely paused by an admin these writes are
        # expected to be no-ops.
        if req.enabled is True and not existing.get("enabled"):
            changes.update(
                auto_disabled_at=None,
                consecutive_failures=0,
                last_error=None,
            )

    patterns_supplied = not (req.simple_events is None and req.topic_patterns is None)
    if patterns_supplied:
        # The supplied lists are merged on their own (the stored patterns are
        # not consulted); a list that was omitted counts as empty. An empty
        # merge result is rejected — a webhook can be disabled, but not left
        # without patterns.
        simple_events = [] if req.simple_events is None else req.simple_events
        topic_patterns = [] if req.topic_patterns is None else req.topic_patterns
        merged_patterns = merge_patterns(simple_events, topic_patterns)
        if not merged_patterns:
            raise HTTPException(
                status_code=400,
                detail="Cannot clear all patterns; disable the webhook instead.",
            )
        changes["topic_patterns"] = json.dumps(merged_patterns)

    if not changes:
        # Nothing to write: hand back the row as it currently stands.
        return _row_to_response(existing)

    if not await repo.update_webhook_subscription(uuid, changes):
        raise HTTPException(status_code=404, detail="Webhook not found")

    await _notify_subscriptions_changed()

    # Re-read so the response reflects the stored state after the update.
    refreshed = await repo.get_webhook_subscription(uuid)
    if refreshed is None:
        raise HTTPException(status_code=404, detail="Webhook not found")
    return _row_to_response(refreshed)
# DELETE /{uuid} — remove a webhook subscription (requires the require_admin
# dependency). Responds 404 when the repository reports nothing was deleted;
# otherwise publishes the subscriptions-changed signal and returns a
# confirmation message.
#
# NOTE: kept as comments rather than a docstring; FastAPI would publish a
# docstring as the OpenAPI operation description.
@router.delete(
    "/{uuid}",
    tags=["Webhooks"],
    response_model=MessageResponse,
    responses={404: {"description": "Webhook not found"}},
)
@_traced("api.webhook.delete")
async def api_delete_webhook(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    was_deleted = await repo.delete_webhook_subscription(uuid)
    if was_deleted:
        await _notify_subscriptions_changed()
        return {"message": "Webhook deleted"}
    raise HTTPException(status_code=404, detail="Webhook not found")

Some files were not shown because too many files have changed in this diff Show More