merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
@@ -5,14 +5,65 @@ from .auth.api_change_pass import router as change_pass_router
|
||||
from .logs.api_get_logs import router as logs_router
|
||||
from .logs.api_get_histogram import router as histogram_router
|
||||
from .bounty.api_get_bounties import router as bounty_router
|
||||
from .credentials.api_get_credentials import router as credentials_router
|
||||
from .credential_reuse.api_get_credential_reuse import router as credential_reuse_router
|
||||
from .stats.api_get_stats import router as stats_router
|
||||
from .fleet.api_get_deckies import router as get_deckies_router
|
||||
from .fleet.api_mutate_decky import router as mutate_decky_router
|
||||
from .fleet.api_mutate_interval import router as mutate_interval_router
|
||||
from .fleet.api_deploy_deckies import router as deploy_deckies_router
|
||||
from .stream.api_stream_events import router as stream_router
|
||||
from .attackers.api_get_attackers import router as attackers_router
|
||||
from .attackers.api_get_attacker_detail import router as attacker_detail_router
|
||||
from .attackers.api_get_attacker_commands import router as attacker_commands_router
|
||||
from .attackers.api_get_attacker_artifacts import router as attacker_artifacts_router
|
||||
from .attackers.api_get_attacker_transcripts import router as attacker_transcripts_router
|
||||
from .attackers.api_get_attacker_smtp_targets import router as attacker_smtp_targets_router
|
||||
from .attackers.api_get_attacker_mail import router as attacker_mail_router
|
||||
from .attackers.api_get_attacker_intel import router as attacker_intel_router
|
||||
from .identities.api_list_identities import router as identities_list_router
|
||||
from .identities.api_get_identity_detail import router as identity_detail_router
|
||||
from .identities.api_list_identity_observations import router as identity_observations_router
|
||||
from .identities.api_events import router as identity_events_router
|
||||
from .campaigns.api_list_campaigns import router as campaigns_list_router
|
||||
from .campaigns.api_get_campaign_detail import router as campaign_detail_router
|
||||
from .campaigns.api_list_campaign_identities import router as campaign_identities_router
|
||||
from .campaigns.api_events import router as campaign_events_router
|
||||
from .orchestrator.api_list_events import router as orchestrator_list_router
|
||||
from .orchestrator.api_events import router as orchestrator_events_router
|
||||
from .realism.api_config import router as realism_config_router
|
||||
from .realism.api_personas import router as realism_personas_router
|
||||
from .realism.api_synthetic_files import router as realism_synthetic_files_router
|
||||
from .transcripts import transcripts_router
|
||||
from .config.api_get_config import router as config_get_router
|
||||
from .config.api_update_config import router as config_update_router
|
||||
from .config.api_manage_users import router as config_users_router
|
||||
from .config.api_reinit import router as config_reinit_router
|
||||
from .health.api_get_health import router as health_router
|
||||
from .workers.api_list_workers import router as workers_list_router
|
||||
from .workers.api_control_worker import router as workers_control_router
|
||||
from .workers.api_start_worker import router as workers_start_router
|
||||
from .workers.api_start_all_workers import router as workers_start_all_router
|
||||
from .artifacts.api_get_artifact import router as artifacts_router
|
||||
from .swarm_updates import swarm_updates_router
|
||||
from .swarm_mgmt import swarm_mgmt_router
|
||||
from .system import system_router
|
||||
from .topology import topology_router
|
||||
from .canary import canary_router
|
||||
from .webhooks import webhooks_router
|
||||
|
||||
api_router = APIRouter()
|
||||
# Error responses declared once at the router level so the generated OpenAPI
# schema reflects reality for contract tests. Every route under /api/v1 is
# auth-guarded (either by an explicit require_* Depends or by the global auth
# middleware), which is why 401/403 are listed here; 400/404/409 are
# documented alongside them.
_SHARED_ERROR_RESPONSES = {
    400: {"description": "Malformed request body"},
    401: {"description": "Missing or invalid credentials"},
    403: {"description": "Authenticated but not authorized"},
    404: {"description": "Referenced resource does not exist"},
    409: {"description": "Conflict with existing resource"},
}

api_router = APIRouter(responses=_SHARED_ERROR_RESPONSES)
|
||||
|
||||
# Authentication
|
||||
api_router.include_router(login_router)
|
||||
@@ -25,12 +76,86 @@ api_router.include_router(histogram_router)
|
||||
# Mount the sub-routers on api_router. Registration happens in the order
# listed below; the comments mark the functional area of each group.
for _sub_router in (
    # Bounty Vault
    bounty_router,

    # Credentials (deduped attacker auth attempts)
    credentials_router,

    # Credential reuse findings (cross-decky/cross-service same-secret hits)
    credential_reuse_router,

    # Fleet Management
    get_deckies_router,
    mutate_decky_router,
    mutate_interval_router,
    deploy_deckies_router,

    # Attacker Profiles
    attackers_router,
    attacker_detail_router,
    attacker_commands_router,
    attacker_artifacts_router,
    attacker_transcripts_router,
    attacker_smtp_targets_router,
    attacker_mail_router,
    attacker_intel_router,

    # Identity Resolution (read-only; populated by the clusterer worker —
    # see development/IDENTITY_RESOLUTION.md). Empty until the clusterer
    # ships; the API surface lands first so frontend + downstream work
    # can target a stable shape.
    identities_list_router,
    identity_detail_router,
    identity_observations_router,
    identity_events_router,
    campaigns_list_router,
    campaign_detail_router,
    campaign_identities_router,
    campaign_events_router,
    orchestrator_list_router,
    orchestrator_events_router,

    # Realism — global persona pool CRUD for the dashboard's
    # "Persona Generation" page. The orchestrator reads from the same
    # on-disk JSON file directly (see decnet.realism.personas_pool).
    realism_personas_router,
    realism_synthetic_files_router,
    realism_config_router,

    # Observability
    stats_router,
    stream_router,
    health_router,
    workers_list_router,
    workers_control_router,
    workers_start_router,
    workers_start_all_router,

    # Configuration
    config_get_router,
    config_update_router,
    config_users_router,
    config_reinit_router,

    # Artifacts (captured attacker file drops)
    artifacts_router,

    # Transcripts (PTY session recordings, paged asciinema events)
    transcripts_router,

    # Remote Updates (dashboard → worker updater daemons)
    swarm_updates_router,

    # Swarm Management (dashboard: hosts, deckies, agent enrollment bundles)
    swarm_mgmt_router,

    # System info (deployment-mode auto-detection, etc.)
    system_router,

    # MazeNET Topologies (nested topology CRUD + mutation queue)
    topology_router,

    # Canary tokens — operator-facing CRUD (worker hosts the
    # attacker-facing surface separately via `decnet canary`).
    canary_router,

    # External webhook subscriptions (SIEM/SOAR egress)
    webhooks_router,
):
    api_router.include_router(_sub_router)
|
||||
|
||||
0
decnet/web/router/artifacts/__init__.py
Normal file
0
decnet/web/router/artifacts/__init__.py
Normal file
95
decnet/web/router/artifacts/api_get_artifact.py
Normal file
95
decnet/web/router/artifacts/api_get_artifact.py
Normal file
@@ -0,0 +1,95 @@
|
||||
"""
|
||||
Artifact download endpoint.
|
||||
|
||||
SSH and SMTP deckies store captured attacker content in a host-mounted quarantine:
|
||||
/var/lib/decnet/artifacts/{decky}/{service}/{stored_as}
|
||||
|
||||
The capture event already flows through the normal log pipeline (one
|
||||
RFC 5424 line per capture, see templates/ssh/emit_capture.py), so metadata
|
||||
is served via /logs. This endpoint exists only to retrieve the raw bytes —
|
||||
admin-gated because the payloads are attacker-controlled content.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from fastapi.responses import FileResponse
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_admin
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Quarantine root on the host. Tests point it elsewhere through the
# DECNET_ARTIFACTS_ROOT environment variable; the production default matches
# the bind mount declared in decnet/services/ssh.py and
# decnet/services/smtp.py.
ARTIFACTS_ROOT = Path(os.getenv("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"))

# Decky names are issued by the deployer: a lowercase alphanumeric first
# character followed by up to 62 lowercase alphanumerics or hyphens.
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")

# Explicit allow-list of services that own an artifacts subdirectory, so the
# query string cannot be used to pivot into arbitrary subpaths.
_ALLOWED_SERVICES = {"ssh", "smtp"}

# The capturing template builds stored_as as
#     ${ts}_${sha:0:12}_${base}
# with ts an ISO-8601 UTC timestamp (e.g. 2026-04-18T02:22:56Z), sha twelve
# lowercase hex characters, and base the basename of the original file. The
# basename charset is deliberately narrow while still admitting the
# punctuation that dropped files commonly carry.
_STORED_AS_RE = re.compile(
    r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$"
)
|
||||
|
||||
|
||||
def _resolve_artifact_path(decky: str, stored_as: str, service: str) -> Path:
    """Map (decky, service, stored_as) to a file path under the artifacts root.

    Each input is validated first (service allow-list, then the decky and
    stored_as patterns); the resolved path must then still lie inside the
    resolved artifacts root. Any violation raises HTTPException(400).
    """
    # Guards run in this order, so the first invalid input decides the detail.
    if service not in _ALLOWED_SERVICES:
        raise HTTPException(status_code=400, detail="invalid service")
    if _DECKY_RE.fullmatch(decky) is None:
        raise HTTPException(status_code=400, detail="invalid decky name")
    if _STORED_AS_RE.fullmatch(stored_as) is None:
        raise HTTPException(status_code=400, detail="invalid stored_as")

    root = ARTIFACTS_ROOT.resolve()
    candidate = root.joinpath(decky, service, stored_as).resolve()

    # Defence-in-depth: the patterns already reject `..`, but resolve()
    # follows symlinks, so re-check containment after resolution in case a
    # symlink or odd filesystem state points outside the root.
    contained = candidate == root or root in candidate.parents
    if not contained:
        raise HTTPException(status_code=400, detail="path escapes artifacts root")
    return candidate
|
||||
|
||||
|
||||
@router.get(
    "/artifacts/{decky}/{stored_as}",
    tags=["Artifacts"],
    responses={
        400: {"description": "Invalid decky, service, or stored_as parameter"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Artifact not found"},
    },
)
@_traced("api.get_artifact")
async def get_artifact(
    decky: str,
    stored_as: str,
    service: str = Query("ssh", pattern=r"^[a-z]{1,16}$"),
    admin: dict = Depends(require_admin),
) -> FileResponse:
    # Admin-only raw download of one captured artifact. Input validation and
    # the containment check live in _resolve_artifact_path (400 on failure);
    # a path that validates but is not a regular file yields 404.
    artifact_path = _resolve_artifact_path(decky, stored_as, service)
    if not artifact_path.is_file():
        raise HTTPException(status_code=404, detail="artifact not found")

    # Serve as an opaque attachment and disable MIME sniffing: the bytes are
    # attacker-controlled content.
    download_headers = {
        "Content-Disposition": f'attachment; filename="{stored_as}"',
        "X-Content-Type-Options": "nosniff",
    }
    return FileResponse(
        path=str(artifact_path),
        media_type="application/octet-stream",
        filename=stored_as,
        headers=download_headers,
    )
|
||||
0
decnet/web/router/attackers/__init__.py
Normal file
0
decnet/web/router/attackers/__init__.py
Normal file
34
decnet/web/router/attackers/api_get_attacker_artifacts.py
Normal file
34
decnet/web/router/attackers/api_get_attacker_artifacts.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}/artifacts",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_artifacts")
async def get_attacker_artifacts(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List captured file-drop artifacts for an attacker (newest first).

    Each entry is a `file_captured` log row — the frontend renders the
    badge/drawer using the same `fields` payload as /logs.
    """
    # An unknown attacker is a 404, not an empty listing.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    captures = await repo.get_attacker_artifacts(uuid)
    return {"total": len(captures), "data": captures}
|
||||
42
decnet/web/router/attackers/api_get_attacker_commands.py
Normal file
42
decnet/web/router/attackers/api_get_attacker_commands.py
Normal file
@@ -0,0 +1,42 @@
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}/commands",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
        422: {"description": "Query parameter validation error (limit/offset out of range or invalid)"},
    },
)
@_traced("api.get_attacker_commands")
async def get_attacker_commands(
    uuid: str,
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0, le=2147483647),
    service: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve paginated commands for an attacker profile."""
    # An unknown attacker is a 404, not an empty page.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    # Placeholder values ("null", "NULL", "undefined", empty string) are
    # treated the same as an omitted filter.
    service_filter = None if service in (None, "null", "NULL", "undefined", "") else service

    page = await repo.get_attacker_commands(
        uuid=uuid,
        limit=limit,
        offset=offset,
        service=service_filter,
    )
    # Echo the paging parameters back next to the repository's total and rows.
    return {"total": page["total"], "limit": limit, "offset": offset, "data": page["data"]}
|
||||
44
decnet/web/router/attackers/api_get_attacker_detail.py
Normal file
44
decnet/web/router/attackers/api_get_attacker_detail.py
Normal file
@@ -0,0 +1,44 @@
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.correlation.event_kinds import bucket_services
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_detail")
async def get_attacker_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve a single attacker profile by UUID (with behavior block)."""
    profile = await repo.get_attacker_by_uuid(uuid)
    if not profile:
        raise HTTPException(status_code=404, detail="Attacker not found")

    # The profile dict is enriched in place, one repository call per key.
    profile["behavior"] = await repo.get_attacker_behavior(uuid)

    # Scanned vs. interacted-with is derived on every request from the log
    # stream instead of being persisted. The query is cheap (a DISTINCT
    # bounded by service × event_type cardinality), and classifier changes
    # show up immediately without waiting for a profiler re-tick.
    activity_pairs = await repo.get_attacker_service_activity(uuid)
    profile["service_activity"] = bucket_services(activity_pairs)

    # Attribution leaks: XFF / Forwarded / X-Real-IP mismatches captured by
    # the HTTP bounty extractor. Only the first 10 are returned so a rotation
    # attack (100s of forged XFF values) cannot flood the UI, while
    # `ip_leaks_total` carries the unbounded count so the UI can render a
    # ROTATION DETECTED badge once the count crosses a threshold.
    profile["ip_leaks"] = await repo.get_attacker_ip_leaks(uuid, limit=10)
    profile["ip_leaks_total"] = await repo.count_attacker_ip_leaks(uuid)
    return profile
|
||||
38
decnet/web/router/attackers/api_get_attacker_intel.py
Normal file
38
decnet/web/router/attackers/api_get_attacker_intel.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""GET /api/v1/attackers/{uuid}/intel — latest threat-intel row for an attacker."""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}/intel",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No intel cached for this attacker"},
    },
)
@_traced("api.get_attacker_intel")
async def get_attacker_intel(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the most recent cached threat-intel verdict for an attacker.

    The row is populated out-of-band by the ``decnet enrich`` worker
    (typically within seconds of first observation, sub-second when the
    bus is healthy). 404 means either the worker has not run yet or the
    UUID does not correspond to an attacker DECNET has seen.
    """
    intel = await repo.get_attacker_intel_by_uuid(uuid)
    if intel:
        return intel
    # No distinction is made here between "not enriched yet" and "unknown UUID".
    raise HTTPException(
        status_code=404,
        detail="No intel cached for this attacker",
    )
|
||||
37
decnet/web/router/attackers/api_get_attacker_mail.py
Normal file
37
decnet/web/router/attackers/api_get_attacker_mail.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}/mail",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_mail")
async def get_attacker_mail(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> dict[str, Any]:
    """List stored messages this attacker relayed via the SMTP honeypots.

    Each entry is a ``message_stored`` log row — headers + attachment
    manifest live in ``fields``; the raw .eml bytes are fetched via
    ``/artifacts/{decky}/{stored_as}?service=smtp`` (also admin-gated).
    Admin-only because message bodies are attacker-controlled content
    and may include phishing kits / malware droppers.
    """
    # An unknown attacker is a 404, not an empty listing.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    messages = await repo.get_attacker_stored_mail(uuid)
    return {"total": len(messages), "data": messages}
|
||||
36
decnet/web/router/attackers/api_get_attacker_smtp_targets.py
Normal file
36
decnet/web/router/attackers/api_get_attacker_smtp_targets.py
Normal file
@@ -0,0 +1,36 @@
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}/smtp-targets",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_smtp_targets")
async def get_attacker_smtp_targets(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List victim domains this attacker targeted via the SMTP honeypots.

    Rows are ordered by most-recent activity. Each row is one
    (attacker, domain) pair with a running count + first/last seen — no
    local-parts (user names) are ever stored, so this is safe to show
    to any viewer role.
    """
    # An unknown attacker is a 404, not an empty listing.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    targets = await repo.list_smtp_targets(uuid)
    return {"total": len(targets), "data": targets}
|
||||
34
decnet/web/router/attackers/api_get_attacker_transcripts.py
Normal file
34
decnet/web/router/attackers/api_get_attacker_transcripts.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/attackers/{uuid}/transcripts",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_transcripts")
async def get_attacker_transcripts(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List PTY session recordings for an attacker (newest first).

    Each entry is a `session_recorded` log row — the frontend lists them
    in the AttackerDetail Sessions tab and opens SessionDrawer on click.
    """
    # An unknown attacker is a 404, not an empty listing.
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    sessions = await repo.get_attacker_transcripts(uuid)
    return {"total": len(sessions), "data": sessions}
|
||||
83
decnet/web/router/attackers/api_get_attackers.py
Normal file
83
decnet/web/router/attackers/api_get_attackers.py
Normal file
@@ -0,0 +1,83 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import AttackersResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Same pattern as /logs — cache the unfiltered total count; filtered
|
||||
# counts go straight to the DB.
|
||||
_TOTAL_TTL = 2.0
|
||||
_total_cache: tuple[Optional[int], float] = (None, 0.0)
|
||||
_total_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_total_cache() -> None:
    """Return the module-level total cache and its lock to their initial state.

    The cached entry becomes ``(None, 0.0)`` (no value) and the lock is
    dropped, so the next cached lookup creates a fresh one.
    """
    global _total_cache, _total_lock
    _total_cache, _total_lock = (None, 0.0), None
|
||||
|
||||
|
||||
async def _get_total_attackers_cached() -> int:
    """Return the unfiltered attacker count, reusing it for up to _TOTAL_TTL seconds.

    A fresh cached value is returned without locking. Otherwise the lock is
    taken, the cache is checked once more (another task may have refilled it
    while this one waited), and only then is the repository queried.
    """
    global _total_cache, _total_lock

    def _fresh_value() -> Optional[int]:
        # The cached count if it is still inside the TTL window, else None.
        cached, stamped_at = _total_cache
        current = time.monotonic()
        if cached is not None and current - stamped_at < _TOTAL_TTL:
            return cached
        return None

    hit = _fresh_value()
    if hit is not None:
        return hit

    # The lock is created on first use rather than at import time.
    if _total_lock is None:
        _total_lock = asyncio.Lock()

    async with _total_lock:
        hit = _fresh_value()
        if hit is not None:
            return hit
        total = await repo.get_total_attackers()
        _total_cache = (total, time.monotonic())
        return total
|
||||
|
||||
|
||||
@router.get(
    "/attackers",
    response_model=AttackersResponse,
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.get_attackers")
async def get_attackers(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    search: Optional[str] = None,
    sort_by: str = Query("recent", pattern="^(recent|active|traversals)$"),
    service: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve paginated attacker profiles."""
    # Placeholder values ("null", "NULL", "undefined", empty string) are
    # treated the same as an omitted filter.
    blank_values = (None, "null", "NULL", "undefined", "")
    search_term = None if search in blank_values else search
    service_filter = None if service in blank_values else service

    page_rows = await repo.get_attackers(
        limit=limit,
        offset=offset,
        search=search_term,
        sort_by=sort_by,
        service=service_filter,
    )

    # Only the unfiltered total is served from the short-lived cache;
    # any filter sends the count straight to the repository.
    if search_term is not None or service_filter is not None:
        total = await repo.get_total_attackers(search=search_term, service=service_filter)
    else:
        total = await _get_total_attackers_cached()

    # One bulk lookup of behavior rows for the IPs on this page (avoids N+1
    # queries); rows without a matching behavior get None.
    page_ips = {row["ip"] for row in page_rows if row.get("ip")}
    behavior_by_ip = await repo.get_behaviors_for_ips(page_ips) if page_ips else {}
    for row in page_rows:
        row["behavior"] = behavior_by_ip.get(row.get("ip"))

    return {"total": total, "limit": limit, "offset": offset, "data": page_rows}
|
||||
@@ -2,9 +2,10 @@ from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.web.auth import get_password_hash, verify_password
|
||||
from decnet.web.dependencies import get_current_user_unchecked, repo
|
||||
from decnet.web.db.models import ChangePasswordRequest
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.auth import ahash_password, averify_password
|
||||
from decnet.web.dependencies import get_current_user_unchecked, invalidate_user_cache, repo
|
||||
from decnet.web.db.models import ChangePasswordRequest, MessageResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -12,20 +13,23 @@ router = APIRouter()
|
||||
@router.post(
|
||||
"/auth/change-password",
|
||||
tags=["Authentication"],
|
||||
response_model=MessageResponse,
|
||||
responses={
|
||||
400: {"description": "Bad Request (e.g. malformed JSON)"},
|
||||
401: {"description": "Could not validate credentials"},
|
||||
422: {"description": "Validation error"}
|
||||
},
|
||||
)
|
||||
@_traced("api.change_password")
|
||||
async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]:
|
||||
_user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user)
|
||||
if not _user or not verify_password(request.old_password, _user["password_hash"]):
|
||||
if not _user or not await averify_password(request.old_password, _user["password_hash"]):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Incorrect old password",
|
||||
)
|
||||
|
||||
_new_hash: str = get_password_hash(request.new_password)
|
||||
_new_hash: str = await ahash_password(request.new_password)
|
||||
await repo.update_user_password(current_user, _new_hash, must_change_password=False)
|
||||
invalidate_user_cache(current_user)
|
||||
return {"message": "Password updated successfully"}
|
||||
|
||||
@@ -1,19 +1,32 @@
|
||||
from datetime import timedelta
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, HTTPException, status
|
||||
from fastapi import APIRouter, HTTPException, Request, status
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.auth import (
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES,
|
||||
averify_password,
|
||||
create_access_token,
|
||||
verify_password,
|
||||
)
|
||||
from decnet.web.dependencies import repo
|
||||
from decnet.web.dependencies import get_user_by_username_cached
|
||||
from decnet.web.db.models import LoginRequest, Token
|
||||
from decnet.web.limiter import limiter, login_ip_key, login_username_key
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# Two independent buckets, tripping either → 429:
|
||||
#
|
||||
# - per-IP (login_ip_key): catches a botnet hitting one account.
|
||||
# - per-user (login_username_key): catches distributed credential
|
||||
# stuffing against one account.
|
||||
#
|
||||
# Limits: 10 attempts per 5 minutes per bucket. Buckets are process-local
|
||||
# (memory://); see decnet/web/limiter.py for the rationale. Buckets do
|
||||
# NOT reset on successful login — a legitimate user tripping the limit
|
||||
# via fat-fingering will need to wait the window out. 10 tries is
|
||||
# generous; a rolling window naturally drains.
|
||||
@router.post(
|
||||
"/auth/login",
|
||||
response_model=Token,
|
||||
@@ -21,12 +34,16 @@ router = APIRouter()
|
||||
responses={
|
||||
400: {"description": "Bad Request (e.g. malformed JSON)"},
|
||||
401: {"description": "Incorrect username or password"},
|
||||
422: {"description": "Validation error"}
|
||||
422: {"description": "Validation error"},
|
||||
429: {"description": "Too many login attempts — retry after the window resets"},
|
||||
},
|
||||
)
|
||||
async def login(request: LoginRequest) -> dict[str, Any]:
|
||||
_user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username)
|
||||
if not _user or not verify_password(request.password, _user["password_hash"]):
|
||||
@limiter.limit("10/5 minutes", key_func=login_ip_key)
|
||||
@limiter.limit("10/5 minutes", key_func=login_username_key)
|
||||
@_traced("api.login")
|
||||
async def login(request: Request, payload: LoginRequest) -> dict[str, Any]:
|
||||
_user: Optional[dict[str, Any]] = await get_user_by_username_cached(payload.username)
|
||||
if not _user or not await averify_password(payload.password, _user["password_hash"]):
|
||||
raise HTTPException(
|
||||
status_code=status.HTTP_401_UNAUTHORIZED,
|
||||
detail="Incorrect username or password",
|
||||
@@ -40,6 +57,6 @@ async def login(request: LoginRequest) -> dict[str, Any]:
|
||||
)
|
||||
return {
|
||||
"access_token": _access_token,
|
||||
"token_type": "bearer", # nosec B105
|
||||
"token_type": "bearer", # nosec B105 — OAuth2 token type, not a password
|
||||
"must_change_password": bool(_user.get("must_change_password", False))
|
||||
}
|
||||
|
||||
@@ -1,21 +1,62 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import BountyResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Cache the unfiltered default page — the UI/locust hit this constantly
|
||||
# with no params. Filtered requests (bounty_type/search) bypass: rare
|
||||
# and staleness matters for search.
|
||||
_BOUNTY_TTL = 5.0
|
||||
_DEFAULT_LIMIT = 50
|
||||
_DEFAULT_OFFSET = 0
|
||||
_bounty_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
|
||||
_bounty_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_bounty_cache() -> None:
    """Forget the cached default bounty page and discard its lock.

    Rebinds the module-level cache to its empty state ``(None, 0.0)`` and
    drops the lock, so the next cached read builds a fresh one.
    """
    global _bounty_cache, _bounty_lock
    _bounty_cache, _bounty_lock = (None, 0.0), None
|
||||
|
||||
|
||||
async def _get_bounty_default_cached() -> dict[str, Any]:
    """Return the unfiltered default bounty page, cached for ``_BOUNTY_TTL`` seconds.

    A fresh cache entry is returned without touching the lock. On a miss the
    lock is taken, the cache is re-checked (another waiter may have refilled
    it in the meantime), and only then is the repository queried.

    Returns:
        A dict with ``total``, ``limit``, ``offset`` and ``data`` keys.
    """
    global _bounty_cache, _bounty_lock

    def _unexpired() -> Optional[dict[str, Any]]:
        # The cached payload while it is younger than the TTL, else None.
        payload, stamped_at = _bounty_cache
        age = time.monotonic() - stamped_at
        return payload if payload is not None and age < _BOUNTY_TTL else None

    hit = _unexpired()
    if hit is not None:
        return hit

    # The lock is created lazily on the first miss.
    if _bounty_lock is None:
        _bounty_lock = asyncio.Lock()

    async with _bounty_lock:
        # Second look under the lock so queued callers reuse a refill.
        hit = _unexpired()
        if hit is not None:
            return hit
        rows = await repo.get_bounties(
            limit=_DEFAULT_LIMIT, offset=_DEFAULT_OFFSET, bounty_type=None, search=None,
        )
        count = await repo.get_total_bounties(bounty_type=None, search=None)
        fresh = {"total": count, "limit": _DEFAULT_LIMIT, "offset": _DEFAULT_OFFSET, "data": rows}
        _bounty_cache = (fresh, time.monotonic())
        return fresh
|
||||
|
||||
|
||||
@router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"],
|
||||
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
|
||||
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
|
||||
@_traced("api.get_bounties")
|
||||
async def get_bounties(
|
||||
limit: int = Query(50, ge=1, le=1000),
|
||||
offset: int = Query(0, ge=0, le=2147483647),
|
||||
bounty_type: Optional[str] = None,
|
||||
search: Optional[str] = None,
|
||||
current_user: str = Depends(get_current_user)
|
||||
user: dict = Depends(require_viewer)
|
||||
) -> dict[str, Any]:
|
||||
"""Retrieve collected bounties (harvested credentials, payloads, etc.)."""
|
||||
def _norm(v: Optional[str]) -> Optional[str]:
|
||||
@@ -26,6 +67,9 @@ async def get_bounties(
|
||||
bt = _norm(bounty_type)
|
||||
s = _norm(search)
|
||||
|
||||
if bt is None and s is None and limit == _DEFAULT_LIMIT and offset == _DEFAULT_OFFSET:
|
||||
return await _get_bounty_default_cached()
|
||||
|
||||
_data = await repo.get_bounties(limit=limit, offset=offset, bounty_type=bt, search=s)
|
||||
_total = await repo.get_total_bounties(bounty_type=bt, search=s)
|
||||
return {
|
||||
|
||||
0
decnet/web/router/campaigns/__init__.py
Normal file
0
decnet/web/router/campaigns/__init__.py
Normal file
123
decnet/web/router/campaigns/api_events.py
Normal file
123
decnet/web/router/campaigns/api_events.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""SSE stream of campaign events — one connection per viewer.
|
||||
|
||||
Subscribes to ``campaign.>`` on the bus for the duration of the
|
||||
request and forwards each matching event as a Server-Sent Event.
|
||||
Emits a one-shot snapshot on connect (current paginated campaign
|
||||
list).
|
||||
|
||||
Mirror of :mod:`decnet.web.router.identities.api_events`. Auth: JWT
|
||||
via ``?token=`` query param + ``require_stream_viewer`` role.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import orjson
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.app import get_app_bus
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_stream_viewer
|
||||
from decnet.web.sse_limits import sse_connection_slot
|
||||
|
||||
log = get_logger("api.campaigns.events")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_KEEPALIVE_SECS = 15.0
|
||||
_SNAPSHOT_LIMIT = 50
|
||||
|
||||
|
||||
def _format_sse(event_name: str, data: dict) -> str:
    """Render one Server-Sent Event frame.

    The frame is an ``event:`` line carrying *event_name*, a ``data:`` line
    carrying *data* serialized with ``orjson``, and a terminating blank line.
    """
    encoded = orjson.dumps(data).decode()
    return f"event: {event_name}\ndata: {encoded}\n\n"
|
||||
|
||||
|
||||
@router.get(
    "/campaigns/events",
    tags=["Campaign Clustering"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of campaign-clustering events",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.campaigns.events")
async def api_campaigns_events(
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    # Streams campaign events to one viewer as text/event-stream.
    # Event types: snapshot, formed, identity.assigned, merged, unmerged.
    # The snapshot (first page of campaigns) is read once, before streaming
    # starts, and replayed as the first real event.
    snapshot = await repo.list_campaigns(limit=_SNAPSHOT_LIMIT, offset=0)

    async def generator() -> AsyncGenerator[str, None]:
        # The slot is held for the whole lifetime of the stream.
        # NOTE(review): presumably this is what enforces the per-user
        # connection cap documented as 429 above — confirm in sse_limits.
        async with sse_connection_slot(user["uuid"]):
            # An SSE comment line goes out first, then the snapshot.
            yield ": keepalive\n\n"
            yield _format_sse("snapshot", {"campaigns": snapshot})

            bus = await get_app_bus()
            if bus is None:
                # No bus available: keep the connection open with periodic
                # keepalive comments until the client disconnects.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        break
                    yield ": keepalive\n\n"
                return

            # Wildcard subscription rooted at the campaign topic.
            sub = bus.subscribe(f"{_topics.CAMPAIGN}.>")
            try:
                async with sub:
                    sub_iter = sub.__aiter__()
                    while True:
                        if await request.is_disconnected():
                            break
                        # Wait for the next bus event, but no longer than
                        # the keepalive interval.
                        next_task = asyncio.ensure_future(sub_iter.__anext__())
                        try:
                            event = await asyncio.wait_for(
                                next_task, timeout=_KEEPALIVE_SECS,
                            )
                        except asyncio.TimeoutError:
                            # Nothing arrived in time: drop the pending read
                            # and emit a keepalive instead.
                            next_task.cancel()
                            yield ": keepalive\n\n"
                            continue
                        except StopAsyncIteration:
                            # The subscription iterator is exhausted.
                            break
                        yield _format_sse(
                            _sse_name_for(event.topic),
                            {
                                "topic": event.topic,
                                "type": event.type,
                                "ts": event.ts,
                                "payload": event.payload,
                            },
                        )
            except asyncio.CancelledError:
                # Cancellation ends the stream quietly.
                pass
            except Exception:
                # Any other failure is logged and reported to the client as
                # a final "error" event.
                log.exception("campaign events stream crashed")
                yield _format_sse("error", {"message": "Stream interrupted"})

    return StreamingResponse(
        generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
|
||||
|
||||
|
||||
def _sse_name_for(topic: str) -> str:
    """Derive the SSE event name by dropping the leading campaign topic root.

    ``campaign.formed`` → ``formed``; ``campaign.identity.assigned`` →
    ``identity.assigned``. A topic that does not start with the campaign
    root followed by a dot is returned unchanged.
    """
    root = f"{_topics.CAMPAIGN}."
    if not topic.startswith(root):
        return topic
    return topic[len(root):]
|
||||
40
decnet/web/router/campaigns/api_get_campaign_detail.py
Normal file
40
decnet/web/router/campaigns/api_get_campaign_detail.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""GET /api/v1/campaigns/{uuid} — single campaign row.
|
||||
|
||||
Soft-merge handling: if the requested UUID has merged_into_uuid set,
|
||||
the repository follows the chain and returns the winner. Mirror of
|
||||
:mod:`decnet.web.router.identities.api_get_identity_detail`.
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/campaigns/{uuid}",
    tags=["Campaign Clustering"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Campaign not found"},
    },
)
@_traced("api.get_campaign_detail")
async def get_campaign_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    # Fetch the campaign row for the requested UUID; a falsy result is a 404.
    record = await repo.get_campaign_by_uuid(uuid)
    if not record:
        raise HTTPException(status_code=404, detail="Campaign not found")
    # The live count is taken from the identities that point at this
    # campaign, keyed on the UUID of the returned row, so it does not depend
    # on the denormalized identity_count.
    live_count = await repo.count_identities_for_campaign(record["uuid"])
    record["identity_count_live"] = live_count
    return record
|
||||
41
decnet/web/router/campaigns/api_list_campaign_identities.py
Normal file
41
decnet/web/router/campaigns/api_list_campaign_identities.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""GET /api/v1/campaigns/{uuid}/identities — identities for a campaign.
|
||||
|
||||
Returns the ``AttackerIdentity`` rows whose ``campaign_id`` FK points
|
||||
at this campaign. Mirror of
|
||||
:mod:`decnet.web.router.identities.api_list_identity_observations`.
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/campaigns/{uuid}/identities",
    tags=["Campaign Clustering"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Campaign not found"},
    },
)
@_traced("api.list_campaign_identities")
async def list_campaign_identities(
    uuid: str,
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    # Look the campaign up first so an unknown UUID is a 404.
    parent = await repo.get_campaign_by_uuid(uuid)
    if not parent:
        raise HTTPException(status_code=404, detail="Campaign not found")
    # Page and count are both keyed on the UUID of the row that came back,
    # not on the UUID from the request path.
    resolved = parent["uuid"]
    page = await repo.list_identities_for_campaign(
        resolved, limit=limit, offset=offset
    )
    count = await repo.count_identities_for_campaign(resolved)
    return {"total": count, "limit": limit, "offset": offset, "data": page}
|
||||
35
decnet/web/router/campaigns/api_list_campaigns.py
Normal file
35
decnet/web/router/campaigns/api_list_campaigns.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""GET /api/v1/campaigns — paginated list of campaigns.
|
||||
|
||||
Mirror of :mod:`decnet.web.router.identities.api_list_identities` for
|
||||
the campaign layer. Returns an empty list while the campaign clusterer
|
||||
hasn't run yet (the campaigns table ships empty).
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/campaigns",
    tags=["Campaign Clustering"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_campaigns")
async def list_campaigns(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paginated campaign list, newest-updated first."""
    # One query for the requested page, one for the overall count; the
    # request's paging parameters are echoed back in the envelope.
    page = await repo.list_campaigns(limit=limit, offset=offset)
    count = await repo.count_campaigns()
    return {"total": count, "limit": limit, "offset": offset, "data": page}
|
||||
23
decnet/web/router/canary/__init__.py
Normal file
23
decnet/web/router/canary/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Canary tokens — operator-facing CRUD.
|
||||
|
||||
Mounted under ``/api/v1/canary``. Covers:
|
||||
|
||||
* ``POST /blobs`` — upload an artifact (multipart);
|
||||
``GET /blobs``, ``DELETE /blobs/{id}`` — listing + cleanup
|
||||
* ``POST /tokens`` — generate + plant a token on a target decky;
|
||||
``GET /tokens``, ``GET /tokens/{id}``, ``DELETE /tokens/{id}``
|
||||
— listing + detail + revoke
|
||||
* ``GET /tokens/{id}/preview`` — instrumented bytes for sanity-check
|
||||
* ``GET /tokens/{id}/triggers`` — paged callback log
|
||||
|
||||
The ``decnet canary`` worker runs the ATTACKER-facing surface (HTTP
|
||||
slug + DNS); this module is the OPERATOR-facing surface only.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_blobs import router as blobs_router
|
||||
from .api_tokens import router as tokens_router
|
||||
|
||||
# Aggregate router for the operator-facing canary API. Both sub-routers are
# attached under the shared "/canary" prefix; each carries its own
# sub-prefix ("/blobs", "/tokens").
canary_router = APIRouter(prefix="/canary")
canary_router.include_router(blobs_router)
canary_router.include_router(tokens_router)
|
||||
172
decnet/web/router/canary/api_blobs.py
Normal file
172
decnet/web/router/canary/api_blobs.py
Normal file
@@ -0,0 +1,172 @@
|
||||
"""Operator-uploaded canary blob CRUD.
|
||||
|
||||
Three endpoints:
|
||||
|
||||
* ``POST /blobs`` — multipart upload; sniffs MIME from the magic
|
||||
bytes (no python-magic dependency), persists to disk under the
|
||||
sha256 hash, returns the (possibly pre-existing) row.
|
||||
* ``GET /blobs`` — list all blobs with their live token reference
|
||||
count.
|
||||
* ``DELETE /blobs/{uuid}`` — refcount-aware delete; returns 409 if
|
||||
any token still references the blob.
|
||||
|
||||
Admin-gated: blobs are operator-supplied content that may carry
|
||||
sensitive material (real-looking financial reports, etc.); listing
|
||||
them and deleting them is an admin operation. Reading them via the
|
||||
preview path is also admin-gated.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, File, HTTPException, UploadFile
|
||||
|
||||
from decnet.canary import storage
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.models import (
|
||||
CanaryBlobResponse,
|
||||
CanaryBlobsResponse,
|
||||
MessageResponse,
|
||||
)
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
log = get_logger("api.canary.blobs")
|
||||
|
||||
router = APIRouter(prefix="/blobs", tags=["Canary"])
|
||||
|
||||
|
||||
# --- MIME sniffing (stdlib-only, replaces python-magic) -------------------
|
||||
#
|
||||
# The DOCX/XLSX/PDF/PNG/JPEG/GIF/HTML/JSON/YAML space covers everything
|
||||
# our instrumenters know how to mutate. Anything else falls through to
|
||||
# ``application/octet-stream`` and the API routes the token to the
|
||||
# ``passthrough`` instrumenter.
|
||||
|
||||
_MAGIC_TABLE: tuple[tuple[bytes, str], ...] = (
    (b"\x89PNG\r\n\x1a\n", "image/png"),
    (b"\xff\xd8\xff", "image/jpeg"),
    (b"GIF87a", "image/gif"),
    (b"GIF89a", "image/gif"),
    (b"%PDF-", "application/pdf"),
    # OOXML (DOCX/XLSX) starts with PK\x03\x04 but so do plain zips.
    # We disambiguate by Content_Types entry below.
    (b"<!DOCTYPE", "text/html"),
    (b"<html", "text/html"),
    (b"<HTML", "text/html"),
    (b"<?xml", "application/xml"),
)


def _sniff_mime(filename: str, head: bytes) -> str:
    """Guess a MIME type from the leading bytes of an upload.

    Args:
        filename: Client-supplied file name. Only its extension is used, and
            only to refine plain-text content into JSON / YAML / TOML.
        head: The first bytes of the upload. The caller may have cut it at
            an arbitrary byte offset.

    Returns:
        A MIME type string; ``application/octet-stream`` when nothing matches.
    """
    import codecs

    for marker, mime in _MAGIC_TABLE:
        if head.startswith(marker):
            return mime
    # The doctype keyword and tag names are case-insensitive in HTML, so
    # also accept spellings such as ``<!doctype html>`` or ``<Html``.
    if head[:16].lower().startswith((b"<!doctype", b"<html")):
        return "text/html"
    if head[:4] == b"PK\x03\x04":
        # OOXML alias detection: peek for the document-specific Override
        # in [Content_Types].xml. We only need to look at the first
        # block; the central directory comes later.
        if b"wordprocessingml" in head:
            return "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
        if b"spreadsheetml" in head:
            return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        return "application/zip"
    # Plaintext heuristic: if the head decodes as printable utf-8 we
    # call it text/plain — that's good enough to route to the plain
    # instrumenter, which also handles json/yaml/toml.
    try:
        # Incremental decoding with final=False still rejects invalid bytes
        # but tolerates a multi-byte character cut off at the end of `head`,
        # which happens whenever the caller slices mid-character.
        codecs.getincrementaldecoder("utf-8")("strict").decode(head, final=False)
    except UnicodeDecodeError:
        return "application/octet-stream"
    if all(b in (0x09, 0x0A, 0x0D) or b >= 0x20 for b in head[:128]):
        lf = filename.lower()
        if lf.endswith(".json"):
            return "application/json"
        if lf.endswith((".yaml", ".yml")):
            return "application/yaml"
        if lf.endswith(".toml"):
            return "application/toml"
        return "text/plain"
    return "application/octet-stream"
|
||||
|
||||
|
||||
def _row_to_response(row: dict[str, Any]) -> CanaryBlobResponse:
    """Build the blob response model by passing the row's items as keyword arguments."""
    response = CanaryBlobResponse(**row)
    return response
|
||||
|
||||
|
||||
@router.post(
    "",
    response_model=CanaryBlobResponse,
    status_code=201,
    responses={
        400: {"description": "Empty file or unreadable upload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_upload_blob(
    file: UploadFile = File(...),
    admin: dict = Depends(require_admin),
) -> CanaryBlobResponse:
    # The whole upload is read into memory; an empty body is a 400.
    payload = await file.read()
    if not payload:
        raise HTTPException(status_code=400, detail="uploaded file is empty")
    # The MIME type is sniffed from the first KiB plus the file name.
    original_name = file.filename or ""
    mime = _sniff_mime(original_name, payload[:1024])
    # write_blob returns (sha256, path, size); the path is not needed here.
    digest, _path, byte_count = storage.write_blob(payload)
    record = {
        "sha256": digest,
        "filename": original_name or "(unnamed)",
        "content_type": mime,
        "size_bytes": byte_count,
        "uploaded_by": admin.get("uuid", "unknown"),
        "uploaded_at": datetime.now(timezone.utc),
    }
    stored = await repo.upsert_canary_blob(record)
    # Default the reference count to 0 when the row does not carry one.
    stored.setdefault("token_count", 0)
    return _row_to_response(stored)
|
||||
|
||||
|
||||
@router.get(
    "",
    response_model=CanaryBlobsResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_blobs(
    admin: dict = Depends(require_admin),
) -> CanaryBlobsResponse:
    # Every blob row is returned; `total` is the length of that list.
    records = await repo.list_canary_blobs()
    blobs = [_row_to_response(record) for record in records]
    return CanaryBlobsResponse(blobs=blobs, total=len(records))
|
||||
|
||||
|
||||
@router.delete(
    "/{uuid}",
    response_model=MessageResponse,
    responses={
        404: {"description": "Blob not found"},
        409: {"description": "Blob still referenced by a token"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_delete_blob(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> MessageResponse:
    # The row is fetched first: its sha256 is needed after the DB delete.
    blob_row = await repo.get_canary_blob(uuid)
    if blob_row is None:
        raise HTTPException(status_code=404, detail="blob not found")
    # A falsy result from the repository delete is reported as a 409.
    removed = await repo.delete_canary_blob(uuid)
    if not removed:
        raise HTTPException(
            status_code=409,
            detail="blob is still referenced by one or more tokens",
        )
    # The DB row is removed before the on-disk bytes, so a failed unlink
    # leaves an orphan file rather than a row pointing at missing bytes.
    storage.unlink_blob(blob_row["sha256"])
    return MessageResponse(message="ok")
|
||||
318
decnet/web/router/canary/api_tokens.py
Normal file
318
decnet/web/router/canary/api_tokens.py
Normal file
@@ -0,0 +1,318 @@
|
||||
"""Operator-facing canary token CRUD.
|
||||
|
||||
Every body-bearing route documents the 400 error per
|
||||
:mod:`feedback_schemathesis_400`. Auth deps:
|
||||
|
||||
* writes (POST, DELETE) → :func:`require_admin`
|
||||
* reads (GET list, detail, triggers) → :func:`require_viewer`
* preview (GET ``/{uuid}/preview``) → :func:`require_admin`
|
||||
|
||||
The router resolves blobs / instrumenters / generators here, builds
|
||||
the :class:`CanaryArtifact`, and hands it to the planter. The
|
||||
worker is a separate process; it doesn't see this code path.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from secrets import token_urlsafe
|
||||
from typing import Any
|
||||
from uuid import uuid4
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, Response
|
||||
|
||||
from decnet.canary import (
|
||||
CanaryContext,
|
||||
get_generator,
|
||||
get_instrumenter,
|
||||
pick_instrumenter_for_mime,
|
||||
storage,
|
||||
)
|
||||
from decnet.canary.base import InstrumenterRejectedError
|
||||
from decnet.canary.factory import KNOWN_GENERATORS
|
||||
from decnet.canary.paths import normalize_placement
|
||||
from decnet.canary import planter
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.models import (
|
||||
CanaryTokenCreateRequest,
|
||||
CanaryTokenResponse,
|
||||
CanaryTokensResponse,
|
||||
CanaryTriggerResponse,
|
||||
CanaryTriggersResponse,
|
||||
MessageResponse,
|
||||
)
|
||||
from decnet.web.dependencies import repo, require_admin, require_viewer
|
||||
|
||||
log = get_logger("api.canary.tokens")
|
||||
|
||||
router = APIRouter(prefix="/tokens", tags=["Canary"])
|
||||
|
||||
|
||||
def _http_base() -> str:
    """Return the canary HTTP base URL with trailing slashes removed.

    Read from ``DECNET_CANARY_HTTP_BASE``; defaults to
    ``http://localhost:8088`` when the variable is unset.
    """
    import os

    configured = os.getenv("DECNET_CANARY_HTTP_BASE", "http://localhost:8088")
    return configured.rstrip("/")
|
||||
|
||||
|
||||
def _dns_zone() -> str:
    """Return the canary DNS zone, lower-cased and without leading or trailing dots.

    Read from ``DECNET_CANARY_DNS_ZONE``; an unset variable yields ``""``.
    """
    import os

    configured = os.getenv("DECNET_CANARY_DNS_ZONE", "")
    return configured.strip(".").lower()
|
||||
|
||||
|
||||
def _row_to_response(row: dict[str, Any]) -> CanaryTokenResponse:
    """Build the token response model by passing the row's items as keyword arguments."""
    response = CanaryTokenResponse(**row)
    return response
|
||||
|
||||
|
||||
def _trigger_row_to_response(row: dict[str, Any]) -> CanaryTriggerResponse:
    """Convert a trigger row into its response model.

    The row's ``raw_headers`` value (a JSON string, or an already-decoded
    object) is exposed as ``headers``. Anything that fails to parse, or does
    not decode to a dict, becomes an empty dict. ``raw_headers`` itself is
    not passed on to the model.
    """
    import json

    raw = row.get("raw_headers") or "{}"
    try:
        parsed = json.loads(raw) if isinstance(raw, str) else raw
    except (ValueError, TypeError):
        parsed = {}
    else:
        if not isinstance(parsed, dict):
            parsed = {}
    fields = {key: value for key, value in row.items() if key != "raw_headers"}
    fields["headers"] = parsed
    return CanaryTriggerResponse(**fields)
|
||||
|
||||
|
||||
# ---------------------------------------------------------- create
|
||||
|
||||
@router.post(
    "",
    response_model=CanaryTokenResponse,
    status_code=201,
    responses={
        400: {"description": "Invalid token request (missing/conflicting fields, bad path, instrumenter rejection)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Referenced blob not found"},
        # Raised below when the blob row exists but its bytes are gone.
        410: {"description": "Referenced blob's bytes are missing on disk; re-upload required"},
    },
)
async def api_create_token(
    req: CanaryTokenCreateRequest,
    admin: dict = Depends(require_admin),
) -> CanaryTokenResponse:
    # Creates a canary token and plants it on the target decky.
    #
    # The artifact comes either from a named generator or from an uploaded
    # blob run through the instrumenter picked for the blob's content type.
    # Raises 400 for conflicting fields, a bad placement path, an unknown
    # generator or an instrumenter rejection; 404 when the blob row does not
    # exist; 410 when the blob row exists but its bytes cannot be read.

    # Exactly one of blob_uuid / generator must be set.
    if bool(req.blob_uuid) == bool(req.generator):
        raise HTTPException(
            status_code=400,
            detail="provide exactly one of blob_uuid or generator",
        )
    try:
        placement_path = normalize_placement(req.placement_path)
    except ValueError as e:
        raise HTTPException(status_code=400, detail=str(e)) from e

    # A fresh random slug is the callback token embedded in the artifact.
    slug = token_urlsafe(16)
    ctx = CanaryContext(
        callback_token=slug, http_base=_http_base(), dns_zone=_dns_zone(),
    )

    if req.generator:
        if req.generator not in KNOWN_GENERATORS:
            raise HTTPException(
                status_code=400,
                detail=f"unknown generator: {req.generator!r}",
            )
        generator = get_generator(req.generator)
        artifact = generator.generate(ctx)
        instrumenter_name = None
    else:
        # Upload-driven token.
        blob = await repo.get_canary_blob(req.blob_uuid)
        if blob is None:
            raise HTTPException(status_code=404, detail="blob not found")
        try:
            blob_bytes = storage.read_blob(blob["sha256"])
        except FileNotFoundError as e:
            raise HTTPException(
                status_code=410,
                detail="blob bytes missing on disk; please re-upload",
            ) from e
        instrumenter_name = pick_instrumenter_for_mime(blob["content_type"])
        ins = get_instrumenter(instrumenter_name)
        try:
            artifact = ins.instrument(blob_bytes, ctx, target_path=placement_path)
        except InstrumenterRejectedError as e:
            raise HTTPException(status_code=400, detail=str(e)) from e

    # The normalized placement path always overrides the artifact's own path.
    artifact.path = placement_path
    token_uuid = str(uuid4())
    kind = req.kind
    # The row is persisted before planting; the slug is stored as both the
    # callback token and the secret seed.
    await repo.create_canary_token({
        "uuid": token_uuid,
        "kind": kind,
        "decky_name": req.decky_name,
        "blob_uuid": req.blob_uuid,
        "instrumenter": instrumenter_name,
        "generator": req.generator,
        "placement_path": placement_path,
        "callback_token": slug,
        "secret_seed": slug,
        "created_by": admin.get("uuid", "unknown"),
        "state": "planted",
    })
    await planter.plant(req.decky_name, artifact, token_uuid=token_uuid, repo=repo)
    # Re-read the row after planting and build the response from it.
    row = await repo.get_canary_token(token_uuid)
    return _row_to_response(row)
|
||||
|
||||
|
||||
# ---------------------------------------------------------- list / detail
|
||||
|
||||
@router.get(
    "",
    response_model=CanaryTokensResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_tokens(
    decky_name: str | None = Query(default=None),
    state: str | None = Query(default=None),
    kind: str | None = Query(default=None),
    viewer: dict = Depends(require_viewer),
) -> CanaryTokensResponse:
    # The three optional filters are passed straight to the repository;
    # `total` is the number of rows it returned.
    records = await repo.list_canary_tokens(
        decky_name=decky_name, state=state, kind=kind,
    )
    tokens = [_row_to_response(record) for record in records]
    return CanaryTokensResponse(tokens=tokens, total=len(records))
|
||||
|
||||
|
||||
@router.get(
    "/{uuid}",
    response_model=CanaryTokenResponse,
    responses={
        404: {"description": "Token not found"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_get_token(
    uuid: str,
    viewer: dict = Depends(require_viewer),
) -> CanaryTokenResponse:
    # Single-token lookup; a missing row is a 404.
    record = await repo.get_canary_token(uuid)
    if record is not None:
        return _row_to_response(record)
    raise HTTPException(status_code=404, detail="token not found")
|
||||
|
||||
|
||||
# ---------------------------------------------------------- preview
|
||||
|
||||
@router.get(
    "/{uuid}/preview",
    response_class=Response,
    responses={
        200: {"description": "Instrumented bytes (raw)"},
        404: {"description": "Token not found"},
        409: {"description": "Token has no preview-able bytes (passive aws_creds, etc.)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_preview_token(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> Response:
    """Return the instrumented bytes the planter dropped on the decky.

    Re-derived deterministically from the row's ``secret_seed`` —
    we don't store the rendered bytes server-side. Lets operators
    diff-check what we wrote without ``docker exec``-ing into the
    container.
    """
    token_row = await repo.get_canary_token(uuid)
    if token_row is None:
        raise HTTPException(status_code=404, detail="token not found")
    # The context is rebuilt from the stored callback token plus the current
    # HTTP base / DNS zone settings.
    ctx = CanaryContext(
        callback_token=token_row["callback_token"],
        http_base=_http_base(),
        dns_zone=_dns_zone(),
    )

    # Generator-backed token: regenerate the artifact and return it.
    generator_name = token_row["generator"]
    if generator_name:
        rendered = get_generator(generator_name).generate(ctx)
        return Response(content=rendered.content, media_type="application/octet-stream")

    # Otherwise both a blob and an instrumenter are needed to rebuild it.
    if not (token_row["blob_uuid"] and token_row["instrumenter"]):
        raise HTTPException(
            status_code=409,
            detail="token has neither generator nor instrumenter — nothing to preview",
        )

    source_blob = await repo.get_canary_blob(token_row["blob_uuid"])
    if source_blob is None:
        raise HTTPException(
            status_code=409,
            detail="blob has been deleted; preview unavailable",
        )
    try:
        raw_bytes = storage.read_blob(source_blob["sha256"])
    except FileNotFoundError as e:
        raise HTTPException(
            status_code=409,
            detail="blob bytes missing on disk",
        ) from e
    instrumenter = get_instrumenter(token_row["instrumenter"])
    try:
        rendered = instrumenter.instrument(
            raw_bytes, ctx, target_path=token_row["placement_path"],
        )
    except InstrumenterRejectedError as e:
        raise HTTPException(status_code=409, detail=str(e)) from e
    return Response(content=rendered.content, media_type="application/octet-stream")
|
||||
|
||||
|
||||
# ---------------------------------------------------------- triggers
|
||||
|
||||
@router.get(
    "/{uuid}/triggers",
    response_model=CanaryTriggersResponse,
    responses={
        404: {"description": "Token not found"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_triggers(
    uuid: str,
    limit: int = Query(default=100, ge=1, le=500),
    offset: int = Query(default=0, ge=0),
    viewer: dict = Depends(require_viewer),
) -> CanaryTriggersResponse:
    # The token must exist before its trigger log is read; otherwise 404.
    token_row = await repo.get_canary_token(uuid)
    if token_row is None:
        raise HTTPException(status_code=404, detail="token not found")
    page = await repo.list_canary_triggers(uuid, limit=limit, offset=offset)
    triggers = [_trigger_row_to_response(entry) for entry in page]
    # `total` is the length of the fetched page, not a count across pages.
    return CanaryTriggersResponse(triggers=triggers, total=len(page))
|
||||
|
||||
|
||||
# ---------------------------------------------------------- revoke
|
||||
|
||||
@router.delete(
    "/{uuid}",
    response_model=MessageResponse,
    responses={
        404: {"description": "Token not found"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_revoke_token(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> MessageResponse:
    # Look the token up to learn where it was planted; a missing row is a 404.
    token_row = await repo.get_canary_token(uuid)
    if token_row is None:
        raise HTTPException(status_code=404, detail="token not found")
    # The planter is given the decky name and placement path from the row.
    decky = token_row["decky_name"]
    planted_at = token_row["placement_path"]
    await planter.revoke(decky, planted_at, token_uuid=uuid, repo=repo)
    return MessageResponse(message="ok")
|
||||
0
decnet/web/router/config/__init__.py
Normal file
0
decnet/web/router/config/__init__.py
Normal file
124
decnet/web/router/config/api_get_config.py
Normal file
124
decnet/web/router/config/api_get_config.py
Normal file
@@ -0,0 +1,124 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.env import DECNET_DEVELOPER
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import UserResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_DEFAULT_DEPLOYMENT_LIMIT = 10
|
||||
_DEFAULT_MUTATION_INTERVAL = "30m"
|
||||
|
||||
# Cache config_limits / config_globals reads — these change on rare admin
|
||||
# writes but get polled constantly by the UI and locust.
|
||||
_STATE_TTL = 5.0
|
||||
_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
|
||||
_state_locks: dict[str, asyncio.Lock] = {}
|
||||
|
||||
# Admin branch fetched repo.list_users() on every /config call — cache 5s,
|
||||
# invalidate on user create/update/delete so the admin UI stays consistent.
|
||||
_USERS_TTL = 5.0
|
||||
_users_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
|
||||
_users_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_state_cache() -> None:
    """Wipe every cache and lock held by this module (test helper).

    Clears the per-name state cache and its locks, and resets the users
    cache and its lazily created lock back to their initial values.
    """
    global _users_cache, _users_lock

    # The lock registry is emptied together with the cache: a lock that was
    # created under a previous event loop must not be reused, otherwise the
    # next test deadlocks.
    for registry in (_state_cache, _state_locks):
        registry.clear()

    _users_cache, _users_lock = (None, 0.0), None
|
||||
|
||||
|
||||
def invalidate_list_users_cache() -> None:
    """Forget the cached user list so the next read goes back to the repo.

    Only the cached value is reset; the lock guarding refills is untouched.
    """
    global _users_cache
    # A ``None`` value is what the cached reader treats as "nothing cached".
    expired_entry = (None, 0.0)
    _users_cache = expired_entry
|
||||
|
||||
|
||||
async def _get_list_users_cached() -> list[dict[str, Any]]:
    """Return ``repo.list_users()``, served from a short-lived module cache.

    A cached list younger than ``_USERS_TTL`` seconds is returned as-is.
    Otherwise the module lock is taken, the cache is checked once more (a
    caller that waited on the lock can reuse what the previous holder
    stored), and only then is the repository queried and the result stored
    with a fresh monotonic timestamp.
    """
    global _users_cache, _users_lock

    cached, stamped_at = _users_cache
    age = time.monotonic() - stamped_at
    if cached is not None and age < _USERS_TTL:
        return cached

    # The lock is created on first use rather than at import time.
    if _users_lock is None:
        _users_lock = asyncio.Lock()

    async with _users_lock:
        cached, stamped_at = _users_cache
        age = time.monotonic() - stamped_at
        if cached is not None and age < _USERS_TTL:
            return cached

        fresh = await repo.list_users()
        _users_cache = (fresh, time.monotonic())
        return fresh
|
||||
|
||||
|
||||
async def _get_state_cached(name: str) -> Optional[dict[str, Any]]:
    """Return ``repo.get_state(name)``, cached per name for ``_STATE_TTL`` s.

    Freshness is judged on the cache *entry*, not on the stored value, so a
    state that came back as ``None`` is cached like any other result. Each
    name has its own lock; after acquiring it the cache is consulted again
    before the repository is queried.
    """

    def _unexpired(slot: Optional[tuple[Optional[dict[str, Any]], float]]) -> bool:
        # The clock is sampled on every check, whether or not a slot exists.
        now = time.monotonic()
        return slot is not None and now - slot[1] < _STATE_TTL

    slot = _state_cache.get(name)
    if _unexpired(slot):
        return slot[0]

    guard = _state_locks.setdefault(name, asyncio.Lock())
    async with guard:
        slot = _state_cache.get(name)
        if _unexpired(slot):
            return slot[0]

        loaded = await repo.get_state(name)
        _state_cache[name] = (loaded, time.monotonic())
        return loaded
|
||||
|
||||
|
||||
# GET /config — effective configuration for the calling user.
# Every caller receives their role, the deployment limit and the global
# mutation interval (module defaults apply when the stored state is missing
# or empty). Admin callers additionally receive the user list.
@router.get(
    "/config",
    tags=["Configuration"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.get_config")
async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
    limits_state = await _get_state_cached("config_limits")
    globals_state = await _get_state_cached("config_globals")

    if limits_state:
        deployment_limit = limits_state.get("deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT)
    else:
        deployment_limit = _DEFAULT_DEPLOYMENT_LIMIT

    if globals_state:
        global_mutation_interval = globals_state.get(
            "global_mutation_interval", _DEFAULT_MUTATION_INTERVAL
        )
    else:
        global_mutation_interval = _DEFAULT_MUTATION_INTERVAL

    payload = {
        "role": user["role"],
        "deployment_limit": deployment_limit,
        "global_mutation_interval": global_mutation_interval,
    }

    if user["role"] == "admin":
        user_rows = await _get_list_users_cached()
        serialised = []
        for row in user_rows:
            # Round-trip through UserResponse so only its four fields are
            # copied out of the stored row.
            model = UserResponse(
                uuid=row["uuid"],
                username=row["username"],
                role=row["role"],
                must_change_password=row["must_change_password"],
            )
            serialised.append(model.model_dump())
        payload["users"] = serialised
        # NOTE(review): nesting reconstructed from a whitespace-stripped
        # listing — confirm the developer-mode flag belongs inside the admin
        # branch.
        if DECNET_DEVELOPER:
            payload["developer_mode"] = True

    return payload
|
||||
144
decnet/web/router/config/api_manage_users.py
Normal file
144
decnet/web/router/config/api_manage_users.py
Normal file
@@ -0,0 +1,144 @@
|
||||
import uuid as _uuid
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.auth import ahash_password
|
||||
from decnet.web.dependencies import require_admin, invalidate_user_cache, repo
|
||||
from decnet.web.router.config.api_get_config import invalidate_list_users_cache
|
||||
from decnet.web.db.models import (
|
||||
CreateUserRequest,
|
||||
MessageResponse,
|
||||
ResetUserPasswordRequest,
|
||||
UpdateUserRoleRequest,
|
||||
UserResponse,
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# POST /config/users — create a user account (admin dependency).
# Answers 409 when the username lookup finds an existing user. The new
# account is always stored with must_change_password=True, and the cached
# user list is invalidated after the insert.
@router.post(
    "/config/users",
    tags=["Configuration"],
    response_model=UserResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        409: {"description": "Username already exists"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.create_user")
async def api_create_user(
    req: CreateUserRequest,
    admin: dict = Depends(require_admin),
) -> UserResponse:
    if await repo.get_user_by_username(req.username):
        raise HTTPException(status_code=409, detail="Username already exists")

    new_uuid = str(_uuid.uuid4())
    record = {
        "uuid": new_uuid,
        "username": req.username,
        "password_hash": await ahash_password(req.password),
        "role": req.role,
        "must_change_password": True,  # nosec B105 — not a password
    }
    await repo.create_user(record)
    invalidate_list_users_cache()

    # The response is assembled from the request, not re-read from the repo.
    return UserResponse(
        uuid=new_uuid,
        username=req.username,
        role=req.role,
        must_change_password=True,
    )
|
||||
|
||||
|
||||
# DELETE /config/users/{user_uuid} — remove a user account (admin dependency).
# An admin may not delete their own account (403). A falsy result from the
# repository delete is reported as 404. On success both the per-user cache
# entry and the cached user list are invalidated.
@router.delete(
    "/config/users/{user_uuid}",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required / cannot delete self"},
        404: {"description": "User not found"},
    },
)
@_traced("api.delete_user")
async def api_delete_user(
    user_uuid: str,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    targeting_self = user_uuid == admin["uuid"]
    if targeting_self:
        raise HTTPException(status_code=403, detail="Cannot delete your own account")

    if not await repo.delete_user(user_uuid):
        raise HTTPException(status_code=404, detail="User not found")

    invalidate_user_cache(user_uuid)
    invalidate_list_users_cache()
    return {"message": "User deleted"}
|
||||
|
||||
|
||||
# PUT /config/users/{user_uuid}/role — change another user's role (admin
# dependency). Changing one's own role is refused with 403; an unknown uuid
# yields 404. After the update both the per-user cache entry and the cached
# user list are invalidated.
@router.put(
    "/config/users/{user_uuid}/role",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required / cannot change own role"},
        404: {"description": "User not found"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.update_user_role")
async def api_update_user_role(
    user_uuid: str,
    req: UpdateUserRoleRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    targeting_self = user_uuid == admin["uuid"]
    if targeting_self:
        raise HTTPException(status_code=403, detail="Cannot change your own role")

    # Existence check only — the fetched record itself is not used further.
    if not await repo.get_user_by_uuid(user_uuid):
        raise HTTPException(status_code=404, detail="User not found")

    await repo.update_user_role(user_uuid, req.role)
    invalidate_user_cache(user_uuid)
    invalidate_list_users_cache()
    return {"message": "User role updated"}
|
||||
|
||||
|
||||
# PUT /config/users/{user_uuid}/reset-password — set a new password for a
# user (admin dependency). An unknown uuid yields 404. The account is stored
# with must_change_password=True, then both the per-user cache entry and the
# cached user list are invalidated.
@router.put(
    "/config/users/{user_uuid}/reset-password",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "User not found"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.reset_user_password")
async def api_reset_user_password(
    user_uuid: str,
    req: ResetUserPasswordRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    # Existence check only — the fetched record itself is not used further.
    if not await repo.get_user_by_uuid(user_uuid):
        raise HTTPException(status_code=404, detail="User not found")

    new_hash = await ahash_password(req.new_password)
    await repo.update_user_password(user_uuid, new_hash, must_change_password=True)

    invalidate_user_cache(user_uuid)
    invalidate_list_users_cache()
    return {"message": "Password reset successfully"}
|
||||
29
decnet/web/router/config/api_reinit.py
Normal file
29
decnet/web/router/config/api_reinit.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.env import DECNET_DEVELOPER
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.db.models import PurgeResponse
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# DELETE /config/reinit — purge logs and bounties (admin dependency).
# Refused with 403 unless DECNET_DEVELOPER is truthy. The value returned by
# the repository purge is passed through under the "deleted" key.
@router.delete(
    "/config/reinit",
    tags=["Configuration"],
    response_model=PurgeResponse,
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required or developer mode not enabled"},
    },
)
@_traced("api.reinit")
async def api_reinit(admin: dict = Depends(require_admin)) -> dict:
    if DECNET_DEVELOPER:
        purged = await repo.purge_logs_and_bounties()
        return {"message": "Data purged", "deleted": purged}

    raise HTTPException(status_code=403, detail="Developer mode is not enabled")
|
||||
50
decnet/web/router/config/api_update_config.py
Normal file
50
decnet/web/router/config/api_update_config.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest, MessageResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# PUT /config/deployment-limit — store a new deployment limit (admin
# dependency) under the "config_limits" state key.
#
# GET /config serves "config_limits" from a short-lived in-process cache
# (api_get_config._state_cache). Without dropping that entry an admin who
# saves a new limit keeps seeing the previous value until the entry expires,
# unlike the user-management endpoints, which invalidate their cache on
# every write. The entry is therefore removed right after the write.
@router.put(
    "/config/deployment-limit",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.update_deployment_limit")
async def api_update_deployment_limit(
    req: DeploymentLimitRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    # Imported at call time, and before the write, so an import problem can
    # never leave a stored value with a stale cache entry behind it.
    from decnet.web.router.config.api_get_config import _state_cache

    await repo.set_state("config_limits", {"deployment_limit": req.deployment_limit})
    # Only this process's cache is affected; pop() is a no-op when the key
    # was never cached.
    _state_cache.pop("config_limits", None)
    return {"message": "Deployment limit updated"}
|
||||
|
||||
|
||||
# PUT /config/global-mutation-interval — store a new global mutation
# interval (admin dependency) under the "config_globals" state key.
#
# GET /config serves "config_globals" from a short-lived in-process cache
# (api_get_config._state_cache). Without dropping that entry an admin who
# saves a new interval keeps seeing the previous value until the entry
# expires, unlike the user-management endpoints, which invalidate their
# cache on every write. The entry is therefore removed right after the write.
@router.put(
    "/config/global-mutation-interval",
    tags=["Configuration"],
    response_model=MessageResponse,
    responses={
        400: {"description": "Bad Request (e.g. malformed JSON)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.update_global_mutation_interval")
async def api_update_global_mutation_interval(
    req: GlobalMutationIntervalRequest,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    # Imported at call time, and before the write, so an import problem can
    # never leave a stored value with a stale cache entry behind it.
    from decnet.web.router.config.api_get_config import _state_cache

    await repo.set_state(
        "config_globals",
        {"global_mutation_interval": req.global_mutation_interval},
    )
    # Only this process's cache is affected; pop() is a no-op when the key
    # was never cached.
    _state_cache.pop("config_globals", None)
    return {"message": "Global mutation interval updated"}
|
||||
0
decnet/web/router/credential_reuse/__init__.py
Normal file
0
decnet/web/router/credential_reuse/__init__.py
Normal file
@@ -0,0 +1,74 @@
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import CredentialReuseResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/credential-reuse",
    response_model=CredentialReuseResponse,
    tags=["Credentials"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_credential_reuse")
async def list_credential_reuse(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    min_target_count: int = Query(2, ge=2, le=2147483647),
    secret_kind: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paged list of credential-reuse findings ordered by target_count desc.

    Each row collapses every Credential capture sharing the same secret
    + principal across distinct (decky, service) pairs into a single
    finding with the union of attacker UUIDs/IPs and reach.
    """
    # Placeholder strings ("null", "undefined", empty) are treated exactly
    # like an absent filter.
    absent_markers = (None, "null", "NULL", "undefined", "")
    kind_filter = None if secret_kind in absent_markers else secret_kind

    total, findings = await repo.list_credential_reuses(
        limit=limit,
        offset=offset,
        min_target_count=min_target_count,
        secret_kind=kind_filter,
    )
    # limit/offset are echoed back as received.
    return {"total": total, "limit": limit, "offset": offset, "data": findings}
|
||||
|
||||
|
||||
@router.get(
    "/credential-reuse/{reuse_id}",
    tags=["Credentials"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "CredentialReuse row not found"},
    },
)
@_traced("api.get_credential_reuse")
async def get_credential_reuse(
    reuse_id: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """One credential-reuse finding by UUID, or 404."""
    finding = await repo.get_credential_reuse_by_id(reuse_id)
    # Only a literal None means "not found"; any other value is returned.
    if finding is not None:
        return finding
    raise HTTPException(status_code=404, detail="credential_reuse not found")
|
||||
0
decnet/web/router/credentials/__init__.py
Normal file
0
decnet/web/router/credentials/__init__.py
Normal file
103
decnet/web/router/credentials/api_get_credentials.py
Normal file
103
decnet/web/router/credentials/api_get_credentials.py
Normal file
@@ -0,0 +1,103 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import CredentialsResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Mirror the Bounty cache pattern: the dashboard hits the unfiltered
|
||||
# default page constantly. Filtered requests bypass — staleness matters
|
||||
# when an operator is searching for a specific principal/IP.
|
||||
_CRED_TTL = 5.0
|
||||
_DEFAULT_LIMIT = 50
|
||||
_DEFAULT_OFFSET = 0
|
||||
_cred_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
|
||||
_cred_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_credentials_cache() -> None:
    """Return the credentials cache and its lock to their initial values.

    Afterwards nothing is cached, and the lock is re-created on the next
    cached read.
    """
    global _cred_cache, _cred_lock
    _cred_cache, _cred_lock = (None, 0.0), None
|
||||
|
||||
|
||||
async def _get_credentials_default_cached() -> dict[str, Any]:
    """Return the unfiltered default credentials page, cached briefly.

    A cached payload younger than ``_CRED_TTL`` seconds is returned as-is.
    Otherwise the module lock is taken, the cache is checked once more, and
    the page plus the total count are fetched from the repository using the
    default limit/offset and no filters. The assembled payload is stored
    with a fresh monotonic timestamp.
    """
    global _cred_cache, _cred_lock

    payload, stamped_at = _cred_cache
    age = time.monotonic() - stamped_at
    if payload is not None and age < _CRED_TTL:
        return payload

    # The lock is created on first use rather than at import time.
    if _cred_lock is None:
        _cred_lock = asyncio.Lock()

    async with _cred_lock:
        payload, stamped_at = _cred_cache
        age = time.monotonic() - stamped_at
        if payload is not None and age < _CRED_TTL:
            return payload

        no_filters = {"search": None, "service": None, "attacker_ip": None}
        rows = await repo.get_credentials(
            limit=_DEFAULT_LIMIT, offset=_DEFAULT_OFFSET, **no_filters
        )
        count = await repo.get_total_credentials(**no_filters)
        payload = {
            "total": count,
            "limit": _DEFAULT_LIMIT,
            "offset": _DEFAULT_OFFSET,
            "data": rows,
        }
        _cred_cache = (payload, time.monotonic())
        return payload
|
||||
|
||||
|
||||
@router.get(
    "/credentials",
    response_model=CredentialsResponse,
    tags=["Credentials"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.get_credentials")
async def get_credentials(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    search: Optional[str] = None,
    service: Optional[str] = None,
    attacker_ip: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve captured credentials (deduped by attacker/decky/service/secret)."""
    # Placeholder strings ("null", "undefined", empty) are treated exactly
    # like an absent filter.
    absent_markers = (None, "null", "NULL", "undefined", "")
    s, svc, aip = (
        None if raw in absent_markers else raw
        for raw in (search, service, attacker_ip)
    )

    # Only the exact default request (no filters, default paging) is served
    # from the cache; everything else goes straight to the repository.
    unfiltered = s is None and svc is None and aip is None
    default_paging = limit == _DEFAULT_LIMIT and offset == _DEFAULT_OFFSET
    if unfiltered and default_paging:
        return await _get_credentials_default_cached()

    rows = await repo.get_credentials(
        limit=limit, offset=offset, search=s, service=svc, attacker_ip=aip,
    )
    count = await repo.get_total_credentials(
        search=s, service=svc, attacker_ip=aip,
    )
    return {"total": count, "limit": limit, "offset": offset, "data": rows}
|
||||
@@ -1,14 +1,18 @@
|
||||
import logging
|
||||
import os
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT, log
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
|
||||
from decnet.engine import deploy as _deploy
|
||||
from decnet.ini_loader import load_ini_from_string
|
||||
from decnet.network import detect_interface, detect_subnet, get_host_ip
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.web.db.models import DeployIniRequest
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
from decnet.web.db.models import DeployIniRequest, DeployResponse
|
||||
from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config
|
||||
|
||||
log = get_logger("api")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -16,15 +20,19 @@ router = APIRouter()
|
||||
@router.post(
|
||||
"/deckies/deploy",
|
||||
tags=["Fleet Management"],
|
||||
response_model=DeployResponse,
|
||||
responses={
|
||||
400: {"description": "Bad Request (e.g. malformed JSON)"},
|
||||
401: {"description": "Could not validate credentials"},
|
||||
403: {"description": "Insufficient permissions"},
|
||||
409: {"description": "Configuration conflict (e.g. invalid IP allocation or network mismatch)"},
|
||||
422: {"description": "Invalid INI config or schema validation error"},
|
||||
500: {"description": "Deployment failed"}
|
||||
500: {"description": "Deployment failed"},
|
||||
502: {"description": "Partial swarm deploy failure — one or more worker hosts returned an error"},
|
||||
}
|
||||
)
|
||||
async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]:
|
||||
@_traced("api.deploy_deckies")
|
||||
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
|
||||
from decnet.fleet import build_deckies_from_ini
|
||||
|
||||
try:
|
||||
@@ -38,16 +46,20 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
|
||||
state_dict = await repo.get_state("deployment")
|
||||
ingest_log_file = os.environ.get("DECNET_INGEST_LOG_FILE")
|
||||
|
||||
config: DecnetConfig | None = None
|
||||
if state_dict:
|
||||
config = DecnetConfig(**state_dict["config"])
|
||||
subnet_cidr = ini.subnet or config.subnet
|
||||
gateway = ini.gateway or config.gateway
|
||||
host_ip = get_host_ip(config.interface)
|
||||
iface = config.interface
|
||||
host_ip = get_host_ip(iface)
|
||||
# Always sync config log_file with current API ingestion target
|
||||
if ingest_log_file:
|
||||
config.log_file = ingest_log_file
|
||||
else:
|
||||
# If no state exists, we need to infer network details from the INI or the host.
|
||||
# No state yet — infer network details from the INI or the host. We
|
||||
# defer instantiating DecnetConfig until after build_deckies_from_ini
|
||||
# because DecnetConfig.deckies has min_length=1.
|
||||
try:
|
||||
iface = ini.interface or detect_interface()
|
||||
subnet_cidr, gateway = ini.subnet, ini.gateway
|
||||
@@ -62,16 +74,6 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
|
||||
detail=f"Network configuration conflict: {e}. "
|
||||
"Add a [general] section with interface=, net=, and gw= to the INI."
|
||||
)
|
||||
config = DecnetConfig(
|
||||
mode="unihost",
|
||||
interface=iface,
|
||||
subnet=subnet_cidr,
|
||||
gateway=gateway,
|
||||
deckies=[],
|
||||
log_file=ingest_log_file,
|
||||
ipvlan=False,
|
||||
mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL
|
||||
)
|
||||
|
||||
try:
|
||||
new_decky_configs = build_deckies_from_ini(
|
||||
@@ -81,26 +83,99 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
|
||||
log.debug("deploy: build_deckies_from_ini rejected input: %s", e)
|
||||
raise HTTPException(status_code=409, detail=str(e))
|
||||
|
||||
# Merge deckies
|
||||
existing_deckies_map = {d.name: d for d in config.deckies}
|
||||
for new_decky in new_decky_configs:
|
||||
existing_deckies_map[new_decky.name] = new_decky
|
||||
if config is None:
|
||||
config = DecnetConfig(
|
||||
mode="unihost",
|
||||
interface=iface,
|
||||
subnet=subnet_cidr,
|
||||
gateway=gateway,
|
||||
deckies=new_decky_configs,
|
||||
log_file=ingest_log_file,
|
||||
ipvlan=False,
|
||||
mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL,
|
||||
)
|
||||
|
||||
config.deckies = list(existing_deckies_map.values())
|
||||
# The INI is the source of truth for *which* deckies exist this deploy.
|
||||
# The old "merge with prior state" behaviour meant submitting `[decky1]`
|
||||
# after a 3-decky run silently redeployed decky2/decky3 too — and then
|
||||
# collided on their stale IPs ("Address already in use"). Full replace
|
||||
# matches what the operator sees in the submitted config.
|
||||
config.deckies = list(new_decky_configs)
|
||||
|
||||
# We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`.
|
||||
limits_state = await repo.get_state("config_limits")
|
||||
deployment_limit = limits_state.get("deployment_limit", 10) if limits_state else 10
|
||||
if len(config.deckies) > deployment_limit:
|
||||
raise HTTPException(
|
||||
status_code=409,
|
||||
detail=f"Deployment would result in {len(config.deckies)} deckies, "
|
||||
f"exceeding the configured limit of {deployment_limit}",
|
||||
)
|
||||
|
||||
# Auto-mode: if we're a master with at least one enrolled/active SWARM
|
||||
# host, shard the deckies across those workers instead of spawning docker
|
||||
# containers on the master itself. Round-robin assignment over deckies
|
||||
# that don't already carry a host_uuid (state from a prior swarm deploy
|
||||
# keeps its original assignment).
|
||||
swarm_hosts: list[dict] = []
|
||||
if os.environ.get("DECNET_MODE", "master").lower() == "master":
|
||||
swarm_hosts = [
|
||||
h for h in await repo.list_swarm_hosts()
|
||||
if h.get("status") in ("active", "enrolled") and h.get("address")
|
||||
]
|
||||
|
||||
if swarm_hosts:
|
||||
# Carry-over from a prior deployment may reference a host_uuid that's
|
||||
# since been decommissioned / re-enrolled at a new uuid. Drop any
|
||||
# assignment that isn't in the currently-reachable set, then round-
|
||||
# robin-fill the blanks — otherwise dispatch 404s on a dead uuid.
|
||||
live_uuids = {h["uuid"] for h in swarm_hosts}
|
||||
for d in config.deckies:
|
||||
if d.host_uuid and d.host_uuid not in live_uuids:
|
||||
d.host_uuid = None
|
||||
unassigned = [d for d in config.deckies if not d.host_uuid]
|
||||
for i, d in enumerate(unassigned):
|
||||
d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
|
||||
config = config.model_copy(update={"mode": "swarm"})
|
||||
|
||||
try:
|
||||
result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
log.exception("swarm-auto deploy dispatch failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.")
|
||||
|
||||
await repo.set_state("deployment", {
|
||||
"config": config.model_dump(),
|
||||
"compose_path": state_dict["compose_path"] if state_dict else "",
|
||||
})
|
||||
|
||||
failed = [r for r in result.results if not r.ok]
|
||||
if failed:
|
||||
detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed)
|
||||
raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}")
|
||||
return {
|
||||
"message": f"Deckies deployed across {len(result.results)} swarm host(s)",
|
||||
"mode": "swarm",
|
||||
}
|
||||
|
||||
# Unihost path — docker-compose on the master itself.
|
||||
# NB: the JSON state file (decnet-state.json) and fleet_deckies DB rows
|
||||
# are both written *inside* _deploy(config) — engine.deployer is the
|
||||
# single shared sink for every fleet-creation path (CLI deploy, this
|
||||
# unihost API path, and per-worker SWARM agent deploys). Do not
|
||||
# duplicate save_state / fleet upserts here.
|
||||
try:
|
||||
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
|
||||
_deploy(config)
|
||||
|
||||
# Persist new state to DB
|
||||
new_state_payload = {
|
||||
"config": config.model_dump(),
|
||||
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
|
||||
}
|
||||
await repo.set_state("deployment", new_state_payload)
|
||||
except Exception as e:
|
||||
logging.getLogger("decnet.web.api").exception("Deployment failed: %s", e)
|
||||
log.exception("Deployment failed: %s", e)
|
||||
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
|
||||
|
||||
return {"message": "Deckies deployed successfully"}
|
||||
return {"message": "Deckies deployed successfully", "mode": "unihost"}
|
||||
|
||||
@@ -1,13 +1,48 @@
|
||||
from typing import Any
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# /deckies is full fleet inventory — polled by the UI and under locust.
|
||||
# Fleet state changes on deploy/teardown (seconds to minutes); a 5s window
|
||||
# collapses the read storm into one DB hit.
|
||||
_DECKIES_TTL = 5.0
|
||||
_deckies_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
|
||||
_deckies_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_deckies_cache() -> None:
    """Return the deckies cache and its lock to their initial values.

    Afterwards nothing is cached, and the lock is re-created on the next
    cached read.
    """
    global _deckies_cache, _deckies_lock
    _deckies_cache, _deckies_lock = (None, 0.0), None
|
||||
|
||||
|
||||
async def _get_deckies_cached() -> list[dict[str, Any]]:
    """Return ``repo.get_deckies()``, served from a short-lived module cache.

    A cached list younger than ``_DECKIES_TTL`` seconds is returned as-is.
    Otherwise the module lock is taken, the cache is checked once more (a
    caller that waited on the lock can reuse what the previous holder
    stored), and only then is the repository queried and the result stored
    with a fresh monotonic timestamp.
    """
    global _deckies_cache, _deckies_lock

    inventory, stamped_at = _deckies_cache
    age = time.monotonic() - stamped_at
    if inventory is not None and age < _DECKIES_TTL:
        return inventory

    # The lock is created on first use rather than at import time.
    if _deckies_lock is None:
        _deckies_lock = asyncio.Lock()

    async with _deckies_lock:
        inventory, stamped_at = _deckies_cache
        age = time.monotonic() - stamped_at
        if inventory is not None and age < _DECKIES_TTL:
            return inventory

        fresh = await repo.get_deckies()
        _deckies_cache = (fresh, time.monotonic())
        return fresh
|
||||
|
||||
|
||||
@router.get("/deckies", tags=["Fleet Management"],
|
||||
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
|
||||
async def get_deckies(current_user: str = Depends(get_current_user)) -> list[dict[str, Any]]:
|
||||
return await repo.get_deckies()
|
||||
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
|
||||
@_traced("api.get_deckies")
|
||||
async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]:
|
||||
return await _get_deckies_cached()
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import os
|
||||
from fastapi import APIRouter, Depends, HTTPException, Path
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.mutator import mutate_decky
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.web.db.models import MessageResponse
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -10,11 +12,18 @@ router = APIRouter()
|
||||
@router.post(
|
||||
"/deckies/{decky_name}/mutate",
|
||||
tags=["Fleet Management"],
|
||||
responses={401: {"description": "Could not validate credentials"}, 404: {"description": "Decky not found"}}
|
||||
response_model=MessageResponse,
|
||||
responses={
|
||||
401: {"description": "Could not validate credentials"},
|
||||
403: {"description": "Insufficient permissions"},
|
||||
404: {"description": "Decky not found"},
|
||||
422: {"description": "Path parameter validation error (decky_name must match ^[a-z0-9\\-]{1,64}$)"},
|
||||
}
|
||||
)
|
||||
@_traced("api.mutate_decky")
|
||||
async def api_mutate_decky(
|
||||
decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"),
|
||||
current_user: str = Depends(get_current_user),
|
||||
admin: dict = Depends(require_admin),
|
||||
) -> dict[str, str]:
|
||||
if os.environ.get("DECNET_CONTRACT_TEST") == "true":
|
||||
return {"message": f"Successfully mutated {decky_name} (Contract Test Mock)"}
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.config import DecnetConfig
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.web.db.models import MutateIntervalRequest
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
from decnet.web.db.models import MessageResponse, MutateIntervalRequest
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@@ -16,14 +17,17 @@ def _parse_duration(s: str) -> int:
|
||||
|
||||
|
||||
@router.put("/deckies/{decky_name}/mutate-interval", tags=["Fleet Management"],
|
||||
response_model=MessageResponse,
|
||||
responses={
|
||||
400: {"description": "Bad Request (e.g. malformed JSON)"},
|
||||
401: {"description": "Could not validate credentials"},
|
||||
403: {"description": "Insufficient permissions"},
|
||||
404: {"description": "No active deployment or decky not found"},
|
||||
422: {"description": "Validation error"}
|
||||
},
|
||||
)
|
||||
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]:
|
||||
@_traced("api.update_mutate_interval")
|
||||
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
|
||||
state_dict = await repo.get_state("deployment")
|
||||
if not state_dict:
|
||||
raise HTTPException(status_code=404, detail="No active deployment")
|
||||
|
||||
0
decnet/web/router/health/__init__.py
Normal file
0
decnet/web/router/health/__init__.py
Normal file
151
decnet/web/router/health/api_get_health.py
Normal file
151
decnet/web/router/health/api_get_health.py
Normal file
@@ -0,0 +1,151 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from fastapi.responses import ORJSONResponse
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import HealthResponse, ComponentHealth
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_CRITICAL_SERVICES = {"database", "docker", "ingestion_worker"}
|
||||
|
||||
# Cache Docker client and health result to avoid hammering the Docker socket
|
||||
_docker_client: Optional[Any] = None
|
||||
_docker_healthy: bool = False
|
||||
_docker_detail: str = ""
|
||||
_docker_last_check: float = 0.0
|
||||
_DOCKER_CHECK_INTERVAL = 5.0 # seconds between actual Docker pings
|
||||
|
||||
# Cache DB liveness result — under load, every request was hitting
|
||||
# repo.get_total_logs() and filling the aiosqlite queue.
|
||||
_db_component: Optional[ComponentHealth] = None
|
||||
_db_last_check: float = 0.0
|
||||
# Lazy-init — an asyncio.Lock bound to a dead event loop deadlocks any
|
||||
# later test running under a fresh loop. Create on first use.
|
||||
_db_lock: Optional[asyncio.Lock] = None
|
||||
_DB_CHECK_INTERVAL = 1.0 # seconds
|
||||
|
||||
|
||||
def _reset_docker_cache() -> None:
    """Put the module-level Docker probe cache back to its initial values.

    Clears the cached client, marks Docker as not healthy with an empty
    detail string, and zeroes the last-check timestamp. Used by tests.
    """
    global _docker_client, _docker_healthy, _docker_detail, _docker_last_check
    _docker_client, _docker_healthy = None, False
    _docker_detail, _docker_last_check = "", 0.0
|
||||
|
||||
|
||||
def _reset_db_cache() -> None:
    """Forget the cached DB liveness result — used by tests.

    Also drops the lazily created lock so the next caller builds a new one.
    """
    global _db_component, _db_last_check, _db_lock
    _db_component = _db_lock = None
    _db_last_check = 0.0
|
||||
|
||||
|
||||
async def _check_database_cached() -> ComponentHealth:
    """Return the database component health, probing at most once per interval.

    A cached result younger than ``_DB_CHECK_INTERVAL`` seconds is returned
    as-is. Otherwise the module lock (created on first use) is taken, the
    cache is re-checked, and ``repo.get_total_logs()`` is awaited as the
    liveness probe. Any exception from the probe is recorded as a
    ``failing`` component whose detail is the exception text.
    """
    global _db_component, _db_last_check, _db_lock

    cached = _db_component
    if cached is not None and time.monotonic() - _db_last_check < _DB_CHECK_INTERVAL:
        return cached

    if _db_lock is None:
        _db_lock = asyncio.Lock()

    async with _db_lock:
        # Re-check under the lock: another waiter may have refreshed the
        # cache while this coroutine was queued.
        cached = _db_component
        if cached is not None and time.monotonic() - _db_last_check < _DB_CHECK_INTERVAL:
            return cached
        try:
            await repo.get_total_logs()
            outcome = ComponentHealth(status="ok")
        except Exception as exc:
            outcome = ComponentHealth(status="failing", detail=str(exc))
        _db_component = outcome
        _db_last_check = time.monotonic()
        return outcome
|
||||
|
||||
|
||||
@router.get(
    "/health",
    response_model=HealthResponse,
    tags=["Observability"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        503: {"model": HealthResponse, "description": "System unhealthy"},
    },
)
@_traced("api.get_health")
async def get_health(user: dict = Depends(require_viewer)) -> Any:
    # Aggregates per-component health (database, background workers, Docker)
    # into a HealthResponse. Responds 503 when the overall status is
    # "unhealthy" and 200 otherwise.
    global _docker_client, _docker_healthy, _docker_detail, _docker_last_check

    components: dict[str, ComponentHealth] = {}

    # Database: served from the short-lived cache, so most requests do not
    # trigger a DB round-trip.
    components["database"] = await _check_database_cached()

    # Background workers. Imported inside the handler rather than at module
    # top — presumably to avoid a circular import with decnet.web.api
    # (NOTE(review): confirm).
    from decnet.web.api import get_background_tasks

    for name, task in get_background_tasks().items():
        if task is None:
            components[name] = ComponentHealth(status="failing", detail="not started")
            continue
        if not task.done():
            components[name] = ComponentHealth(status="ok")
            continue
        # The task has finished: report why.
        if task.cancelled():
            detail = "cancelled"
        else:
            exc = task.exception()
            detail = f"exited: {exc}" if exc else "exited unexpectedly"
        components[name] = ComponentHealth(status="failing", detail=detail)

    # Docker daemon: ping only when the previous check is older than
    # _DOCKER_CHECK_INTERVAL, and reuse the cached client between pings.
    now = time.monotonic()
    if now - _docker_last_check > _DOCKER_CHECK_INTERVAL:
        try:
            import docker

            if _docker_client is None:
                _docker_client = await asyncio.to_thread(docker.from_env)
            await asyncio.to_thread(_docker_client.ping)
        except Exception as exc:
            # Drop the client so the next check builds a new one.
            _docker_client, _docker_healthy, _docker_detail = None, False, str(exc)
        else:
            _docker_healthy, _docker_detail = True, ""
        _docker_last_check = now

    components["docker"] = (
        ComponentHealth(status="ok")
        if _docker_healthy
        else ComponentHealth(status="failing", detail=_docker_detail)
    )

    # Overall status tiers:
    #   healthy   — no component is failing
    #   degraded  — only components outside _CRITICAL_SERVICES are failing
    #   unhealthy — at least one component in _CRITICAL_SERVICES is failing
    failing = {name for name, comp in components.items() if comp.status == "failing"}
    if failing & _CRITICAL_SERVICES:
        overall = "unhealthy"
    elif failing:
        overall = "degraded"
    else:
        overall = "healthy"

    result = HealthResponse(status=overall, components=components)
    return ORJSONResponse(
        content=result.model_dump(),
        status_code=503 if overall == "unhealthy" else 200,
    )
|
||||
0
decnet/web/router/identities/__init__.py
Normal file
0
decnet/web/router/identities/__init__.py
Normal file
143
decnet/web/router/identities/api_events.py
Normal file
143
decnet/web/router/identities/api_events.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""SSE stream of identity-resolution events — one connection per viewer.
|
||||
|
||||
Subscribes to ``identity.>`` on the :class:`~decnet.bus.base.BaseBus` for
|
||||
the duration of the request and forwards each matching bus event as a
|
||||
Server-Sent Event to the browser. Emits a one-shot snapshot on connect
|
||||
(current paginated identity list) so the client doesn't need a separate
|
||||
fetch to initialise.
|
||||
|
||||
Authorization mirrors :mod:`decnet.web.router.topology.api_events` — a
|
||||
JWT passed via the ``?token=`` query parameter (EventSource can't set
|
||||
arbitrary headers) + ``require_stream_viewer`` role gate.
|
||||
|
||||
The endpoint is broadly scoped (every identity event, not per-uuid)
|
||||
because both ``AttackerDetail`` and ``IdentityDetail`` need the same
|
||||
firehose: a bare ``AttackerDetail`` watches for ``identity.formed``
|
||||
events that finally bind its ``identity_id``, and ``IdentityDetail``
|
||||
watches for ``observation.linked`` / ``merged`` / ``unmerged`` against
|
||||
the identity it's rendering. A per-uuid filter would force the client
|
||||
to know its identity before subscribing, which it doesn't always.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import orjson
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.app import get_app_bus
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_stream_viewer
|
||||
from decnet.web.sse_limits import sse_connection_slot
|
||||
|
||||
log = get_logger("api.identities.events")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_KEEPALIVE_SECS = 15.0
|
||||
_SNAPSHOT_LIMIT = 50
|
||||
|
||||
|
||||
def _format_sse(event_name: str, data: dict) -> str:
    """Build one Server-Sent Events frame.

    The frame is an ``event:`` line carrying *event_name*, a ``data:`` line
    carrying *data* serialised with orjson, and a terminating blank line.
    """
    encoded = orjson.dumps(data).decode()
    return f"event: {event_name}\ndata: {encoded}\n\n"
|
||||
|
||||
|
||||
@router.get(
    "/identities/events",
    tags=["Identity Resolution"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of identity-resolution events",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.identities.events")
async def api_identities_events(
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    # Frames sent: a one-shot "snapshot", then one frame per bus event named
    # by _sse_name_for (formed, observation.linked, merged, unmerged), and
    # "error" if the forwarding loop crashes. The forwarded payloads are also
    # reachable through the viewer-gated REST routes (GET /identities/*).
    snapshot = await repo.list_identities(limit=_SNAPSHOT_LIMIT, offset=0)

    async def event_stream() -> AsyncGenerator[str, None]:
        keepalive = ": keepalive\n\n"
        # The connection slot is held for the whole life of the stream.
        async with sse_connection_slot(user["uuid"]):
            yield keepalive
            yield _format_sse("snapshot", {"identities": snapshot})

            bus = await get_app_bus()
            if bus is None:
                # No bus available: keep the connection open with periodic
                # keepalives so the client does not reconnect-storm; it can
                # re-poll the REST API on its own timer.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        break
                    yield keepalive
                return

            subscription = bus.subscribe(f"{_topics.IDENTITY}.>")
            try:
                async with subscription:
                    events = subscription.__aiter__()
                    while not await request.is_disconnected():
                        # Wait for the next bus event, but never longer than
                        # one keepalive period.
                        pending = asyncio.ensure_future(events.__anext__())
                        try:
                            event = await asyncio.wait_for(
                                pending, timeout=_KEEPALIVE_SECS,
                            )
                        except asyncio.TimeoutError:
                            pending.cancel()
                            yield keepalive
                            continue
                        except StopAsyncIteration:
                            break
                        sse_name = _sse_name_for(event.topic)
                        frame = {
                            "topic": event.topic,
                            "type": event.type,
                            "ts": event.ts,
                            "payload": event.payload,
                        }
                        yield _format_sse(sse_name, frame)
            except asyncio.CancelledError:
                # Cancellation ends the stream quietly.
                pass
            except Exception:
                log.exception("identity events stream crashed")
                yield _format_sse("error", {"message": "Stream interrupted"})

    stream_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers=stream_headers,
    )
|
||||
|
||||
|
||||
def _sse_name_for(topic: str) -> str:
    """Map a bus topic to the SSE ``event:`` name sent to the browser.

    The leading ``<IDENTITY>.`` segment is dropped and everything after it
    is kept, dots included:

    ``identity.formed`` → ``formed``
    ``identity.observation.linked`` → ``observation.linked``

    A topic that does not start with that prefix is returned unchanged.
    """
    return topic.removeprefix(f"{_topics.IDENTITY}.")
|
||||
44
decnet/web/router/identities/api_get_identity_detail.py
Normal file
44
decnet/web/router/identities/api_get_identity_detail.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""GET /api/v1/identities/{uuid} — single identity row.
|
||||
|
||||
Soft-merge handling: if the requested UUID has merged_into_uuid set,
|
||||
the repository follows the chain and returns the winner. Callers always
|
||||
receive the canonical identity for any UUID that has ever been part of
|
||||
the merge tree.
|
||||
|
||||
Returns 404 against an empty/unknown UUID — expected response while the
|
||||
clusterer hasn't run yet.
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/identities/{uuid}",
    tags=["Identity Resolution"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Identity not found"},
    },
)
@_traced("api.get_identity_detail")
async def get_identity_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    # Looks the identity up through the repository and answers 404 when
    # nothing comes back.
    record = await repo.get_identity_by_uuid(uuid)
    if record:
        # Live aggregate for the IdentityDetail page: counted from the FK on
        # each request instead of read from the denormalized
        # observation_count field, which can briefly lag the clusterer.
        live_count = await repo.count_observations_for_identity(record["uuid"])
        record["observation_count_live"] = live_count
        return record
    raise HTTPException(status_code=404, detail="Identity not found")
|
||||
35
decnet/web/router/identities/api_list_identities.py
Normal file
35
decnet/web/router/identities/api_list_identities.py
Normal file
@@ -0,0 +1,35 @@
|
||||
"""GET /api/v1/identities — paginated list of resolved identities.
|
||||
|
||||
Returns an empty list while the clusterer hasn't run yet (the
|
||||
identities table ships empty in the schema-only PR). See
|
||||
development/IDENTITY_RESOLUTION.md.
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/identities",
    tags=["Identity Resolution"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_identities")
async def list_identities(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paginated identity list, newest-updated first."""
    # Fetch the requested page first, then the overall row count, and echo
    # the paging parameters back so the client can build its pager.
    rows = await repo.list_identities(limit=limit, offset=offset)
    page: dict[str, Any] = {
        "total": await repo.count_identities(),
        "limit": limit,
        "offset": offset,
        "data": rows,
    }
    return page
|
||||
@@ -0,0 +1,48 @@
|
||||
"""GET /api/v1/identities/{uuid}/observations — observations for an identity.
|
||||
|
||||
Returns the per-IP ``Attacker`` rows whose ``identity_id`` FK points at
|
||||
this identity. The shape mirrors ``AttackersResponse`` so the frontend
|
||||
can reuse the same row component as the main attackers list.
|
||||
|
||||
Empty result while the clusterer hasn't linked any observations yet.
|
||||
"""
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/identities/{uuid}/observations",
    tags=["Identity Resolution"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Identity not found"},
        # limit/offset are range-validated Query parameters, so this route
        # can answer 422; document it as the other paginated list routes do.
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_identity_observations")
async def list_identity_observations(
    uuid: str,
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    # Returns one page of the observations linked to an identity, wrapped as
    # {"total", "limit", "offset", "data"}.
    #
    # 404 if the identity itself doesn't exist. Otherwise return the
    # observations linked to it (which may be empty — a freshly-formed
    # identity briefly has no observations yet from the FK side).
    identity = await repo.get_identity_by_uuid(uuid)
    if not identity:
        raise HTTPException(status_code=404, detail="Identity not found")

    # Query by the uuid of the row the repository returned, not the one in
    # the URL: for a merged identity that is the winner's uuid.
    canonical_uuid = identity["uuid"]
    data = await repo.list_observations_for_identity(
        canonical_uuid, limit=limit, offset=offset
    )
    total = await repo.count_observations_for_identity(canonical_uuid)
    return {"total": total, "limit": limit, "offset": offset, "data": data}
|
||||
@@ -1,20 +1,58 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# /logs/histogram aggregates over the full logs table — expensive and
|
||||
# polled constantly by the UI. Cache only the unfiltered default call
|
||||
# (which is what the UI and locust hit); any filter bypasses.
|
||||
_HISTOGRAM_TTL = 5.0
|
||||
_DEFAULT_INTERVAL = 15
|
||||
_histogram_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
|
||||
_histogram_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_histogram_cache() -> None:
    """Empty the histogram cache and discard its lazily created lock."""
    global _histogram_cache, _histogram_lock
    _histogram_cache, _histogram_lock = (None, 0.0), None
|
||||
|
||||
|
||||
async def _get_histogram_cached() -> list[dict[str, Any]]:
    """Return the unfiltered default histogram, recomputed at most once per TTL.

    A cached value younger than ``_HISTOGRAM_TTL`` seconds is returned
    directly. Otherwise the module lock (created on first use) is taken, the
    cache is re-checked, and ``repo.get_log_histogram`` is awaited with no
    filters and the default interval; the result is stored with a new
    timestamp.
    """
    global _histogram_cache, _histogram_lock

    def _unexpired() -> Optional[list[dict[str, Any]]]:
        # The cached value while it is still within the TTL, else None.
        cached, stamp = _histogram_cache
        if cached is not None and time.monotonic() - stamp < _HISTOGRAM_TTL:
            return cached
        return None

    hit = _unexpired()
    if hit is not None:
        return hit

    if _histogram_lock is None:
        _histogram_lock = asyncio.Lock()

    async with _histogram_lock:
        # Another waiter may have refreshed the cache while we were queued.
        hit = _unexpired()
        if hit is not None:
            return hit
        fresh = await repo.get_log_histogram(
            search=None,
            start_time=None,
            end_time=None,
            interval_minutes=_DEFAULT_INTERVAL,
        )
        _histogram_cache = (fresh, time.monotonic())
        return fresh
|
||||
|
||||
|
||||
@router.get("/logs/histogram", tags=["Logs"],
|
||||
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
|
||||
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
|
||||
@_traced("api.get_logs_histogram")
|
||||
async def get_logs_histogram(
|
||||
search: Optional[str] = None,
|
||||
start_time: Optional[str] = Query(None),
|
||||
end_time: Optional[str] = Query(None),
|
||||
interval_minutes: int = Query(15, ge=1),
|
||||
current_user: str = Depends(get_current_user)
|
||||
user: dict = Depends(require_viewer)
|
||||
) -> list[dict[str, Any]]:
|
||||
def _norm(v: Optional[str]) -> Optional[str]:
|
||||
if v in (None, "null", "NULL", "undefined", ""):
|
||||
@@ -25,4 +63,6 @@ async def get_logs_histogram(
|
||||
st = _norm(start_time)
|
||||
et = _norm(end_time)
|
||||
|
||||
if s is None and st is None and et is None and interval_minutes == _DEFAULT_INTERVAL:
|
||||
return await _get_histogram_cached()
|
||||
return await repo.get_log_histogram(search=s, start_time=st, end_time=et, interval_minutes=interval_minutes)
|
||||
|
||||
@@ -1,22 +1,57 @@
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import LogsResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Cache the unfiltered total-logs count. Filtered counts bypass the cache
|
||||
# (rare, freshness matters for search). SELECT count(*) FROM logs is a
|
||||
# full scan and gets hammered by paginating clients.
|
||||
_TOTAL_TTL = 2.0
|
||||
_total_cache: tuple[Optional[int], float] = (None, 0.0)
|
||||
_total_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_total_cache() -> None:
    """Empty the total-logs cache and discard its lazily created lock."""
    global _total_cache, _total_lock
    _total_cache, _total_lock = (None, 0.0), None
|
||||
|
||||
|
||||
async def _get_total_logs_cached() -> int:
    """Return the unfiltered log count, hitting the repository at most once per TTL.

    A cached count younger than ``_TOTAL_TTL`` seconds is returned directly.
    Otherwise the module lock (created on first use) is taken, the cache is
    re-checked, and ``repo.get_total_logs()`` is awaited only if the entry
    is still missing or expired.
    """
    global _total_cache, _total_lock

    count, stamp = _total_cache
    if count is not None and time.monotonic() - stamp < _TOTAL_TTL:
        return count

    if _total_lock is None:
        _total_lock = asyncio.Lock()

    async with _total_lock:
        # Re-read under the lock: a concurrent caller may have refreshed it.
        count, stamp = _total_cache
        expired = count is None or time.monotonic() - stamp >= _TOTAL_TTL
        if expired:
            count = await repo.get_total_logs()
            _total_cache = (count, time.monotonic())
        return count
|
||||
|
||||
|
||||
@router.get("/logs", response_model=LogsResponse, tags=["Logs"],
|
||||
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}})
|
||||
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}})
|
||||
@_traced("api.get_logs")
|
||||
async def get_logs(
|
||||
limit: int = Query(50, ge=1, le=1000),
|
||||
offset: int = Query(0, ge=0, le=2147483647),
|
||||
search: Optional[str] = Query(None, max_length=512),
|
||||
start_time: Optional[str] = Query(None),
|
||||
end_time: Optional[str] = Query(None),
|
||||
current_user: str = Depends(get_current_user)
|
||||
user: dict = Depends(require_viewer)
|
||||
) -> dict[str, Any]:
|
||||
def _norm(v: Optional[str]) -> Optional[str]:
|
||||
if v in (None, "null", "NULL", "undefined", ""):
|
||||
@@ -28,7 +63,10 @@ async def get_logs(
|
||||
et = _norm(end_time)
|
||||
|
||||
_logs: list[dict[str, Any]] = await repo.get_logs(limit=limit, offset=offset, search=s, start_time=st, end_time=et)
|
||||
_total: int = await repo.get_total_logs(search=s, start_time=st, end_time=et)
|
||||
if s is None and st is None and et is None:
|
||||
_total: int = await _get_total_logs_cached()
|
||||
else:
|
||||
_total = await repo.get_total_logs(search=s, start_time=st, end_time=et)
|
||||
return {
|
||||
"total": _total,
|
||||
"limit": limit,
|
||||
|
||||
0
decnet/web/router/orchestrator/__init__.py
Normal file
0
decnet/web/router/orchestrator/__init__.py
Normal file
123
decnet/web/router/orchestrator/api_events.py
Normal file
123
decnet/web/router/orchestrator/api_events.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""SSE stream of orchestrator events.
|
||||
|
||||
Subscribes to ``orchestrator.>`` for the duration of the request and
|
||||
forwards each event as a Server-Sent Event. Emits a one-shot snapshot
|
||||
on connect (latest 50 rows).
|
||||
|
||||
Mirror of :mod:`decnet.web.router.campaigns.api_events`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import orjson
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.app import get_app_bus
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_stream_viewer
|
||||
from decnet.web.sse_limits import sse_connection_slot
|
||||
|
||||
log = get_logger("api.orchestrator.events")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
_KEEPALIVE_SECS = 15.0
|
||||
_SNAPSHOT_LIMIT = 50
|
||||
|
||||
|
||||
def _format_sse(event_name: str, data: dict) -> str:
    """Serialise *data* with orjson and wrap it as a single SSE frame.

    Produces ``event: <event_name>``, then ``data: <json>``, then the blank
    line that terminates the frame.
    """
    payload_json = orjson.dumps(data).decode()
    return f"event: {event_name}\ndata: {payload_json}\n\n"
|
||||
|
||||
|
||||
@router.get(
    "/orchestrator/events/stream",
    tags=["Orchestrator"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of orchestrator events",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.orchestrator.events")
async def api_orchestrator_events(
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    # Frames sent: a one-shot "snapshot" of the latest _SNAPSHOT_LIMIT rows,
    # then one frame per bus event named by _sse_name_for, and "error" if
    # the forwarding loop crashes.
    snapshot = await repo.list_orchestrator_events(
        limit=_SNAPSHOT_LIMIT, offset=0,
    )

    async def event_stream() -> AsyncGenerator[str, None]:
        keepalive = ": keepalive\n\n"
        # The connection slot is held for the whole life of the stream.
        async with sse_connection_slot(user["uuid"]):
            yield keepalive
            yield _format_sse("snapshot", {"events": snapshot})

            bus = await get_app_bus()
            if bus is None:
                # No bus available: only send keepalives until the client
                # disconnects or the task is cancelled.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        break
                    yield keepalive
                return

            subscription = bus.subscribe(f"{_topics.ORCHESTRATOR}.>")
            try:
                async with subscription:
                    events = subscription.__aiter__()
                    while not await request.is_disconnected():
                        # Wait for the next bus event, but never longer than
                        # one keepalive period.
                        pending = asyncio.ensure_future(events.__anext__())
                        try:
                            event = await asyncio.wait_for(
                                pending, timeout=_KEEPALIVE_SECS,
                            )
                        except asyncio.TimeoutError:
                            pending.cancel()
                            yield keepalive
                            continue
                        except StopAsyncIteration:
                            break
                        sse_name = _sse_name_for(event.topic)
                        frame = {
                            "topic": event.topic,
                            "type": event.type,
                            "ts": event.ts,
                            "payload": event.payload,
                        }
                        yield _format_sse(sse_name, frame)
            except asyncio.CancelledError:
                # Cancellation ends the stream quietly.
                pass
            except Exception:
                log.exception("orchestrator events stream crashed")
                yield _format_sse("error", {"message": "Stream interrupted"})

    stream_headers = {
        "Cache-Control": "no-cache",
        "X-Accel-Buffering": "no",
    }
    return StreamingResponse(
        event_stream(),
        media_type="text/event-stream",
        headers=stream_headers,
    )
|
||||
|
||||
|
||||
def _sse_name_for(topic: str) -> str:
    """Map a bus topic to its SSE ``event:`` name.

    For a topic whose first dotted segment is the orchestrator root, the
    second segment alone is returned: ``orchestrator.traffic.<uuid>`` →
    ``traffic``, ``orchestrator.file.<uuid>`` → ``file``. Any other topic
    is returned unchanged.
    """
    root, dot, remainder = topic.partition(".")
    if dot and root == _topics.ORCHESTRATOR:
        # Keep only the segment immediately after the root.
        return remainder.split(".", 1)[0]
    return topic
|
||||
87
decnet/web/router/orchestrator/api_list_events.py
Normal file
87
decnet/web/router/orchestrator/api_list_events.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""GET /api/v1/orchestrator/events — paginated orchestrator activity.
|
||||
|
||||
Two underlying tables back this endpoint:
|
||||
|
||||
* ``orchestrator_events`` — SSH traffic + file ops (kind = ``traffic``, ``file``)
|
||||
* ``orchestrator_emails`` — emailgen-generated EMLs (kind = ``email``)
|
||||
|
||||
When the caller filters ``kind=email`` we dispatch to the emails table
|
||||
and adapt rows into the same wire shape the dashboard already renders.
|
||||
The mapping is:
|
||||
|
||||
* ``action`` ← email subject
|
||||
* ``src_decky_uuid`` ← sender_email
|
||||
* ``dst_decky_uuid`` ← recipient_email
|
||||
* ``protocol`` ← ``"smtp"``
|
||||
* email-specific fields (``thread_id``, ``language``, ``mail_decky_uuid``,
|
||||
``message_id``, ``in_reply_to``) ride along as top-level keys for the
|
||||
inspector / future per-email views; the existing event renderer
|
||||
ignores anything it doesn't recognise.
|
||||
|
||||
Mirrors :mod:`decnet.web.router.campaigns.api_list_campaigns`. The
|
||||
orchestrator + emailgen workers are the sole writers; this surface is
|
||||
read-only.
|
||||
"""
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _adapt_email_row(e: dict[str, Any]) -> dict[str, Any]:
    """Convert an ``orchestrator_emails`` row into the event wire shape.

    The generic event fields are filled from the email: ``action`` is the
    subject (empty string when absent), ``src_decky_uuid`` and
    ``dst_decky_uuid`` are the sender and recipient addresses, ``kind`` is
    always ``"email"`` and ``protocol`` is always ``"smtp"``. The
    email-only columns are then appended under their own names, ``None``
    when missing.
    """
    sender = e.get("sender_email")
    recipient = e.get("recipient_email")
    adapted: dict[str, Any] = {
        "uuid": e.get("uuid"),
        "ts": e.get("ts"),
        "kind": "email",
        "protocol": "smtp",
        "action": e.get("subject", ""),
        "src_decky_uuid": sender,
        "dst_decky_uuid": recipient,
        "success": bool(e.get("success")),
        "payload": e.get("payload", "{}"),
    }
    # Email-specific extras (renderer keys off ``kind == 'email'``), copied
    # through in a fixed order.
    passthrough = (
        "subject",
        "sender_email",
        "recipient_email",
        "language",
        "thread_id",
        "mail_decky_uuid",
        "message_id",
        "in_reply_to",
    )
    adapted.update((key, e.get(key)) for key in passthrough)
    return adapted
|
||||
|
||||
|
||||
@router.get(
    "/orchestrator/events",
    tags=["Orchestrator"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.list_orchestrator_events")
async def list_orchestrator_events(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    kind: Optional[str] = Query(None, pattern="^(traffic|file|email)$"),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Paginated orchestrator-event list, newest first."""
    if kind != "email":
        # traffic / file / unfiltered: rows come from the events listing,
        # filtered by kind when one was given.
        data = await repo.list_orchestrator_events(
            limit=limit, offset=offset, kind=kind,
        )
        total = await repo.count_orchestrator_events(kind=kind)
        return {"total": total, "limit": limit, "offset": offset, "data": data}

    # kind=email: rows come from the emails listing and are reshaped into
    # the event wire shape.
    emails = await repo.list_orchestrator_emails(limit=limit, offset=offset)
    total = await repo.count_orchestrator_emails()
    data = [_adapt_email_row(row) for row in emails]
    return {"total": total, "limit": limit, "offset": offset, "data": data}
|
||||
0
decnet/web/router/realism/__init__.py
Normal file
0
decnet/web/router/realism/__init__.py
Normal file
115
decnet/web/router/realism/api_config.py
Normal file
115
decnet/web/router/realism/api_config.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""GET/PUT ``/api/v1/realism/config`` — operator-tunable realism knobs.
|
||||
|
||||
Today only the planner's content-class weights + canary probability
|
||||
are exposed. The wire shape mirrors what
|
||||
:func:`decnet.realism.planner.current_payload` produces and
|
||||
:func:`decnet.realism.planner.apply_payload` consumes.
|
||||
|
||||
Reads accept viewer; writes are admin (writes mutate sampling
|
||||
behaviour across the whole orchestrator fleet, same trust level as
|
||||
the persona-pool surface).
|
||||
|
||||
The orchestrator worker periodically re-loads from the
|
||||
``realism_config`` table; the API process applies overrides locally
|
||||
on PUT so the GET-after-PUT round-trip reflects the change without
|
||||
waiting for the orchestrator's next refresh tick.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.realism import planner
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_admin, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
log = get_logger("api.realism.config")
|
||||
|
||||
_CONFIG_KEY = "weights"
|
||||
|
||||
|
||||
@router.get(
    "/realism/config",
    tags=["Realism"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.get_config")
async def get_config(
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the planner config as held by this API process.

    Before answering, the handler looks up the ``realism_config`` row stored
    under ``_CONFIG_KEY``. When the row exists and its ``value`` decodes to
    a JSON object, that object is handed to ``planner.apply_payload`` so the
    response reflects the persisted settings rather than baked-in defaults.
    A payload that fails to decode or is rejected with ``ValueError`` is
    logged and otherwise ignored.

    NOTE(review): an earlier description of this handler said the row is
    loaded only on the first GET after a restart; as written the row is
    fetched and re-applied on every call — confirm which is intended.
    """
    row = await repo.get_realism_config(_CONFIG_KEY)
    if row is None:
        # Nothing persisted yet: serve whatever the planner currently holds.
        return planner.current_payload()

    try:
        persisted = json.loads(row.get("value") or "{}")
        if isinstance(persisted, dict):
            planner.apply_payload(persisted)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError subclass, so this single
        # clause covers both decode failures and planner rejections.
        log.warning(
            "api.realism.get_config: stored payload invalid, "
            "serving defaults: %s", exc,
        )
    return planner.current_payload()
|
||||
|
||||
|
||||
@router.put(
    "/realism/config",
    tags=["Realism"],
    responses={
        400: {"description": "Invalid config payload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.put_config")
async def put_config(
    body: dict[str, Any],
    user: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Apply a (possibly partial) planner config and persist the result.

    Documented body shape — every field is optional and, per the endpoint's
    contract, an omitted field keeps its current value:

    * ``user_class_weights``: ``[{"content_class": "note", "weight": 30}, ...]``
    * ``system_class_weights``: same shape
    * ``canary_class_weights``: same shape
    * ``canary_probability``: float in [0.0, 1.0]

    Returns:
        The planner's payload after the update, which is also the JSON
        written to ``realism_config`` under ``_CONFIG_KEY``.

    Raises:
        HTTPException: 400 when ``body`` is not a dict or when
            ``planner.apply_payload`` rejects it with ``ValueError``.
    """
    if not isinstance(body, dict):
        raise HTTPException(status_code=400, detail="body must be an object")

    try:
        planner.apply_payload(body)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Persist the planner's own view rather than the request body, so the
    # stored row is always the complete post-update configuration.
    merged = planner.current_payload()
    serialized = json.dumps(merged)
    await repo.set_realism_config(_CONFIG_KEY, serialized)

    actor = user.get("username", user.get("uuid"))
    log.info(
        "api.realism.put_config user=%s canary_probability=%.4f",
        actor,
        merged["canary_probability"],
    )
    return merged
|
||||
143
decnet/web/router/realism/api_personas.py
Normal file
143
decnet/web/router/realism/api_personas.py
Normal file
@@ -0,0 +1,143 @@
|
||||
"""GET/PUT ``/api/v1/realism/personas`` — global persona pool CRUD.
|
||||
|
||||
The "global pool" is a JSON file consumed by the realism content
|
||||
engine for fleet (MACVLAN/IPVLAN) and SWARM-shard deckies — see
|
||||
:mod:`decnet.realism.personas_pool`. MazeNET topology deckies use
|
||||
``Topology.email_personas`` instead and are configured per-topology
|
||||
elsewhere.
|
||||
|
||||
This endpoint is the API surface behind the dashboard's "Persona
|
||||
Generation" page. Reads accept admin or viewer; writes are admin-only
|
||||
because the persistence target is a config file the worker reads on
|
||||
its hot path.
|
||||
|
||||
Concurrency: last-write-wins. The pool is operator-curated and small
|
||||
(<50 entries typically); the cost of a stronger model isn't justified.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.realism import personas_pool as global_pool
|
||||
from decnet.realism.personas import EmailPersona, parse_personas
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_admin, require_viewer
|
||||
from decnet.web.db.models.common import MessageResponse # noqa: F401 - response shape
|
||||
|
||||
router = APIRouter()
|
||||
log = get_logger("api.realism.personas")
|
||||
|
||||
|
||||
def _serialize(personas: list[EmailPersona]) -> list[dict[str, Any]]:
|
||||
"""Pydantic → plain dicts for the response body."""
|
||||
return [p.model_dump(exclude_none=False) for p in personas]
|
||||
|
||||
|
||||
@router.get(
    "/realism/personas",
    tags=["Emailgen"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.list_personas")
async def list_personas(
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the global persona pool together with its resolved file path.

    Returns:
        ``{"path": <str>, "personas": [<dict>, ...]}``. ``path`` is the
        stringified result of ``global_pool.resolve_path()`` so the
        dashboard can show operators where the pool file lives.
    """
    # Drop the in-process cache first so the following load reflects the
    # file as it is now (e.g. right after a CLI-driven import).
    global_pool.reset_cache()
    pool = global_pool.load()
    pool_path = global_pool.resolve_path()
    return {"path": str(pool_path), "personas": _serialize(pool)}
|
||||
|
||||
|
||||
@router.put(
    "/realism/personas",
    tags=["Emailgen"],
    responses={
        400: {"description": "Invalid persona payload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.realism.replace_personas")
async def replace_personas(
    body: dict[str, Any],
    user: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Overwrite the global persona pool file with the supplied list.

    Body shape: ``{"personas": [<EmailPersona>, ...]}``.

    The list is passed through ``parse_personas``; whatever it returns is
    what gets written and echoed back. A non-empty request list that parses
    to nothing is rejected instead of writing an empty pool.

    Returns:
        ``{"path": <str>, "personas": [<dict>, ...]}`` describing what was
        written.

    Raises:
        HTTPException: 400 when ``body["personas"]`` is not a list or when
            every supplied entry failed validation; 500 when the pool file
            (or its parent directory) cannot be written.
    """
    submitted = body.get("personas")
    if not isinstance(submitted, list):
        raise HTTPException(
            status_code=400,
            detail="body.personas must be a list",
        )

    accepted = parse_personas(submitted)
    if submitted and not accepted:
        # Entries were sent but none survived parsing — almost certainly a
        # schema mistake, so refuse rather than silently emptying the pool.
        raise HTTPException(
            status_code=400,
            detail=(
                "All persona entries failed validation. Required fields: "
                "name, email (user@host.tld), role, tone, mannerisms."
            ),
        )

    dest = global_pool.resolve_path()
    try:
        dest.parent.mkdir(parents=True, exist_ok=True)
        document = json.dumps(_serialize(accepted), indent=2, ensure_ascii=False)
        dest.write_text(document, encoding="utf-8")
    except OSError as exc:
        # Typically the target directory exists but is not writable by the
        # API process; answer with an actionable 500 instead of a traceback.
        log.warning(
            "api.realism.replace_personas write failed path=%s err=%s",
            dest, exc,
        )
        hint = (
            f"Could not write persona pool at {dest}: {exc.strerror or exc}. "
            "Set DECNET_EMAILGEN_PERSONAS to a writable path "
            "(e.g. ~/.decnet/email_personas.json) and restart the API."
        )
        raise HTTPException(status_code=500, detail=hint) from exc

    # Invalidate the in-process cache so later reads see the new file.
    global_pool.reset_cache()
    actor = user.get("username", user.get("uuid"))
    log.info(
        "api.realism.replace_personas user=%s wrote=%d path=%s",
        actor, len(accepted), dest,
    )
    return {
        "path": str(dest),
        "personas": _serialize(accepted),
    }
|
||||
99
decnet/web/router/realism/api_synthetic_files.py
Normal file
99
decnet/web/router/realism/api_synthetic_files.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""GET ``/api/v1/realism/synthetic-files`` — browse planted realism files.
|
||||
|
||||
The orchestrator's realism worker grows synthetic files on each decky
|
||||
(notes, TODOs, drafts, scripts, log lines, canary artifacts). The
|
||||
:class:`~decnet.web.db.models.realism.SyntheticFile` table is the
|
||||
canonical record of what's been planted where; this endpoint lets
|
||||
operators inspect the lineage without ssh'ing into a decky.
|
||||
|
||||
Read-only. No writes — the orchestrator is the sole writer; the
|
||||
dashboard is observation surface only.
|
||||
|
||||
The body preview (``last_body``) is repo-clipped at 64 KB
|
||||
(:data:`SYNTHETIC_FILE_BODY_LIMIT`); when the original was larger the
|
||||
detail response carries ``truncated: true`` so the operator knows what
|
||||
they're looking at.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.db.models.realism import SYNTHETIC_FILE_BODY_LIMIT
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/realism/synthetic-files",
    tags=["Realism"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.realism.list_synthetic_files")
async def list_synthetic_files(
    limit: int = Query(50, ge=1, le=500),
    offset: int = Query(0, ge=0, le=2147483647),
    decky_uuid: Optional[str] = Query(None, max_length=64),
    persona: Optional[str] = Query(None, max_length=128),
    content_class: Optional[str] = Query(None, max_length=32),
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return one page of ``synthetic_files`` rows without their bodies.

    The optional ``decky_uuid``, ``persona`` and ``content_class`` filters
    are forwarded unchanged to both the list and the count call, so
    ``total`` describes the same filtered set as ``data``. Ordering is
    decided by the repository; the endpoint is described as newest first.

    Returns:
        ``{"total", "limit", "offset", "data"}``; every row in ``data`` has
        had its ``last_body`` key removed.
    """
    filters = {
        "decky_uuid": decky_uuid,
        "persona": persona,
        "content_class": content_class,
    }
    page = await repo.list_synthetic_files(**filters, limit=limit, offset=offset)
    matching = await repo.count_synthetic_files(**filters)
    # Bodies are only served by the detail endpoint; stripping them here
    # keeps the list response small.
    for entry in page:
        entry.pop("last_body", None)
    return {"total": matching, "limit": limit, "offset": offset, "data": page}
|
||||
|
||||
|
||||
@router.get(
    "/realism/synthetic-files/{uuid}",
    tags=["Realism"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Synthetic file not found"},
    },
)
@_traced("api.realism.get_synthetic_file")
async def get_synthetic_file(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return a single ``synthetic_files`` row, body preview included.

    A ``truncated`` flag is added to the row: true when the length of
    ``last_body`` has reached ``SYNTHETIC_FILE_BODY_LIMIT``, i.e. the stored
    preview may be shorter than the file on the decky.

    Raises:
        HTTPException: 404 when the repository has no row for ``uuid``.
    """
    record = await repo.get_synthetic_file(uuid)
    if record is None:
        raise HTTPException(status_code=404, detail="synthetic file not found")
    # A missing or None body counts as an empty preview.
    preview = record.get("last_body") or ""
    record["truncated"] = SYNTHETIC_FILE_BODY_LIMIT <= len(preview)
    return record
|
||||
@@ -1,14 +1,50 @@
|
||||
from typing import Any
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.dependencies import get_current_user, repo
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_viewer, repo
|
||||
from decnet.web.db.models import StatsResponse
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# /stats is aggregate telemetry polled constantly by the UI and locust.
|
||||
# A 5s window collapses thousands of concurrent calls — each of which
|
||||
# runs SELECT count(*) FROM logs + SELECT count(DISTINCT attacker_ip) —
|
||||
# into one DB hit per window.
|
||||
_STATS_TTL = 5.0
|
||||
_stats_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
|
||||
_stats_lock: Optional[asyncio.Lock] = None
|
||||
|
||||
|
||||
def _reset_stats_cache() -> None:
|
||||
global _stats_cache, _stats_lock
|
||||
_stats_cache = (None, 0.0)
|
||||
_stats_lock = None
|
||||
|
||||
|
||||
async def _get_stats_cached() -> dict[str, Any]:
    """Return the stats summary, reusing a snapshot younger than ``_STATS_TTL``.

    A cached value is served directly while it is fresh. Otherwise callers
    queue on a module-level lock (created on first need); whoever holds it
    re-checks the cache before calling ``repo.get_stats_summary()``, so
    callers that waited behind a refresh reuse its result.
    """
    global _stats_cache, _stats_lock

    def _unexpired():
        # The cached value if one exists and is inside the TTL, else None.
        cached, stamped_at = _stats_cache
        if cached is None or time.monotonic() - stamped_at >= _STATS_TTL:
            return None
        return cached

    hit = _unexpired()
    if hit is not None:
        return hit

    if _stats_lock is None:
        _stats_lock = asyncio.Lock()

    async with _stats_lock:
        # Another caller may have refreshed the cache while we waited.
        hit = _unexpired()
        if hit is not None:
            return hit
        fresh = await repo.get_stats_summary()
        _stats_cache = (fresh, time.monotonic())
        return fresh
|
||||
|
||||
|
||||
@router.get("/stats", response_model=StatsResponse, tags=["Observability"],
|
||||
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
|
||||
async def get_stats(current_user: str = Depends(get_current_user)) -> dict[str, Any]:
|
||||
return await repo.get_stats_summary()
|
||||
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
|
||||
@_traced("api.get_stats")
|
||||
async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]:
|
||||
return await _get_stats_cached()
|
||||
|
||||
@@ -1,19 +1,50 @@
|
||||
import json
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
import orjson
|
||||
from typing import AsyncGenerator, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from decnet.env import DECNET_DEVELOPER
|
||||
from decnet.web.dependencies import get_stream_user, repo
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
|
||||
from decnet.web.dependencies import require_stream_viewer, repo
|
||||
from decnet.web.sse_limits import sse_connection_slot
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
log = get_logger("api")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _build_trace_links(logs: list[dict]) -> list:
|
||||
"""Build OTEL span links from persisted trace_id/span_id in log rows.
|
||||
|
||||
Returns an empty list when tracing is disabled (no OTEL imports).
|
||||
"""
|
||||
try:
|
||||
from opentelemetry.trace import Link, SpanContext, TraceFlags
|
||||
except ImportError:
|
||||
return []
|
||||
links: list[Link] = []
|
||||
for entry in logs:
|
||||
tid = entry.get("trace_id")
|
||||
sid = entry.get("span_id")
|
||||
if not tid or not sid or tid == "0":
|
||||
continue
|
||||
try:
|
||||
ctx = SpanContext(
|
||||
trace_id=int(tid, 16),
|
||||
span_id=int(sid, 16),
|
||||
is_remote=True,
|
||||
trace_flags=TraceFlags(TraceFlags.SAMPLED),
|
||||
)
|
||||
links.append(Link(ctx))
|
||||
except (ValueError, TypeError):
|
||||
continue
|
||||
return links
|
||||
|
||||
|
||||
@router.get("/stream", tags=["Observability"],
|
||||
responses={
|
||||
200: {
|
||||
@@ -21,9 +52,12 @@ router = APIRouter()
|
||||
"description": "Real-time Server-Sent Events (SSE) stream"
|
||||
},
|
||||
401: {"description": "Could not validate credentials"},
|
||||
422: {"description": "Validation error"}
|
||||
403: {"description": "Insufficient permissions"},
|
||||
422: {"description": "Validation error"},
|
||||
429: {"description": "Per-user SSE connection cap reached"},
|
||||
},
|
||||
)
|
||||
@_traced("api.stream_events")
|
||||
async def stream_events(
|
||||
request: Request,
|
||||
last_event_id: int = Query(0, alias="lastEventId"),
|
||||
@@ -31,63 +65,87 @@ async def stream_events(
|
||||
start_time: Optional[str] = None,
|
||||
end_time: Optional[str] = None,
|
||||
max_output: Optional[int] = Query(None, alias="maxOutput"),
|
||||
current_user: str = Depends(get_stream_user)
|
||||
user: dict = Depends(require_stream_viewer)
|
||||
) -> StreamingResponse:
|
||||
# Event types emitted on this stream: logs, stats, histogram.
|
||||
# All three are viewer-safe — same data is reachable via /logs and
|
||||
# /stats (viewer-gated REST). Adding a new event family here
|
||||
# requires a threat-model review for F6/I (role leakage).
|
||||
|
||||
async def event_generator() -> AsyncGenerator[str, None]:
|
||||
last_id = last_event_id
|
||||
stats_interval_sec = 10
|
||||
loops_since_stats = 0
|
||||
emitted_chunks = 0
|
||||
try:
|
||||
if last_id == 0:
|
||||
last_id = await repo.get_max_log_id()
|
||||
|
||||
# Emit initial snapshot immediately so the client never needs to poll /stats
|
||||
stats = await repo.get_stats_summary()
|
||||
yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n"
|
||||
histogram = await repo.get_log_histogram(
|
||||
search=search, start_time=start_time,
|
||||
end_time=end_time, interval_minutes=15,
|
||||
async with sse_connection_slot(user["uuid"]):
|
||||
# Prefetch the initial snapshot before the first yield.
|
||||
# With asyncmy (pure async TCP I/O), a DB await AFTER the first
|
||||
# yield races with the HTTP write callback; running DB reads
|
||||
# here (pre-yield, normal coroutine context) avoids that.
|
||||
# aiosqlite is immune because SQLite runs on a worker thread.
|
||||
_start_id = last_event_id if last_event_id != 0 else await repo.get_max_log_id()
|
||||
_initial_stats = await repo.get_stats_summary()
|
||||
_initial_histogram = await repo.get_log_histogram(
|
||||
search=search, start_time=start_time, end_time=end_time, interval_minutes=15,
|
||||
)
|
||||
yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n"
|
||||
last_id = _start_id
|
||||
stats_interval_sec = 10
|
||||
loops_since_stats = 0
|
||||
emitted_chunks = 0
|
||||
try:
|
||||
yield ": keepalive\n\n" # flush headers immediately
|
||||
|
||||
while True:
|
||||
if DECNET_DEVELOPER and max_output is not None:
|
||||
emitted_chunks += 1
|
||||
if emitted_chunks > max_output:
|
||||
log.debug("Developer mode: max_output reached (%d), closing stream", max_output)
|
||||
# Emit pre-fetched initial snapshot — no DB calls in generator until the loop
|
||||
yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': _initial_stats}).decode()}\n\n"
|
||||
yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': _initial_histogram}).decode()}\n\n"
|
||||
|
||||
while True:
|
||||
if DECNET_DEVELOPER and max_output is not None:
|
||||
emitted_chunks += 1
|
||||
if emitted_chunks > max_output:
|
||||
log.debug("Developer mode: max_output reached (%d), closing stream", max_output)
|
||||
break
|
||||
|
||||
if await request.is_disconnected():
|
||||
break
|
||||
|
||||
if await request.is_disconnected():
|
||||
break
|
||||
|
||||
new_logs = await repo.get_logs_after_id(
|
||||
last_id, limit=50, search=search,
|
||||
start_time=start_time, end_time=end_time,
|
||||
)
|
||||
if new_logs:
|
||||
last_id = max(entry["id"] for entry in new_logs)
|
||||
yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
|
||||
loops_since_stats = stats_interval_sec
|
||||
|
||||
if loops_since_stats >= stats_interval_sec:
|
||||
stats = await repo.get_stats_summary()
|
||||
yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n"
|
||||
histogram = await repo.get_log_histogram(
|
||||
search=search, start_time=start_time,
|
||||
end_time=end_time, interval_minutes=15,
|
||||
new_logs = await repo.get_logs_after_id(
|
||||
last_id, limit=50, search=search,
|
||||
start_time=start_time, end_time=end_time,
|
||||
)
|
||||
yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n"
|
||||
loops_since_stats = 0
|
||||
if new_logs:
|
||||
last_id = max(entry["id"] for entry in new_logs)
|
||||
# Create a span linking back to the ingestion traces
|
||||
# stored in each log row, closing the pipeline gap.
|
||||
_links = _build_trace_links(new_logs)
|
||||
_tracer = _get_tracer("sse")
|
||||
with _tracer.start_as_current_span(
|
||||
"sse.emit_logs", links=_links,
|
||||
attributes={"log_count": len(new_logs)},
|
||||
):
|
||||
yield f"event: message\ndata: {orjson.dumps({'type': 'logs', 'data': new_logs}).decode()}\n\n"
|
||||
loops_since_stats = stats_interval_sec
|
||||
|
||||
loops_since_stats += 1
|
||||
if loops_since_stats >= stats_interval_sec:
|
||||
stats = await repo.get_stats_summary()
|
||||
yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': stats}).decode()}\n\n"
|
||||
histogram = await repo.get_log_histogram(
|
||||
search=search, start_time=start_time,
|
||||
end_time=end_time, interval_minutes=15,
|
||||
)
|
||||
yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': histogram}).decode()}\n\n"
|
||||
loops_since_stats = 0
|
||||
|
||||
await asyncio.sleep(1)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception:
|
||||
log.exception("SSE stream error for user %s", last_event_id)
|
||||
yield f"event: error\ndata: {json.dumps({'type': 'error', 'message': 'Stream interrupted'})}\n\n"
|
||||
loops_since_stats += 1
|
||||
|
||||
return StreamingResponse(event_generator(), media_type="text/event-stream")
|
||||
await asyncio.sleep(1)
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
except Exception:
|
||||
log.exception("SSE stream error for user %s", last_event_id)
|
||||
yield f"event: error\ndata: {orjson.dumps({'type': 'error', 'message': 'Stream interrupted'}).decode()}\n\n"
|
||||
|
||||
return StreamingResponse(
|
||||
event_generator(),
|
||||
media_type="text/event-stream",
|
||||
headers={
|
||||
"Cache-Control": "no-cache",
|
||||
"X-Accel-Buffering": "no",
|
||||
},
|
||||
)
|
||||
|
||||
47
decnet/web/router/swarm/__init__.py
Normal file
47
decnet/web/router/swarm/__init__.py
Normal file
@@ -0,0 +1,47 @@
|
||||
"""Swarm controller routers.
|
||||
|
||||
One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
|
||||
onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
|
||||
process from the main DECNET API so swarm failures cannot cascade into
|
||||
log ingestion / dashboard serving.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_enroll_host import router as enroll_host_router
|
||||
from .api_list_hosts import router as list_hosts_router
|
||||
from .api_get_host import router as get_host_router
|
||||
from .api_decommission_host import router as decommission_host_router
|
||||
from .api_deploy_swarm import router as deploy_swarm_router
|
||||
from .api_teardown_swarm import router as teardown_swarm_router
|
||||
from .api_get_swarm_health import router as get_swarm_health_router
|
||||
from .api_check_hosts import router as check_hosts_router
|
||||
from .api_heartbeat import router as heartbeat_router
|
||||
from .api_list_deckies import router as list_deckies_router
|
||||
|
||||
swarm_router = APIRouter(
    prefix="/swarm",
    # Error responses shared by every swarm route. A route's own
    # `responses=` entries still override/extend these for route-specific
    # codes (e.g. 409 on /enroll).
    responses={
        400: {"description": "Malformed request"},
        403: {"description": "Peer cert missing or fingerprint mismatch"},
        404: {"description": "Referenced host does not exist"},
    },
)

# Mount the per-endpoint routers in a fixed order, grouped by concern.
for _sub_router in (
    # Hosts
    enroll_host_router,
    list_hosts_router,
    get_host_router,
    decommission_host_router,
    # Deployments
    deploy_swarm_router,
    teardown_swarm_router,
    list_deckies_router,
    # Health
    get_swarm_health_router,
    check_hosts_router,
    heartbeat_router,
):
    swarm_router.include_router(_sub_router)
del _sub_router  # keep the loop variable out of the package namespace
|
||||
61
decnet/web/router/swarm/api_check_hosts.py
Normal file
61
decnet/web/router/swarm/api_check_hosts.py
Normal file
@@ -0,0 +1,61 @@
|
||||
"""POST /swarm/check — active mTLS probe of every enrolled worker.
|
||||
|
||||
Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
|
||||
on the outcome of the probe.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
|
||||
|
||||
log = get_logger("swarm.check")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
async def api_check_hosts(
    repo: BaseRepository = Depends(get_repo),
) -> SwarmCheckResponse:
    """Probe every enrolled worker concurrently and record the outcome.

    For each host returned by ``repo.list_swarm_hosts()`` an ``AgentClient``
    health call is made. On success the host row is set to ``active`` with a
    fresh UTC ``last_heartbeat``; on any exception it is set to
    ``unreachable`` and the error text becomes the result's ``detail``.

    Returns:
        A ``SwarmCheckResponse`` with one ``SwarmHostHealth`` per host, in
        the order the hosts were listed.
    """
    enrolled = await repo.list_swarm_hosts()

    async def _probe(host: dict[str, Any]) -> SwarmHostHealth:
        def _report(reachable: bool, detail: Any) -> SwarmHostHealth:
            # Shared constructor for the success and failure results.
            return SwarmHostHealth(
                host_uuid=host["uuid"],
                name=host["name"],
                address=host["address"],
                reachable=reachable,
                detail=detail,
            )

        try:
            async with AgentClient(host=host) as agent:
                body = await agent.health()
            # NOTE(review): placed after the client context closes; confirm
            # this matches the intended nesting.
            await repo.update_swarm_host(
                host["uuid"],
                {
                    "status": "active",
                    "last_heartbeat": datetime.now(timezone.utc),
                },
            )
            return _report(True, body)
        except Exception as exc:
            # Deliberately broad: any failure marks the host unreachable.
            log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
            await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
            return _report(False, str(exc))

    outcomes = await asyncio.gather(*map(_probe, enrolled))
    return SwarmCheckResponse(results=list(outcomes))
|
||||
63
decnet/web/router/swarm/api_decommission_host.py
Normal file
63
decnet/web/router/swarm/api_decommission_host.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""DELETE /swarm/hosts/{uuid} — decommission a worker.
|
||||
|
||||
Removes the DeckyShard rows bound to the host (portable cascade — MySQL
|
||||
and SQLite both honor it via the repo layer), deletes the SwarmHost row,
|
||||
and best-effort-cleans the per-worker bundle directory on the master.
|
||||
|
||||
Also asks the worker agent to wipe its own install (keeping logs). A
|
||||
dead/unreachable worker does not block master-side cleanup.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
log = get_logger("swarm.decommission")
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_decommission_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission a worker: notify it, drop its rows, clean its bundle.

    Steps, in order:

    1. Ask the worker agent to self-destruct. Any failure is logged and
       ignored so a dead worker cannot block master-side cleanup.
    2. Delete the host's decky shard rows, then the host row itself.
    3. Best-effort removal of the files directly inside the host's
       ``cert_bundle_path`` directory, then of the directory itself.

    Raises:
        HTTPException: 404 when no host with ``uuid`` is enrolled.
    """
    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        log.exception(
            "decommission: self-destruct dispatch failed host=%s — "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # Best-effort bundle cleanup; if the dir was moved manually, don't fail.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        # pathlib.Path("") resolves to the current working directory, so an
        # unset path must be skipped — otherwise the loop below would unlink
        # every file in the API process's working directory.
        return

    bundle_dir = pathlib.Path(bundle_path)
    if not bundle_dir.is_dir():
        return

    try:
        children = list(bundle_dir.iterdir())
    except OSError as exc:
        # Listing can fail (permissions, concurrent removal); the host is
        # already decommissioned, so this must not fail the request.
        log.warning(
            "decommission: could not list bundle dir path=%s err=%s",
            bundle_dir, exc,
        )
        return

    for child in children:
        try:
            child.unlink()
        except OSError:
            # Sub-directories or unremovable files are left in place.
            pass
    try:
        bundle_dir.rmdir()
    except OSError:
        # Not empty or not removable: leave it for the operator.
        pass
|
||||
155
decnet/web/router/swarm/api_deploy_swarm.py
Normal file
155
decnet/web/router/swarm/api_deploy_swarm.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
|
||||
|
||||
Per worker we build a filtered copy containing only the deckies assigned
|
||||
to that worker (via ``host_uuid``), then POST it to the worker agent.
|
||||
The caller is expected to have already set ``host_uuid`` on every decky;
|
||||
if any decky arrives without one, we fail fast. Auto-sharding lives in
|
||||
the CLI layer, not here.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.config import DecnetConfig, DeckyConfig
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import (
|
||||
SwarmDeployRequest,
|
||||
SwarmDeployResponse,
|
||||
SwarmHostResult,
|
||||
)
|
||||
|
||||
log = get_logger("swarm.deploy")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
|
||||
buckets: dict[str, list[DeckyConfig]] = {}
|
||||
for d in config.deckies:
|
||||
if not d.host_uuid:
|
||||
raise HTTPException(
|
||||
status_code=400,
|
||||
detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch",
|
||||
)
|
||||
buckets.setdefault(d.host_uuid, []).append(d)
|
||||
return buckets
|
||||
|
||||
|
||||
def _worker_config(
|
||||
base: DecnetConfig,
|
||||
shard: list[DeckyConfig],
|
||||
host: dict[str, Any],
|
||||
) -> DecnetConfig:
|
||||
updates: dict[str, Any] = {"deckies": shard}
|
||||
# Per-host driver opt-in (Wi-Fi-bridged VMs can't use macvlan — see
|
||||
# SwarmHost.use_ipvlan). Never downgrade: if the operator picked ipvlan
|
||||
# at the deploy level, keep it regardless of the per-host flag.
|
||||
if host.get("use_ipvlan"):
|
||||
updates["ipvlan"] = True
|
||||
return base.model_copy(update=updates)
|
||||
|
||||
|
||||
async def dispatch_decnet_config(
    config: DecnetConfig,
    repo: BaseRepository,
    dry_run: bool = False,
    no_cache: bool = False,
) -> SwarmDeployResponse:
    """Split ``config`` per ``host_uuid`` and push each slice to its worker.

    Used by POST /swarm/deploy (explicit swarm call) as well as by the
    auto-swarm branch of POST /deckies/deploy.

    Every referenced host is resolved up front (404 on the first unknown
    uuid), then all workers are contacted concurrently. Each worker yields
    one ``SwarmHostResult``; a failing worker is reported with ``ok=False``
    rather than aborting the others.
    """
    shards = _shard_by_host(config)

    # Resolve all hosts before dispatching anything so an unknown uuid
    # fails the request before any worker has been touched.
    known_hosts: dict[str, dict[str, Any]] = {}
    for target_uuid in shards:
        found = await repo.get_swarm_host_by_uuid(target_uuid)
        if found is None:
            raise HTTPException(status_code=404, detail=f"unknown host_uuid: {target_uuid}")
        known_hosts[target_uuid] = found

    async def _record(
        host_uuid: str, decky: DeckyConfig, state: str, error: Any
    ) -> None:
        # Single place that writes a DeckyShard row for this deploy attempt.
        await repo.upsert_decky_shard(
            {
                "decky_name": decky.name,
                "host_uuid": host_uuid,
                "services": json.dumps(decky.services),
                "decky_config": decky.model_dump_json(),
                "decky_ip": decky.ip,
                "state": state,
                "last_error": error,
                "updated_at": datetime.now(timezone.utc),
            }
        )

    async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
        host = known_hosts[host_uuid]
        cfg = _worker_config(config, shard, host)
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
            success_state = "pending" if dry_run else "running"
            for decky in shard:
                await _record(host_uuid, decky, success_state, None)
            await repo.update_swarm_host(host_uuid, {"status": "active"})
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.deploy dispatch failed host=%s", host["name"])
            # Compose-up tolerates partial success: a decky that fails to
            # build does not undo the ones already started. So before
            # declaring the whole shard failed, ask the agent what is
            # actually running — otherwise live deckies would show up as
            # "failed" on the master.
            runtime: dict[str, Any] = {}
            try:
                async with AgentClient(host=host) as probe:
                    snap = await probe.status()
                runtime = snap.get("runtime") or {}
            except Exception:
                log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
            for decky in shard:
                container = runtime.get(decky.name) or {}
                if container.get("running"):
                    await _record(host_uuid, decky, "running", None)
                else:
                    await _record(host_uuid, decky, "failed", str(exc)[:512])
            return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))

    outcomes = await asyncio.gather(
        *[_dispatch(target_uuid, shard) for target_uuid, shard in shards.items()]
    )
    return SwarmDeployResponse(results=list(outcomes))
|
||||
|
||||
|
||||
@router.post(
    "/deploy",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Deployment mode must be 'swarm'"},
        404: {"description": "A referenced host_uuid is not enrolled"},
    },
)
async def api_deploy_swarm(
    req: SwarmDeployRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Handle POST /swarm/deploy.

    Rejects any config whose ``mode`` is not ``"swarm"`` with a 400 and
    otherwise hands the request over to ``dispatch_decnet_config``.
    """
    swarm_config = req.config
    if swarm_config.mode == "swarm":
        return await dispatch_decnet_config(
            swarm_config,
            repo,
            dry_run=req.dry_run,
            no_cache=req.no_cache,
        )
    raise HTTPException(status_code=400, detail="mode must be 'swarm'")
|
||||
100
decnet/web/router/swarm/api_enroll_host.py
Normal file
100
decnet/web/router/swarm/api_enroll_host.py
Normal file
@@ -0,0 +1,100 @@
|
||||
"""POST /swarm/enroll — issue a worker cert bundle and register the host.
|
||||
|
||||
Enrollment is master-driven: the controller holds the CA private key,
|
||||
generates a fresh worker keypair + CA-signed cert, and returns the full
|
||||
bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
|
||||
is outside this process's trust boundary.
|
||||
|
||||
Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
|
||||
bootstrap endpoint, so nothing to attack before the worker is enrolled.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid as _uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.swarm import pki
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/enroll",
    response_model=SwarmEnrolledBundle,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Hosts"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_enroll_host(
    req: SwarmEnrollRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmEnrolledBundle:
    """Issue a worker cert bundle and register the host.

    Flow:

    1. Refuse (409) when a host with ``req.name`` is already enrolled.
    2. Issue a CA-signed worker cert whose SANs are ``req.sans`` plus the
       host's address and name, and persist the bundle on the master.
    3. When ``req.issue_updater_bundle`` is set, issue a second cert with
       CN ``updater@<name>`` and store it under ``<bundle>/updater/``.
    4. Insert the SwarmHost row and return the PEM material to the caller.

    Raises:
        HTTPException: 409 when the worker name is already taken.
    """
    import os as _os

    existing = await repo.get_swarm_host_by_name(req.name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")

    ca = pki.ensure_ca()
    sans = list({*req.sans, req.address, req.name})
    issued = pki.issue_worker_cert(ca, req.name, sans)

    # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
    # can replay it if the operator loses the original delivery.
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_view: Optional[SwarmUpdaterBundle] = None
    updater_fp: Optional[str] = None
    if req.issue_updater_bundle:
        updater_cn = f"updater@{req.name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
        # Persist alongside the worker bundle for replay.
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
        # Create the key file as 0600 from the start. Writing it with the
        # process umask and chmod-ing afterwards leaves a window in which
        # the private key is readable by other local users.
        key_path = updater_dir / "updater.key"
        key_fd = _os.open(
            key_path, _os.O_WRONLY | _os.O_CREAT | _os.O_TRUNC, 0o600
        )
        with _os.fdopen(key_fd, "wb") as key_file:
            key_file.write(updater_issued.key_pem)
        # The mode passed to os.open only applies to newly created files;
        # tighten a pre-existing file as well.
        _os.chmod(key_path, 0o600)
        updater_fp = updater_issued.fingerprint_sha256
        updater_view = SwarmUpdaterBundle(
            fingerprint=updater_fp,
            updater_cert_pem=updater_issued.cert_pem.decode(),
            updater_key_pem=updater_issued.key_pem.decode(),
        )

    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.name,
            "address": req.address,
            "agent_port": req.agent_port,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": req.notes,
        }
    )
    return SwarmEnrolledBundle(
        host_uuid=host_uuid,
        name=req.name,
        address=req.address,
        agent_port=req.agent_port,
        fingerprint=issued.fingerprint_sha256,
        ca_cert_pem=issued.ca_cert_pem.decode(),
        worker_cert_pem=issued.cert_pem.decode(),
        worker_key_pem=issued.key_pem.decode(),
        updater=updater_view,
    )
|
||||
26
decnet/web/router/swarm/api_get_host.py
Normal file
26
decnet/web/router/swarm/api_get_host.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/hosts/{uuid}",
    response_model=SwarmHostView,
    tags=["Swarm Hosts"],
    responses={404: {"description": "No host with this UUID is enrolled"}},
)
async def api_get_host(
    uuid: str,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmHostView:
    """Return the enrolled worker whose UUID is ``uuid``.

    Raises:
        HTTPException: 404 when the repository has no such host.
    """
    host_row = await repo.get_swarm_host_by_uuid(uuid)
    if host_row is not None:
        return SwarmHostView(**host_row)
    raise HTTPException(status_code=404, detail="host not found")
|
||||
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
11
decnet/web/router/swarm/api_get_swarm_health.py
Normal file
@@ -0,0 +1,11 @@
|
||||
"""GET /swarm/health — controller liveness (no I/O)."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/health", tags=["Swarm Health"])
async def api_get_swarm_health() -> dict[str, str]:
    """Report controller liveness with a constant payload (no I/O)."""
    return dict(status="ok", role="swarm-controller")
|
||||
212
decnet/web/router/swarm/api_heartbeat.py
Normal file
212
decnet/web/router/swarm/api_heartbeat.py
Normal file
@@ -0,0 +1,212 @@
|
||||
"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh.
|
||||
|
||||
Workers call this every ~30 s with the output of ``executor.status()``.
|
||||
The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each
|
||||
``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived
|
||||
state so the dashboard stays current without a master-pull probe.
|
||||
|
||||
Security: CA-signed mTLS is necessary but not sufficient — a
|
||||
decommissioned worker's still-valid cert must not resurrect ghost
|
||||
shards. We pin the presented peer cert's SHA-256 to the
|
||||
``client_cert_fingerprint`` stored for the claimed ``host_uuid``.
|
||||
Mismatch → 403; a ``host_uuid`` with no row (e.g. a decommissioned host) → 404.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request
|
||||
from pydantic import BaseModel
|
||||
|
||||
from decnet.config import DeckyConfig
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
log = get_logger("swarm.heartbeat")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class HeartbeatRequest(BaseModel):
    """Request body of POST /swarm/heartbeat, sent by a worker agent."""

    # UUID the worker claims; checked against the pinned cert fingerprint.
    host_uuid: str
    # Optional agent version string; not read by the handler in this module.
    agent_version: Optional[str] = None
    # Worker status snapshot. The handler reads the keys "deployed",
    # "runtime" (per-decky dicts with a "running" flag) and "deckies"
    # (a list of DeckyConfig payloads).
    status: dict[str, Any]
    # Optional topology report. The reconcile step reads "topology_id" and
    # "applied_version_hash" from it.
    topology: Optional[dict[str, Any]] = None
|
||||
|
||||
|
||||
def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]:
|
||||
"""Pull the peer cert's SHA-256 fingerprint from an ASGI scope.
|
||||
|
||||
Tries two extraction paths because uvicorn has historically stashed
|
||||
the TLS peer cert in different scope keys across versions:
|
||||
|
||||
1. Primary: ``scope["extensions"]["tls"]["client_cert_chain"][0]``
|
||||
(uvicorn ≥ 0.30 ASGI TLS extension).
|
||||
2. Fallback: the transport object's ``ssl_object.getpeercert(binary_form=True)``
|
||||
(older uvicorn builds + some other servers).
|
||||
|
||||
Returns the lowercase hex SHA-256 of the DER-encoded cert, or None
|
||||
when neither path yields bytes. The endpoint fails closed on None.
|
||||
"""
|
||||
peer_der: Optional[bytes] = None
|
||||
source = "none"
|
||||
|
||||
try:
|
||||
chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain")
|
||||
if chain:
|
||||
peer_der = chain[0]
|
||||
source = "primary"
|
||||
except Exception:
|
||||
peer_der = None
|
||||
|
||||
if peer_der is None:
|
||||
transport = scope.get("transport")
|
||||
try:
|
||||
ssl_obj = transport.get_extra_info("ssl_object") if transport else None
|
||||
if ssl_obj is not None:
|
||||
peer_der = ssl_obj.getpeercert(binary_form=True)
|
||||
if peer_der:
|
||||
source = "fallback"
|
||||
except Exception:
|
||||
peer_der = None
|
||||
|
||||
if not peer_der:
|
||||
log.debug("heartbeat: peer cert extraction failed via none")
|
||||
return None
|
||||
|
||||
log.debug("heartbeat: peer cert extraction succeeded via %s", source)
|
||||
return hashlib.sha256(peer_der).hexdigest().lower()
|
||||
|
||||
|
||||
async def _verify_peer_matches_host(
    request: Request, host_uuid: str, repo: BaseRepository
) -> dict[str, Any]:
    """Check that the TLS peer is the worker it claims to be.

    Loads the host row for ``host_uuid`` and compares the SHA-256
    fingerprint of the presented client cert with the row's
    ``client_cert_fingerprint`` (case-insensitively on the stored side).

    Returns:
        The host row when the fingerprints match.

    Raises:
        HTTPException: 404 for an unknown host; 403 when no peer cert can
            be extracted, when the host has no pinned fingerprint, or when
            the two fingerprints differ.
    """
    host = await repo.get_swarm_host_by_uuid(host_uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="unknown host")

    presented = _extract_peer_fingerprint(request.scope)
    if presented is None:
        raise HTTPException(status_code=403, detail="peer cert unavailable")

    pinned = host.get("client_cert_fingerprint") or ""
    pinned = pinned.lower()
    if pinned and presented == pinned:
        return host
    raise HTTPException(status_code=403, detail="cert fingerprint mismatch")
|
||||
|
||||
|
||||
async def _reconcile_topology_report(
    repo: BaseRepository,
    host_uuid: str,
    reported: Optional[dict[str, Any]],
) -> None:
    """Flag ACTIVE topologies on *host_uuid* whose agent state has drifted.

    For every ACTIVE topology whose ``target_host_uuid`` is *host_uuid*
    and which is not already marked ``needs_resync``, the topology is
    flagged via ``repo.set_topology_resync(tid, True)`` unless the agent
    reported exactly this topology id together with a hash equal to the
    canonical hash of the hydrated topology. In practice that means a
    flag is raised when the agent reports:

    - a different hash for this topology, or
    - another topology / no topology at all (fresh boot, wiped cache).

    Only the flag is set here; the re-push itself is left to the mutator
    reconcile loop so the heartbeat endpoint stays cheap. Repository and
    hydrate failures are logged and never propagate to the caller.
    """
    from decnet.topology.hashing import canonical_hash
    from decnet.topology.persistence import hydrate
    from decnet.topology.status import TopologyStatus

    try:
        active = await repo.list_topologies(status=TopologyStatus.ACTIVE)
    except Exception:
        log.exception("heartbeat: could not list active topologies")
        return

    pinned_here = []
    for candidate in active:
        if candidate.get("target_host_uuid") == host_uuid:
            pinned_here.append(candidate)
    if not pinned_here:
        return

    report = reported or {}
    reported_id = report.get("topology_id")
    reported_hash = report.get("applied_version_hash")

    for topo in pinned_here:
        tid = topo["id"]
        if topo.get("needs_resync"):
            # Already queued for a re-push; nothing to add.
            continue

        expected: Optional[str] = None
        if reported_id == tid and reported_hash:
            try:
                hydrated = await hydrate(repo, tid)
            except Exception:
                log.exception("heartbeat: hydrate failed tid=%s", tid)
                continue
            if hydrated is None:
                continue
            expected = canonical_hash(hydrated)
            if expected == reported_hash:
                # Agent is in sync with the master's view.
                continue

        # Reaching this point means a hash mismatch, or the agent reported
        # a different topology or none at all.
        try:
            await repo.set_topology_resync(tid, True)
            log.info(
                "heartbeat: flagged topology %s for resync (host=%s "
                "reported_id=%s reported_hash=%s expected=%s)",
                tid, host_uuid, reported_id, reported_hash, expected,
            )
        except Exception:
            log.exception("heartbeat: failed to flag resync tid=%s", tid)
|
||||
|
||||
|
||||
@router.post(
    "/heartbeat",
    status_code=204,
    tags=["Swarm Health"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        403: {"description": "Peer cert missing, or its fingerprint does not match the host's pinned cert"},
        404: {"description": "host_uuid is not enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def heartbeat(
    req: HeartbeatRequest,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Record a worker heartbeat and refresh its decky shard rows.

    After the peer cert has been verified against the claimed host, the
    host is marked ``active`` with a fresh ``last_heartbeat``, the
    topology report is reconciled, and — only when the status snapshot
    says ``deployed`` — every decky in the snapshot is upserted as a
    DeckyShard. Deckies that fail ``DeckyConfig`` validation are logged
    and skipped.
    """
    await _verify_peer_matches_host(request, req.host_uuid, repo)

    seen_at = datetime.now(timezone.utc)
    await repo.update_swarm_host(
        req.host_uuid,
        {"status": "active", "last_heartbeat": seen_at},
    )

    await _reconcile_topology_report(repo, req.host_uuid, req.topology)

    snapshot = req.status or {}
    if not snapshot.get("deployed"):
        return

    runtime = snapshot.get("runtime") or {}
    reported_deckies = snapshot.get("deckies") or []
    for raw_decky in reported_deckies:
        try:
            decky = DeckyConfig(**raw_decky)
        except Exception:
            log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid)
            continue

        container = runtime.get(decky.name) or {}
        if container.get("running"):
            shard_state = "running"
        else:
            shard_state = "degraded"

        shard_row = {
            "decky_name": decky.name,
            "host_uuid": req.host_uuid,
            "services": json.dumps(decky.services),
            "decky_config": decky.model_dump_json(),
            "decky_ip": decky.ip,
            "state": shard_state,
            "last_error": None,
            "last_seen": seen_at,
            "updated_at": seen_at,
        }
        await repo.upsert_decky_shard(shard_row)
|
||||
55
decnet/web/router/swarm/api_list_deckies.py
Normal file
55
decnet/web/router/swarm/api_list_deckies.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""GET /swarm/deckies — list decky shards with their worker host's identity.
|
||||
|
||||
The DeckyShard table maps decky_name → host_uuid; users want to see which
|
||||
deckies are running and *where*, so we enrich each shard with the owning
|
||||
host's name/address/status from SwarmHost rather than making callers do
|
||||
the join themselves.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import DeckyShardView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Deckies"])
async def api_list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    """List decky shards, each enriched with its owning host's identity.

    ``host_uuid`` is forwarded to the repository query; ``state`` (when
    given) filters the returned shards in memory. Shards whose host row
    is missing are still returned, with placeholder host fields.
    """
    shards = await repo.list_decky_shards(host_uuid)

    hosts_by_uuid = {}
    for host_row in await repo.list_swarm_hosts():
        hosts_by_uuid[host_row["uuid"]] = host_row

    views: list[DeckyShardView] = []
    for shard in shards:
        wanted = not state or shard.get("state") == state
        if not wanted:
            continue

        owner_uuid = shard["host_uuid"]
        owner = hosts_by_uuid.get(owner_uuid, {})
        view = DeckyShardView(
            decky_name=shard["decky_name"],
            decky_ip=shard.get("decky_ip"),
            host_uuid=shard["host_uuid"],
            host_name=owner.get("name") or "<unknown>",
            host_address=owner.get("address") or "",
            host_status=owner.get("status") or "unknown",
            services=shard.get("services") or [],
            state=shard.get("state") or "pending",
            last_error=shard.get("last_error"),
            compose_hash=shard.get("compose_hash"),
            updated_at=shard["updated_at"],
            hostname=shard.get("hostname"),
            distro=shard.get("distro"),
            archetype=shard.get("archetype"),
            service_config=shard.get("service_config") or {},
            mutate_interval=shard.get("mutate_interval"),
            last_mutated=shard.get("last_mutated") or 0.0,
            last_seen=shard.get("last_seen"),
        )
        views.append(view)
    return views
|
||||
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
21
decnet/web/router/swarm/api_list_hosts.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """Return the enrolled workers as views; ``host_status`` is passed
    straight through to the repository query."""
    views: list[SwarmHostView] = []
    for host_row in await repo.list_swarm_hosts(host_status):
        views.append(SwarmHostView(**host_row))
    return views
|
||||
60
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
60
decnet/web/router/swarm/api_teardown_swarm.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""POST /swarm/teardown — tear down one or all enrolled workers."""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
from decnet.web.db.models import (
|
||||
SwarmDeployResponse,
|
||||
SwarmHostResult,
|
||||
SwarmTeardownRequest,
|
||||
)
|
||||
|
||||
log = get_logger("swarm.teardown")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/teardown",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        404: {"description": "A targeted host does not exist"},
        422: {"description": "Request body validation error"},
    },
)
async def api_teardown_swarm(
    req: SwarmTeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Tear down deckies on one worker or on every enrolled worker.

    With ``req.host_uuid`` set only that host is targeted (404 when it is
    unknown); otherwise all enrolled hosts are. Workers are contacted
    concurrently and each yields one ``SwarmHostResult``. The host's shard
    rows are deleted only after a successful full teardown, i.e. when
    ``req.decky_id`` is None.
    """
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        single = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if single is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [single]

    async def _teardown_one(host: dict[str, Any]) -> SwarmHostResult:
        try:
            async with AgentClient(host=host) as agent:
                body = await agent.teardown(req.decky_id)
            full_teardown = req.decky_id is None
            if full_teardown:
                await repo.delete_decky_shards_for_host(host["uuid"])
            return SwarmHostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body)
        except Exception as exc:
            log.exception("swarm.teardown failed host=%s", host["name"])
            return SwarmHostResult(
                host_uuid=host["uuid"],
                host_name=host["name"],
                ok=False,
                detail=str(exc),
            )

    outcomes = await asyncio.gather(*[_teardown_one(target) for target in targets])
    return SwarmDeployResponse(results=list(outcomes))
|
||||
26
decnet/web/router/swarm_mgmt/__init__.py
Normal file
26
decnet/web/router/swarm_mgmt/__init__.py
Normal file
@@ -0,0 +1,26 @@
|
||||
"""Swarm management endpoints for the React dashboard.
|
||||
|
||||
These are *not* the unauthenticated /swarm routes mounted on the separate
|
||||
swarm-controller process (decnet/web/swarm_api.py on port 8770). These
|
||||
live on the main web API, go through ``require_admin``, and are the
|
||||
interface the dashboard uses to list hosts, decommission them, list
|
||||
deckies across the fleet, and generate one-shot agent-enrollment
|
||||
bundles.
|
||||
|
||||
Mounted under ``/api/v1/swarm`` by the main api router.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_list_hosts import router as list_hosts_router
|
||||
from .api_decommission_host import router as decommission_host_router
|
||||
from .api_list_deckies import router as list_deckies_router
|
||||
from .api_enroll_bundle import router as enroll_bundle_router
|
||||
from .api_teardown_host import router as teardown_host_router
|
||||
|
||||
# Aggregate router for the dashboard-facing swarm endpoints; every
# sub-router below ends up under the shared "/swarm" prefix.
swarm_mgmt_router = APIRouter(prefix="/swarm")

# Registered in this fixed order.
for _sub_router in (
    list_hosts_router,
    decommission_host_router,
    list_deckies_router,
    enroll_bundle_router,
    teardown_host_router,
):
    swarm_mgmt_router.include_router(_sub_router)
del _sub_router  # keep the loop variable out of the package namespace
|
||||
71
decnet/web/router/swarm_mgmt/api_decommission_host.py
Normal file
71
decnet/web/router/swarm_mgmt/api_decommission_host.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""DELETE /swarm/hosts/{uuid} — decommission a worker from the dashboard.
|
||||
|
||||
Also instructs the worker agent to stop all DECNET services and delete
|
||||
its install footprint (keeping logs). Agent self-destruct failure does
|
||||
not block decommission — the master-side cleanup always runs so a dead
|
||||
worker can still be removed from the dashboard.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm.decommission")
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Management"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Path parameter validation error"},
    },
)
async def decommission_host(
    uuid: str,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission the worker identified by ``uuid`` (admin only).

    Steps, in order:

    1. Look the host up; respond 404 when no row exists.
    2. Ask the worker agent to self-destruct; failures are logged and
       ignored so a dead worker can still be removed.
    3. Delete the host's DeckyShard rows, then the SwarmHost row.
    4. Best-effort removal of the directory recorded in
       ``cert_bundle_path``.

    ``admin`` is unused in the body; it exists so that ``require_admin``
    runs as a dependency before the handler.

    Raises:
        HTTPException: 404 when ``uuid`` does not match an enrolled host.
    """
    import shutil  # local import: only the bundle cleanup below needs it

    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    # Ask the worker to wipe its own install (keeps logs). The agent
    # schedules the reaper as a detached process and returns immediately,
    # so this call is fast when the worker is reachable. A dead worker
    # shouldn't block the operator from cleaning up the dashboard entry,
    # hence best-effort with a log and continue.
    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        log.exception(
            "decommission: self-destruct dispatch failed host=%s — "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # Best-effort bundle cleanup.
    #
    # A missing/empty path must be skipped explicitly: pathlib.Path("") is
    # the current working directory, so treating it as the bundle dir would
    # delete unrelated files from wherever the master process was started.
    raw_bundle_path = row.get("cert_bundle_path")
    if raw_bundle_path:
        bundle_dir = pathlib.Path(raw_bundle_path)
        if bundle_dir.is_dir():
            # rmtree also removes nested entries (e.g. an ``updater/``
            # sub-directory holding a private key), which a flat
            # unlink()+rmdir() pass would leave behind.
            shutil.rmtree(bundle_dir, ignore_errors=True)
|
||||
504
decnet/web/router/swarm_mgmt/api_enroll_bundle.py
Normal file
504
decnet/web/router/swarm_mgmt/api_enroll_bundle.py
Normal file
@@ -0,0 +1,504 @@
|
||||
"""Agent-enrollment bundles — the Wazuh-style one-liner flow.
|
||||
|
||||
Three endpoints:
|
||||
POST /swarm/enroll-bundle — admin issues certs + builds payload
|
||||
GET /swarm/enroll-bundle/{t}.sh — bootstrap script (idempotent until .tgz)
|
||||
GET /swarm/enroll-bundle/{t}.tgz — tarball payload (one-shot; trips served)
|
||||
|
||||
The operator's paste is a single pipe ``curl -fsSL <.sh> | sudo bash``.
|
||||
Under the hood the bootstrap curls the ``.tgz`` from the same token.
|
||||
Both files are rendered + persisted on POST; the ``.tgz`` GET atomically
|
||||
marks the token served, reads the bytes under the lock, and unlinks both
|
||||
files so a sweeper cannot race it. Unclaimed tokens expire after 5 min.
|
||||
|
||||
We avoid the single-self-extracting-script pattern because ``bash`` run
|
||||
via pipe has ``$0 == "bash"`` — there is no file on disk to ``tail`` for
|
||||
the embedded payload. Two URLs, one paste.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import io
|
||||
import os
|
||||
import pathlib
|
||||
import secrets
|
||||
import tarfile
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm import pki
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_mgmt.enroll_bundle")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
BUNDLE_TTL = timedelta(minutes=5)
|
||||
BUNDLE_DIR = pathlib.Path(os.environ.get("DECNET_ENROLL_BUNDLE_DIR", "/tmp/decnet-enroll")) # nosec B108 - short-lived 0600 bundle cache, env-overridable
|
||||
SWEEP_INTERVAL_SECS = 30
|
||||
|
||||
# Include list — explicit set of paths that ship to the agent. An
|
||||
# include list fails closed: anything new on the master (stray .env, dev
|
||||
# venvs, data dumps, editor scratch dirs) cannot leak into the bundle
|
||||
# just because we forgot to exclude it.
|
||||
#
|
||||
# What the agent actually needs:
|
||||
# * pyproject.toml at the repo root, so ``pip install`` works against
|
||||
# the bundle during enroll_bootstrap.sh.
|
||||
# * the ``decnet/`` package, MINUS the master-only subtrees called out
|
||||
# by _EXCLUDED_DECNET_SUBTREES — those never import on an agent host.
|
||||
# Everything else the bootstrap needs (the INI, certs, systemd units) is
|
||||
# synthesized in-memory by ``_build_tarball`` below — it never hits the
|
||||
# filesystem walk.
|
||||
|
||||
# Top-level files shipped verbatim. Relative to the repo root.
|
||||
_INCLUDED_ROOT_FILES: tuple[str, ...] = ("pyproject.toml",)
|
||||
|
||||
# Top-level directories walked into the bundle. Relative to the repo root.
|
||||
_INCLUDED_DIRS: tuple[str, ...] = ("decnet",)
|
||||
|
||||
# Subtrees of an included directory that must NOT ship. Paths are
|
||||
# relative to the repo root, forward-slash separated.
|
||||
# * ``decnet/web`` — FastAPI master app, unused by agents.
|
||||
# * ``decnet/mutator`` — schedules respawns swarm-wide; master-only.
|
||||
# * ``decnet/profiler`` — rebuilds profiles against the master DB.
|
||||
_EXCLUDED_DECNET_SUBTREES: frozenset[str] = frozenset({
|
||||
"decnet/web",
|
||||
"decnet/mutator",
|
||||
"decnet/profiler",
|
||||
})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# DTOs
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
class EnrollBundleRequest(BaseModel):
    # Request body for POST /enroll-bundle.
    #
    # NOTE(review): master_host is only length-checked, yet it is substituted
    # verbatim into the bootstrap script, the agent INI and the download URLs
    # further down this module. Consider constraining it the way agent_name
    # is constrained.

    # Address the new agent dials back to (253 = max DNS name length).
    master_host: str = Field(
        ...,
        min_length=1,
        max_length=253,
        description="IP/host the agent will reach back to",
    )

    # Lower-case DNS label, at most 63 characters.
    agent_name: str = Field(
        ...,
        pattern=r"^[a-z0-9][a-z0-9-]{0,62}$",
        description="Worker name (DNS-label safe)",
    )

    # When true a second cert (CN updater@<agent_name>) is issued and bundled.
    with_updater: bool = Field(
        default=True,
        description="Include updater cert bundle and auto-start decnet updater on the agent",
    )

    # Rendered as "ipvlan = true|false" in the agent's decnet.ini.
    use_ipvlan: bool = Field(
        default=False,
        description=(
            "Run deckies on this agent over IPvlan L2 instead of MACVLAN. "
            "Required when the agent is a VirtualBox/VMware guest bridged over Wi-Fi — "
            "Wi-Fi APs bind one MAC per station, so MACVLAN's extra container MACs "
            "rotate the VM's DHCP lease. Safe no-op on wired/bare-metal hosts."
        ),
    )

    # Added to the tarball as "services.ini" only when non-empty.
    services_ini: Optional[str] = Field(
        default=None,
        description="Optional INI text shipped to the agent as /etc/decnet/services.ini",
    )
|
||||
|
||||
|
||||
class EnrollBundleResponse(BaseModel):
    # Response body of POST /enroll-bundle.

    # URL-safe random token naming both the .sh and the .tgz download.
    token: str
    # Ready-to-paste "curl ... | sudo bash" one-liner for the operator.
    command: str
    # UTC instant after which the token is no longer served.
    expires_at: datetime
    # UUID of the SwarmHost row created for this enrollment.
    host_uuid: str
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# In-memory registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@dataclass
class _Bundle:
    # In-memory registry entry for one issued enrollment token.

    # On-disk location of the rendered bootstrap script.
    sh_path: pathlib.Path
    # On-disk location of the rendered tarball payload.
    tgz_path: pathlib.Path
    # UTC instant after which the token is treated as dead.
    expires_at: datetime
    # SwarmHost row this bundle belongs to (address is backfilled on fetch).
    host_uuid: str
    # Flipped to True once the tarball has been handed out.
    served: bool = False
|
||||
|
||||
|
||||
_BUNDLES: dict[str, _Bundle] = {}
|
||||
_LOCK = asyncio.Lock()
|
||||
_SWEEPER_TASK: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def _sweep_loop() -> None:
    """Background janitor for the enroll-bundle registry.

    Every ``SWEEP_INTERVAL_SECS`` seconds, drop each bundle that has been
    served or has expired from ``_BUNDLES`` and delete its two files.
    Runs until cancelled; any other failure is logged and the loop goes on.
    """
    while True:
        try:
            await asyncio.sleep(SWEEP_INTERVAL_SECS)
            cutoff = datetime.now(timezone.utc)
            async with _LOCK:
                stale_tokens = [
                    token
                    for token, bundle in _BUNDLES.items()
                    if bundle.served or bundle.expires_at <= cutoff
                ]
                for token in stale_tokens:
                    bundle = _BUNDLES.pop(token)
                    for path in (bundle.sh_path, bundle.tgz_path):
                        try:
                            path.unlink()
                        except FileNotFoundError:
                            # Already removed (e.g. by the .tgz download).
                            continue
                        except OSError as exc:
                            log.warning("enroll-bundle sweep unlink failed path=%s err=%s", path, exc)
        except asyncio.CancelledError:
            # Cancellation must propagate so the task can actually stop.
            raise
        except Exception:  # noqa: BLE001
            log.exception("enroll-bundle sweeper iteration failed")
|
||||
|
||||
|
||||
def _ensure_sweeper() -> None:
    """Start the sweeper task unless one is already alive.

    Calls ``asyncio.create_task`` and so needs a running event loop.
    """
    global _SWEEPER_TASK
    current = _SWEEPER_TASK
    if current is not None and not current.done():
        return
    _SWEEPER_TASK = asyncio.create_task(_sweep_loop())
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tarball construction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _repo_root() -> pathlib.Path:
    """Return the repository root, derived from this module's location."""
    # decnet/web/router/swarm_mgmt/api_enroll_bundle.py:
    # parents[0]=swarm_mgmt, [1]=router, [2]=web, [3]=decnet, [4]=repo root.
    this_file = pathlib.Path(__file__).resolve()
    return this_file.parents[4]
|
||||
|
||||
|
||||
def _iter_included(root: pathlib.Path) -> "list[tuple[pathlib.Path, str]]":
    """Collect ``(full_path, arcname)`` pairs for everything shipped to an agent.

    Sources are the files named in ``_INCLUDED_ROOT_FILES`` plus a walk of
    each directory in ``_INCLUDED_DIRS``. The walk never descends into
    ``__pycache__`` or any path listed in ``_EXCLUDED_DECNET_SUBTREES``,
    and it skips ``.pyc``/``.pyo`` files and symlinked files.

    Args:
        root: Repository root that all arcnames are relative to.

    Returns:
        The pairs sorted by arcname, so tarball member order is deterministic.
    """
    pairs: list[tuple[pathlib.Path, str]] = []

    # Verbatim top-level files; silently skipped when absent.
    for rel_name in _INCLUDED_ROOT_FILES:
        candidate = root / rel_name
        if candidate.is_file():
            pairs.append((candidate, rel_name))

    for top in _INCLUDED_DIRS:
        base = root / top
        if not base.is_dir():
            continue
        for dirpath, dirnames, filenames in os.walk(base, topdown=True, followlinks=False):
            here = pathlib.Path(dirpath)
            rel_dir = here.relative_to(root).as_posix()

            # Assign through the slice: os.walk only honours pruning done
            # on the very list object it handed us.
            kept = []
            for sub in dirnames:
                if sub == "__pycache__":
                    continue
                if f"{rel_dir}/{sub}" in _EXCLUDED_DECNET_SUBTREES:
                    continue
                kept.append(sub)
            dirnames[:] = kept

            for filename in filenames:
                if filename.endswith((".pyc", ".pyo")):
                    continue
                full_path = here / filename
                if not full_path.is_symlink():
                    pairs.append((full_path, f"{rel_dir}/{filename}"))

    return sorted(pairs, key=lambda pair: pair[1])
|
||||
|
||||
|
||||
def _render_decnet_ini(
    master_host: str,
    host_uuid: str,
    use_ipvlan: bool = False,
    swarmctl_port: int = 8770,
) -> bytes:
    """Render the agent-mode ``decnet.ini`` shipped inside the bundle.

    Args:
        master_host: Written as ``master-host`` in the ``[agent]`` section.
        host_uuid: Written as ``host-uuid`` in the ``[agent]`` section.
        use_ipvlan: Rendered as ``ipvlan = true`` or ``ipvlan = false``.
        swarmctl_port: Written as ``swarmctl-port``.

    Returns:
        The INI text encoded with ``str.encode()``; every line, including
        the last, ends in a newline.
    """
    ipvlan_value = "true" if use_ipvlan else "false"
    lines = [
        "; Generated by DECNET agent-enrollment bundle.",
        "[decnet]",
        "mode = agent",
        "disallow-master = true",
        "log-directory = /var/log/decnet",
        f"ipvlan = {ipvlan_value}",
        "",
        "[agent]",
        f"master-host = {master_host}",
        f"swarmctl-port = {swarmctl_port}",
        "swarm-syslog-port = 6514",
        "agent-port = 8765",
        "agent-dir = /etc/decnet/agent",
        "updater-dir = /etc/decnet/updater",
        f"host-uuid = {host_uuid}",
    ]
    return ("\n".join(lines) + "\n").encode()
|
||||
|
||||
|
||||
def _add_bytes(tar: tarfile.TarFile, name: str, data: bytes, mode: int = 0o644) -> None:
    """Append an in-memory blob to *tar* as a regular file member.

    Args:
        tar: Archive open for writing.
        name: Member path inside the archive.
        data: File contents.
        mode: Permission bits recorded for the member.
    """
    member = tarfile.TarInfo(name)
    member.mode = mode
    member.size = len(data)
    # Stamp the member with the current wall-clock time (whole seconds).
    member.mtime = int(datetime.now(timezone.utc).timestamp())
    payload = io.BytesIO(data)
    tar.addfile(member, payload)
|
||||
|
||||
|
||||
def _build_tarball(
    master_host: str,
    agent_name: str,
    host_uuid: str,
    issued: pki.IssuedCert,
    services_ini: Optional[str],
    updater_issued: Optional[pki.IssuedCert] = None,
    use_ipvlan: bool = False,
) -> bytes:
    """Assemble the gzipped enrollment tarball in memory.

    Members, in archive order:
      - the source files returned by ``_iter_included`` for the repo root
      - ``etc/decnet/decnet.ini`` (pre-baked for mode=agent)
      - ``etc/systemd/system/<unit>.service`` for each of ``_SYSTEMD_UNITS``
      - ``home/.decnet/agent/{ca.crt,worker.crt,worker.key}``
      - ``home/.decnet/updater/{ca.crt,updater.crt,updater.key}``
        (only if ``updater_issued`` is given)
      - ``services.ini`` at the archive root (only if ``services_ini`` is
        non-empty)

    Private keys are recorded with mode 0600; everything synthesized here
    otherwise uses the ``_add_bytes`` default mode.

    Returns:
        The complete ``.tgz`` as bytes.
    """
    # (arcname, payload, mode) for each certificate artifact, in archive order.
    cert_members = [
        ("home/.decnet/agent/ca.crt", issued.ca_cert_pem, 0o644),
        ("home/.decnet/agent/worker.crt", issued.cert_pem, 0o644),
        ("home/.decnet/agent/worker.key", issued.key_pem, 0o600),
    ]
    if updater_issued is not None:
        cert_members += [
            ("home/.decnet/updater/ca.crt", updater_issued.ca_cert_pem, 0o644),
            ("home/.decnet/updater/updater.crt", updater_issued.cert_pem, 0o644),
            ("home/.decnet/updater/updater.key", updater_issued.key_pem, 0o600),
        ]

    sink = io.BytesIO()
    with tarfile.open(fileobj=sink, mode="w:gz") as tar:
        for src_path, arcname in _iter_included(_repo_root()):
            tar.add(src_path, arcname=arcname, recursive=False)

        ini_bytes = _render_decnet_ini(master_host, host_uuid, use_ipvlan)
        _add_bytes(tar, "etc/decnet/decnet.ini", ini_bytes)

        for unit in _SYSTEMD_UNITS:
            unit_bytes = _render_systemd_unit(unit, agent_name, master_host)
            _add_bytes(tar, f"etc/systemd/system/{unit}.service", unit_bytes)

        for arcname, payload, mode in cert_members:
            _add_bytes(tar, arcname, payload, mode=mode)

        if services_ini:
            _add_bytes(tar, "services.ini", services_ini.encode())

    # Read only after the context manager has closed and flushed the gzip stream.
    return sink.getvalue()
|
||||
|
||||
|
||||
_SYSTEMD_UNITS = (
|
||||
"decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater",
|
||||
# Per-host microservices — activated by enroll_bootstrap.sh. The
|
||||
# profiler intentionally stays master-side: it rebuilds attacker
|
||||
# profiles against the master DB, which workers don't share.
|
||||
"decnet-collector", "decnet-prober", "decnet-sniffer",
|
||||
)
|
||||
|
||||
|
||||
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:
    """Render ``templates/<name>.service.j2`` for one agent.

    The template is filled by plain string replacement of the
    ``{{ agent_name }}`` and ``{{ master_host }}`` placeholders; values are
    inserted verbatim, with no escaping.

    Returns:
        The rendered unit file, encoded with ``str.encode()``.
    """
    # parents[1] is router/, its parent is decnet/web/.
    templates_dir = pathlib.Path(__file__).resolve().parents[1].parent / "templates"
    text = (templates_dir / f"{name}.service.j2").read_text()
    substitutions = (
        ("{{ agent_name }}", agent_name),
        ("{{ master_host }}", master_host),
    )
    for placeholder, value in substitutions:
        text = text.replace(placeholder, value)
    return text.encode()
|
||||
|
||||
|
||||
def _render_bootstrap(
    agent_name: str,
    master_host: str,
    tarball_url: str,
    expires_at: datetime,
    with_updater: bool,
) -> bytes:
    """Render ``templates/enroll_bootstrap.sh.j2`` into the bootstrap script.

    Placeholders are filled by plain string replacement, in this order:
    ``agent_name``, ``master_host``, ``tarball_url``, ``generated_at``
    (current UTC time), ``expires_at`` and ``with_updater`` (``"true"`` or
    ``"false"``). Both timestamps are ISO-8601 with microseconds dropped.
    Values are inserted verbatim, with no shell escaping.

    Returns:
        The rendered script, encoded with ``str.encode()``.
    """
    # parents[1] is router/, its parent is decnet/web/.
    templates_dir = pathlib.Path(__file__).resolve().parents[1].parent / "templates"
    text = (templates_dir / "enroll_bootstrap.sh.j2").read_text()

    generated_at = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
    substitutions = (
        ("{{ agent_name }}", agent_name),
        ("{{ master_host }}", master_host),
        ("{{ tarball_url }}", tarball_url),
        ("{{ generated_at }}", generated_at),
        ("{{ expires_at }}", expires_at.replace(microsecond=0).isoformat()),
        ("{{ with_updater }}", "true" if with_updater else "false"),
    )
    for placeholder, value in substitutions:
        text = text.replace(placeholder, value)
    return text.encode()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Endpoints
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _write_private_file(path: pathlib.Path, data: bytes) -> None:
    """Write *data* to *path* without ever exposing it to group/other.

    ``Path.write_bytes`` followed by ``chmod`` leaves a window in which the
    file carries umask-default permissions. Here the file is created 0600
    from the start, and a pre-existing file is tightened before any byte is
    written to it.
    """
    fd = os.open(path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, "wb") as fh:
        # The mode passed to os.open only applies to newly created files.
        if hasattr(os, "fchmod"):
            os.fchmod(fh.fileno(), 0o600)
        else:  # platforms without fchmod
            os.chmod(path, 0o600)
        fh.write(data)


@router.post(
    "/enroll-bundle",
    response_model=EnrollBundleResponse,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def create_enroll_bundle(
    req: EnrollBundleRequest,
    request: Request,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> EnrollBundleResponse:
    # Issue certs for a new worker, render its bootstrap script and tarball,
    # register the SwarmHost row, and hand back a one-line install command.
    #
    # Raises HTTPException(409) when a host named req.agent_name already
    # exists. `admin` is unused in the body; it only enforces the admin gate.
    import uuid as _uuid

    existing = await repo.get_swarm_host_by_name(req.agent_name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled")

    # 1. Issue certs (reuses the same code as /swarm/enroll). The worker's own
    # address is not known yet — the master learns it when the agent fetches
    # the tarball (see get_payload), which also backfills the SwarmHost row.
    ca = pki.ensure_ca()
    sans = list({req.agent_name, req.master_host})
    issued = pki.issue_worker_cert(ca, req.agent_name, sans)
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_issued: Optional[pki.IssuedCert] = None
    updater_fp: Optional[str] = None
    if req.with_updater:
        updater_cn = f"updater@{req.agent_name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
        # Private key: created 0600, never briefly world-readable.
        _write_private_file(updater_dir / "updater.key", updater_issued.key_pem)
        updater_fp = updater_issued.fingerprint_sha256

    # 2. Render payload + bootstrap in memory BEFORE touching the database.
    # If rendering fails (e.g. a missing template) no host row has been
    # created, so the operator can simply retry instead of hitting the 409
    # above on an orphaned row.
    host_uuid = str(_uuid.uuid4())
    tarball = _build_tarball(
        req.master_host, req.agent_name, host_uuid, issued, req.services_ini, updater_issued,
        use_ipvlan=req.use_ipvlan,
    )
    token = secrets.token_urlsafe(24)
    expires_at = datetime.now(timezone.utc) + BUNDLE_TTL

    # Build URLs against the operator-supplied master_host (reachable from the
    # new agent) rather than request.base_url, which reflects how the dashboard
    # user reached us — often 127.0.0.1 behind a proxy or loopback-bound API.
    scheme = request.url.scheme
    port = request.url.port
    netloc = req.master_host if port is None else f"{req.master_host}:{port}"
    base = f"{scheme}://{netloc}"
    tarball_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.tgz"
    bootstrap_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.sh"
    script = _render_bootstrap(req.agent_name, req.master_host, tarball_url, expires_at, req.with_updater)

    # 3. Register the host row so it shows up in SwarmHosts immediately.
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.agent_name,
            "address": "",  # filled in when the agent fetches the .tgz (its source IP)
            "agent_port": 8765,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": "enrolled via UI bundle",
            "use_ipvlan": req.use_ipvlan,
        }
    )

    # 4. Persist both artifacts. The tarball carries private keys, so both
    # files are created 0600 rather than chmod-ed after the fact.
    BUNDLE_DIR.mkdir(parents=True, exist_ok=True, mode=0o700)
    sh_path = BUNDLE_DIR / f"{token}.sh"
    tgz_path = BUNDLE_DIR / f"{token}.tgz"
    _write_private_file(tgz_path, tarball)
    _write_private_file(sh_path, script)

    async with _LOCK:
        _BUNDLES[token] = _Bundle(
            sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at, host_uuid=host_uuid,
        )
    _ensure_sweeper()

    # Only a token prefix is logged; the full token grants the download.
    log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8])

    return EnrollBundleResponse(
        token=token,
        command=f"curl -fsSL {bootstrap_url} | sudo bash",
        expires_at=expires_at,
        host_uuid=host_uuid,
    )
|
||||
|
||||
|
||||
def _now() -> datetime:
    """Return the current timezone-aware UTC time.

    Kept as a separate function so tests can monkeypatch the clock.
    """
    return datetime.now(tz=timezone.utc)
|
||||
|
||||
|
||||
async def _lookup_live(token: str) -> _Bundle:
    """Return the bundle for *token* if it can still be served.

    Raises:
        HTTPException: 404 when the token is unknown, already served, or
            its expiry is not strictly in the future.
    """
    bundle = _BUNDLES.get(token)
    is_live = (
        bundle is not None
        and not bundle.served
        and bundle.expires_at > _now()
    )
    if not is_live:
        raise HTTPException(status_code=404, detail="bundle not found or expired")
    return bundle
|
||||
|
||||
|
||||
@router.get(
    "/enroll-bundle/{token}.sh",
    tags=["Swarm Management"],
    include_in_schema=False,
)
async def get_bootstrap(token: str) -> Response:
    """Serve the bootstrap script for a live token.

    Fetching the script does not mark the bundle served, so it can be
    fetched repeatedly until the tarball is downloaded or the token expires.

    Raises:
        HTTPException: 404 (from ``_lookup_live``) for a dead token.
    """
    # Hold the lock across lookup and read so neither the sweeper nor the
    # .tgz handler can unlink the file in between.
    async with _LOCK:
        bundle = await _lookup_live(token)
        script = bundle.sh_path.read_bytes()
    return Response(content=script, media_type="text/x-shellscript")
|
||||
|
||||
|
||||
@router.get(
    "/enroll-bundle/{token}.tgz",
    tags=["Swarm Management"],
    include_in_schema=False,
)
async def get_payload(
    token: str,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> Response:
    """Serve the one-shot tarball for *token* and retire the bundle.

    Under the registry lock the tarball is read into memory, the bundle is
    marked served, and both on-disk files are removed. Afterwards the
    caller's source IP is written to the SwarmHost row as its address
    (best effort: a failure is logged, not raised).

    Raises:
        HTTPException: 404 (from ``_lookup_live``) when the token is
            unknown, already served, or expired.
    """
    async with _LOCK:
        bundle = await _lookup_live(token)
        # Read BEFORE marking served: if the read fails the token stays live
        # and the agent can retry, instead of being burned with nothing sent.
        data = bundle.tgz_path.read_bytes()
        bundle.served = True
        host_uuid = bundle.host_uuid
        for path in (bundle.sh_path, bundle.tgz_path):
            try:
                path.unlink()
            except FileNotFoundError:
                pass
            except OSError as exc:
                # The payload is already in memory, so a cleanup failure must
                # not fail the download. The sweeper unlinks served bundles
                # again on its next pass.
                log.warning("enroll-bundle unlink failed path=%s err=%s", path, exc)

    # The agent's first connect-back — its source IP is the reachable address
    # the master will later use to probe it. Backfill the SwarmHost row here
    # so the operator sees the real address instead of an empty placeholder.
    client_host = request.client.host if request.client else ""
    if client_host:
        try:
            await repo.update_swarm_host(host_uuid, {"address": client_host})
        except Exception as e:  # noqa: BLE001
            log.warning("enroll-bundle could not backfill address host=%s err=%s", host_uuid, e)

    return Response(content=data, media_type="application/gzip")
|
||||
58
decnet/web/router/swarm_mgmt/api_list_deckies.py
Normal file
58
decnet/web/router/swarm_mgmt/api_list_deckies.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""GET /swarm/deckies — admin-gated list of decky shards across the fleet."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.web.db.models import DeckyShardView
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Management"])
async def list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    # Lists decky shards joined with their host's name/address/status.
    # `host_uuid` is passed to the repository query; `state` is applied here
    # as an exact-match filter. `admin` only enforces the admin gate.
    shards = await repo.list_decky_shards(host_uuid)

    hosts_by_uuid = {}
    for host_row in await repo.list_swarm_hosts():
        hosts_by_uuid[host_row["uuid"]] = host_row

    # Pre-heartbeat fallback — older rows without decky_config can still
    # surface their IP from the master's deploy state snapshot.
    deploy_state = await repo.get_state("deployment") or {}
    cfg_deckies = (deploy_state.get("config") or {}).get("deckies") or []
    fallback_ips: dict[str, str] = {}
    for decky in cfg_deckies:
        decky_name = decky.get("name")
        if decky_name:
            fallback_ips[decky_name] = decky.get("ip")

    def _to_view(shard) -> DeckyShardView:
        # A shard whose host row is gone still renders, with placeholders.
        host = hosts_by_uuid.get(shard["host_uuid"], {})
        return DeckyShardView(
            decky_name=shard["decky_name"],
            decky_ip=shard.get("decky_ip") or fallback_ips.get(shard["decky_name"]),
            host_uuid=shard["host_uuid"],
            host_name=host.get("name") or "<unknown>",
            host_address=host.get("address") or "",
            host_status=host.get("status") or "unknown",
            services=shard.get("services") or [],
            state=shard.get("state") or "pending",
            last_error=shard.get("last_error"),
            compose_hash=shard.get("compose_hash"),
            updated_at=shard["updated_at"],
            hostname=shard.get("hostname"),
            distro=shard.get("distro"),
            archetype=shard.get("archetype"),
            service_config=shard.get("service_config") or {},
            mutate_interval=shard.get("mutate_interval"),
            last_mutated=shard.get("last_mutated") or 0.0,
            last_seen=shard.get("last_seen"),
        )

    return [
        _to_view(shard)
        for shard in shards
        if not state or shard.get("state") == state
    ]
|
||||
60
decnet/web/router/swarm_mgmt/api_list_hosts.py
Normal file
60
decnet/web/router/swarm_mgmt/api_list_hosts.py
Normal file
@@ -0,0 +1,60 @@
|
||||
"""GET /swarm/hosts — admin-gated list of enrolled workers for the dashboard.
|
||||
|
||||
Fans out an ``AgentClient.health()`` probe to each host on every call and
|
||||
updates ``status`` / ``last_heartbeat`` as a side effect. This mirrors how
|
||||
``/swarm-updates/hosts`` probes the updater daemon — the SwarmHosts page
|
||||
polls this endpoint, so probe-on-read is what drives heartbeat freshness
|
||||
in the UI. No separate scheduler needed.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.models import SwarmHostView
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_mgmt.list_hosts")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
async def _probe_and_update(
    host: dict[str, Any], repo: BaseRepository
) -> dict[str, Any]:
    """Probe one host's agent and persist the outcome (best effort).

    A host with no address yet (it has not fetched its tarball) is returned
    untouched, so a fresh enrollment is never recorded as 'unreachable'.
    Otherwise a successful ``health()`` call yields status ``active`` plus a
    fresh ``last_heartbeat``; any exception yields status ``unreachable``.

    Returns:
        The same *host* dict, updated in place only when the new status was
        successfully written to the repository.
    """
    if not host.get("address"):
        return host

    try:
        async with AgentClient(host=host) as agent:
            await agent.health()
    except Exception as exc:  # noqa: BLE001
        log.debug("swarm/hosts probe unreachable host=%s err=%s", host.get("name"), exc)
        patch = {"status": "unreachable"}
    else:
        patch = {"status": "active", "last_heartbeat": datetime.now(timezone.utc)}

    try:
        await repo.update_swarm_host(host["uuid"], patch)
    except Exception as exc:  # noqa: BLE001
        # Keep the returned row consistent with what is actually stored.
        log.warning("swarm/hosts could not persist probe result host=%s err=%s", host.get("name"), exc)
    else:
        host.update(patch)
    return host
|
||||
|
||||
|
||||
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Management"])
async def list_hosts(
    host_status: Optional[str] = None,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    # Lists enrolled hosts, probing every one concurrently on each call.
    # `host_status` is passed straight to the repository query, so it
    # filters on the stored status from before this call's probes.
    # `admin` only enforces the admin gate.
    rows = await repo.list_swarm_hosts(host_status)
    probes = [_probe_and_update(row, repo) for row in rows]
    probed_rows = await asyncio.gather(*probes)
    return [SwarmHostView(**row) for row in probed_rows]
|
||||
150
decnet/web/router/swarm_mgmt/api_teardown_host.py
Normal file
150
decnet/web/router/swarm_mgmt/api_teardown_host.py
Normal file
@@ -0,0 +1,150 @@
|
||||
"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker.
|
||||
|
||||
Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
|
||||
the agent tears down the entire host (all deckies + network); otherwise it
|
||||
tears down that single decky.
|
||||
|
||||
Async-by-default: the endpoint returns 202 the moment the request is
|
||||
accepted and runs the actual agent call + DB cleanup in a background task.
|
||||
That lets the operator queue multiple teardowns in parallel without
|
||||
blocking on slow docker-compose-down cycles on the worker.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.client import AgentClient
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm.teardown")
|
||||
router = APIRouter()
|
||||
|
||||
# Track spawned background tasks so (a) they're not GC'd mid-flight and
|
||||
# (b) tests can drain them deterministically via ``await drain_pending()``.
|
||||
_PENDING: "set[asyncio.Task]" = set()
|
||||
|
||||
|
||||
def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a task and track it in ``_PENDING`` until it ends.

    The set holds a strong reference so the task is not garbage-collected
    mid-flight; the done callback removes it again.
    """
    background = asyncio.create_task(coro)
    _PENDING.add(background)
    background.add_done_callback(_PENDING.discard)
    return background
|
||||
|
||||
|
||||
async def drain_pending() -> None:
    """Await all outstanding teardown tasks. Used by tests.

    Loops until ``_PENDING`` is empty; exceptions raised by the tasks are
    collected by ``gather`` rather than propagated.
    """
    while _PENDING:
        # Snapshot first: done callbacks mutate the set while we wait.
        in_flight = tuple(_PENDING)
        await asyncio.gather(*in_flight, return_exceptions=True)
|
||||
|
||||
|
||||
class TeardownHostRequest(BaseModel):
    # Request body for POST /hosts/{uuid}/teardown.

    # Name of the single decky to tear down; null/omitted means the whole host.
    decky_id: Optional[str] = None
|
||||
|
||||
|
||||
class TeardownHostResponse(BaseModel):
    # 202 acknowledgement returned as soon as a teardown has been queued.

    # Target host, echoed back from the path parameter.
    host_uuid: str
    # Stored host name ("" when the row has none).
    host_name: str
    # Echo of the request's decky_id (None for a whole-host teardown).
    decky_id: Optional[str] = None
    # True once the background task has been spawned.
    accepted: bool
    # Human-readable status, e.g. "teardown queued".
    detail: str
|
||||
|
||||
|
||||
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Set the affected shards to ``state='tearing_down'`` and clear their error.

    Lets the UI show progress while the background teardown runs.

    Args:
        repo: Repository used to list and upsert shards.
        host_uuid: Host whose shards are considered.
        decky_id: If given, only the shard with this ``decky_name`` is
            touched; otherwise every shard of the host is.
    """
    host_shards = await repo.list_decky_shards(host_uuid)
    affected = [
        shard
        for shard in host_shards
        if not decky_id or shard.get("decky_name") == decky_id
    ]
    for shard in affected:
        updated = {**shard, "state": "tearing_down", "last_error": None}
        await repo.upsert_decky_shard(updated)
|
||||
|
||||
|
||||
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Run the remote teardown, then reconcile the shard rows.

    On agent failure the affected shards are set to ``teardown_failed``
    with the error text (capped at 512 characters) and are NOT deleted, so
    the operator can see what went wrong and retry. On success the shard
    rows are deleted. Nothing is re-raised — no caller awaits this task —
    so every failure path only logs.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        error_text = str(exc)[:512]
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                await repo.upsert_decky_shard({
                    **shard,
                    "state": "teardown_failed",
                    "last_error": error_text,
                })
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
    else:
        # Agent call succeeded: drop the matching rows.
        try:
            if decky_id:
                await repo.delete_decky_shard(decky_id)
            else:
                await repo.delete_decky_shards_for_host(host["uuid"])
        except Exception:
            log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
|
||||
|
||||
|
||||
@router.post(
    "/hosts/{uuid}/teardown",
    response_model=TeardownHostResponse,
    status_code=status.HTTP_202_ACCEPTED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Request body or path parameter validation error"},
    },
)
async def teardown_host(
    uuid: str,
    req: TeardownHostRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> TeardownHostResponse:
    # Accepts a teardown request and returns 202 immediately; the agent call
    # and DB cleanup run in a background task. Raises HTTPException(404)
    # for an unknown host. `admin` only enforces the admin gate.
    host = await repo.get_swarm_host_by_uuid(uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="host not found")

    decky_id = req.decky_id
    await _mark_tearing_down(repo, uuid, decky_id)

    # Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the
    # task runs independently of this request's lifecycle — the operator
    # can queue another teardown the moment this one returns 202 without
    # waiting for any per-request cleanup phase.
    _spawn(_run_teardown(host, repo, decky_id))

    return TeardownHostResponse(
        host_uuid=uuid,
        host_name=host.get("name") or "",
        decky_id=decky_id,
        accepted=True,
        detail="teardown queued",
    )
|
||||
23
decnet/web/router/swarm_updates/__init__.py
Normal file
23
decnet/web/router/swarm_updates/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
||||
"""Remote Updates — master dashboard's surface for pushing code to workers.
|
||||
|
||||
These are *not* the swarm-controller's /swarm routes (those run on a
|
||||
separate process, auth-free, internal-only). They live on the main web
|
||||
API, go through ``require_admin``, and are the interface the React
|
||||
dashboard calls to fan updates out to worker ``decnet updater`` daemons
|
||||
via ``UpdaterClient``.
|
||||
|
||||
Mounted under ``/api/v1/swarm-updates`` by the main api router.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_list_host_releases import router as list_host_releases_router
|
||||
from .api_push_update import router as push_update_router
|
||||
from .api_push_update_self import router as push_update_self_router
|
||||
from .api_rollback_host import router as rollback_host_router
|
||||
|
||||
swarm_updates_router = APIRouter(prefix="/swarm-updates")
|
||||
|
||||
swarm_updates_router.include_router(list_host_releases_router)
|
||||
swarm_updates_router.include_router(push_update_router)
|
||||
swarm_updates_router.include_router(push_update_self_router)
|
||||
swarm_updates_router.include_router(rollback_host_router)
|
||||
86
decnet/web/router/swarm_updates/api_list_host_releases.py
Normal file
86
decnet/web/router/swarm_updates/api_list_host_releases.py
Normal file
@@ -0,0 +1,86 @@
|
||||
"""GET /swarm-updates/hosts — per-host updater health + release slots.
|
||||
|
||||
Fans out an ``UpdaterClient.health()`` probe to every enrolled host that
|
||||
has an updater bundle. Each probe is isolated: a single unreachable host
|
||||
never fails the whole list (that's normal partial-failure behaviour for
|
||||
a fleet view).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import HostReleaseInfo, HostReleasesResponse
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_updates.list")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _extract_shas(releases: list[dict[str, Any]]) -> tuple[str | None, str | None]:
    """Return the ``(current, previous)`` SHA pair from an updater release list.

    Entries look like ``{"slot": "active"|"prev", "sha": ...}`` and arrive
    in no guaranteed order, so they are matched by slot name, not position.
    The first entry seen for a slot wins; a slot that is absent yields
    ``None``.
    """
    first_sha_by_slot: dict[str, Any] = {}
    for release in releases:
        slot = release.get("slot")
        if slot in ("active", "prev") and slot not in first_sha_by_slot:
            first_sha_by_slot[slot] = release.get("sha")
            if len(first_sha_by_slot) == 2:
                # Both slots resolved; later entries cannot change the result.
                break
    return first_sha_by_slot.get("active"), first_sha_by_slot.get("prev")
|
||||
|
||||
|
||||
async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
    """Probe one host's updater and summarise its health and release slots.

    Args:
        host: Swarm-host row; ``uuid``, ``name`` and ``address`` are read here
            and the whole row is handed to ``UpdaterClient``.

    Returns:
        A ``HostReleaseInfo``. Transport failures are reported as
        ``reachable=False`` with the exception text in ``detail``. A reply
        that is not a JSON object is reported as ``reachable=True`` with an
        explanatory ``detail`` instead of raising, so one misbehaving host
        cannot fail the whole fan-out.
    """
    identity = {
        "host_uuid": host["uuid"],
        "host_name": host["name"],
        "address": host["address"],
    }
    try:
        async with UpdaterClient(host=host) as u:
            body = await u.health()
    except Exception as exc:  # noqa: BLE001
        return HostReleaseInfo(
            **identity,
            reachable=False,
            detail=f"{type(exc).__name__}: {exc}",
        )

    if not isinstance(body, dict):
        # The host answered, but not with the expected object; calling
        # ``.get`` on it would raise out of the caller's ``asyncio.gather``.
        return HostReleaseInfo(
            **identity,
            reachable=True,
            detail=f"unexpected health payload: {type(body).__name__}",
        )

    raw_releases = body.get("releases") or []
    # Keep only well-formed entries so slot extraction cannot raise.
    releases = (
        [r for r in raw_releases if isinstance(r, dict)]
        if isinstance(raw_releases, list)
        else []
    )
    current, previous = _extract_shas(releases)
    return HostReleaseInfo(
        **identity,
        reachable=True,
        agent_status=body.get("agent_status") or body.get("status"),
        current_sha=current,
        previous_sha=previous,
        releases=releases,
    )
|
||||
|
||||
|
||||
# GET /hosts: probe every updater-capable host concurrently and return one
# HostReleaseInfo per host. (Kept as a comment: FastAPI would publish a
# docstring as the OpenAPI description.)
@router.get(
    "/hosts",
    response_model=HostReleasesResponse,
    tags=["Swarm Updates"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_host_releases(
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> HostReleasesResponse:
    enrolled = await repo.list_swarm_hosts()
    # Skip hosts that cannot receive updates: decommissioned ones and
    # agent-only enrollments (no updater certificate fingerprint).
    probe_targets = []
    for row in enrolled:
        if row.get("status") == "decommissioned":
            continue
        if not row.get("updater_cert_fingerprint"):
            continue
        probe_targets.append(row)
    if not probe_targets:
        return HostReleasesResponse(hosts=[])
    infos = await asyncio.gather(*(_probe_host(row) for row in probe_targets))
    return HostReleasesResponse(hosts=list(infos))
|
||||
163
decnet/web/router/swarm_updates/api_push_update.py
Normal file
163
decnet/web/router/swarm_updates/api_push_update.py
Normal file
@@ -0,0 +1,163 @@
|
||||
"""POST /swarm-updates/push — fan a tarball of the master's tree to workers.
|
||||
|
||||
Mirrors the ``decnet swarm update`` CLI flow: build the tarball once,
|
||||
dispatch concurrently, collect per-host statuses. Returns HTTP 200 even
|
||||
when individual hosts failed — the operator reads per-host ``status``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_updates.push")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _master_tree_root() -> pathlib.Path:
|
||||
"""Resolve the master's install tree to tar.
|
||||
|
||||
Walks up from this file: ``decnet/web/router/swarm_updates/`` → 3 parents
|
||||
lands on the repo root. Matches the layout shipped via ``pip install -e .``
|
||||
and the dev checkout at ``~/Tools/DECNET``.
|
||||
"""
|
||||
return pathlib.Path(__file__).resolve().parents[4]
|
||||
|
||||
|
||||
def _classify_update(status_code: int) -> str:
|
||||
if status_code == 200:
|
||||
return "updated"
|
||||
if status_code == 409:
|
||||
return "rolled-back"
|
||||
return "failed"
|
||||
|
||||
|
||||
async def _resolve_targets(
    repo: BaseRepository,
    req: PushUpdateRequest,
) -> list[dict[str, Any]]:
    """Turn a push request into the list of host rows to update.

    Only hosts with an ``updater_cert_fingerprint`` are eligible. With
    ``req.all`` every eligible, non-decommissioned host is selected;
    otherwise exactly the hosts named in ``req.host_uuids`` are selected.

    Raises:
        HTTPException: 400 when ``all`` and ``host_uuids`` are both given or
            both omitted; 404 when a requested UUID is unknown or has no
            updater bundle, or when nothing is selected.
    """
    if req.all == bool(req.host_uuids):
        raise HTTPException(
            status_code=400,
            detail="Specify exactly one of host_uuids or all=true.",
        )

    updatable = [
        host
        for host in await repo.list_swarm_hosts()
        if host.get("updater_cert_fingerprint")
    ]

    if req.all:
        selected = [h for h in updatable if h.get("status") != "decommissioned"]
    else:
        requested = set(req.host_uuids or [])
        selected = [h for h in updatable if h["uuid"] in requested]
        unresolved = requested.difference(h["uuid"] for h in selected)
        if unresolved:
            raise HTTPException(
                status_code=404,
                detail=f"Unknown or updater-less host(s): {sorted(unresolved)}",
            )

    if not selected:
        raise HTTPException(
            status_code=404,
            detail="No targets: no enrolled hosts have an updater bundle.",
        )
    return selected
|
||||
|
||||
|
||||
async def _push_one(
    host: dict[str, Any],
    tarball: bytes,
    sha: str,
    include_self: bool,
) -> PushUpdateResult:
    """Push the tarball to one host's updater and report the outcome.

    Args:
        host: Swarm-host row; ``uuid`` and ``name`` are echoed in the result.
        tarball: Archive bytes to upload.
        sha: Revision identifier forwarded to the updater.
        include_self: When true and the agent update returned 200, also push
            the same tarball to the updater's own self-update endpoint.

    Returns:
        A ``PushUpdateResult`` whose ``status`` is ``"updated"``,
        ``"rolled-back"``, ``"failed"``, ``"self-updated"`` or
        ``"self-failed"``. This coroutine never raises: any exception is
        logged and reported as ``"failed"``.
    """
    try:
        async with UpdaterClient(host=host) as u:
            r = await u.update(tarball, sha=sha)
            body = r.json() if r.content else {}
            status = _classify_update(r.status_code)
            body_is_dict = isinstance(body, dict)
            stderr = body.get("stderr") if body_is_dict else None

            if include_self and r.status_code == 200:
                # Agent first, updater second — a broken updater push must never
                # strand the fleet on an old agent.
                #
                # The failure reason is captured in its own variable: the name
                # bound by ``except ... as exc`` is deleted when that clause
                # ends, so it cannot be formatted afterwards.
                self_failure = None
                try:
                    rs = await u.update_self(tarball, sha=sha)
                    # 0 = connection dropped (expected)
                    if rs.status_code not in (200, 0):
                        self_failure = f"HTTP {rs.status_code}"
                except Exception as exc:  # noqa: BLE001
                    # Connection drop on update-self is expected and not an error.
                    if not _is_expected_connection_drop(exc):
                        self_failure = f"{type(exc).__name__}: {exc}"
                if self_failure is not None:
                    return PushUpdateResult(
                        host_uuid=host["uuid"], host_name=host["name"],
                        status="self-failed", http_status=r.status_code, sha=sha,
                        detail=f"agent updated OK but self-update failed: {self_failure}",
                        stderr=stderr,
                    )
                status = "self-updated"

            return PushUpdateResult(
                host_uuid=host["uuid"], host_name=host["name"],
                status=status, http_status=r.status_code, sha=sha,
                detail=(body.get("error") or body.get("probe")) if body_is_dict else None,
                stderr=stderr,
            )
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.push failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
            detail=f"{type(exc).__name__}: {exc}",
        )
|
||||
|
||||
|
||||
def _is_expected_connection_drop(exc: BaseException) -> bool:
    """Tell whether *exc* is the httpx error seen when the updater re-execs.

    A self-update makes the updater replace itself mid-response, which httpx
    surfaces as a transport error; those are treated as success by callers.
    """
    # Imported lazily, as in the rest of this helper's history, so importing
    # this module does not require httpx.
    import httpx

    # NOTE(review): ConnectError means no connection could be established, so
    # a host that is simply down also counts as an "expected drop" — confirm
    # that this is intended.
    benign_errors = (
        httpx.RemoteProtocolError,
        httpx.ReadError,
        httpx.ConnectError,
    )
    return isinstance(exc, benign_errors)
|
||||
|
||||
|
||||
# POST /push: build one tarball of the master's tree and push it to every
# resolved target concurrently. Per-host failures are reported in the
# response body, not via the HTTP status.
@router.post(
    "/push",
    response_model=PushUpdateResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No matching target hosts or no updater-capable hosts enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_push_update(
    req: PushUpdateRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
    hosts = await _resolve_targets(repo, req)
    root = _master_tree_root()
    # detect_git_sha and tar_working_tree are synchronous; run them in a
    # worker thread so the event loop keeps serving other requests while the
    # tarball is built.
    sha = await asyncio.to_thread(detect_git_sha, root)
    tarball = await asyncio.to_thread(
        tar_working_tree, root, extra_excludes=req.exclude
    )
    tarball_size = len(tarball)
    log.info(
        "swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s",
        sha or "(not a git repo)", tarball_size, len(hosts), req.include_self,
    )
    pushes = [_push_one(h, tarball, sha, req.include_self) for h in hosts]
    outcomes = await asyncio.gather(*pushes)
    return PushUpdateResponse(
        sha=sha,
        tarball_bytes=tarball_size,
        results=list(outcomes),
    )
|
||||
101
decnet/web/router/swarm_updates/api_push_update_self.py
Normal file
101
decnet/web/router/swarm_updates/api_push_update_self.py
Normal file
@@ -0,0 +1,101 @@
|
||||
"""POST /swarm-updates/push-self — push only to workers' /update-self.
|
||||
|
||||
Use case: the agent is fine but the updater itself needs an upgrade (e.g.
|
||||
a fix to ``executor.py``). Uploading only ``/update-self`` avoids a
|
||||
redundant agent restart on healthy workers.
|
||||
|
||||
No auto-rollback: the updater re-execs itself on success, so a broken
|
||||
push leaves the worker on the old code — verified by polling ``/health``
|
||||
after the request returns.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
from .api_push_update import _is_expected_connection_drop, _master_tree_root, _resolve_targets
|
||||
|
||||
log = get_logger("swarm_updates.push_self")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> PushUpdateResult:
    """Push the tarball to one host's updater self-update endpoint.

    Returns a ``PushUpdateResult`` with status ``"self-updated"`` (HTTP 200,
    or an expected connection drop) or ``"self-failed"`` (any other status or
    error). Never raises; unexpected errors are logged.
    """
    try:
        async with UpdaterClient(host=host) as client:
            try:
                resp = await client.update_self(tarball, sha=sha)
                http_status = resp.status_code
                payload = resp.json() if resp.content else {}
            except Exception as exc:  # noqa: BLE001
                # The updater re-execs itself mid-response, so a dropped
                # connection is the normal success signal; anything else is
                # handed to the outer handler.
                if not _is_expected_connection_drop(exc):
                    raise
                return PushUpdateResult(
                    host_uuid=host["uuid"], host_name=host["name"],
                    status="self-updated", sha=sha,
                    detail="updater re-exec dropped connection (expected)",
                )

        if isinstance(payload, dict):
            detail = payload.get("error") or payload.get("probe")
            stderr = payload.get("stderr")
        else:
            detail = None
            stderr = None
        outcome = "self-updated" if http_status == 200 else "self-failed"
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status=outcome,
            http_status=http_status, sha=sha,
            detail=detail, stderr=stderr,
        )
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="self-failed",
            detail=f"{type(exc).__name__}: {exc}",
        )
|
||||
|
||||
|
||||
# POST /push-self: build one tarball and push it only to each target's
# updater self-update endpoint, concurrently. Per-host outcomes are returned
# in the response body.
@router.post(
    "/push-self",
    response_model=PushUpdateResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No matching target hosts or no updater-capable hosts enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_push_update_self(
    req: PushUpdateRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
    hosts = await _resolve_targets(repo, req)
    root = _master_tree_root()
    # The git lookup and the tar+gzip pass are synchronous; keep them off the
    # event loop.
    sha = await asyncio.to_thread(detect_git_sha, root)
    tarball = await asyncio.to_thread(
        tar_working_tree, root, extra_excludes=req.exclude
    )
    tarball_size = len(tarball)
    log.info(
        "swarm_updates.push_self sha=%s tarball=%d hosts=%d",
        sha or "(not a git repo)", tarball_size, len(hosts),
    )
    pushes = [_push_self_one(h, tarball, sha) for h in hosts]
    outcomes = await asyncio.gather(*pushes)
    return PushUpdateResponse(
        sha=sha,
        tarball_bytes=tarball_size,
        results=list(outcomes),
    )
|
||||
77
decnet/web/router/swarm_updates/api_rollback_host.py
Normal file
77
decnet/web/router/swarm_updates/api_rollback_host.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""POST /swarm-updates/rollback — manual rollback on a single host.
|
||||
|
||||
Calls the worker updater's ``/rollback`` which swaps the ``current``
|
||||
symlink back to ``releases/prev``. Fails with 404 if the target has no
|
||||
previous release slot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.swarm.updater_client import UpdaterClient
|
||||
from decnet.web.db.models import RollbackRequest, RollbackResponse
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo, require_admin
|
||||
|
||||
log = get_logger("swarm_updates.rollback")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# POST /rollback: ask one host's updater to roll back.
#
# Outcomes:
#   - unknown host UUID                       -> HTTP 404
#   - host has no updater cert fingerprint    -> HTTP 400
#   - worker answers 404 (no previous slot)   -> HTTP 404, with the worker's
#     "detail" when present, else a fixed fallback message
#   - transport error / other non-200 status  -> HTTP 200, status="failed"
#   - worker answers 200                      -> HTTP 200, status="rolled-back"
#
# A worker reply that is not valid JSON, or not a JSON object, is treated as
# an empty object so it cannot surface as an unhandled 500.
# (Kept as comments: FastAPI would publish a docstring as the OpenAPI
# description.)
@router.post(
    "/rollback",
    response_model=RollbackResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or host has no updater bundle)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Unknown host, or no previous release slot on the worker"},
        422: {"description": "Request body validation error"},
    },
)
async def api_rollback_host(
    req: RollbackRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> RollbackResponse:
    host = await repo.get_swarm_host_by_uuid(req.host_uuid)
    if host is None:
        raise HTTPException(status_code=404, detail=f"Unknown host: {req.host_uuid}")
    if not host.get("updater_cert_fingerprint"):
        raise HTTPException(
            status_code=400,
            detail=f"Host '{host['name']}' has no updater bundle — nothing to roll back.",
        )

    try:
        async with UpdaterClient(host=host) as u:
            r = await u.rollback()
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
            detail=f"{type(exc).__name__}: {exc}",
        )

    # JSON decode errors are ValueError subclasses; fall back to an empty
    # object rather than failing the request.
    try:
        body = r.json() if r.content else {}
    except ValueError:
        body = {}
    if not isinstance(body, dict):
        body = {}

    if r.status_code == 404:
        # No previous release — surface as 404 so the UI can render the
        # "nothing to roll back" state distinctly from a transport error.
        raise HTTPException(
            status_code=404,
            detail=body.get("detail") or "No previous release on worker.",
        )
    if r.status_code != 200:
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed", http_status=r.status_code,
            detail=body.get("error") or body.get("detail"),
        )
    return RollbackResponse(
        host_uuid=host["uuid"], host_name=host["name"],
        status="rolled-back", http_status=r.status_code,
        detail=body.get("status"),
    )
|
||||
6
decnet/web/router/system/__init__.py
Normal file
6
decnet/web/router/system/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_deployment_mode import router as deployment_mode_router
|
||||
|
||||
# Parent router for the /system endpoints; currently it only carries the
# deployment-mode sub-router.
system_router = APIRouter(
    prefix="/system",
    tags=["System"],
)
system_router.include_router(deployment_mode_router)
|
||||
41
decnet/web/router/system/api_deployment_mode.py
Normal file
41
decnet/web/router/system/api_deployment_mode.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""GET /system/deployment-mode — tells the UI whether a deploy will shard
|
||||
across SWARM workers or land on the master itself.
|
||||
|
||||
Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role
|
||||
plus at least one reachable enrolled worker = swarm; otherwise unihost.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
from pydantic import BaseModel
|
||||
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
from decnet.web.dependencies import get_repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class DeploymentModeResponse(BaseModel):
    # Response body of GET /deployment-mode. Documented with comments rather
    # than a docstring so the generated schema is unchanged.

    # "swarm" or "unihost"
    mode: str
    # "master" or "agent"
    role: str
    # Number of swarm hosts counted by the endpoint (0 unless role is master).
    swarm_host_count: int
|
||||
|
||||
|
||||
# GET /deployment-mode: report the role from the DECNET_MODE environment
# variable (default "master", lower-cased) and, for a master, how many swarm
# hosts have status "active" or "enrolled" and a non-empty address. Any such
# host makes the mode "swarm"; otherwise it is "unihost".
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
async def get_deployment_mode(
    repo: BaseRepository = Depends(get_repo),
) -> DeploymentModeResponse:
    role = os.environ.get("DECNET_MODE", "master").lower()
    usable_hosts = 0
    if role == "master":
        for row in await repo.list_swarm_hosts():
            if row.get("status") in ("active", "enrolled") and row.get("address"):
                usable_hosts += 1
    mode = "swarm" if usable_hosts > 0 else "unihost"
    return DeploymentModeResponse(
        mode=mode,
        role=role,
        swarm_host_count=usable_hosts,
    )
|
||||
55
decnet/web/router/topology/__init__.py
Normal file
55
decnet/web/router/topology/__init__.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""MazeNET topology REST endpoints (phase 3).
|
||||
|
||||
Thin FastAPI layer over the phase-2 topology machinery:
|
||||
generate/validate/deploy/teardown, pending-only child CRUD, and the
|
||||
live-mutation queue for active|degraded topologies.
|
||||
|
||||
Mounted at ``/api/v1/topologies`` by the main api router. Sub-routers
|
||||
live one-per-file and are aggregated here.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_catalog import router as _catalog_router
|
||||
from .api_create_topology import router as _create_router
|
||||
from .api_create_blank_topology import router as _create_blank_router
|
||||
from .api_decky_crud import router as _decky_router
|
||||
from .api_delete_topology import router as _delete_router
|
||||
from .api_deploy_topology import router as _deploy_router
|
||||
from .api_edge_crud import router as _edge_router
|
||||
from .api_events import router as _events_router
|
||||
from .api_get_topology import router as _get_router
|
||||
from .api_lan_crud import router as _lan_router
|
||||
from .api_list_topologies import router as _list_router
|
||||
from .api_mutations import router as _mutations_router
|
||||
from .api_personas import router as _personas_router
|
||||
from .api_reap_orphans import router as _reap_router
|
||||
from .api_teardown_topology import router as _teardown_router
|
||||
|
||||
topology_router = APIRouter(prefix="/topologies", tags=["topologies"])

# Registration order matters: routes are tried in the order they were added
# and the first match wins. The catalog router's literal paths (e.g.
# /services, /next-subnet) must therefore be registered before the
# parameterized `/{topology_id}` route in api_get_topology, which is why the
# catalog router comes first and the getter comes last. The personas router
# (`/{id}/personas`) is likewise kept ahead of the bare `/{id}` getter.
for _sub_router in (
    _catalog_router,
    _list_router,
    _create_blank_router,
    _create_router,
    _reap_router,
    _deploy_router,
    _teardown_router,
    _delete_router,
    _lan_router,
    _decky_router,
    _edge_router,
    _mutations_router,
    _events_router,
    _personas_router,
    _get_router,
):
    topology_router.include_router(_sub_router)
del _sub_router  # loop variable is not part of the module's namespace

__all__ = ["topology_router"]
|
||||
53
decnet/web/router/topology/_guards.py
Normal file
53
decnet/web/router/topology/_guards.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""Shared helpers for the Phase-3 child-CRUD routes."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
from decnet.topology.status import (
|
||||
TopologyNotEditable,
|
||||
TopologyStatus,
|
||||
VersionConflict,
|
||||
)
|
||||
from decnet.web.dependencies import repo
|
||||
|
||||
|
||||
async def get_topology_or_404(topology_id: str) -> dict[str, Any]:
    """Return the topology row for *topology_id*.

    Raises:
        HTTPException: 404 when the repository has no such topology.
    """
    found = await repo.get_topology(topology_id)
    if found is not None:
        return found
    raise HTTPException(status_code=404, detail="Topology not found")
|
||||
|
||||
|
||||
async def assert_pending_or_409(topology_id: str) -> dict[str, Any]:
    """Return the topology row, provided it exists and is still ``pending``.

    Write routes call this up front so they all agree on the pre-condition
    before any side effect, including for the ``add_*`` helpers that do not
    re-check it in the repo layer.

    Raises:
        HTTPException: 404 when the topology does not exist; 409 when it is
            in any state other than ``pending``.
    """
    topo = await get_topology_or_404(topology_id)
    current_status = topo["status"]
    if current_status == TopologyStatus.PENDING:
        return topo
    message = (
        f"Topology is {current_status!r}; free-form child edits are "
        f"pending-only. Use the mutation queue for active topologies."
    )
    raise HTTPException(status_code=409, detail=message)
|
||||
|
||||
|
||||
def map_repo_exception(exc: Exception) -> HTTPException:
    """Build the ``HTTPException`` that corresponds to a repo-layer error.

    ``TopologyNotEditable`` and ``VersionConflict`` become 409, a plain
    ``ValueError`` becomes 400, and anything else a generic 500. The
    exception is returned, not raised.
    """
    # The checks run in this order on purpose: the first matching type wins.
    if isinstance(exc, TopologyNotEditable):
        code, message = 409, str(exc)
    elif isinstance(exc, VersionConflict):
        code = 409
        message = f"Version conflict: expected {exc.expected}, current {exc.current}"
    elif isinstance(exc, ValueError):
        code, message = 400, str(exc)
    else:
        code, message = 500, "Internal error"
    return HTTPException(status_code=code, detail=message)
|
||||
66
decnet/web/router/topology/_target_host.py
Normal file
66
decnet/web/router/topology/_target_host.py
Normal file
@@ -0,0 +1,66 @@
|
||||
"""Shared validation for the ``mode`` / ``target_host_uuid`` pair.
|
||||
|
||||
Called by the two topology-create endpoints
|
||||
(``api_create_topology``, ``api_create_blank_topology``). Kept as a
|
||||
tiny module so the rules stay in one place when Step 6 grows the list
|
||||
(e.g. when we start rejecting hosts that already own a topology).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Optional
|
||||
|
||||
from fastapi import HTTPException
|
||||
|
||||
# Hosts we're willing to route a new topology to. ``enrolled`` is fine
|
||||
# because the agent process has certs and will answer mTLS calls as
|
||||
# soon as it's up; ``active`` means we've seen a heartbeat recently.
|
||||
_ROUTABLE_HOST_STATUSES = {"enrolled", "active"}
|
||||
|
||||
|
||||
async def validate_target_host(
    repo: Any,
    mode: str,
    target_host_uuid: Optional[str],
) -> None:
    """Reject invalid ``mode`` / ``target_host_uuid`` combinations with a 400.

    Accepted:
      - ``mode == "unihost"`` with no ``target_host_uuid``.
      - ``mode == "agent"`` with a ``target_host_uuid`` that names a known
        swarm host whose status is in ``_ROUTABLE_HOST_STATUSES``.

    Everything else — a host given for unihost, a missing or unknown host for
    agent, a host in a non-routable status, or an unrecognised mode — raises
    ``HTTPException(400)``.
    """
    if mode == "unihost":
        if target_host_uuid is not None:
            raise HTTPException(
                status_code=400,
                detail="target_host_uuid is only valid when mode='agent'",
            )
        return

    if mode != "agent":
        # Shouldn't happen — the pydantic pattern should have rejected it.
        raise HTTPException(status_code=400, detail=f"unknown mode {mode!r}")

    if not target_host_uuid:
        raise HTTPException(
            status_code=400,
            detail="mode='agent' requires target_host_uuid",
        )

    host = await repo.get_swarm_host_by_uuid(target_host_uuid)
    if host is None:
        raise HTTPException(
            status_code=400,
            detail=f"unknown swarm host {target_host_uuid!r}",
        )

    host_status = host.get("status")
    if host_status not in _ROUTABLE_HOST_STATUSES:
        raise HTTPException(
            status_code=400,
            detail=(
                f"swarm host {target_host_uuid!r} is "
                f"{host_status!r}; expected one of "
                f"{sorted(_ROUTABLE_HOST_STATUSES)}"
            ),
        )
    return
|
||||
140
decnet/web/router/topology/api_catalog.py
Normal file
140
decnet/web/router/topology/api_catalog.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""Read-only catalog endpoints — services, next-subnet, next-ip.
|
||||
|
||||
These wrap fleet/allocator helpers so the phase-4 canvas UI can lean
|
||||
on the server for allocation instead of shipping the logic client-side.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.archetypes import all_archetypes
|
||||
from decnet.fleet import all_service_names
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.allocator import (
|
||||
AllocatorExhausted,
|
||||
IPAllocator,
|
||||
SubnetAllocator,
|
||||
reserved_subnets,
|
||||
)
|
||||
from decnet.web.db.models import (
|
||||
ArchetypeCatalogResponse,
|
||||
ArchetypeEntry,
|
||||
NextIPResponse,
|
||||
NextSubnetResponse,
|
||||
ServiceCatalogResponse,
|
||||
)
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
# GET /services: return the service names reported by all_service_names().
@router.get(
    "/services",
    tags=["MazeNET Topologies"],
    response_model=ServiceCatalogResponse,
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.catalog.services")
async def api_list_services(
    _viewer: dict = Depends(require_viewer),
) -> ServiceCatalogResponse:
    service_names = all_service_names()
    return ServiceCatalogResponse(services=service_names)
|
||||
|
||||
|
||||
# GET /archetypes: return one ArchetypeEntry per archetype returned by
# all_archetypes(), in that mapping's iteration order.
@router.get(
    "/archetypes",
    tags=["MazeNET Topologies"],
    response_model=ArchetypeCatalogResponse,
    responses={
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.catalog.archetypes")
async def api_list_archetypes(
    _viewer: dict = Depends(require_viewer),
) -> ArchetypeCatalogResponse:
    entries = []
    for archetype in all_archetypes().values():
        entry = ArchetypeEntry(
            slug=archetype.slug,
            display_name=archetype.display_name,
            description=archetype.description,
            # Copied into fresh lists, as the response model is given lists.
            services=list(archetype.services),
            preferred_distros=list(archetype.preferred_distros),
            nmap_os=archetype.nmap_os,
        )
        entries.append(entry)
    return ArchetypeCatalogResponse(archetypes=entries)
|
||||
|
||||
|
||||
# GET /next-subnet: return the next free subnet under `base`, skipping every
# subnet reported by reserved_subnets(repo).
#
# Query parameters:
#   base - either a two-octet prefix ("10.0") or a dotted-quad CIDR
#          ("172.16.0.0/12"); enforced by the pattern below.
# Errors:
#   409 when the allocator has no free subnet left. The HTTPException is
#   chained to the allocator error so the original traceback is preserved.
@router.get(
    "/next-subnet",
    tags=["MazeNET Topologies"],
    response_model=NextSubnetResponse,
    responses={
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "Allocator exhausted"},
    },
)
@_traced("api.topology.catalog.next_subnet")
async def api_next_subnet(
    base: str = Query(
        default="172.16.0.0/12",
        pattern=r"^\d{1,3}\.\d{1,3}(\.\d{1,3}\.\d{1,3}/\d{1,2})?$",
    ),
    _viewer: dict = Depends(require_viewer),
) -> NextSubnetResponse:
    reserved = await reserved_subnets(repo)
    alloc = SubnetAllocator(base_prefix=base, reserved=reserved)
    try:
        subnet = alloc.next_free()
    except AllocatorExhausted as e:
        raise HTTPException(status_code=409, detail=str(e)) from e
    return NextSubnetResponse(subnet=subnet)
|
||||
|
||||
|
||||
# GET /{topology_id}/lans/{lan_id}/next-ip: return the next free IP in a LAN.
#
# Every address a decky of this topology already holds on the LAN (looked up
# by LAN name in its decky_config["ips_by_lan"]) is reserved first, then the
# allocator is asked for the next free address.
#
# Errors:
#   404 when the topology or the LAN does not exist.
#   409 when the LAN's subnet has no free address left (chained to the
#       allocator error).
@router.get(
    "/{topology_id}/lans/{lan_id}/next-ip",
    tags=["MazeNET Topologies"],
    response_model=NextIPResponse,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or LAN not found"},
        409: {"description": "Allocator exhausted"},
    },
)
@_traced("api.topology.catalog.next_ip")
async def api_next_ip(
    topology_id: str,
    lan_id: str,
    _viewer: dict = Depends(require_viewer),
) -> NextIPResponse:
    if await repo.get_topology(topology_id) is None:
        raise HTTPException(status_code=404, detail="Topology not found")
    lans = await repo.list_lans_for_topology(topology_id)
    lan = next((ln for ln in lans if ln["id"] == lan_id), None)
    if lan is None:
        raise HTTPException(status_code=404, detail="LAN not found")

    deckies = await repo.list_topology_deckies(topology_id)
    alloc = IPAllocator(subnet=lan["subnet"])
    for d in deckies:
        config = d.get("decky_config") or {}
        # `or {}` also covers an explicit None stored under the key, which a
        # `.get(key, {})` default would not.
        taken_ip = (config.get("ips_by_lan") or {}).get(lan["name"])
        if not taken_ip:
            continue
        try:
            alloc.reserve(taken_ip)
        except ValueError:
            # Best effort: an address the allocator rejects is skipped
            # instead of failing the lookup.
            continue

    try:
        next_ip = alloc.next_free()
    except AllocatorExhausted as e:
        raise HTTPException(status_code=409, detail=str(e)) from e
    return NextIPResponse(subnet=lan["subnet"], ip=next_ip)
|
||||
123
decnet/web/router/topology/api_create_blank_topology.py
Normal file
123
decnet/web/router/topology/api_create_blank_topology.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""POST /topologies/blank — create an empty editable topology.
|
||||
|
||||
Produces a minimal ``pending`` topology seeded with exactly one DMZ LAN
|
||||
and its mandatory host-gateway decky. Intended for the MazeNET editor
|
||||
landing flow: unlike ``POST /topologies`` (which runs the generator),
|
||||
this endpoint takes no generator parameters and skips the planner
|
||||
entirely. The DMZ+gateway invariant is enforced server-side so the
|
||||
editor never has to special-case a "no DMZ yet" state.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from pydantic import BaseModel, Field as PydanticField
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.allocator import SubnetAllocator, reserved_subnets
|
||||
from decnet.web.db.models import TopologySummary
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
from decnet.web.router.topology._target_host import validate_target_host
|
||||
|
||||
# Module-level router; the POST /blank endpoint below registers on it.
router = APIRouter()
|
||||
|
||||
|
||||
class BlankTopologyRequest(BaseModel):
    """Body for POST /topologies/blank — name plus optional agent pinning."""
    # Topology name; must be 1 to 64 characters long.
    name: str = PydanticField(..., min_length=1, max_length=64)
    # Deployment mode; the pattern admits only "unihost" (the default) or "agent".
    mode: str = PydanticField(default="unihost", pattern=r"^(unihost|agent)$")
    # Host to pin the topology to. Optional at the schema level; the handler
    # checks it together with ``mode`` via ``validate_target_host``.
    target_host_uuid: str | None = PydanticField(default=None)
|
||||
|
||||
|
||||
@router.post(
    "/blank",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or invalid topology name"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "Name collision or subnet pool exhausted"},
    },
)
@_traced("api.topology.create_blank")
async def api_create_blank_topology(
    body: BlankTopologyRequest,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    """Create a ``pending`` topology seeded with one DMZ LAN and its gateway.

    Writes, in order: the topology row, a LAN named ``dmz``, a
    ``dmz-gateway`` decky, and the bridge edge joining the two.

    Args:
        body: Topology name plus optional mode / target-host pinning.
        _admin: Result of the ``require_admin`` dependency; unused here.

    Returns:
        The freshly created topology row as a ``TopologySummary``.

    Raises:
        HTTPException: 409 when no free subnet is available or the topology
            row cannot be created; 500 if the new row cannot be read back.
            Anything raised by ``validate_target_host`` propagates unchanged.
    """
    # 0. Validate mode/host pairing before any writes.
    await validate_target_host(repo, body.mode, body.target_host_uuid)

    # 1. Pick the DMZ subnet *before* the first write. Allocation is the one
    #    step with an expected failure mode (pool exhausted); doing it first
    #    means a 409 here cannot leave behind a topology row with no DMZ.
    try:
        allocator = SubnetAllocator(
            "10.0", reserved=await reserved_subnets(repo)
        )
        subnet = allocator.next_free()
    except RuntimeError as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    # 2. Topology row
    try:
        topology_id = await repo.create_topology(
            {
                "name": body.name,
                "mode": body.mode,
                "target_host_uuid": body.target_host_uuid,
                "status": "pending",
                "config_snapshot": json.dumps({"blank": True}),
            }
        )
    except Exception as exc:  # noqa: BLE001 — surface duplicate-name as 409
        # NOTE(review): every failure is reported as 409, not only a
        # duplicate name; narrowing this needs the repo's exception type.
        raise HTTPException(status_code=409, detail=str(exc)) from exc

    # 3. DMZ LAN on the subnet reserved above
    lan_id = await repo.add_lan(
        {
            "topology_id": topology_id,
            "name": "dmz",
            "subnet": subnet,
            "is_dmz": True,
            "x": 40,
            "y": 40,
        }
    )

    # 4. DMZ-gateway decky — a normal multi-homed bridge decky.
    # `forwards_l3=True` turns on net.ipv4.ip_forward + NET_ADMIN at
    # compose time (see decnet/topology/compose.py). No host-mode,
    # no MACVLAN — the gateway reaches the outside world via Docker
    # port publishing (see composer port emission).
    decky_uuid = await repo.add_topology_decky(
        {
            "topology_id": topology_id,
            "name": "dmz-gateway",
            "services": ["ssh"],
            "decky_config": {
                "archetype": "deaddeck",
                "forwards_l3": True,
            },
            "state": "pending",
            "x": 20,
            "y": 60,
        }
    )

    # 5. Membership edge on the DMZ — is_bridge=True marks this decky
    # as the topology's bridge gateway; forwards_l3 mirrors the decky
    # config so the generator/compose paths stay consistent.
    await repo.add_topology_edge(
        {
            "topology_id": topology_id,
            "decky_uuid": decky_uuid,
            "lan_id": lan_id,
            "is_bridge": True,
            "forwards_l3": True,
        }
    )

    row = await repo.get_topology(topology_id)
    if row is None:  # pragma: no cover — create then vanish
        raise HTTPException(status_code=500, detail="topology insert vanished")
    return TopologySummary(**row)
|
||||
77
decnet/web/router/topology/api_create_topology.py
Normal file
77
decnet/web/router/topology/api_create_topology.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""POST /topologies — generate and persist a new MazeNET topology."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, status
|
||||
from sqlalchemy.exc import IntegrityError
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.allocator import reserved_subnets
|
||||
from decnet.topology.config import TopologyConfig
|
||||
from decnet.topology.generator import generate
|
||||
from decnet.topology.persistence import persist
|
||||
from decnet.web.db.models import TopologyGenerateRequest, TopologySummary
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
from decnet.web.router.topology._target_host import validate_target_host
|
||||
|
||||
# Module-level router; the POST / endpoint below registers on it.
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed or invalid generation parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "Duplicate topology name, or generator could not allocate subnets (exhausted pool)"},
    },
)
@_traced("api.topology.create")
async def api_create_topology(
    body: TopologyGenerateRequest,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    """Generate a topology plan from the request and persist it.

    Args:
        body: Generator parameters plus mode / target-host pinning.
        _admin: Result of the ``require_admin`` dependency; unused here.

    Returns:
        The persisted topology row as a ``TopologySummary``.

    Raises:
        HTTPException: 400 when the config or the generator rejects the
            parameters (``ValueError`` / ``TypeError``); 409 on a planner
            ``RuntimeError`` or a duplicate topology name; 500 if the
            persisted row cannot be read back.
        IntegrityError: Re-raised when it is not the name constraint.
    """
    await validate_target_host(repo, body.mode, body.target_host_uuid)
    try:
        config = TopologyConfig(
            name=body.name,
            mode=body.mode,
            depth=body.depth,
            branching_factor=body.branching_factor,
            deckies_per_lan_min=body.deckies_per_lan_min,
            deckies_per_lan_max=body.deckies_per_lan_max,
            bridge_forward_probability=body.bridge_forward_probability,
            cross_edge_probability=body.cross_edge_probability,
            services_explicit=body.services_explicit,
            randomize_services=body.randomize_services,
            seed=body.seed,
        )
    except (ValueError, TypeError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        plan = generate(config, reserved_subnets=await reserved_subnets(repo))
    except RuntimeError as exc:
        # Subnet allocator exhaustion or similar planner-level failure.
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except (ValueError, TypeError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    try:
        topology_id = await persist(repo, plan, target_host_uuid=body.target_host_uuid)
    except IntegrityError as exc:
        # Unique constraint on topologies.name is the only integrity
        # error the create path can realistically hit — inspecting the
        # constraint name keeps us from silently mapping unrelated
        # integrity failures to 409.
        msg = str(exc.orig) if exc.orig is not None else str(exc)
        if "ix_topologies_name" in msg or "topologies.name" in msg:
            raise HTTPException(
                status_code=409,
                detail=f"A topology named {body.name!r} already exists.",
            ) from exc
        raise
    row = await repo.get_topology(topology_id)
    if row is None:  # pragma: no cover — persist then vanish
        # Without this guard ``TopologySummary(**None)`` raises a bare
        # TypeError; fail with an explicit 500 like the blank endpoint does.
        raise HTTPException(status_code=500, detail="topology insert vanished")
    return TopologySummary(**row)
|
||||
136
decnet/web/router/topology/api_decky_crud.py
Normal file
136
decnet/web/router/topology/api_decky_crud.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""Decky CRUD endpoints — pending-only child mutations.
|
||||
|
||||
POST /topologies/{id}/deckies
|
||||
PATCH /topologies/{id}/deckies/{uuid}
|
||||
DELETE /topologies/{id}/deckies/{uuid}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Response, status
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.status import (
|
||||
TopologyNotEditable,
|
||||
VersionConflict,
|
||||
)
|
||||
from decnet.web.db.models import DeckyCreateRequest, DeckyRow, DeckyUpdateRequest
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
from ._guards import assert_pending_or_409, map_repo_exception
|
||||
|
||||
# Module-level router; the decky POST / PATCH / DELETE endpoints below register on it.
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/{topology_id}/deckies",
    tags=["MazeNET Topologies"],
    response_model=DeckyRow,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or invalid decky fields"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.decky.create")
async def api_create_decky(
    topology_id: str,
    body: DeckyCreateRequest,
    _admin: dict = Depends(require_admin),
) -> DeckyRow:
    """Add a decky to a pending topology and return the stored row.

    Repository errors (``TopologyNotEditable``, ``VersionConflict``,
    ``ValueError``) are translated through ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    new_decky = {
        "topology_id": topology_id,
        "name": body.name,
        "services": body.services,
        "decky_config": body.decky_config,
        "x": body.x,
        "y": body.y,
    }
    try:
        created_uuid = await repo.add_topology_decky(
            new_decky,
            expected_version=body.expected_version,
        )
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    # Read the row back so the response carries what was actually stored.
    for candidate in await repo.list_topology_deckies(topology_id):
        if candidate["uuid"] == created_uuid:
            return DeckyRow(**candidate)
    # pragma: no cover — the row we just inserted is missing
    raise HTTPException(status_code=500, detail="Decky insert vanished")
|
||||
|
||||
|
||||
@router.patch(
    "/{topology_id}/deckies/{decky_uuid}",
    tags=["MazeNET Topologies"],
    response_model=DeckyRow,
    responses={
        400: {"description": "Malformed body"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or decky not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.decky.update")
async def api_update_decky(
    topology_id: str,
    decky_uuid: str,
    body: DeckyUpdateRequest,
    _admin: dict = Depends(require_admin),
) -> DeckyRow:
    """Patch a decky of a pending topology and return the updated row.

    Only fields explicitly set in the body are forwarded to the
    repository; ``expected_version`` is passed separately.

    Raises:
        HTTPException: 404 when the decky is not part of ``topology_id``
            (checked before and after the write) or the repository raises
            ``ValueError``; other repository errors are translated through
            ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # Confirm the decky belongs to *this* topology before writing. The
    # repository update is keyed on the decky uuid alone, so without this
    # check a uuid from another topology would be modified and only then
    # answered with 404. Mirrors the pre-check in the delete endpoint.
    existing = await repo.list_topology_deckies(topology_id)
    if not any(r["uuid"] == decky_uuid for r in existing):
        raise HTTPException(status_code=404, detail="Decky not found")

    fields = body.model_dump(exclude_unset=True, exclude={"expected_version"})
    try:
        await repo.update_topology_decky(
            decky_uuid,
            fields,
            expected_version=body.expected_version,
            enforce_pending=True,
        )
    except (TopologyNotEditable, VersionConflict) as exc:
        raise map_repo_exception(exc) from exc
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    # Re-read so the response reflects the stored state.
    rows = await repo.list_topology_deckies(topology_id)
    row = next((r for r in rows if r["uuid"] == decky_uuid), None)
    if row is None:
        raise HTTPException(status_code=404, detail="Decky not found")
    return DeckyRow(**row)
|
||||
|
||||
|
||||
@router.delete(
    "/{topology_id}/deckies/{decky_uuid}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Malformed path"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or decky not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.decky.delete")
async def api_delete_decky(
    topology_id: str,
    decky_uuid: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    """Remove a decky from a pending topology; answers 204 on success.

    The decky must be listed under ``topology_id``, otherwise 404.
    Repository errors are translated through ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # Membership check: stop at the first row carrying the requested uuid.
    for entry in await repo.list_topology_deckies(topology_id):
        if entry["uuid"] == decky_uuid:
            break
    else:
        raise HTTPException(status_code=404, detail="Decky not found")

    try:
        await repo.delete_topology_decky(decky_uuid)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    no_content = Response(status_code=status.HTTP_204_NO_CONTENT)
    return no_content
|
||||
51
decnet/web/router/topology/api_delete_topology.py
Normal file
51
decnet/web/router/topology/api_delete_topology.py
Normal file
@@ -0,0 +1,51 @@
|
||||
"""DELETE /topologies/{id} — cascade-delete a pending, torn-down, or failed topology."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Response, status
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.status import TopologyStatus
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
# Module-level router; the DELETE /{topology_id} endpoint below registers on it.
router = APIRouter()
|
||||
|
||||
# Only allow delete when containers are guaranteed not to be running.
# ACTIVE / DEPLOYING / DEGRADED / TEARING_DOWN must teardown first.
# FAILED is deletable as well — NOTE(review): presumably a failed deploy
# leaves nothing running; confirm against the deployer's failure path.
_DELETABLE: frozenset[str] = frozenset(
    {TopologyStatus.PENDING, TopologyStatus.TORN_DOWN, TopologyStatus.FAILED}
)
|
||||
|
||||
|
||||
@router.delete(
    "/{topology_id}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology has running resources; teardown first"},
    },
)
@_traced("api.topology.delete")
async def api_delete_topology(
    topology_id: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    """Cascade-delete a topology whose status is in ``_DELETABLE``.

    Answers 404 when the topology does not exist (or disappears before
    the cascade runs) and 409 when its status is not deletable.
    """
    record = await repo.get_topology(topology_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    current = record["status"]
    if current not in _DELETABLE:
        refusal = (
            f"Topology is {current!r}; teardown to 'torn_down' "
            "before delete."
        )
        raise HTTPException(status_code=409, detail=refusal)

    removed = await repo.delete_topology_cascade(topology_id)
    if removed:
        return Response(status_code=status.HTTP_204_NO_CONTENT)
    # Race: row vanished between the status check and the cascade.
    raise HTTPException(status_code=404, detail="Topology not found")
|
||||
76
decnet/web/router/topology/api_deploy_topology.py
Normal file
76
decnet/web/router/topology/api_deploy_topology.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""POST /topologies/{id}/deploy — transition pending → deploying and fire
|
||||
the background deploy.
|
||||
|
||||
The actual Docker work happens in a BackgroundTask so the HTTP caller
|
||||
returns quickly with ``202 Accepted``. Status transitions
|
||||
(``deploying`` → ``active`` | ``failed``) are written by
|
||||
:func:`decnet.engine.deployer.deploy_topology` itself.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
|
||||
|
||||
from decnet.engine.deployer import deploy_topology
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.status import TopologyStatus
|
||||
from decnet.web.db.models import TopologySummary
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
# Module logger; used by the background deploy task to report failures.
log = logging.getLogger(__name__)

# Module-level router; the POST /{topology_id}/deploy endpoint registers on it.
router = APIRouter()
|
||||
|
||||
|
||||
async def _run_deploy(topology_id: str) -> None:
    """Run ``deploy_topology`` as a BackgroundTask body.

    Any ordinary exception is logged and swallowed so the task runner
    doesn't crash; cancellation is re-raised. Status on failure is marked
    by :func:`deploy_topology` via its own exception handler.
    """
    try:
        await deploy_topology(repo, topology_id)
    except asyncio.CancelledError:  # pragma: no cover — shutdown
        raise
    except Exception as err:  # noqa: BLE001
        # Imported only on the failure path.
        from decnet.engine.deployer import _format_subprocess_error

        reason = _format_subprocess_error(err)
        log.error("background deploy of %s failed: %s", topology_id, reason)
|
||||
|
||||
|
||||
@router.post(
    "/{topology_id}/deploy",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_202_ACCEPTED,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology is not in 'pending' status"},
    },
)
@_traced("api.topology.deploy")
async def api_deploy_topology(
    topology_id: str,
    background: BackgroundTasks,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    """Schedule a background deploy for a pending topology.

    Returns the topology row as read *before* the deploy starts, so the
    summary in the 202 response still carries the pre-deploy status.

    NOTE(review): the status check and the scheduling are not atomic in
    this handler — confirm the deployer rejects a second concurrent deploy.
    """
    record = await repo.get_topology(topology_id)
    if record is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    current = record["status"]
    if current != TopologyStatus.PENDING:
        refusal = (
            f"Topology is {current!r}; only 'pending' topologies "
            "can be deployed."
        )
        raise HTTPException(status_code=409, detail=refusal)

    background.add_task(_run_deploy, topology_id)
    return TopologySummary(**record)
|
||||
110
decnet/web/router/topology/api_edge_crud.py
Normal file
110
decnet/web/router/topology/api_edge_crud.py
Normal file
@@ -0,0 +1,110 @@
|
||||
"""Edge CRUD endpoints — pending-only child mutations.
|
||||
|
||||
POST /topologies/{id}/edges
|
||||
DELETE /topologies/{id}/edges/{edge_id}
|
||||
|
||||
Edges are the decky↔LAN membership table (bipartite). Creating an
|
||||
edge attaches a decky to an additional LAN; deleting one detaches.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Response, status
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.status import (
|
||||
TopologyNotEditable,
|
||||
VersionConflict,
|
||||
)
|
||||
from decnet.web.db.models import EdgeCreateRequest, EdgeRow
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
from ._guards import assert_pending_or_409, map_repo_exception
|
||||
|
||||
# Module-level router; the edge POST / DELETE endpoints below register on it.
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/{topology_id}/edges",
    tags=["MazeNET Topologies"],
    response_model=EdgeRow,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or unknown decky/LAN"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.edge.create")
async def api_create_edge(
    topology_id: str,
    body: EdgeCreateRequest,
    _admin: dict = Depends(require_admin),
) -> EdgeRow:
    """Attach a decky to a LAN inside a pending topology.

    Both the decky and the LAN must already belong to ``topology_id``;
    otherwise the request is rejected with 400 before anything is written.
    Repository errors are translated through ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # Referential integrity, part 1: the decky must be in this topology.
    for decky in await repo.list_topology_deckies(topology_id):
        if decky["uuid"] == body.decky_uuid:
            break
    else:
        raise HTTPException(
            status_code=400,
            detail=f"decky {body.decky_uuid!r} not in topology {topology_id!r}",
        )

    # Referential integrity, part 2: so must the LAN.
    for lan in await repo.list_lans_for_topology(topology_id):
        if lan["id"] == body.lan_id:
            break
    else:
        raise HTTPException(
            status_code=400,
            detail=f"lan {body.lan_id!r} not in topology {topology_id!r}",
        )

    new_edge = {
        "topology_id": topology_id,
        "decky_uuid": body.decky_uuid,
        "lan_id": body.lan_id,
        "is_bridge": body.is_bridge,
        "forwards_l3": body.forwards_l3,
    }
    try:
        created_id = await repo.add_topology_edge(
            new_edge,
            expected_version=body.expected_version,
        )
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    # Read the row back so the response carries what was actually stored.
    for candidate in await repo.list_topology_edges(topology_id):
        if candidate["id"] == created_id:
            return EdgeRow(**candidate)
    # pragma: no cover — the row we just inserted is missing
    raise HTTPException(status_code=500, detail="Edge insert vanished")
|
||||
|
||||
|
||||
@router.delete(
    "/{topology_id}/edges/{edge_id}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Malformed path"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or edge not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.edge.delete")
async def api_delete_edge(
    topology_id: str,
    edge_id: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    """Detach a decky from a LAN by deleting the edge; answers 204.

    The edge must be listed under ``topology_id``, otherwise 404.
    Repository errors are translated through ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # Membership check: stop at the first edge carrying the requested id.
    for entry in await repo.list_topology_edges(topology_id):
        if entry["id"] == edge_id:
            break
    else:
        raise HTTPException(status_code=404, detail="Edge not found")

    try:
        await repo.delete_topology_edge(edge_id)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    no_content = Response(status_code=status.HTTP_204_NO_CONTENT)
    return no_content
|
||||
157
decnet/web/router/topology/api_events.py
Normal file
157
decnet/web/router/topology/api_events.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""SSE stream of topology lifecycle events — one connection per editor.
|
||||
|
||||
Subscribes to ``topology.<id>.>`` on the :class:`~decnet.bus.base.BaseBus`
|
||||
for the duration of the request and forwards each matching bus event as
|
||||
a Server-Sent Event to the browser. Emits a one-shot snapshot on connect
|
||||
(current status + any in-flight mutations) so the client doesn't need a
|
||||
separate fetch to initialise the "pending" buffer.
|
||||
|
||||
Authorization matches :mod:`decnet.web.router.stream.api_stream_events`
|
||||
— a JWT passed via the ``?token=`` query parameter (EventSource can't
|
||||
set arbitrary headers) + ``require_stream_viewer`` role gate. The
|
||||
per-topology 404 is enforced after auth so existence probes can't leak
|
||||
a topology id to an unauthenticated caller.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import orjson
|
||||
from fastapi import APIRouter, Depends, Request
|
||||
from fastapi.responses import StreamingResponse
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.app import get_app_bus
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_stream_viewer
|
||||
from decnet.web.sse_limits import sse_connection_slot
|
||||
|
||||
from ._guards import get_topology_or_404
|
||||
|
||||
# Module logger; used to record a crashed event stream.
log = get_logger("api.topology.events")

# Module-level router; the GET /{topology_id}/events endpoint registers on it.
router = APIRouter()

# Seconds to wait for a bus event (or to sleep when there is no bus) before
# a ``: keepalive`` comment frame is sent to the client.
_KEEPALIVE_SECS = 15.0
# Mutation states queried to build the ``in_flight`` list of the snapshot.
_IN_FLIGHT_STATES = ("pending", "applying")
|
||||
|
||||
|
||||
def _format_sse(event_name: str, data: dict) -> str:
    """Render one Server-Sent Event frame.

    The frame is ``event: <name>\\ndata: <json>\\n\\n`` where ``<json>``
    is ``data`` serialised with orjson.
    """
    encoded = orjson.dumps(data).decode()
    frame = f"event: {event_name}\ndata: {encoded}\n\n"
    return frame
|
||||
|
||||
|
||||
@router.get(
    "/{topology_id}/events",
    tags=["MazeNET Topologies"],
    responses={
        200: {
            "content": {"text/event-stream": {}},
            "description": "SSE stream of mutation and status events for one topology",
        },
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        429: {"description": "Per-user SSE connection cap reached"},
    },
)
@_traced("api.topology.events")
async def api_topology_events(
    topology_id: str,
    request: Request,
    user: dict = Depends(require_stream_viewer),
) -> StreamingResponse:
    """Stream one topology's bus events to the client as Server-Sent Events.

    The topology status and the in-flight mutations are read once, before
    streaming starts, and sent as a single ``snapshot`` frame. After that
    every event on ``<TOPOLOGY>.<topology_id>.>`` is forwarded, with a
    ``: keepalive`` comment frame after each ``_KEEPALIVE_SECS`` of silence.

    Args:
        topology_id: Topology to stream; looked up via ``get_topology_or_404``.
        request: Used only to poll for client disconnect.
        user: Result of ``require_stream_viewer``; its ``"uuid"`` keys the
            SSE connection slot.

    Returns:
        A ``text/event-stream`` ``StreamingResponse``.
    """
    # Event types emitted: snapshot, status, mutation.{enqueued,
    # applying,applied,failed}. All wrap bus events whose payload is
    # also reachable via viewer-gated REST (GET /topologies/{id},
    # GET /topologies/{id}/mutations). Adding a new event family here
    # requires a threat-model review for F6/I (role leakage).
    topo = await get_topology_or_404(topology_id)
    snapshot_status = topo["status"]
    in_flight: list[dict] = []
    for state in _IN_FLIGHT_STATES:
        in_flight.extend(await repo.list_topology_mutations(topology_id, state=state))

    async def generator() -> AsyncGenerator[str, None]:
        # NOTE(review): the slot is acquired inside the generator, i.e. once
        # the response body is being iterated — confirm that a cap rejection
        # raised here can still surface as the documented 429.
        async with sse_connection_slot(user["uuid"]):
            # Flush headers immediately so the browser's EventSource sees a
            # live connection before the first real event arrives.
            yield ": keepalive\n\n"

            # One-shot snapshot — pair the current topology status with any
            # mutations the mutator is still holding, so the client buffer
            # can render an accurate "already in flight" state.
            yield _format_sse("snapshot", {
                "topology_id": topology_id,
                "status": snapshot_status,
                "in_flight": in_flight,
            })

            bus = await get_app_bus()
            if bus is None:
                # Bus disabled (NullBus) or unreachable. The snapshot is
                # still useful; we idle on keepalives so the client stays
                # connected and will re-poll on its own timers.
                while not await request.is_disconnected():
                    try:
                        await asyncio.sleep(_KEEPALIVE_SECS)
                    except asyncio.CancelledError:
                        break
                    yield ": keepalive\n\n"
                return

            sub = bus.subscribe(f"{_topics.TOPOLOGY}.{topology_id}.>")
            try:
                async with sub:
                    # Drive the iterator by hand so each wait can be bounded
                    # by the keepalive timeout.
                    sub_iter = sub.__aiter__()
                    while True:
                        if await request.is_disconnected():
                            break
                        next_task = asyncio.ensure_future(sub_iter.__anext__())
                        try:
                            event = await asyncio.wait_for(next_task, timeout=_KEEPALIVE_SECS)
                        except asyncio.TimeoutError:
                            # wait_for has already cancelled the task on
                            # timeout; this cancel is a harmless repeat.
                            # NOTE(review): assumes the subscription's
                            # __anext__ tolerates being cancelled and
                            # called again — confirm against the bus.
                            next_task.cancel()
                            yield ": keepalive\n\n"
                            continue
                        except StopAsyncIteration:
                            # Subscription ended; close the stream.
                            break
                        # Map the bus event onto an SSE ``event:`` name that
                        # the frontend can switch on without parsing topics.
                        yield _format_sse(
                            _sse_name_for(event.topic),
                            {
                                "topic": event.topic,
                                "type": event.type,
                                "ts": event.ts,
                                "payload": event.payload,
                            },
                        )
            except asyncio.CancelledError:
                # Cancellation ends the stream quietly.
                pass
            except Exception:
                log.exception("topology events stream crashed topology_id=%s", topology_id)
                # Tell the client before the stream closes.
                yield _format_sse("error", {"message": "Stream interrupted"})

    return StreamingResponse(
        generator(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "X-Accel-Buffering": "no",
        },
    )
|
||||
|
||||
|
||||
def _sse_name_for(topic: str) -> str:
|
||||
"""Derive an SSE ``event:`` name from a bus topic.
|
||||
|
||||
``topology.<id>.mutation.applied`` → ``mutation.applied``
|
||||
``topology.<id>.status`` → ``status``
|
||||
Anything else is passed through unchanged so future topic families
|
||||
don't silently collapse onto a generic bucket.
|
||||
"""
|
||||
parts = topic.split(".", 2)
|
||||
return parts[2] if len(parts) >= 3 else topic
|
||||
68
decnet/web/router/topology/api_get_topology.py
Normal file
68
decnet/web/router/topology/api_get_topology.py
Normal file
@@ -0,0 +1,68 @@
|
||||
"""GET /topologies/{id} and /topologies/{id}/status-events."""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.persistence import hydrate
|
||||
from decnet.web.db.models import (
|
||||
DeckyRow,
|
||||
EdgeRow,
|
||||
LANRow,
|
||||
TopologyDetail,
|
||||
TopologyStatusEventRow,
|
||||
TopologySummary,
|
||||
)
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
# Module-level router; the two GET endpoints below register on it.
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/{topology_id}",
    tags=["MazeNET Topologies"],
    response_model=TopologyDetail,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.get")
async def api_get_topology(
    topology_id: str,
    _viewer: dict = Depends(require_viewer),
) -> TopologyDetail:
    """Return one topology with its LANs, deckies and edges.

    Answers 404 when ``hydrate`` yields nothing for ``topology_id``.
    """
    bundle = await hydrate(repo, topology_id)
    if bundle is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    # Wrap each raw row of the hydrated bundle in its response model.
    summary = TopologySummary(**bundle["topology"])
    lan_rows = [LANRow(**item) for item in bundle["lans"]]
    decky_rows = [DeckyRow(**item) for item in bundle["deckies"]]
    edge_rows = [EdgeRow(**item) for item in bundle["edges"]]
    return TopologyDetail(
        topology=summary,
        lans=lan_rows,
        deckies=decky_rows,
        edges=edge_rows,
    )
|
||||
|
||||
|
||||
@router.get(
    "/{topology_id}/status-events",
    tags=["MazeNET Topologies"],
    response_model=list[TopologyStatusEventRow],
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.status_events")
async def api_get_status_events(
    topology_id: str,
    limit: int = Query(default=100, ge=1, le=1000),
    _viewer: dict = Depends(require_viewer),
) -> list[TopologyStatusEventRow]:
    """List status events recorded for one topology.

    ``limit`` (1–1000, default 100) is passed straight to the repository.
    Answers 404 when the topology does not exist.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    events: list[TopologyStatusEventRow] = []
    for raw in await repo.list_topology_status_events(topology_id, limit=limit):
        events.append(TopologyStatusEventRow(**raw))
    return events
|
||||
152
decnet/web/router/topology/api_lan_crud.py
Normal file
152
decnet/web/router/topology/api_lan_crud.py
Normal file
@@ -0,0 +1,152 @@
|
||||
"""LAN CRUD endpoints — pending-only child mutations.
|
||||
|
||||
POST /topologies/{id}/lans
|
||||
PATCH /topologies/{id}/lans/{lan_id}
|
||||
DELETE /topologies/{id}/lans/{lan_id}
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Response, status
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.allocator import reserved_subnets
|
||||
from decnet.topology.status import (
|
||||
TopologyNotEditable,
|
||||
VersionConflict,
|
||||
)
|
||||
from decnet.web.db.models import LANCreateRequest, LANRow, LANUpdateRequest
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
from ._guards import assert_pending_or_409, map_repo_exception
|
||||
|
||||
# Module logger for the LAN endpoints.
log = get_logger("api.topology.lan")
# Module-level router; the LAN POST / PATCH / DELETE endpoints register on it.
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/{topology_id}/lans",
    tags=["MazeNET Topologies"],
    response_model=LANRow,
    status_code=status.HTTP_201_CREATED,
    responses={
        400: {"description": "Malformed body or invalid LAN fields"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology not editable, version conflict, or subnet pool exhausted"},
    },
)
@_traced("api.topology.lan.create")
async def api_create_lan(
    topology_id: str,
    body: LANCreateRequest,
    _admin: dict = Depends(require_admin),
) -> LANRow:
    """Add a LAN to a pending topology and return the stored row.

    When the body carries no subnet, the next free one is minted with
    ``SubnetAllocator``.

    Raises:
        HTTPException: 409 when no free subnet is available; 500 if the new
            row cannot be read back. Repository errors
            (``TopologyNotEditable``, ``VersionConflict``, ``ValueError``)
            are translated through ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    subnet = body.subnet
    if subnet is None:
        # Mint a free /24. The allocator scans the claimed set and hands
        # back the next free subnet base — same logic as the catalog
        # /next-subnet endpoint, but inlined so create is atomic.
        from decnet.topology.allocator import SubnetAllocator

        try:
            allocator = SubnetAllocator(
                "10.0", reserved=await reserved_subnets(repo)
            )
            subnet = allocator.next_free()
        except RuntimeError as exc:
            # Pool exhausted: answer 409 like POST /topologies/blank does,
            # rather than letting the error escape as a 500.
            raise HTTPException(status_code=409, detail=str(exc)) from exc

    payload = {
        "topology_id": topology_id,
        "name": body.name,
        "subnet": subnet,
        "is_dmz": body.is_dmz,
        "x": body.x,
        "y": body.y,
    }
    try:
        lan_id = await repo.add_lan(
            payload, expected_version=body.expected_version
        )
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    # Read the row back so the response carries what was actually stored.
    rows = await repo.list_lans_for_topology(topology_id)
    row = next((r for r in rows if r["id"] == lan_id), None)
    if row is None:  # pragma: no cover — would mean insert vanished
        raise HTTPException(status_code=500, detail="LAN insert vanished")

    return LANRow(**row)
|
||||
|
||||
|
||||
@router.patch(
    "/{topology_id}/lans/{lan_id}",
    tags=["MazeNET Topologies"],
    response_model=LANRow,
    responses={
        400: {"description": "Malformed body or invalid LAN fields"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or LAN not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.lan.update")
async def api_update_lan(
    topology_id: str,
    lan_id: str,
    body: LANUpdateRequest,
    _admin: dict = Depends(require_admin),
) -> LANRow:
    """Patch a LAN that belongs to ``topology_id`` and return the updated row.

    Only fields explicitly present in the request body are forwarded to the
    repository (``expected_version`` is passed separately, not as a field).

    Raises:
        HTTPException: 404 when the LAN is not part of this topology or the
            repository reports a ``ValueError``; whatever
            ``map_repo_exception`` produces for ``TopologyNotEditable`` /
            ``VersionConflict``.
    """
    await assert_pending_or_409(topology_id)

    # Confirm the LAN really belongs to the topology named in the URL
    # *before* mutating. ``repo.update_lan`` is keyed on ``lan_id`` alone,
    # so without this check a LAN of a different topology could be edited
    # and the caller would only see a 404 afterwards. Mirrors the
    # membership check performed by the DELETE handler.
    current_rows = await repo.list_lans_for_topology(topology_id)
    if not any(r["id"] == lan_id for r in current_rows):
        raise HTTPException(status_code=404, detail="LAN not found")

    fields = body.model_dump(exclude_unset=True, exclude={"expected_version"})
    try:
        await repo.update_lan(
            lan_id,
            fields,
            expected_version=body.expected_version,
            enforce_pending=True,
        )
    except (TopologyNotEditable, VersionConflict) as exc:
        raise map_repo_exception(exc) from exc
    except ValueError as exc:
        raise HTTPException(status_code=404, detail=str(exc)) from exc

    # Re-read so the response carries the post-update state.
    rows = await repo.list_lans_for_topology(topology_id)
    row = next((r for r in rows if r["id"] == lan_id), None)
    if row is None:
        raise HTTPException(status_code=404, detail="LAN not found")
    return LANRow(**row)
|
||||
|
||||
|
||||
@router.delete(
    "/{topology_id}/lans/{lan_id}",
    tags=["MazeNET Topologies"],
    status_code=status.HTTP_204_NO_CONTENT,
    responses={
        400: {"description": "Cannot delete: LAN has orphan-risking deckies"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology or LAN not found"},
        409: {"description": "Topology not editable or version conflict"},
    },
)
@_traced("api.topology.lan.delete")
async def api_delete_lan(
    topology_id: str,
    lan_id: str,
    _admin: dict = Depends(require_admin),
) -> Response:
    """Delete a LAN from a topology and answer with an empty 204.

    The LAN must be listed under ``topology_id``; otherwise a 404 is raised
    before the repository delete is attempted. Repository errors are
    translated by ``map_repo_exception``.
    """
    await assert_pending_or_409(topology_id)

    # Membership check: stop at the first row whose id matches.
    for lan in await repo.list_lans_for_topology(topology_id):
        if lan["id"] == lan_id:
            break
    else:
        raise HTTPException(status_code=404, detail="LAN not found")

    try:
        await repo.delete_lan(lan_id)
    except (TopologyNotEditable, VersionConflict, ValueError) as exc:
        raise map_repo_exception(exc) from exc

    return Response(status_code=status.HTTP_204_NO_CONTENT)
|
||||
39
decnet/web/router/topology/api_list_topologies.py
Normal file
39
decnet/web/router/topology/api_list_topologies.py
Normal file
@@ -0,0 +1,39 @@
|
||||
"""GET /topologies — paginated list of MazeNET topologies."""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.db.models import TopologyListResponse, TopologySummary
|
||||
from decnet.web.dependencies import repo, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.get(
    "/",
    tags=["MazeNET Topologies"],
    response_model=TopologyListResponse,
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.list")
async def api_list_topologies(
    status: Optional[str] = Query(default=None, description="Filter by topology status"),
    limit: int = Query(default=50, ge=1, le=500),
    offset: int = Query(default=0, ge=0, le=2147483647),
    _viewer: dict = Depends(require_viewer),
) -> TopologyListResponse:
    """Return one page of topologies plus the total count for the filter.

    ``total`` comes from ``repo.count_topologies`` with the same ``status``
    filter; ``limit`` and ``offset`` are echoed back unchanged.
    """
    matching_total = await repo.count_topologies(status=status)
    page_rows = await repo.list_topologies(status=status, limit=limit, offset=offset)

    summaries = []
    for record in page_rows:
        summaries.append(TopologySummary(**record))

    return TopologyListResponse(
        total=matching_total,
        limit=limit,
        offset=offset,
        data=summaries,
    )
|
||||
127
decnet/web/router/topology/api_mutations.py
Normal file
127
decnet/web/router/topology/api_mutations.py
Normal file
@@ -0,0 +1,127 @@
|
||||
"""Live-mutation queue endpoints — for active | degraded topologies.
|
||||
|
||||
POST /topologies/{id}/mutations enqueue one mutation op
|
||||
GET /topologies/{id}/mutations list queued / applied / failed rows
|
||||
|
||||
The mutator worker claims pending rows via ``claim_next_mutation`` and
|
||||
transitions them to ``applying`` → ``applied`` | ``failed``. The API
|
||||
layer only stages rows and reports them back.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query, status
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.app import get_app_bus
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.status import (
|
||||
TopologyStatus,
|
||||
VersionConflict,
|
||||
)
|
||||
from decnet.web.db.models import (
|
||||
MutationEnqueueRequest,
|
||||
MutationEnqueueResponse,
|
||||
MutationRow,
|
||||
)
|
||||
from decnet.web.dependencies import repo, require_admin, require_viewer
|
||||
|
||||
from ._guards import get_topology_or_404, map_repo_exception
|
||||
|
||||
_log = get_logger("api.topology.mutations")
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Topology statuses for which the live-mutation queue accepts new rows.
_MUTATABLE: frozenset[str] = frozenset(
    (TopologyStatus.ACTIVE, TopologyStatus.DEGRADED)
)
|
||||
|
||||
|
||||
@router.post(
    "/{topology_id}/mutations",
    tags=["MazeNET Topologies"],
    response_model=MutationEnqueueResponse,
    status_code=status.HTTP_202_ACCEPTED,
    responses={
        400: {"description": "Malformed body or unknown mutation op"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {
            "description": (
                "Topology is not active|degraded, or version conflict"
            )
        },
    },
)
@_traced("api.topology.mutation.enqueue")
async def api_enqueue_mutation(
    topology_id: str,
    body: MutationEnqueueRequest,
    _admin: dict = Depends(require_admin),
) -> MutationEnqueueResponse:
    """Stage one mutation row for a topology and announce it on the bus.

    Rejects with 409 unless the topology status is in ``_MUTATABLE``.
    ``VersionConflict`` is translated by ``map_repo_exception``; any other
    ``ValueError`` from the repository becomes a 400. The response always
    reports state ``"pending"``.
    """
    topo = await get_topology_or_404(topology_id)
    current_status = topo["status"]
    if current_status not in _MUTATABLE:
        refusal = (
            f"Topology is {current_status!r}; the mutation queue is "
            "only open for 'active' or 'degraded' topologies. Use "
            "child-CRUD endpoints while pending."
        )
        raise HTTPException(status_code=409, detail=refusal)

    try:
        mutation_id = await repo.enqueue_topology_mutation(
            topology_id,
            body.op,
            body.payload,
            expected_version=body.expected_version,
        )
    except VersionConflict as exc:
        raise map_repo_exception(exc) from exc
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    # Fire-and-forget bus publish so the mutator can wake immediately and
    # the SSE route can notify connected editors. Bus failure here must
    # never mask a successful enqueue — the DB row is authoritative.
    bus = await get_app_bus()
    if bus is None:
        return MutationEnqueueResponse(mutation_id=mutation_id, state="pending")

    try:
        await bus.publish(
            _topics.topology_mutation(topology_id, _topics.MUTATION_ENQUEUED),
            {"mutation_id": mutation_id, "op": body.op, "payload": body.payload},
            event_type=_topics.MUTATION_ENQUEUED,
        )
    except Exception as exc:  # noqa: BLE001
        _log.warning("bus publish (enqueued) failed: %s", exc)

    return MutationEnqueueResponse(mutation_id=mutation_id, state="pending")
|
||||
|
||||
|
||||
@router.get(
    "/{topology_id}/mutations",
    tags=["MazeNET Topologies"],
    response_model=list[MutationRow],
    responses={
        400: {"description": "Malformed query parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.mutation.list")
async def api_list_mutations(
    topology_id: str,
    state: Optional[str] = Query(
        default=None,
        description="Filter by state: pending | applying | applied | failed",
    ),
    _viewer: dict = Depends(require_viewer),
) -> list[MutationRow]:
    """List mutation rows for a topology, optionally filtered by ``state``.

    The topology lookup is performed only for its 404 side effect; the
    ``state`` filter is handed to the repository unchanged.
    """
    await get_topology_or_404(topology_id)

    listed: list[MutationRow] = []
    for record in await repo.list_topology_mutations(topology_id, state=state):
        listed.append(MutationRow(**record))
    return listed
|
||||
131
decnet/web/router/topology/api_personas.py
Normal file
131
decnet/web/router/topology/api_personas.py
Normal file
@@ -0,0 +1,131 @@
|
||||
"""GET/PUT ``/topologies/{id}/personas`` — per-topology email persona pool.
|
||||
|
||||
The global pool (``decnet/web/router/emailgen/api_personas.py``) drives
|
||||
non-MazeNET fleet/SWARM-shard mail deckies. MazeNET topology mail
|
||||
deckies use ``Topology.email_personas`` instead — one JSON-serialized
|
||||
list per topology, parsed by the emailgen scheduler each tick.
|
||||
|
||||
This endpoint is the API surface behind the dashboard's per-topology
|
||||
"Personas" editor. Reads accept admin or viewer; writes are admin-only.
|
||||
|
||||
Concurrency: last-write-wins. The list is operator-curated and small
|
||||
(typically <20 entries); no need for optimistic versioning here.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.realism.personas import EmailPersona, parse_personas
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import repo, require_admin, require_viewer
|
||||
|
||||
router = APIRouter()
|
||||
log = get_logger("api.topology.personas")
|
||||
|
||||
|
||||
def _serialize(personas: list[EmailPersona]) -> list[dict[str, Any]]:
    """Dump each persona to a plain dict, keeping ``None``-valued fields."""
    dumped: list[dict[str, Any]] = []
    for persona in personas:
        dumped.append(persona.model_dump(exclude_none=False))
    return dumped
|
||||
|
||||
|
||||
@router.get(
    "/{topology_id}/personas",
    tags=["MazeNET Topologies"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.list_personas")
async def list_topology_personas(
    topology_id: str,
    _viewer: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Return the topology's persona list together with its language default.

    ``language_default`` (falling back to ``"en"`` when the topology row has
    none) is part of the response so the editor can show which language
    unset entries resolve to; the same value is handed to
    ``parse_personas``.
    """
    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    fallback_language = topo.get("language_default") or "en"
    stored_personas = topo.get("email_personas")
    parsed = parse_personas(stored_personas, language_default=fallback_language)

    response: dict[str, Any] = {"topology_id": topology_id}
    response["topology_name"] = topo.get("name", "")
    response["language_default"] = fallback_language
    response["personas"] = _serialize(parsed)
    return response
|
||||
|
||||
|
||||
@router.put(
    "/{topology_id}/personas",
    tags=["MazeNET Topologies"],
    responses={
        400: {"description": "Invalid persona payload"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
    },
)
@_traced("api.topology.replace_personas")
async def replace_topology_personas(
    topology_id: str,
    body: dict[str, Any],
    user: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Overwrite the topology's persona list with the submitted one.

    Expected body: ``{"personas": [<EmailPersona>, ...]}``.

    Entries are run through ``parse_personas``. A non-empty submission that
    yields no parsed personas at all is rejected with 400 so that a schema
    mistake cannot silently wipe the stored list; an empty list is accepted
    and stored as-is.
    """
    submitted = body.get("personas")
    if not isinstance(submitted, list):
        raise HTTPException(
            status_code=400, detail="body.personas must be a list",
        )

    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")
    fallback_language = topo.get("language_default") or "en"

    accepted = parse_personas(submitted, language_default=fallback_language)
    everything_rejected = bool(submitted) and not accepted
    if everything_rejected:
        raise HTTPException(
            status_code=400,
            detail=(
                "All persona entries failed validation. Required fields: "
                "name, email (user@host.tld), role, tone, mannerisms."
            ),
        )

    serialized = _serialize(accepted)
    stored = await repo.set_topology_email_personas(
        topology_id, json.dumps(serialized, ensure_ascii=False),
    )
    if not stored:
        # Race: row vanished between the get and the update.
        raise HTTPException(status_code=404, detail="Topology not found")

    actor = user.get("username", user.get("uuid"))
    log.info(
        "api.topology.replace_personas user=%s topology=%s wrote=%d",
        actor, topology_id, len(accepted),
    )

    response: dict[str, Any] = {"topology_id": topology_id}
    response["topology_name"] = topo.get("name", "")
    response["language_default"] = fallback_language
    response["personas"] = serialized
    return response
|
||||
48
decnet/web/router/topology/api_reap_orphans.py
Normal file
48
decnet/web/router/topology/api_reap_orphans.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""POST /topologies/reap-orphans — remove Docker resources for topology
|
||||
ids the DB no longer knows about.
|
||||
|
||||
A topology row deleted outside the teardown flow (operator error,
|
||||
crashed master, direct DB edit) leaves its containers and bridge
|
||||
networks behind. The orphan networks keep their IPAM pools, so the
|
||||
next deploy at the same subnet hits a 403 ``Pool overlaps`` from the
|
||||
Docker daemon.
|
||||
|
||||
This endpoint walks the local Docker daemon, computes the set of
|
||||
topology prefixes still known to the repo, and force-removes every
|
||||
container + network whose prefix is orphaned. Resources belonging to
|
||||
live topologies are never touched.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from decnet.engine.reaper import reap_orphan_topology_resources
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.db.models import ReapReportResponse
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
@router.post(
    "/reap-orphans",
    tags=["MazeNET Topologies"],
    response_model=ReapReportResponse,
    responses={
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
@_traced("api.topology.reap_orphans")
async def api_reap_orphans(
    _admin: dict = Depends(require_admin),
) -> dict:
    """Run the orphan reaper and return its report as a plain dict.

    The sweep itself is delegated to ``reap_orphan_topology_resources``;
    this handler only converts the resulting report with ``to_dict()``.
    """
    return (await reap_orphan_topology_resources(repo)).to_dict()
|
||||
79
decnet/web/router/topology/api_teardown_topology.py
Normal file
79
decnet/web/router/topology/api_teardown_topology.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""POST /topologies/{id}/teardown — transition an active/degraded/failed
|
||||
topology to ``tearing_down`` and fire the background teardown.
|
||||
|
||||
Mirrors :mod:`api_deploy_topology`: the real Docker work runs in a
|
||||
BackgroundTask, the caller returns ``202 Accepted``, and
|
||||
:func:`decnet.engine.deployer.teardown_topology` writes the terminal
|
||||
``torn_down`` status when it finishes.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
|
||||
|
||||
from decnet.engine.deployer import teardown_topology
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.topology.status import TopologyStatus
|
||||
from decnet.web.db.models import TopologySummary
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Statuses from which a teardown request is accepted, i.e. those that may
# legally transition to TEARING_DOWN (see decnet.topology.status._LEGAL).
_TEARDOWNABLE: frozenset[str] = frozenset(
    (
        TopologyStatus.ACTIVE,
        TopologyStatus.DEGRADED,
        TopologyStatus.FAILED,
        TopologyStatus.DEPLOYING,
    )
)
|
||||
|
||||
|
||||
async def _run_teardown(topology_id: str) -> None:
    """Run ``teardown_topology`` as a background task, logging any failure.

    Cancellation is re-raised untouched. Every other exception is logged
    and swallowed, because nothing awaits this task and an escaping error
    would have no caller to handle it.
    """
    try:
        await teardown_topology(repo, topology_id)
    except asyncio.CancelledError:  # pragma: no cover — shutdown
        raise
    except Exception as exc:  # noqa: BLE001
        # exc_info keeps the traceback: str(exc) alone is often empty or
        # too terse to diagnose a failed background teardown.
        log.error(
            "background teardown of %s failed: %s",
            topology_id,
            exc,
            exc_info=True,
        )
|
||||
|
||||
|
||||
@router.post(
    "/{topology_id}/teardown",
    tags=["MazeNET Topologies"],
    response_model=TopologySummary,
    status_code=status.HTTP_202_ACCEPTED,
    responses={
        400: {"description": "Malformed path parameters"},
        401: {"description": "Missing or invalid credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Topology not found"},
        409: {"description": "Topology cannot be torn down from its current status"},
    },
)
@_traced("api.topology.teardown")
async def api_teardown_topology(
    topology_id: str,
    background: BackgroundTasks,
    _admin: dict = Depends(require_admin),
) -> TopologySummary:
    """Schedule a background teardown and return the topology summary.

    The request is refused with 409 unless the topology's status is in
    ``_TEARDOWNABLE``. The returned summary is built from the row as read
    before the background task was scheduled.
    """
    topo = await repo.get_topology(topology_id)
    if topo is None:
        raise HTTPException(status_code=404, detail="Topology not found")

    if topo["status"] in _TEARDOWNABLE:
        background.add_task(_run_teardown, topology_id)
        return TopologySummary(**topo)

    refusal = (
        f"Topology is {topo['status']!r}; cannot teardown "
        f"(allowed from: {sorted(_TEARDOWNABLE)})."
    )
    raise HTTPException(status_code=409, detail=refusal)
|
||||
6
decnet/web/router/transcripts/__init__.py
Normal file
6
decnet/web/router/transcripts/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_get_transcript import router as transcript_router
|
||||
|
||||
transcripts_router = APIRouter()
|
||||
transcripts_router.include_router(transcript_router)
|
||||
243
decnet/web/router/transcripts/api_get_transcript.py
Normal file
243
decnet/web/router/transcripts/api_get_transcript.py
Normal file
@@ -0,0 +1,243 @@
|
||||
"""
|
||||
Paged asciinema v2 transcript endpoint.
|
||||
|
||||
Transcripts are stored as one JSONL day-shard per (decky, UTC day) under
|
||||
/var/lib/decnet/artifacts/{decky}/{service}/transcripts/sessions-YYYY-MM-DD.jsonl
|
||||
Each line carries a ``sid`` tag; multiple concurrent sessions interleave into
|
||||
the same shard (O_APPEND + sub-PIPE_BUF writes keep lines atomic — see
|
||||
decnet/templates/_shared/sessrec/sessrec.c for the guarantee).
|
||||
|
||||
Rather than scanning the whole shard on every request, the first hit for a
|
||||
given (shard path, mtime) builds an in-memory index of ``sid → [byte offsets]``
|
||||
by one pass. Subsequent paged reads pread() exact line slices in O(limit).
|
||||
Index is bounded by the disk-free precheck (< 200 MB free → no recording)
|
||||
and the 10 MB per-session cap.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.dependencies import require_admin, repo
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
# Base directory for transcript shards; overridable via the environment.
ARTIFACTS_ROOT = Path(os.getenv("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"))

# Validation patterns for the path components derived from request/log data.
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
_SID_RE = re.compile(r"^[a-f0-9-]{36}$")
_SERVICE_RE = re.compile(r"^(ssh|telnet)$")
# Shard filename is built by sessrec from UTC date — keep the charset tight
# so a forged shard_path in the Log row can't traverse.
_SHARD_BASENAME_RE = re.compile(r"^sessions-\d{4}-\d{2}-\d{2}\.jsonl$")
|
||||
|
||||
# (path, mtime_ns) → {sid: [(offset, length), ...]}
_INDEX_CACHE: "OrderedDict[tuple[str, int], dict[str, list[tuple[int, int]]]]" = OrderedDict()
_CACHE_MAX = 32


def _get_index(path: Path) -> tuple[dict[str, list[tuple[int, int]]], int]:
    """Return the per-sid line index for a shard, building it on first use.

    The index maps each session id found in the file to the
    ``(byte offset, byte length)`` of every line tagged with that sid, in
    file order. Results are cached by ``(path, st_mtime_ns)`` in an LRU of
    at most ``_CACHE_MAX`` entries.

    Args:
        path: Shard file to index.

    Returns:
        ``(index, size)`` where ``size`` is ``st_size`` from the stat taken
        before the file was read.

    Raises:
        OSError: If the file cannot be stat'ed or opened.
    """
    st = path.stat()
    path_key = str(path)
    key = (path_key, st.st_mtime_ns)

    cached = _INDEX_CACHE.get(key)
    if cached is not None:
        _INDEX_CACHE.move_to_end(key)
        return cached, st.st_size

    # The pattern is searched anywhere in the line and tolerates whitespace
    # around the colon. Bound once, outside the per-line loop.
    find_sid = re.compile(rb'"sid"\s*:\s*"([a-f0-9-]{36})"').search

    index: dict[str, list[tuple[int, int]]] = {}
    with path.open("rb") as f:
        offset = 0
        for line in f:
            length = len(line)
            m = find_sid(line)
            if m:
                sid = m.group(1).decode("ascii")
                index.setdefault(sid, []).append((offset, length))
            offset += length

    # Drop superseded generations of this same shard. A shard that is still
    # being appended to gets a new mtime on every write; without this, its
    # stale indexes would pile up and push other shards out of the LRU.
    for stale in [k for k in _INDEX_CACHE if k[0] == path_key]:
        del _INDEX_CACHE[stale]

    _INDEX_CACHE[key] = index  # new keys land at the most-recent end
    while len(_INDEX_CACHE) > _CACHE_MAX:
        _INDEX_CACHE.popitem(last=False)
    return index, st.st_size
|
||||
|
||||
|
||||
def _validate_names(decky: str, service: str) -> None:
    """Raise a 400 unless both the decky and service names are well-formed.

    The decky name is checked first, so it wins when both are invalid.
    """
    checks = (
        (_DECKY_RE, decky, "invalid decky name"),
        (_SERVICE_RE, service, "invalid service"),
    )
    for pattern, value, message in checks:
        if pattern.fullmatch(value) is None:
            raise HTTPException(status_code=400, detail=message)
|
||||
|
||||
|
||||
def _resolve_shard(decky: str, service: str, shard_name: str) -> Path:
    """Build the resolved shard path and refuse anything outside the root.

    All three components are validated against their patterns first; the
    resolved path must then be the artifacts root itself or lie beneath it.
    Each failure raises a 400.
    """
    _validate_names(decky, service)
    if _SHARD_BASENAME_RE.fullmatch(shard_name) is None:
        raise HTTPException(status_code=400, detail="invalid shard name")

    root = ARTIFACTS_ROOT.resolve()
    candidate = root.joinpath(decky, service, "transcripts", shard_name).resolve()

    stays_inside = candidate == root or root in candidate.parents
    if not stays_inside:
        raise HTTPException(status_code=400, detail="path escapes artifacts root")
    return candidate
|
||||
|
||||
|
||||
def _find_shard_with_sid(decky: str, service: str, sid: str) -> Path | None:
    """Locate the shard under the decky's transcripts dir that holds ``sid``.

    Used when the Log row names no usable shard (``fields.shard_path``
    missing) or the named shard is gone. Shards are tried newest filename
    first and each is consulted through ``_get_index``, so repeated lookups
    benefit from its cache. Returns ``None`` when nothing matches or the
    directory / a shard cannot be read.
    """
    _validate_names(decky, service)

    root = ARTIFACTS_ROOT.resolve()
    transcripts_dir = root.joinpath(decky, service, "transcripts").resolve()
    if root not in transcripts_dir.parents:
        return None

    # Absent dir, or dir the API process can't stat/read — treat as
    # "no transcript", not as a 500 traceback. Most commonly the decky
    # container wrote this tree as a container-side uid that the API
    # (running under --user / --group) can't cross.
    try:
        if not transcripts_dir.is_dir():
            return None
        entries = list(transcripts_dir.iterdir())
    except OSError:  # PermissionError is an OSError subclass
        return None

    shards = [p for p in entries if _SHARD_BASENAME_RE.fullmatch(p.name)]
    shards.sort(reverse=True)  # newest day first

    for shard in shards:
        try:
            sid_index, _ = _get_index(shard)
        except OSError:
            # Unreadable shard: skip it and keep looking.
            continue
        if sid in sid_index:
            return shard
    return None
|
||||
|
||||
|
||||
def _read_session_page(
    path: Path,
    lines_meta: list[tuple[int, int]],
    offset: int,
    limit: int,
) -> tuple[Any, list[list[Any]], int, bool]:
    """Read one session's lines from a shard and page its events.

    Returns ``(header, events, total_events, truncated)``. Lines that are
    not valid JSON objects are skipped; ``hdr`` and ``trunc`` lines are not
    counted as events. Events inside the requested window that lack
    ``t``, ``ch`` or ``d`` are counted in the total but left out of the page.
    """
    header: Any = {}
    events: list[list[Any]] = []
    truncated = False
    total_events = 0
    window_end = offset + limit

    with path.open("rb") as f:
        for off, ln in lines_meta:
            f.seek(off)
            raw = f.read(ln)
            try:
                obj = json.loads(raw)
            except ValueError:
                continue
            if not isinstance(obj, dict):
                # The sid pattern matched but the line is not an object.
                continue
            if "hdr" in obj:
                header = obj["hdr"]
                continue
            if obj.get("trunc"):
                truncated = True
                continue

            position = total_events
            total_events += 1
            if not (offset <= position < window_end):
                continue
            t = obj.get("t")
            ch = obj.get("ch")
            d = obj.get("d")
            if t is None or ch is None or d is None:
                continue
            events.append([t, ch, d])

    return header, events, total_events, truncated


@router.get(
    "/transcripts/{decky}/{sid}",
    tags=["Transcripts"],
    responses={
        400: {"description": "Invalid decky or sid parameter"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Transcript not found"},
    },
)
@_traced("api.get_transcript")
async def get_transcript(
    decky: str,
    sid: str,
    offset: int = Query(0, ge=0, le=2147483647),
    limit: int = Query(500, ge=1, le=5000),
    admin: dict = Depends(require_admin),
) -> dict[str, Any]:
    """Return one page of a recorded session's transcript events.

    The session's Log row supplies the service and, optionally, the shard
    filename. When no usable shard is named or it is not on disk, every
    shard of the decky is scanned for the sid.

    Raises:
        HTTPException: 400 for a malformed decky/sid/service/shard name;
            404 when the session, its shard, or its sid within the shard
            cannot be found or read.
    """
    if not _DECKY_RE.fullmatch(decky):
        raise HTTPException(status_code=400, detail="invalid decky name")
    if not _SID_RE.fullmatch(sid):
        raise HTTPException(status_code=400, detail="invalid sid")

    log = await repo.get_session_log(sid)
    if not log:
        raise HTTPException(status_code=404, detail="session not found")

    try:
        fields = json.loads(log.get("fields") or "{}")
    except (ValueError, TypeError):
        fields = {}
    if not isinstance(fields, dict):
        # Valid JSON but not an object (e.g. a list): nothing to read from it.
        fields = {}

    service = fields.get("service") or log.get("service")
    shard_path_field = fields.get("shard_path") or ""
    if not isinstance(shard_path_field, str):
        shard_path_field = ""
    shard_name = Path(shard_path_field).name
    log_decky = log.get("decky") or fields.get("decky")

    if log_decky and log_decky != decky:
        raise HTTPException(status_code=404, detail="session not found")

    # Fast path: the Log row carries a fields.shard_path we can validate
    # and hit directly. Falls back to scanning all shards when the SD
    # didn't include one (current sessrec.c doesn't emit shard_path) or
    # when the named shard isn't on disk anymore.
    path: Path | None = None
    if _SHARD_BASENAME_RE.fullmatch(shard_name or ""):
        candidate = _resolve_shard(decky, service or "", shard_name)
        try:
            if candidate.is_file():
                path = candidate
        except OSError:
            # Unreadable (e.g. permission denied): same "no transcript"
            # treatment as the scan fallback, rather than a 500.
            path = None
    if path is None:
        path = _find_shard_with_sid(decky, service or "", sid)
    if path is None:
        raise HTTPException(status_code=404, detail="transcript not found")

    try:
        index, _size = _get_index(path)
    except OSError as exc:
        # Shard vanished or became unreadable after it was located.
        raise HTTPException(status_code=404, detail="transcript not found") from exc
    lines_meta = index.get(sid)
    if not lines_meta:
        raise HTTPException(status_code=404, detail="sid not present in shard")

    # NOTE(review): the file reads below are synchronous inside an async
    # handler; consider offloading to a worker thread if shards grow large.
    try:
        header, events, total_events, truncated = _read_session_page(
            path, lines_meta, offset, limit,
        )
    except OSError as exc:
        raise HTTPException(status_code=404, detail="transcript not found") from exc

    return {
        "sid": sid,
        "service": service,
        "header": header,
        "events": events,
        "offset": offset,
        "limit": limit,
        "total": total_events,
        "has_more": (offset + limit) < total_events,
        "truncated": truncated,
    }
|
||||
18
decnet/web/router/webhooks/__init__.py
Normal file
18
decnet/web/router/webhooks/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
||||
"""Webhook subscription CRUD.
|
||||
|
||||
Admin-gated management of external-egress webhook subscriptions. The
|
||||
actual delivery happens in the `decnet webhook` worker, which watches
|
||||
the DB + bus and POSTs matching events out. This module is the API
|
||||
surface operators use to configure destinations.
|
||||
|
||||
Mounted under `/api/v1/webhooks` by the main api router.
|
||||
"""
|
||||
from fastapi import APIRouter
|
||||
|
||||
from .api_manage_webhooks import router as manage_webhooks_router
|
||||
from .api_test_webhook import router as test_webhook_router
|
||||
|
||||
webhooks_router = APIRouter(prefix="/webhooks")
|
||||
|
||||
webhooks_router.include_router(manage_webhooks_router)
|
||||
webhooks_router.include_router(test_webhook_router)
|
||||
231
decnet/web/router/webhooks/api_manage_webhooks.py
Normal file
231
decnet/web/router/webhooks/api_manage_webhooks.py
Normal file
@@ -0,0 +1,231 @@
|
||||
"""Webhook subscription CRUD — admin-gated."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import secrets
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.app import get_app_bus
|
||||
from decnet.logging import get_logger
|
||||
from decnet.telemetry import traced as _traced
|
||||
from decnet.web.db.models import (
|
||||
MessageResponse,
|
||||
WebhookCreateRequest,
|
||||
WebhookCreateResponse,
|
||||
WebhookResponse,
|
||||
WebhookUpdateRequest,
|
||||
)
|
||||
from decnet.web.db.models.webhooks import _row_to_response_dict
|
||||
from decnet.web.dependencies import repo, require_admin
|
||||
from decnet.webhook.enums import merge_patterns
|
||||
|
||||
# Router that collects the webhook CRUD endpoints declared in this module.
router = APIRouter()

# Module logger, registered under the "api.webhooks" name.
log = get_logger("api.webhooks")
|
||||
|
||||
|
||||
async def _notify_subscriptions_changed() -> None:
    """Announce on the app bus that the webhook subscription set changed.

    Publishes an empty payload to ``_topics.WEBHOOK_SUBSCRIPTIONS_CHANGED``
    with ``event_type="changed"``. The call is best-effort: when no bus is
    available nothing is published, and any exception raised while fetching
    the bus or publishing is logged at warning level and swallowed.

    A lost signal is considered recoverable — per the bus contract the
    webhook worker also reloads its subscriptions on a slow timer.
    """
    try:
        app_bus = await get_app_bus()
        if app_bus is not None:
            await app_bus.publish(
                _topics.WEBHOOK_SUBSCRIPTIONS_CHANGED,
                {},
                event_type="changed",
            )
    except Exception as exc:  # noqa: BLE001 — bus failures must not break CRUD
        log.warning("webhook subscriptions-changed publish failed: %s", exc)
|
||||
|
||||
|
||||
def _row_to_response(row: dict[str, Any]) -> WebhookResponse:
    """Build a ``WebhookResponse`` from a subscription row.

    The row is first normalised by ``_row_to_response_dict``; the resulting
    mapping is passed to the response model as keyword arguments.
    """
    response_fields = _row_to_response_dict(row)
    return WebhookResponse(**response_fields)
|
||||
|
||||
|
||||
@router.post(
    "/",
    tags=["Webhooks"],
    response_model=WebhookCreateResponse,
    status_code=201,
    responses={
        400: {"description": "At least one of simple_events / topic_patterns required"},
        409: {"description": "Name already in use"},
    },
)
@_traced("api.webhook.create")
async def api_create_webhook(
    req: WebhookCreateRequest,
    admin: dict = Depends(require_admin),
) -> WebhookCreateResponse:
    """Create a webhook subscription and return it together with its secret.

    Args:
        req: Creation payload (name, url, optional secret, enabled flag,
            and the ``simple_events`` / ``topic_patterns`` selectors).
        admin: Result of the ``require_admin`` dependency; unused in the
            body, present only to gate the endpoint.

    Returns:
        The stored subscription's response fields plus ``secret`` — either
        the caller-supplied value or the one generated here.

    Raises:
        HTTPException: 400 when the merged pattern set is empty, 409 when
            the name is already taken, 500 when the row cannot be read
            back after creation.
    """
    merged_patterns = merge_patterns(req.simple_events, req.topic_patterns)
    if not merged_patterns:
        raise HTTPException(
            status_code=400,
            detail="Provide at least one simple_events entry or topic_patterns pattern.",
        )

    if await repo.get_webhook_subscription_by_name(req.name):
        raise HTTPException(status_code=409, detail="Webhook name already exists")

    # Fall back to a generated URL-safe token (32 random bytes) whenever
    # the request carries no usable secret.
    effective_secret = req.secret if req.secret else secrets.token_urlsafe(32)

    # One timestamp for both columns so a new row starts with
    # created_at == updated_at.
    stamp = datetime.now(timezone.utc)
    await repo.create_webhook_subscription(
        {
            "name": req.name,
            "url": str(req.url),
            "secret": effective_secret,
            "topic_patterns": json.dumps(merged_patterns),
            "enabled": req.enabled,
            "consecutive_failures": 0,
            "created_at": stamp,
            "updated_at": stamp,
        }
    )

    created_row = await repo.get_webhook_subscription_by_name(req.name)
    if created_row is None:
        # The create call just returned, so a missing row points at a
        # storage bug; surface it as a 500 instead of hiding it.
        raise HTTPException(status_code=500, detail="Webhook created but not retrievable")

    await _notify_subscriptions_changed()

    response_fields = _row_to_response_dict(created_row)
    return WebhookCreateResponse(**response_fields, secret=effective_secret)
|
||||
|
||||
|
||||
@router.get(
    "/",
    tags=["Webhooks"],
    response_model=list[WebhookResponse],
)
@_traced("api.webhook.list")
async def api_list_webhooks(
    admin: dict = Depends(require_admin),
) -> list[WebhookResponse]:
    """List all webhook subscriptions.

    Args:
        admin: Result of the ``require_admin`` dependency; unused in the
            body, present only to gate the endpoint.

    Returns:
        One ``WebhookResponse`` per row returned by the repository, in the
        order the repository yields them.
    """
    subscriptions = await repo.list_webhook_subscriptions()
    return list(map(_row_to_response, subscriptions))
|
||||
|
||||
|
||||
@router.get(
    "/{uuid}",
    tags=["Webhooks"],
    response_model=WebhookResponse,
    responses={404: {"description": "Webhook not found"}},
)
@_traced("api.webhook.get")
async def api_get_webhook(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> WebhookResponse:
    """Fetch a single webhook subscription by its uuid.

    Args:
        uuid: Identifier taken from the request path.
        admin: Result of the ``require_admin`` dependency; unused in the
            body, present only to gate the endpoint.

    Returns:
        The matching subscription as a ``WebhookResponse``.

    Raises:
        HTTPException: 404 when the repository has no such subscription.
    """
    found = await repo.get_webhook_subscription(uuid)
    if found:
        return _row_to_response(found)
    raise HTTPException(status_code=404, detail="Webhook not found")
|
||||
|
||||
|
||||
@router.patch(
    "/{uuid}",
    tags=["Webhooks"],
    response_model=WebhookResponse,
    responses={
        400: {"description": "Empty or invalid patch"},
        404: {"description": "Webhook not found"},
        409: {"description": "Name already in use"},
    },
)
@_traced("api.webhook.update")
async def api_update_webhook(
    uuid: str,
    req: WebhookUpdateRequest,
    admin: dict = Depends(require_admin),
) -> WebhookResponse:
    """Apply a partial update to a webhook subscription.

    Only fields that are not ``None`` on the request are considered. When
    nothing ends up in the change set, the current row is returned as-is
    and neither the repository update nor the bus notification runs.

    Args:
        uuid: Identifier taken from the request path.
        req: Patch payload; every field is optional.
        admin: Result of the ``require_admin`` dependency; unused in the
            body, present only to gate the endpoint.

    Returns:
        The subscription as re-read from the repository after the update
        (or the untouched current row for a no-op patch).

    Raises:
        HTTPException: 404 when the subscription does not exist (before
            the update, at update time, or on re-read), 409 when the new
            name belongs to a different subscription, 400 when the
            supplied selectors merge to an empty pattern set.
    """
    existing = await repo.get_webhook_subscription(uuid)
    if not existing:
        raise HTTPException(status_code=404, detail="Webhook not found")

    changes: dict[str, Any] = {}

    # Rename only when the name actually differs, and only if no other
    # subscription already owns the requested name.
    requested_name = req.name
    if requested_name is not None and requested_name != existing["name"]:
        holder = await repo.get_webhook_subscription_by_name(requested_name)
        if holder and holder["uuid"] != uuid:
            raise HTTPException(status_code=409, detail="Webhook name already exists")
        changes["name"] = requested_name

    if req.url is not None:
        changes["url"] = str(req.url)

    if req.secret is not None:
        changes["secret"] = req.secret

    if req.enabled is not None:
        changes["enabled"] = req.enabled
        # Switching a disabled subscription back on also clears the
        # circuit-trip stamp, the failure counter and the last error:
        # the operator has acknowledged the trip and wants delivery to
        # resume. For a subscription that was merely paused by an admin
        # these resets are harmless no-ops.
        if req.enabled is True and not existing.get("enabled"):
            changes.update(
                auto_disabled_at=None,
                consecutive_failures=0,
                last_error=None,
            )

    if not (req.simple_events is None and req.topic_patterns is None):
        # The pattern set is rebuilt from the request alone: a selector
        # list that was omitted counts as empty, so the merged result
        # replaces whatever was stored. An empty merged set is rejected —
        # disabling the webhook is the supported way to silence it.
        simple_part = [] if req.simple_events is None else req.simple_events
        raw_part = [] if req.topic_patterns is None else req.topic_patterns
        merged_patterns = merge_patterns(simple_part, raw_part)
        if not merged_patterns:
            raise HTTPException(
                status_code=400,
                detail="Cannot clear all patterns; disable the webhook instead.",
            )
        changes["topic_patterns"] = json.dumps(merged_patterns)

    if not changes:
        # Nothing to write — hand back the row exactly as it was read.
        return _row_to_response(existing)

    if not await repo.update_webhook_subscription(uuid, changes):
        raise HTTPException(status_code=404, detail="Webhook not found")

    await _notify_subscriptions_changed()

    refreshed = await repo.get_webhook_subscription(uuid)
    if refreshed is None:
        raise HTTPException(status_code=404, detail="Webhook not found")
    return _row_to_response(refreshed)
|
||||
|
||||
|
||||
@router.delete(
    "/{uuid}",
    tags=["Webhooks"],
    response_model=MessageResponse,
    responses={404: {"description": "Webhook not found"}},
)
@_traced("api.webhook.delete")
async def api_delete_webhook(
    uuid: str,
    admin: dict = Depends(require_admin),
) -> dict[str, str]:
    """Delete a webhook subscription.

    Args:
        uuid: Identifier taken from the request path.
        admin: Result of the ``require_admin`` dependency; unused in the
            body, present only to gate the endpoint.

    Returns:
        A ``{"message": ...}`` mapping confirming the deletion.

    Raises:
        HTTPException: 404 when the repository reports nothing was deleted.
    """
    removed = await repo.delete_webhook_subscription(uuid)
    if removed:
        # Tell the bus only after the repository confirmed the deletion.
        await _notify_subscriptions_changed()
        return {"message": "Webhook deleted"}
    raise HTTPException(status_code=404, detail="Webhook not found")
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user