feat: complete OTEL tracing across all services with pipeline bridge and docs

Extends tracing to every remaining module: all 23 API route handlers,
correlation engine, sniffer (fingerprint/p0f/syslog), prober (jarm/hassh/tcpfp),
profiler behavioral analysis, logging subsystem, engine, and mutator.

Bridges the ingester→SSE trace gap by persisting trace_id/span_id columns on
the logs table and creating OTEL span links in the SSE endpoint. Adds log-trace
correlation via _TraceContextFilter injecting otel_trace_id into Python LogRecords.

Includes development/docs/TRACING.md with full span reference (76 spans),
pipeline propagation architecture, quick start guide, and troubleshooting.
This commit is contained in:
2026-04-16 00:58:08 -04:00
parent 04db13afae
commit 70d8ffc607
38 changed files with 577 additions and 124 deletions

View File

@@ -43,6 +43,11 @@ class Log(SQLModel, table=True):
raw_line: str = Field(sa_column=Column("raw_line", Text, nullable=False))
fields: str = Field(sa_column=Column("fields", Text, nullable=False))
msg: Optional[str] = Field(default=None, sa_column=Column("msg", Text, nullable=True))
# OTEL trace context — bridges the collector→ingester trace to the SSE
# read path. Nullable so pre-existing rows and non-traced deployments
# are unaffected.
trace_id: Optional[str] = Field(default=None)
span_id: Optional[str] = Field(default=None)
class Bounty(SQLModel, table=True):
__tablename__ = "bounty"

View File

@@ -75,6 +75,14 @@ async def log_ingestion_worker(repo: BaseRepository) -> None:
_span.set_attribute("service", _log_data.get("service", ""))
_span.set_attribute("event_type", _log_data.get("event_type", ""))
_span.set_attribute("attacker_ip", _log_data.get("attacker_ip", ""))
# Persist trace context in the DB row so the SSE
# read path can link back to this ingestion trace.
_sctx = getattr(_span, "get_span_context", None)
if _sctx:
_ctx = _sctx()
if _ctx and getattr(_ctx, "trace_id", 0):
_log_data["trace_id"] = format(_ctx.trace_id, "032x")
_log_data["span_id"] = format(_ctx.span_id, "016x")
logger.debug("ingest: record decky=%s event_type=%s", _log_data.get("decky"), _log_data.get("event_type"))
await repo.add_log(_log_data)
await _extract_bounty(repo, _log_data)

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -15,6 +16,7 @@ router = APIRouter()
404: {"description": "Attacker not found"},
},
)
@_traced("api.get_attacker_commands")
async def get_attacker_commands(
uuid: str,
limit: int = Query(50, ge=1, le=200),

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -15,6 +16,7 @@ router = APIRouter()
404: {"description": "Attacker not found"},
},
)
@_traced("api.get_attacker_detail")
async def get_attacker_detail(
uuid: str,
user: dict = Depends(require_viewer),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import AttackersResponse
@@ -17,6 +18,7 @@ router = APIRouter()
422: {"description": "Validation error"},
},
)
@_traced("api.get_attackers")
async def get_attackers(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import get_password_hash, verify_password
from decnet.web.dependencies import get_current_user_unchecked, repo
from decnet.web.db.models import ChangePasswordRequest
@@ -18,6 +19,7 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
@_traced("api.change_password")
async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user)
if not _user or not verify_password(request.old_password, _user["password_hash"]):

View File

@@ -3,6 +3,7 @@ from typing import Any, Optional
from fastapi import APIRouter, HTTPException, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import (
ACCESS_TOKEN_EXPIRE_MINUTES,
create_access_token,
@@ -24,6 +25,7 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
@_traced("api.login")
async def login(request: LoginRequest) -> dict[str, Any]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username)
if not _user or not verify_password(request.password, _user["password_hash"]):

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import BountyResponse
@@ -10,6 +11,7 @@ router = APIRouter()
@router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_bounties")
async def get_bounties(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -1,6 +1,7 @@
from fastapi import APIRouter, Depends
from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import UserResponse
@@ -17,6 +18,7 @@ _DEFAULT_MUTATION_INTERVAL = "30m"
401: {"description": "Could not validate credentials"},
},
)
@_traced("api.get_config")
async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
limits_state = await repo.get_state("config_limits")
globals_state = await repo.get_state("config_globals")

View File

@@ -2,6 +2,7 @@ import uuid as _uuid
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.auth import get_password_hash
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import (
@@ -24,6 +25,7 @@ router = APIRouter()
422: {"description": "Validation error"},
},
)
@_traced("api.create_user")
async def api_create_user(
req: CreateUserRequest,
admin: dict = Depends(require_admin),
@@ -57,6 +59,7 @@ async def api_create_user(
404: {"description": "User not found"},
},
)
@_traced("api.delete_user")
async def api_delete_user(
user_uuid: str,
admin: dict = Depends(require_admin),
@@ -80,6 +83,7 @@ async def api_delete_user(
422: {"description": "Validation error"},
},
)
@_traced("api.update_user_role")
async def api_update_user_role(
user_uuid: str,
req: UpdateUserRoleRequest,
@@ -106,6 +110,7 @@ async def api_update_user_role(
422: {"description": "Validation error"},
},
)
@_traced("api.reset_user_password")
async def api_reset_user_password(
user_uuid: str,
req: ResetUserPasswordRequest,

View File

@@ -1,6 +1,7 @@
from fastapi import APIRouter, Depends, HTTPException
from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
router = APIRouter()
@@ -14,6 +15,7 @@ router = APIRouter()
403: {"description": "Admin access required or developer mode not enabled"},
},
)
@_traced("api.reinit")
async def api_reinit(admin: dict = Depends(require_admin)) -> dict:
if not DECNET_DEVELOPER:
raise HTTPException(status_code=403, detail="Developer mode is not enabled")

View File

@@ -1,5 +1,6 @@
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest
@@ -15,6 +16,7 @@ router = APIRouter()
422: {"description": "Validation error"},
},
)
@_traced("api.update_deployment_limit")
async def api_update_deployment_limit(
req: DeploymentLimitRequest,
admin: dict = Depends(require_admin),
@@ -32,6 +34,7 @@ async def api_update_deployment_limit(
422: {"description": "Validation error"},
},
)
@_traced("api.update_global_mutation_interval")
async def api_update_global_mutation_interval(
req: GlobalMutationIntervalRequest,
admin: dict = Depends(require_admin),

View File

@@ -3,6 +3,7 @@ import os
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
from decnet.engine import deploy as _deploy
from decnet.ini_loader import load_ini_from_string
@@ -27,6 +28,7 @@ router = APIRouter()
500: {"description": "Deployment failed"}
}
)
@_traced("api.deploy_deckies")
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
from decnet.fleet import build_deckies_from_ini

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -9,5 +10,6 @@ router = APIRouter()
@router.get("/deckies", tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_deckies")
async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]:
return await repo.get_deckies()

View File

@@ -1,6 +1,7 @@
import os
from fastapi import APIRouter, Depends, HTTPException, Path
from decnet.telemetry import traced as _traced
from decnet.mutator import mutate_decky
from decnet.web.dependencies import require_admin, repo
@@ -12,6 +13,7 @@ router = APIRouter()
tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Decky not found"}}
)
@_traced("api.mutate_decky")
async def api_mutate_decky(
decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"),
admin: dict = Depends(require_admin),

View File

@@ -1,5 +1,6 @@
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.config import DecnetConfig
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import MutateIntervalRequest
@@ -24,6 +25,7 @@ def _parse_duration(s: str) -> int:
422: {"description": "Validation error"}
},
)
@_traced("api.update_mutate_interval")
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
state_dict = await repo.get_state("deployment")
if not state_dict:

View File

@@ -3,6 +3,7 @@ from typing import Any
from fastapi import APIRouter, Depends
from fastapi.responses import JSONResponse
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import HealthResponse, ComponentHealth
@@ -20,6 +21,7 @@ _OPTIONAL_SERVICES = {"sniffer_worker"}
503: {"model": HealthResponse, "description": "System unhealthy"},
},
)
@_traced("api.get_health")
async def get_health(user: dict = Depends(require_viewer)) -> Any:
components: dict[str, ComponentHealth] = {}

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -9,6 +10,7 @@ router = APIRouter()
@router.get("/logs/histogram", tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_logs_histogram")
async def get_logs_histogram(
search: Optional[str] = None,
start_time: Optional[str] = Query(None),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import LogsResponse
@@ -10,6 +11,7 @@ router = APIRouter()
@router.get("/logs", response_model=LogsResponse, tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}})
@_traced("api.get_logs")
async def get_logs(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import StatsResponse
@@ -10,5 +11,6 @@ router = APIRouter()
@router.get("/stats", response_model=StatsResponse, tags=["Observability"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_stats")
async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]:
return await repo.get_stats_summary()

View File

@@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse
from decnet.env import DECNET_DEVELOPER
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
from decnet.web.dependencies import require_stream_viewer, repo
log = get_logger("api")
@@ -14,6 +15,34 @@ log = get_logger("api")
router = APIRouter()
def _build_trace_links(logs: list[dict]) -> list:
"""Build OTEL span links from persisted trace_id/span_id in log rows.
Returns an empty list when tracing is disabled (no OTEL imports).
"""
try:
from opentelemetry.trace import Link, SpanContext, TraceFlags
except ImportError:
return []
links: list[Link] = []
for entry in logs:
tid = entry.get("trace_id")
sid = entry.get("span_id")
if not tid or not sid or tid == "0":
continue
try:
ctx = SpanContext(
trace_id=int(tid, 16),
span_id=int(sid, 16),
is_remote=True,
trace_flags=TraceFlags(TraceFlags.SAMPLED),
)
links.append(Link(ctx))
except (ValueError, TypeError):
continue
return links
@router.get("/stream", tags=["Observability"],
responses={
200: {
@@ -24,6 +53,7 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
@_traced("api.stream_events")
async def stream_events(
request: Request,
last_event_id: int = Query(0, alias="lastEventId"),
@@ -75,7 +105,15 @@ async def stream_events(
)
if new_logs:
last_id = max(entry["id"] for entry in new_logs)
yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
# Create a span linking back to the ingestion traces
# stored in each log row, closing the pipeline gap.
_links = _build_trace_links(new_logs)
_tracer = _get_tracer("sse")
with _tracer.start_as_current_span(
"sse.emit_logs", links=_links,
attributes={"log_count": len(new_logs)},
):
yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
loops_since_stats = stats_interval_sec
if loops_since_stats >= stats_interval_sec: