feat: complete OTEL tracing across all services with pipeline bridge and docs

Extends tracing to every remaining module: all 23 API route handlers,
correlation engine, sniffer (fingerprint/p0f/syslog), prober (jarm/hassh/tcpfp),
profiler behavioral analysis, logging subsystem, engine, and mutator.

Bridges the ingester→SSE trace gap by persisting trace_id/span_id columns on
the logs table and creating OTEL span links in the SSE endpoint. Adds log-trace
correlation via _TraceContextFilter injecting otel_trace_id into Python LogRecords.

Includes development/docs/TRACING.md with full span reference (76 spans),
pipeline propagation architecture, quick start guide, and troubleshooting.
This commit is contained in:
2026-04-16 00:58:08 -04:00
parent 04db13afae
commit 70d8ffc607
38 changed files with 577 additions and 124 deletions

View File

@@ -33,6 +33,7 @@ from decnet.logging.syslog_formatter import (
SEVERITY_WARNING,
format_rfc5424,
)
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
class CorrelationEngine:
@@ -64,6 +65,7 @@ class CorrelationEngine:
self.events_indexed += 1
return event
@_traced("correlation.ingest_file")
def ingest_file(self, path: Path) -> int:
"""
Parse every line of *path* and index it.
@@ -73,12 +75,18 @@ class CorrelationEngine:
with open(path) as fh:
for line in fh:
self.ingest(line)
_tracer = _get_tracer("correlation")
with _tracer.start_as_current_span("correlation.ingest_file.summary") as _span:
_span.set_attribute("lines_parsed", self.lines_parsed)
_span.set_attribute("events_indexed", self.events_indexed)
_span.set_attribute("unique_ips", len(self._events))
return self.events_indexed
# ------------------------------------------------------------------ #
# Query #
# ------------------------------------------------------------------ #
@_traced("correlation.traversals")
def traversals(self, min_deckies: int = 2) -> list[AttackerTraversal]:
"""
Return all attackers that touched at least *min_deckies* distinct
@@ -135,6 +143,7 @@ class CorrelationEngine:
)
return table
@_traced("correlation.report_json")
def report_json(self, min_deckies: int = 2) -> dict:
"""Serialisable dict representation of all traversals."""
return {
@@ -147,6 +156,7 @@ class CorrelationEngine:
"traversals": [t.to_dict() for t in self.traversals(min_deckies)],
}
@_traced("correlation.traversal_syslog_lines")
def traversal_syslog_lines(self, min_deckies: int = 2) -> list[str]:
"""
Emit one RFC 5424 syslog line per detected traversal.

View File

@@ -68,6 +68,7 @@ _PERMANENT_ERRORS = (
)
@_traced("engine.compose_with_retry")
def _compose_with_retry(
*args: str,
compose_file: Path = COMPOSE_FILE,

View File

@@ -7,6 +7,11 @@ Usage:
The returned logger propagates to the root logger (configured in config.py with
Rfc5424Formatter), so level control via DECNET_DEVELOPER still applies globally.
When ``DECNET_DEVELOPER_TRACING`` is active, every LogRecord is enriched with
``otel_trace_id`` and ``otel_span_id`` from the current OTEL span context.
This lets you correlate log lines with Jaeger traces — click a log entry and
jump straight to the span that produced it.
"""
from __future__ import annotations
@@ -27,6 +32,51 @@ class _ComponentFilter(logging.Filter):
return True
class _TraceContextFilter(logging.Filter):
"""Injects ``otel_trace_id`` and ``otel_span_id`` onto every LogRecord
from the active OTEL span context.
Installed once by ``enable_trace_context()`` on the root ``decnet`` logger
so all child loggers inherit the enrichment via propagation.
When no span is active, both fields are set to ``"0"`` (cheap string
comparison downstream, no None-checks needed).
"""
def filter(self, record: logging.LogRecord) -> bool:
try:
from opentelemetry import trace
span = trace.get_current_span()
ctx = span.get_span_context()
if ctx and ctx.trace_id:
record.otel_trace_id = format(ctx.trace_id, "032x") # type: ignore[attr-defined]
record.otel_span_id = format(ctx.span_id, "016x") # type: ignore[attr-defined]
else:
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
except Exception:
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
return True
# Module-level guard so the filter is attached at most once per process.
_trace_filter_installed: bool = False


def enable_trace_context() -> None:
    """Attach the OTEL trace-context filter to the root ``decnet`` logger.

    Invoked once from ``decnet.telemetry.setup_tracing()`` after the
    TracerProvider is ready.  Guarded by a module flag, so repeated calls
    are harmless (idempotent).
    """
    global _trace_filter_installed
    if not _trace_filter_installed:
        logging.getLogger("decnet").addFilter(_TraceContextFilter())
        _trace_filter_installed = True
def get_logger(component: str) -> logging.Logger:
"""Return a named logger that self-identifies as *component* in RFC 5424.

View File

@@ -13,6 +13,8 @@ import logging.handlers
import os
from pathlib import Path
from decnet.telemetry import traced as _traced
_LOG_FILE_ENV = "DECNET_LOG_FILE"
_DEFAULT_LOG_FILE = "/var/log/decnet/decnet.log"
_MAX_BYTES = 10 * 1024 * 1024 # 10 MB
@@ -22,10 +24,10 @@ _handler: logging.handlers.RotatingFileHandler | None = None
_logger: logging.Logger | None = None
def _get_logger() -> logging.Logger:
@_traced("logging.init_file_handler")
def _init_file_handler() -> logging.Logger:
"""One-time initialisation of the rotating file handler."""
global _handler, _logger
if _logger is not None:
return _logger
log_path = Path(os.environ.get(_LOG_FILE_ENV, _DEFAULT_LOG_FILE))
log_path.parent.mkdir(parents=True, exist_ok=True)
@@ -46,6 +48,12 @@ def _get_logger() -> logging.Logger:
return _logger
def _get_logger() -> logging.Logger:
    """Return the module-level rotating-file logger, initialising it lazily."""
    if _logger is None:
        return _init_file_handler()
    return _logger
def write_syslog(line: str) -> None:
"""Write a single RFC 5424 syslog line to the rotating log file."""
try:

View File

@@ -11,6 +11,8 @@ shared utilities for validating and parsing the log_target string.
import socket
from decnet.telemetry import traced as _traced
def parse_log_target(log_target: str) -> tuple[str, int]:
"""
@@ -23,6 +25,7 @@ def parse_log_target(log_target: str) -> tuple[str, int]:
return parts[0], int(parts[1])
@_traced("logging.probe_log_target")
def probe_log_target(log_target: str, timeout: float = 2.0) -> bool:
"""
Return True if the log target is reachable (TCP connect succeeds).

View File

@@ -133,6 +133,7 @@ async def mutate_all(repo: BaseRepository, force: bool = False) -> None:
log.info("mutate_all: complete mutated_count=%d", mutated_count)
@_traced("mutator.watch_loop")
async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) -> None:
"""Run an infinite loop checking for deckies that need mutation."""
log.info("mutator watch loop started poll_interval_secs=%d", poll_interval_secs)

View File

@@ -9,7 +9,7 @@ This is the *server* variant of HASSH (HASSHServer). It fingerprints what
the server *offers*, which identifies the SSH implementation (OpenSSH,
Paramiko, libssh, Cobalt Strike SSH, etc.).
Stdlib only (socket, struct, hashlib). No DECNET imports.
Stdlib only (socket, struct, hashlib) plus decnet.telemetry for tracing (zero-cost when disabled).
"""
from __future__ import annotations
@@ -19,6 +19,8 @@ import socket
import struct
from typing import Any
from decnet.telemetry import traced as _traced
# SSH protocol constants
_SSH_MSG_KEXINIT = 20
_KEX_INIT_COOKIE_LEN = 16
@@ -36,6 +38,7 @@ _MAX_PACKET_LEN = 35000
# ─── SSH connection + KEX_INIT capture ──────────────────────────────────────
@_traced("prober.hassh_ssh_connect")
def _ssh_connect(
host: str,
port: int,
@@ -213,6 +216,7 @@ def _compute_hassh(kex: str, enc: str, mac: str, comp: str) -> str:
# ─── Public API ─────────────────────────────────────────────────────────────
@_traced("prober.hassh_server")
def hassh_server(
host: str,
port: int,

View File

@@ -8,7 +8,7 @@ fingerprint that identifies the TLS server implementation.
Reference: https://github.com/salesforce/jarm
No DECNET imports — this module is self-contained and testable in isolation.
Only DECNET import is decnet.telemetry for tracing (zero-cost when disabled).
"""
from __future__ import annotations
@@ -19,6 +19,8 @@ import struct
import time
from typing import Any
from decnet.telemetry import traced as _traced
# ─── Constants ────────────────────────────────────────────────────────────────
JARM_EMPTY_HASH = "0" * 62
@@ -379,6 +381,7 @@ def _version_to_str(version: int) -> str:
# ─── Probe sender ────────────────────────────────────────────────────────────
@_traced("prober.jarm_send_probe")
def _send_probe(host: str, port: int, hello: bytes, timeout: float = 5.0) -> bytes | None:
"""
Open a TCP connection, send the ClientHello, and read the ServerHello.
@@ -471,6 +474,7 @@ def _compute_jarm(responses: list[str]) -> str:
# ─── Public API ──────────────────────────────────────────────────────────────
@_traced("prober.jarm_hash")
def jarm_hash(host: str, port: int, timeout: float = 5.0) -> str:
"""
Compute the JARM fingerprint for a TLS server.

View File

@@ -15,6 +15,8 @@ import hashlib
import random
from typing import Any
from decnet.telemetry import traced as _traced
# Lazy-import scapy to avoid breaking non-root usage of HASSH/JARM.
# The actual import happens inside functions that need it.
@@ -36,6 +38,7 @@ _OPT_CODES: dict[str, str] = {
# ─── Packet construction ───────────────────────────────────────────────────
@_traced("prober.tcpfp_send_syn")
def _send_syn(
host: str,
port: int,
@@ -196,6 +199,7 @@ def _compute_fingerprint(fields: dict[str, Any]) -> tuple[str, str]:
# ─── Public API ─────────────────────────────────────────────────────────────
@_traced("prober.tcp_fingerprint")
def tcp_fingerprint(
host: str,
port: int,

View File

@@ -169,6 +169,7 @@ def _write_event(
# ─── Target discovery from log stream ────────────────────────────────────────
@_traced("prober.discover_attackers")
def _discover_attackers(json_path: Path, position: int) -> tuple[set[str], int]:
"""
Read new JSON log lines from the given position and extract unique
@@ -399,6 +400,7 @@ def _tcpfp_phase(
# ─── Main worker ─────────────────────────────────────────────────────────────
@_traced("prober.worker")
async def prober_worker(
log_file: str,
interval: int = 300,

View File

@@ -31,6 +31,7 @@ from collections import Counter
from typing import Any
from decnet.correlation.parser import LogEvent
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
# ─── Event-type taxonomy ────────────────────────────────────────────────────
@@ -147,6 +148,7 @@ def _os_from_ttl(ttl_str: str | None) -> str | None:
# ─── Timing stats ───────────────────────────────────────────────────────────
@_traced("profiler.timing_stats")
def timing_stats(events: list[LogEvent]) -> dict[str, Any]:
"""
Compute inter-arrival-time statistics across *events* (sorted by ts).
@@ -221,6 +223,7 @@ def timing_stats(events: list[LogEvent]) -> dict[str, Any]:
# ─── Behavior classification ────────────────────────────────────────────────
@_traced("profiler.classify_behavior")
def classify_behavior(stats: dict[str, Any], services_count: int) -> str:
"""
Coarse behavior bucket:
@@ -305,6 +308,7 @@ def guess_tool(mean_iat_s: float | None, cv: float | None) -> str | None:
# ─── Header-based tool detection ────────────────────────────────────────────
@_traced("profiler.detect_tools_from_headers")
def detect_tools_from_headers(events: list[LogEvent]) -> list[str]:
"""
Scan HTTP `request` events for tool-identifying headers.
@@ -372,6 +376,7 @@ def detect_tools_from_headers(events: list[LogEvent]) -> list[str]:
# ─── Phase sequencing ───────────────────────────────────────────────────────
@_traced("profiler.phase_sequence")
def phase_sequence(events: list[LogEvent]) -> dict[str, Any]:
"""
Derive recon→exfil phase transition info.
@@ -418,6 +423,7 @@ def phase_sequence(events: list[LogEvent]) -> dict[str, Any]:
# ─── Sniffer rollup (OS fingerprint + retransmits) ──────────────────────────
@_traced("profiler.sniffer_rollup")
def sniffer_rollup(events: list[LogEvent]) -> dict[str, Any]:
"""
Roll up sniffer-emitted `tcp_syn_fingerprint` and `tcp_flow_timing`
@@ -535,6 +541,7 @@ def _int_or_none(v: Any) -> int | None:
# ─── Composite: build the full AttackerBehavior record ──────────────────────
@_traced("profiler.build_behavior_record")
def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]:
"""
Build the dict to persist in the `attacker_behavior` table.
@@ -572,6 +579,15 @@ def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]:
cv = stats.get("cv")
beacon_jitter_pct = round(cv * 100, 2) if cv is not None else None
_tracer = _get_tracer("profiler")
with _tracer.start_as_current_span("profiler.behavior_summary") as _span:
_span.set_attribute("behavior_class", behavior)
_span.set_attribute("os_guess", rollup["os_guess"] or "unknown")
_span.set_attribute("tool_count", len(all_tools))
_span.set_attribute("event_count", stats.get("event_count", 0))
if all_tools:
_span.set_attribute("tools", ",".join(all_tools))
return {
"os_guess": rollup["os_guess"],
"hop_distance": rollup["hop_distance"],

View File

@@ -17,6 +17,7 @@ from typing import Any, Callable
from decnet.prober.tcpfp import _extract_options_order
from decnet.sniffer.p0f import guess_os, hop_distance, initial_ttl
from decnet.sniffer.syslog import SEVERITY_INFO, SEVERITY_WARNING, syslog_line
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
# ─── Constants ───────────────────────────────────────────────────────────────
@@ -94,6 +95,7 @@ def _filter_grease(values: list[int]) -> list[int]:
# ─── TLS parsers ─────────────────────────────────────────────────────────────
@_traced("sniffer.parse_client_hello")
def _parse_client_hello(data: bytes) -> dict[str, Any] | None:
try:
if len(data) < 6:
@@ -228,6 +230,7 @@ def _parse_client_hello(data: bytes) -> dict[str, Any] | None:
return None
@_traced("sniffer.parse_server_hello")
def _parse_server_hello(data: bytes) -> dict[str, Any] | None:
try:
if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE:
@@ -294,6 +297,7 @@ def _parse_server_hello(data: bytes) -> dict[str, Any] | None:
return None
@_traced("sniffer.parse_certificate")
def _parse_certificate(data: bytes) -> dict[str, Any] | None:
try:
if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE:
@@ -547,6 +551,7 @@ def _tls_version_str(version: int) -> str:
}.get(version, f"0x{version:04x}")
@_traced("sniffer.ja3")
def _ja3(ch: dict[str, Any]) -> tuple[str, str]:
parts = [
str(ch["tls_version"]),
@@ -559,6 +564,7 @@ def _ja3(ch: dict[str, Any]) -> tuple[str, str]:
return ja3_str, hashlib.md5(ja3_str.encode()).hexdigest() # nosec B324
@_traced("sniffer.ja3s")
def _ja3s(sh: dict[str, Any]) -> tuple[str, str]:
parts = [
str(sh["tls_version"]),
@@ -605,6 +611,7 @@ def _sha256_12(text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()[:12]
@_traced("sniffer.ja4")
def _ja4(ch: dict[str, Any]) -> str:
proto = "t"
ver = _ja4_version(ch)
@@ -624,6 +631,7 @@ def _ja4(ch: dict[str, Any]) -> str:
return f"{section_a}_{section_b}_{section_c}"
@_traced("sniffer.ja4s")
def _ja4s(sh: dict[str, Any]) -> str:
proto = "t"
selected = sh.get("selected_version")
@@ -653,6 +661,7 @@ def _ja4l(
# ─── Session resumption ─────────────────────────────────────────────────────
@_traced("sniffer.session_resumption_info")
def _session_resumption_info(ch: dict[str, Any]) -> dict[str, Any]:
mechanisms: list[str] = []
if ch.get("has_session_ticket_data"):
@@ -965,6 +974,10 @@ class SnifferEngine:
# when the destination is a known decky, i.e. we're seeing an
# attacker's initial packet.
if dst_ip in self._ip_to_decky:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tcp_syn_fingerprint") as _span:
_span.set_attribute("attacker_ip", src_ip)
_span.set_attribute("dst_port", dst_port)
tcp_fp = _extract_tcp_fingerprint(list(tcp.options or []))
os_label = guess_os(
ttl=ip.ttl,
@@ -973,6 +986,7 @@ class SnifferEngine:
wscale=tcp_fp["wscale"],
options_sig=tcp_fp["options_sig"],
)
_span.set_attribute("os_guess", os_label)
target_node = self._ip_to_decky[dst_ip]
self._log(
target_node,
@@ -1019,6 +1033,10 @@ class SnifferEngine:
# ClientHello
ch = _parse_client_hello(payload)
if ch is not None:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tls_client_hello") as _span:
_span.set_attribute("attacker_ip", src_ip)
_span.set_attribute("dst_port", dst_port)
self._cleanup_sessions()
key = (src_ip, src_port, dst_ip, dst_port)
@@ -1027,6 +1045,10 @@ class SnifferEngine:
resumption = _session_resumption_info(ch)
rtt_data = _ja4l(key, self._tcp_rtt)
_span.set_attribute("ja3", ja3_hash)
_span.set_attribute("ja4", ja4_hash)
_span.set_attribute("sni", ch["sni"] or "")
self._sessions[key] = {
"ja3": ja3_hash,
"ja3_str": ja3_str,
@@ -1071,6 +1093,9 @@ class SnifferEngine:
# ServerHello
sh = _parse_server_hello(payload)
if sh is not None:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tls_server_hello") as _span:
_span.set_attribute("attacker_ip", dst_ip)
rev_key = (dst_ip, dst_port, src_ip, src_port)
ch_data = self._sessions.pop(rev_key, None)
self._session_ts.pop(rev_key, None)
@@ -1078,6 +1103,9 @@ class SnifferEngine:
ja3s_str, ja3s_hash = _ja3s(sh)
ja4s_hash = _ja4s(sh)
_span.set_attribute("ja3s", ja3s_hash)
_span.set_attribute("ja4s", ja4s_hash)
fields: dict[str, Any] = {
"src_ip": dst_ip,
"src_port": str(dst_port),
@@ -1111,6 +1139,10 @@ class SnifferEngine:
# Certificate (TLS 1.2 only)
cert = _parse_certificate(payload)
if cert is not None:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tls_certificate") as _span:
_span.set_attribute("subject_cn", cert["subject_cn"])
_span.set_attribute("self_signed", cert["self_signed"])
rev_key = (dst_ip, dst_port, src_ip, src_port)
ch_data = self._sessions.get(rev_key)

View File

@@ -22,6 +22,8 @@ No external dependencies.
from __future__ import annotations
from decnet.telemetry import traced as _traced
# ─── TTL → initial TTL bucket ───────────────────────────────────────────────
# Common "hop 0" TTLs. Packets decrement TTL once per hop, so we round up
@@ -216,6 +218,7 @@ def _match_signature(
return True
@_traced("sniffer.p0f_guess_os")
def guess_os(
ttl: int,
window: int,

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any
from decnet.collector.worker import parse_rfc5424
from decnet.telemetry import traced as _traced
# ─── Constants (must match templates/sniffer/decnet_logging.py) ──────────────
@@ -57,6 +58,7 @@ def syslog_line(
return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
@_traced("sniffer.write_event")
def write_event(line: str, log_path: Path, json_path: Path) -> None:
"""Append a syslog line to the raw log and its parsed JSON to the json log."""
with open(log_path, "a", encoding="utf-8") as lf:

View File

@@ -110,6 +110,7 @@ def _sniff_loop(
logger.info("sniffer: sniff loop ended")
@_traced("sniffer.worker")
async def sniffer_worker(log_file: str) -> None:
"""
Async entry point — started as asyncio.create_task in the API lifespan.

View File

@@ -64,7 +64,9 @@ def setup_tracing(app: Any) -> None:
_init_provider()
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
FastAPIInstrumentor.instrument_app(app)
log.info("FastAPI auto-instrumentation active")
from decnet.logging import enable_trace_context
enable_trace_context()
log.info("FastAPI auto-instrumentation active, log-trace correlation enabled")
except Exception as exc:
log.warning("OTEL setup failed — continuing without tracing: %s", exc)

View File

@@ -43,6 +43,11 @@ class Log(SQLModel, table=True):
raw_line: str = Field(sa_column=Column("raw_line", Text, nullable=False))
fields: str = Field(sa_column=Column("fields", Text, nullable=False))
msg: Optional[str] = Field(default=None, sa_column=Column("msg", Text, nullable=True))
# OTEL trace context — bridges the collector→ingester trace to the SSE
# read path. Nullable so pre-existing rows and non-traced deployments
# are unaffected.
trace_id: Optional[str] = Field(default=None)
span_id: Optional[str] = Field(default=None)
class Bounty(SQLModel, table=True):
__tablename__ = "bounty"

View File

@@ -75,6 +75,14 @@ async def log_ingestion_worker(repo: BaseRepository) -> None:
_span.set_attribute("service", _log_data.get("service", ""))
_span.set_attribute("event_type", _log_data.get("event_type", ""))
_span.set_attribute("attacker_ip", _log_data.get("attacker_ip", ""))
# Persist trace context in the DB row so the SSE
# read path can link back to this ingestion trace.
_sctx = getattr(_span, "get_span_context", None)
if _sctx:
_ctx = _sctx()
if _ctx and getattr(_ctx, "trace_id", 0):
_log_data["trace_id"] = format(_ctx.trace_id, "032x")
_log_data["span_id"] = format(_ctx.span_id, "016x")
logger.debug("ingest: record decky=%s event_type=%s", _log_data.get("decky"), _log_data.get("event_type"))
await repo.add_log(_log_data)
await _extract_bounty(repo, _log_data)

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -15,6 +16,7 @@ router = APIRouter()
404: {"description": "Attacker not found"},
},
)
@_traced("api.get_attacker_commands")
async def get_attacker_commands(
uuid: str,
limit: int = Query(50, ge=1, le=200),

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -15,6 +16,7 @@ router = APIRouter()
404: {"description": "Attacker not found"},
},
)
@_traced("api.get_attacker_detail")
async def get_attacker_detail(
uuid: str,
user: dict = Depends(require_viewer),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import AttackersResponse
@@ -17,6 +18,7 @@ router = APIRouter()
422: {"description": "Validation error"},
},
)
@_traced("api.get_attackers")
async def get_attackers(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import get_password_hash, verify_password
from decnet.web.dependencies import get_current_user_unchecked, repo
from decnet.web.db.models import ChangePasswordRequest
@@ -18,6 +19,7 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
@_traced("api.change_password")
async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user)
if not _user or not verify_password(request.old_password, _user["password_hash"]):

View File

@@ -3,6 +3,7 @@ from typing import Any, Optional
from fastapi import APIRouter, HTTPException, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import (
ACCESS_TOKEN_EXPIRE_MINUTES,
create_access_token,
@@ -24,6 +25,7 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
@_traced("api.login")
async def login(request: LoginRequest) -> dict[str, Any]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username)
if not _user or not verify_password(request.password, _user["password_hash"]):

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import BountyResponse
@@ -10,6 +11,7 @@ router = APIRouter()
@router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_bounties")
async def get_bounties(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -1,6 +1,7 @@
from fastapi import APIRouter, Depends
from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import UserResponse
@@ -17,6 +18,7 @@ _DEFAULT_MUTATION_INTERVAL = "30m"
401: {"description": "Could not validate credentials"},
},
)
@_traced("api.get_config")
async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
limits_state = await repo.get_state("config_limits")
globals_state = await repo.get_state("config_globals")

View File

@@ -2,6 +2,7 @@ import uuid as _uuid
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.auth import get_password_hash
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import (
@@ -24,6 +25,7 @@ router = APIRouter()
422: {"description": "Validation error"},
},
)
@_traced("api.create_user")
async def api_create_user(
req: CreateUserRequest,
admin: dict = Depends(require_admin),
@@ -57,6 +59,7 @@ async def api_create_user(
404: {"description": "User not found"},
},
)
@_traced("api.delete_user")
async def api_delete_user(
user_uuid: str,
admin: dict = Depends(require_admin),
@@ -80,6 +83,7 @@ async def api_delete_user(
422: {"description": "Validation error"},
},
)
@_traced("api.update_user_role")
async def api_update_user_role(
user_uuid: str,
req: UpdateUserRoleRequest,
@@ -106,6 +110,7 @@ async def api_update_user_role(
422: {"description": "Validation error"},
},
)
@_traced("api.reset_user_password")
async def api_reset_user_password(
user_uuid: str,
req: ResetUserPasswordRequest,

View File

@@ -1,6 +1,7 @@
from fastapi import APIRouter, Depends, HTTPException
from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
router = APIRouter()
@@ -14,6 +15,7 @@ router = APIRouter()
403: {"description": "Admin access required or developer mode not enabled"},
},
)
@_traced("api.reinit")
async def api_reinit(admin: dict = Depends(require_admin)) -> dict:
if not DECNET_DEVELOPER:
raise HTTPException(status_code=403, detail="Developer mode is not enabled")

View File

@@ -1,5 +1,6 @@
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest
@@ -15,6 +16,7 @@ router = APIRouter()
422: {"description": "Validation error"},
},
)
@_traced("api.update_deployment_limit")
async def api_update_deployment_limit(
req: DeploymentLimitRequest,
admin: dict = Depends(require_admin),
@@ -32,6 +34,7 @@ async def api_update_deployment_limit(
422: {"description": "Validation error"},
},
)
@_traced("api.update_global_mutation_interval")
async def api_update_global_mutation_interval(
req: GlobalMutationIntervalRequest,
admin: dict = Depends(require_admin),

View File

@@ -3,6 +3,7 @@ import os
from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
from decnet.engine import deploy as _deploy
from decnet.ini_loader import load_ini_from_string
@@ -27,6 +28,7 @@ router = APIRouter()
500: {"description": "Deployment failed"}
}
)
@_traced("api.deploy_deckies")
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
from decnet.fleet import build_deckies_from_ini

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -9,5 +10,6 @@ router = APIRouter()
@router.get("/deckies", tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_deckies")
async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]:
return await repo.get_deckies()

View File

@@ -1,6 +1,7 @@
import os
from fastapi import APIRouter, Depends, HTTPException, Path
from decnet.telemetry import traced as _traced
from decnet.mutator import mutate_decky
from decnet.web.dependencies import require_admin, repo
@@ -12,6 +13,7 @@ router = APIRouter()
tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Decky not found"}}
)
@_traced("api.mutate_decky")
async def api_mutate_decky(
decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"),
admin: dict = Depends(require_admin),

View File

@@ -1,5 +1,6 @@
from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.config import DecnetConfig
from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import MutateIntervalRequest
@@ -24,6 +25,7 @@ def _parse_duration(s: str) -> int:
422: {"description": "Validation error"}
},
)
@_traced("api.update_mutate_interval")
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
state_dict = await repo.get_state("deployment")
if not state_dict:

View File

@@ -3,6 +3,7 @@ from typing import Any
from fastapi import APIRouter, Depends
from fastapi.responses import JSONResponse
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import HealthResponse, ComponentHealth
@@ -20,6 +21,7 @@ _OPTIONAL_SERVICES = {"sniffer_worker"}
503: {"model": HealthResponse, "description": "System unhealthy"},
},
)
@_traced("api.get_health")
async def get_health(user: dict = Depends(require_viewer)) -> Any:
components: dict[str, ComponentHealth] = {}

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
@@ -9,6 +10,7 @@ router = APIRouter()
@router.get("/logs/histogram", tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_logs_histogram")
async def get_logs_histogram(
search: Optional[str] = None,
start_time: Optional[str] = Query(None),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import LogsResponse
@@ -10,6 +11,7 @@ router = APIRouter()
@router.get("/logs", response_model=LogsResponse, tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}})
@_traced("api.get_logs")
async def get_logs(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import StatsResponse
@@ -10,5 +11,6 @@ router = APIRouter()
@router.get("/stats", response_model=StatsResponse, tags=["Observability"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_stats")
async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]:
    """Return the aggregated stats summary (requires viewer role)."""
    summary = await repo.get_stats_summary()
    return summary

View File

@@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse
from decnet.env import DECNET_DEVELOPER
from decnet.logging import get_logger
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
from decnet.web.dependencies import require_stream_viewer, repo
log = get_logger("api")
@@ -14,6 +15,34 @@ log = get_logger("api")
router = APIRouter()
def _build_trace_links(logs: list[dict]) -> list:
"""Build OTEL span links from persisted trace_id/span_id in log rows.
Returns an empty list when tracing is disabled (no OTEL imports).
"""
try:
from opentelemetry.trace import Link, SpanContext, TraceFlags
except ImportError:
return []
links: list[Link] = []
for entry in logs:
tid = entry.get("trace_id")
sid = entry.get("span_id")
if not tid or not sid or tid == "0":
continue
try:
ctx = SpanContext(
trace_id=int(tid, 16),
span_id=int(sid, 16),
is_remote=True,
trace_flags=TraceFlags(TraceFlags.SAMPLED),
)
links.append(Link(ctx))
except (ValueError, TypeError):
continue
return links
@router.get("/stream", tags=["Observability"],
responses={
200: {
@@ -24,6 +53,7 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
@_traced("api.stream_events")
async def stream_events(
request: Request,
last_event_id: int = Query(0, alias="lastEventId"),
@@ -75,6 +105,14 @@ async def stream_events(
)
if new_logs:
last_id = max(entry["id"] for entry in new_logs)
# Create a span linking back to the ingestion traces
# stored in each log row, closing the pipeline gap.
_links = _build_trace_links(new_logs)
_tracer = _get_tracer("sse")
with _tracer.start_as_current_span(
"sse.emit_logs", links=_links,
attributes={"log_count": len(new_logs)},
):
yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
loops_since_stats = stats_interval_sec

219
development/docs/TRACING.md Normal file
View File

@@ -0,0 +1,219 @@
# Distributed Tracing
OpenTelemetry (OTEL) distributed tracing across all DECNET services. Gated by the `DECNET_DEVELOPER_TRACING` environment variable (off by default). When disabled, zero overhead: no OTEL imports occur, `@traced` returns the original unwrapped function, and no middleware is installed.
## Quick Start
```bash
# 1. Start Jaeger (OTLP receiver on :4317, UI on :16686)
docker compose -f development/docker-compose.otel.yml up -d
# 2. Run DECNET with tracing enabled
DECNET_DEVELOPER_TRACING=true decnet web
# 3. Open Jaeger UI — service name is "decnet"
open http://localhost:16686
```
| Variable | Default | Purpose |
|----------|---------|---------|
| `DECNET_DEVELOPER_TRACING` | `false` | Enable/disable all tracing |
| `DECNET_OTEL_ENDPOINT` | `http://localhost:4317` | OTLP gRPC exporter target |
## Architecture
The core module is `decnet/telemetry.py`. All tracing flows through it.
| Export | Purpose |
|--------|---------|
| `setup_tracing(app)` | Init TracerProvider, instrument FastAPI, enable log-trace correlation |
| `shutdown_tracing()` | Flush and shut down the TracerProvider |
| `get_tracer(component)` | Return an OTEL Tracer or `_NoOpTracer` when disabled |
| `@traced(name)` | Decorator wrapping sync/async functions in spans (no-op when disabled) |
| `wrap_repository(repo)` | Dynamic `__getattr__` proxy adding `db.*` spans to every async method |
| `inject_context(record)` | Embed W3C trace context into a JSON record under `_trace` |
| `extract_context(record)` | Recover trace context from `_trace` and remove it from the record |
| `start_span_with_context(tracer, name, ctx)` | Start a span as child of an extracted context |
**TracerProvider config**: Resource(`service.name=decnet`, `service.version=0.2.0`), `BatchSpanProcessor`, OTLP gRPC exporter.
**When disabled**: `_NoOpTracer` and `_NoOpSpan` stubs are returned. No OTEL SDK packages are imported. The `@traced` decorator returns the original function object at decoration time.
## Pipeline Trace Propagation
The DECNET data pipeline is decoupled through JSON files and the database, which normally breaks trace continuity. Four mechanisms bridge the gaps:
1. **Collector → JSON**: `inject_context()` embeds W3C `traceparent`/`tracestate` into each JSON log record under a `_trace` key.
2. **JSON → Ingester**: `extract_context()` recovers the parent context. The ingester creates `ingester.process_record` as a child span, preserving the collector→ingester parent-child relationship.
3. **Ingester → DB**: The ingester persists the current span's `trace_id` and `span_id` as columns on the `logs` table before calling `repo.add_log()`.
4. **DB → SSE**: The SSE endpoint reads `trace_id`/`span_id` from log rows and creates OTEL **span links** (FOLLOWS_FROM-style references) on `sse.emit_logs`, connecting the read path back to the original ingestion traces.
**Log-trace correlation**: `_TraceContextFilter` (installed by `enable_trace_context()`) injects `otel_trace_id` and `otel_span_id` into Python `LogRecord` objects, bridging structured logs with trace context.
## Span Reference
### API Endpoints (23 spans)
| Span | Endpoint |
|------|----------|
| `api.login` | `POST /auth/login` |
| `api.change_password` | `POST /auth/change-password` |
| `api.get_logs` | `GET /logs` |
| `api.get_logs_histogram` | `GET /logs/histogram` |
| `api.get_bounties` | `GET /bounty` |
| `api.get_attackers` | `GET /attackers` |
| `api.get_attacker_detail` | `GET /attackers/{uuid}` |
| `api.get_attacker_commands` | `GET /attackers/{uuid}/commands` |
| `api.get_stats` | `GET /stats` |
| `api.get_deckies` | `GET /fleet/deckies` |
| `api.deploy_deckies` | `POST /fleet/deploy` |
| `api.mutate_decky` | `POST /fleet/mutate/{decky_id}` |
| `api.update_mutate_interval` | `POST /fleet/mutate-interval/{decky_id}` |
| `api.get_config` | `GET /config` |
| `api.update_deployment_limit` | `PUT /config/deployment-limit` |
| `api.update_global_mutation_interval` | `PUT /config/global-mutation-interval` |
| `api.create_user` | `POST /config/users` |
| `api.delete_user` | `DELETE /config/users/{uuid}` |
| `api.update_user_role` | `PUT /config/users/{uuid}/role` |
| `api.reset_user_password` | `PUT /config/users/{uuid}/password` |
| `api.reinit` | `POST /config/reinit` |
| `api.get_health` | `GET /health` |
| `api.stream_events` | `GET /stream` |
### DB Layer (dynamic)
Every async method on `BaseRepository` is automatically wrapped by `TracedRepository` as `db.<method_name>` (e.g. `db.add_log`, `db.get_attackers`, `db.upsert_attacker`).
### Collector
| Span | Type |
|------|------|
| `collector.stream_container` | `@traced` |
| `collector.event` | inline |
### Ingester
| Span | Type |
|------|------|
| `ingester.process_record` | inline (with parent context) |
| `ingester.extract_bounty` | `@traced` |
### Profiler
| Span | Type |
|------|------|
| `profiler.incremental_update` | `@traced` |
| `profiler.update_profiles` | `@traced` |
| `profiler.process_ip` | inline |
| `profiler.timing_stats` | `@traced` |
| `profiler.classify_behavior` | `@traced` |
| `profiler.detect_tools_from_headers` | `@traced` |
| `profiler.phase_sequence` | `@traced` |
| `profiler.sniffer_rollup` | `@traced` |
| `profiler.build_behavior_record` | `@traced` |
| `profiler.behavior_summary` | inline |
### Sniffer
| Span | Type |
|------|------|
| `sniffer.worker` | `@traced` |
| `sniffer.sniff_loop` | `@traced` |
| `sniffer.tcp_syn_fingerprint` | inline |
| `sniffer.tls_client_hello` | inline |
| `sniffer.tls_server_hello` | inline |
| `sniffer.tls_certificate` | inline |
| `sniffer.parse_client_hello` | `@traced` |
| `sniffer.parse_server_hello` | `@traced` |
| `sniffer.parse_certificate` | `@traced` |
| `sniffer.ja3` | `@traced` |
| `sniffer.ja3s` | `@traced` |
| `sniffer.ja4` | `@traced` |
| `sniffer.ja4s` | `@traced` |
| `sniffer.session_resumption_info` | `@traced` |
| `sniffer.p0f_guess_os` | `@traced` |
| `sniffer.write_event` | `@traced` |
### Prober
| Span | Type |
|------|------|
| `prober.worker` | `@traced` |
| `prober.discover_attackers` | `@traced` |
| `prober.probe_cycle` | `@traced` |
| `prober.jarm_phase` | `@traced` |
| `prober.hassh_phase` | `@traced` |
| `prober.tcpfp_phase` | `@traced` |
| `prober.jarm_hash` | `@traced` |
| `prober.jarm_send_probe` | `@traced` |
| `prober.hassh_server` | `@traced` |
| `prober.hassh_ssh_connect` | `@traced` |
| `prober.tcp_fingerprint` | `@traced` |
| `prober.tcpfp_send_syn` | `@traced` |
### Engine
| Span | Type |
|------|------|
| `engine.deploy` | `@traced` |
| `engine.teardown` | `@traced` |
| `engine.compose_with_retry` | `@traced` |
### Mutator
| Span | Type |
|------|------|
| `mutator.mutate_decky` | `@traced` |
| `mutator.mutate_all` | `@traced` |
| `mutator.watch_loop` | `@traced` |
### Correlation
| Span | Type |
|------|------|
| `correlation.ingest_file` | `@traced` |
| `correlation.ingest_file.summary` | inline |
| `correlation.traversals` | `@traced` |
| `correlation.report_json` | `@traced` |
| `correlation.traversal_syslog_lines` | `@traced` |
### Logging
| Span | Type |
|------|------|
| `logging.init_file_handler` | `@traced` |
| `logging.probe_log_target` | `@traced` |
### SSE
| Span | Type |
|------|------|
| `sse.emit_logs` | inline (with span links to ingestion traces) |
## Adding New Traces
```python
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
# Decorator (preferred for entire functions)
@_traced("component.operation")
async def my_function():
...
# Inline (for sub-sections within a function)
with _get_tracer("component").start_as_current_span("component.sub_op") as span:
span.set_attribute("key", "value")
...
```
Naming convention: `component.operation` (e.g. `prober.jarm_hash`, `profiler.timing_stats`).
## Troubleshooting
| Symptom | Check |
|---------|-------|
| No traces in Jaeger | `DECNET_DEVELOPER_TRACING=true`? Jaeger running on port 4317? |
| `ImportError` on OTEL packages | Run `pip install -e ".[dev]"` (OTEL is in optional deps) |
| Partial traces (ingester orphaned) | Verify `_trace` key present in JSON log file records |
| SSE spans have no links | Confirm `trace_id`/`span_id` columns exist in `logs` table |
| Performance concern | BatchSpanProcessor adds ~1ms per span; zero overhead when disabled |