feat: complete OTEL tracing across all services with pipeline bridge and docs

Extends tracing to every remaining module: all 23 API route handlers,
correlation engine, sniffer (fingerprint/p0f/syslog), prober (jarm/hassh/tcpfp),
profiler behavioral analysis, logging subsystem, engine, and mutator.

Bridges the ingester→SSE trace gap by persisting trace_id/span_id columns on
the logs table and creating OTEL span links in the SSE endpoint. Adds log-trace
correlation via _TraceContextFilter injecting otel_trace_id into Python LogRecords.

Includes development/docs/TRACING.md with full span reference (76 spans),
pipeline propagation architecture, quick start guide, and troubleshooting.
This commit is contained in:
2026-04-16 00:58:08 -04:00
parent 04db13afae
commit 70d8ffc607
38 changed files with 577 additions and 124 deletions

View File

@@ -33,6 +33,7 @@ from decnet.logging.syslog_formatter import (
SEVERITY_WARNING, SEVERITY_WARNING,
format_rfc5424, format_rfc5424,
) )
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
class CorrelationEngine: class CorrelationEngine:
@@ -64,6 +65,7 @@ class CorrelationEngine:
self.events_indexed += 1 self.events_indexed += 1
return event return event
@_traced("correlation.ingest_file")
def ingest_file(self, path: Path) -> int: def ingest_file(self, path: Path) -> int:
""" """
Parse every line of *path* and index it. Parse every line of *path* and index it.
@@ -73,12 +75,18 @@ class CorrelationEngine:
with open(path) as fh: with open(path) as fh:
for line in fh: for line in fh:
self.ingest(line) self.ingest(line)
_tracer = _get_tracer("correlation")
with _tracer.start_as_current_span("correlation.ingest_file.summary") as _span:
_span.set_attribute("lines_parsed", self.lines_parsed)
_span.set_attribute("events_indexed", self.events_indexed)
_span.set_attribute("unique_ips", len(self._events))
return self.events_indexed return self.events_indexed
# ------------------------------------------------------------------ # # ------------------------------------------------------------------ #
# Query # # Query #
# ------------------------------------------------------------------ # # ------------------------------------------------------------------ #
@_traced("correlation.traversals")
def traversals(self, min_deckies: int = 2) -> list[AttackerTraversal]: def traversals(self, min_deckies: int = 2) -> list[AttackerTraversal]:
""" """
Return all attackers that touched at least *min_deckies* distinct Return all attackers that touched at least *min_deckies* distinct
@@ -135,6 +143,7 @@ class CorrelationEngine:
) )
return table return table
@_traced("correlation.report_json")
def report_json(self, min_deckies: int = 2) -> dict: def report_json(self, min_deckies: int = 2) -> dict:
"""Serialisable dict representation of all traversals.""" """Serialisable dict representation of all traversals."""
return { return {
@@ -147,6 +156,7 @@ class CorrelationEngine:
"traversals": [t.to_dict() for t in self.traversals(min_deckies)], "traversals": [t.to_dict() for t in self.traversals(min_deckies)],
} }
@_traced("correlation.traversal_syslog_lines")
def traversal_syslog_lines(self, min_deckies: int = 2) -> list[str]: def traversal_syslog_lines(self, min_deckies: int = 2) -> list[str]:
""" """
Emit one RFC 5424 syslog line per detected traversal. Emit one RFC 5424 syslog line per detected traversal.

View File

@@ -68,6 +68,7 @@ _PERMANENT_ERRORS = (
) )
@_traced("engine.compose_with_retry")
def _compose_with_retry( def _compose_with_retry(
*args: str, *args: str,
compose_file: Path = COMPOSE_FILE, compose_file: Path = COMPOSE_FILE,

View File

@@ -7,6 +7,11 @@ Usage:
The returned logger propagates to the root logger (configured in config.py with The returned logger propagates to the root logger (configured in config.py with
Rfc5424Formatter), so level control via DECNET_DEVELOPER still applies globally. Rfc5424Formatter), so level control via DECNET_DEVELOPER still applies globally.
When ``DECNET_DEVELOPER_TRACING`` is active, every LogRecord is enriched with
``otel_trace_id`` and ``otel_span_id`` from the current OTEL span context.
This lets you correlate log lines with Jaeger traces — click a log entry and
jump straight to the span that produced it.
""" """
from __future__ import annotations from __future__ import annotations
@@ -27,6 +32,51 @@ class _ComponentFilter(logging.Filter):
return True return True
class _TraceContextFilter(logging.Filter):
"""Injects ``otel_trace_id`` and ``otel_span_id`` onto every LogRecord
from the active OTEL span context.
Installed once by ``enable_trace_context()`` on the root ``decnet`` logger
so all child loggers inherit the enrichment via propagation.
When no span is active, both fields are set to ``"0"`` (cheap string
comparison downstream, no None-checks needed).
"""
def filter(self, record: logging.LogRecord) -> bool:
try:
from opentelemetry import trace
span = trace.get_current_span()
ctx = span.get_span_context()
if ctx and ctx.trace_id:
record.otel_trace_id = format(ctx.trace_id, "032x") # type: ignore[attr-defined]
record.otel_span_id = format(ctx.span_id, "016x") # type: ignore[attr-defined]
else:
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
except Exception:
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
return True
# Module-level latch so the filter is attached at most once per process.
_trace_filter_installed: bool = False


def enable_trace_context() -> None:
    """Attach the OTEL trace-context filter to the root ``decnet`` logger.

    Invoked from ``decnet.telemetry.setup_tracing()`` once the
    TracerProvider is up.  Idempotent: repeated calls are no-ops, so the
    root logger never accumulates duplicate filters.
    """
    global _trace_filter_installed
    if _trace_filter_installed:
        return
    logging.getLogger("decnet").addFilter(_TraceContextFilter())
    _trace_filter_installed = True
def get_logger(component: str) -> logging.Logger: def get_logger(component: str) -> logging.Logger:
"""Return a named logger that self-identifies as *component* in RFC 5424. """Return a named logger that self-identifies as *component* in RFC 5424.

View File

@@ -13,6 +13,8 @@ import logging.handlers
import os import os
from pathlib import Path from pathlib import Path
from decnet.telemetry import traced as _traced
_LOG_FILE_ENV = "DECNET_LOG_FILE" _LOG_FILE_ENV = "DECNET_LOG_FILE"
_DEFAULT_LOG_FILE = "/var/log/decnet/decnet.log" _DEFAULT_LOG_FILE = "/var/log/decnet/decnet.log"
_MAX_BYTES = 10 * 1024 * 1024 # 10 MB _MAX_BYTES = 10 * 1024 * 1024 # 10 MB
@@ -22,10 +24,10 @@ _handler: logging.handlers.RotatingFileHandler | None = None
_logger: logging.Logger | None = None _logger: logging.Logger | None = None
def _get_logger() -> logging.Logger: @_traced("logging.init_file_handler")
def _init_file_handler() -> logging.Logger:
"""One-time initialisation of the rotating file handler."""
global _handler, _logger global _handler, _logger
if _logger is not None:
return _logger
log_path = Path(os.environ.get(_LOG_FILE_ENV, _DEFAULT_LOG_FILE)) log_path = Path(os.environ.get(_LOG_FILE_ENV, _DEFAULT_LOG_FILE))
log_path.parent.mkdir(parents=True, exist_ok=True) log_path.parent.mkdir(parents=True, exist_ok=True)
@@ -46,6 +48,12 @@ def _get_logger() -> logging.Logger:
return _logger return _logger
def _get_logger() -> logging.Logger:
if _logger is not None:
return _logger
return _init_file_handler()
def write_syslog(line: str) -> None: def write_syslog(line: str) -> None:
"""Write a single RFC 5424 syslog line to the rotating log file.""" """Write a single RFC 5424 syslog line to the rotating log file."""
try: try:

View File

@@ -11,6 +11,8 @@ shared utilities for validating and parsing the log_target string.
import socket import socket
from decnet.telemetry import traced as _traced
def parse_log_target(log_target: str) -> tuple[str, int]: def parse_log_target(log_target: str) -> tuple[str, int]:
""" """
@@ -23,6 +25,7 @@ def parse_log_target(log_target: str) -> tuple[str, int]:
return parts[0], int(parts[1]) return parts[0], int(parts[1])
@_traced("logging.probe_log_target")
def probe_log_target(log_target: str, timeout: float = 2.0) -> bool: def probe_log_target(log_target: str, timeout: float = 2.0) -> bool:
""" """
Return True if the log target is reachable (TCP connect succeeds). Return True if the log target is reachable (TCP connect succeeds).

View File

@@ -133,6 +133,7 @@ async def mutate_all(repo: BaseRepository, force: bool = False) -> None:
log.info("mutate_all: complete mutated_count=%d", mutated_count) log.info("mutate_all: complete mutated_count=%d", mutated_count)
@_traced("mutator.watch_loop")
async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) -> None: async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) -> None:
"""Run an infinite loop checking for deckies that need mutation.""" """Run an infinite loop checking for deckies that need mutation."""
log.info("mutator watch loop started poll_interval_secs=%d", poll_interval_secs) log.info("mutator watch loop started poll_interval_secs=%d", poll_interval_secs)

View File

@@ -9,7 +9,7 @@ This is the *server* variant of HASSH (HASSHServer). It fingerprints what
the server *offers*, which identifies the SSH implementation (OpenSSH, the server *offers*, which identifies the SSH implementation (OpenSSH,
Paramiko, libssh, Cobalt Strike SSH, etc.). Paramiko, libssh, Cobalt Strike SSH, etc.).
Stdlib only (socket, struct, hashlib). No DECNET imports. Stdlib only (socket, struct, hashlib) plus decnet.telemetry for tracing (zero-cost when disabled).
""" """
from __future__ import annotations from __future__ import annotations
@@ -19,6 +19,8 @@ import socket
import struct import struct
from typing import Any from typing import Any
from decnet.telemetry import traced as _traced
# SSH protocol constants # SSH protocol constants
_SSH_MSG_KEXINIT = 20 _SSH_MSG_KEXINIT = 20
_KEX_INIT_COOKIE_LEN = 16 _KEX_INIT_COOKIE_LEN = 16
@@ -36,6 +38,7 @@ _MAX_PACKET_LEN = 35000
# ─── SSH connection + KEX_INIT capture ────────────────────────────────────── # ─── SSH connection + KEX_INIT capture ──────────────────────────────────────
@_traced("prober.hassh_ssh_connect")
def _ssh_connect( def _ssh_connect(
host: str, host: str,
port: int, port: int,
@@ -213,6 +216,7 @@ def _compute_hassh(kex: str, enc: str, mac: str, comp: str) -> str:
# ─── Public API ───────────────────────────────────────────────────────────── # ─── Public API ─────────────────────────────────────────────────────────────
@_traced("prober.hassh_server")
def hassh_server( def hassh_server(
host: str, host: str,
port: int, port: int,

View File

@@ -8,7 +8,7 @@ fingerprint that identifies the TLS server implementation.
Reference: https://github.com/salesforce/jarm Reference: https://github.com/salesforce/jarm
No DECNET imports — this module is self-contained and testable in isolation. Only DECNET import is decnet.telemetry for tracing (zero-cost when disabled).
""" """
from __future__ import annotations from __future__ import annotations
@@ -19,6 +19,8 @@ import struct
import time import time
from typing import Any from typing import Any
from decnet.telemetry import traced as _traced
# ─── Constants ──────────────────────────────────────────────────────────────── # ─── Constants ────────────────────────────────────────────────────────────────
JARM_EMPTY_HASH = "0" * 62 JARM_EMPTY_HASH = "0" * 62
@@ -379,6 +381,7 @@ def _version_to_str(version: int) -> str:
# ─── Probe sender ──────────────────────────────────────────────────────────── # ─── Probe sender ────────────────────────────────────────────────────────────
@_traced("prober.jarm_send_probe")
def _send_probe(host: str, port: int, hello: bytes, timeout: float = 5.0) -> bytes | None: def _send_probe(host: str, port: int, hello: bytes, timeout: float = 5.0) -> bytes | None:
""" """
Open a TCP connection, send the ClientHello, and read the ServerHello. Open a TCP connection, send the ClientHello, and read the ServerHello.
@@ -471,6 +474,7 @@ def _compute_jarm(responses: list[str]) -> str:
# ─── Public API ────────────────────────────────────────────────────────────── # ─── Public API ──────────────────────────────────────────────────────────────
@_traced("prober.jarm_hash")
def jarm_hash(host: str, port: int, timeout: float = 5.0) -> str: def jarm_hash(host: str, port: int, timeout: float = 5.0) -> str:
""" """
Compute the JARM fingerprint for a TLS server. Compute the JARM fingerprint for a TLS server.

View File

@@ -15,6 +15,8 @@ import hashlib
import random import random
from typing import Any from typing import Any
from decnet.telemetry import traced as _traced
# Lazy-import scapy to avoid breaking non-root usage of HASSH/JARM. # Lazy-import scapy to avoid breaking non-root usage of HASSH/JARM.
# The actual import happens inside functions that need it. # The actual import happens inside functions that need it.
@@ -36,6 +38,7 @@ _OPT_CODES: dict[str, str] = {
# ─── Packet construction ─────────────────────────────────────────────────── # ─── Packet construction ───────────────────────────────────────────────────
@_traced("prober.tcpfp_send_syn")
def _send_syn( def _send_syn(
host: str, host: str,
port: int, port: int,
@@ -196,6 +199,7 @@ def _compute_fingerprint(fields: dict[str, Any]) -> tuple[str, str]:
# ─── Public API ───────────────────────────────────────────────────────────── # ─── Public API ─────────────────────────────────────────────────────────────
@_traced("prober.tcp_fingerprint")
def tcp_fingerprint( def tcp_fingerprint(
host: str, host: str,
port: int, port: int,

View File

@@ -169,6 +169,7 @@ def _write_event(
# ─── Target discovery from log stream ──────────────────────────────────────── # ─── Target discovery from log stream ────────────────────────────────────────
@_traced("prober.discover_attackers")
def _discover_attackers(json_path: Path, position: int) -> tuple[set[str], int]: def _discover_attackers(json_path: Path, position: int) -> tuple[set[str], int]:
""" """
Read new JSON log lines from the given position and extract unique Read new JSON log lines from the given position and extract unique
@@ -399,6 +400,7 @@ def _tcpfp_phase(
# ─── Main worker ───────────────────────────────────────────────────────────── # ─── Main worker ─────────────────────────────────────────────────────────────
@_traced("prober.worker")
async def prober_worker( async def prober_worker(
log_file: str, log_file: str,
interval: int = 300, interval: int = 300,

View File

@@ -31,6 +31,7 @@ from collections import Counter
from typing import Any from typing import Any
from decnet.correlation.parser import LogEvent from decnet.correlation.parser import LogEvent
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
# ─── Event-type taxonomy ──────────────────────────────────────────────────── # ─── Event-type taxonomy ────────────────────────────────────────────────────
@@ -147,6 +148,7 @@ def _os_from_ttl(ttl_str: str | None) -> str | None:
# ─── Timing stats ─────────────────────────────────────────────────────────── # ─── Timing stats ───────────────────────────────────────────────────────────
@_traced("profiler.timing_stats")
def timing_stats(events: list[LogEvent]) -> dict[str, Any]: def timing_stats(events: list[LogEvent]) -> dict[str, Any]:
""" """
Compute inter-arrival-time statistics across *events* (sorted by ts). Compute inter-arrival-time statistics across *events* (sorted by ts).
@@ -221,6 +223,7 @@ def timing_stats(events: list[LogEvent]) -> dict[str, Any]:
# ─── Behavior classification ──────────────────────────────────────────────── # ─── Behavior classification ────────────────────────────────────────────────
@_traced("profiler.classify_behavior")
def classify_behavior(stats: dict[str, Any], services_count: int) -> str: def classify_behavior(stats: dict[str, Any], services_count: int) -> str:
""" """
Coarse behavior bucket: Coarse behavior bucket:
@@ -305,6 +308,7 @@ def guess_tool(mean_iat_s: float | None, cv: float | None) -> str | None:
# ─── Header-based tool detection ──────────────────────────────────────────── # ─── Header-based tool detection ────────────────────────────────────────────
@_traced("profiler.detect_tools_from_headers")
def detect_tools_from_headers(events: list[LogEvent]) -> list[str]: def detect_tools_from_headers(events: list[LogEvent]) -> list[str]:
""" """
Scan HTTP `request` events for tool-identifying headers. Scan HTTP `request` events for tool-identifying headers.
@@ -372,6 +376,7 @@ def detect_tools_from_headers(events: list[LogEvent]) -> list[str]:
# ─── Phase sequencing ─────────────────────────────────────────────────────── # ─── Phase sequencing ───────────────────────────────────────────────────────
@_traced("profiler.phase_sequence")
def phase_sequence(events: list[LogEvent]) -> dict[str, Any]: def phase_sequence(events: list[LogEvent]) -> dict[str, Any]:
""" """
Derive recon→exfil phase transition info. Derive recon→exfil phase transition info.
@@ -418,6 +423,7 @@ def phase_sequence(events: list[LogEvent]) -> dict[str, Any]:
# ─── Sniffer rollup (OS fingerprint + retransmits) ────────────────────────── # ─── Sniffer rollup (OS fingerprint + retransmits) ──────────────────────────
@_traced("profiler.sniffer_rollup")
def sniffer_rollup(events: list[LogEvent]) -> dict[str, Any]: def sniffer_rollup(events: list[LogEvent]) -> dict[str, Any]:
""" """
Roll up sniffer-emitted `tcp_syn_fingerprint` and `tcp_flow_timing` Roll up sniffer-emitted `tcp_syn_fingerprint` and `tcp_flow_timing`
@@ -535,6 +541,7 @@ def _int_or_none(v: Any) -> int | None:
# ─── Composite: build the full AttackerBehavior record ────────────────────── # ─── Composite: build the full AttackerBehavior record ──────────────────────
@_traced("profiler.build_behavior_record")
def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]: def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]:
""" """
Build the dict to persist in the `attacker_behavior` table. Build the dict to persist in the `attacker_behavior` table.
@@ -572,6 +579,15 @@ def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]:
cv = stats.get("cv") cv = stats.get("cv")
beacon_jitter_pct = round(cv * 100, 2) if cv is not None else None beacon_jitter_pct = round(cv * 100, 2) if cv is not None else None
_tracer = _get_tracer("profiler")
with _tracer.start_as_current_span("profiler.behavior_summary") as _span:
_span.set_attribute("behavior_class", behavior)
_span.set_attribute("os_guess", rollup["os_guess"] or "unknown")
_span.set_attribute("tool_count", len(all_tools))
_span.set_attribute("event_count", stats.get("event_count", 0))
if all_tools:
_span.set_attribute("tools", ",".join(all_tools))
return { return {
"os_guess": rollup["os_guess"], "os_guess": rollup["os_guess"],
"hop_distance": rollup["hop_distance"], "hop_distance": rollup["hop_distance"],

View File

@@ -17,6 +17,7 @@ from typing import Any, Callable
from decnet.prober.tcpfp import _extract_options_order from decnet.prober.tcpfp import _extract_options_order
from decnet.sniffer.p0f import guess_os, hop_distance, initial_ttl from decnet.sniffer.p0f import guess_os, hop_distance, initial_ttl
from decnet.sniffer.syslog import SEVERITY_INFO, SEVERITY_WARNING, syslog_line from decnet.sniffer.syslog import SEVERITY_INFO, SEVERITY_WARNING, syslog_line
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
# ─── Constants ─────────────────────────────────────────────────────────────── # ─── Constants ───────────────────────────────────────────────────────────────
@@ -94,6 +95,7 @@ def _filter_grease(values: list[int]) -> list[int]:
# ─── TLS parsers ───────────────────────────────────────────────────────────── # ─── TLS parsers ─────────────────────────────────────────────────────────────
@_traced("sniffer.parse_client_hello")
def _parse_client_hello(data: bytes) -> dict[str, Any] | None: def _parse_client_hello(data: bytes) -> dict[str, Any] | None:
try: try:
if len(data) < 6: if len(data) < 6:
@@ -228,6 +230,7 @@ def _parse_client_hello(data: bytes) -> dict[str, Any] | None:
return None return None
@_traced("sniffer.parse_server_hello")
def _parse_server_hello(data: bytes) -> dict[str, Any] | None: def _parse_server_hello(data: bytes) -> dict[str, Any] | None:
try: try:
if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE: if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE:
@@ -294,6 +297,7 @@ def _parse_server_hello(data: bytes) -> dict[str, Any] | None:
return None return None
@_traced("sniffer.parse_certificate")
def _parse_certificate(data: bytes) -> dict[str, Any] | None: def _parse_certificate(data: bytes) -> dict[str, Any] | None:
try: try:
if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE: if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE:
@@ -547,6 +551,7 @@ def _tls_version_str(version: int) -> str:
}.get(version, f"0x{version:04x}") }.get(version, f"0x{version:04x}")
@_traced("sniffer.ja3")
def _ja3(ch: dict[str, Any]) -> tuple[str, str]: def _ja3(ch: dict[str, Any]) -> tuple[str, str]:
parts = [ parts = [
str(ch["tls_version"]), str(ch["tls_version"]),
@@ -559,6 +564,7 @@ def _ja3(ch: dict[str, Any]) -> tuple[str, str]:
return ja3_str, hashlib.md5(ja3_str.encode()).hexdigest() # nosec B324 return ja3_str, hashlib.md5(ja3_str.encode()).hexdigest() # nosec B324
@_traced("sniffer.ja3s")
def _ja3s(sh: dict[str, Any]) -> tuple[str, str]: def _ja3s(sh: dict[str, Any]) -> tuple[str, str]:
parts = [ parts = [
str(sh["tls_version"]), str(sh["tls_version"]),
@@ -605,6 +611,7 @@ def _sha256_12(text: str) -> str:
return hashlib.sha256(text.encode()).hexdigest()[:12] return hashlib.sha256(text.encode()).hexdigest()[:12]
@_traced("sniffer.ja4")
def _ja4(ch: dict[str, Any]) -> str: def _ja4(ch: dict[str, Any]) -> str:
proto = "t" proto = "t"
ver = _ja4_version(ch) ver = _ja4_version(ch)
@@ -624,6 +631,7 @@ def _ja4(ch: dict[str, Any]) -> str:
return f"{section_a}_{section_b}_{section_c}" return f"{section_a}_{section_b}_{section_c}"
@_traced("sniffer.ja4s")
def _ja4s(sh: dict[str, Any]) -> str: def _ja4s(sh: dict[str, Any]) -> str:
proto = "t" proto = "t"
selected = sh.get("selected_version") selected = sh.get("selected_version")
@@ -653,6 +661,7 @@ def _ja4l(
# ─── Session resumption ───────────────────────────────────────────────────── # ─── Session resumption ─────────────────────────────────────────────────────
@_traced("sniffer.session_resumption_info")
def _session_resumption_info(ch: dict[str, Any]) -> dict[str, Any]: def _session_resumption_info(ch: dict[str, Any]) -> dict[str, Any]:
mechanisms: list[str] = [] mechanisms: list[str] = []
if ch.get("has_session_ticket_data"): if ch.get("has_session_ticket_data"):
@@ -965,6 +974,10 @@ class SnifferEngine:
# when the destination is a known decky, i.e. we're seeing an # when the destination is a known decky, i.e. we're seeing an
# attacker's initial packet. # attacker's initial packet.
if dst_ip in self._ip_to_decky: if dst_ip in self._ip_to_decky:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tcp_syn_fingerprint") as _span:
_span.set_attribute("attacker_ip", src_ip)
_span.set_attribute("dst_port", dst_port)
tcp_fp = _extract_tcp_fingerprint(list(tcp.options or [])) tcp_fp = _extract_tcp_fingerprint(list(tcp.options or []))
os_label = guess_os( os_label = guess_os(
ttl=ip.ttl, ttl=ip.ttl,
@@ -973,6 +986,7 @@ class SnifferEngine:
wscale=tcp_fp["wscale"], wscale=tcp_fp["wscale"],
options_sig=tcp_fp["options_sig"], options_sig=tcp_fp["options_sig"],
) )
_span.set_attribute("os_guess", os_label)
target_node = self._ip_to_decky[dst_ip] target_node = self._ip_to_decky[dst_ip]
self._log( self._log(
target_node, target_node,
@@ -1019,6 +1033,10 @@ class SnifferEngine:
# ClientHello # ClientHello
ch = _parse_client_hello(payload) ch = _parse_client_hello(payload)
if ch is not None: if ch is not None:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tls_client_hello") as _span:
_span.set_attribute("attacker_ip", src_ip)
_span.set_attribute("dst_port", dst_port)
self._cleanup_sessions() self._cleanup_sessions()
key = (src_ip, src_port, dst_ip, dst_port) key = (src_ip, src_port, dst_ip, dst_port)
@@ -1027,6 +1045,10 @@ class SnifferEngine:
resumption = _session_resumption_info(ch) resumption = _session_resumption_info(ch)
rtt_data = _ja4l(key, self._tcp_rtt) rtt_data = _ja4l(key, self._tcp_rtt)
_span.set_attribute("ja3", ja3_hash)
_span.set_attribute("ja4", ja4_hash)
_span.set_attribute("sni", ch["sni"] or "")
self._sessions[key] = { self._sessions[key] = {
"ja3": ja3_hash, "ja3": ja3_hash,
"ja3_str": ja3_str, "ja3_str": ja3_str,
@@ -1071,6 +1093,9 @@ class SnifferEngine:
# ServerHello # ServerHello
sh = _parse_server_hello(payload) sh = _parse_server_hello(payload)
if sh is not None: if sh is not None:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tls_server_hello") as _span:
_span.set_attribute("attacker_ip", dst_ip)
rev_key = (dst_ip, dst_port, src_ip, src_port) rev_key = (dst_ip, dst_port, src_ip, src_port)
ch_data = self._sessions.pop(rev_key, None) ch_data = self._sessions.pop(rev_key, None)
self._session_ts.pop(rev_key, None) self._session_ts.pop(rev_key, None)
@@ -1078,6 +1103,9 @@ class SnifferEngine:
ja3s_str, ja3s_hash = _ja3s(sh) ja3s_str, ja3s_hash = _ja3s(sh)
ja4s_hash = _ja4s(sh) ja4s_hash = _ja4s(sh)
_span.set_attribute("ja3s", ja3s_hash)
_span.set_attribute("ja4s", ja4s_hash)
fields: dict[str, Any] = { fields: dict[str, Any] = {
"src_ip": dst_ip, "src_ip": dst_ip,
"src_port": str(dst_port), "src_port": str(dst_port),
@@ -1111,6 +1139,10 @@ class SnifferEngine:
# Certificate (TLS 1.2 only) # Certificate (TLS 1.2 only)
cert = _parse_certificate(payload) cert = _parse_certificate(payload)
if cert is not None: if cert is not None:
_tracer = _get_tracer("sniffer")
with _tracer.start_as_current_span("sniffer.tls_certificate") as _span:
_span.set_attribute("subject_cn", cert["subject_cn"])
_span.set_attribute("self_signed", cert["self_signed"])
rev_key = (dst_ip, dst_port, src_ip, src_port) rev_key = (dst_ip, dst_port, src_ip, src_port)
ch_data = self._sessions.get(rev_key) ch_data = self._sessions.get(rev_key)

View File

@@ -22,6 +22,8 @@ No external dependencies.
from __future__ import annotations from __future__ import annotations
from decnet.telemetry import traced as _traced
# ─── TTL → initial TTL bucket ─────────────────────────────────────────────── # ─── TTL → initial TTL bucket ───────────────────────────────────────────────
# Common "hop 0" TTLs. Packets decrement TTL once per hop, so we round up # Common "hop 0" TTLs. Packets decrement TTL once per hop, so we round up
@@ -216,6 +218,7 @@ def _match_signature(
return True return True
@_traced("sniffer.p0f_guess_os")
def guess_os( def guess_os(
ttl: int, ttl: int,
window: int, window: int,

View File

@@ -11,6 +11,7 @@ from pathlib import Path
from typing import Any from typing import Any
from decnet.collector.worker import parse_rfc5424 from decnet.collector.worker import parse_rfc5424
from decnet.telemetry import traced as _traced
# ─── Constants (must match templates/sniffer/decnet_logging.py) ────────────── # ─── Constants (must match templates/sniffer/decnet_logging.py) ──────────────
@@ -57,6 +58,7 @@ def syslog_line(
return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
@_traced("sniffer.write_event")
def write_event(line: str, log_path: Path, json_path: Path) -> None: def write_event(line: str, log_path: Path, json_path: Path) -> None:
"""Append a syslog line to the raw log and its parsed JSON to the json log.""" """Append a syslog line to the raw log and its parsed JSON to the json log."""
with open(log_path, "a", encoding="utf-8") as lf: with open(log_path, "a", encoding="utf-8") as lf:

View File

@@ -110,6 +110,7 @@ def _sniff_loop(
logger.info("sniffer: sniff loop ended") logger.info("sniffer: sniff loop ended")
@_traced("sniffer.worker")
async def sniffer_worker(log_file: str) -> None: async def sniffer_worker(log_file: str) -> None:
""" """
Async entry point — started as asyncio.create_task in the API lifespan. Async entry point — started as asyncio.create_task in the API lifespan.

View File

@@ -64,7 +64,9 @@ def setup_tracing(app: Any) -> None:
_init_provider() _init_provider()
from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor
FastAPIInstrumentor.instrument_app(app) FastAPIInstrumentor.instrument_app(app)
log.info("FastAPI auto-instrumentation active") from decnet.logging import enable_trace_context
enable_trace_context()
log.info("FastAPI auto-instrumentation active, log-trace correlation enabled")
except Exception as exc: except Exception as exc:
log.warning("OTEL setup failed — continuing without tracing: %s", exc) log.warning("OTEL setup failed — continuing without tracing: %s", exc)

View File

@@ -43,6 +43,11 @@ class Log(SQLModel, table=True):
raw_line: str = Field(sa_column=Column("raw_line", Text, nullable=False)) raw_line: str = Field(sa_column=Column("raw_line", Text, nullable=False))
fields: str = Field(sa_column=Column("fields", Text, nullable=False)) fields: str = Field(sa_column=Column("fields", Text, nullable=False))
msg: Optional[str] = Field(default=None, sa_column=Column("msg", Text, nullable=True)) msg: Optional[str] = Field(default=None, sa_column=Column("msg", Text, nullable=True))
# OTEL trace context — bridges the collector→ingester trace to the SSE
# read path. Nullable so pre-existing rows and non-traced deployments
# are unaffected.
trace_id: Optional[str] = Field(default=None)
span_id: Optional[str] = Field(default=None)
class Bounty(SQLModel, table=True): class Bounty(SQLModel, table=True):
__tablename__ = "bounty" __tablename__ = "bounty"

View File

@@ -75,6 +75,14 @@ async def log_ingestion_worker(repo: BaseRepository) -> None:
_span.set_attribute("service", _log_data.get("service", "")) _span.set_attribute("service", _log_data.get("service", ""))
_span.set_attribute("event_type", _log_data.get("event_type", "")) _span.set_attribute("event_type", _log_data.get("event_type", ""))
_span.set_attribute("attacker_ip", _log_data.get("attacker_ip", "")) _span.set_attribute("attacker_ip", _log_data.get("attacker_ip", ""))
# Persist trace context in the DB row so the SSE
# read path can link back to this ingestion trace.
_sctx = getattr(_span, "get_span_context", None)
if _sctx:
_ctx = _sctx()
if _ctx and getattr(_ctx, "trace_id", 0):
_log_data["trace_id"] = format(_ctx.trace_id, "032x")
_log_data["span_id"] = format(_ctx.span_id, "016x")
logger.debug("ingest: record decky=%s event_type=%s", _log_data.get("decky"), _log_data.get("event_type")) logger.debug("ingest: record decky=%s event_type=%s", _log_data.get("decky"), _log_data.get("event_type"))
await repo.add_log(_log_data) await repo.add_log(_log_data)
await _extract_bounty(repo, _log_data) await _extract_bounty(repo, _log_data)

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, Query from fastapi import APIRouter, Depends, HTTPException, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
router = APIRouter() router = APIRouter()
@@ -15,6 +16,7 @@ router = APIRouter()
404: {"description": "Attacker not found"}, 404: {"description": "Attacker not found"},
}, },
) )
@_traced("api.get_attacker_commands")
async def get_attacker_commands( async def get_attacker_commands(
uuid: str, uuid: str,
limit: int = Query(50, ge=1, le=200), limit: int = Query(50, ge=1, le=200),

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
router = APIRouter() router = APIRouter()
@@ -15,6 +16,7 @@ router = APIRouter()
404: {"description": "Attacker not found"}, 404: {"description": "Attacker not found"},
}, },
) )
@_traced("api.get_attacker_detail")
async def get_attacker_detail( async def get_attacker_detail(
uuid: str, uuid: str,
user: dict = Depends(require_viewer), user: dict = Depends(require_viewer),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import AttackersResponse from decnet.web.db.models import AttackersResponse
@@ -17,6 +18,7 @@ router = APIRouter()
422: {"description": "Validation error"}, 422: {"description": "Validation error"},
}, },
) )
@_traced("api.get_attackers")
async def get_attackers( async def get_attackers(
limit: int = Query(50, ge=1, le=1000), limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647), offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, status from fastapi import APIRouter, Depends, HTTPException, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import get_password_hash, verify_password from decnet.web.auth import get_password_hash, verify_password
from decnet.web.dependencies import get_current_user_unchecked, repo from decnet.web.dependencies import get_current_user_unchecked, repo
from decnet.web.db.models import ChangePasswordRequest from decnet.web.db.models import ChangePasswordRequest
@@ -18,6 +19,7 @@ router = APIRouter()
422: {"description": "Validation error"} 422: {"description": "Validation error"}
}, },
) )
@_traced("api.change_password")
async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]: async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user) _user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user)
if not _user or not verify_password(request.old_password, _user["password_hash"]): if not _user or not verify_password(request.old_password, _user["password_hash"]):

View File

@@ -3,6 +3,7 @@ from typing import Any, Optional
from fastapi import APIRouter, HTTPException, status from fastapi import APIRouter, HTTPException, status
from decnet.telemetry import traced as _traced
from decnet.web.auth import ( from decnet.web.auth import (
ACCESS_TOKEN_EXPIRE_MINUTES, ACCESS_TOKEN_EXPIRE_MINUTES,
create_access_token, create_access_token,
@@ -24,6 +25,7 @@ router = APIRouter()
422: {"description": "Validation error"} 422: {"description": "Validation error"}
}, },
) )
@_traced("api.login")
async def login(request: LoginRequest) -> dict[str, Any]: async def login(request: LoginRequest) -> dict[str, Any]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username) _user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username)
if not _user or not verify_password(request.password, _user["password_hash"]): if not _user or not verify_password(request.password, _user["password_hash"]):

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import BountyResponse from decnet.web.db.models import BountyResponse
@@ -10,6 +11,7 @@ router = APIRouter()
@router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"], @router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_bounties")
async def get_bounties( async def get_bounties(
limit: int = Query(50, ge=1, le=1000), limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647), offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -1,6 +1,7 @@
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from decnet.env import DECNET_DEVELOPER from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import UserResponse from decnet.web.db.models import UserResponse
@@ -17,6 +18,7 @@ _DEFAULT_MUTATION_INTERVAL = "30m"
401: {"description": "Could not validate credentials"}, 401: {"description": "Could not validate credentials"},
}, },
) )
@_traced("api.get_config")
async def api_get_config(user: dict = Depends(require_viewer)) -> dict: async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
limits_state = await repo.get_state("config_limits") limits_state = await repo.get_state("config_limits")
globals_state = await repo.get_state("config_globals") globals_state = await repo.get_state("config_globals")

View File

@@ -2,6 +2,7 @@ import uuid as _uuid
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.web.auth import get_password_hash from decnet.web.auth import get_password_hash
from decnet.web.dependencies import require_admin, repo from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import ( from decnet.web.db.models import (
@@ -24,6 +25,7 @@ router = APIRouter()
422: {"description": "Validation error"}, 422: {"description": "Validation error"},
}, },
) )
@_traced("api.create_user")
async def api_create_user( async def api_create_user(
req: CreateUserRequest, req: CreateUserRequest,
admin: dict = Depends(require_admin), admin: dict = Depends(require_admin),
@@ -57,6 +59,7 @@ async def api_create_user(
404: {"description": "User not found"}, 404: {"description": "User not found"},
}, },
) )
@_traced("api.delete_user")
async def api_delete_user( async def api_delete_user(
user_uuid: str, user_uuid: str,
admin: dict = Depends(require_admin), admin: dict = Depends(require_admin),
@@ -80,6 +83,7 @@ async def api_delete_user(
422: {"description": "Validation error"}, 422: {"description": "Validation error"},
}, },
) )
@_traced("api.update_user_role")
async def api_update_user_role( async def api_update_user_role(
user_uuid: str, user_uuid: str,
req: UpdateUserRoleRequest, req: UpdateUserRoleRequest,
@@ -106,6 +110,7 @@ async def api_update_user_role(
422: {"description": "Validation error"}, 422: {"description": "Validation error"},
}, },
) )
@_traced("api.reset_user_password")
async def api_reset_user_password( async def api_reset_user_password(
user_uuid: str, user_uuid: str,
req: ResetUserPasswordRequest, req: ResetUserPasswordRequest,

View File

@@ -1,6 +1,7 @@
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from decnet.env import DECNET_DEVELOPER from decnet.env import DECNET_DEVELOPER
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo from decnet.web.dependencies import require_admin, repo
router = APIRouter() router = APIRouter()
@@ -14,6 +15,7 @@ router = APIRouter()
403: {"description": "Admin access required or developer mode not enabled"}, 403: {"description": "Admin access required or developer mode not enabled"},
}, },
) )
@_traced("api.reinit")
async def api_reinit(admin: dict = Depends(require_admin)) -> dict: async def api_reinit(admin: dict = Depends(require_admin)) -> dict:
if not DECNET_DEVELOPER: if not DECNET_DEVELOPER:
raise HTTPException(status_code=403, detail="Developer mode is not enabled") raise HTTPException(status_code=403, detail="Developer mode is not enabled")

View File

@@ -1,5 +1,6 @@
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_admin, repo from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest
@@ -15,6 +16,7 @@ router = APIRouter()
422: {"description": "Validation error"}, 422: {"description": "Validation error"},
}, },
) )
@_traced("api.update_deployment_limit")
async def api_update_deployment_limit( async def api_update_deployment_limit(
req: DeploymentLimitRequest, req: DeploymentLimitRequest,
admin: dict = Depends(require_admin), admin: dict = Depends(require_admin),
@@ -32,6 +34,7 @@ async def api_update_deployment_limit(
422: {"description": "Validation error"}, 422: {"description": "Validation error"},
}, },
) )
@_traced("api.update_global_mutation_interval")
async def api_update_global_mutation_interval( async def api_update_global_mutation_interval(
req: GlobalMutationIntervalRequest, req: GlobalMutationIntervalRequest,
admin: dict = Depends(require_admin), admin: dict = Depends(require_admin),

View File

@@ -3,6 +3,7 @@ import os
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from decnet.logging import get_logger from decnet.logging import get_logger
from decnet.telemetry import traced as _traced
from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
from decnet.engine import deploy as _deploy from decnet.engine import deploy as _deploy
from decnet.ini_loader import load_ini_from_string from decnet.ini_loader import load_ini_from_string
@@ -27,6 +28,7 @@ router = APIRouter()
500: {"description": "Deployment failed"} 500: {"description": "Deployment failed"}
} }
) )
@_traced("api.deploy_deckies")
async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]: async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
from decnet.fleet import build_deckies_from_ini from decnet.fleet import build_deckies_from_ini

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
router = APIRouter() router = APIRouter()
@@ -9,5 +10,6 @@ router = APIRouter()
@router.get("/deckies", tags=["Fleet Management"], @router.get("/deckies", tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_deckies")
async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]: async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]:
return await repo.get_deckies() return await repo.get_deckies()

View File

@@ -1,6 +1,7 @@
import os import os
from fastapi import APIRouter, Depends, HTTPException, Path from fastapi import APIRouter, Depends, HTTPException, Path
from decnet.telemetry import traced as _traced
from decnet.mutator import mutate_decky from decnet.mutator import mutate_decky
from decnet.web.dependencies import require_admin, repo from decnet.web.dependencies import require_admin, repo
@@ -12,6 +13,7 @@ router = APIRouter()
tags=["Fleet Management"], tags=["Fleet Management"],
responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Decky not found"}} responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 404: {"description": "Decky not found"}}
) )
@_traced("api.mutate_decky")
async def api_mutate_decky( async def api_mutate_decky(
decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"), decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"),
admin: dict = Depends(require_admin), admin: dict = Depends(require_admin),

View File

@@ -1,5 +1,6 @@
from fastapi import APIRouter, Depends, HTTPException from fastapi import APIRouter, Depends, HTTPException
from decnet.telemetry import traced as _traced
from decnet.config import DecnetConfig from decnet.config import DecnetConfig
from decnet.web.dependencies import require_admin, repo from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import MutateIntervalRequest from decnet.web.db.models import MutateIntervalRequest
@@ -24,6 +25,7 @@ def _parse_duration(s: str) -> int:
422: {"description": "Validation error"} 422: {"description": "Validation error"}
}, },
) )
@_traced("api.update_mutate_interval")
async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]: async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
state_dict = await repo.get_state("deployment") state_dict = await repo.get_state("deployment")
if not state_dict: if not state_dict:

View File

@@ -3,6 +3,7 @@ from typing import Any
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from fastapi.responses import JSONResponse from fastapi.responses import JSONResponse
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import HealthResponse, ComponentHealth from decnet.web.db.models import HealthResponse, ComponentHealth
@@ -20,6 +21,7 @@ _OPTIONAL_SERVICES = {"sniffer_worker"}
503: {"model": HealthResponse, "description": "System unhealthy"}, 503: {"model": HealthResponse, "description": "System unhealthy"},
}, },
) )
@_traced("api.get_health")
async def get_health(user: dict = Depends(require_viewer)) -> Any: async def get_health(user: dict = Depends(require_viewer)) -> Any:
components: dict[str, ComponentHealth] = {} components: dict[str, ComponentHealth] = {}

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
router = APIRouter() router = APIRouter()
@@ -9,6 +10,7 @@ router = APIRouter()
@router.get("/logs/histogram", tags=["Logs"], @router.get("/logs/histogram", tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_logs_histogram")
async def get_logs_histogram( async def get_logs_histogram(
search: Optional[str] = None, search: Optional[str] = None,
start_time: Optional[str] = Query(None), start_time: Optional[str] = Query(None),

View File

@@ -2,6 +2,7 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, Query from fastapi import APIRouter, Depends, Query
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import LogsResponse from decnet.web.db.models import LogsResponse
@@ -10,6 +11,7 @@ router = APIRouter()
@router.get("/logs", response_model=LogsResponse, tags=["Logs"], @router.get("/logs", response_model=LogsResponse, tags=["Logs"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}}) responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}})
@_traced("api.get_logs")
async def get_logs( async def get_logs(
limit: int = Query(50, ge=1, le=1000), limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647), offset: int = Query(0, ge=0, le=2147483647),

View File

@@ -2,6 +2,7 @@ from typing import Any
from fastapi import APIRouter, Depends from fastapi import APIRouter, Depends
from decnet.telemetry import traced as _traced
from decnet.web.dependencies import require_viewer, repo from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import StatsResponse from decnet.web.db.models import StatsResponse
@@ -10,5 +11,6 @@ router = APIRouter()
@router.get("/stats", response_model=StatsResponse, tags=["Observability"], @router.get("/stats", response_model=StatsResponse, tags=["Observability"],
responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
@_traced("api.get_stats")
async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]: async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]:
return await repo.get_stats_summary() return await repo.get_stats_summary()

View File

@@ -7,6 +7,7 @@ from fastapi.responses import StreamingResponse
from decnet.env import DECNET_DEVELOPER from decnet.env import DECNET_DEVELOPER
from decnet.logging import get_logger from decnet.logging import get_logger
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
from decnet.web.dependencies import require_stream_viewer, repo from decnet.web.dependencies import require_stream_viewer, repo
log = get_logger("api") log = get_logger("api")
@@ -14,6 +15,34 @@ log = get_logger("api")
router = APIRouter() router = APIRouter()
def _build_trace_links(logs: list[dict]) -> list:
"""Build OTEL span links from persisted trace_id/span_id in log rows.
Returns an empty list when tracing is disabled (no OTEL imports).
"""
try:
from opentelemetry.trace import Link, SpanContext, TraceFlags
except ImportError:
return []
links: list[Link] = []
for entry in logs:
tid = entry.get("trace_id")
sid = entry.get("span_id")
if not tid or not sid or tid == "0":
continue
try:
ctx = SpanContext(
trace_id=int(tid, 16),
span_id=int(sid, 16),
is_remote=True,
trace_flags=TraceFlags(TraceFlags.SAMPLED),
)
links.append(Link(ctx))
except (ValueError, TypeError):
continue
return links
@router.get("/stream", tags=["Observability"], @router.get("/stream", tags=["Observability"],
responses={ responses={
200: { 200: {
@@ -24,6 +53,7 @@ router = APIRouter()
422: {"description": "Validation error"} 422: {"description": "Validation error"}
}, },
) )
@_traced("api.stream_events")
async def stream_events( async def stream_events(
request: Request, request: Request,
last_event_id: int = Query(0, alias="lastEventId"), last_event_id: int = Query(0, alias="lastEventId"),
@@ -75,6 +105,14 @@ async def stream_events(
) )
if new_logs: if new_logs:
last_id = max(entry["id"] for entry in new_logs) last_id = max(entry["id"] for entry in new_logs)
# Create a span linking back to the ingestion traces
# stored in each log row, closing the pipeline gap.
_links = _build_trace_links(new_logs)
_tracer = _get_tracer("sse")
with _tracer.start_as_current_span(
"sse.emit_logs", links=_links,
attributes={"log_count": len(new_logs)},
):
yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n" yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
loops_since_stats = stats_interval_sec loops_since_stats = stats_interval_sec

219
development/docs/TRACING.md Normal file
View File

@@ -0,0 +1,219 @@
# Distributed Tracing
OpenTelemetry (OTEL) distributed tracing across all DECNET services. Gated by the `DECNET_DEVELOPER_TRACING` environment variable (off by default). When disabled, zero overhead: no OTEL imports occur, `@traced` returns the original unwrapped function, and no middleware is installed.
## Quick Start
```bash
# 1. Start Jaeger (OTLP receiver on :4317, UI on :16686)
docker compose -f development/docker-compose.otel.yml up -d
# 2. Run DECNET with tracing enabled
DECNET_DEVELOPER_TRACING=true decnet web
# 3. Open Jaeger UI — service name is "decnet"
open http://localhost:16686
```
| Variable | Default | Purpose |
|----------|---------|---------|
| `DECNET_DEVELOPER_TRACING` | `false` | Enable/disable all tracing |
| `DECNET_OTEL_ENDPOINT` | `http://localhost:4317` | OTLP gRPC exporter target |
## Architecture
The core module is `decnet/telemetry.py`. All tracing flows through it.
| Export | Purpose |
|--------|---------|
| `setup_tracing(app)` | Init TracerProvider, instrument FastAPI, enable log-trace correlation |
| `shutdown_tracing()` | Flush and shut down the TracerProvider |
| `get_tracer(component)` | Return an OTEL Tracer or `_NoOpTracer` when disabled |
| `@traced(name)` | Decorator wrapping sync/async functions in spans (no-op when disabled) |
| `wrap_repository(repo)` | Dynamic `__getattr__` proxy adding `db.*` spans to every async method |
| `inject_context(record)` | Embed W3C trace context into a JSON record under `_trace` |
| `extract_context(record)` | Recover trace context from `_trace` and remove it from the record |
| `start_span_with_context(tracer, name, ctx)` | Start a span as child of an extracted context |
**TracerProvider config**: Resource(`service.name=decnet`, `service.version=0.2.0`), `BatchSpanProcessor`, OTLP gRPC exporter.
**When disabled**: `_NoOpTracer` and `_NoOpSpan` stubs are returned. No OTEL SDK packages are imported. The `@traced` decorator returns the original function object at decoration time.
## Pipeline Trace Propagation
The DECNET data pipeline is decoupled through JSON files and the database, which normally breaks trace continuity. Four mechanisms bridge the gaps:
1. **Collector → JSON**: `inject_context()` embeds W3C `traceparent`/`tracestate` into each JSON log record under a `_trace` key.
2. **JSON → Ingester**: `extract_context()` recovers the parent context. The ingester creates `ingester.process_record` as a child span, preserving the collector→ingester parent-child relationship.
3. **Ingester → DB**: The ingester persists the current span's `trace_id` and `span_id` as columns on the `logs` table before calling `repo.add_log()`.
4. **DB → SSE**: The SSE endpoint reads `trace_id`/`span_id` from log rows and creates OTEL **span links** (analogous to OpenTracing's FOLLOWS_FROM) on `sse.emit_logs`, connecting the read path back to the original ingestion traces.
**Log-trace correlation**: `_TraceContextFilter` (installed by `enable_trace_context()`) injects `otel_trace_id` and `otel_span_id` into Python `LogRecord` objects, bridging structured logs with trace context.
## Span Reference
### API Endpoints (23 spans)
| Span | Endpoint |
|------|----------|
| `api.login` | `POST /auth/login` |
| `api.change_password` | `POST /auth/change-password` |
| `api.get_logs` | `GET /logs` |
| `api.get_logs_histogram` | `GET /logs/histogram` |
| `api.get_bounties` | `GET /bounty` |
| `api.get_attackers` | `GET /attackers` |
| `api.get_attacker_detail` | `GET /attackers/{uuid}` |
| `api.get_attacker_commands` | `GET /attackers/{uuid}/commands` |
| `api.get_stats` | `GET /stats` |
| `api.get_deckies` | `GET /fleet/deckies` |
| `api.deploy_deckies` | `POST /fleet/deploy` |
| `api.mutate_decky` | `POST /fleet/mutate/{decky_name}` |
| `api.update_mutate_interval` | `POST /fleet/mutate-interval/{decky_name}` |
| `api.get_config` | `GET /config` |
| `api.update_deployment_limit` | `PUT /config/deployment-limit` |
| `api.update_global_mutation_interval` | `PUT /config/global-mutation-interval` |
| `api.create_user` | `POST /config/users` |
| `api.delete_user` | `DELETE /config/users/{uuid}` |
| `api.update_user_role` | `PUT /config/users/{uuid}/role` |
| `api.reset_user_password` | `PUT /config/users/{uuid}/password` |
| `api.reinit` | `POST /config/reinit` |
| `api.get_health` | `GET /health` |
| `api.stream_events` | `GET /stream` |
### DB Layer (dynamic)
Every async method on `BaseRepository` is automatically wrapped by `TracedRepository` as `db.<method_name>` (e.g. `db.add_log`, `db.get_attackers`, `db.upsert_attacker`).
### Collector
| Span | Type |
|------|------|
| `collector.stream_container` | `@traced` |
| `collector.event` | inline |
### Ingester
| Span | Type |
|------|------|
| `ingester.process_record` | inline (with parent context) |
| `ingester.extract_bounty` | `@traced` |
### Profiler
| Span | Type |
|------|------|
| `profiler.incremental_update` | `@traced` |
| `profiler.update_profiles` | `@traced` |
| `profiler.process_ip` | inline |
| `profiler.timing_stats` | `@traced` |
| `profiler.classify_behavior` | `@traced` |
| `profiler.detect_tools_from_headers` | `@traced` |
| `profiler.phase_sequence` | `@traced` |
| `profiler.sniffer_rollup` | `@traced` |
| `profiler.build_behavior_record` | `@traced` |
| `profiler.behavior_summary` | inline |
### Sniffer
| Span | Type |
|------|------|
| `sniffer.worker` | `@traced` |
| `sniffer.sniff_loop` | `@traced` |
| `sniffer.tcp_syn_fingerprint` | inline |
| `sniffer.tls_client_hello` | inline |
| `sniffer.tls_server_hello` | inline |
| `sniffer.tls_certificate` | inline |
| `sniffer.parse_client_hello` | `@traced` |
| `sniffer.parse_server_hello` | `@traced` |
| `sniffer.parse_certificate` | `@traced` |
| `sniffer.ja3` | `@traced` |
| `sniffer.ja3s` | `@traced` |
| `sniffer.ja4` | `@traced` |
| `sniffer.ja4s` | `@traced` |
| `sniffer.session_resumption_info` | `@traced` |
| `sniffer.p0f_guess_os` | `@traced` |
| `sniffer.write_event` | `@traced` |
### Prober
| Span | Type |
|------|------|
| `prober.worker` | `@traced` |
| `prober.discover_attackers` | `@traced` |
| `prober.probe_cycle` | `@traced` |
| `prober.jarm_phase` | `@traced` |
| `prober.hassh_phase` | `@traced` |
| `prober.tcpfp_phase` | `@traced` |
| `prober.jarm_hash` | `@traced` |
| `prober.jarm_send_probe` | `@traced` |
| `prober.hassh_server` | `@traced` |
| `prober.hassh_ssh_connect` | `@traced` |
| `prober.tcp_fingerprint` | `@traced` |
| `prober.tcpfp_send_syn` | `@traced` |
### Engine
| Span | Type |
|------|------|
| `engine.deploy` | `@traced` |
| `engine.teardown` | `@traced` |
| `engine.compose_with_retry` | `@traced` |
### Mutator
| Span | Type |
|------|------|
| `mutator.mutate_decky` | `@traced` |
| `mutator.mutate_all` | `@traced` |
| `mutator.watch_loop` | `@traced` |
### Correlation
| Span | Type |
|------|------|
| `correlation.ingest_file` | `@traced` |
| `correlation.ingest_file.summary` | inline |
| `correlation.traversals` | `@traced` |
| `correlation.report_json` | `@traced` |
| `correlation.traversal_syslog_lines` | `@traced` |
### Logging
| Span | Type |
|------|------|
| `logging.init_file_handler` | `@traced` |
| `logging.probe_log_target` | `@traced` |
### SSE
| Span | Type |
|------|------|
| `sse.emit_logs` | inline (with span links to ingestion traces) |
## Adding New Traces
```python
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
# Decorator (preferred for entire functions)
@_traced("component.operation")
async def my_function():
...
# Inline (for sub-sections within a function)
with _get_tracer("component").start_as_current_span("component.sub_op") as span:
span.set_attribute("key", "value")
...
```
Naming convention: `component.operation` (e.g. `prober.jarm_hash`, `profiler.timing_stats`).
## Troubleshooting
| Symptom | Check |
|---------|-------|
| No traces in Jaeger | `DECNET_DEVELOPER_TRACING=true`? Jaeger running on port 4317? |
| `ImportError` on OTEL packages | Run `pip install -e ".[dev]"` (OTEL is in optional deps) |
| Partial traces (ingester orphaned) | Verify `_trace` key present in JSON log file records |
| SSE spans have no links | Confirm `trace_id`/`span_id` columns exist in `logs` table |
| Performance concern | BatchSpanProcessor adds ~1ms per span; zero overhead when disabled |