merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,11 @@
"""
Fleet-wide MACVLAN sniffer microservice.
Runs as a single host-side background task (not per-decky) that sniffs
all TLS traffic on the MACVLAN interface, extracts fingerprints, and
feeds events into the existing log pipeline.
"""
from decnet.sniffer.worker import sniffer_worker
__all__ = ["sniffer_worker"]

File diff suppressed because it is too large Load Diff

238
decnet/sniffer/p0f.py Normal file
View File

@@ -0,0 +1,238 @@
"""
Passive OS fingerprinting (p0f-lite) for the DECNET sniffer.
Pure-Python lookup module. Given the values of an incoming TCP SYN packet
(TTL, window, MSS, window-scale, and TCP option ordering), returns a coarse
OS bucket (linux / windows / macos_ios / freebsd / openbsd / embedded / nmap / unknown)
plus derived hop distance and inferred initial TTL.
Rationale
---------
Full p0f v3 distinguishes several dozen OS/tool profiles by combining dozens
of low-level quirks (OLEN, WSIZE, EOL padding, PCLASS, quirks, payload class).
For DECNET we only need a coarse bucket — enough to tag an attacker as
"linux beacon" vs "windows interactive" vs "active scan". The curated
table below covers default stacks that dominate real-world attacker traffic.
References (public p0f v3 DB, nmap-os-db, and Mozilla OS Fingerprint table):
https://github.com/p0f/p0f/blob/master/p0f.fp
No external dependencies.
"""
from __future__ import annotations
from decnet.telemetry import traced as _traced
# ─── TTL → initial TTL bucket ───────────────────────────────────────────────
# Well-known "hop 0" TTLs. Every hop decrements the TTL by one, so the
# sender's starting value is recovered by rounding the observed TTL *up* to
# the next entry in this (ascending) tuple.
_TTL_BUCKETS: tuple[int, ...] = (32, 64, 128, 255)


def initial_ttl(ttl: int) -> int:
    """
    Return the smallest known initial-TTL bucket that is >= *ttl*.

    Examples: an observed TTL of 59 maps to 64 (the Linux/BSD-style start
    value, five hops away) and 120 maps to 128 (the Windows-style start
    value, eight hops away). Anything above the largest bucket maps to 255.
    """
    return next((start for start in _TTL_BUCKETS if start >= ttl), 255)
def hop_distance(ttl: int) -> int:
    """
    Estimate how many hops separate the attacker from the sniffer.

    The estimate is the inferred initial TTL minus the observed *ttl*,
    clamped to the range 0..64. The upper bound exists because anything
    further away has most likely been mangled by a misconfigured firewall
    or a TTL-spoofing NAT.
    """
    hops = initial_ttl(ttl) - ttl
    return max(0, min(hops, 64))
# ─── OS signature table (TTL bucket, window, MSS, wscale, option-order) ─────
# Each entry is a ``(predicates, label)`` pair. If every predicate in the
# dict matches, the OS label is returned. First match wins, so the order of
# the entries matters. A predicate key that is omitted means "don't care".
#
# Predicate keys understood by _match_signature():
#   ttl_bucket            inferred initial TTL equals the value
#   window / window_in    window equals the value / is a member of the set
#   window_min            window is at least the value
#   mss                   MSS equals the value
#   wscale / wscale_in    window scale equals the value / is in the set
#   has_sack              whether "S" is present in the option string
#   has_timestamps        whether "T" is present in the option string
#   options               option string equals the value exactly
#   options_starts_with   option string starts with the value
#   options_ends_with     option string ends with the value
#   options_no_eol        when true, "E" must not appear in the option string
#
# The option signatures use the short-code alphabet from
# decnet/prober/tcpfp.py :: _OPT_CODES (M=MSS, N=NOP, W=WScale,
# T=Timestamp, S=SAckOK, E=EOL).
_SIGNATURES: tuple[tuple[dict, str], ...] = (
    # ── nmap -sS / -sT default probe ───────────────────────────────────────
    # nmap crafts very distinctive SYNs: tiny window (1024/4096/etc.), full
    # option set including WScale=10 and SAckOK. Match these first so they
    # don't get misclassified as Linux.
    (
        {
            "ttl_bucket": 64,
            "window_in": {1024, 2048, 3072, 4096, 31337, 32768, 65535},
            "mss": 1460,
            "wscale": 10,
            "options": "M,W,T,S,S",
        },
        "nmap",
    ),
    # Looser nmap fallback: same window set, only the option prefix is checked.
    (
        {
            "ttl_bucket": 64,
            "window_in": {1024, 2048, 3072, 4096, 31337, 32768, 65535},
            "options_starts_with": "M,W,T,S",
        },
        "nmap",
    ),
    # ── macOS / iOS default SYN (match before Linux — shares TTL 64) ──────
    # TTL 64, window 65535, MSS 1460, WScale 6, specific option order
    # M,N,W,N,N,T,S,E (Darwin signature with EOL padding).
    (
        {
            "ttl_bucket": 64,
            "window": 65535,
            "wscale": 6,
            "options": "M,N,W,N,N,T,S,E",
        },
        "macos_ios",
    ),
    # Looser Darwin fallback: any timestamped option list that ends in EOL.
    (
        {
            "ttl_bucket": 64,
            "window_in": {65535},
            "wscale_in": {5, 6},
            "has_timestamps": True,
            "options_ends_with": "E",
        },
        "macos_ios",
    ),
    # ── FreeBSD default SYN (TTL 64, no EOL) ───────────────────────────────
    (
        {
            "ttl_bucket": 64,
            "window": 65535,
            "wscale": 6,
            "has_sack": True,
            "has_timestamps": True,
            "options_no_eol": True,
        },
        "freebsd",
    ),
    # ── Linux (kernel 3.x – 6.x) default SYN ───────────────────────────────
    # TTL 64, window 29200 / 64240 / 65535, MSS 1460, WScale 7, full options.
    (
        {
            "ttl_bucket": 64,
            "window_min": 5000,
            "wscale_in": {6, 7, 8, 9, 10, 11, 12, 13, 14},
            "has_sack": True,
            "has_timestamps": True,
        },
        "linux",
    ),
    # ── OpenBSD default SYN ─────────────────────────────────────────────────
    # TTL 64, window 16384, WScale 3-6, MSS 1460
    (
        {
            "ttl_bucket": 64,
            "window_in": {16384, 16960},
            "wscale_in": {3, 4, 5, 6},
        },
        "openbsd",
    ),
    # ── Windows 10/11/Server default SYN ────────────────────────────────────
    # TTL 128, window 64240/65535, MSS 1460, WScale 8, SACK+TS
    (
        {
            "ttl_bucket": 128,
            "window_min": 8192,
            "wscale_in": {2, 6, 7, 8},
            "has_sack": True,
        },
        "windows",
    ),
    # ── Windows 7/XP (legacy) ───────────────────────────────────────────────
    (
        {
            "ttl_bucket": 128,
            "window_in": {8192, 16384, 65535},
        },
        "windows",
    ),
    # ── Embedded / Cisco / network gear ─────────────────────────────────────
    # Catch-all for the 255 bucket: no predicate other than the TTL.
    (
        {
            "ttl_bucket": 255,
        },
        "embedded",
    ),
)
def _match_signature(
    sig: dict,
    ttl: int,
    window: int,
    mss: int,
    wscale: int | None,
    options_sig: str,
) -> bool:
    """
    Return True when every predicate present in *sig* holds.

    A predicate key that is absent from *sig* is treated as "don't care";
    keys this function does not know about are ignored. The terms below are
    evaluated top to bottom and short-circuit on the first failure.

    *options_sig* is the comma-separated option short-code string
    (e.g. ``"M,N,W,N,N,T,S,E"``); SACK, timestamp and EOL presence are
    detected by looking for "S", "T" and "E" inside it.
    """
    bucket = initial_ttl(ttl)
    return (
        ("ttl_bucket" not in sig or sig["ttl_bucket"] == bucket)
        and ("window" not in sig or sig["window"] == window)
        and ("window_in" not in sig or window in sig["window_in"])
        and ("window_min" not in sig or window >= sig["window_min"])
        and ("mss" not in sig or sig["mss"] == mss)
        and ("wscale" not in sig or sig["wscale"] == wscale)
        # A missing window-scale option (None) never satisfies a set predicate.
        and ("wscale_in" not in sig or wscale in sig["wscale_in"])
        and ("has_sack" not in sig or sig["has_sack"] == ("S" in options_sig))
        and (
            "has_timestamps" not in sig
            or sig["has_timestamps"] == ("T" in options_sig)
        )
        and ("options" not in sig or sig["options"] == options_sig)
        and (
            "options_starts_with" not in sig
            or options_sig.startswith(sig["options_starts_with"])
        )
        and (
            "options_ends_with" not in sig
            or options_sig.endswith(sig["options_ends_with"])
        )
        and (
            "options_no_eol" not in sig
            or not (sig["options_no_eol"] and "E" in options_sig)
        )
    )
@_traced("sniffer.p0f_guess_os")
def guess_os(
    ttl: int,
    window: int,
    mss: int = 0,
    wscale: int | None = None,
    options_sig: str = "",
) -> str:
    """
    Map the characteristics of a TCP SYN to a coarse OS bucket.

    The signature table is scanned in order and the label of the first
    entry whose predicates all match is returned.

    Returns one of: "linux", "windows", "macos_ios", "freebsd", "openbsd",
    "embedded", "nmap", or "unknown" when no signature matches.
    """
    matching_labels = (
        label
        for predicates, label in _SIGNATURES
        if _match_signature(predicates, ttl, window, mss, wscale, options_sig)
    )
    return next(matching_labels, "unknown")

View File

@@ -0,0 +1,63 @@
"""
Sequence-pattern classifier for TCP/IP fields that are useful as a tooling
fingerprint when sampled across multiple packets from the same source.
Two callers today:
- IP-ID sequence per attacker (random/incremental/zero/constant).
- TCP ISN sequence per attacker; modern stacks randomise, so a non-random
result is itself a strong signal (legacy stacks, custom raw-socket tools).
Pure stdlib so it stays trivially unit-testable.
"""
from __future__ import annotations

import statistics
from collections.abc import Iterable
# Minimum samples needed for a meaningful classification. Below this we
# return "unknown" rather than guess from 1-3 noisy values.
_MIN_SAMPLES = 4

# Max plausible delta for an "incremental" classification. The IP-ID field
# is 16-bit so kernel-emitted increments wrap rapidly under load — anything
# over 4096 between consecutive SYNs from the same host is almost certainly
# random rather than a counter we just happen to be sampling sparsely.
_INCREMENTAL_MAX_DELTA = 0x1000

# Coefficient-of-variation threshold above which a sequence was meant to be
# called random (stddev/mean > 0.5). classify_sequence() no longer consults
# it: the check never influenced the result, because everything that is not
# zero/constant/incremental is reported as "random" anyway. The name is left
# in place in case anything else refers to it.
_RANDOM_CV_THRESHOLD = 0.5


def classify_sequence(samples: Iterable[int], *, modulus: int | None = None) -> str:
    """
    Classify an integer sequence as one of:

    - "zero":        every sample is 0
    - "constant":    every sample is the same non-zero value
    - "incremental": every consecutive delta is between 1 and
                     _INCREMENTAL_MAX_DELTA (inclusive)
    - "random":      anything else (catch-all)
    - "unknown":     fewer than _MIN_SAMPLES samples

    Order is preserved — pass the samples in arrival order.

    Args:
        samples: Any iterable of ints (list, tuple, deque, ...).
        modulus: Optional size of the counter space (e.g. ``0x10000`` for the
            16-bit IP-ID). When given, deltas are taken modulo this value so
            a counter that wraps inside the sample window still classifies
            as "incremental". With the default ``None`` deltas are plain
            differences, exactly as before.

    Raises:
        ValueError: if *modulus* is given but is not positive.
    """
    if modulus is not None and modulus <= 0:
        raise ValueError("modulus must be a positive integer")

    # Copy once into a list: a deque cannot be sliced, and the pairwise
    # walk below needs ``values[1:]``.
    values = list(samples)
    if len(values) < _MIN_SAMPLES:
        return "unknown"

    first = values[0]
    if all(v == first for v in values):
        return "zero" if first == 0 else "constant"

    pairs = zip(values, values[1:])
    if modulus is None:
        deltas = (cur - prev for prev, cur in pairs)
    else:
        deltas = ((cur - prev) % modulus for prev, cur in pairs)
    if all(0 < delta <= _INCREMENTAL_MAX_DELTA for delta in deltas):
        return "incremental"

    # No recognisable structure left: report it as random.
    return "random"

71
decnet/sniffer/syslog.py Normal file
View File

@@ -0,0 +1,71 @@
"""
RFC 5424 syslog formatting and log-file writing for the fleet sniffer.
Reuses the same wire format as templates/sniffer/decnet_logging.py so the
existing collector parser and ingester can consume events without changes.
"""
import json
from datetime import datetime, timezone
from pathlib import Path
from typing import Any
from decnet.collector.worker import parse_rfc5424
from decnet.telemetry import traced as _traced
# ─── Constants (must match templates/sniffer/decnet_logging.py) ──────────────
# Facility number; syslog_line() computes PRI as facility * 8 + severity.
_FACILITY_LOCAL0 = 16
# SD-ID written at the start of the structured-data element.
_SD_ID = "relay@55555"
# Placeholder emitted for an empty header field or empty structured data.
_NILVALUE = "-"
# Severity values for syslog_line(); SEVERITY_INFO is its default.
SEVERITY_INFO = 6
SEVERITY_WARNING = 4
# Truncation limits syslog_line() applies to HOSTNAME, APP-NAME and MSGID.
_MAX_HOSTNAME = 255
_MAX_APPNAME = 48
_MAX_MSGID = 32
# ─── Formatter ───────────────────────────────────────────────────────────────
def _sd_escape(value: str) -> str:
return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
def _sd_element(fields: dict[str, Any]) -> str:
    """
    Render *fields* as one structured-data element.

    Produces ``[<SD-ID> key="value" ...]`` with keys in dict order; every
    value is passed through ``str()`` and then escaped. An empty mapping
    yields the NILVALUE placeholder instead of an empty element.
    """
    if not fields:
        return _NILVALUE

    rendered: list[str] = []
    for key, raw in fields.items():
        rendered.append(f'{key}="{_sd_escape(str(raw))}"')
    return "[" + _SD_ID + " " + " ".join(rendered) + "]"
def syslog_line(
    service: str,
    hostname: str,
    event_type: str,
    severity: int = SEVERITY_INFO,
    msg: str | None = None,
    **fields: Any,
) -> str:
    """
    Build one RFC 5424 syslog line for an event.

    Header layout, space separated:
    ``<PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID STRUCTURED-DATA [MSG]``

    - PRI is ``_FACILITY_LOCAL0 * 8 + severity``.
    - TIMESTAMP is the current UTC time in ISO-8601 form.
    - *hostname*, *service* and *event_type* fill HOSTNAME, APP-NAME and
      MSGID; each is truncated to its maximum length and replaced by the
      NILVALUE placeholder when empty.
    - PROCID is always the NILVALUE placeholder.
    - Extra keyword arguments become the structured-data parameters.
    - *msg*, when non-empty, is appended after a single space.
    """
    prival = _FACILITY_LOCAL0 * 8 + severity
    header_parts = (
        f"<{prival}>1",
        datetime.now(timezone.utc).isoformat(),
        (hostname or _NILVALUE)[:_MAX_HOSTNAME],
        (service or _NILVALUE)[:_MAX_APPNAME],
        _NILVALUE,
        (event_type or _NILVALUE)[:_MAX_MSGID],
        _sd_element(fields),
    )
    line = " ".join(header_parts)
    if msg:
        line = f"{line} {msg}"
    return line
@_traced("sniffer.write_event")
def write_event(line: str, log_path: Path, json_path: Path) -> None:
    """
    Persist one event in both on-disk formats.

    The raw syslog *line* is always appended to *log_path*. The line is then
    run through ``parse_rfc5424``; when that yields a truthy result, it is
    appended to *json_path* as one JSON document per line. Both files are
    opened in append mode with UTF-8 encoding for each call.
    """
    with open(log_path, "a", encoding="utf-8") as raw_log:
        raw_log.write(f"{line}\n")
        raw_log.flush()

    record = parse_rfc5424(line)
    if not record:
        # Nothing parseable — the raw log keeps the line, the JSON log skips it.
        return

    with open(json_path, "a", encoding="utf-8") as json_log:
        json_log.write(json.dumps(record) + "\n")
        json_log.flush()

243
decnet/sniffer/worker.py Normal file
View File

@@ -0,0 +1,243 @@
"""
Fleet-wide MACVLAN sniffer worker.
Runs as a single host-side async background task that sniffs all TLS
traffic on the MACVLAN host interface. Maps packets to deckies by IP
and feeds fingerprint events into the existing log pipeline.
Modeled on decnet.collector.worker — same lifecycle pattern.
Fault-isolated: any exception is logged and the worker exits cleanly.
The API never depends on this worker being alive.
"""
import asyncio
import contextlib
import os
import subprocess # nosec B404 — needed for interface checks
import threading
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, Callable
from decnet.bus import topics as _topics
from decnet.bus.base import BaseBus
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
make_thread_safe_publisher,
run_control_listener_signal,
run_health_heartbeat,
)
from decnet.logging import get_logger
from decnet.network import HOST_IPVLAN_IFACE, HOST_MACVLAN_IFACE
from decnet.sniffer.fingerprint import SnifferEngine
from decnet.sniffer.syslog import write_event
from decnet.telemetry import traced as _traced
# Module-level logger shared by every function in this worker.
logger = get_logger("sniffer")
# Seconds the sniff loop's background refresh thread waits between reloads
# of the IP → decky map.
_IP_MAP_REFRESH_INTERVAL: float = 60.0
def _load_ip_to_decky() -> dict[str, str]:
    """
    Return the current IP → decky-name mapping from the saved state.

    Yields an empty dict when ``load_state()`` reports no state. The
    ``decnet.config`` import is deferred until the function is called.
    """
    from decnet.config import load_state

    loaded = load_state()
    if loaded is None:
        return {}

    # The state is a pair; only its first element (the config) is used here.
    config, _unused = loaded
    return {decky.ip: decky.name for decky in config.deckies}
def _make_decky_traffic_publisher(
    bus: BaseBus,
    loop: asyncio.AbstractEventLoop,
) -> Callable[[str, str, dict[str, Any]], None]:
    """Adapt :func:`make_thread_safe_publisher` to the decky-traffic topic.

    The returned callable takes ``(decky_name, event_type, payload)`` — the
    shape the sniffer engine calls from the scapy sniff thread — resolves the
    per-decky traffic topic for that name and hands the publish over to the
    thread-safe publisher bound to *bus* and *loop*.
    """
    publish_threadsafe = make_thread_safe_publisher(bus, loop)

    def _publish(decky_name: str, event_type: str, payload: dict[str, Any]) -> None:
        # Note the argument order of the underlying publisher:
        # topic first, then payload, then event type.
        publish_threadsafe(
            _topics.decky(decky_name, _topics.DECKY_TRAFFIC),
            payload,
            event_type,
        )

    return _publish
def _interface_exists(iface: str) -> bool:
"""Check if a network interface exists on this host."""
try:
result = subprocess.run( # nosec B603 B607 — hardcoded args
["ip", "link", "show", iface],
capture_output=True, text=True, check=False,
)
return result.returncode == 0
except Exception:
return False
@_traced("sniffer.sniff_loop")
def _sniff_loop(
    interface: str,
    log_path: Path,
    json_path: Path,
    stop_event: threading.Event,
    publish_fn: Callable[[str, str, dict[str, Any]], None] | None = None,
) -> None:
    """Blocking sniff loop.

    Runs in a worker thread: ``sniffer_worker`` submits it to its dedicated
    thread pool with ``loop.run_in_executor``.

    Args:
        interface: Name of the host interface handed to scapy.
        log_path: Raw syslog file that events are appended to.
        json_path: JSON-lines file that parsed events are appended to.
        stop_event: Set by the caller to request shutdown; also set by this
            function when the sniff call ends, which stops the refresh thread.
        publish_fn: Optional ``(decky_name, event_type, payload)`` callback
            passed through to the engine.

    Returns early (after logging) when scapy cannot be imported or when the
    saved state contains no deckies. Exceptions raised by scapy are logged,
    not propagated.
    """
    # Imported lazily so a missing scapy only disables the sniffer.
    try:
        from scapy.sendrecv import sniff
    except ImportError:
        logger.error("scapy not installed — sniffer cannot start")
        return
    ip_map = _load_ip_to_decky()
    if not ip_map:
        logger.warning("sniffer: no deckies in state — nothing to sniff")
        return

    # Binds the two output paths so the engine only has to pass the line.
    def _write_fn(line: str) -> None:
        write_event(line, log_path, json_path)

    engine = SnifferEngine(
        ip_to_decky=ip_map, write_fn=_write_fn, publish_fn=publish_fn,
    )

    # Periodically refresh the IP map in a background daemon thread.
    def _refresh_loop() -> None:
        while not stop_event.is_set():
            # wait() doubles as an interruptible sleep: it returns early
            # as soon as the stop event is set.
            stop_event.wait(_IP_MAP_REFRESH_INTERVAL)
            if stop_event.is_set():
                break
            try:
                new_map = _load_ip_to_decky()
                # An empty reload is ignored, so the previous map stays active.
                if new_map:
                    engine.update_ip_map(new_map)
            except Exception as exc:
                logger.debug("sniffer: ip map refresh failed: %s", exc)

    refresh_thread = threading.Thread(target=_refresh_loop, daemon=True)
    refresh_thread.start()
    logger.info("sniffer: sniffing on interface=%s deckies=%d", interface, len(ip_map))
    try:
        # NOTE(review): scapy documents stop_filter as being applied to each
        # captured packet, so on an idle interface this call presumably does
        # not return until the next TCP packet arrives — confirm.
        sniff(
            iface=interface,
            filter="tcp",
            prn=engine.on_packet,
            store=False,
            stop_filter=lambda pkt: stop_event.is_set(),
        )
    except Exception as exc:
        logger.error("sniffer: scapy sniff exited: %s", exc)
    finally:
        # Also terminates the refresh thread started above.
        stop_event.set()
        logger.info("sniffer: sniff loop ended")
def _select_interface() -> str | None:
    """Pick the host interface to sniff on, or ``None`` when none is usable.

    An explicit ``DECNET_SNIFFER_IFACE`` override wins; otherwise both the
    MACVLAN and IPvlan host-side names are probed, since the driver choice
    is per-deploy (--ipvlan flag). Each candidate is probed exactly once.
    A warning is logged whenever ``None`` is returned.
    """
    env_iface = os.environ.get("DECNET_SNIFFER_IFACE")
    if env_iface:
        if _interface_exists(env_iface):
            return env_iface
        logger.warning(
            "sniffer: interface %s not found — sniffer disabled "
            "(fleet may not be deployed yet)", env_iface,
        )
        return None

    for candidate in (HOST_MACVLAN_IFACE, HOST_IPVLAN_IFACE):
        if _interface_exists(candidate):
            return candidate

    logger.warning(
        "sniffer: neither %s nor %s found — sniffer disabled "
        "(fleet may not be deployed yet)",
        HOST_MACVLAN_IFACE, HOST_IPVLAN_IFACE,
    )
    return None


@_traced("sniffer.worker")
async def sniffer_worker(log_file: str) -> None:
    """
    Async entry point — started as asyncio.create_task in the API lifespan.

    Fully fault-isolated: every exception except cancellation is caught,
    logged, and the coroutine returns cleanly. The API continues running
    regardless of sniffer state.

    Args:
        log_file: Path of the raw syslog file. The JSON log is written next
            to it with the suffix replaced by ``.json``; the parent
            directory is created if needed.

    Raises:
        asyncio.CancelledError: re-raised after the sniff loop has been
            asked to stop and the helper tasks and bus have been cleaned up.
    """
    try:
        interface = _select_interface()
        if interface is None:
            return

        log_path = Path(log_file)
        json_path = log_path.with_suffix(".json")
        log_path.parent.mkdir(parents=True, exist_ok=True)
        stop_event = threading.Event()
        loop = asyncio.get_running_loop()

        # Connect to the bus for decky.{id}.traffic fan-out. Failure here
        # is non-fatal: the sniffer still writes syslog, it just doesn't
        # push notifications to downstream consumers.
        bus: BaseBus | None = None
        try:
            candidate = get_bus(client_name="sniffer")
            await candidate.connect()
            bus = candidate
        except Exception as exc:  # noqa: BLE001
            logger.warning(
                "sniffer: bus unavailable, running in publish-off mode: %s", exc,
            )

        publish_fn: Callable[[str, str, dict[str, Any]], None] | None = None
        if bus is not None:
            publish_fn = _make_decky_traffic_publisher(bus, loop)

        # Workers panel: heartbeat + SIGTERM-based stop control. The
        # sniff loop is a blocking scapy thread, so an asyncio shutdown
        # event can't reach it — translating the bus stop into SIGTERM
        # routes through the existing CancelledError path below.
        # Both tasks are created even when ``bus`` is None, because the
        # cleanup below cancels them unconditionally.
        heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "sniffer"))
        control_task = asyncio.create_task(
            run_control_listener_signal(bus, "sniffer"),
        )

        # Dedicated thread pool so the long-running sniff loop doesn't
        # occupy a slot in the default asyncio executor.
        sniffer_pool = ThreadPoolExecutor(
            max_workers=2, thread_name_prefix="decnet-sniffer",
        )
        try:
            await loop.run_in_executor(
                sniffer_pool, _sniff_loop,
                interface, log_path, json_path, stop_event, publish_fn,
            )
        except asyncio.CancelledError:
            logger.info("sniffer: shutdown requested")
            # Ask the blocking sniff thread to wind down; the pool itself is
            # shut down once, in the ``finally`` below.
            stop_event.set()
            raise
        finally:
            # wait=False: never block the event loop on the sniff thread.
            sniffer_pool.shutdown(wait=False)
            for task in (heartbeat_task, control_task):
                task.cancel()
                with contextlib.suppress(Exception, asyncio.CancelledError):
                    await task
            if bus is not None:
                with contextlib.suppress(Exception):
                    await bus.close()

    except asyncio.CancelledError:
        raise
    except Exception as exc:
        logger.error("sniffer: worker failed — API continues without sniffing: %s", exc)