The decky's Layer-2 extension (commit 291b78c1) emits body_simhash /
body_base64_bytes / html_smuggling on the message_stored log and adds
macro_indicator / encrypted booleans to each attachments_json
manifest entry. Lift them all onto the email.received bus payload:
* body_simhash — passes through as-is (16 hex chars or "")
* body_base64_bytes — coerced to int (0 on absent / malformed)
* attachment_macros / attachment_password_protected — OR-reduced
across the per-attachment manifest booleans; matches R0046's
matched_trigger semantics where a single positive lane fires the
rule
* html_smuggling — coerced bool from the decky's 0/1 int
Pre-Layer-2 message_stored events (older deckies, malformed log
rows) project to safe defaults: empty simhash, zero base64-bytes,
all booleans False — the EmailLifter then stays silent, never
fires a false positive on missing data.
R0042 (mass-phish) / R0046 macro / R0046 password / R0046 smuggling
/ R0048 (encoded payload) all fire end-to-end after this commit.
R0046 mal_hash_match and R0047 BEC remain deferred per their
respective DEBT entries (filed in the next commit).
1366 lines
54 KiB
Python
1366 lines
54 KiB
Python
import asyncio
|
||
import base64
|
||
import contextlib
|
||
import hashlib
|
||
import ipaddress
|
||
import os
|
||
import json
|
||
import re
|
||
import time
|
||
from typing import Any, Optional
|
||
from pathlib import Path
|
||
|
||
from decnet.bus import topics as _topics
|
||
from decnet.bus.factory import get_bus
|
||
from decnet.bus.publish import publish_safely
|
||
from decnet.env import DECNET_BATCH_SIZE, DECNET_BATCH_MAX_WAIT_MS
|
||
from decnet.logging import get_logger
|
||
from decnet.telemetry import (
|
||
traced as _traced,
|
||
get_tracer as _get_tracer,
|
||
extract_context as _extract_ctx,
|
||
start_span_with_context as _start_span,
|
||
)
|
||
from decnet.web.db.repository import BaseRepository
|
||
|
||
logger = get_logger("api")
|
||
|
||
_INGEST_STATE_KEY = "ingest_worker_position"
|
||
|
||
|
||
async def log_ingestion_worker(repo: BaseRepository) -> None:
    """Follow the JSON ingest log and persist its records through ``repo``.

    The log location comes from the ``DECNET_INGEST_LOG_FILE`` environment
    variable with its suffix replaced by ``.json``. When the variable is
    unset or empty a warning is logged and the worker returns at once.

    Reading resumes from the offset stored under ``_INGEST_STATE_KEY``
    (0 when no state is stored). A bus connection is attempted but is
    optional: if it cannot be obtained the loop runs with ``None`` instead.
    """
    configured_path: str | None = os.environ.get("DECNET_INGEST_LOG_FILE")
    if not configured_path:
        logger.warning("DECNET_INGEST_LOG_FILE not set. Log ingestion disabled.")
        return

    json_path: Path = Path(configured_path).with_suffix(".json")

    stored_state = await repo.get_state(_INGEST_STATE_KEY)
    start_offset: int = stored_state.get("position", 0) if stored_state else 0

    logger.info("ingest worker started path=%s position=%d", json_path, start_offset)

    # Optional bus wiring — one system.log event is emitted per committed
    # batch so downstream consumers can follow DB-persisted progress
    # without polling the state table.
    bus = None
    try:
        candidate = get_bus(client_name="ingester")
        await candidate.connect()
    except Exception as exc:
        logger.warning("ingester: bus unavailable, continuing without publish: %s", exc)
    else:
        bus = candidate

    try:
        await _run_loop(repo, json_path, start_offset, bus)
    finally:
        # Closing is best-effort; a failing close must not mask whatever
        # ended the loop.
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||
|
||
|
||
async def _run_loop(
    repo: BaseRepository,
    _json_log_path: Path,
    _position: int,
    _bus: Any,
) -> None:
    """Poll ``_json_log_path`` forever and commit complete lines in batches.

    Args:
        repo: Repository used to persist rows and the resume offset.
        _json_log_path: Newline-delimited JSON file to follow.
        _position: Byte offset to start reading from.
        _bus: Connected bus or ``None``; handed to the flush/publish helpers.

    The loop ends only on a DB error whose message marks the database as
    gone or uninitialised (see the ``except`` branch). Any other
    ``Exception`` is logged and retried after a pause; task cancellation
    is not an ``Exception`` and therefore propagates.
    """
    while True:
        try:
            if not _json_log_path.exists():
                await asyncio.sleep(2)
                continue

            _stat: os.stat_result = _json_log_path.stat()
            if _stat.st_size < _position:
                # File rotated or truncated
                _position = 0
                await repo.set_state(_INGEST_STATE_KEY, {"position": 0})

            if _stat.st_size == _position:
                # No new data
                await asyncio.sleep(1)
                continue

            # Accumulate parsed rows and the file offset they end at. We
            # only advance _position after the batch is successfully
            # committed — if we get cancelled mid-flush, the next run
            # re-reads the un-committed lines rather than losing them.
            _batch: list[tuple[dict[str, Any], int]] = []
            _batch_started: float = time.monotonic()
            _max_wait_s: float = DECNET_BATCH_MAX_WAIT_MS / 1000.0
            # Offset just past the most recent unusable line. Kept apart
            # from _position so that skipping a bad line can never move the
            # resume point beyond rows still waiting in _batch.
            _skip_to: int = 0

            # Binary mode: tell() is then a real byte offset that can be
            # compared with st_size and passed back to seek(). In text mode
            # tell() is documented as an opaque number.
            with open(_json_log_path, "rb") as _f:
                _f.seek(_position)
                while True:
                    _raw: bytes = _f.readline()
                    if not _raw or not _raw.endswith(b"\n"):
                        # EOF or partial line — flush what we have and stop
                        break
                    _line: str = _raw.decode("utf-8", errors="replace")

                    try:
                        _log_data: Any = json.loads(_line.strip())
                    except json.JSONDecodeError:
                        _log_data = None
                    if not isinstance(_log_data, dict):
                        # Covers undecodable lines and valid JSON that is
                        # not an object; both are skipped so the worker
                        # does not loop forever on them.
                        logger.error("ingest: failed to decode JSON log line: %s", _line.strip())
                        _skip_to = _f.tell()
                        if not _batch:
                            # Nothing uncommitted sits before this line, so
                            # the resume point may move past it right away.
                            _position = _skip_to
                        continue

                    # Collector injects trace context so the ingester span
                    # chains off the collector's — full event journey in Jaeger.
                    _parent_ctx = _extract_ctx(_log_data)
                    _tracer = _get_tracer("ingester")
                    with _start_span(_tracer, "ingester.process_record", context=_parent_ctx) as _span:
                        _span.set_attribute("decky", _log_data.get("decky", ""))
                        _span.set_attribute("service", _log_data.get("service", ""))
                        _span.set_attribute("event_type", _log_data.get("event_type", ""))
                        _span.set_attribute("attacker_ip", _log_data.get("attacker_ip", ""))
                        _sctx = getattr(_span, "get_span_context", None)
                        if _sctx:
                            _ctx = _sctx()
                            if _ctx and getattr(_ctx, "trace_id", 0):
                                _log_data["trace_id"] = format(_ctx.trace_id, "032x")
                                _log_data["span_id"] = format(_ctx.span_id, "016x")
                    _batch.append((_log_data, _f.tell()))

                    if len(_batch) >= DECNET_BATCH_SIZE or (
                        time.monotonic() - _batch_started >= _max_wait_s
                    ):
                        _flushed = len(_batch)
                        _committed = await _flush_batch(repo, _batch, _bus)
                        _position = max(_committed, _skip_to)
                        _batch.clear()
                        _batch_started = time.monotonic()
                        await _publish_batch(_bus, _flushed, _committed)

                # Flush any remainder collected before EOF / partial-line break.
                if _batch:
                    _flushed = len(_batch)
                    _committed = await _flush_batch(repo, _batch, _bus)
                    # Unusable lines that followed the last good row are
                    # stepped over only now that the rows before them are
                    # committed.
                    _position = max(_committed, _skip_to)
                    await _publish_batch(_bus, _flushed, _committed)

        except Exception as _e:
            _err_str = str(_e).lower()
            if "no such table" in _err_str or "no active connection" in _err_str or "connection closed" in _err_str:
                logger.error("ingest: post-shutdown or fatal DB error: %s", _e)
                break  # Exit worker — DB is gone or uninitialized

            logger.error("ingest: error in worker: %s", _e)
            await asyncio.sleep(5)

        await asyncio.sleep(1)
|
||
|
||
|
||
async def _publish_batch(bus: Any, flushed: int, position: int) -> None:
|
||
"""Emit one ``system.log`` event summarising a committed batch.
|
||
|
||
Fire-and-forget via :func:`publish_safely`; a dead bus never blocks the
|
||
ingestion loop. Zero-row flushes are suppressed so the topic stays
|
||
meaningful.
|
||
"""
|
||
if bus is None or flushed <= 0:
|
||
return
|
||
await publish_safely(
|
||
bus,
|
||
_topics.system(_topics.SYSTEM_LOG),
|
||
{"component": "ingester", "flushed": flushed, "position": position},
|
||
event_type="batch_committed",
|
||
)
|
||
|
||
|
||
async def _flush_batch(
    repo: BaseRepository,
    batch: list[tuple[dict[str, Any], int]],
    bus: Any = None,
) -> int:
    """Commit a batch of log rows and return the new file position.

    If the enclosing task is being cancelled, bail out without touching
    the DB — the session factory may already be disposed during lifespan
    teardown, and awaiting it would stall the worker. The un-flushed
    lines stay uncommitted; the next startup re-reads them from the
    last persisted position.

    Args:
        repo: Repository receiving the rows, the bounties and the offset.
        batch: Non-empty list of ``(row, end_offset)`` pairs in file order.
        bus: Optional bus forwarded to :func:`_extract_bounty`.

    Returns:
        The end offset of the last row, which is also what gets persisted
        under ``_INGEST_STATE_KEY``.

    Raises:
        asyncio.CancelledError: When the current task is being cancelled.
        IndexError: When ``batch`` is empty.
    """
    _task = asyncio.current_task()
    if _task is not None and _task.cancelling():
        raise asyncio.CancelledError()

    _entries = [_entry for _entry, _ in batch]
    _new_position = batch[-1][1]
    await repo.add_logs(_entries)
    for _entry in _entries:
        # add_logs() has already run for this batch. Letting one failing
        # record abort the flush here would leave the offset un-persisted,
        # so the caller would retry from the old position and hand the same
        # rows to add_logs() again. Bounty extraction is therefore isolated
        # per record: log the failure and carry on with the rest.
        try:
            await _extract_bounty(repo, _entry, bus)
        except Exception as _exc:  # noqa: BLE001
            logger.error("ingest: bounty extraction failed, skipping record: %s", _exc)
    await repo.set_state(_INGEST_STATE_KEY, {"position": _new_position})
    return _new_position
|
||
|
||
|
||
# RFC 5424-ish SD-PARAM-VALUE sanitization, mirrored from auth-helper.c.
|
||
# Bytes outside [0x20, 0x7f) collapse to '?', matching the C escape rule.
|
||
# The hash is always computed over the *original* bytes so reuse queries
|
||
# survive any sanitization on the printable form.
|
||
_SECRET_B64_RE = re.compile(r"^[A-Za-z0-9+/]*={0,2}$")
|
||
_SECRET_PRINTABLE_MAX = 512 # mirrors Credential.secret_printable max_length
|
||
_PRINCIPAL_MAX = 256
|
||
_SECRET_B64_MAX = 2048
|
||
|
||
|
||
def _truncate_with_warn(s: Optional[str], cap: int, label: str) -> Optional[str]:
|
||
if s is None:
|
||
return None
|
||
if len(s) <= cap:
|
||
return s
|
||
logger.warning("ingester: %s truncated %d → %d chars", label, len(s), cap)
|
||
return s[:cap]
|
||
|
||
|
||
async def _ingest_credential_native(
    repo: BaseRepository,
    log_data: dict[str, Any],
    fields: dict[str, Any],
    bus: Any = None,
) -> None:
    """Store a credential whose SD block already carries ``secret_b64``.

    The base64 text is shape-checked and strictly decoded; the SHA-256 is
    taken over the decoded bytes. A value that fails either step is logged
    and dropped — this function then returns without touching the repo.
    On success the row is upserted and a ``CREDENTIAL_CAPTURED`` event is
    handed to :func:`publish_safely`.

    Args:
        repo: Repository providing ``upsert_credential``.
        log_data: The full log record (source of decky / service / IP).
        fields: The record's ``fields`` mapping.
        bus: Bus passed through to :func:`publish_safely`.
    """
    decky = log_data.get("decky")
    service = log_data.get("service")

    encoded = fields.get("secret_b64")
    if not (isinstance(encoded, str) and _SECRET_B64_RE.match(encoded)):
        logger.warning(
            "ingester: dropping credential — invalid secret_b64 from %s/%s",
            decky, service,
        )
        return

    try:
        secret_bytes = base64.b64decode(encoded, validate=True)
    except (ValueError, TypeError):
        logger.warning(
            "ingester: dropping credential — secret_b64 decode failed from %s/%s",
            decky, service,
        )
        return

    digest = hashlib.sha256(secret_bytes).hexdigest()
    kind = fields.get("secret_kind") or "plaintext"
    source_ip = log_data.get("attacker_ip")

    # The stored principal falls back to ``username`` when ``principal``
    # is missing or empty.
    row = {
        "attacker_ip": source_ip,
        "decky_name": decky,
        "service": service,
        "principal": _truncate_with_warn(
            fields.get("principal") or fields.get("username"),
            _PRINCIPAL_MAX,
            "principal",
        ),
        "secret_kind": kind,
        "secret_sha256": digest,
        "secret_b64": _truncate_with_warn(encoded, _SECRET_B64_MAX, "secret_b64"),
        "secret_printable": _truncate_with_warn(
            fields.get("secret_printable"), _SECRET_PRINTABLE_MAX, "secret_printable"
        ),
        "outcome": fields.get("outcome"),
        "fields": fields,  # repo handles json.dumps with ensure_ascii=True
    }
    await repo.upsert_credential(row)

    # Wake the reuse correlator. Fire-and-forget; a dead bus never blocks
    # ingestion. Worker receives this and re-runs the GROUP BY pass.
    notice = {
        "secret_sha256": digest,
        "secret_kind": kind,
        "attacker_ip": source_ip,
        "decky": decky,
        "service": service,
    }
    await publish_safely(
        bus,
        _topics.credential(_topics.CREDENTIAL_CAPTURED),
        notice,
        event_type=_topics.CREDENTIAL_CAPTURED,
    )
|
||
|
||
|
||
@_traced("ingester.extract_bounty")
async def _extract_bounty(
    repo: BaseRepository,
    log_data: dict[str, Any],
    bus: Any = None,
) -> None:
    """Mine one log record for bounties and store each via ``repo.add_bounty``.

    Does nothing unless ``log_data["fields"]`` is a dict. In order, it looks
    for: a native credential (``secret_b64``), an HTTP User-Agent, proxy
    header findings (IP leak, spoofed source, header quirks), a VNC client
    version, sniffer TLS fingerprints, prober fingerprints, and finally
    captured files / stored mail. A stored mail additionally triggers the
    ``email.received`` publish and, for ``smtp_relay``, the probe-pending
    publish.

    Args:
        repo: Repository providing ``add_bounty`` (and the credential upsert).
        log_data: The parsed log record.
        bus: Bus forwarded to the credential path only.
    """
    fields = log_data.get("fields")
    if not isinstance(fields, dict):
        return

    _record_ip = object()  # sentinel: "use the record's own attacker_ip"

    async def _store(bounty_type: str, payload: Any, attacker_ip: Any = _record_ip) -> None:
        """Write one bounty row stamped with this record's decky/service."""
        await repo.add_bounty({
            "decky": log_data.get("decky"),
            "service": log_data.get("service"),
            "attacker_ip": (
                log_data.get("attacker_ip") if attacker_ip is _record_ip else attacker_ip
            ),
            "bounty_type": bounty_type,
            "payload": payload,
        })

    def _pick(*keys: str) -> dict[str, Any]:
        """Copy the named keys out of ``fields`` (absent keys become None)."""
        return {key: fields.get(key) for key in keys}

    # 1. Credentials — every cred-emitting service writes the universal
    # SD shape (`secret_b64` present). The legacy `username`+`password`
    # adapter that bridged FTP/POP3/IMAP/SMTP through DEBT-039 was
    # removed once those services migrated; emitters now feed the
    # native branch directly. Redis (no principal) and LDAP (principal=
    # dn) also land here — they were previously dropped silently.
    if "secret_b64" in fields:
        await _ingest_credential_native(repo, log_data, fields, bus)

    # 2. HTTP User-Agent fingerprint. Headers may arrive as a dict or as a
    # JSON string; anything else (or a string that is not a JSON object)
    # counts as "no headers".
    raw_headers = fields.get("headers")
    headers: dict = {}
    if isinstance(raw_headers, dict):
        headers = raw_headers
    elif isinstance(raw_headers, str):
        try:
            decoded = json.loads(raw_headers)
        except (json.JSONDecodeError, ValueError):
            decoded = None
        if isinstance(decoded, dict):
            headers = decoded

    # Read both casings without `or` short-circuiting: an explicit
    # empty User-Agent is itself a signal and must not collapse to the
    # lowercase fallback.
    user_agent = headers.get("User-Agent")
    if user_agent is None:
        user_agent = headers.get("user-agent")
    if user_agent is not None:
        # Classify: browser / cli / library / scanner / bot / nonstandard
        # / empty. `nonstandard` is the interesting one — UAs like
        # "FUCKYOU/1.0" land there and deserve an analyst's attention.
        # Classification is deterministic given the UA string, so the
        # payload stays dedup-stable across repeat requests.
        ua_category, ua_tool, ua_signals = _classify_ua(user_agent)
        await _store("fingerprint", {
            "fingerprint_type": "http_useragent",
            "value": user_agent,
            "category": ua_category,
            "tool": ua_tool,
            "signals": ua_signals or None,
            # Request context — which endpoint did the tool hit?
            # Useful in the dashboard so analysts see "Nikto hit
            # /admin" rather than just "Nikto seen on this decky".
            **_pick("method", "path"),
        })

    # 2b / 2b.2 / 2c. Header-derived findings, checked in this order:
    #   * IP leak — the attacker's real IP accidentally forwarded in a
    #     proxy-family header; DECNET_TRUSTED_PROXIES opts sources out.
    #   * Spoofed source — a non-routable IP pushed through a proxy header
    #     (classic WAF bypass). Stored as a fingerprint, not an ip_leak,
    #     because the semantic is inverted: attack attempt, not opsec slip.
    #   * Header quirks — order + casing fingerprint; bounty dedup
    #     collapses repeats so it fires once per distinct hash per source.
    header_checks = (
        (_detect_ip_leak, "ip_leak"),
        (_detect_spoofed_source, "fingerprint"),
        (_http_quirks_fingerprint, "fingerprint"),
    )
    for detect, bounty_type in header_checks:
        finding = detect(log_data, headers)
        if finding is not None:
            await _store(bounty_type, finding)

    # 3. VNC client version fingerprint
    vnc_version = fields.get("client_version")
    if vnc_version and log_data.get("event_type") == "version":
        await _store("fingerprint", {
            "fingerprint_type": "vnc_client_version",
            "value": vnc_version,
        })

    # 4. SSH client banner fingerprint (deferred — requires asyncssh server)
    # Fires on: service=ssh, event_type=client_banner, fields.client_banner

    if log_data.get("service") == "sniffer":
        # 5. JA3/JA3S TLS fingerprint from sniffer container
        if fields.get("ja3"):
            await _store("fingerprint", {
                "fingerprint_type": "ja3",
                **_pick("ja3", "ja3s", "ja4", "ja4s", "tls_version"),
                "sni": fields.get("sni") or None,
                "alpn": fields.get("alpn") or None,
                **_pick("dst_port", "raw_ciphers", "raw_extensions"),
            })

        # 6. JA4L latency fingerprint from sniffer
        rtt_ms = fields.get("ja4l_rtt_ms")
        if rtt_ms:
            await _store("fingerprint", {
                "fingerprint_type": "ja4l",
                "rtt_ms": rtt_ms,
                "client_ttl": fields.get("ja4l_client_ttl"),
            })

        # 7. TLS session resumption behavior
        resumption = fields.get("resumption")
        if resumption:
            await _store("fingerprint", {
                "fingerprint_type": "tls_resumption",
                "mechanisms": resumption,
            })

        # 8. TLS certificate details (TLS 1.2 only — passive extraction)
        if fields.get("subject_cn"):
            await _store("fingerprint", {
                "fingerprint_type": "tls_certificate",
                **_pick(
                    "subject_cn", "issuer", "self_signed",
                    "not_before", "not_after", "sans",
                ),
                "sni": fields.get("sni") or None,
            })

    if log_data.get("service") == "prober":
        # Prober rows are attributed to the probed target rather than to
        # the record's attacker_ip.

        # 9. JARM fingerprint from active prober
        jarm = fields.get("jarm_hash")
        if jarm:
            await _store(
                "fingerprint",
                {
                    "fingerprint_type": "jarm",
                    "hash": jarm,
                    **_pick("target_ip", "target_port"),
                },
                attacker_ip=fields.get("target_ip", "Unknown"),
            )

        # 9b. TLS certificate from active prober (sibling of the JARM
        # probe; captured by a follow-up ssl.wrap_socket() against
        # attacker-run TLS servers). Disjoint from clause 8 above which
        # is the sniffer path.
        if fields.get("subject_cn"):
            await _store(
                "fingerprint",
                {
                    "fingerprint_type": "tls_certificate",
                    **_pick(
                        "subject_cn", "issuer", "self_signed",
                        "not_before", "not_after", "sans",
                        "cert_sha256", "target_ip", "target_port",
                    ),
                },
                attacker_ip=fields.get("target_ip", "Unknown"),
            )

        # 10. HASSHServer fingerprint from active prober
        hassh = fields.get("hassh_server_hash")
        if hassh:
            await _store(
                "fingerprint",
                {
                    "fingerprint_type": "hassh_server",
                    "hash": hassh,
                    **_pick(
                        "target_ip", "target_port", "ssh_banner",
                        "kex_algorithms", "encryption_s2c", "mac_s2c",
                        "compression_s2c",
                    ),
                },
                attacker_ip=fields.get("target_ip", "Unknown"),
            )

        # 11. TCP/IP stack fingerprint from active prober
        tcpfp = fields.get("tcpfp_hash")
        if tcpfp:
            await _store(
                "fingerprint",
                {
                    "fingerprint_type": "tcpfp",
                    "hash": tcpfp,
                    "raw": fields.get("tcpfp_raw"),
                    **_pick(
                        "target_ip", "target_port", "ttl", "window_size",
                        "df_bit", "mss", "window_scale", "sack_ok",
                        "timestamp", "options_order",
                    ),
                },
                attacker_ip=fields.get("target_ip", "Unknown"),
            )

    # 12. Captured file drops + stored mail. The `file_captured` event
    # comes from inotifywait quarantines on SSH deckies; `message_stored`
    # comes from the SMTP template's DATA-commit handler. Both are
    # already what AttackerDetail's artifacts/mail tabs read; mirroring
    # them into the Bounty table makes the global Vault page show them
    # alongside credentials and fingerprints. Dedup on (bounty_type,
    # attacker_ip, payload); sha256 in the payload guarantees per-drop
    # uniqueness so repeat captures don't multiply rows.
    event_type = log_data.get("event_type")
    if event_type == "file_captured" and fields.get("stored_as"):
        await _store("artifact", {
            "kind": "file",
            **_pick(
                "stored_as", "sha256", "size",
                "orig_path", "attribution", "writer_comm",
            ),
        })
    elif event_type == "message_stored" and fields.get("stored_as"):
        await _store("artifact", {
            "kind": "mail",
            **_pick(
                "stored_as", "sha256", "size", "subject", "from_hdr",
                "to_hdr", "mail_from", "rcpt_to", "attachment_count",
                "content_type",
            ),
        })
        # Fan the captured email out to the TTP worker — same hook
        # ``attacker.intel.enriched`` uses, but for the EmailLifter
        # (R0041–R0048). Always fires for both relay and non-relay
        # services; the TTP path doesn't care which mode the decky was
        # in. Best-effort — bus-down or unresolved attacker UUID
        # downgrades to a no-op silently.
        await _publish_email_received(repo, log_data, fields)
        # Signal the realism worker to forward this as a probe if it's the
        # first message from this IP on an smtp_relay decky. The worker has
        # real internet access (the container is on MACVLAN and doesn't).
        if log_data.get("service") == "smtp_relay":
            await _publish_probe_pending(log_data, fields)
|
||
|
||
|
||
_RCPT_SPLIT_RE = re.compile(r"[,\s]+")
|
||
_ADDR_AT_RE = re.compile(r"@([A-Za-z0-9.\-]+)")
|
||
|
||
|
||
def _domain_of(addr_or_header: str | None) -> str | None:
|
||
"""Extract the trailing domain from an address-like string.
|
||
|
||
Tolerant of full RFC 5322 headers (``"Name" <a@b.com>``), bare
|
||
addresses (``a@b.com``), and angle-bracketed envelope values
|
||
(``<a@b.com>``). Returns the domain lowercased, or ``None`` when
|
||
no ``@``-delimited domain is present. The EmailLifter expects
|
||
just the domain — display names and local-parts ride in the
|
||
artifact, not on the bus.
|
||
"""
|
||
if not addr_or_header:
|
||
return None
|
||
match = _ADDR_AT_RE.search(addr_or_header)
|
||
if not match:
|
||
return None
|
||
domain = match.group(1).strip(".").lower()
|
||
return domain or None
|
||
|
||
|
||
def _rcpt_projection(rcpt_to_field: str | None) -> tuple[int, list[str]]:
|
||
"""Split the comma-or-whitespace-separated RCPT TO blob into
|
||
``(count, unique_domains_first_seen_order)``."""
|
||
if not rcpt_to_field:
|
||
return 0, []
|
||
parts = [p for p in _RCPT_SPLIT_RE.split(rcpt_to_field) if p]
|
||
domains: dict[str, None] = {}
|
||
for part in parts:
|
||
domain = _domain_of(part)
|
||
if domain:
|
||
domains.setdefault(domain, None)
|
||
return len(parts), list(domains.keys())
|
||
|
||
|
||
def _attachment_extensions(manifest: list[dict]) -> list[str]:
|
||
"""Return unique lowercased file extensions (with dot) from the
|
||
attachment manifest, preserving first-seen order. ``"payload.exe"``
|
||
→ ``".exe"``; missing / dotless filenames are skipped."""
|
||
seen: dict[str, None] = {}
|
||
for entry in manifest:
|
||
if not isinstance(entry, dict):
|
||
continue
|
||
filename = entry.get("filename") or ""
|
||
if not isinstance(filename, str):
|
||
continue
|
||
idx = filename.rfind(".")
|
||
if idx < 0 or idx == len(filename) - 1:
|
||
continue
|
||
ext = filename[idx:].lower()
|
||
seen.setdefault(ext, None)
|
||
return list(seen.keys())
|
||
|
||
|
||
async def _publish_email_received(
    repo: BaseRepository, log_data: dict, fields: dict,
) -> None:
    """Project the SMTP capture event onto the EmailLifter wire contract.

    Mirrors :func:`_publish_probe_pending`: a fresh bus connection per
    publish, fire-and-forget, with bus failures logged at debug level
    (the bus is the notification layer, not the source of truth).
    Producer for ``email.received`` per the 2026-05-02 DEBT #3 paydown.

    Nothing is published when the record has no ``attacker_ip`` or when
    the IP cannot be resolved to an attacker UUID.

    Args:
        repo: Repository providing ``get_attacker_uuid_by_ip``.
        log_data: The full log record.
        fields: The record's ``fields`` mapping for a stored message.
    """
    attacker_ip = log_data.get("attacker_ip")
    try:
        attacker_uuid = (
            await repo.get_attacker_uuid_by_ip(attacker_ip)
            if attacker_ip else None
        )
    except Exception as exc:  # noqa: BLE001
        logger.debug("email_received: attacker resolve failed: %s", exc)
        attacker_uuid = None
    if not attacker_uuid:
        # Without an attacker_uuid the TTP worker cannot anchor the
        # tags to a row. Drop rather than emit an orphan event.
        return

    def _json_list(raw: Any) -> list:
        """Decode a JSON array field; anything else becomes ``[]``."""
        try:
            parsed = json.loads(raw or "[]")
        except (TypeError, ValueError):
            return []
        return parsed if isinstance(parsed, list) else []

    # The decky writes ``dkim_signed`` / ``spf_pass`` as 0/1 ints
    # because syslog SD-values are strings; coerce back to the bool
    # the EmailLifter predicates check with ``is True`` / ``is False``.
    def _to_bool(raw: Any) -> bool:
        if isinstance(raw, bool):
            return raw
        if isinstance(raw, (int, float)):
            return raw != 0
        if isinstance(raw, str):
            return raw.strip() in {"1", "true", "True", "yes"}
        return False

    rcpt_count, rcpt_domains = _rcpt_projection(fields.get("rcpt_to"))
    attachment_manifest = _json_list(fields.get("attachments_json"))
    urls = _json_list(fields.get("urls_json"))

    attachment_sha256s = [
        entry.get("sha256") for entry in attachment_manifest
        if isinstance(entry, dict) and isinstance(entry.get("sha256"), str)
        and entry.get("sha256")
    ]

    # Reduce per-attachment booleans (added by the decky's
    # _summarize_message Layer-2 extension) to top-level rule fields.
    # OR across all attachments — R0046 fires on a single positive.
    # NOTE(review): plain truthiness is used here, so a string such as
    # "0" would count as positive — confirm the manifest carries real
    # JSON booleans.
    attachment_macros = any(
        bool(entry.get("macro_indicator"))
        for entry in attachment_manifest
        if isinstance(entry, dict)
    )
    attachment_password_protected = any(
        bool(entry.get("encrypted"))
        for entry in attachment_manifest
        if isinstance(entry, dict)
    )

    body_base64_bytes_raw = fields.get("body_base64_bytes")
    try:
        body_base64_bytes = (
            int(body_base64_bytes_raw)
            if body_base64_bytes_raw is not None else 0
        )
    except (TypeError, ValueError, OverflowError):
        # OverflowError: int() of a float infinity.
        body_base64_bytes = 0

    payload: dict[str, Any] = {
        "source_id": fields.get("msg_id") or fields.get("stored_as"),
        "attacker_uuid": attacker_uuid,
        "attacker_ip": attacker_ip,
        "decky_id": log_data.get("decky"),
        "service": log_data.get("service"),
        "subject": fields.get("subject"),
        "from_domain": _domain_of(fields.get("from_hdr")),
        "mail_from_domain": _domain_of(fields.get("mail_from")),
        "return_path_domain": _domain_of(fields.get("return_path")),
        "rcpt_count": rcpt_count,
        "rcpt_domains": rcpt_domains,
        "x_mailer": fields.get("x_mailer") or None,
        "dkim_signed": _to_bool(fields.get("dkim_signed")),
        "spf_pass": _to_bool(fields.get("spf_pass")),
        "urls": [u for u in urls if isinstance(u, str)],
        "attachment_count": fields.get("attachment_count"),
        "attachment_sha256s": attachment_sha256s,
        "attachment_extensions": _attachment_extensions(attachment_manifest),
        # Heavyweight Layer-2 fields consumed by R0042 / R0046 / R0048.
        # body_simhash is a 16-hex-char string per the decky's
        # _body_simhash; ``""`` when no body text was extractable —
        # the lifter's _p_mass_phish predicate rejects non-strings, so
        # an empty string is the right "no signal" value (predicate
        # accepts str|int and treats "" as falsy via the rcpt threshold
        # gate fallback).
        "body_simhash": fields.get("body_simhash") or "",
        "body_base64_bytes": body_base64_bytes,
        "attachment_macros": attachment_macros,
        "attachment_password_protected": attachment_password_protected,
        "html_smuggling": _to_bool(fields.get("html_smuggling")),
        "stored_as": fields.get("stored_as"),
        "body_sha256": fields.get("sha256"),
    }
    try:
        bus = get_bus(client_name="ingester-email")
        await bus.connect()
        try:
            await publish_safely(
                bus,
                _topics.email_topic(_topics.EMAIL_RECEIVED),
                payload,
                event_type=_topics.EMAIL_RECEIVED,
            )
        finally:
            # Always release the per-publish connection, including when
            # the publish is interrupted (e.g. by task cancellation).
            with contextlib.suppress(Exception):
                await bus.close()
    except Exception as exc:  # noqa: BLE001
        logger.debug("email.received publish failed: %s", exc)
|
||
|
||
|
||
async def _publish_probe_pending(log_data: dict, fields: dict) -> None:
    """Best-effort publish of an SMTP ``probe.pending`` event on the bus.

    Opens a short-lived bus client named ``ingester-probe``, publishes the
    ``decky`` / ``attacker_ip`` values from *log_data* together with the
    ``stored_as`` / ``mail_from`` / ``rcpt_to`` values from *fields*, then
    closes the client.  Any failure is logged at DEBUG and swallowed, so a
    bus problem never propagates to the caller.
    """
    try:
        bus = get_bus(client_name="ingester-probe")
        await bus.connect()
        try:
            await publish_safely(
                bus,
                _topics.smtp("probe.pending"),
                {
                    "decky": log_data.get("decky"),
                    "attacker_ip": log_data.get("attacker_ip"),
                    "stored_as": fields.get("stored_as"),
                    "mail_from": fields.get("mail_from"),
                    "rcpt_to": fields.get("rcpt_to"),
                },
                event_type="probe.pending",
            )
        finally:
            # Close on every exit path (publish error, task cancellation)
            # so a failed publish cannot leak the connected client.
            await bus.close()
    except Exception as exc:  # noqa: BLE001
        logger.debug("probe pending publish failed: %s", exc)
|
||
|
||
|
||
# ─── IP-leak detection (XFF / Forwarded / X-Real-IP / CDN variants) ──────────
|
||
|
||
# Proxy-family headers we inspect, in priority order. Forwarded (RFC 7239)
|
||
# is the "proper" way; X-Forwarded-For is de-facto; X-Real-IP and CDN
|
||
# variants are common nginx / CloudFlare conventions.
|
||
_PROXY_HEADERS = (
|
||
"Forwarded",
|
||
"X-Forwarded-For",
|
||
"X-Real-IP",
|
||
"True-Client-IP",
|
||
"CF-Connecting-IP",
|
||
)
|
||
|
||
# RFC 7239 `Forwarded: for=1.2.3.4` / `for="[2001:db8::1]:4711"`. The
|
||
# capture grabs the raw for= value up to the next pair/element
|
||
# delimiter (; or ,) or end-of-string; _parse_forwarded strips quotes
|
||
# / IPv6 brackets / port afterwards.
|
||
_FORWARDED_KV_RE = re.compile(
|
||
r'for\s*=\s*"?([^",;]+?)"?(?=[;,]|$)',
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
|
||
def _get_trusted_proxies() -> list[ipaddress._BaseNetwork]:
|
||
"""Parse DECNET_TRUSTED_PROXIES once per process into network objects.
|
||
|
||
Empty / unset → empty list (no opt-outs). Malformed entries are logged
|
||
at WARNING and silently dropped — a typo in the env shouldn't brick
|
||
the ingester.
|
||
"""
|
||
raw = os.environ.get("DECNET_TRUSTED_PROXIES", "")
|
||
out: list[ipaddress._BaseNetwork] = []
|
||
for token in raw.split(","):
|
||
token = token.strip()
|
||
if not token:
|
||
continue
|
||
try:
|
||
# Accept both bare IPs ("1.2.3.4") and CIDRs ("10.0.0.0/8").
|
||
if "/" in token:
|
||
out.append(ipaddress.ip_network(token, strict=False))
|
||
else:
|
||
out.append(ipaddress.ip_network(f"{token}/32", strict=False))
|
||
except (ValueError, TypeError) as exc:
|
||
logger.warning("DECNET_TRUSTED_PROXIES: ignoring %r: %s", token, exc)
|
||
return out
|
||
|
||
|
||
def _is_trusted_source(source_ip: str) -> bool:
|
||
try:
|
||
addr = ipaddress.ip_address(source_ip)
|
||
except (ValueError, TypeError):
|
||
return False
|
||
for net in _get_trusted_proxies():
|
||
try:
|
||
if addr in net:
|
||
return True
|
||
except (ValueError, TypeError):
|
||
continue
|
||
return False
|
||
|
||
|
||
def _lookup_header(headers: dict[str, Any], name: str) -> Optional[str]:
|
||
"""Case-insensitive header fetch; HTTP template logs headers as-received."""
|
||
lowered = name.lower()
|
||
for k, v in headers.items():
|
||
if isinstance(k, str) and k.lower() == lowered:
|
||
if isinstance(v, str) and v.strip():
|
||
return v.strip()
|
||
return None
|
||
|
||
|
||
def _parse_forwarded(value: str) -> Optional[str]:
    """Extract the IP of the first ``for=`` pair in an RFC 7239 Forwarded header.

    Accepts the bare IPv4 form (`for=1.2.3.4`), IPv4 with port
    (`for=1.2.3.4:80`) and the quoted, bracketed IPv6 form with optional
    port (`for="[2001:db8::1]:4711"`).  Only the first ``for=`` match is
    inspected; returns None when there is none or when its value is not a
    valid IP address.
    """
    found = _FORWARDED_KV_RE.search(value)
    candidate = found.group(1).strip() if found else ""
    if not candidate:
        return None

    if candidate.startswith("["):
        # Bracketed IPv6, optionally followed by :port — keep the inside.
        closing = candidate.find("]")
        if closing > 0:
            candidate = candidate[1:closing]
    elif candidate.count(":") == 1:
        # Exactly one colon means IPv4:port; bare IPv6 literals have ≥2
        # colons and are left untouched.
        candidate = candidate.partition(":")[0]

    try:
        ipaddress.ip_address(candidate)
    except (ValueError, TypeError):
        return None
    return candidate
|
||
|
||
|
||
def _parse_xff_chain(value: str) -> Optional[str]:
|
||
"""Return the left-most parseable IP from an X-Forwarded-For chain."""
|
||
for token in value.split(","):
|
||
token = token.strip().strip('"').lstrip("[").rstrip("]")
|
||
if not token:
|
||
continue
|
||
try:
|
||
ipaddress.ip_address(token)
|
||
except (ValueError, TypeError):
|
||
continue
|
||
return token
|
||
return None
|
||
|
||
|
||
def _extract_claimed_ip(headers: dict[str, Any]) -> tuple[Optional[str], Optional[str]]:
    """Find the first proxy header that yields a valid claimed IP.

    Headers are tried in ``_PROXY_HEADERS`` priority order.  Returns
    ``(claimed_ip, header_name)`` for the first header whose value parses,
    or ``(None, None)`` when no header is present or none parses.
    """

    def _single_ip(raw_value: str) -> Optional[str]:
        # Single-IP headers: drop surrounding quotes / IPv6 brackets and
        # validate.  A ":port" suffix is not removed here, so such a
        # value fails validation and yields None.
        token = raw_value.strip().strip('"').lstrip("[").rstrip("]")
        try:
            ipaddress.ip_address(token)
        except (ValueError, TypeError):
            return None
        return token

    for header in _PROXY_HEADERS:
        raw = _lookup_header(headers, header)
        if raw is None:
            continue
        if header == "Forwarded":
            claimed = _parse_forwarded(raw)
        elif header == "X-Forwarded-For":
            claimed = _parse_xff_chain(raw)
        else:
            claimed = _single_ip(raw)
        if claimed is not None:
            return claimed, header
    return None, None
|
||
|
||
|
||
def _categorize_claimed_ip(ip: str) -> str:
|
||
"""Return a category label for a claimed IP string.
|
||
|
||
Public routable addresses are potential real-IP leaks. Anything
|
||
else (loopback, private, link-local, multicast, reserved,
|
||
unspecified) is almost certainly a forgery — XFF spoofing is the
|
||
classic WAF-bypass / IP-allowlist trick. Callers branch on this:
|
||
``public`` → :data:`ip_leak` bounty, anything else →
|
||
``spoofed_source`` fingerprint bounty.
|
||
"""
|
||
try:
|
||
addr = ipaddress.ip_address(ip)
|
||
except (ValueError, TypeError):
|
||
return "unparseable"
|
||
if addr.is_unspecified:
|
||
return "unspecified"
|
||
if addr.is_loopback:
|
||
return "loopback"
|
||
if addr.is_link_local:
|
||
return "link_local"
|
||
if addr.is_multicast:
|
||
return "multicast"
|
||
if addr.is_reserved:
|
||
return "reserved"
|
||
if addr.is_private:
|
||
return "private"
|
||
return "public"
|
||
|
||
|
||
def _classify_proxy_header_claim(
|
||
log_data: dict[str, Any], headers: dict[str, Any],
|
||
) -> Optional[tuple[str, dict[str, Any]]]:
|
||
"""Shared worker for the two XFF-family detectors.
|
||
|
||
Returns ``(kind, payload)`` where ``kind`` is ``"leak"`` (public
|
||
claim, real attribution leak) or ``"spoof"`` (non-routable claim,
|
||
WAF-bypass attempt). Returns ``None`` for non-HTTP / trusted-proxy
|
||
source / no proxy header / claim matches source / unparseable claim.
|
||
"""
|
||
if log_data.get("service") != "http":
|
||
return None
|
||
if not isinstance(headers, dict) or not headers:
|
||
return None
|
||
source_ip = log_data.get("attacker_ip")
|
||
if not isinstance(source_ip, str) or not source_ip:
|
||
return None
|
||
if _is_trusted_source(source_ip):
|
||
return None
|
||
|
||
claimed, header_name = _extract_claimed_ip(headers)
|
||
if claimed is None or claimed == source_ip:
|
||
return None
|
||
|
||
category = _categorize_claimed_ip(claimed)
|
||
if category == "unparseable":
|
||
return None
|
||
|
||
seen: dict[str, str] = {}
|
||
for h in _PROXY_HEADERS:
|
||
raw = _lookup_header(headers, h)
|
||
if raw is not None:
|
||
seen[h] = raw
|
||
|
||
base = {
|
||
"source_ip": source_ip,
|
||
"claimed_ip": claimed,
|
||
"source_header": header_name,
|
||
"headers_seen": seen,
|
||
"claim_category": category,
|
||
}
|
||
return ("leak" if category == "public" else "spoof"), base
|
||
|
||
|
||
def _detect_ip_leak(
    log_data: dict[str, Any], headers: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """Build an ip_leak bounty payload for a PUBLIC proxy-claim mismatch.

    That is the case of an attacker whose misconfigured VPN / proxy
    forwarded their real routable IP in an XFF-family header.  Returns
    ``None`` when the shared classifier reports nothing or reports a
    spoofing attempt (loopback / private / link-local / etc.); those land
    as ``spoofed_source`` fingerprints instead.
    """
    classified = _classify_proxy_header_claim(log_data, headers)
    if classified is None:
        return None
    kind, details = classified
    if kind != "leak":
        return None
    # The claim is exposed under the legacy key "real_ip_claim" so
    # existing UI consumers (AttackerDetail "LEAKED IPs" row, repo JSON
    # decode) keep working.
    details["real_ip_claim"] = details.pop("claimed_ip")
    # Always "public" for leaks, so it carries no information here.
    details.pop("claim_category", None)
    return details
|
||
|
||
|
||
def _detect_spoofed_source(
    log_data: dict[str, Any], headers: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """Build a fingerprint payload for a NON-ROUTABLE proxy claim.

    Fires when the attacker passed loopback / private / link-local /
    reserved / etc. in an XFF-family header — the classic IP-allowlist /
    WAF-bypass trick (``curl -H 'X-Forwarded-For: 127.0.0.1'``), hoping
    an upstream WAF sees "localhost" and waves them through.  No real IP
    is leaked; the request shows the sender knows what the header does.

    Returns ``None`` when the shared classifier reports nothing or a
    public-IP leak.  The caller wraps the result in
    ``bounty_type="fingerprint"`` with
    ``fingerprint_type="spoofed_source"``.
    """
    classified = _classify_proxy_header_claim(log_data, headers)
    if classified is None:
        return None
    kind, details = classified
    if kind != "spoof":
        return None
    # fingerprint_type goes first; the UI renderer dispatches on it.
    fingerprint: dict[str, Any] = {"fingerprint_type": "spoofed_source"}
    fingerprint.update(details)
    return fingerprint
|
||
|
||
|
||
# ─── HTTP header quirks fingerprint ─────────────────────────────────────────
|
||
|
||
# Headers that vary with per-request content (payload-body size, cookies
|
||
# set by prior responses) and therefore aren't useful identity. Stripped
|
||
# before hashing so a tool's order fingerprint is stable across different
|
||
# targets/sessions.
|
||
_VOLATILE_HEADERS = frozenset({
|
||
"content-length",
|
||
"cookie",
|
||
"authorization",
|
||
"x-forwarded-for", # carries attacker-dependent values
|
||
"forwarded",
|
||
"x-real-ip",
|
||
"true-client-ip",
|
||
"cf-connecting-ip",
|
||
"x-request-id",
|
||
"x-correlation-id",
|
||
"x-amzn-trace-id",
|
||
})
|
||
|
||
|
||
# Distinctive order signatures for common tools. The match is on the
|
||
# lowercased-name list MINUS the volatile set. A prefix match wins —
|
||
# many tools tack on "User-Agent / Accept-Encoding / Accept" in the
|
||
# same order regardless of method.
|
||
_TOOL_SIGNATURES: tuple[tuple[str, tuple[str, ...]], ...] = (
|
||
# curl sends: Host, User-Agent, Accept, <body-headers>.
|
||
("curl", ("host", "user-agent", "accept")),
|
||
# python-requests: User-Agent, Accept-Encoding, Accept, Connection, Host.
|
||
("python-requests", ("host", "user-agent", "accept-encoding", "accept", "connection")),
|
||
# Go-http-client: Host, User-Agent, Accept-Encoding.
|
||
("go-http-client", ("host", "user-agent", "accept-encoding")),
|
||
# nmap http-enum / http-* scripts: short, Host+User-Agent ordering.
|
||
("nmap-nse", ("host", "user-agent")),
|
||
# Nikto / Nuclei send distinctive Accept-Language preferences — treat
|
||
# User-Agent check as the secondary signal elsewhere; order alone is
|
||
# ambiguous here.
|
||
)
|
||
|
||
|
||
def _casing_category(name: str) -> str:
|
||
"""Classify a header-name casing pattern.
|
||
|
||
Real HTTP clients and stacks pick one convention and stick to it:
|
||
browsers send `Title-Case`; python-requests sends `Title-Case`;
|
||
Go's stdlib canonicalises to `Title-Case`; curl sends literal
|
||
`Title-Case`; nmap/masscan often send `lowercase`; custom scanners
|
||
sometimes send `UPPERCASE`.
|
||
"""
|
||
if not name:
|
||
return "empty"
|
||
if name == name.upper():
|
||
return "upper"
|
||
if name == name.lower():
|
||
return "lower"
|
||
# "Title-Case" test: each dash-separated token starts with an
|
||
# uppercase; trailing chars (if any) must be lowercase. Single-
|
||
# letter tokens like the `X` in `X-Forwarded-For` qualify when
|
||
# uppercase — "".islower() is False in Python so the naive form
|
||
# of this test misfires.
|
||
parts = [p for p in name.split("-") if p]
|
||
if parts and all(
|
||
p[:1].isupper() and (len(p) == 1 or p[1:].islower())
|
||
for p in parts
|
||
):
|
||
return "title"
|
||
return "mixed"
|
||
|
||
|
||
def _short_hash(value: str) -> str:
|
||
"""16-hex-char SHA-256 prefix — stable identity, short display."""
|
||
import hashlib
|
||
return hashlib.sha256(value.encode("utf-8")).hexdigest()[:16]
|
||
|
||
|
||
def _guess_tool_from_order(lowered: list[str]) -> Optional[str]:
    """Name the first tool in ``_TOOL_SIGNATURES`` whose signature is a prefix of *lowered*.

    *lowered* is the list of lowercased header names in request order.
    Returns None when no signature matches.
    """
    return next(
        (
            tool
            for tool, signature in _TOOL_SIGNATURES
            # A list shorter than the signature can never match it.
            if len(signature) <= len(lowered)
            and tuple(lowered[: len(signature)]) == signature
        ),
        None,
    )
|
||
|
||
|
||
def _http_quirks_fingerprint(
|
||
log_data: dict[str, Any], headers: dict[str, Any],
|
||
) -> Optional[dict[str, Any]]:
|
||
"""Build an HTTP request-header quirks fingerprint.
|
||
|
||
Captures the header-order hash, casing pattern, count, and a
|
||
best-effort tool guess. Returns ``None`` for non-HTTP services or
|
||
when no usable headers are present. Bounty dedup will collapse
|
||
repeat fingerprints from the same attacker.
|
||
"""
|
||
if log_data.get("service") != "http":
|
||
return None
|
||
if not isinstance(headers, dict) or not headers:
|
||
return None
|
||
|
||
# Preserve insertion order (Python 3.7+ dict guarantee, and JSON
|
||
# round-trip also preserves it). Drop volatile headers for the
|
||
# identity hash but keep them in the display order list.
|
||
names_full: list[str] = [k for k in headers.keys() if isinstance(k, str)]
|
||
if not names_full:
|
||
return None
|
||
|
||
names_stable = [n for n in names_full if n.lower() not in _VOLATILE_HEADERS]
|
||
lowered = [n.lower() for n in names_stable]
|
||
|
||
order_hash = _short_hash("\n".join(lowered))
|
||
casing_per_header = [_casing_category(n) for n in names_stable]
|
||
casing_hash = _short_hash("\n".join(casing_per_header))
|
||
|
||
# A single "dominant" casing category — useful for at-a-glance display.
|
||
categories = set(casing_per_header)
|
||
if not categories:
|
||
dominant = "empty"
|
||
elif len(categories) == 1:
|
||
dominant = next(iter(categories))
|
||
else:
|
||
dominant = "mixed"
|
||
|
||
# Identity-only payload — every field must be stable for two
|
||
# requests from the same client stack. add_bounty dedups on the
|
||
# full payload JSON, so a per-request-varying key (path, method,
|
||
# header_count when Cookie presence varies) would spawn one row
|
||
# per request. The hashes ARE the identity; per-request context
|
||
# lives in the logs table.
|
||
return {
|
||
"fingerprint_type": "http_quirks",
|
||
"order_hash": order_hash,
|
||
"order": names_stable,
|
||
"casing_hash": casing_hash,
|
||
"casing_category": dominant,
|
||
"stable_count": len(names_stable),
|
||
"tool_guess": _guess_tool_from_order(lowered),
|
||
}
|
||
|
||
|
||
# ─── User-Agent classifier ──────────────────────────────────────────────────
|
||
#
|
||
# Bucket UAs into one of {browser, cli, library, scanner, bot, nonstandard,
|
||
# empty}, and surface optional `tool` name + `signals` list (suspicious_short
|
||
# / suspicious_long / nonprintable / injection_like). The main analytic
|
||
# value is `nonstandard` — UAs that don't match any known pattern are
|
||
# either custom tooling, adversarial labels ("FUCKYOU/1.0"), or
|
||
# misconfigured scanners that deserve an analyst's eye.
|
||
|
||
_UA_BROWSER_RE = re.compile(r"^Mozilla/\d")
|
||
# Substring match without word boundaries so "bingbot", "Googlebot",
|
||
# "Baiduspider" etc. register. Downside: matches "robot" or "spidery"
|
||
# in pathological payloads — acceptable at this classifier's precision.
|
||
_UA_BOT_RE = re.compile(r"(bot|crawler|spider|slurp|monitor)", re.IGNORECASE)
|
||
|
||
# Order matters inside each bucket — first match wins, so list the more
|
||
# specific pattern first (e.g. python-requests before Python/).
|
||
_UA_CLI_RES: tuple[tuple[re.Pattern[str], str], ...] = (
|
||
(re.compile(r"^curl/", re.IGNORECASE), "curl"),
|
||
(re.compile(r"^Wget/", re.IGNORECASE), "wget"),
|
||
(re.compile(r"^HTTPie/", re.IGNORECASE), "httpie"),
|
||
(re.compile(r"^xh/", re.IGNORECASE), "xh"),
|
||
(re.compile(r"^fetch/", re.IGNORECASE), "fetch"),
|
||
)
|
||
|
||
_UA_LIBRARY_RES: tuple[tuple[re.Pattern[str], str], ...] = (
|
||
(re.compile(r"^python-requests/", re.IGNORECASE), "python-requests"),
|
||
(re.compile(r"^aiohttp/", re.IGNORECASE), "aiohttp"),
|
||
(re.compile(r"^httpx/", re.IGNORECASE), "httpx"),
|
||
(re.compile(r"^urllib/", re.IGNORECASE), "urllib"),
|
||
(re.compile(r"^Python-urllib/", re.IGNORECASE), "urllib"),
|
||
(re.compile(r"^Python/\d", re.IGNORECASE), "python-stdlib"),
|
||
(re.compile(r"^Go-http-client/", re.IGNORECASE), "go-stdlib"),
|
||
(re.compile(r"^go-resty/", re.IGNORECASE), "go-resty"),
|
||
(re.compile(r"^Java/\d", re.IGNORECASE), "java-stdlib"),
|
||
(re.compile(r"^okhttp/", re.IGNORECASE), "okhttp"),
|
||
(re.compile(r"^Apache-HttpClient/", re.IGNORECASE), "apache-httpclient"),
|
||
(re.compile(r"^Jersey/", re.IGNORECASE), "jersey"),
|
||
(re.compile(r"^axios/", re.IGNORECASE), "axios"),
|
||
(re.compile(r"^node-fetch/", re.IGNORECASE), "node-fetch"),
|
||
(re.compile(r"^got\s?\(|^got/", re.IGNORECASE), "got"),
|
||
(re.compile(r"^undici", re.IGNORECASE), "undici"),
|
||
(re.compile(r"^PHP/\d", re.IGNORECASE), "php-stdlib"),
|
||
(re.compile(r"GuzzleHttp/", re.IGNORECASE), "guzzle"),
|
||
(re.compile(r"^Ruby\b", re.IGNORECASE), "ruby-stdlib"),
|
||
(re.compile(r"^Faraday\b", re.IGNORECASE), "faraday"),
|
||
(re.compile(r"^HTTParty", re.IGNORECASE), "httparty"),
|
||
(re.compile(r"^\.NET/|System\.Net\.Http|RestSharp/", re.IGNORECASE), "dotnet"),
|
||
(re.compile(r"^PostmanRuntime/", re.IGNORECASE), "postman"),
|
||
(re.compile(r"^Insomnia/", re.IGNORECASE), "insomnia"),
|
||
)
|
||
|
||
_UA_SCANNER_RES: tuple[tuple[re.Pattern[str], str], ...] = (
|
||
(re.compile(r"\bnmap\b", re.IGNORECASE), "nmap"),
|
||
(re.compile(r"\bmasscan\b", re.IGNORECASE), "masscan"),
|
||
(re.compile(r"\bzgrab\b", re.IGNORECASE), "zgrab"),
|
||
(re.compile(r"\bzmap\b", re.IGNORECASE), "zmap"),
|
||
(re.compile(r"\bNuclei\b", re.IGNORECASE), "nuclei"),
|
||
(re.compile(r"\bsqlmap\b", re.IGNORECASE), "sqlmap"),
|
||
(re.compile(r"\bgobuster\b", re.IGNORECASE), "gobuster"),
|
||
(re.compile(r"\bdirb\b", re.IGNORECASE), "dirb"),
|
||
(re.compile(r"\bdirbuster\b", re.IGNORECASE), "dirbuster"),
|
||
(re.compile(r"\bnikto\b", re.IGNORECASE), "nikto"),
|
||
(re.compile(r"\bferoxbuster\b", re.IGNORECASE), "feroxbuster"),
|
||
(re.compile(r"\bwfuzz\b", re.IGNORECASE), "wfuzz"),
|
||
(re.compile(r"\bffuf\b", re.IGNORECASE), "ffuf"),
|
||
(re.compile(r"\bwpscan\b", re.IGNORECASE), "wpscan"),
|
||
(re.compile(r"\bkatana\b", re.IGNORECASE), "katana"),
|
||
(re.compile(r"\bBurp\b", re.IGNORECASE), "burp"),
|
||
(re.compile(r"\bAcunetix\b", re.IGNORECASE), "acunetix"),
|
||
(re.compile(r"\bNessus\b", re.IGNORECASE), "nessus"),
|
||
(re.compile(r"\bOpenVAS\b", re.IGNORECASE), "openvas"),
|
||
(re.compile(r"\bArachni\b", re.IGNORECASE), "arachni"),
|
||
(re.compile(r"\bWhatWeb\b", re.IGNORECASE), "whatweb"),
|
||
(re.compile(r"\bWappalyzer\b", re.IGNORECASE), "wappalyzer"),
|
||
(re.compile(r"\bSploitScan\b", re.IGNORECASE), "sploitscan"),
|
||
)
|
||
|
||
# Substring markers that strongly suggest a payload attempt embedded in
|
||
# the UA itself. Attackers sometimes park SQLi / path traversal / XSS
|
||
# test strings in User-Agent hoping a middleware or log-ingestion tool
|
||
# mishandles it.
|
||
_UA_INJECTION_MARKERS: tuple[str, ...] = (
|
||
"<script",
|
||
"';",
|
||
"' or '",
|
||
"' or 1",
|
||
"1=1",
|
||
"' --",
|
||
"/../",
|
||
"/etc/passwd",
|
||
"${jndi:", # Log4Shell
|
||
"{{", # SSTI
|
||
)
|
||
|
||
|
||
def _classify_ua(ua: str) -> tuple[str, Optional[str], list[str]]:
|
||
"""Return ``(category, tool, signals)``.
|
||
|
||
category ∈ {empty, browser, cli, library, scanner, bot, nonstandard}.
|
||
tool is the matched tool name when ``category`` ∈ {cli, library,
|
||
scanner}, else None. signals is a list of auxiliary flags —
|
||
suspicious_short, suspicious_long, nonprintable, injection_like —
|
||
always present on top of the category, since a scanner UA with an
|
||
injection marker is a distinct signal from a scanner UA alone.
|
||
"""
|
||
signals: list[str] = []
|
||
if ua is None or ua == "":
|
||
return "empty", None, signals
|
||
|
||
# Detectors that apply regardless of category.
|
||
if len(ua) < 8:
|
||
signals.append("suspicious_short")
|
||
if len(ua) > 512:
|
||
signals.append("suspicious_long")
|
||
if any(ord(c) < 32 and c != "\t" for c in ua):
|
||
signals.append("nonprintable")
|
||
lowered = ua.lower()
|
||
if any(marker in lowered for marker in _UA_INJECTION_MARKERS):
|
||
signals.append("injection_like")
|
||
|
||
# Priority: scanner > cli > library > bot > browser > nonstandard.
|
||
# Bots before browser because well-behaved crawlers ship UAs like
|
||
# "Mozilla/5.0 (compatible; Googlebot/2.1)" — the Mozilla prefix
|
||
# would win under browser-first ordering and misclassify them.
|
||
for regex, name in _UA_SCANNER_RES:
|
||
if regex.search(ua):
|
||
return "scanner", name, signals
|
||
for regex, name in _UA_CLI_RES:
|
||
if regex.search(ua):
|
||
return "cli", name, signals
|
||
for regex, name in _UA_LIBRARY_RES:
|
||
if regex.search(ua):
|
||
return "library", name, signals
|
||
if _UA_BOT_RE.search(ua):
|
||
return "bot", None, signals
|
||
if _UA_BROWSER_RE.match(ua):
|
||
return "browser", None, signals
|
||
return "nonstandard", None, signals
|
||
|