An attacker hitting /admin with `X-Forwarded-For: 127.0.0.1` was
previously flagged as an IP leak. It isn't — that's the classic
IP-allowlist / WAF-bypass payload ("treat me as localhost and skip
your auth checks"). Misclassifying it as "LEAKED IPs" in the UI
confuses analysts and burns trust in the signal.
Split by claim category. After pulling the left-most claimed IP
from the proxy header, classify:
- public (routable) → bounty_type=ip_leak (real attribution leak;
the attacker's upstream proxy forwarded their real IP).
- loopback / private / link-local / multicast / reserved /
unspecified → bounty_type=fingerprint, fingerprint_type=
spoofed_source (WAF-bypass / allowlist-probing attempt; the
attacker is telling us they know what XFF does).
- unparseable → dropped.
Same extraction pipeline; diverges only at the last step. A new
shared _classify_proxy_header_claim returns (kind, payload);
_detect_ip_leak keeps its public-only contract for backward-
compat; _detect_spoofed_source is the new sibling.
UI renderer FpSpoofedSource shows the claimed IP in warn color with
the claim_category tag (LOOPBACK / PRIVATE / ...) and a WAF-BYPASS
ATTEMPT badge — distinct visual from the "LEAKED IPs" row which
stays reserved for genuine public-IP leaks.
Test addresses updated: RFC 5737 documentation ranges (198.51.100.0/24,
203.0.113.0/24) are flagged `is_private` (not globally reachable) in
Python's ipaddress module, so they now correctly belong to the spoof
bucket — tests that meant to exercise real public IPs now use 8.8.8.8 /
1.1.1.1 (Google / Cloudflare DNS). Added eleven new tests locking the
classifier + the two detectors' mutual exclusion.
844 lines
32 KiB
Python
844 lines
32 KiB
Python
import asyncio
|
|
import contextlib
|
|
import ipaddress
|
|
import os
|
|
import json
|
|
import re
|
|
import time
|
|
from typing import Any, Optional
|
|
from pathlib import Path
|
|
|
|
from decnet.bus import topics as _topics
|
|
from decnet.bus.factory import get_bus
|
|
from decnet.bus.publish import publish_safely
|
|
from decnet.env import DECNET_BATCH_SIZE, DECNET_BATCH_MAX_WAIT_MS
|
|
from decnet.logging import get_logger
|
|
from decnet.telemetry import (
|
|
traced as _traced,
|
|
get_tracer as _get_tracer,
|
|
extract_context as _extract_ctx,
|
|
start_span_with_context as _start_span,
|
|
)
|
|
from decnet.web.db.repository import BaseRepository
|
|
|
|
# Module-level logger, registered under the "api" name.
logger = get_logger("api")

# Key passed to repo.get_state / repo.set_state for the worker's saved
# offset into the JSON log file ({"position": <int>}).
_INGEST_STATE_KEY = "ingest_worker_position"
|
|
|
|
|
|
async def log_ingestion_worker(repo: BaseRepository) -> None:
    """
    Entry point of the background log-ingestion task.

    Resolves the JSON log path from ``DECNET_INGEST_LOG_FILE`` (suffix
    swapped to ``.json``), restores the last saved offset from the
    repository state, makes a best-effort bus connection and then hands
    over to :func:`_run_loop`. Returns immediately (with a warning) when
    the environment variable is unset or empty.
    """
    configured = os.environ.get("DECNET_INGEST_LOG_FILE")
    if not configured:
        logger.warning("DECNET_INGEST_LOG_FILE not set. Log ingestion disabled.")
        return

    json_path: Path = Path(configured).with_suffix(".json")

    # Resume from the persisted offset when there is one, else from 0.
    saved_state = await repo.get_state(_INGEST_STATE_KEY)
    start_offset: int = 0
    if saved_state:
        start_offset = saved_state.get("position", 0)

    logger.info("ingest worker started path=%s position=%d", json_path, start_offset)

    # The bus is optional: it lets downstream consumers follow committed
    # batches without polling the state table. Any failure to create or
    # connect it degrades to "no publishing" instead of stopping ingestion.
    bus = None
    try:
        candidate = get_bus(client_name="ingester")
        await candidate.connect()
    except Exception as err:
        logger.warning("ingester: bus unavailable, continuing without publish: %s", err)
    else:
        bus = candidate

    try:
        await _run_loop(repo, json_path, start_offset, bus)
    finally:
        # Close errors are deliberately ignored during teardown.
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
|
|
|
|
|
async def _run_loop(
    repo: BaseRepository,
    _json_log_path: Path,
    _position: int,
    _bus: Any,
) -> None:
    """Tail the JSON log file forever, committing parsed rows in batches.

    Args:
        repo: Repository used (via :func:`_flush_batch`) to persist rows
            and the file offset.
        _json_log_path: Path of the newline-delimited JSON log to tail.
        _position: Offset to resume reading from.
        _bus: Connected bus client, or ``None`` to skip publishing.

    The loop only exits when the error text indicates the database is
    gone or uninitialised; every other exception is logged and retried
    after a pause.
    """
    while True:
        try:
            if not _json_log_path.exists():
                await asyncio.sleep(2)
                continue

            _stat: os.stat_result = _json_log_path.stat()
            if _stat.st_size < _position:
                # File rotated or truncated
                _position = 0
                await repo.set_state(_INGEST_STATE_KEY, {"position": 0})

            if _stat.st_size == _position:
                # No new data
                await asyncio.sleep(1)
                continue

            # Accumulate parsed rows and the file offset they end at. We
            # only advance _position after the batch is successfully
            # committed — if we get cancelled mid-flush, the next run
            # re-reads the un-committed lines rather than losing them.
            _batch: list[tuple[dict[str, Any], int]] = []
            _batch_started: float = time.monotonic()
            _max_wait_s: float = DECNET_BATCH_MAX_WAIT_MS / 1000.0

            with open(_json_log_path, "r", encoding="utf-8", errors="replace") as _f:
                _f.seek(_position)
                while True:
                    _line: str = _f.readline()
                    if not _line or not _line.endswith('\n'):
                        # EOF or partial line — flush what we have and stop
                        break

                    _log_data: Any = None
                    _skip_reason: Optional[str] = None
                    try:
                        _log_data = json.loads(_line.strip())
                    except json.JSONDecodeError:
                        _skip_reason = "ingest: failed to decode JSON log line: %s"
                    else:
                        # Valid JSON that is not an object (e.g. a bare
                        # number or list) cannot be processed below; it
                        # must be skipped or the worker would raise on
                        # `.get` and re-read the same line forever.
                        if not isinstance(_log_data, dict):
                            _skip_reason = "ingest: skipping non-object JSON log line: %s"

                    if _skip_reason is not None:
                        logger.error(_skip_reason, _line.strip())
                        # Skip past bad line so we don't loop forever on it.
                        # When rows are pending, fold the skip into the
                        # last pending row's end offset instead of moving
                        # _position now: _position must never run ahead of
                        # uncommitted rows, or a failed flush would drop them.
                        _skip_to = _f.tell()
                        if _batch:
                            _batch[-1] = (_batch[-1][0], _skip_to)
                        else:
                            _position = _skip_to
                        continue

                    # Collector injects trace context so the ingester span
                    # chains off the collector's — full event journey in Jaeger.
                    _parent_ctx = _extract_ctx(_log_data)
                    _tracer = _get_tracer("ingester")
                    with _start_span(_tracer, "ingester.process_record", context=_parent_ctx) as _span:
                        _span.set_attribute("decky", _log_data.get("decky", ""))
                        _span.set_attribute("service", _log_data.get("service", ""))
                        _span.set_attribute("event_type", _log_data.get("event_type", ""))
                        _span.set_attribute("attacker_ip", _log_data.get("attacker_ip", ""))
                        # Stamp the row with trace/span ids when the span
                        # exposes a context with a non-zero trace id.
                        _sctx = getattr(_span, "get_span_context", None)
                        if _sctx:
                            _ctx = _sctx()
                            if _ctx and getattr(_ctx, "trace_id", 0):
                                _log_data["trace_id"] = format(_ctx.trace_id, "032x")
                                _log_data["span_id"] = format(_ctx.span_id, "016x")
                    _batch.append((_log_data, _f.tell()))

                    if len(_batch) >= DECNET_BATCH_SIZE or (
                        time.monotonic() - _batch_started >= _max_wait_s
                    ):
                        _flushed = len(_batch)
                        _position = await _flush_batch(repo, _batch, _position)
                        _batch.clear()
                        _batch_started = time.monotonic()
                        await _publish_batch(_bus, _flushed, _position)

                # Flush any remainder collected before EOF / partial-line break.
                if _batch:
                    _flushed = len(_batch)
                    _position = await _flush_batch(repo, _batch, _position)
                    await _publish_batch(_bus, _flushed, _position)

        except Exception as _e:
            _err_str = str(_e).lower()
            if "no such table" in _err_str or "no active connection" in _err_str or "connection closed" in _err_str:
                logger.error("ingest: post-shutdown or fatal DB error: %s", _e)
                break  # Exit worker — DB is gone or uninitialized

            logger.error("ingest: error in worker: %s", _e)
            await asyncio.sleep(5)

        await asyncio.sleep(1)
|
|
|
|
|
|
async def _publish_batch(bus: Any, flushed: int, position: int) -> None:
|
|
"""Emit one ``system.log`` event summarising a committed batch.
|
|
|
|
Fire-and-forget via :func:`publish_safely`; a dead bus never blocks the
|
|
ingestion loop. Zero-row flushes are suppressed so the topic stays
|
|
meaningful.
|
|
"""
|
|
if bus is None or flushed <= 0:
|
|
return
|
|
await publish_safely(
|
|
bus,
|
|
_topics.system(_topics.SYSTEM_LOG),
|
|
{"component": "ingester", "flushed": flushed, "position": position},
|
|
event_type="batch_committed",
|
|
)
|
|
|
|
|
|
async def _flush_batch(
    repo: BaseRepository,
    batch: list[tuple[dict[str, Any], int]],
    current_position: int,
) -> int:
    """Persist one batch of rows and report the offset it ends at.

    ``batch`` holds ``(row, end_offset)`` pairs; the returned value is
    the end offset of the last pair, which is also written to the
    repository state.

    When the current task is already being cancelled the function raises
    ``asyncio.CancelledError`` before touching the repository — the
    session factory may already be disposed during lifespan teardown,
    and awaiting it would stall the worker. Nothing is committed in that
    case, so the caller's position stays where it was.
    """
    running = asyncio.current_task()
    if running is not None and running.cancelling():
        raise asyncio.CancelledError()

    rows = [row for row, _offset in batch]
    _last_row, end_offset = batch[-1]

    await repo.add_logs(rows)
    # Bounty extraction runs per row, after the rows themselves are stored.
    for row in rows:
        await _extract_bounty(repo, row)

    await repo.set_state(_INGEST_STATE_KEY, {"position": end_offset})
    return end_offset
|
|
|
|
|
|
@_traced("ingester.extract_bounty")
async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> None:
    """Detect and extract valuable artifacts (bounties) from log entries.

    Args:
        repo: Repository whose ``add_bounty`` receives each artifact.
        log_data: One parsed log record. Only records whose ``fields``
            value is a dict are inspected; anything else is ignored.

    Every bounty shares the same envelope (``decky``, ``service``,
    ``attacker_ip``, ``bounty_type``, ``payload``). Prober-derived
    bounties use the probed ``target_ip`` as ``attacker_ip``.
    """
    _fields = log_data.get("fields")
    if not isinstance(_fields, dict):
        return

    _decky = log_data.get("decky")
    _service = log_data.get("service")
    _source = log_data.get("attacker_ip")

    async def _emit(
        bounty_type: str,
        payload: dict[str, Any],
        service: Any = _service,
        attacker_ip: Any = _source,
    ) -> None:
        # Single place that builds the bounty envelope; key order matches
        # the historical inline dicts.
        await repo.add_bounty({
            "decky": _decky,
            "service": service,
            "attacker_ip": attacker_ip,
            "bounty_type": bounty_type,
            "payload": payload,
        })

    # 1. Credentials (User/Pass)
    _user = _fields.get("username")
    _pass = _fields.get("password")
    if _user and _pass:
        await _emit("credential", {"username": _user, "password": _pass})

    # 2. HTTP User-Agent fingerprint
    # Headers arrive either as a dict or as a JSON-encoded string.
    _h_raw = _fields.get("headers")
    if isinstance(_h_raw, dict):
        _headers = _h_raw
    elif isinstance(_h_raw, str):
        try:
            _parsed = json.loads(_h_raw)
            _headers = _parsed if isinstance(_parsed, dict) else {}
        except (json.JSONDecodeError, ValueError):
            _headers = {}
    else:
        _headers = {}
    # Exact-key lookups first (unchanged results for the common casings),
    # then a case-insensitive fallback so clients sending e.g.
    # `USER-AGENT` are still fingerprinted.
    _ua = (
        _headers.get("User-Agent")
        or _headers.get("user-agent")
        or _lookup_header(_headers, "User-Agent")
    )
    if _ua:
        # Payload must be identity-only (no per-request method/path) —
        # add_bounty dedups on (attacker_ip, bounty_type, full payload
        # JSON), so including path here would create one row per URL
        # the scanner hits. Per-request context belongs in the logs
        # table, not the bounty table.
        await _emit("fingerprint", {
            "fingerprint_type": "http_useragent",
            "value": _ua,
        })

    # 2b. IP leak — the attacker's real IP accidentally forwarded in a
    # proxy-family header (X-Forwarded-For / Forwarded / X-Real-IP /
    # CDN variants). Left-most value differing from the TCP source is
    # a high-confidence attribution signal. DECNET_TRUSTED_PROXIES
    # opts specific source IPs out (legitimate reverse proxy in front
    # of DECNET).
    _leak = _detect_ip_leak(log_data, _headers)
    if _leak is not None:
        await _emit("ip_leak", _leak)

    # 2b.2 Spoofed source — attacker tried to pass a non-routable IP
    # (loopback / RFC1918 / link-local / reserved) in a proxy header.
    # Classic WAF-bypass: `X-Forwarded-For: 127.0.0.1` hoping an
    # upstream filter waves localhost through. Distinct bounty type
    # from ip_leak because the semantic is inverted — attack attempt,
    # not opsec failure.
    _spoof = _detect_spoofed_source(log_data, _headers)
    if _spoof is not None:
        await _emit("fingerprint", _spoof)

    # 2c. HTTP header quirks — order + casing fingerprint per request.
    # Real HTTP clients have distinctive header orderings and casing
    # patterns (curl vs python-requests vs Go-http-client vs nmap vs
    # browsers all differ). Attackers routinely spoof User-Agent but
    # forget to match the stack's native header order. Bounty dedup
    # collapses repeat fingerprints from the same attacker, so this
    # fires once per distinct hash per source.
    _quirks = _http_quirks_fingerprint(log_data, _headers)
    if _quirks is not None:
        await _emit("fingerprint", _quirks)

    # 3. VNC client version fingerprint
    _vnc_ver = _fields.get("client_version")
    if _vnc_ver and log_data.get("event_type") == "version":
        await _emit("fingerprint", {
            "fingerprint_type": "vnc_client_version",
            "value": _vnc_ver,
        })

    # 4. SSH client banner fingerprint (deferred — requires asyncssh server)
    # Fires on: service=ssh, event_type=client_banner, fields.client_banner

    _from_sniffer = _service == "sniffer"
    _from_prober = _service == "prober"

    # 5. JA3/JA3S TLS fingerprint from sniffer container
    _ja3 = _fields.get("ja3")
    if _ja3 and _from_sniffer:
        await _emit("fingerprint", {
            "fingerprint_type": "ja3",
            "ja3": _ja3,
            "ja3s": _fields.get("ja3s"),
            "ja4": _fields.get("ja4"),
            "ja4s": _fields.get("ja4s"),
            "tls_version": _fields.get("tls_version"),
            "sni": _fields.get("sni") or None,
            "alpn": _fields.get("alpn") or None,
            "dst_port": _fields.get("dst_port"),
            "raw_ciphers": _fields.get("raw_ciphers"),
            "raw_extensions": _fields.get("raw_extensions"),
        }, service="sniffer")

    # 6. JA4L latency fingerprint from sniffer
    _ja4l_rtt = _fields.get("ja4l_rtt_ms")
    if _ja4l_rtt and _from_sniffer:
        await _emit("fingerprint", {
            "fingerprint_type": "ja4l",
            "rtt_ms": _ja4l_rtt,
            "client_ttl": _fields.get("ja4l_client_ttl"),
        }, service="sniffer")

    # 7. TLS session resumption behavior
    _resumption = _fields.get("resumption")
    if _resumption and _from_sniffer:
        await _emit("fingerprint", {
            "fingerprint_type": "tls_resumption",
            "mechanisms": _resumption,
        }, service="sniffer")

    # 8. TLS certificate details (TLS 1.2 only — passive extraction)
    _subject_cn = _fields.get("subject_cn")
    if _subject_cn and _from_sniffer:
        await _emit("fingerprint", {
            "fingerprint_type": "tls_certificate",
            "subject_cn": _subject_cn,
            "issuer": _fields.get("issuer"),
            "self_signed": _fields.get("self_signed"),
            "not_before": _fields.get("not_before"),
            "not_after": _fields.get("not_after"),
            "sans": _fields.get("sans"),
            "sni": _fields.get("sni") or None,
        }, service="sniffer")

    # 9. JARM fingerprint from active prober
    _jarm = _fields.get("jarm_hash")
    if _jarm and _from_prober:
        await _emit("fingerprint", {
            "fingerprint_type": "jarm",
            "hash": _jarm,
            "target_ip": _fields.get("target_ip"),
            "target_port": _fields.get("target_port"),
        }, service="prober", attacker_ip=_fields.get("target_ip", "Unknown"))

    # 10. HASSHServer fingerprint from active prober
    _hassh = _fields.get("hassh_server_hash")
    if _hassh and _from_prober:
        await _emit("fingerprint", {
            "fingerprint_type": "hassh_server",
            "hash": _hassh,
            "target_ip": _fields.get("target_ip"),
            "target_port": _fields.get("target_port"),
            "ssh_banner": _fields.get("ssh_banner"),
            "kex_algorithms": _fields.get("kex_algorithms"),
            "encryption_s2c": _fields.get("encryption_s2c"),
            "mac_s2c": _fields.get("mac_s2c"),
            "compression_s2c": _fields.get("compression_s2c"),
        }, service="prober", attacker_ip=_fields.get("target_ip", "Unknown"))

    # 11. TCP/IP stack fingerprint from active prober
    _tcpfp = _fields.get("tcpfp_hash")
    if _tcpfp and _from_prober:
        await _emit("fingerprint", {
            "fingerprint_type": "tcpfp",
            "hash": _tcpfp,
            "raw": _fields.get("tcpfp_raw"),
            "target_ip": _fields.get("target_ip"),
            "target_port": _fields.get("target_port"),
            "ttl": _fields.get("ttl"),
            "window_size": _fields.get("window_size"),
            "df_bit": _fields.get("df_bit"),
            "mss": _fields.get("mss"),
            "window_scale": _fields.get("window_scale"),
            "sack_ok": _fields.get("sack_ok"),
            "timestamp": _fields.get("timestamp"),
            "options_order": _fields.get("options_order"),
        }, service="prober", attacker_ip=_fields.get("target_ip", "Unknown"))
|
|
|
|
|
|
# ─── IP-leak detection (XFF / Forwarded / X-Real-IP / CDN variants) ──────────
|
|
|
|
# Proxy-family headers we inspect, in priority order. Forwarded (RFC 7239)
# is the "proper" way; X-Forwarded-For is de-facto; X-Real-IP and CDN
# variants are common nginx / CloudFlare conventions.
# _extract_claimed_ip walks this tuple front to back and stops at the
# first header that yields a parseable IP, so position here is precedence.
_PROXY_HEADERS = (
    "Forwarded",
    "X-Forwarded-For",
    "X-Real-IP",
    "True-Client-IP",
    "CF-Connecting-IP",
)
|
|
|
|
# RFC 7239 `Forwarded: for=1.2.3.4` / `for="[2001:db8::1]:4711"`. The
# capture grabs the raw for= value up to the next pair/element
# delimiter (; or ,) or end-of-string; _parse_forwarded strips quotes
# / IPv6 brackets / port afterwards.
# Group 1 is the value without the optional surrounding double quotes;
# the match is case-insensitive, so `For=` and `FOR=` are accepted too.
_FORWARDED_KV_RE = re.compile(
    r'for\s*=\s*"?([^",;]+?)"?(?=[;,]|$)',
    re.IGNORECASE,
)
|
|
|
|
|
|
def _get_trusted_proxies() -> list[ipaddress._BaseNetwork]:
|
|
"""Parse DECNET_TRUSTED_PROXIES once per process into network objects.
|
|
|
|
Empty / unset → empty list (no opt-outs). Malformed entries are logged
|
|
at WARNING and silently dropped — a typo in the env shouldn't brick
|
|
the ingester.
|
|
"""
|
|
raw = os.environ.get("DECNET_TRUSTED_PROXIES", "")
|
|
out: list[ipaddress._BaseNetwork] = []
|
|
for token in raw.split(","):
|
|
token = token.strip()
|
|
if not token:
|
|
continue
|
|
try:
|
|
# Accept both bare IPs ("1.2.3.4") and CIDRs ("10.0.0.0/8").
|
|
if "/" in token:
|
|
out.append(ipaddress.ip_network(token, strict=False))
|
|
else:
|
|
out.append(ipaddress.ip_network(f"{token}/32", strict=False))
|
|
except (ValueError, TypeError) as exc:
|
|
logger.warning("DECNET_TRUSTED_PROXIES: ignoring %r: %s", token, exc)
|
|
return out
|
|
|
|
|
|
def _is_trusted_source(source_ip: str) -> bool:
|
|
try:
|
|
addr = ipaddress.ip_address(source_ip)
|
|
except (ValueError, TypeError):
|
|
return False
|
|
for net in _get_trusted_proxies():
|
|
try:
|
|
if addr in net:
|
|
return True
|
|
except (ValueError, TypeError):
|
|
continue
|
|
return False
|
|
|
|
|
|
def _lookup_header(headers: dict[str, Any], name: str) -> Optional[str]:
|
|
"""Case-insensitive header fetch; HTTP template logs headers as-received."""
|
|
lowered = name.lower()
|
|
for k, v in headers.items():
|
|
if isinstance(k, str) and k.lower() == lowered:
|
|
if isinstance(v, str) and v.strip():
|
|
return v.strip()
|
|
return None
|
|
|
|
|
|
def _parse_forwarded(value: str) -> Optional[str]:
    """Extract the IP of the first `for=` pair in an RFC 7239 Forwarded header.

    Accepted shapes: bare IPv4 (`for=1.2.3.4`), IPv4 with port
    (`for=1.2.3.4:80`) and the quoted, bracketed IPv6 form with optional
    port (`for="[2001:db8::1]:4711"`). Returns ``None`` when there is no
    `for=` pair or its value is not an IP address.
    """
    found = _FORWARDED_KV_RE.search(value)
    if found is None:
        return None

    candidate = found.group(1).strip()
    if not candidate:
        return None

    if candidate.startswith("["):
        # Bracketed IPv6: keep what sits between "[" and "]", which also
        # discards any ":port" that follows the closing bracket.
        closing = candidate.find("]")
        if closing > 0:
            candidate = candidate[1:closing]
    elif candidate.count(":") == 1:
        # Exactly one colon means IPv4:port; bare IPv6 literals carry at
        # least two colons and are left untouched.
        candidate = candidate.partition(":")[0]

    try:
        ipaddress.ip_address(candidate)
    except (ValueError, TypeError):
        return None
    return candidate
|
|
|
|
|
|
def _parse_xff_chain(value: str) -> Optional[str]:
|
|
"""Return the left-most parseable IP from an X-Forwarded-For chain."""
|
|
for token in value.split(","):
|
|
token = token.strip().strip('"').lstrip("[").rstrip("]")
|
|
if not token:
|
|
continue
|
|
try:
|
|
ipaddress.ip_address(token)
|
|
except (ValueError, TypeError):
|
|
continue
|
|
return token
|
|
return None
|
|
|
|
|
|
def _extract_claimed_ip(headers: dict[str, Any]) -> tuple[Optional[str], Optional[str]]:
    """Find the first proxy header that yields an IP.

    Headers are tried in ``_PROXY_HEADERS`` order. Returns
    ``(claimed_ip, header_name)`` for the first one whose value parses,
    or ``(None, None)`` when none does.
    """
    for header_name in _PROXY_HEADERS:
        raw_value = _lookup_header(headers, header_name)
        if raw_value is None:
            continue

        if header_name == "Forwarded":
            ip = _parse_forwarded(raw_value)
        elif header_name == "X-Forwarded-For":
            ip = _parse_xff_chain(raw_value)
        else:
            # Single-value headers: drop surrounding quotes and IPv6
            # brackets, then require the remainder to be an IP literal.
            ip = raw_value.strip().strip('"').lstrip("[").rstrip("]")
            try:
                ipaddress.ip_address(ip)
            except (ValueError, TypeError):
                ip = None

        if ip is not None:
            return ip, header_name

    return None, None
|
|
|
|
|
|
def _categorize_claimed_ip(ip: str) -> str:
|
|
"""Return a category label for a claimed IP string.
|
|
|
|
Public routable addresses are potential real-IP leaks. Anything
|
|
else (loopback, private, link-local, multicast, reserved,
|
|
unspecified) is almost certainly a forgery — XFF spoofing is the
|
|
classic WAF-bypass / IP-allowlist trick. Callers branch on this:
|
|
``public`` → :data:`ip_leak` bounty, anything else →
|
|
``spoofed_source`` fingerprint bounty.
|
|
"""
|
|
try:
|
|
addr = ipaddress.ip_address(ip)
|
|
except (ValueError, TypeError):
|
|
return "unparseable"
|
|
if addr.is_unspecified:
|
|
return "unspecified"
|
|
if addr.is_loopback:
|
|
return "loopback"
|
|
if addr.is_link_local:
|
|
return "link_local"
|
|
if addr.is_multicast:
|
|
return "multicast"
|
|
if addr.is_reserved:
|
|
return "reserved"
|
|
if addr.is_private:
|
|
return "private"
|
|
return "public"
|
|
|
|
|
|
def _classify_proxy_header_claim(
|
|
log_data: dict[str, Any], headers: dict[str, Any],
|
|
) -> Optional[tuple[str, dict[str, Any]]]:
|
|
"""Shared worker for the two XFF-family detectors.
|
|
|
|
Returns ``(kind, payload)`` where ``kind`` is ``"leak"`` (public
|
|
claim, real attribution leak) or ``"spoof"`` (non-routable claim,
|
|
WAF-bypass attempt). Returns ``None`` for non-HTTP / trusted-proxy
|
|
source / no proxy header / claim matches source / unparseable claim.
|
|
"""
|
|
if log_data.get("service") != "http":
|
|
return None
|
|
if not isinstance(headers, dict) or not headers:
|
|
return None
|
|
source_ip = log_data.get("attacker_ip")
|
|
if not isinstance(source_ip, str) or not source_ip:
|
|
return None
|
|
if _is_trusted_source(source_ip):
|
|
return None
|
|
|
|
claimed, header_name = _extract_claimed_ip(headers)
|
|
if claimed is None or claimed == source_ip:
|
|
return None
|
|
|
|
category = _categorize_claimed_ip(claimed)
|
|
if category == "unparseable":
|
|
return None
|
|
|
|
seen: dict[str, str] = {}
|
|
for h in _PROXY_HEADERS:
|
|
raw = _lookup_header(headers, h)
|
|
if raw is not None:
|
|
seen[h] = raw
|
|
|
|
base = {
|
|
"source_ip": source_ip,
|
|
"claimed_ip": claimed,
|
|
"source_header": header_name,
|
|
"headers_seen": seen,
|
|
"claim_category": category,
|
|
}
|
|
return ("leak" if category == "public" else "spoof"), base
|
|
|
|
|
|
def _detect_ip_leak(
    log_data: dict[str, Any], headers: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """Build the ``ip_leak`` payload for a PUBLIC proxy-header claim.

    Returns ``None`` unless :func:`_classify_proxy_header_claim` reports
    a ``"leak"`` — i.e. a routable address, different from the source,
    forwarded in an XFF-family header. Non-routable claims (spoofing
    attempts) are not reported here.

    The payload exposes the claim as ``real_ip_claim`` and omits
    ``claim_category``.
    """
    classified = _classify_proxy_header_claim(log_data, headers)
    if classified is None:
        return None
    kind, details = classified
    if kind != "leak":
        return None

    # Legacy shape kept for existing consumers: the claim is reported
    # under "real_ip_claim" (placed last), and "claim_category" is left
    # out because it is always "public" for leaks.
    leak = {
        key: val
        for key, val in details.items()
        if key not in ("claimed_ip", "claim_category")
    }
    leak["real_ip_claim"] = details["claimed_ip"]
    return leak
|
|
|
|
|
|
def _detect_spoofed_source(
    log_data: dict[str, Any], headers: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """Build the ``spoofed_source`` fingerprint payload for a NON-ROUTABLE claim.

    Returns ``None`` unless :func:`_classify_proxy_header_claim` reports
    a ``"spoof"`` — a loopback / private / link-local / reserved / etc.
    address passed in an XFF-family header. That is the classic
    IP-allowlist / WAF-bypass trick (``curl -H 'X-Forwarded-For:
    127.0.0.1'``): no real IP is leaked, but the attempt itself is a
    signal.

    The caller wraps the result in ``bounty_type="fingerprint"``.
    """
    classified = _classify_proxy_header_claim(log_data, headers)
    if classified is None:
        return None
    kind, details = classified
    if kind != "spoof":
        return None

    # fingerprint_type goes first so the UI renderer dispatcher can key on it.
    fingerprint: dict[str, Any] = {"fingerprint_type": "spoofed_source"}
    fingerprint.update(details)
    return fingerprint
|
|
|
|
|
|
# ─── HTTP header quirks fingerprint ─────────────────────────────────────────
|
|
|
|
# Headers that vary with per-request content (payload-body size, cookies
# set by prior responses) and therefore aren't useful identity. Stripped
# before hashing so a tool's order fingerprint is stable across different
# targets/sessions.
# Entries are lowercase; _http_quirks_fingerprint lowercases each header
# name before testing membership.
_VOLATILE_HEADERS = frozenset({
    "content-length",
    "cookie",
    "authorization",
    "x-forwarded-for",  # carries attacker-dependent values
    "forwarded",
    "x-real-ip",
    "true-client-ip",
    "cf-connecting-ip",
    "x-request-id",
    "x-correlation-id",
    "x-amzn-trace-id",
})
|
|
|
|
|
|
# Distinctive order signatures for common tools. The match is on the
# lowercased-name list MINUS the volatile set. A prefix match wins —
# many tools tack on "User-Agent / Accept-Encoding / Accept" in the
# same order regardless of method.
# Entries are tried top to bottom and the first prefix match is returned
# (see _guess_tool_from_order), so longer signatures must stay above the
# shorter ones they extend (python-requests above go-http-client above
# nmap-nse).
_TOOL_SIGNATURES: tuple[tuple[str, tuple[str, ...]], ...] = (
    # curl sends: Host, User-Agent, Accept, <body-headers>.
    ("curl", ("host", "user-agent", "accept")),
    # python-requests: User-Agent, Accept-Encoding, Accept, Connection, Host.
    # NOTE(review): the comment above lists Host last, but the signature
    # below expects host first — confirm which order the logs carry.
    ("python-requests", ("host", "user-agent", "accept-encoding", "accept", "connection")),
    # Go-http-client: Host, User-Agent, Accept-Encoding.
    ("go-http-client", ("host", "user-agent", "accept-encoding")),
    # nmap http-enum / http-* scripts: short, Host+User-Agent ordering.
    ("nmap-nse", ("host", "user-agent")),
    # Nikto / Nuclei send distinctive Accept-Language preferences — treat
    # User-Agent check as the secondary signal elsewhere; order alone is
    # ambiguous here.
)
|
|
|
|
|
|
def _casing_category(name: str) -> str:
|
|
"""Classify a header-name casing pattern.
|
|
|
|
Real HTTP clients and stacks pick one convention and stick to it:
|
|
browsers send `Title-Case`; python-requests sends `Title-Case`;
|
|
Go's stdlib canonicalises to `Title-Case`; curl sends literal
|
|
`Title-Case`; nmap/masscan often send `lowercase`; custom scanners
|
|
sometimes send `UPPERCASE`.
|
|
"""
|
|
if not name:
|
|
return "empty"
|
|
if name == name.upper():
|
|
return "upper"
|
|
if name == name.lower():
|
|
return "lower"
|
|
# "Title-Case" test: each dash-separated token starts with an
|
|
# uppercase; trailing chars (if any) must be lowercase. Single-
|
|
# letter tokens like the `X` in `X-Forwarded-For` qualify when
|
|
# uppercase — "".islower() is False in Python so the naive form
|
|
# of this test misfires.
|
|
parts = [p for p in name.split("-") if p]
|
|
if parts and all(
|
|
p[:1].isupper() and (len(p) == 1 or p[1:].islower())
|
|
for p in parts
|
|
):
|
|
return "title"
|
|
return "mixed"
|
|
|
|
|
|
def _short_hash(value: str) -> str:
|
|
"""16-hex-char SHA-256 prefix — stable identity, short display."""
|
|
import hashlib
|
|
return hashlib.sha256(value.encode("utf-8")).hexdigest()[:16]
|
|
|
|
|
|
def _guess_tool_from_order(lowered: list[str]) -> Optional[str]:
    """Name the first tool whose signature is a prefix of ``lowered``.

    ``lowered`` is the list of lowercased header names in request order.
    Signatures are tried in ``_TOOL_SIGNATURES`` order; ``None`` is
    returned when none of them matches.
    """
    observed = tuple(lowered)
    for tool, signature in _TOOL_SIGNATURES:
        width = len(signature)
        if len(observed) < width:
            continue
        if observed[:width] == signature:
            return tool
    return None
|
|
|
|
|
|
def _http_quirks_fingerprint(
|
|
log_data: dict[str, Any], headers: dict[str, Any],
|
|
) -> Optional[dict[str, Any]]:
|
|
"""Build an HTTP request-header quirks fingerprint.
|
|
|
|
Captures the header-order hash, casing pattern, count, and a
|
|
best-effort tool guess. Returns ``None`` for non-HTTP services or
|
|
when no usable headers are present. Bounty dedup will collapse
|
|
repeat fingerprints from the same attacker.
|
|
"""
|
|
if log_data.get("service") != "http":
|
|
return None
|
|
if not isinstance(headers, dict) or not headers:
|
|
return None
|
|
|
|
# Preserve insertion order (Python 3.7+ dict guarantee, and JSON
|
|
# round-trip also preserves it). Drop volatile headers for the
|
|
# identity hash but keep them in the display order list.
|
|
names_full: list[str] = [k for k in headers.keys() if isinstance(k, str)]
|
|
if not names_full:
|
|
return None
|
|
|
|
names_stable = [n for n in names_full if n.lower() not in _VOLATILE_HEADERS]
|
|
lowered = [n.lower() for n in names_stable]
|
|
|
|
order_hash = _short_hash("\n".join(lowered))
|
|
casing_per_header = [_casing_category(n) for n in names_stable]
|
|
casing_hash = _short_hash("\n".join(casing_per_header))
|
|
|
|
# A single "dominant" casing category — useful for at-a-glance display.
|
|
categories = set(casing_per_header)
|
|
if not categories:
|
|
dominant = "empty"
|
|
elif len(categories) == 1:
|
|
dominant = next(iter(categories))
|
|
else:
|
|
dominant = "mixed"
|
|
|
|
# Identity-only payload — every field must be stable for two
|
|
# requests from the same client stack. add_bounty dedups on the
|
|
# full payload JSON, so a per-request-varying key (path, method,
|
|
# header_count when Cookie presence varies) would spawn one row
|
|
# per request. The hashes ARE the identity; per-request context
|
|
# lives in the logs table.
|
|
return {
|
|
"fingerprint_type": "http_quirks",
|
|
"order_hash": order_hash,
|
|
"order": names_stable,
|
|
"casing_hash": casing_hash,
|
|
"casing_category": dominant,
|
|
"stable_count": len(names_stable),
|
|
"tool_guess": _guess_tool_from_order(lowered),
|
|
}
|
|
|