From dccb410bb3f9a48b114ef9c590e3a9583221f604 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 24 Apr 2026 17:51:40 -0400 Subject: [PATCH] =?UTF-8?q?feat(http):=20header-quirks=20fingerprint=20?= =?UTF-8?q?=E2=80=94=20order=20+=20casing=20+=20tool=20guess?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per-request HTTP fingerprint derived from the header dict we already log. Captures: - order_hash: SHA-256 prefix (16 hex) over the lowercased header-name sequence, minus volatile/per-request headers (Content-Length, Cookie, Authorization, XFF family, trace IDs). Stable identity for a given client stack regardless of which target / path is hit. - casing_hash: same shape but over the per-header casing category (Title-Case / lower / UPPER / mixed). Attackers frequently spoof User-Agent but forget their stack sends `user-agent` while browsers send `User-Agent`. - tool_guess: prefix match against curl / python-requests / Go-http-client / nmap-nse signatures. Cheap, best-effort — the hash is the hard signal. - duplicates: reserved for when the HTTP template switches from dict(request.headers) to a list form; today it always fires empty because dict() collapses duplicates. Payload is a fingerprint bounty (bounty_type="fingerprint", fingerprint_type="http_quirks"). Bounty dedup collapses identical hashes per attacker — one row per distinct fingerprint — so a chatty scanner doesn't spam the vault, but a tool-chain change from the same IP surfaces as a new row. UI renderer (FpHttpQuirks) shows the two hashes, tool guess badge in violet, casing/count tags, and a collapsible header-order list. Added to the passiveTypes group so it nests with JA3/JA4L/etc. in the AttackerDetail fingerprints panel. One library note: the naive "title-case" classifier failed on tokens like `X-Forwarded-For` because Python's "".islower() returns False so `p[1:].islower()` rejects single-letter tokens like the `X`. 
Fix: explicitly accept single-char tokens when uppercase. --- decnet/web/ingester.py | 158 ++++++++++++++ decnet_web/src/components/AttackerDetail.tsx | 46 ++++- tests/web/test_ingester_http_quirks.py | 206 +++++++++++++++++++ 3 files changed, 409 insertions(+), 1 deletion(-) create mode 100644 tests/web/test_ingester_http_quirks.py diff --git a/decnet/web/ingester.py b/decnet/web/ingester.py index fc7f422a..f40f98b7 100644 --- a/decnet/web/ingester.py +++ b/decnet/web/ingester.py @@ -261,6 +261,23 @@ async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> Non "payload": _leak, }) + # 2c. HTTP header quirks — order + casing fingerprint per request. + # Real HTTP clients have distinctive header orderings and casing + # patterns (curl vs python-requests vs Go-http-client vs nmap vs + # browsers all differ). Attackers routinely spoof User-Agent but + # forget to match the stack's native header order. Bounty dedup + # collapses repeat fingerprints from the same attacker, so this + # fires once per distinct hash per source. + _quirks = _http_quirks_fingerprint(log_data, _headers) + if _quirks is not None: + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": log_data.get("service"), + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": _quirks, + }) + # 3. VNC client version fingerprint _vnc_ver = _fields.get("client_version") if _vnc_ver and log_data.get("event_type") == "version": @@ -593,3 +610,144 @@ def _detect_ip_leak( "method": log_data.get("fields", {}).get("method"), } + +# ─── HTTP header quirks fingerprint ───────────────────────────────────────── + +# Headers that vary with per-request content (payload-body size, cookies +# set by prior responses) and therefore aren't useful identity. Stripped +# before hashing so a tool's order fingerprint is stable across different +# targets/sessions. 
+_VOLATILE_HEADERS = frozenset({ + "content-length", + "cookie", + "authorization", + "x-forwarded-for", # carries attacker-dependent values + "forwarded", + "x-real-ip", + "true-client-ip", + "cf-connecting-ip", + "x-request-id", + "x-correlation-id", + "x-amzn-trace-id", +}) + + +# Distinctive order signatures for common tools. The match is on the +# lowercased-name list MINUS the volatile set. A prefix match wins — +# many tools tack on "User-Agent / Accept-Encoding / Accept" in the +# same order regardless of method. +_TOOL_SIGNATURES: tuple[tuple[str, tuple[str, ...]], ...] = ( + # curl sends: Host, User-Agent, Accept, then any extra headers. + ("curl", ("host", "user-agent", "accept")), + # python-requests: Host, User-Agent, Accept-Encoding, Accept, Connection. + ("python-requests", ("host", "user-agent", "accept-encoding", "accept", "connection")), + # Go-http-client: Host, User-Agent, Accept-Encoding. + ("go-http-client", ("host", "user-agent", "accept-encoding")), + # nmap http-enum / http-* scripts: short, Host+User-Agent ordering. + ("nmap-nse", ("host", "user-agent")), + # Nikto / Nuclei send distinctive Accept-Language preferences — treat + # User-Agent check as the secondary signal elsewhere; order alone is + # ambiguous here. +) + + +def _casing_category(name: str) -> str: + """Classify a header-name casing pattern. + + Real HTTP clients and stacks pick one convention and stick to it: + browsers send `Title-Case`; python-requests sends `Title-Case`; + Go's stdlib canonicalises to `Title-Case`; curl sends literal + `Title-Case`; nmap/masscan often send `lowercase`; custom scanners + sometimes send `UPPERCASE`. + """ + if not name: + return "empty" + if name == name.upper(): + return "upper" + if name == name.lower(): + return "lower" + # "Title-Case" test: each dash-separated token starts with an + # uppercase; trailing chars (if any) must be lowercase.
Single- + # letter tokens like the `X` in `X-Forwarded-For` qualify when + # uppercase — "".islower() is False in Python so the naive form + # of this test misfires. + parts = [p for p in name.split("-") if p] + if parts and all( + p[:1].isupper() and (len(p) == 1 or p[1:].islower()) + for p in parts + ): + return "title" + return "mixed" + + +def _short_hash(value: str) -> str: + """16-hex-char SHA-256 prefix — stable identity, short display.""" + import hashlib + return hashlib.sha256(value.encode("utf-8")).hexdigest()[:16] + + +def _guess_tool_from_order(lowered: list[str]) -> Optional[str]: + """Return the first matching tool signature, or None.""" + for name, sig in _TOOL_SIGNATURES: + if len(lowered) >= len(sig) and tuple(lowered[: len(sig)]) == sig: + return name + return None + + +def _http_quirks_fingerprint( + log_data: dict[str, Any], headers: dict[str, Any], +) -> Optional[dict[str, Any]]: + """Build an HTTP request-header quirks fingerprint. + + Captures the header-order hash, casing pattern, count, and a + best-effort tool guess. Returns ``None`` for non-HTTP services or + when no usable headers are present. Bounty dedup will collapse + repeat fingerprints from the same attacker. + """ + if log_data.get("service") != "http": + return None + if not isinstance(headers, dict) or not headers: + return None + + # Preserve insertion order (Python 3.7+ dict guarantee, and JSON + # round-trip also preserves it). Volatile headers are dropped from + # both the identity hash and the emitted `order` list; they only + # count toward `header_count`.
+ names_full: list[str] = [k for k in headers.keys() if isinstance(k, str)] + if not names_full: + return None + + names_stable = [n for n in names_full if n.lower() not in _VOLATILE_HEADERS] + lowered = [n.lower() for n in names_stable] + + order_hash = _short_hash("\n".join(lowered)) + casing_per_header = [_casing_category(n) for n in names_stable] + casing_hash = _short_hash("\n".join(casing_per_header)) + + # A single "dominant" casing category — useful for at-a-glance display. + categories = set(casing_per_header) + if not categories: + dominant = "empty" + elif len(categories) == 1: + dominant = next(iter(categories)) + else: + dominant = "mixed" + + # Duplicate detection: dict keys are unique, so this is always empty + # today. It only becomes meaningful once the template passes a list + # form AND the isinstance(headers, dict) guard above is relaxed to + # admit it — until then a list input returns None before this line. + duplicates = [n for n in {x for x in names_full if names_full.count(x) > 1}] + + return { + "fingerprint_type": "http_quirks", + "order_hash": order_hash, + "order": names_stable, + "casing_hash": casing_hash, + "casing_category": dominant, + "header_count": len(names_full), + "stable_count": len(names_stable), + "tool_guess": _guess_tool_from_order(lowered), + "duplicates": duplicates or None, + "method": log_data.get("fields", {}).get("method"), + "path": log_data.get("fields", {}).get("path"), + } + diff --git a/decnet_web/src/components/AttackerDetail.tsx b/decnet_web/src/components/AttackerDetail.tsx index 80230adc..beb419e7 100644 --- a/decnet_web/src/components/AttackerDetail.tsx +++ b/decnet_web/src/components/AttackerDetail.tsx @@ -92,6 +92,7 @@ const fpTypeLabel: Record = { tls_resumption: 'SESSION RESUMPTION', tls_certificate: 'CERTIFICATE', http_useragent: 'HTTP USER-AGENT', + http_quirks: 'HTTP HEADER QUIRKS', vnc_client_version: 'VNC CLIENT', jarm: 'JARM', hassh_server: 'HASSH SERVER', @@ -104,6 +105,7 @@ const fpTypeIcon: Record = { tls_resumption: , tls_certificate: , http_useragent: , + http_quirks: ,
vnc_client_version: , jarm: , hassh_server: , @@ -338,6 +340,47 @@ const FpGeneric: React.FC<{ p: any }> = ({ p }) => ( ); +const FpHttpQuirks: React.FC<{ p: any }> = ({ p }) => { + const order: string[] = Array.isArray(p.order) ? p.order : []; + return ( +
+ + +
+ {p.tool_guess && ( + {String(p.tool_guess).toUpperCase()} + )} + {p.casing_category && ( + CASE · {String(p.casing_category).toUpperCase()} + )} + {typeof p.header_count === 'number' && ( + {p.header_count} HEADERS + )} + {p.duplicates && ( + DUPLICATES + )} +
+ {order.length > 0 && ( +
+ + HEADER ORDER + +
+ {order.map((h, i) => ( + {h} + ))} +
+
+ )} + {(p.method || p.path) && ( +
+ {p.method} {p.path} +
+ )} +
+ ); +}; + const FingerprintGroup: React.FC<{ fpType: string; items: any[] }> = ({ fpType, items }) => { const label = fpTypeLabel[fpType] || fpType.toUpperCase().replace(/_/g, ' '); const icon = fpTypeIcon[fpType] || ; @@ -365,6 +408,7 @@ const FingerprintGroup: React.FC<{ fpType: string; items: any[] }> = ({ fpType, case 'jarm': return ; case 'hassh_server': return ; case 'tcpfp': return ; + case 'http_quirks': return ; default: return ; } })} @@ -1245,7 +1289,7 @@ const AttackerDetail: React.FC = () => { // Active probes first, then passive, then unknown const activeTypes = ['jarm', 'hassh_server', 'tcpfp']; - const passiveTypes = ['ja3', 'ja4l', 'tls_resumption', 'tls_certificate', 'http_useragent', 'vnc_client_version']; + const passiveTypes = ['ja3', 'ja4l', 'tls_resumption', 'tls_certificate', 'http_useragent', 'http_quirks', 'vnc_client_version']; const knownTypes = [...activeTypes, ...passiveTypes]; const unknownTypes = Object.keys(groups).filter((t) => !knownTypes.includes(t)); diff --git a/tests/web/test_ingester_http_quirks.py b/tests/web/test_ingester_http_quirks.py new file mode 100644 index 00000000..886299f3 --- /dev/null +++ b/tests/web/test_ingester_http_quirks.py @@ -0,0 +1,206 @@ +"""HTTP header-quirks fingerprint extraction in the ingester.""" +from __future__ import annotations + +from unittest.mock import AsyncMock + +import pytest + +from decnet.web.ingester import ( + _casing_category, + _guess_tool_from_order, + _http_quirks_fingerprint, + _short_hash, + _extract_bounty, +) + + +def _log_row(headers: dict[str, str], *, service: str = "http") -> dict: + return { + "decky": "http-01", + "service": service, + "attacker_ip": "1.2.3.4", + "event_type": "request", + "fields": { + "method": "GET", + "path": "/", + "headers": headers, + }, + } + + +# ─── casing classifier ───────────────────────────────────────────────────── + +def test_casing_title(): + assert _casing_category("User-Agent") == "title" + assert _casing_category("Host") == "title" 
+ assert _casing_category("X-Forwarded-For") == "title" + + +def test_casing_lower(): + assert _casing_category("user-agent") == "lower" + assert _casing_category("x-forwarded-for") == "lower" + + +def test_casing_upper(): + assert _casing_category("USER-AGENT") == "upper" + + +def test_casing_mixed(): + assert _casing_category("USer-AgEnt") == "mixed" + + +# ─── order + casing hash stability ────────────────────────────────────────── + +def test_same_order_same_hash(): + row_a = _log_row({"Host": "x", "User-Agent": "curl/8", "Accept": "*/*"}) + row_b = _log_row({"Host": "y", "User-Agent": "curl/7", "Accept": "*/*"}) + fa = _http_quirks_fingerprint(row_a, row_a["fields"]["headers"]) + fb = _http_quirks_fingerprint(row_b, row_b["fields"]["headers"]) + assert fa["order_hash"] == fb["order_hash"] + assert fa["casing_hash"] == fb["casing_hash"] + + +def test_different_order_different_hash(): + row_a = _log_row({"Host": "x", "User-Agent": "a", "Accept": "*/*"}) + row_b = _log_row({"Accept": "*/*", "User-Agent": "a", "Host": "x"}) + fa = _http_quirks_fingerprint(row_a, row_a["fields"]["headers"]) + fb = _http_quirks_fingerprint(row_b, row_b["fields"]["headers"]) + assert fa["order_hash"] != fb["order_hash"] + + +def test_different_casing_different_hash(): + row_a = _log_row({"Host": "x", "User-Agent": "a"}) + row_b = _log_row({"host": "x", "user-agent": "a"}) + fa = _http_quirks_fingerprint(row_a, row_a["fields"]["headers"]) + fb = _http_quirks_fingerprint(row_b, row_b["fields"]["headers"]) + assert fa["casing_hash"] != fb["casing_hash"] + assert fa["casing_category"] == "title" + assert fb["casing_category"] == "lower" + + +def test_volatile_headers_excluded_from_hash(): + """Content-Length, Cookie, XFF etc. 
are per-request; the identity + hash shouldn't depend on them.""" + row_a = _log_row({ + "Host": "x", "User-Agent": "a", "Content-Length": "100", + }) + row_b = _log_row({ + "Host": "x", "User-Agent": "a", "Content-Length": "999", + "Cookie": "session=abc", + }) + fa = _http_quirks_fingerprint(row_a, row_a["fields"]["headers"]) + fb = _http_quirks_fingerprint(row_b, row_b["fields"]["headers"]) + assert fa["order_hash"] == fb["order_hash"] + # Count reflects ALL headers (the volatile ones WERE there). + assert fa["header_count"] == 3 + assert fb["header_count"] == 4 + # Stable count excludes the volatile ones. + assert fa["stable_count"] == 2 + assert fb["stable_count"] == 2 + + +# ─── tool guesses ────────────────────────────────────────────────────────── + +def test_curl_signature_guessed(): + assert _guess_tool_from_order(["host", "user-agent", "accept"]) == "curl" + + +def test_python_requests_signature_guessed(): + assert _guess_tool_from_order([ + "host", "user-agent", "accept-encoding", "accept", "connection", + ]) == "python-requests" + + +def test_go_http_client_signature_guessed(): + assert _guess_tool_from_order([ + "host", "user-agent", "accept-encoding", + ]) == "go-http-client" + + +def test_nmap_nse_signature_guessed(): + # Short order starting with host, user-agent → nmap-nse. 
+ assert _guess_tool_from_order(["host", "user-agent"]) == "nmap-nse" + + +def test_unknown_tool_returns_none(): + assert _guess_tool_from_order(["accept", "host", "user-agent"]) is None + + +def test_fingerprint_includes_tool_guess_curl(): + row = _log_row({ + "Host": "target", "User-Agent": "curl/8.0", "Accept": "*/*", + }) + f = _http_quirks_fingerprint(row, row["fields"]["headers"]) + assert f["tool_guess"] == "curl" + + +# ─── gating ───────────────────────────────────────────────────────────────── + +def test_non_http_service_skipped(): + row = _log_row({"Host": "x"}, service="ssh") + assert _http_quirks_fingerprint(row, row["fields"]["headers"]) is None + + +def test_empty_headers_skipped(): + row = _log_row({}) + assert _http_quirks_fingerprint(row, {}) is None + + +def test_only_volatile_headers_still_emits(): + """If every header is in the volatile set we still want a fingerprint, + just with empty order — header count alone is still a signal.""" + row = _log_row({"Content-Length": "10", "Cookie": "a=b"}) + f = _http_quirks_fingerprint(row, row["fields"]["headers"]) + assert f is not None + assert f["header_count"] == 2 + assert f["stable_count"] == 0 + assert f["order"] == [] + + +# ─── end-to-end via _extract_bounty ───────────────────────────────────────── + +@pytest.mark.asyncio +async def test_extract_bounty_emits_http_quirks(): + row = _log_row({ + "Host": "target", "User-Agent": "curl/8.0", "Accept": "*/*", + }) + repo = AsyncMock() + await _extract_bounty(repo, row) + + calls = [ + c.args[0] for c in repo.add_bounty.call_args_list + ] + # Expect: http_useragent fingerprint + http_quirks fingerprint. 
+ fp_types = [ + c["payload"].get("fingerprint_type") + for c in calls + if c["bounty_type"] == "fingerprint" + ] + assert "http_useragent" in fp_types + assert "http_quirks" in fp_types + + quirks = next( + c for c in calls + if c["bounty_type"] == "fingerprint" + and c["payload"].get("fingerprint_type") == "http_quirks" + ) + assert quirks["payload"]["tool_guess"] == "curl" + assert quirks["payload"]["casing_category"] == "title" + + +@pytest.mark.asyncio +async def test_extract_bounty_non_http_skips_quirks(): + row = _log_row({"Host": "x"}, service="ssh") + repo = AsyncMock() + await _extract_bounty(repo, row) + for call in repo.add_bounty.call_args_list: + payload = call.args[0].get("payload") or {} + assert payload.get("fingerprint_type") != "http_quirks" + + +# ─── hash stability across restarts ───────────────────────────────────────── + +def test_short_hash_deterministic(): + assert _short_hash("abc") == _short_hash("abc") + assert _short_hash("abc") != _short_hash("def") + assert len(_short_hash("anything")) == 16