feat(http): header-quirks fingerprint — order + casing + tool guess

Per-request HTTP fingerprint derived from the header dict we already
log. Captures:

- order_hash: SHA-256 prefix (16 hex) over the lowercased header-name
  sequence, minus volatile/per-request headers (Content-Length,
  Cookie, Authorization, XFF family, trace IDs). Stable identity for
  a given client stack regardless of which target / path is hit.
- casing_hash: same shape but over the per-header casing category
  (Title-Case / lower / UPPER / mixed). Attackers frequently spoof
  User-Agent but forget their stack sends `user-agent` while browsers
  send `User-Agent`.
- tool_guess: prefix match against curl / python-requests /
  Go-http-client / nmap-nse signatures. Cheap, best-effort — the
  hash is the hard signal.
- duplicates: reserved for when the HTTP template switches from
  dict(request.headers) to a list form; today it is always empty
  because dict() collapses duplicate header names.

Payload is a fingerprint bounty (bounty_type="fingerprint",
fingerprint_type="http_quirks"). Bounty dedup collapses identical
hashes per attacker — one row per distinct fingerprint — so a chatty
scanner doesn't spam the vault, but a tool-chain change from the
same IP surfaces as a new row.

UI renderer (FpHttpQuirks) shows the two hashes, tool guess badge in
violet, casing/count tags, and a collapsible header-order list.
Added to the passiveTypes group so it nests with JA3/JA4L/etc. in
the AttackerDetail fingerprints panel.

One implementation note: the naive "title-case" classifier failed on
header names like `X-Forwarded-For`. For a single-letter token such as
the `X`, `p[1:]` is the empty string, and Python's "".islower() returns
False, so `p[1:].islower()` rejected the token.
Fix: explicitly accept single-char tokens when uppercase.
This commit is contained in:
2026-04-24 17:51:40 -04:00
parent 2a0c5ca410
commit dccb410bb3
3 changed files with 409 additions and 1 deletions

View File

@@ -261,6 +261,23 @@ async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> Non
"payload": _leak,
})
# 2c. HTTP header quirks — order + casing fingerprint per request.
# Real HTTP clients have distinctive header orderings and casing
# patterns (curl vs python-requests vs Go-http-client vs nmap vs
# browsers all differ). Attackers routinely spoof User-Agent but
# forget to match the stack's native header order. Bounty dedup
# collapses repeat fingerprints from the same attacker, so this
# fires once per distinct hash per source.
_quirks = _http_quirks_fingerprint(log_data, _headers)
if _quirks is not None:
await repo.add_bounty({
"decky": log_data.get("decky"),
"service": log_data.get("service"),
"attacker_ip": log_data.get("attacker_ip"),
"bounty_type": "fingerprint",
"payload": _quirks,
})
# 3. VNC client version fingerprint
_vnc_ver = _fields.get("client_version")
if _vnc_ver and log_data.get("event_type") == "version":
@@ -593,3 +610,144 @@ def _detect_ip_leak(
"method": log_data.get("fields", {}).get("method"),
}
# ─── HTTP header quirks fingerprint ─────────────────────────────────────────
# Headers that vary with per-request content (payload-body size, cookies
# set by prior responses) and therefore aren't useful identity. Stripped
# before hashing so a tool's order fingerprint is stable across different
# targets/sessions.
_VOLATILE_HEADERS = frozenset({
"content-length",
"cookie",
"authorization",
"x-forwarded-for", # carries attacker-dependent values
"forwarded",
"x-real-ip",
"true-client-ip",
"cf-connecting-ip",
"x-request-id",
"x-correlation-id",
"x-amzn-trace-id",
})
# Distinctive order signatures for common tools. The match is on the
# lowercased-name list MINUS the volatile set. A prefix match wins —
# many tools tack on "User-Agent / Accept-Encoding / Accept" in the
# same order regardless of method.
_TOOL_SIGNATURES: tuple[tuple[str, tuple[str, ...]], ...] = (
# curl sends: Host, User-Agent, Accept, <body-headers>.
("curl", ("host", "user-agent", "accept")),
# python-requests: User-Agent, Accept-Encoding, Accept, Connection, Host.
("python-requests", ("host", "user-agent", "accept-encoding", "accept", "connection")),
# Go-http-client: Host, User-Agent, Accept-Encoding.
("go-http-client", ("host", "user-agent", "accept-encoding")),
# nmap http-enum / http-* scripts: short, Host+User-Agent ordering.
("nmap-nse", ("host", "user-agent")),
# Nikto / Nuclei send distinctive Accept-Language preferences — treat
# User-Agent check as the secondary signal elsewhere; order alone is
# ambiguous here.
)
def _casing_category(name: str) -> str:
"""Classify a header-name casing pattern.
Real HTTP clients and stacks pick one convention and stick to it:
browsers send `Title-Case`; python-requests sends `Title-Case`;
Go's stdlib canonicalises to `Title-Case`; curl sends literal
`Title-Case`; nmap/masscan often send `lowercase`; custom scanners
sometimes send `UPPERCASE`.
"""
if not name:
return "empty"
if name == name.upper():
return "upper"
if name == name.lower():
return "lower"
# "Title-Case" test: each dash-separated token starts with an
# uppercase; trailing chars (if any) must be lowercase. Single-
# letter tokens like the `X` in `X-Forwarded-For` qualify when
# uppercase — "".islower() is False in Python so the naive form
# of this test misfires.
parts = [p for p in name.split("-") if p]
if parts and all(
p[:1].isupper() and (len(p) == 1 or p[1:].islower())
for p in parts
):
return "title"
return "mixed"
def _short_hash(value: str) -> str:
"""16-hex-char SHA-256 prefix — stable identity, short display."""
import hashlib
return hashlib.sha256(value.encode("utf-8")).hexdigest()[:16]
def _guess_tool_from_order(lowered: list[str]) -> Optional[str]:
    """Guess the client tool from a lowercased header-name sequence.

    Args:
        lowered: Header names, lowercased, in the order they were sent.

    Returns:
        The name of the first ``_TOOL_SIGNATURES`` entry whose signature
        is a prefix of ``lowered``, or ``None`` when no entry matches.
    """
    observed = tuple(lowered)
    # Slicing a sequence shorter than the signature yields a shorter
    # tuple, which can never equal the signature — so no explicit length
    # check is needed.
    return next(
        (
            tool
            for tool, signature in _TOOL_SIGNATURES
            if observed[: len(signature)] == signature
        ),
        None,
    )
def _http_quirks_fingerprint(
    log_data: dict[str, Any], headers: dict[str, Any],
) -> Optional[dict[str, Any]]:
    """Build an HTTP request-header quirks fingerprint.

    Args:
        log_data: The log record. ``service`` must equal ``"http"``;
            ``fields`` (when it is a dict) supplies ``method`` and
            ``path`` for the payload.
        headers: Request headers keyed by header name, in the order they
            are to be fingerprinted. Non-string keys are ignored.

    Returns:
        ``None`` when the service is not ``"http"``, when ``headers`` is
        not a non-empty dict, or when it has no string keys. Otherwise a
        dict with:

        * ``fingerprint_type`` — always ``"http_quirks"``.
        * ``order_hash`` — short hash of the lowercased header names,
          volatile headers removed, joined by newlines.
        * ``order`` — the original-case header names with volatile
          headers removed.
        * ``casing_hash`` — short hash of the per-header casing
          categories of those same names.
        * ``casing_category`` — the single category shared by all stable
          headers, ``"mixed"`` if they differ, ``"empty"`` if every
          header was volatile.
        * ``header_count`` / ``stable_count`` — number of names before /
          after removing volatile headers.
        * ``tool_guess`` — best-effort tool name, or ``None``.
        * ``duplicates`` — names occurring more than once, or ``None``.
        * ``method`` / ``path`` — copied from ``log_data["fields"]``.
    """
    from collections import Counter

    if log_data.get("service") != "http":
        return None
    if not isinstance(headers, dict) or not headers:
        return None

    # Dict iteration preserves insertion order (Python 3.7+ guarantee).
    names_full: list[str] = [k for k in headers if isinstance(k, str)]
    if not names_full:
        return None

    # Volatile headers are dropped for both hashes AND for the returned
    # "order" list; only "header_count" still reflects the full set.
    names_stable = [n for n in names_full if n.lower() not in _VOLATILE_HEADERS]
    lowered = [n.lower() for n in names_stable]
    order_hash = _short_hash("\n".join(lowered))

    casing_per_header = [_casing_category(n) for n in names_stable]
    casing_hash = _short_hash("\n".join(casing_per_header))

    # One at-a-glance category: the shared category when all stable
    # headers agree, "mixed" when they do not, "empty" when none are left.
    categories = set(casing_per_header)
    if not categories:
        dominant = "empty"
    elif len(categories) == 1:
        dominant = next(iter(categories))
    else:
        dominant = "mixed"

    # Duplicate detection. Dict keys are unique, so this yields nothing
    # today; it is kept for a possible future list-based header input.
    # Counter keeps first-seen order, so the result is deterministic
    # (unlike iterating a set of strings) and is computed in one pass.
    name_counts = Counter(names_full)
    duplicates = [n for n, seen in name_counts.items() if seen > 1]

    # "fields" may be missing, None, or not a dict; treat all of those
    # as "no method/path available" instead of raising.
    fields = log_data.get("fields")
    if not isinstance(fields, dict):
        fields = {}

    return {
        "fingerprint_type": "http_quirks",
        "order_hash": order_hash,
        "order": names_stable,
        "casing_hash": casing_hash,
        "casing_category": dominant,
        "header_count": len(names_full),
        "stable_count": len(names_stable),
        "tool_guess": _guess_tool_from_order(lowered),
        "duplicates": duplicates or None,
        "method": fields.get("method"),
        "path": fields.get("path"),
    }