Brings the federation-gossip columns on AttackerIdentity to life —
ja3_hashes, hassh_hashes, and the new tls_cert_sha256 — by projecting
the union of every member observation's fingerprints JSON onto the
identity at clusterer create / link / merge time.
- decnet/profiler/identity_rollup.py: pure extract_fp_summaries()
reads the production bounty shape (payload.fingerprint_type +
payload.{ja3,hash,cert_sha256}) and returns deduped+sorted JSON
list[str] per family, or None when a family has no signal so the
column stays NULL instead of '[]'.
- BaseRepository.update_identity_fingerprints + SQLModel impl: one
idempotent write that overwrites the three summary columns and
bumps updated_at.
- ConnectedComponentsClusterer: after every per-component
reconciliation (fresh-create OR existing-merge+link), recomputes
and writes the rollup for the target identity. Wrapped in a
best-effort helper so a write failure logs but never breaks the
tick.
- Tests: extract_fp_summaries unit tests (dedup, sort determinism,
unknown types ignored, malformed JSON, nested-stringified
payloads, non-string values); end-to-end clusterer tests check
that ticks populate the columns on create and on later
observation links, and that no-fingerprint clusters keep the
columns NULL.
142 lines
5.3 KiB
Python
142 lines
5.3 KiB
Python
"""Tests for ``decnet.profiler.identity_rollup.extract_fp_summaries``.
|
|
|
|
Pure unit tests against the production bounty shape that
|
|
``decnet.profiler.worker._build_record`` writes into
|
|
``Attacker.fingerprints`` — a list of ``{bounty_type, payload, ...}``
|
|
dicts where the meaningful data lives under ``payload.fingerprint_type``.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
from decnet.profiler.identity_rollup import extract_fp_summaries
|
|
|
|
|
|
def _bounty(fp_type: str, **payload_extras: object) -> dict:
    """Build a bounty dict shaped the way the profiler writes it.

    ``fp_type`` is stored under ``payload.fingerprint_type`` and every
    extra keyword argument is merged into that same ``payload`` dict.
    The extras are unpacked last, so they win on a key collision.
    """
    return {
        "bounty_type": "fingerprint",
        "payload": {"fingerprint_type": fp_type, **payload_extras},
    }
|
|
|
|
|
|
def _row_with(*entries: dict) -> dict:
    """Wrap bounty dicts in a row dict with a JSON-encoded ``fingerprints``.

    The entries are serialised, in the order given, as a JSON array string
    stored under the ``fingerprints`` key. Calling with no entries yields
    the string ``"[]"``.
    """
    return {"fingerprints": json.dumps(list(entries))}
|
|
|
|
|
|
class TestExtractFpSummaries:
    """Unit tests for ``extract_fp_summaries``.

    Every test feeds a list of row dicts (each with an optional
    ``fingerprints`` JSON string) and inspects the returned mapping, whose
    values are either ``None`` or a JSON-encoded list of strings.
    """

    # Exact result expected whenever no row contributes a usable
    # fingerprint. Comparing against the full mapping (rather than
    # ``all(v is None ...)``) also fails if the result comes back empty,
    # which an ``all()`` over zero values would silently accept.
    _ALL_NONE = {
        "ja3_hashes": None,
        "hassh_hashes": None,
        "tls_cert_sha256": None,
    }

    def test_empty_input_returns_all_none(self) -> None:
        result = extract_fp_summaries([])
        assert result == self._ALL_NONE

    def test_single_row_single_cert(self) -> None:
        row = _row_with(_bounty("tls_certificate", cert_sha256="ab" * 32))
        result = extract_fp_summaries([row])
        assert result["ja3_hashes"] is None
        assert result["hassh_hashes"] is None
        assert json.loads(result["tls_cert_sha256"]) == ["ab" * 32]

    def test_dedupe_across_rows(self) -> None:
        sha = "ab" * 32
        a = _row_with(_bounty("tls_certificate", cert_sha256=sha))
        b = _row_with(_bounty("tls_certificate", cert_sha256=sha))
        result = extract_fp_summaries([a, b])
        assert json.loads(result["tls_cert_sha256"]) == [sha]

    def test_sorted_output_is_deterministic(self) -> None:
        a = _row_with(
            _bounty("tls_certificate", cert_sha256="ff" * 32),
            _bounty("tls_certificate", cert_sha256="11" * 32),
            _bounty("tls_certificate", cert_sha256="aa" * 32),
        )
        result = extract_fp_summaries([a])
        # Same input twice must produce byte-identical output.
        assert result == extract_fp_summaries([a])
        assert json.loads(result["tls_cert_sha256"]) == sorted(
            ["ff" * 32, "11" * 32, "aa" * 32]
        )

    def test_all_three_families_at_once(self) -> None:
        row = _row_with(
            _bounty("ja3", ja3="ja3-abc"),
            _bounty("hassh_server", hash="hassh-def"),
            _bounty("tls_certificate", cert_sha256="ab" * 32),
        )
        result = extract_fp_summaries([row])
        assert json.loads(result["ja3_hashes"]) == ["ja3-abc"]
        assert json.loads(result["hassh_hashes"]) == ["hassh-def"]
        assert json.loads(result["tls_cert_sha256"]) == ["ab" * 32]

    def test_unknown_fingerprint_type_ignored(self) -> None:
        # tcpfp / ja4l / http_quirks have no rollup column yet; they
        # must not pollute the three families that do.
        row = _row_with(
            _bounty("tcpfp", hash="tcpfp-x"),
            _bounty("ja4l", ja4l="ja4l-y"),
            _bounty("http_quirks", quirks="..."),
        )
        result = extract_fp_summaries([row])
        assert result["ja3_hashes"] is None
        assert result["hassh_hashes"] is None
        assert result["tls_cert_sha256"] is None

    def test_missing_payload_key_skipped(self) -> None:
        # tls_certificate bounty shaped like a sniffer-only payload
        # (no cert_sha256). Must not crash, must not record an entry.
        row = _row_with({
            "bounty_type": "fingerprint",
            "payload": {"fingerprint_type": "tls_certificate", "subject_cn": "x"},
        })
        result = extract_fp_summaries([row])
        assert result["tls_cert_sha256"] is None

    def test_malformed_fingerprints_json_returns_all_none(self) -> None:
        result = extract_fp_summaries([{"fingerprints": "not json"}])
        # Pin the full shape: an empty result must not pass vacuously.
        assert result == self._ALL_NONE

    def test_missing_fingerprints_field_returns_all_none(self) -> None:
        result = extract_fp_summaries([{"some_other_field": True}])
        # Pin the full shape: an empty result must not pass vacuously.
        assert result == self._ALL_NONE

    def test_payload_as_string_is_json_decoded(self) -> None:
        # Defensive: some legacy storage may have nested-stringified payloads.
        row = {
            "fingerprints": json.dumps([{
                "bounty_type": "fingerprint",
                "payload": json.dumps({
                    "fingerprint_type": "tls_certificate",
                    "cert_sha256": "cd" * 32,
                }),
            }]),
        }
        result = extract_fp_summaries([row])
        assert json.loads(result["tls_cert_sha256"]) == ["cd" * 32]

    def test_non_string_hash_values_skipped(self) -> None:
        # A numeric cert_sha256 is not a usable fingerprint and must be dropped.
        row = _row_with({
            "bounty_type": "fingerprint",
            "payload": {"fingerprint_type": "tls_certificate", "cert_sha256": 12345},
        })
        result = extract_fp_summaries([row])
        assert result["tls_cert_sha256"] is None

    def test_dedup_across_many_rows_with_overlap(self) -> None:
        rows = [
            _row_with(_bounty("ja3", ja3="ja3-shared")),
            _row_with(
                _bounty("ja3", ja3="ja3-shared"),
                _bounty("ja3", ja3="ja3-second"),
            ),
            _row_with(_bounty("ja3", ja3="ja3-third")),
        ]
        result = extract_fp_summaries(rows)
        assert json.loads(result["ja3_hashes"]) == sorted(
            ["ja3-shared", "ja3-second", "ja3-third"]
        )
|