feat(prober-cert): roll up fingerprints onto AttackerIdentity

Brings the federation-gossip columns on AttackerIdentity to life —
ja3_hashes, hassh_hashes, and the new tls_cert_sha256 — by projecting
the union of every member observation's fingerprints JSON onto the
identity at clusterer create / link / merge time.

- decnet/profiler/identity_rollup.py: pure extract_fp_summaries()
  reads the production bounty shape (payload.fingerprint_type +
  payload.{ja3,hash,cert_sha256}) and returns, for each fingerprint
  family, a deduplicated and sorted JSON-encoded list[str], or None
  when that family has no signal, so the column stays NULL instead
  of '[]'.
- BaseRepository.update_identity_fingerprints + SQLModel impl: one
  idempotent write that overwrites the three summary columns and
  bumps updated_at.
- ConnectedComponentsClusterer: after every per-component
  reconciliation (fresh-create OR existing-merge+link), recomputes
  and writes the rollup for the target identity. Wrapped in a
  best-effort helper so a write failure logs but never breaks the
  tick.
- Tests: extract_fp_summaries unit tests (dedup, sort determinism,
  unknown types ignored, malformed JSON, nested-stringified
  payloads, non-string values); end-to-end clusterer ticks that
  populate the columns on create and on later observation links;
  no-fingerprint clusters keep the columns NULL.
This commit is contained in:
2026-04-28 11:28:54 -04:00
parent 9ab43b4ea4
commit 72cc928ebf
6 changed files with 416 additions and 2 deletions

View File

@@ -140,14 +140,41 @@ async def repo(tmp_path):
async def _seed_attacker(
repo, ip: str, *,
ja3: str | None = None, hassh: str | None = None, asn: int | None = None,
ja3: str | None = None,
hassh: str | None = None,
asn: int | None = None,
cert_sha256: str | None = None,
) -> str:
now = datetime.now(timezone.utc)
fingerprints = []
# Two-shape fingerprint payload:
# - the "kind" entries feed the clusterer's from_attacker_row
# (test-fixture shape, line ~115 of connected_components.py)
# - the "bounty_type/payload" entries feed identity_rollup's
# extract_fp_summaries (production shape, written by the
# profiler from real bounty rows). Both shapes coexist in
# the same JSON list so the same seed exercises clustering
# AND the identity-column rollup.
fingerprints: list[dict] = []
if ja3:
fingerprints.append({"kind": "ja3", "hash": ja3})
fingerprints.append({
"bounty_type": "fingerprint",
"payload": {"fingerprint_type": "ja3", "ja3": ja3},
})
if hassh:
fingerprints.append({"kind": "hassh", "hash": hassh})
fingerprints.append({
"bounty_type": "fingerprint",
"payload": {"fingerprint_type": "hassh_server", "hash": hassh},
})
if cert_sha256:
fingerprints.append({
"bounty_type": "fingerprint",
"payload": {
"fingerprint_type": "tls_certificate",
"cert_sha256": cert_sha256,
},
})
return await repo.upsert_attacker({
"ip": ip,
"first_seen": now,
@@ -377,6 +404,70 @@ async def test_tick_links_new_observation_to_existing_identity(repo):
assert d in linked_uuids
# ─── identity fingerprint rollup ───────────────────────────────────────────
@pytest.mark.anyio
async def test_tick_rolls_up_fingerprint_columns_on_create(repo):
    """Check the rollup columns on an identity minted by a first tick.

    Two observations are seeded with the same JA3 and HASSH but with
    different certificate SHA-256 values. The tick is expected to form
    exactly one identity whose ja3_hashes / hassh_hashes each hold the
    single shared value and whose tls_cert_sha256 holds both
    certificates in sorted order.
    """
    cert_a = "ab" * 32
    cert_b = "cd" * 32
    for ip, cert in (("1.1.1.1", cert_a), ("2.2.2.2", cert_b)):
        await _seed_attacker(
            repo, ip, ja3="ja3-x", hassh="hassh-y", cert_sha256=cert,
        )

    clusterer = ConnectedComponentsClusterer()
    outcome = await clusterer.tick(repo)
    formed = outcome.identities_formed
    assert len(formed) == 1
    target_uuid = formed[0]["identity_uuid"]

    # Index every identity row by uuid so the freshly formed one can be
    # looked up directly.
    by_uuid = {}
    for row in await repo.list_all_identities():
        by_uuid[row["uuid"]] = row
    rolled = by_uuid[target_uuid]

    # The columns are stored as JSON text, hence the decode before comparing.
    assert json.loads(rolled["ja3_hashes"]) == ["ja3-x"]
    assert json.loads(rolled["hassh_hashes"]) == ["hassh-y"]
    assert json.loads(rolled["tls_cert_sha256"]) == sorted([cert_a, cert_b])
@pytest.mark.anyio
async def test_tick_rolls_up_fingerprints_on_link(repo):
    """Check that a later observation's certificate reaches the rollup.

    A first tick forms an identity from one observation. A second
    observation with the same JA3 but a different certificate is then
    seeded and a second tick is run; the original identity's
    tls_cert_sha256 column must list both certificates, sorted.
    """
    first_cert = "ab" * 32
    second_cert = "cd" * 32

    await _seed_attacker(repo, "1.1.1.1", ja3="ja3-x", cert_sha256=first_cert)
    clusterer = ConnectedComponentsClusterer()
    initial = await clusterer.tick(repo)
    target_uuid = initial.identities_formed[0]["identity_uuid"]

    # Second observation: same JA3 as the first, new certificate.
    await _seed_attacker(repo, "2.2.2.2", ja3="ja3-x", cert_sha256=second_cert)
    await clusterer.tick(repo)

    by_uuid = {}
    for row in await repo.list_all_identities():
        by_uuid[row["uuid"]] = row
    rolled = by_uuid[target_uuid]
    assert json.loads(rolled["tls_cert_sha256"]) == sorted(
        [first_cert, second_cert]
    )
@pytest.mark.anyio
async def test_tick_leaves_columns_null_when_no_fingerprints(repo):
    """Check that identities without fingerprint signal keep NULL rollups.

    Two attackers are seeded with no fingerprint arguments at all. After
    a tick, every identity row returned by the repository must have
    None in all three rollup columns -- None rather than "[]", so that
    'no signal yet' stays distinguishable from 'known empty'.
    """
    for ip in ("1.1.1.1", "2.2.2.2"):
        await _seed_attacker(repo, ip)

    clusterer = ConnectedComponentsClusterer()
    await clusterer.tick(repo)

    rollup_columns = ("ja3_hashes", "hassh_hashes", "tls_cert_sha256")
    for row in await repo.list_all_identities():
        for column in rollup_columns:
            assert row[column] is None
# ─── fixture-bound assertions (in-memory) ──────────────────────────────────