Files
DECNET/decnet/web/db/models/attackers.py
anti 69ecc4cc20 feat(models): add IPv6 link-local leak columns to Attacker + AttackerIdentity
Attacker gains five denormalized cache fields (ipv6_leak_count,
last_ipv6_leak_at, last_ipv6_link_local, last_ipv6_iid_kind,
last_ipv6_mac_oui) mirroring the rotation_count/last_rotation_at pattern.
AttackerIdentity gains ipv6_link_local_iids (JSON list[dict]) for
EUI-64-derived MAC cluster signals that survive VPN/IP rotation.
No ALTER TABLE helpers — direct SQLModel column additions per pre-v1 policy.
2026-05-17 20:12:08 -04:00

339 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Attacker core + per-attacker behavioral rows.
Per-session keystroke-dynamics fingerprints have moved out of this
module: the column-zoo ``SessionProfile`` shipped here pre-v0 was
superseded by the BEHAVE-SHELL ``observations`` table
(``decnet/web/db/models/observations.py``), which mirrors the BEHAVE
``Observation`` envelope and accepts every primitive the extractor
emits. See ``development/BEHAVE-INTEGRATION.md`` for the design and
``DEBT-036`` (stale) → ``DEBT-050`` for the paydown trail.
"""
from datetime import datetime, timezone
from typing import Any, List, Optional
from pydantic import BaseModel
from sqlalchemy import BINARY, Column, Text, UniqueConstraint
from sqlmodel import Field, SQLModel
from ._base import _BIG_TEXT
class Attacker(SQLModel, table=True):
    """
    Per-IP **observation** row. Every distinct source IP we observe gets
    one of these. The semantic role is "observation event," not "actor
    identity" — an actor rotating across N IPs produces N rows here.

    The deduped actor view lives in ``AttackerIdentity`` (one identity
    per actor; many observations per identity); the per-operation view
    lives in ``Campaign``. ``identity_id`` is set by the clusterer
    worker once it resolves which observations are the same hands.
    NULL while the clusterer hasn't run on this row yet.

    See ``development/IDENTITY_RESOLUTION.md`` for the three-level
    hierarchy rationale.
    """

    __tablename__ = "attackers"

    uuid: str = Field(primary_key=True)
    ip: str = Field(index=True)
    # Nullable FK to the resolved actor row; stays NULL until the clusterer
    # links this observation to an identity (see class docstring).
    identity_id: Optional[str] = Field(
        default=None,
        foreign_key="attacker_identities.uuid",
        index=True,
    )
    first_seen: datetime = Field(index=True)
    last_seen: datetime = Field(index=True)
    event_count: int = Field(default=0)
    service_count: int = Field(default=0)
    decky_count: int = Field(default=0)

    # JSON blobs — these grow over the attacker's lifetime. Use MEDIUMTEXT on
    # MySQL (16 MiB) for the fields that accumulate (fingerprints, commands,
    # and the deckies/services lists that are unbounded in principle).
    # Each blob is stored as an already-serialized JSON string, so the
    # defaults are the literal text "[]" rather than a Python list.
    services: str = Field(
        default="[]", sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]")
    )  # JSON list[str]
    deckies: str = Field(
        default="[]", sa_column=Column("deckies", _BIG_TEXT, nullable=False, default="[]")
    )  # JSON list[str], first-contact ordered
    traversal_path: Optional[str] = Field(
        default=None, sa_column=Column("traversal_path", Text, nullable=True)
    )  # "decky-01 → decky-03 → decky-05"
    is_traversal: bool = Field(default=False)
    bounty_count: int = Field(default=0)
    credential_count: int = Field(default=0)
    fingerprints: str = Field(
        default="[]", sa_column=Column("fingerprints", _BIG_TEXT, nullable=False, default="[]")
    )  # JSON list[dict] — bounty fingerprints
    commands: str = Field(
        default="[]", sa_column=Column("commands", _BIG_TEXT, nullable=False, default="[]")
    )  # JSON list[dict] — commands per service/decky

    # GeoIP enrichment (populated by the profiler from decnet.geoip.enrich_ip).
    # Nullable because private / loopback / IPv6 sources never resolve.
    country_code: Optional[str] = Field(default=None, max_length=2, index=True)
    country_source: Optional[str] = Field(default=None, max_length=16)

    # ASN enrichment (populated by the profiler from decnet.asn.enrich_ip).
    # Nullable for the same reasons as country_code, plus IPs not currently
    # announced in the global BGP table (e.g. CGNAT, dark space).
    asn: Optional[int] = Field(default=None, index=True)
    as_name: Optional[str] = Field(default=None, max_length=128)
    asn_source: Optional[str] = Field(default=None, max_length=16)

    # Reverse-DNS (PTR) name, one-shot resolved by the profiler at first
    # sighting. Nullable — many attackers run infra with no rDNS, and
    # private/loopback addresses never resolve. RFC 1035 limits a full
    # domain name to 255 octets, so 256 chars is a safe upper bound
    # (one char of headroom, not an exact match).
    ptr_record: Optional[str] = Field(default=None, max_length=256)

    # Substrate-rotation telemetry, maintained by
    # ``decnet.correlation.fingerprint_rotation.record_fingerprint`` whenever
    # the prober observes a new hash for an (attacker, port, probe_type)
    # triple it has seen before. Lets the dashboard render "rotated 3×
    # last 24h" without joining to AttackerFingerprintState.
    rotation_count: int = Field(default=0)
    last_rotation_at: Optional[datetime] = Field(default=None, index=True)

    # IPv6 link-local leak telemetry. Denormalized cache — authoritative
    # history lives in TTPTag rows with source_kind="ipv6_leak". Kept here
    # so the attacker drawer can render the leaked address in one query.
    ipv6_leak_count: int = Field(default=0)
    last_ipv6_leak_at: Optional[datetime] = Field(default=None, index=True)
    # 45 chars is the longest plain textual IPv6 address.
    # NOTE(review): a zone-id suffix (e.g. "fe80::1%eth0") could exceed
    # this — confirm the writer strips the zone before storing.
    last_ipv6_link_local: Optional[str] = Field(default=None, max_length=45)
    # NOTE(review): presumably the same "kind" vocabulary used in
    # AttackerIdentity.ipv6_link_local_iids entries (e.g. "eui64") — confirm.
    last_ipv6_iid_kind: Optional[str] = Field(default=None, max_length=16)
    # 8 chars fits a colon-separated OUI such as "aa:bb:cc".
    last_ipv6_mac_oui: Optional[str] = Field(default=None, max_length=8)

    # default_factory supplies a timezone-aware UTC timestamp when the model
    # instance is created. NOTE(review): no on-update hook is declared
    # here, so writers presumably have to bump this themselves — confirm.
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), index=True
    )
class AttackerFingerprintState(SQLModel, table=True):
    """Per-(attacker, port, probe_type) latest-hash row.

    Sole purpose: give the prober memory across runs so it can detect when
    an attacker's HASSH/JARM/TCP fingerprint flips for the same port — i.e.
    they rotated their VPS, rebuilt their SSH server, swapped their TLS
    cert. Diff detection lives in
    ``decnet.correlation.fingerprint_rotation``; the prober calls into
    that library inline at each emit site and this table is the only
    persistence it needs.

    Bounded by ``attackers × probe families × ports`` — small in practice;
    a busy fleet sees O(thousands) of rows, not O(millions).
    """

    __tablename__ = "attacker_fingerprint_state"

    # Surrogate primary key; the natural key is enforced separately by the
    # unique constraint in __table_args__ below.
    uuid: str = Field(primary_key=True)
    attacker_uuid: str = Field(foreign_key="attackers.uuid", index=True)
    port: int
    probe_type: str = Field(max_length=16)  # "jarm" | "hassh" | "tcpfp"
    # Most recent hash recorded for this (attacker, port, probe_type) triple.
    last_hash: str = Field(max_length=128)
    last_seen: datetime = Field(index=True)
    # NOTE(review): presumably the per-triple counterpart of
    # Attacker.rotation_count — confirm against fingerprint_rotation.
    rotation_count: int = Field(default=0)

    # At most one row per (attacker_uuid, port, probe_type).
    __table_args__ = (
        UniqueConstraint(
            "attacker_uuid", "port", "probe_type",
            name="uq_attacker_fingerprint_state_natural",
        ),
    )
class AttackerIdentity(SQLModel, table=True):
    """
    Resolved actor identity — the dedup'd "same hands" row that one or
    more ``Attacker`` observations FK into. Populated by the (future)
    clusterer worker; NULL on every observation until it runs.

    Why a separate table from ``Attacker``: an actor rotating across N
    IPs produces N observation rows but only ONE identity row. The
    identity is recovered from signals the attacker can't cheaply
    rotate — JA3, HASSH, payload hashes, C2 callbacks, and (V2)
    keystroke-rhythm SimHash. See ``development/IDENTITY_RESOLUTION.md``.

    All clusterer-populated fields are nullable; the table ships empty
    in the schema-only PR (commit 1) and stays empty until the
    clusterer lands. Empty is valid.

    ``schema_version`` is non-negotiable from day one. Federation
    gossip in V2 will share identity vectors across operators;
    bumping feature definitions without a version field silently
    poisons receivers.
    """

    __tablename__ = "attacker_identities"

    uuid: str = Field(primary_key=True)
    schema_version: int = Field(default=1)

    # Set by the campaign clusterer. The ``campaigns`` table now
    # exists; this is a real FK. Nullable until the campaign clusterer
    # has run on this identity row.
    campaign_id: Optional[str] = Field(
        default=None, foreign_key="campaigns.uuid", index=True
    )

    first_seen_at: Optional[datetime] = Field(default=None, index=True)
    last_seen_at: Optional[datetime] = Field(default=None, index=True)
    # Both timestamps default to a timezone-aware UTC "now" when the model
    # instance is created. NOTE(review): updated_at declares no on-update
    # hook here, so writers presumably have to refresh it — confirm.
    created_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), index=True
    )
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), index=True
    )

    # Identity-cohesion score from the clusterer. Range [0, 1]; null
    # until the clusterer writes. Higher = more confident the
    # observations linked to this identity are the same hands.
    confidence: Optional[float] = Field(default=None)

    # Denormalized count of FK'd Attacker rows. Maintained by the
    # clusterer when it links/unlinks. Cheap dashboard read.
    observation_count: int = Field(default=0)

    # Fingerprint summary columns. JSON-serialized list[str] in TEXT
    # because: (a) federation gossip wants this exact shape on the
    # wire, (b) MySQL can't index BLOB/TEXT without prefix lengths,
    # (c) actors can present multiple JA3/HASSH values across tools
    # so a scalar column is wrong.
    ja3_hashes: Optional[str] = Field(
        default=None, sa_column=Column("ja3_hashes", Text, nullable=True)
    )
    hassh_hashes: Optional[str] = Field(
        default=None, sa_column=Column("hassh_hashes", Text, nullable=True)
    )
    # NOTE(review): the next three columns have no shape comment of their
    # own; presumably JSON list[str] like ja3_hashes/hassh_hashes — confirm.
    ja4h_hashes: Optional[str] = Field(
        default=None, sa_column=Column("ja4h_hashes", Text, nullable=True)
    )
    ja4_quic_hashes: Optional[str] = Field(
        default=None, sa_column=Column("ja4_quic_hashes", Text, nullable=True)
    )
    http_versions_seen: Optional[str] = Field(
        default=None, sa_column=Column("http_versions_seen", Text, nullable=True)
    )

    # JSON list[str] — SHA-256 fingerprints of leaf certs presented by
    # attacker-run TLS servers, captured by the active prober alongside
    # JARM. Same federation-gossip rationale as ja3_hashes/hassh_hashes:
    # a self-signed cert reused across C2 nodes is an instant cluster-link
    # signal, and TEXT keeps MySQL indexable via prefix length.
    tls_cert_sha256: Optional[str] = Field(
        default=None, sa_column=Column("tls_cert_sha256", Text, nullable=True)
    )

    # JSON list[dict] — observed IPv6 link-local IIDs per identity.
    # EUI-64-derived MACs survive VPN/IP rotation and are a stronger
    # cluster signal than any rotatable TLS/SSH hash. Federation gossip
    # (V2) will share these. Shape per entry:
    #   {"iid": "fe80::aabb:ccff:fedd:eeff", "oui": "aa:bb:cc",
    #    "kind": "eui64", "first_seen": "<ISO8601>"}
    ipv6_link_local_iids: Optional[str] = Field(
        default=None, sa_column=Column("ipv6_link_local_iids", Text, nullable=True)
    )

    # Payload SimHash list — 64-bit ints serialized as hex strings.
    # SimHashes are Hamming-comparable, which is the entire reason
    # they're a list (not a set).
    payload_simhashes: Optional[str] = Field(
        default=None, sa_column=Column("payload_simhashes", Text, nullable=True)
    )

    # NOTE(review): no shape is documented for this column; presumably a
    # JSON list of C2 callback endpoints (see class docstring) — confirm.
    c2_endpoints: Optional[str] = Field(
        default=None, sa_column=Column("c2_endpoints", Text, nullable=True)
    )

    # V2 keystroke-dynamics hook. Per-identity centroid (or majority
    # vote) across the identity's session-level digraph SimHashes.
    # The per-session SimHashes themselves now ride as BEHAVE
    # observations (``cognitive.*`` digraph primitive — see
    # ``development/BEHAVE-INTEGRATION.md`` and the BEHAVE-SHELL
    # registry); this column is the rollup the (future) attribution
    # engine will write into so the federation gossip layer
    # has one identity-level fingerprint to compare across operators.
    # BINARY(8) so MySQL can index without a prefix length.
    kd_digraph_simhash: Optional[bytes] = Field(
        default=None,
        sa_column=Column("kd_digraph_simhash", BINARY(8), nullable=True, index=True),
    )

    # Soft-merge audit trail. When the clusterer collapses two
    # identities, the loser's row stays in place with this set to the
    # winner's UUID — preserves the audit trail without orphaning FKs
    # from any cached subscribers. Resolvers (e.g.
    # GET /identities/{uuid}) follow the chain and surface the winner.
    # Self-referential FK: points back into this same table.
    merged_into_uuid: Optional[str] = Field(
        default=None, foreign_key="attacker_identities.uuid", index=True
    )

    # Operator-editable free-form notes — annotation surface for human
    # analysts ("known APT-XX cluster," "matches MISP event 1234").
    notes: Optional[str] = Field(
        default=None, sa_column=Column("notes", Text, nullable=True)
    )
class AttackerBehavior(SQLModel, table=True):
    """
    Timing & behavioral profile for an attacker, joined to Attacker by uuid.

    Kept in a separate table so the core Attacker row stays narrow and
    behavior data can be updated independently (e.g. as the sniffer observes
    more packets) without touching the event-count aggregates.
    """

    __tablename__ = "attacker_behavior"

    # The primary key is also the FK to attackers.uuid, so there is at most
    # one behavior row per attacker.
    attacker_uuid: str = Field(primary_key=True, foreign_key="attackers.uuid")

    # OS / TCP stack fingerprint (rolled up from sniffer events)
    os_guess: Optional[str] = None
    hop_distance: Optional[int] = None
    tcp_fingerprint: str = Field(
        default="{}",
        sa_column=Column("tcp_fingerprint", Text, nullable=False, default="{}"),
    )  # JSON: window, wscale, mss, options_sig

    # Raw SSH KEX algorithm preference strings observed across HASSH probes
    # (one entry per hassh_fingerprint event). Keeping the raw ordered list
    # enables post-hoc KEX-order fingerprinting beyond the HASSH hash.
    kex_order_raw: Optional[str] = Field(
        default=None,
        sa_column=Column("kex_order_raw", Text, nullable=True),
    )  # JSON list[str] — kex_algorithms comma-separated strings

    # Sniffer-observed SSH client identification strings (RFC 4253 §4.2),
    # deduped in observation order. Captures the attacker's SSH client
    # software (e.g. "SSH-2.0-OpenSSH_9.2p1", "SSH-2.0-libssh2_1.10.0").
    ssh_client_banners: Optional[str] = Field(
        default=None,
        sa_column=Column("ssh_client_banners", Text, nullable=True),
    )  # JSON list[str]
    retransmit_count: int = Field(default=0)

    # Behavioral (derived by the profiler from log-event timing)
    behavior_class: Optional[str] = None  # beaconing | interactive | scanning | brute_force | slow_scan | mixed | unknown
    beacon_interval_s: Optional[float] = None
    beacon_jitter_pct: Optional[float] = None
    # NOTE(review): unlike the other JSON columns in this table, this one
    # declares no explicit Text sa_column — confirm the default string
    # column type is wide enough for a long tool list on MySQL.
    tool_guesses: Optional[str] = None  # JSON list[str] — all matched tools
    timing_stats: str = Field(
        default="{}",
        sa_column=Column("timing_stats", Text, nullable=False, default="{}"),
    )  # JSON: mean/median/stdev/min/max IAT
    phase_sequence: str = Field(
        default="{}",
        sa_column=Column("phase_sequence", Text, nullable=False, default="{}"),
    )  # JSON: recon_end/exfil_start/latency

    # Defaults to a timezone-aware UTC "now" when the model instance is
    # created. NOTE(review): no on-update hook is declared here — writers
    # presumably refresh it.
    updated_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), index=True
    )
class SmtpTarget(SQLModel, table=True):
    """
    Per-attacker list of victim domains observed via the SMTP honeypots.

    Each row is one (attacker_uuid, domain) pair — an attacker who relays
    mail to 500 addresses at acme.com collapses into a single row with
    count=500. Only the *domain* is stored; local-parts (the bit before
    `@`) are dropped at ingestion, so this table contains no PII beyond
    the target organisation's identity.

    Shape is designed for future V2 federation gossip: the
    `smtp_target_seen(domain)` query returns aggregate counts with zero
    cross-org attacker leakage — each operator can answer "have you seen
    this domain being targeted?" without exposing *which* attackers did.
    """

    __tablename__ = "smtp_targets"

    # Integer surrogate key; default=None leaves the value unset on new
    # instances (presumably assigned by the database on insert).
    id: Optional[int] = Field(default=None, primary_key=True)
    attacker_uuid: str = Field(foreign_key="attackers.uuid", index=True)
    domain: str = Field(index=True)
    # Both timestamps default to a timezone-aware UTC "now" when the model
    # instance is created; only last_seen is indexed.
    first_seen: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
    last_seen: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc), index=True
    )
    # Aggregate counter — one rcpt_to / message_accepted recipient bumps this.
    # Starts at 1 because a row only exists once a recipient has been seen.
    count: int = Field(default=1)

    # Enforces the one-row-per-(attacker_uuid, domain) shape described in
    # the class docstring.
    __table_args__ = (
        UniqueConstraint("attacker_uuid", "domain", name="uq_smtp_targets_attacker_domain"),
    )
class AttackersResponse(BaseModel):
    """Response envelope: three integer counters plus a list of dict records.

    The ``total`` / ``limit`` / ``offset`` field names suggest a paginated
    listing (presumably of attacker rows); ``data`` carries the records as
    free-form ``dict[str, Any]`` items rather than a typed model.
    """

    total: int
    limit: int
    offset: int
    # Builtin generic spelling, consistent with the ``dict[...]`` it wraps.
    data: list[dict[str, Any]]