DECNET/decnet/web/db/models/attacker_intel.py

"""Threat-intel enrichment row — one per attacker (keyed by attacker UUID), TTL-cached."""
import json as _json
from datetime import datetime, timezone
from typing import Any, Optional

from sqlalchemy import Column
from sqlmodel import Field, SQLModel

from ._base import _BIG_TEXT


def _decode_json_list(value: Any) -> list[Any]:
    if isinstance(value, list):
        return value
    if isinstance(value, str) and value:
        try:
            decoded = _json.loads(value)
        except (_json.JSONDecodeError, TypeError):
            return []
        return decoded if isinstance(decoded, list) else []
    return []


class AttackerIntel(SQLModel, table=True):
    """Aggregated threat-intel verdict for one attacker.

    Rows are written by the ``decnet enrich`` worker after it queries
    several free-tier intel providers (GreyNoise Community, AbuseIPDB,
    abuse.ch Feodo Tracker + ThreatFox). ``attacker_uuid`` is unique, so
    there is at most one row per attacker; ``attacker_ip`` only records
    the address the providers were queried with. ``expires_at`` gives
    the row a TTL so that re-firings inside the cache window
    short-circuit before any HTTP egress.

    Every per-provider column stays nullable (or at its empty JSON
    default) until that provider has answered. On a partial failure the
    enrichment pass writes the providers that succeeded and leaves the
    remaining columns unchanged.

    ``schema_version`` is persisted from day one: federation gossip in
    v2/v3 requires cross-operator compatibility, and retrofitting a
    version column once rows exist is painful. Same rationale as
    ``schema_version`` on :class:`AttackerIdentity`.
    """

    __tablename__ = "attacker_intel"

    # ── Identity ─────────────────────────────────────────────────────────
    # Row id: uuid.uuid4().hex, generated by writer.
    uuid: str = Field(primary_key=True)
    # Canonical key. The FK prevents orphan rows when an attacker is
    # deleted; UNIQUE enforces one intel row per attacker and keeps
    # upserts honest.
    attacker_uuid: str = Field(foreign_key="attackers.uuid", unique=True, index=True)
    # DENORMALISED — NOT a key. Holds the IP the worker queried providers
    # with at write time, refreshed on every upsert if the attacker rotates
    # IPs. Handy for SIEM payloads and audit lookups, but never look rows up
    # by it: ``attacker_uuid`` is the only canonical identifier here.
    attacker_ip: str = Field(index=True)
    schema_version: int = Field(default=1)

    # ── GreyNoise Community ─────────────────────────────────────────────
    # classification ∈ {"benign", "malicious", "suspicious", "unknown"}.
    # Tags are not returned by the Community endpoint, so ``greynoise_tags``
    # stays empty unless an operator wires a non-Community provider.
    greynoise_classification: Optional[str] = Field(
        default=None,
        max_length=32,
    )
    greynoise_name: Optional[str] = Field(
        default=None,
        max_length=128,
    )
    # JSON list[str] — behavioral / actor tags.
    greynoise_tags: str = Field(
        default="[]",
        sa_column=Column(
            "greynoise_tags",
            _BIG_TEXT,
            nullable=False,
            default="[]",
        ),
    )
    # JSON object text; defaults to "{}".
    greynoise_raw: str = Field(
        default="{}",
        sa_column=Column(
            "greynoise_raw",
            _BIG_TEXT,
            nullable=False,
            default="{}",
        ),
    )
    greynoise_queried_at: Optional[datetime] = Field(default=None)

    # ── AbuseIPDB ────────────────────────────────────────────────────────
    # Abuse confidence score, 0..100.
    abuseipdb_score: Optional[int] = Field(default=None)
    # JSON list[int] — flattened set of categories across recent reports.
    abuseipdb_categories: str = Field(
        default="[]",
        sa_column=Column(
            "abuseipdb_categories",
            _BIG_TEXT,
            nullable=False,
            default="[]",
        ),
    )
    # JSON object text; defaults to "{}".
    abuseipdb_raw: str = Field(
        default="{}",
        sa_column=Column(
            "abuseipdb_raw",
            _BIG_TEXT,
            nullable=False,
            default="{}",
        ),
    )
    abuseipdb_queried_at: Optional[datetime] = Field(default=None)

    # ── abuse.ch Feodo Tracker ───────────────────────────────────────────
    feodo_listed: Optional[bool] = Field(default=None)
    feodo_malware_family: Optional[str] = Field(
        default=None,
        max_length=64,
    )
    # JSON object text; defaults to "{}".
    feodo_raw: str = Field(
        default="{}",
        sa_column=Column(
            "feodo_raw",
            _BIG_TEXT,
            nullable=False,
            default="{}",
        ),
    )
    feodo_queried_at: Optional[datetime] = Field(default=None)

    # ── abuse.ch ThreatFox ───────────────────────────────────────────────
    # A ThreatFox lookup returns a list of matches for the queried IP.
    # Every match carries a ``threat_type`` (botnet_cc / payload_delivery /
    # payload / cc_skimming) and an ``ioc_type`` (url / domain / ip:port /
    # hash variants). The unique sets across all matches are flattened
    # into the columns below. The IntelLifter keys ATT&CK techniques on
    # ``threat_type``, the canonical taxonomy field per ThreatFox's API.
    threatfox_listed: Optional[bool] = Field(default=None)
    # JSON list[str]
    threatfox_threat_types: str = Field(
        default="[]",
        sa_column=Column(
            "threatfox_threat_types",
            _BIG_TEXT,
            nullable=False,
            default="[]",
        ),
    )
    # JSON list[str]
    threatfox_ioc_types: str = Field(
        default="[]",
        sa_column=Column(
            "threatfox_ioc_types",
            _BIG_TEXT,
            nullable=False,
            default="[]",
        ),
    )
    # JSON list[str]
    threatfox_malware_families: str = Field(
        default="[]",
        sa_column=Column(
            "threatfox_malware_families",
            _BIG_TEXT,
            nullable=False,
            default="[]",
        ),
    )
    # JSON object text; defaults to "{}".
    threatfox_raw: str = Field(
        default="{}",
        sa_column=Column(
            "threatfox_raw",
            _BIG_TEXT,
            nullable=False,
            default="{}",
        ),
    )
    threatfox_queried_at: Optional[datetime] = Field(default=None)

    # ── Aggregate verdict ────────────────────────────────────────────────
    # Synthesised from the per-provider columns.
    # ∈ {"malicious", "suspicious", "benign", "unknown"}. Exists for the
    # dashboard and webhook consumers that don't want to reason over four
    # provider columns.
    aggregate_verdict: Optional[str] = Field(default=None, max_length=32, index=True)

    # ── TTL bookkeeping ──────────────────────────────────────────────────
    # ``cached_at`` defaults to the current UTC time at construction;
    # ``expires_at`` has no default and must be supplied by the writer.
    cached_at: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc),
        index=True,
    )
    expires_at: datetime = Field(index=True)

    def to_intel_event_payload(
        self,
        *,
        providers: Optional[list[str]] = None,
    ) -> dict[str, Any]:
        """Build the dict payload that the IntelLifter consumes.

        Two callers use this projection: the intel worker when it
        publishes ``attacker.intel.enriched`` live, and the TTP worker
        when it catches up on ``attacker.session.ended``. Because both go
        through this single method, the same row yields the same payload
        on either path, which keeps IntelLifter tag UUIDs deterministic.

        Only the identity columns, the aggregate verdict and the
        per-provider summary columns are emitted. JSON-list columns are
        decoded to Python lists via ``_decode_json_list``; raw provider
        blobs, ``*_queried_at`` timestamps, ``schema_version`` and the TTL
        columns are left out.

        Args:
            providers: Names of the providers that contributed, when the
                caller knows them (the intel worker does). The TTP
                catch-up path omits it; the IntelLifter does not
                predicate on ``providers``. The list is stored in the
                payload as passed, not copied.

        Returns:
            The payload dict. It contains a ``"providers"`` key only when
            ``providers`` is not ``None``.
        """
        identity: dict[str, Any] = {
            "attacker_uuid": self.attacker_uuid,
            "attacker_ip": self.attacker_ip,
            "aggregate_verdict": self.aggregate_verdict,
        }
        abuseipdb: dict[str, Any] = {
            "abuseipdb_score": self.abuseipdb_score,
            "abuseipdb_categories": _decode_json_list(self.abuseipdb_categories),
        }
        greynoise: dict[str, Any] = {
            "greynoise_classification": self.greynoise_classification,
            "greynoise_name": self.greynoise_name,
            "greynoise_tags": _decode_json_list(self.greynoise_tags),
        }
        feodo: dict[str, Any] = {
            "feodo_listed": self.feodo_listed,
            "feodo_malware_family": self.feodo_malware_family,
        }
        threatfox: dict[str, Any] = {
            "threatfox_listed": self.threatfox_listed,
            "threatfox_threat_types": _decode_json_list(self.threatfox_threat_types),
            "threatfox_ioc_types": _decode_json_list(self.threatfox_ioc_types),
            "threatfox_malware_families": _decode_json_list(
                self.threatfox_malware_families
            ),
        }

        # Merge order fixes the key order of the emitted payload:
        # identity, AbuseIPDB, GreyNoise, Feodo, ThreatFox.
        payload: dict[str, Any] = {
            **identity,
            **abuseipdb,
            **greynoise,
            **feodo,
            **threatfox,
        }
        if providers is not None:
            payload["providers"] = providers
        return payload