DECNET/decnet/ttp/attack_stix.py

"""STIX 2.1 backed MITRE ATT&CK lookups.

Replaces the hand-maintained technique-name dict that used to live in
``decnet/ttp/attack_catalog.py``. Single source of truth is the
official ``enterprise-attack-<version>.json`` STIX bundle published by
MITRE; consumers (rule engine, intel lifters, web router, frontend
rollups) call the small public API below instead of reading raw STIX.

Bundle resolution order
-----------------------

1. ``DECNET_ATTACK_BUNDLE`` env var (absolute path to a JSON file).
2. ``<cache_dir>/enterprise-attack-<version>.json`` where ``<cache_dir>``
   defaults to ``~/.cache/decnet/attack`` and is overridable with
   ``DECNET_ATTACK_CACHE_DIR``.
3. Fetch from :data:`decnet.ttp.attack_version.ATTACK_BUNDLE_URL` into
   the cache dir.

In every case the loaded bytes are verified against
:data:`decnet.ttp.attack_version.ATTACK_BUNDLE_SHA256` *before* the
bundle is parsed. A mismatch raises :class:`AttackBundleError` and the
loader refuses to serve queries. This is intentional — drift between
DECNET's expected ATT&CK version and what the operator (or a tampered
mirror) actually placed on disk would silently mistag thousands of
events.

Lazy-loading: the bundle (~50 MB) is parsed on first call to any
public function, never at import time. Tests that don't touch ATT&CK
should never pay the cost.
"""
from __future__ import annotations

import hashlib
import logging
import os
import sys
from functools import lru_cache
from pathlib import Path
from threading import Lock
from typing import Final

from mitreattack.stix20 import MitreAttackData

from decnet.ttp.attack_version import (
    ATTACK_BUNDLE_SHA256,
    ATTACK_BUNDLE_URL,
    ATTACK_BUNDLE_VERSION,
)

logger = logging.getLogger(__name__)

_ENV_BUNDLE_PATH: Final[str] = "DECNET_ATTACK_BUNDLE"
_ENV_CACHE_DIR: Final[str] = "DECNET_ATTACK_CACHE_DIR"
_DEFAULT_CACHE_DIR: Final[Path] = Path.home() / ".cache" / "decnet" / "attack"

_data_lock = Lock()
_data: MitreAttackData | None = None
_loaded_path: Path | None = None


class AttackBundleError(RuntimeError):
    """Raised when the ATT&CK STIX bundle cannot be loaded or verified."""


def _cache_dir() -> Path:
    override = os.environ.get(_ENV_CACHE_DIR)
    return Path(override) if override else _DEFAULT_CACHE_DIR


def _expected_cache_path() -> Path:
    return _cache_dir() / f"enterprise-attack-{ATTACK_BUNDLE_VERSION}.json"


def _verify_sha256(path: Path) -> None:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    actual = h.hexdigest()
    if actual != ATTACK_BUNDLE_SHA256:
        raise AttackBundleError(
            f"ATT&CK bundle at {path} sha256={actual} does not match "
            f"pinned {ATTACK_BUNDLE_SHA256} (version {ATTACK_BUNDLE_VERSION}). "
            "Refusing to load — re-fetch or update attack_version.py."
        )


def _fetch_bundle(target: Path) -> None:
    import requests

    target.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Fetching ATT&CK bundle %s -> %s", ATTACK_BUNDLE_URL, target)
    tmp = target.with_suffix(target.suffix + ".part")
    try:
        resp = requests.get(ATTACK_BUNDLE_URL, timeout=60, stream=True)
        resp.raise_for_status()
        with tmp.open("wb") as f:
            for chunk in resp.iter_content(1 << 20):
                if chunk:
                    f.write(chunk)
        tmp.replace(target)
    except Exception:
        tmp.unlink(missing_ok=True)
        raise


def resolve_bundle_path() -> Path:
    """Return the verified bundle path, fetching if necessary."""
    override = os.environ.get(_ENV_BUNDLE_PATH)
    if override:
        path = Path(override)
        if not path.is_file():
            raise AttackBundleError(
                f"{_ENV_BUNDLE_PATH}={override} does not point to a file"
            )
        _verify_sha256(path)
        return path

    cached = _expected_cache_path()
    if not cached.is_file():
        _fetch_bundle(cached)
    _verify_sha256(cached)
    return cached


def _load() -> MitreAttackData:
    global _data, _loaded_path
    with _data_lock:
        if _data is not None:
            return _data
        path = resolve_bundle_path()
        _data = MitreAttackData(str(path))
        _loaded_path = path
        logger.info(
            "Loaded ATT&CK bundle version=%s path=%s",
            ATTACK_BUNDLE_VERSION,
            path,
        )
        return _data


def loaded_bundle_path() -> Path | None:
    """Return the path the bundle was loaded from, or ``None`` if not loaded yet."""
    return _loaded_path


@lru_cache(maxsize=4096)
def _attack_pattern_by_id(technique_id: str) -> dict | None:
    obj = _load().get_object_by_attack_id(technique_id, "attack-pattern")
    if obj is None:
        return None
    return dict(obj)


@lru_cache(maxsize=64)
def _tactic_by_id(tactic_id: str) -> dict | None:
    obj = _load().get_object_by_attack_id(tactic_id, "x-mitre-tactic")
    if obj is None:
        return None
    return dict(obj)


@lru_cache(maxsize=64)
def _tactic_by_short_name(short_name: str) -> dict | None:
    for obj in _load().get_tactics():
        if obj.get("x_mitre_shortname") == short_name:
            return dict(obj)
    return None


def technique_name(technique_id: str | None) -> str | None:
    """Return the canonical ATT&CK display name for *technique_id*.

    For a sub-technique (``T1059.004``) the parent is prepended so the
    rendered string matches the historical format
    ``"Command and Scripting Interpreter: Unix Shell"``. ``None`` for
    unknown IDs — callers (UI, exporter) fall back to showing the bare
    ID. Drift is caught at startup by
    :func:`assert_known_technique_ids`, so a ``None`` here in
    production means an upstream emitted an ID that wasn't on the
    validation list (likely a hot-loaded rule).
    """
    if not technique_id:
        return None
    obj = _attack_pattern_by_id(technique_id)
    if obj is None:
        return None
    name = obj.get("name")
    if "." not in technique_id or not obj.get("x_mitre_is_subtechnique"):
        return name
    parent = subtechnique_parent_name(technique_id)
    if parent is None:
        return name
    return f"{parent}: {name}"


def subtechnique_parent_name(technique_id: str) -> str | None:
    parents = _load().get_parent_technique_of_subtechnique(
        _attack_pattern_by_id(technique_id)["id"]  # type: ignore[index]
    )
    if not parents:
        return None
    return parents[0]["object"].name


def is_subtechnique(technique_id: str) -> bool:
    obj = _attack_pattern_by_id(technique_id)
    return bool(obj and obj.get("x_mitre_is_subtechnique"))


def tactic_name(tactic_id_or_short_name: str | None) -> str | None:
    """Return the tactic display name for either a ``TA0001``-style ID or a kill-chain short name."""
    if not tactic_id_or_short_name:
        return None
    if tactic_id_or_short_name.startswith("TA"):
        obj = _tactic_by_id(tactic_id_or_short_name)
    else:
        obj = _tactic_by_short_name(tactic_id_or_short_name)
    return obj.get("name") if obj else None


def tactic_id_for_short_name(short_name: str) -> str | None:
    obj = _tactic_by_short_name(short_name)
    if obj is None:
        return None
    for ref in obj.get("external_references", []):
        if ref.get("source_name") == "mitre-attack":
            return ref.get("external_id")
    return None


def kill_chain_phases(technique_id: str) -> list[str]:
    """Return the kill-chain phase short-names for a technique."""
    obj = _attack_pattern_by_id(technique_id)
    if obj is None:
        return []
    return [
        p["phase_name"]
        for p in obj.get("kill_chain_phases", [])
        if p.get("kill_chain_name") == "mitre-attack"
    ]


def technique_exists(technique_id: str) -> bool:
    return _attack_pattern_by_id(technique_id) is not None


def tactic_exists(tactic_id: str) -> bool:
    return _tactic_by_id(tactic_id) is not None


def assert_known_technique_ids(
    technique_ids: list[str] | set[str] | tuple[str, ...],
    *,
    source: str,
) -> None:
    """Raise :class:`AttackBundleError` listing any IDs missing from the bundle."""
    missing = sorted({t for t in technique_ids if t and not technique_exists(t)})
    if missing:
        raise AttackBundleError(
            f"{source}: technique IDs not present in ATT&CK Enterprise "
            f"v{ATTACK_BUNDLE_VERSION}: {missing}"
        )


def assert_known_tactic_ids(
    tactic_ids: list[str] | set[str] | tuple[str, ...],
    *,
    source: str,
    exempt: set[str] | None = None,
) -> None:
    exempt = exempt or set()
    missing = sorted(
        {t for t in tactic_ids if t and t not in exempt and not tactic_exists(t)}
    )
    if missing:
        raise AttackBundleError(
            f"{source}: tactic IDs not present in ATT&CK Enterprise "
            f"v{ATTACK_BUNDLE_VERSION}: {missing}"
        )


def _cli_fetch(print_sha: bool) -> int:
    cached = _expected_cache_path()
    if not cached.is_file():
        try:
            _fetch_bundle(cached)
        except Exception as exc:  # pragma: no cover - network failure path
            print(f"fetch failed: {exc}", file=sys.stderr)
            return 1
    if print_sha:
        h = hashlib.sha256()
        with cached.open("rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                h.update(chunk)
        print(f"{h.hexdigest()}  {cached}")
        return 0
    try:
        _verify_sha256(cached)
    except AttackBundleError as exc:
        print(str(exc), file=sys.stderr)
        return 2
    print(f"OK {cached} (version {ATTACK_BUNDLE_VERSION})")
    return 0


def main(argv: list[str] | None = None) -> int:
    import argparse

    p = argparse.ArgumentParser(prog="python -m decnet.ttp.attack_stix")
    sub = p.add_subparsers(dest="cmd", required=True)
    f = sub.add_parser("fetch", help="Fetch and verify the pinned ATT&CK bundle.")
    f.add_argument(
        "--print-sha",
        action="store_true",
        help="Print sha256 of the cached bundle (for updating attack_version.py).",
    )
    args = p.parse_args(argv)
    if args.cmd == "fetch":
        return _cli_fetch(args.print_sha)
    return 1


if __name__ == "__main__":  # pragma: no cover
    raise SystemExit(main())


__all__ = [
    "ATTACK_BUNDLE_VERSION",
    "AttackBundleError",
    "assert_known_tactic_ids",
    "assert_known_technique_ids",
    "is_subtechnique",
    "kill_chain_phases",
    "loaded_bundle_path",
    "resolve_bundle_path",
    "subtechnique_parent_name",
    "tactic_exists",
    "tactic_id_for_short_name",
    "tactic_name",
    "technique_exists",
    "technique_name",
]