feat(intel,ingester): mal_hash feed + observed_attachments table (DEBT-046)

New MalHashProvider sibling ABC (decnet/intel/base.py) since SHA-256
is a different keyspace from IntelProvider's IPs. MalwareBazaarProvider
mirrors FeodoProvider's bulk-feed shape: 24h refresh via _ensure_fresh
/ _refresh, in-memory set[str] of hex-lowercased hashes, set-membership
lookup. Auth-keyed via DECNET_MALWAREBAZAAR_AUTH_KEY; absent key
silent-no-ops the lane (single warning, no HTTP traffic).

Per-hash observations persist to a new observed_attachments table.
DECNET is a honeypot platform — every attachment hash an attacker
delivers is intel, regardless of whether anyone classified it. Verdict
is sticky: True never downgrades to False/None on subsequent
observations. Out of scope: API surface, federation export, retention.

Ingester _publish_email_received calls the provider for each attachment
sha256, sets mal_hash_match on the bus payload (omitted entirely when
the message had no attachments — keeps R0046's `is True` predicate
silent on hash-less mail, matching pre-paydown behavior), and upserts
the row regardless of provider availability.
This commit is contained in:
2026-05-03 05:56:46 -04:00
parent 03beff3840
commit 3f080f601d
13 changed files with 1135 additions and 31 deletions

View File

@@ -78,3 +78,33 @@ class IntelProvider(ABC):
entire IP. Implementations should also respect
``self._semaphore`` to bound in-flight calls.
"""
class MalHashProvider(ABC):
"""Abstract bad-hash lookup provider.
Sibling to :class:`IntelProvider` — different keyspace (file SHA-256
vs IP), different consumer (the email ingester at observation time,
not the IP-keyed intel-worker fan-out). Kept as a separate ABC so
the ``lookup(ip)`` semantics on ``IntelProvider`` stay honest.
Concrete impls today:
* :class:`decnet.intel.mal_hash.MalwareBazaarProvider` — bulk-feed
shape mirroring :class:`decnet.intel.feodo.FeodoProvider`.
Future impls (paid VirusTotal subscription, in-house allowlist) plug
in behind the same factory in :func:`decnet.intel.factory.get_mal_hash_provider`.
"""
name: str
@abstractmethod
async def is_known_bad(self, sha256: str) -> bool:
"""Return whether *sha256* is on this provider's bad-hash list.
MUST NOT raise — return ``False`` on any error (the caller is the
ingester, not a worker; an exception here would taint a totally
unrelated bus payload). The provider is responsible for logging
its own errors.
"""

View File

@@ -21,7 +21,7 @@ from __future__ import annotations
import os
from typing import List
from decnet.intel.base import IntelProvider
from decnet.intel.base import IntelProvider, MalHashProvider
_KNOWN_PROVIDERS = ("greynoise", "abuseipdb", "feodo", "threatfox")
@@ -37,6 +37,40 @@ def _provider_list() -> list[str]:
return [p.strip().lower() for p in raw.split(",") if p.strip()]
_mal_hash_singleton: MalHashProvider | None = None
_mal_hash_initialized: bool = False
def get_mal_hash_provider() -> MalHashProvider | None:
"""Return the configured malware-hash lookup provider singleton.
Sibling factory to :func:`get_intel_providers` — different keyspace
(file SHA-256 vs IP), different consumer (the email ingester at
observation time, not the IP-keyed intel-worker fan-out). Returns
``None`` only if intel is disabled wholesale; otherwise returns a
provider whose :meth:`is_known_bad` self-disables to a no-op when
``DECNET_MALWAREBAZAAR_AUTH_KEY`` is unset, so the ingester never
has to special-case "no provider configured."
"""
global _mal_hash_singleton, _mal_hash_initialized
if _mal_hash_initialized:
return _mal_hash_singleton
_mal_hash_initialized = True
if not _enabled():
_mal_hash_singleton = None
return None
from decnet.intel.mal_hash import MalwareBazaarProvider
_mal_hash_singleton = MalwareBazaarProvider()
return _mal_hash_singleton
def _reset_mal_hash_provider_for_testing() -> None:
"""Test hook — drop the singleton so the next call re-reads env."""
global _mal_hash_singleton, _mal_hash_initialized
_mal_hash_singleton = None
_mal_hash_initialized = False
def get_intel_providers() -> List[IntelProvider]:
"""Return the configured threat-intel providers.

195
decnet/intel/mal_hash.py Normal file
View File

@@ -0,0 +1,195 @@
"""MalwareBazaar bad-hash provider — bulk SHA-256 feed.
Mirrors :mod:`decnet.intel.feodo` for the refresh / TTL / set-membership
shape, but operates on the SHA-256 keyspace instead of IPs and so
implements :class:`decnet.intel.base.MalHashProvider` rather than
:class:`IntelProvider`. Keep the two ABCs disjoint — see ``base.py``.
Endpoint: ``GET https://bazaar.abuse.ch/export/csv/full/`` with
``Auth-Key: <key>`` header. Returns a ZIP'd CSV with one row per
sample; the ``sha256_hash`` column is the natural key. ~900K rows ≈
30 MB resident as a ``set[str]`` of hex-lowercased hashes.
Auth-key is read from ``DECNET_MALWAREBAZAAR_AUTH_KEY``. When unset,
the provider logs one warning at first refresh attempt and disables
itself for the process lifetime — :meth:`is_known_bad` returns ``False``
without ever making a network call. The ingester treats that the same
as "no opinion," so R0046's ``mal_hash_match`` lane stays absent on the
bus payload (which is exactly what the predicate's ``is True`` check
does today, so the silent-no-op is behaviorally identical to "lane not
shipped yet").
"""
from __future__ import annotations
import csv
import io
import os
import time
import zipfile
from typing import Optional
from decnet.intel.base import MalHashProvider
from decnet.logging import get_logger
from decnet.net.http import stealth_client
log = get_logger("intel.mal_hash")
_ENDPOINT = "https://bazaar.abuse.ch/export/csv/full/"
_DEFAULT_REFRESH_S = 86_400.0 # 24h — feed is daily, no need to hammer
_AUTH_KEY_ENV = "DECNET_MALWAREBAZAAR_AUTH_KEY"
_REFRESH_INTERVAL_ENV = "DECNET_MAL_HASH_REFRESH_INTERVAL_S"
def _read_refresh_interval() -> float:
raw = os.environ.get(_REFRESH_INTERVAL_ENV)
if raw is None:
return _DEFAULT_REFRESH_S
try:
return float(raw)
except ValueError:
log.warning(
"%s=%r not a float; falling back to default %.0f",
_REFRESH_INTERVAL_ENV, raw, _DEFAULT_REFRESH_S,
)
return _DEFAULT_REFRESH_S
class MalwareBazaarProvider(MalHashProvider):
"""Bulk SHA-256 lookup against MalwareBazaar's full export."""
name = "malwarebazaar"
def __init__(
self,
*,
auth_key: Optional[str] = None,
refresh_interval_s: Optional[float] = None,
) -> None:
self._auth_key = auth_key or os.environ.get(_AUTH_KEY_ENV) or None
self._refresh_interval_s = (
refresh_interval_s
if refresh_interval_s is not None
else _read_refresh_interval()
)
self._known: set[str] = set()
self._loaded_at: float = 0.0
self._last_error: Optional[str] = None
self._disabled_warned: bool = False
@property
def disabled(self) -> bool:
return self._auth_key is None
async def _refresh(self) -> Optional[str]:
"""Refetch the bulk feed. Returns an error string or ``None``."""
if self._auth_key is None:
return "no auth key"
try:
async with stealth_client(timeout=60.0) as client:
resp = await client.get(
_ENDPOINT, headers={"Auth-Key": self._auth_key},
)
except Exception as exc: # noqa: BLE001
return f"network: {exc}"
if resp.status_code != 200:
return f"HTTP {resp.status_code}"
body = resp.content
try:
new_known = _parse_dump(body)
except Exception as exc: # noqa: BLE001
return f"parse: {exc}"
if not new_known:
return "feed: empty"
self._known = new_known
self._loaded_at = time.monotonic()
self._last_error = None
log.info("malwarebazaar: refreshed bulk feed entries=%d", len(new_known))
return None
async def _ensure_fresh(self) -> None:
if self.disabled:
if not self._disabled_warned:
log.warning(
"R0046 mal_hash_match disabled: %s unset",
_AUTH_KEY_ENV,
)
self._disabled_warned = True
return
if (
not self._known
or (time.monotonic() - self._loaded_at) >= self._refresh_interval_s
):
err = await self._refresh()
if err:
self._last_error = err
log.warning("malwarebazaar refresh failed: %s", err)
async def is_known_bad(self, sha256: str) -> bool:
if self.disabled:
return False
try:
await self._ensure_fresh()
except Exception as exc: # noqa: BLE001
# Belt and braces: _ensure_fresh swallows refresh failures
# but a bug in there shouldn't blow up the ingester payload.
log.exception("malwarebazaar refresh raised: %s", exc)
return False
return sha256.lower() in self._known
def _parse_dump(body: bytes) -> set[str]:
"""Extract SHA-256 hashes from MalwareBazaar's full dump.
The endpoint returns a ZIP archive containing a single CSV with a
``sha256_hash`` column. Some abuse.ch flavours of the same feed
family ship plain CSV instead — handle both by sniffing the magic
bytes. Hashes are lowercased; non-hex / wrong-length values are
dropped (defense in depth — we set-membership-test by exact match).
"""
if body[:2] == b"PK":
with zipfile.ZipFile(io.BytesIO(body)) as zf:
csv_names = [n for n in zf.namelist() if n.lower().endswith(".csv")]
if not csv_names:
raise ValueError("zip has no .csv member")
with zf.open(csv_names[0]) as fh:
csv_bytes = fh.read()
else:
csv_bytes = body
text = csv_bytes.decode("utf-8", errors="replace")
return _extract_hashes(text)
def _extract_hashes(text: str) -> set[str]:
"""Pull the ``sha256_hash`` column out of MalwareBazaar's CSV.
The dump prefaces the table with ``#``-prefixed comment lines.
Skip those, find the header row, locate the column, then read the
rest. csv.reader handles the quoting (the ``signature`` column
contains commas and is properly quoted in the dump).
"""
body_lines = [
line for line in text.splitlines()
if line and not line.lstrip().startswith("#")
]
if not body_lines:
return set()
reader = csv.reader(body_lines)
header = next(reader, None)
if not header:
return set()
norm = [h.strip().strip('"').lower() for h in header]
try:
col = norm.index("sha256_hash")
except ValueError:
# Fallback — first column is sha256 in every documented
# variant; if the header naming changes upstream we still
# capture something rather than silently emptying the set.
col = 0
out: set[str] = set()
for row in reader:
if len(row) <= col:
continue
cell = row[col].strip().strip('"').lower()
if len(cell) == 64 and all(c in "0123456789abcdef" for c in cell):
out.add(cell)
return out

View File

@@ -54,6 +54,9 @@ from .attackers import (
from .attacker_intel import (
AttackerIntel,
)
from .attachments import (
ObservedAttachment,
)
from .campaigns import (
Campaign,
CampaignsResponse,
@@ -247,6 +250,7 @@ __all__ = [
"AttackerIdentity",
"AttackerIntel",
"AttackersResponse",
"ObservedAttachment",
"SessionProfile",
"SmtpTarget",
# campaigns

View File

@@ -0,0 +1,76 @@
"""Observed-attachment intel — purpose-built table for the per-hash
keyspace of attachments delivered by attackers.
DECNET is a honeypot **platform**, not a one-off appliance. Every
attachment SHA-256 that crosses a decky is itself an artifact: it
seeds future cross-attacker correlation ("same hash, multiple
unrelated attackers? cross-decky propagation?"), feeds the EmailLifter
R0046 ``mal_hash_match`` lane with provider-attributed verdicts at
observation time, and underwrites future federation work without
locking us into a particular outbound shape today.
Per the standing rule "new use cases get their own table with UUID
PK," this is its own table — NOT a column-bag on ``attacker_intel``
(which is IP-keyed; one hash can ride many IPs) or on the email rows
(one hash can ride many emails; the cross-correlation question is
per-hash).
"""
from datetime import datetime, timezone
from typing import List, Optional
from uuid import uuid4
from sqlalchemy import JSON, Column, Index
from sqlmodel import Field, SQLModel
class ObservedAttachment(SQLModel, table=True):
"""One distinct file-attachment hash observed across the fleet.
The natural key is ``sha256``; the row is upserted per observation
via :meth:`BaseRepository.upsert_observed_attachment`. ``uuid`` is
the surrogate PK — the ingester never refers to it directly, but
future API surfaces benefit from the indirection (and from a
UUID-shaped foreign-key column once federation work lands).
"""
__tablename__ = "observed_attachments"
__table_args__ = (
Index("ix_observed_attachments_first_seen", "first_seen"),
Index("ix_observed_attachments_last_seen", "last_seen"),
Index("ix_observed_attachments_mal_hash_match", "mal_hash_match"),
)
uuid: str = Field(default_factory=lambda: str(uuid4()), primary_key=True)
sha256: str = Field(unique=True, index=True, max_length=64)
first_seen: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
)
last_seen: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
)
observation_count: int = Field(default=1)
first_seen_decky_uuid: Optional[str] = Field(default=None, index=True)
first_seen_attacker_uuid: Optional[str] = Field(default=None, index=True)
last_seen_attacker_uuid: Optional[str] = Field(default=None, index=True)
# Native JSON list[str] — every distinct file extension this hash has
# been delivered as. One hash, multiple extensions = obfuscation
# signal worth keeping. Per the standing typed-evidence rule:
# default_factory, not default=[].
extensions: List[str] = Field(
default_factory=list,
sa_column=Column(JSON, nullable=False, default=list),
)
first_subject: Optional[str] = Field(default=None)
# Verdict captured at observation time. ``None`` = no provider has
# classified yet. ``True`` is sticky — once any provider says
# "known bad," subsequent ``None``/``False`` observations don't
# downgrade the verdict (a hash a feed later forgets is still a
# hash that feed once flagged).
mal_hash_match: Optional[bool] = Field(default=None)
mal_hash_match_provider: Optional[str] = Field(
default=None, max_length=64,
)
mal_hash_match_at: Optional[datetime] = Field(default=None)

View File

@@ -313,6 +313,27 @@ class BaseRepository(ABC):
"""Retrieve the keystroke-dynamics profile row for a session."""
pass
async def upsert_observed_attachment(
self,
*,
sha256: str,
decky_uuid: Optional[str],
attacker_uuid: Optional[str],
extension: Optional[str],
subject: Optional[str],
mal_hash_match: Optional[bool],
mal_hash_match_provider: Optional[str],
) -> str:
"""Record one observation of *sha256* against ``observed_attachments``.
Returns the row UUID. Verdict semantics: ``True`` is sticky;
once set, subsequent ``False`` / ``None`` observations don't
downgrade. See :class:`ObservedAttachment` for the full column
list and the rationale (DECNET as a honeypot platform — every
delivered hash is intel, even before any provider classifies).
"""
raise NotImplementedError
@abstractmethod
async def upsert_attacker_intel(self, data: dict[str, Any]) -> str:
"""Insert or update the threat-intel row for an attacker UUID.

View File

@@ -44,6 +44,7 @@ from decnet.web.db.sqlmodel_repo.deckies import DeckiesMixin
from decnet.web.db.sqlmodel_repo.fleet import FleetMixin
from decnet.web.db.sqlmodel_repo.identities import IdentitiesMixin
from decnet.web.db.sqlmodel_repo.logs import LogsMixin
from decnet.web.db.sqlmodel_repo.observed_attachments import ObservedAttachmentsMixin
from decnet.web.db.sqlmodel_repo.orchestrator import OrchestratorMixin
from decnet.web.db.sqlmodel_repo.realism import RealismMixin
from decnet.web.db.sqlmodel_repo.swarm import SwarmMixin
@@ -65,6 +66,7 @@ class SQLModelRepository(
FleetMixin,
IdentitiesMixin,
LogsMixin,
ObservedAttachmentsMixin,
OrchestratorMixin,
RealismMixin,
SwarmMixin,

View File

@@ -0,0 +1,108 @@
"""Repo mixin for the ``observed_attachments`` table.
Composed onto :class:`SQLModelRepository` alongside the existing
per-domain mixins. The single public method is an upsert: if the
sha256 isn't there, insert with ``observation_count=1`` and the
caller's anchor metadata; otherwise increment ``observation_count``,
roll forward ``last_seen`` and ``last_seen_attacker_uuid``, dedupe a
new ``extension`` into ``extensions``, and stick the
``mal_hash_match`` verdict if either the row had no verdict or the
caller is upgrading ``False/None`` to ``True``.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Optional
from sqlalchemy import select
from decnet.web.db.models import ObservedAttachment
from decnet.web.db.sqlmodel_repo._helpers import _MixinBase
class ObservedAttachmentsMixin(_MixinBase):
"""Mixin: composed onto ``SQLModelRepository``."""
async def upsert_observed_attachment(
self,
*,
sha256: str,
decky_uuid: Optional[str],
attacker_uuid: Optional[str],
extension: Optional[str],
subject: Optional[str],
mal_hash_match: Optional[bool],
mal_hash_match_provider: Optional[str],
) -> str:
"""Record one observation of *sha256*. Returns the row ``uuid``.
Verdict semantics:
* Row has no verdict (``None``) → write whatever the caller has,
including ``None`` (no-op) or ``False`` (provider checked and
said clean).
* Row already has ``False`` → upgrade to ``True`` if the caller
says so; otherwise leave alone.
* Row already has ``True`` → never downgrade. A hash a feed
later forgets is still a hash that feed once flagged.
"""
sha = sha256.lower()
ext = extension.lower() if extension else None
now = datetime.now(timezone.utc)
async with self._session() as session:
stmt = select(ObservedAttachment).where(
ObservedAttachment.sha256 == sha,
)
row = (await session.execute(stmt)).scalar_one_or_none()
if row is None:
row = ObservedAttachment(
sha256=sha,
first_seen=now,
last_seen=now,
observation_count=1,
first_seen_decky_uuid=decky_uuid,
first_seen_attacker_uuid=attacker_uuid,
last_seen_attacker_uuid=attacker_uuid,
extensions=[ext] if ext else [],
first_subject=subject,
mal_hash_match=mal_hash_match,
mal_hash_match_provider=(
mal_hash_match_provider
if mal_hash_match is not None
else None
),
mal_hash_match_at=(
now if mal_hash_match is not None else None
),
)
session.add(row)
await session.commit()
await session.refresh(row)
return row.uuid
row.observation_count = (row.observation_count or 0) + 1
row.last_seen = now
if attacker_uuid:
row.last_seen_attacker_uuid = attacker_uuid
if ext:
exts = list(row.extensions or [])
if ext not in exts:
exts.append(ext)
row.extensions = exts
# Verdict: only write if the row had no opinion, or the
# caller is upgrading to True. Never downgrade True.
if mal_hash_match is True and row.mal_hash_match is not True:
row.mal_hash_match = True
row.mal_hash_match_provider = mal_hash_match_provider
row.mal_hash_match_at = now
elif (
mal_hash_match is not None
and row.mal_hash_match is None
):
row.mal_hash_match = mal_hash_match
row.mal_hash_match_provider = mal_hash_match_provider
row.mal_hash_match_at = now
session.add(row)
await session.commit()
return row.uuid

View File

@@ -714,10 +714,12 @@ async def _publish_email_received(
attachment_manifest = []
if not isinstance(attachment_manifest, list):
attachment_manifest = []
attachment_sha256s = [
entry.get("sha256") for entry in attachment_manifest
if isinstance(entry, dict) and isinstance(entry.get("sha256"), str)
and entry.get("sha256")
attachment_sha256s: list[str] = [
sha for sha in (
entry.get("sha256") for entry in attachment_manifest
if isinstance(entry, dict)
)
if isinstance(sha, str) and sha
]
try:
urls = json.loads(fields.get("urls_json") or "[]")
@@ -761,6 +763,60 @@ async def _publish_email_received(
except (TypeError, ValueError):
body_base64_bytes = 0
# Per-hash mal-hash lookup + ObservedAttachment persistence. The
# boolean drops onto the bus payload as ``mal_hash_match`` so
# EmailLifter R0046's ``mal_hash_match`` lane fires; the per-hash
# observations land in ``observed_attachments`` for cross-attacker
# correlation independent of the rule's view. Field is omitted from
# the payload entirely on hash-less mail so the predicate stays
# silent (matches today's behavior).
mal_hash_match: Optional[bool] = None
if attachment_sha256s:
mal_hash_match = False
try:
from decnet.intel.factory import get_mal_hash_provider
provider = get_mal_hash_provider()
except Exception as exc: # noqa: BLE001
logger.debug("mal_hash provider unavailable: %s", exc)
provider = None
provider_name = provider.name if provider is not None else None
for sha in attachment_sha256s:
verdict: Optional[bool] = None
if provider is not None:
try:
verdict = await provider.is_known_bad(sha)
except Exception as exc: # noqa: BLE001
logger.debug("mal_hash lookup failed for %s: %s", sha, exc)
verdict = None
if verdict is True:
mal_hash_match = True
ext = next(
(
str(entry.get("extension") or "").lower()
for entry in attachment_manifest
if isinstance(entry, dict)
and entry.get("sha256") == sha
and entry.get("extension")
),
None,
)
try:
await repo.upsert_observed_attachment(
sha256=sha,
decky_uuid=log_data.get("decky"),
attacker_uuid=attacker_uuid,
extension=ext or None,
subject=fields.get("subject"),
mal_hash_match=verdict,
mal_hash_match_provider=(
provider_name if verdict is not None else None
),
)
except Exception as exc: # noqa: BLE001
logger.debug(
"observed_attachments upsert failed for %s: %s", sha, exc,
)
payload: dict[str, Any] = {
"source_id": fields.get("msg_id") or fields.get("stored_as"),
"attacker_uuid": attacker_uuid,
@@ -795,6 +851,8 @@ async def _publish_email_received(
"stored_as": fields.get("stored_as"),
"body_sha256": fields.get("sha256"),
}
if mal_hash_match is not None:
payload["mal_hash_match"] = mal_hash_match
try:
bus = get_bus(client_name="ingester-email")
await bus.connect()