feat(intel,ingester): mal_hash feed + observed_attachments table (DEBT-046)

New MalHashProvider sibling ABC (decnet/intel/base.py) since SHA-256
is a different keyspace from IntelProvider's IPs. MalwareBazaarProvider
mirrors FeodoProvider's bulk-feed shape: 24h refresh via _ensure_fresh
/ _refresh, in-memory set[str] of hex-lowercased hashes, set-membership
lookup. Auth-keyed via DECNET_MALWAREBAZAAR_AUTH_KEY; absent key
silent-no-ops the lane (single warning, no HTTP traffic).

Per-hash observations persist to a new observed_attachments table.
DECNET is a honeypot platform — every attachment hash an attacker
delivers is intel, regardless of whether anyone classified it. Verdict
is sticky: True never downgrades to False/None on subsequent
observations. Out of scope: API surface, federation export, retention.

Ingester _publish_email_received calls the provider for each attachment
sha256, sets mal_hash_match on the bus payload (omitted entirely when
the message had no attachments — keeps R0046's `is True` predicate
silent on hash-less mail, matching pre-paydown behavior), and upserts
the row regardless of provider availability.
This commit is contained in:
2026-05-03 05:56:46 -04:00
parent 03beff3840
commit 3f080f601d
13 changed files with 1135 additions and 31 deletions

View File

@@ -54,6 +54,9 @@ from .attackers import (
from .attacker_intel import (
AttackerIntel,
)
from .attachments import (
ObservedAttachment,
)
from .campaigns import (
Campaign,
CampaignsResponse,
@@ -247,6 +250,7 @@ __all__ = [
"AttackerIdentity",
"AttackerIntel",
"AttackersResponse",
"ObservedAttachment",
"SessionProfile",
"SmtpTarget",
# campaigns

View File

@@ -0,0 +1,76 @@
"""Observed-attachment intel — purpose-built table for the per-hash
keyspace of attachments delivered by attackers.
DECNET is a honeypot **platform**, not a one-off appliance. Every
attachment SHA-256 that crosses a decky is itself an artifact: it
seeds future cross-attacker correlation ("same hash, multiple
unrelated attackers? cross-decky propagation?"), feeds the EmailLifter
R0046 ``mal_hash_match`` lane with provider-attributed verdicts at
observation time, and underwrites future federation work without
locking us into a particular outbound shape today.
Per the standing rule "new use cases get their own table with UUID
PK," this is its own table — NOT a column-bag on ``attacker_intel``
(which is IP-keyed; one hash can ride many IPs) or on the email rows
(one hash can ride many emails; the cross-correlation question is
per-hash).
"""
from datetime import datetime, timezone
from typing import List, Optional
from uuid import uuid4
from sqlalchemy import JSON, Column, Index
from sqlmodel import Field, SQLModel
class ObservedAttachment(SQLModel, table=True):
"""One distinct file-attachment hash observed across the fleet.
The natural key is ``sha256``; the row is upserted per observation
via :meth:`BaseRepository.upsert_observed_attachment`. ``uuid`` is
the surrogate PK — the ingester never refers to it directly, but
future API surfaces benefit from the indirection (and from a
UUID-shaped foreign-key column once federation work lands).
"""
__tablename__ = "observed_attachments"
__table_args__ = (
Index("ix_observed_attachments_first_seen", "first_seen"),
Index("ix_observed_attachments_last_seen", "last_seen"),
Index("ix_observed_attachments_mal_hash_match", "mal_hash_match"),
)
uuid: str = Field(default_factory=lambda: str(uuid4()), primary_key=True)
sha256: str = Field(unique=True, index=True, max_length=64)
first_seen: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
)
last_seen: datetime = Field(
default_factory=lambda: datetime.now(timezone.utc),
)
observation_count: int = Field(default=1)
first_seen_decky_uuid: Optional[str] = Field(default=None, index=True)
first_seen_attacker_uuid: Optional[str] = Field(default=None, index=True)
last_seen_attacker_uuid: Optional[str] = Field(default=None, index=True)
# Native JSON list[str] — every distinct file extension this hash has
# been delivered as. One hash, multiple extensions = obfuscation
# signal worth keeping. Per the standing typed-evidence rule:
# default_factory, not default=[].
extensions: List[str] = Field(
default_factory=list,
sa_column=Column(JSON, nullable=False, default=list),
)
first_subject: Optional[str] = Field(default=None)
# Verdict captured at observation time. ``None`` = no provider has
# classified yet. ``True`` is sticky — once any provider says
# "known bad," subsequent ``None``/``False`` observations don't
# downgrade the verdict (a hash a feed later forgets is still a
# hash that feed once flagged).
mal_hash_match: Optional[bool] = Field(default=None)
mal_hash_match_provider: Optional[str] = Field(
default=None, max_length=64,
)
mal_hash_match_at: Optional[datetime] = Field(default=None)

View File

@@ -313,6 +313,27 @@ class BaseRepository(ABC):
"""Retrieve the keystroke-dynamics profile row for a session."""
pass
async def upsert_observed_attachment(
self,
*,
sha256: str,
decky_uuid: Optional[str],
attacker_uuid: Optional[str],
extension: Optional[str],
subject: Optional[str],
mal_hash_match: Optional[bool],
mal_hash_match_provider: Optional[str],
) -> str:
"""Record one observation of *sha256* against ``observed_attachments``.
Returns the row UUID. Verdict semantics: ``True`` is sticky;
once set, subsequent ``False`` / ``None`` observations don't
downgrade. See :class:`ObservedAttachment` for the full column
list and the rationale (DECNET as a honeypot platform — every
delivered hash is intel, even before any provider classifies).
"""
raise NotImplementedError
@abstractmethod
async def upsert_attacker_intel(self, data: dict[str, Any]) -> str:
"""Insert or update the threat-intel row for an attacker UUID.

View File

@@ -44,6 +44,7 @@ from decnet.web.db.sqlmodel_repo.deckies import DeckiesMixin
from decnet.web.db.sqlmodel_repo.fleet import FleetMixin
from decnet.web.db.sqlmodel_repo.identities import IdentitiesMixin
from decnet.web.db.sqlmodel_repo.logs import LogsMixin
from decnet.web.db.sqlmodel_repo.observed_attachments import ObservedAttachmentsMixin
from decnet.web.db.sqlmodel_repo.orchestrator import OrchestratorMixin
from decnet.web.db.sqlmodel_repo.realism import RealismMixin
from decnet.web.db.sqlmodel_repo.swarm import SwarmMixin
@@ -65,6 +66,7 @@ class SQLModelRepository(
FleetMixin,
IdentitiesMixin,
LogsMixin,
ObservedAttachmentsMixin,
OrchestratorMixin,
RealismMixin,
SwarmMixin,

View File

@@ -0,0 +1,108 @@
"""Repo mixin for the ``observed_attachments`` table.
Composed onto :class:`SQLModelRepository` alongside the existing
per-domain mixins. The single public method is an upsert: if the
sha256 isn't there, insert with ``observation_count=1`` and the
caller's anchor metadata; otherwise increment ``observation_count``,
roll forward ``last_seen`` and ``last_seen_attacker_uuid``, dedupe a
new ``extension`` into ``extensions``, and stick the
``mal_hash_match`` verdict if either the row had no verdict or the
caller is upgrading ``False/None`` to ``True``.
"""
from __future__ import annotations
from datetime import datetime, timezone
from typing import Optional
from sqlalchemy import select
from decnet.web.db.models import ObservedAttachment
from decnet.web.db.sqlmodel_repo._helpers import _MixinBase
class ObservedAttachmentsMixin(_MixinBase):
"""Mixin: composed onto ``SQLModelRepository``."""
async def upsert_observed_attachment(
self,
*,
sha256: str,
decky_uuid: Optional[str],
attacker_uuid: Optional[str],
extension: Optional[str],
subject: Optional[str],
mal_hash_match: Optional[bool],
mal_hash_match_provider: Optional[str],
) -> str:
"""Record one observation of *sha256*. Returns the row ``uuid``.
Verdict semantics:
* Row has no verdict (``None``) → write whatever the caller has,
including ``None`` (no-op) or ``False`` (provider checked and
said clean).
* Row already has ``False`` → upgrade to ``True`` if the caller
says so; otherwise leave alone.
* Row already has ``True`` → never downgrade. A hash a feed
later forgets is still a hash that feed once flagged.
"""
sha = sha256.lower()
ext = extension.lower() if extension else None
now = datetime.now(timezone.utc)
async with self._session() as session:
stmt = select(ObservedAttachment).where(
ObservedAttachment.sha256 == sha,
)
row = (await session.execute(stmt)).scalar_one_or_none()
if row is None:
row = ObservedAttachment(
sha256=sha,
first_seen=now,
last_seen=now,
observation_count=1,
first_seen_decky_uuid=decky_uuid,
first_seen_attacker_uuid=attacker_uuid,
last_seen_attacker_uuid=attacker_uuid,
extensions=[ext] if ext else [],
first_subject=subject,
mal_hash_match=mal_hash_match,
mal_hash_match_provider=(
mal_hash_match_provider
if mal_hash_match is not None
else None
),
mal_hash_match_at=(
now if mal_hash_match is not None else None
),
)
session.add(row)
await session.commit()
await session.refresh(row)
return row.uuid
row.observation_count = (row.observation_count or 0) + 1
row.last_seen = now
if attacker_uuid:
row.last_seen_attacker_uuid = attacker_uuid
if ext:
exts = list(row.extensions or [])
if ext not in exts:
exts.append(ext)
row.extensions = exts
# Verdict: only write if the row had no opinion, or the
# caller is upgrading to True. Never downgrade True.
if mal_hash_match is True and row.mal_hash_match is not True:
row.mal_hash_match = True
row.mal_hash_match_provider = mal_hash_match_provider
row.mal_hash_match_at = now
elif (
mal_hash_match is not None
and row.mal_hash_match is None
):
row.mal_hash_match = mal_hash_match
row.mal_hash_match_provider = mal_hash_match_provider
row.mal_hash_match_at = now
session.add(row)
await session.commit()
return row.uuid

View File

@@ -714,10 +714,12 @@ async def _publish_email_received(
attachment_manifest = []
if not isinstance(attachment_manifest, list):
attachment_manifest = []
attachment_sha256s = [
entry.get("sha256") for entry in attachment_manifest
if isinstance(entry, dict) and isinstance(entry.get("sha256"), str)
and entry.get("sha256")
attachment_sha256s: list[str] = [
sha for sha in (
entry.get("sha256") for entry in attachment_manifest
if isinstance(entry, dict)
)
if isinstance(sha, str) and sha
]
try:
urls = json.loads(fields.get("urls_json") or "[]")
@@ -761,6 +763,60 @@ async def _publish_email_received(
except (TypeError, ValueError):
body_base64_bytes = 0
# Per-hash mal-hash lookup + ObservedAttachment persistence. The
# boolean drops onto the bus payload as ``mal_hash_match`` so
# EmailLifter R0046's ``mal_hash_match`` lane fires; the per-hash
# observations land in ``observed_attachments`` for cross-attacker
# correlation independent of the rule's view. Field is omitted from
# the payload entirely on hash-less mail so the predicate stays
# silent (matches today's behavior).
mal_hash_match: Optional[bool] = None
if attachment_sha256s:
mal_hash_match = False
try:
from decnet.intel.factory import get_mal_hash_provider
provider = get_mal_hash_provider()
except Exception as exc: # noqa: BLE001
logger.debug("mal_hash provider unavailable: %s", exc)
provider = None
provider_name = provider.name if provider is not None else None
for sha in attachment_sha256s:
verdict: Optional[bool] = None
if provider is not None:
try:
verdict = await provider.is_known_bad(sha)
except Exception as exc: # noqa: BLE001
logger.debug("mal_hash lookup failed for %s: %s", sha, exc)
verdict = None
if verdict is True:
mal_hash_match = True
ext = next(
(
str(entry.get("extension") or "").lower()
for entry in attachment_manifest
if isinstance(entry, dict)
and entry.get("sha256") == sha
and entry.get("extension")
),
None,
)
try:
await repo.upsert_observed_attachment(
sha256=sha,
decky_uuid=log_data.get("decky"),
attacker_uuid=attacker_uuid,
extension=ext or None,
subject=fields.get("subject"),
mal_hash_match=verdict,
mal_hash_match_provider=(
provider_name if verdict is not None else None
),
)
except Exception as exc: # noqa: BLE001
logger.debug(
"observed_attachments upsert failed for %s: %s", sha, exc,
)
payload: dict[str, Any] = {
"source_id": fields.get("msg_id") or fields.get("stored_as"),
"attacker_uuid": attacker_uuid,
@@ -795,6 +851,8 @@ async def _publish_email_received(
"stored_as": fields.get("stored_as"),
"body_sha256": fields.get("sha256"),
}
if mal_hash_match is not None:
payload["mal_hash_match"] = mal_hash_match
try:
bus = get_bus(client_name="ingester-email")
await bus.connect()