feat(intel,ingester): mal_hash feed + observed_attachments table (DEBT-046)
New MalHashProvider sibling ABC (decnet/intel/base.py) since SHA-256 is a different keyspace from IntelProvider's IPs. MalwareBazaarProvider mirrors FeodoProvider's bulk-feed shape: 24h refresh via _ensure_fresh / _refresh, in-memory set[str] of hex-lowercased hashes, set-membership lookup. Auth-keyed via DECNET_MALWAREBAZAAR_AUTH_KEY; absent key silent-no-ops the lane (single warning, no HTTP traffic). Per-hash observations persist to a new observed_attachments table. DECNET is a honeypot platform — every attachment hash an attacker delivers is intel, regardless of whether anyone classified it. Verdict is sticky: True never downgrades to False/None on subsequent observations. Out of scope: API surface, federation export, retention. Ingester _publish_email_received calls the provider for each attachment sha256, sets mal_hash_match on the bus payload (omitted entirely when the message had no attachments — keeps R0046's `is True` predicate silent on hash-less mail, matching pre-paydown behavior), and upserts the row regardless of provider availability.
This commit is contained in:
@@ -54,6 +54,9 @@ from .attackers import (
|
||||
from .attacker_intel import (
|
||||
AttackerIntel,
|
||||
)
|
||||
from .attachments import (
|
||||
ObservedAttachment,
|
||||
)
|
||||
from .campaigns import (
|
||||
Campaign,
|
||||
CampaignsResponse,
|
||||
@@ -247,6 +250,7 @@ __all__ = [
|
||||
"AttackerIdentity",
|
||||
"AttackerIntel",
|
||||
"AttackersResponse",
|
||||
"ObservedAttachment",
|
||||
"SessionProfile",
|
||||
"SmtpTarget",
|
||||
# campaigns
|
||||
|
||||
76
decnet/web/db/models/attachments.py
Normal file
76
decnet/web/db/models/attachments.py
Normal file
@@ -0,0 +1,76 @@
|
||||
"""Observed-attachment intel — purpose-built table for the per-hash
|
||||
keyspace of attachments delivered by attackers.
|
||||
|
||||
DECNET is a honeypot **platform**, not a one-off appliance. Every
|
||||
attachment SHA-256 that crosses a decky is itself an artifact: it
|
||||
seeds future cross-attacker correlation ("same hash, multiple
|
||||
unrelated attackers? cross-decky propagation?"), feeds the EmailLifter
|
||||
R0046 ``mal_hash_match`` lane with provider-attributed verdicts at
|
||||
observation time, and underwrites future federation work without
|
||||
locking us into a particular outbound shape today.
|
||||
|
||||
Per the standing rule "new use cases get their own table with UUID
|
||||
PK," this is its own table — NOT a column-bag on ``attacker_intel``
|
||||
(which is IP-keyed; one hash can ride many IPs) or on the email rows
|
||||
(one hash can ride many emails; the cross-correlation question is
|
||||
per-hash).
|
||||
"""
|
||||
from datetime import datetime, timezone
|
||||
from typing import List, Optional
|
||||
from uuid import uuid4
|
||||
|
||||
from sqlalchemy import JSON, Column, Index
|
||||
from sqlmodel import Field, SQLModel
|
||||
|
||||
|
||||
def _utc_now() -> datetime:
    """Return the current instant as a timezone-aware UTC ``datetime``."""
    return datetime.now(timezone.utc)


def _new_uuid() -> str:
    """Return a freshly generated UUID4 rendered as a string."""
    return str(uuid4())


class ObservedAttachment(SQLModel, table=True):
    """A single distinct attachment hash seen anywhere in the fleet.

    Rows are keyed naturally by ``sha256`` (unique) and are written
    through :meth:`BaseRepository.upsert_observed_attachment`, one call
    per observation. ``uuid`` is a surrogate primary key generated on
    the Python side when the row object is constructed.
    """

    __tablename__ = "observed_attachments"
    # Secondary indexes on the two timestamps and on the verdict column.
    __table_args__ = (
        Index("ix_observed_attachments_first_seen", "first_seen"),
        Index("ix_observed_attachments_last_seen", "last_seen"),
        Index("ix_observed_attachments_mal_hash_match", "mal_hash_match"),
    )

    # Identity: surrogate key plus the unique natural key.
    uuid: str = Field(default_factory=_new_uuid, primary_key=True)
    sha256: str = Field(unique=True, index=True, max_length=64)

    # Observation window and running count; both timestamps default to
    # "now" (UTC) when the row object is created.
    first_seen: datetime = Field(default_factory=_utc_now)
    last_seen: datetime = Field(default_factory=_utc_now)
    observation_count: int = Field(default=1)

    # Anchors for who/where delivered the hash first and most recently.
    first_seen_decky_uuid: Optional[str] = Field(default=None, index=True)
    first_seen_attacker_uuid: Optional[str] = Field(default=None, index=True)
    last_seen_attacker_uuid: Optional[str] = Field(default=None, index=True)

    # JSON-backed list[str] holding each distinct file extension this
    # hash was delivered under. The Python-side default is built with
    # default_factory so instances never share one list object.
    extensions: List[str] = Field(
        default_factory=list,
        sa_column=Column(JSON, nullable=False, default=list),
    )
    first_subject: Optional[str] = Field(default=None)

    # Verdict recorded at observation time. ``None`` means no provider
    # has classified the hash yet. ``True`` is sticky: later ``None`` /
    # ``False`` observations must not downgrade it.
    mal_hash_match: Optional[bool] = Field(default=None)
    mal_hash_match_provider: Optional[str] = Field(
        default=None,
        max_length=64,
    )
    mal_hash_match_at: Optional[datetime] = Field(default=None)
|
||||
@@ -313,6 +313,27 @@ class BaseRepository(ABC):
|
||||
"""Retrieve the keystroke-dynamics profile row for a session."""
|
||||
pass
|
||||
|
||||
async def upsert_observed_attachment(
    self,
    *,
    sha256: str,
    decky_uuid: Optional[str],
    attacker_uuid: Optional[str],
    extension: Optional[str],
    subject: Optional[str],
    mal_hash_match: Optional[bool],
    mal_hash_match_provider: Optional[str],
) -> str:
    """Record one observation of *sha256* against ``observed_attachments``.

    All arguments are keyword-only.

    Args:
        sha256: Hash of the observed attachment (the table's natural key).
        decky_uuid: Decky on which the attachment was observed, if known.
        attacker_uuid: Attacker that delivered the attachment, if known.
        extension: File extension the hash was delivered under, if any.
        subject: Subject of the carrying message, if any.
        mal_hash_match: Provider verdict for this observation; ``None``
            when no provider classified the hash.
        mal_hash_match_provider: Name of the provider that produced the
            verdict, if any.

    Returns:
        The UUID of the ``observed_attachments`` row.

    Raises:
        NotImplementedError: Always, in this default implementation.
            The method is not decorated with ``@abstractmethod``;
            concrete repositories are expected to override it.

    Verdict semantics: ``True`` is sticky; once set, subsequent
    ``False`` / ``None`` observations don't downgrade. See
    :class:`ObservedAttachment` for the full column list and the
    rationale (DECNET as a honeypot platform — every delivered hash
    is intel, even before any provider classifies).
    """
    # Carry a message: callers that log only ``str(exc)`` would
    # otherwise record an empty reason for this failure.
    raise NotImplementedError(
        f"{type(self).__name__} does not implement "
        "upsert_observed_attachment"
    )
|
||||
|
||||
@abstractmethod
|
||||
async def upsert_attacker_intel(self, data: dict[str, Any]) -> str:
|
||||
"""Insert or update the threat-intel row for an attacker UUID.
|
||||
|
||||
@@ -44,6 +44,7 @@ from decnet.web.db.sqlmodel_repo.deckies import DeckiesMixin
|
||||
from decnet.web.db.sqlmodel_repo.fleet import FleetMixin
|
||||
from decnet.web.db.sqlmodel_repo.identities import IdentitiesMixin
|
||||
from decnet.web.db.sqlmodel_repo.logs import LogsMixin
|
||||
from decnet.web.db.sqlmodel_repo.observed_attachments import ObservedAttachmentsMixin
|
||||
from decnet.web.db.sqlmodel_repo.orchestrator import OrchestratorMixin
|
||||
from decnet.web.db.sqlmodel_repo.realism import RealismMixin
|
||||
from decnet.web.db.sqlmodel_repo.swarm import SwarmMixin
|
||||
@@ -65,6 +66,7 @@ class SQLModelRepository(
|
||||
FleetMixin,
|
||||
IdentitiesMixin,
|
||||
LogsMixin,
|
||||
ObservedAttachmentsMixin,
|
||||
OrchestratorMixin,
|
||||
RealismMixin,
|
||||
SwarmMixin,
|
||||
|
||||
108
decnet/web/db/sqlmodel_repo/observed_attachments.py
Normal file
108
decnet/web/db/sqlmodel_repo/observed_attachments.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""Repo mixin for the ``observed_attachments`` table.
|
||||
|
||||
Composed onto :class:`SQLModelRepository` alongside the existing
|
||||
per-domain mixins. The single public method is an upsert: if the
|
||||
sha256 isn't there, insert with ``observation_count=1`` and the
|
||||
caller's anchor metadata; otherwise increment ``observation_count``,
|
||||
roll forward ``last_seen`` and ``last_seen_attacker_uuid``, dedupe a
|
||||
new ``extension`` into ``extensions``, and stick the
|
||||
``mal_hash_match`` verdict if either the row had no verdict or the
|
||||
caller is upgrading ``False/None`` to ``True``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from sqlalchemy import select
|
||||
|
||||
from decnet.web.db.models import ObservedAttachment
|
||||
from decnet.web.db.sqlmodel_repo._helpers import _MixinBase
|
||||
|
||||
|
||||
def _verdict_supersedes(
    current: Optional[bool], incoming: Optional[bool],
) -> bool:
    """Return whether *incoming* should overwrite the stored *current* verdict.

    ``True`` is sticky and ``None`` carries no opinion, so a write only
    happens when the caller has a verdict and either upgrades the row
    to ``True`` or fills in a row that had no verdict at all.
    """
    if incoming is None or current is True:
        return False
    return incoming is True or current is None


class ObservedAttachmentsMixin(_MixinBase):
    """Mixin: composed onto ``SQLModelRepository``."""

    async def upsert_observed_attachment(
        self,
        *,
        sha256: str,
        decky_uuid: Optional[str],
        attacker_uuid: Optional[str],
        extension: Optional[str],
        subject: Optional[str],
        mal_hash_match: Optional[bool],
        mal_hash_match_provider: Optional[str],
    ) -> str:
        """Record one observation of *sha256*. Returns the row ``uuid``.

        The hash and the extension are lower-cased before use. If no row
        exists for the hash, one is inserted with ``observation_count=1``
        and the caller's anchor metadata. Otherwise the existing row's
        ``observation_count`` is incremented, ``last_seen`` is rolled
        forward, ``last_seen_attacker_uuid`` is replaced when the caller
        supplies one, and a new extension is appended to ``extensions``
        if it is not already present.

        Verdict semantics:

        * Row has no verdict (``None``) → write whatever the caller has,
          including ``None`` (no-op) or ``False`` (provider checked and
          said clean).
        * Row already has ``False`` → upgrade to ``True`` if the caller
          says so; otherwise leave alone.
        * Row already has ``True`` → never downgrade. A hash a feed
          later forgets is still a hash that feed once flagged.

        The provider name and verdict timestamp are only stored
        together with a non-``None`` verdict.
        """
        sha = sha256.lower()
        ext = extension.lower() if extension else None
        now = datetime.now(timezone.utc)

        async with self._session() as session:
            stmt = select(ObservedAttachment).where(
                ObservedAttachment.sha256 == sha,
            )
            row = (await session.execute(stmt)).scalar_one_or_none()

            if row is None:
                # NOTE(review): select-then-insert; two concurrent first
                # observations of the same hash would presumably collide
                # on the unique ``sha256`` constraint — confirm whether
                # more than one writer can run at once.
                has_verdict = mal_hash_match is not None
                row = ObservedAttachment(
                    sha256=sha,
                    first_seen=now,
                    last_seen=now,
                    observation_count=1,
                    first_seen_decky_uuid=decky_uuid,
                    first_seen_attacker_uuid=attacker_uuid,
                    last_seen_attacker_uuid=attacker_uuid,
                    extensions=[ext] if ext else [],
                    first_subject=subject,
                    mal_hash_match=mal_hash_match,
                    mal_hash_match_provider=(
                        mal_hash_match_provider if has_verdict else None
                    ),
                    mal_hash_match_at=now if has_verdict else None,
                )
            else:
                row.observation_count = (row.observation_count or 0) + 1
                row.last_seen = now
                if attacker_uuid:
                    row.last_seen_attacker_uuid = attacker_uuid
                if ext and ext not in (row.extensions or []):
                    # Assign a new list rather than appending in place
                    # so the change to the JSON column is picked up.
                    row.extensions = [*(row.extensions or []), ext]
                if _verdict_supersedes(row.mal_hash_match, mal_hash_match):
                    row.mal_hash_match = mal_hash_match
                    row.mal_hash_match_provider = mal_hash_match_provider
                    row.mal_hash_match_at = now

            # Read the UUID before committing: a commit may expire the
            # instance's attributes, and reading an expired attribute
            # afterwards would need implicit I/O. The value is already
            # populated here (generated at construction for new rows,
            # loaded by the SELECT for existing ones).
            row_uuid = row.uuid
            session.add(row)
            await session.commit()
            return row_uuid
|
||||
@@ -714,10 +714,12 @@ async def _publish_email_received(
|
||||
attachment_manifest = []
|
||||
if not isinstance(attachment_manifest, list):
|
||||
attachment_manifest = []
|
||||
attachment_sha256s = [
|
||||
entry.get("sha256") for entry in attachment_manifest
|
||||
if isinstance(entry, dict) and isinstance(entry.get("sha256"), str)
|
||||
and entry.get("sha256")
|
||||
attachment_sha256s: list[str] = [
|
||||
sha for sha in (
|
||||
entry.get("sha256") for entry in attachment_manifest
|
||||
if isinstance(entry, dict)
|
||||
)
|
||||
if isinstance(sha, str) and sha
|
||||
]
|
||||
try:
|
||||
urls = json.loads(fields.get("urls_json") or "[]")
|
||||
@@ -761,6 +763,60 @@ async def _publish_email_received(
|
||||
except (TypeError, ValueError):
|
||||
body_base64_bytes = 0
|
||||
|
||||
# Per-hash mal-hash lookup + ObservedAttachment persistence. The
|
||||
# boolean drops onto the bus payload as ``mal_hash_match`` so
|
||||
# EmailLifter R0046's ``mal_hash_match`` lane fires; the per-hash
|
||||
# observations land in ``observed_attachments`` for cross-attacker
|
||||
# correlation independent of the rule's view. Field is omitted from
|
||||
# the payload entirely on hash-less mail so the predicate stays
|
||||
# silent (matches today's behavior).
|
||||
mal_hash_match: Optional[bool] = None
|
||||
if attachment_sha256s:
|
||||
mal_hash_match = False
|
||||
try:
|
||||
from decnet.intel.factory import get_mal_hash_provider
|
||||
provider = get_mal_hash_provider()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.debug("mal_hash provider unavailable: %s", exc)
|
||||
provider = None
|
||||
provider_name = provider.name if provider is not None else None
|
||||
for sha in attachment_sha256s:
|
||||
verdict: Optional[bool] = None
|
||||
if provider is not None:
|
||||
try:
|
||||
verdict = await provider.is_known_bad(sha)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.debug("mal_hash lookup failed for %s: %s", sha, exc)
|
||||
verdict = None
|
||||
if verdict is True:
|
||||
mal_hash_match = True
|
||||
ext = next(
|
||||
(
|
||||
str(entry.get("extension") or "").lower()
|
||||
for entry in attachment_manifest
|
||||
if isinstance(entry, dict)
|
||||
and entry.get("sha256") == sha
|
||||
and entry.get("extension")
|
||||
),
|
||||
None,
|
||||
)
|
||||
try:
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=sha,
|
||||
decky_uuid=log_data.get("decky"),
|
||||
attacker_uuid=attacker_uuid,
|
||||
extension=ext or None,
|
||||
subject=fields.get("subject"),
|
||||
mal_hash_match=verdict,
|
||||
mal_hash_match_provider=(
|
||||
provider_name if verdict is not None else None
|
||||
),
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.debug(
|
||||
"observed_attachments upsert failed for %s: %s", sha, exc,
|
||||
)
|
||||
|
||||
payload: dict[str, Any] = {
|
||||
"source_id": fields.get("msg_id") or fields.get("stored_as"),
|
||||
"attacker_uuid": attacker_uuid,
|
||||
@@ -795,6 +851,8 @@ async def _publish_email_received(
|
||||
"stored_as": fields.get("stored_as"),
|
||||
"body_sha256": fields.get("sha256"),
|
||||
}
|
||||
if mal_hash_match is not None:
|
||||
payload["mal_hash_match"] = mal_hash_match
|
||||
try:
|
||||
bus = get_bus(client_name="ingester-email")
|
||||
await bus.connect()
|
||||
|
||||
Reference in New Issue
Block a user