feat(intel,ingester): mal_hash feed + observed_attachments table (DEBT-046)

New MalHashProvider sibling ABC (decnet/intel/base.py) since SHA-256
is a different keyspace from IntelProvider's IPs. MalwareBazaarProvider
mirrors FeodoProvider's bulk-feed shape: 24h refresh via _ensure_fresh
/ _refresh, in-memory set[str] of hex-lowercased hashes, set-membership
lookup. Auth-keyed via DECNET_MALWAREBAZAAR_AUTH_KEY; absent key
silent-no-ops the lane (single warning, no HTTP traffic).

Per-hash observations persist to a new observed_attachments table.
DECNET is a honeypot platform — every attachment hash an attacker
delivers is intel, regardless of whether anyone classified it. Verdict
is sticky: True never downgrades to False/None on subsequent
observations. Out of scope: API surface, federation export, retention.

Ingester _publish_email_received calls the provider for each attachment
sha256, sets mal_hash_match on the bus payload (omitted entirely when
the message had no attachments — keeps R0046's `is True` predicate
silent on hash-less mail, matching pre-paydown behavior), and upserts
the row regardless of provider availability.
This commit is contained in:
2026-05-03 05:56:46 -04:00
parent 03beff3840
commit 3f080f601d
13 changed files with 1135 additions and 31 deletions

View File

@@ -0,0 +1,172 @@
"""Unit tests for MalwareBazaarProvider (DEBT-046).
Bulk-feed shape: one HTTP fetch loads ``_known``, subsequent
``is_known_bad`` calls hit memory. We assert:
* no auth key → silent no-op (False, no HTTP traffic)
* fresh provider triggers exactly one refresh, then answers from cache
* hits / misses by exact 64-char hex match (case-insensitive)
* refresh failure keeps last-known-good data + does not raise
* CSV header detection survives column reordering
* ZIP'd dump is unwrapped before parsing
"""
from __future__ import annotations
import csv
import io
import zipfile
import httpx
import pytest
from decnet.intel.mal_hash import MalwareBazaarProvider, _extract_hashes
def _install_transport(handler) -> list[httpx.Request]:
    """Route the provider module's HTTP client through a mock transport.

    Replaces ``decnet.intel.mal_hash.stealth_client`` with a factory that
    builds an ``httpx.AsyncClient`` backed by ``httpx.MockTransport``.
    Each outgoing request is appended to the returned list and then handed
    to the async ``handler``, whose response is passed back unchanged.

    Returns:
        The (initially empty) list that accumulates every request seen.
    """
    from decnet.intel import mal_hash as mal_hash_module

    seen_requests: list[httpx.Request] = []

    async def _record_then_delegate(request: httpx.Request) -> httpx.Response:
        seen_requests.append(request)
        response = await handler(request)
        return response

    mock_transport = httpx.MockTransport(_record_then_delegate)

    def _client_factory(*, timeout: float = 60.0):
        return httpx.AsyncClient(transport=mock_transport, timeout=timeout)

    # NOTE(review): plain attribute assignment — nothing in this helper
    # restores the previous ``stealth_client`` once the test finishes.
    mal_hash_module.stealth_client = _client_factory  # type: ignore[assignment]
    return seen_requests
def _zip_csv(rows: list[dict[str, str]]) -> bytes:
buf = io.StringIO()
if not rows:
return b""
writer = csv.DictWriter(buf, fieldnames=list(rows[0].keys()))
writer.writeheader()
writer.writerows(rows)
raw_csv = buf.getvalue().encode()
zip_buf = io.BytesIO()
with zipfile.ZipFile(zip_buf, "w") as zf:
zf.writestr("full.csv", raw_csv)
return zip_buf.getvalue()
# Synthetic 64-character lowercase hex strings used as stand-in SHA-256
# values by the tests below.
_HASH_A = "a" * 64
_HASH_B = "b" * 64
_HASH_C = "c" * 64
@pytest.mark.asyncio
async def test_disabled_when_auth_key_unset(monkeypatch):
    """With no auth key the provider is disabled and makes no HTTP call."""
    monkeypatch.delenv("DECNET_MALWAREBAZAAR_AUTH_KEY", raising=False)

    async def _empty_feed(_request):
        return httpx.Response(200, content=_zip_csv([]))

    requests_seen = _install_transport(_empty_feed)
    provider = MalwareBazaarProvider()

    assert provider.disabled is True
    verdict = await provider.is_known_bad(_HASH_A)
    assert verdict is False
    # The transport must not have seen a single request.
    assert requests_seen == []
@pytest.mark.asyncio
async def test_refresh_populates_known_set():
    """A single feed fetch answers hits and misses for later lookups."""
    feed_rows = [
        {"sha256_hash": _HASH_A, "signature": "Emotet"},
        {"sha256_hash": _HASH_B, "signature": "TrickBot"},
    ]
    archive = _zip_csv(feed_rows)

    async def _serve_feed(_request):
        return httpx.Response(200, content=archive)

    requests_seen = _install_transport(_serve_feed)
    provider = MalwareBazaarProvider(auth_key="test-key")

    expectations = ((_HASH_A, True), (_HASH_B, True), (_HASH_C, False))
    for sha256, expected in expectations:
        assert await provider.is_known_bad(sha256) is expected

    # All three lookups were answered from one refresh.
    assert len(requests_seen) == 1
    # The auth key is sent as the Auth-Key header.
    only_request = requests_seen[0]
    assert only_request.headers.get("Auth-Key") == "test-key"
@pytest.mark.asyncio
async def test_lookup_is_case_insensitive():
    """An upper-cased feed entry is matched by an upper-cased query."""
    upper_hash = _HASH_A.upper()
    archive = _zip_csv([{"sha256_hash": upper_hash, "signature": "x"}])

    async def _serve_feed(_request):
        return httpx.Response(200, content=archive)

    _install_transport(_serve_feed)
    provider = MalwareBazaarProvider(auth_key="k")

    # Expected to hold because the provider lowercases hashes both when
    # parsing the feed and when answering the query.
    assert await provider.is_known_bad(upper_hash) is True
@pytest.mark.asyncio
async def test_refresh_failure_keeps_last_known_good():
    """A failed refresh must not discard the previously loaded hashes.

    The first request returns a feed containing one hash; every later
    request returns HTTP 500. With ``refresh_interval_s=0.0`` the second
    lookup triggers a new refresh, which fails, and the lookup must still
    succeed from the earlier data.
    """
    calls = 0

    async def _ok_then_500(_request):
        nonlocal calls
        calls += 1
        if calls > 1:
            return httpx.Response(500, content=b"")
        return httpx.Response(
            200, content=_zip_csv([{"sha256_hash": _HASH_A, "signature": "x"}]),
        )

    _install_transport(_ok_then_500)
    provider = MalwareBazaarProvider(auth_key="k", refresh_interval_s=0.0)

    first_verdict = await provider.is_known_bad(_HASH_A)
    assert first_verdict is True
    # Second lookup: the refresh fails, yet the cached hash still answers.
    second_verdict = await provider.is_known_bad(_HASH_A)
    assert second_verdict is True
    assert provider._last_error is not None
@pytest.mark.asyncio
async def test_refresh_network_error_does_not_raise():
    """A connection error during refresh yields False and records an error."""

    async def _always_unreachable(_request):
        raise httpx.ConnectError("boom")

    _install_transport(_always_unreachable)
    provider = MalwareBazaarProvider(auth_key="k")

    verdict = await provider.is_known_bad(_HASH_A)
    assert verdict is False
    assert provider._last_error is not None
def test_extract_hashes_skips_comment_lines():
    """Leading ``#`` lines before the CSV header do not affect the result."""
    feed_lines = [
        "# Generated 2026-05-03\n",
        "# Header: comment\n",
        "sha256_hash,signature\n",
        f"{_HASH_A},Emotet\n",
        f"{_HASH_B},Cobalt Strike\n",
    ]
    extracted = _extract_hashes("".join(feed_lines))
    assert extracted == {_HASH_A, _HASH_B}
def test_extract_hashes_drops_invalid_rows():
    """Rows whose hash column is not 64 hex characters are discarded."""
    feed_lines = [
        "sha256_hash,signature\n",
        f"{_HASH_A},Emotet\n",
        "not-a-hash,foo\n",
        "shorthex,bar\n",
        f"{'g' * 64},badchars\n",  # right length, wrong charset
    ]
    extracted = _extract_hashes("".join(feed_lines))
    assert extracted == {_HASH_A}
def test_extract_hashes_finds_column_after_reorder():
    """The hash column is located by header name, not by position."""
    feed_lines = [
        "first_seen,sha256_hash,signature\n",
        f"2026-05-03,{_HASH_A},Emotet\n",
    ]
    extracted = _extract_hashes("".join(feed_lines))
    assert extracted == {_HASH_A}

View File

@@ -0,0 +1,216 @@
"""Ingester wiring for mal_hash + observed_attachments (DEBT-046).
Validates `_publish_email_received` against a stub repo + stub provider:
* Provider hit on any attachment hash → ``mal_hash_match=True`` on the bus payload
* Provider clean on every hash → ``mal_hash_match=False`` on the bus payload
* No attachments → field omitted from the payload entirely
* Every observed hash lands in ``observed_attachments`` with the verdict baked in
"""
from __future__ import annotations
import json
from unittest.mock import AsyncMock
import pytest
from decnet.intel import factory as intel_factory
class _StubRepo:
def __init__(self) -> None:
self.observed: list[dict] = []
self.get_attacker_uuid_by_ip = AsyncMock(return_value="atk-1")
async def upsert_observed_attachment(self, **kwargs):
self.observed.append(kwargs)
return "obs-uuid"
class _StubBus:
def __init__(self) -> None:
self.published: list[dict] = []
async def connect(self):
return None
async def close(self):
return None
class _StubProvider:
name = "malwarebazaar"
def __init__(self, hits: set[str]):
self._hits = hits
async def is_known_bad(self, sha256: str) -> bool:
return sha256 in self._hits
@pytest.fixture(autouse=True)
def _reset_factory():
    """Call the intel factory's mal-hash reset hook before and after each test."""
    reset_provider = intel_factory._reset_mal_hash_provider_for_testing
    reset_provider()
    yield
    reset_provider()
@pytest.fixture
def patched_bus(monkeypatch):
    """Capture ingester publishes instead of sending them to a real bus.

    Replaces ``publish_safely`` and ``get_bus`` on ``decnet.web.ingester``.
    Returns the list that receives one ``{"topic", "payload",
    "event_type"}`` dict per publish call.
    """
    from decnet.web import ingester as ingester_module

    publishes: list[dict] = []

    async def _record_publish(bus, topic, payload, *, event_type=None):
        entry = {"topic": topic, "payload": payload, "event_type": event_type}
        publishes.append(entry)

    def _stub_bus_factory(client_name=""):
        return _StubBus()

    replacements = (
        ("publish_safely", _record_publish),
        ("get_bus", _stub_bus_factory),
    )
    for attr_name, replacement in replacements:
        monkeypatch.setattr(ingester_module, attr_name, replacement)
    return publishes
def _log_data() -> dict:
return {
"attacker_ip": "203.0.113.5",
"decky": "decky-uuid",
"service": "smtp",
}
def _fields(*, attachments: list[dict] | None) -> dict:
return {
"msg_id": "<m1@x>",
"subject": "Test",
"from_hdr": "atk@evil.example",
"mail_from": "atk@evil.example",
"return_path": "atk@evil.example",
"rcpt_to": "victim@corp.example",
"x_mailer": "Outlook",
"dkim_signed": 0,
"spf_pass": 0,
"urls_json": "[]",
"attachments_json": json.dumps(attachments) if attachments is not None else "[]",
"attachment_count": len(attachments) if attachments else 0,
"body_simhash": "0123456789abcdef",
"body_base64_bytes": 0,
"html_smuggling": 0,
"stored_as": "/spool/m1.eml",
"sha256": "f" * 64,
}
@pytest.mark.asyncio
async def test_known_bad_attachment_sets_mal_hash_match_true(patched_bus, monkeypatch):
    """One known-bad attachment flags the message and is recorded per hash.

    Two attachments are delivered and the stub provider flags only the
    first. The bus payload must carry ``mal_hash_match=True`` and both
    hashes in order, and each hash must be upserted with its own verdict.
    """
    from decnet.web import ingester as mod

    bad = "a" * 64
    clean = "b" * 64

    def _factory():
        return _StubProvider(hits={bad})

    # ``intel_factory`` is the ``decnet.intel.factory`` module object, so a
    # single setattr covers it; a second string-path patch of the same
    # attribute would be redundant.
    monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)

    repo = _StubRepo()
    await mod._publish_email_received(
        repo, _log_data(),
        _fields(attachments=[
            {"sha256": bad, "extension": "docx"},
            {"sha256": clean, "extension": "pdf"},
        ]),
    )

    assert len(patched_bus) == 1
    payload = patched_bus[0]["payload"]
    assert payload["mal_hash_match"] is True
    assert payload["attachment_sha256s"] == [bad, clean]

    # Both hashes recorded with their verdicts.
    by_hash = {o["sha256"]: o for o in repo.observed}
    assert by_hash[bad]["mal_hash_match"] is True
    assert by_hash[bad]["mal_hash_match_provider"] == "malwarebazaar"
    assert by_hash[clean]["mal_hash_match"] is False
@pytest.mark.asyncio
async def test_clean_attachments_sets_mal_hash_match_false(patched_bus, monkeypatch):
    """A message whose only attachment is unknown publishes a False verdict.

    The stub provider has no hits, so the bus payload must carry
    ``mal_hash_match=False`` and the single hash must be upserted with a
    False verdict.
    """
    from decnet.web import ingester as mod

    clean = "c" * 64

    def _factory():
        return _StubProvider(hits=set())

    # ``intel_factory`` is the ``decnet.intel.factory`` module object, so a
    # single setattr covers it; a second string-path patch of the same
    # attribute would be redundant.
    monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)

    repo = _StubRepo()
    await mod._publish_email_received(
        repo, _log_data(),
        _fields(attachments=[{"sha256": clean, "extension": "pdf"}]),
    )

    payload = patched_bus[0]["payload"]
    assert payload["mal_hash_match"] is False
    assert len(repo.observed) == 1
    assert repo.observed[0]["mal_hash_match"] is False
@pytest.mark.asyncio
async def test_no_attachments_omits_mal_hash_match(patched_bus, monkeypatch):
    """Mail without attachments publishes no ``mal_hash_match`` key at all.

    The payload must omit the field entirely (not set it to False), and
    nothing may be upserted into the observed-attachments store.
    """
    from decnet.web import ingester as mod

    def _factory():
        return _StubProvider(hits=set())

    # ``intel_factory`` is the ``decnet.intel.factory`` module object, so a
    # single setattr covers it; a second string-path patch of the same
    # attribute would be redundant.
    monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)

    repo = _StubRepo()
    await mod._publish_email_received(
        repo, _log_data(), _fields(attachments=[]),
    )

    payload = patched_bus[0]["payload"]
    assert "mal_hash_match" not in payload
    assert repo.observed == []
@pytest.mark.asyncio
async def test_provider_unavailable_still_persists_hashes_without_verdict(
    patched_bus, monkeypatch,
):
    """If the provider factory returns None (intel disabled), the
    ingester must still write observations — DECNET is a platform; we
    keep the hashes regardless of whether anyone classified them."""
    from decnet.web import ingester as mod

    def _factory():
        return None

    # ``intel_factory`` is the ``decnet.intel.factory`` module object, so a
    # single setattr covers it; a second string-path patch of the same
    # attribute would be redundant.
    monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)

    repo = _StubRepo()
    sha = "d" * 64
    await mod._publish_email_received(
        repo, _log_data(),
        _fields(attachments=[{"sha256": sha, "extension": "exe"}]),
    )

    payload = patched_bus[0]["payload"]
    # No provider → False on the bus (nothing was flagged), while the
    # stored row carries mal_hash_match=None (no verdict was produced).
    assert payload["mal_hash_match"] is False
    assert len(repo.observed) == 1
    assert repo.observed[0]["mal_hash_match"] is None
    assert repo.observed[0]["mal_hash_match_provider"] is None

View File

@@ -0,0 +1,187 @@
"""Repo tests for ``observed_attachments`` upsert (DEBT-046).
The table is the per-hash sibling of ``attacker_intel`` — every
attachment hash crossing a decky lands here, with metadata accumulated
across observations.
"""
from __future__ import annotations
import pytest
from decnet.web.db.sqlite.repository import SQLiteRepository
# Synthetic 64-character lowercase hex strings used as stand-in SHA-256
# keys for the rows these tests upsert and query.
_HASH_A = "a" * 64
_HASH_B = "b" * 64
async def _make_repo(tmp_path) -> SQLiteRepository:
    """Create and initialise a SQLite repository at ``tmp_path / "obs.db"``."""
    db_file = tmp_path / "obs.db"
    repo = SQLiteRepository(db_path=str(db_file))
    await repo.initialize()
    return repo
@pytest.mark.asyncio
async def test_first_observation_creates_row(tmp_path):
    """The first upsert of a hash creates a fully populated row.

    The hash and extension are supplied upper-cased and are expected to be
    stored lowercased; first/last-seen anchors, subject and verdict fields
    must all reflect this single observation.
    """
    repo = await _make_repo(tmp_path)
    observation = {
        "sha256": _HASH_A.upper(),  # provider may pass mixed-case
        "decky_uuid": "d-1",
        "attacker_uuid": "atk-1",
        "extension": "DOCX",
        "subject": "Invoice",
        "mal_hash_match": False,
        "mal_hash_match_provider": "malwarebazaar",
    }
    row_uuid = await repo.upsert_observed_attachment(**observation)
    assert row_uuid

    from decnet.web.db.models import ObservedAttachment
    from sqlalchemy import select

    lookup = select(ObservedAttachment).where(
        ObservedAttachment.sha256 == _HASH_A,
    )
    async with repo._session() as session:
        result = await session.execute(lookup)
        row = result.scalar_one()
        assert row.sha256 == _HASH_A  # lowercased
        assert row.observation_count == 1
        assert row.first_seen_decky_uuid == "d-1"
        assert row.first_seen_attacker_uuid == "atk-1"
        assert row.last_seen_attacker_uuid == "atk-1"
        assert row.extensions == ["docx"]
        assert row.first_subject == "Invoice"
        assert row.mal_hash_match is False
        assert row.mal_hash_match_provider == "malwarebazaar"
        assert row.mal_hash_match_at is not None
@pytest.mark.asyncio
async def test_re_observation_increments_and_updates_last_seen(tmp_path):
    """A second sighting bumps the count and only the last-seen attacker."""
    repo = await _make_repo(tmp_path)
    sightings = (
        ("d-1", "atk-1", "Old subject"),
        ("d-2", "atk-2", "New subject"),
    )
    for decky_uuid, attacker_uuid, subject in sightings:
        await repo.upsert_observed_attachment(
            sha256=_HASH_A, decky_uuid=decky_uuid, attacker_uuid=attacker_uuid,
            extension="docx", subject=subject,
            mal_hash_match=None, mal_hash_match_provider=None,
        )

    from decnet.web.db.models import ObservedAttachment
    from sqlalchemy import select

    lookup = select(ObservedAttachment).where(
        ObservedAttachment.sha256 == _HASH_A,
    )
    async with repo._session() as session:
        result = await session.execute(lookup)
        row = result.scalar_one()
        assert row.observation_count == 2
        # First-seen anchors stay pinned; the last-seen attacker advances.
        assert row.first_seen_decky_uuid == "d-1"
        assert row.first_seen_attacker_uuid == "atk-1"
        assert row.last_seen_attacker_uuid == "atk-2"
        # The stored subject is the first one and is not overwritten.
        assert row.first_subject == "Old subject"
        # The extension was already recorded, so no duplicate is added.
        assert row.extensions == ["docx"]
@pytest.mark.asyncio
async def test_distinct_extension_appends_deduped(tmp_path):
    """New extensions accumulate case-insensitively without duplicates."""
    repo = await _make_repo(tmp_path)
    # "DOC" is a new extension in mixed case; the final "doc" repeats it.
    for extension in ("docx", "DOC", "doc"):
        await repo.upsert_observed_attachment(
            sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
            extension=extension, subject=None,
            mal_hash_match=None, mal_hash_match_provider=None,
        )

    from decnet.web.db.models import ObservedAttachment
    from sqlalchemy import select

    lookup = select(ObservedAttachment).where(
        ObservedAttachment.sha256 == _HASH_A,
    )
    async with repo._session() as session:
        result = await session.execute(lookup)
        row = result.scalar_one()
        assert sorted(row.extensions) == ["doc", "docx"]
@pytest.mark.asyncio
async def test_verdict_true_is_sticky(tmp_path):
    """Once any provider says True, subsequent None/False observations
    don't downgrade. A hash a feed later forgets is still a hash that
    feed once flagged."""
    repo = await _make_repo(tmp_path)
    # Verdicts arrive in this order: True, then False, then no verdict.
    verdict_sequence = (
        (True, "malwarebazaar"),
        (False, "malwarebazaar"),
        (None, None),
    )
    for verdict, provider_name in verdict_sequence:
        await repo.upsert_observed_attachment(
            sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
            extension=None, subject=None,
            mal_hash_match=verdict, mal_hash_match_provider=provider_name,
        )

    from decnet.web.db.models import ObservedAttachment
    from sqlalchemy import select

    lookup = select(ObservedAttachment).where(
        ObservedAttachment.sha256 == _HASH_A,
    )
    async with repo._session() as session:
        result = await session.execute(lookup)
        row = result.scalar_one()
        assert row.mal_hash_match is True
        assert row.mal_hash_match_provider == "malwarebazaar"
@pytest.mark.asyncio
async def test_verdict_none_then_true_writes_through(tmp_path):
    """A later True verdict replaces an earlier missing (None) verdict."""
    repo = await _make_repo(tmp_path)
    verdict_sequence = (
        (None, None),
        (True, "malwarebazaar"),
    )
    for verdict, provider_name in verdict_sequence:
        await repo.upsert_observed_attachment(
            sha256=_HASH_B, decky_uuid="d", attacker_uuid="a",
            extension=None, subject=None,
            mal_hash_match=verdict, mal_hash_match_provider=provider_name,
        )

    from decnet.web.db.models import ObservedAttachment
    from sqlalchemy import select

    lookup = select(ObservedAttachment).where(
        ObservedAttachment.sha256 == _HASH_B,
    )
    async with repo._session() as session:
        result = await session.execute(lookup)
        row = result.scalar_one()
        assert row.mal_hash_match is True
        assert row.mal_hash_match_provider == "malwarebazaar"