feat(intel,ingester): mal_hash feed + observed_attachments table (DEBT-046)
New MalHashProvider sibling ABC (decnet/intel/base.py) since SHA-256 is a different keyspace from IntelProvider's IPs. MalwareBazaarProvider mirrors FeodoProvider's bulk-feed shape: 24h refresh via _ensure_fresh / _refresh, in-memory set[str] of hex-lowercased hashes, set-membership lookup. Auth-keyed via DECNET_MALWAREBAZAAR_AUTH_KEY; absent key silent-no-ops the lane (single warning, no HTTP traffic). Per-hash observations persist to a new observed_attachments table. DECNET is a honeypot platform — every attachment hash an attacker delivers is intel, regardless of whether anyone classified it. Verdict is sticky: True never downgrades to False/None on subsequent observations. Out of scope: API surface, federation export, retention. Ingester _publish_email_received calls the provider for each attachment sha256, sets mal_hash_match on the bus payload (omitted entirely when the message had no attachments — keeps R0046's `is True` predicate silent on hash-less mail, matching pre-paydown behavior), and upserts the row regardless of provider availability.
This commit is contained in:
216
tests/web/test_ingester_mal_hash.py
Normal file
216
tests/web/test_ingester_mal_hash.py
Normal file
@@ -0,0 +1,216 @@
|
||||
"""Ingester wiring for mal_hash + observed_attachments (DEBT-046).
|
||||
|
||||
Validates `_publish_email_received` against a stub repo + stub provider:
|
||||
|
||||
* Provider hit on any attachment hash → ``mal_hash_match=True`` on the bus payload
|
||||
* Provider clean on every hash → ``mal_hash_match=False`` on the bus payload
|
||||
* No attachments → field omitted from the payload entirely
|
||||
* Every observed hash lands in ``observed_attachments`` with the verdict baked in
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from unittest.mock import AsyncMock
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.intel import factory as intel_factory
|
||||
|
||||
|
||||
class _StubRepo:
|
||||
def __init__(self) -> None:
|
||||
self.observed: list[dict] = []
|
||||
self.get_attacker_uuid_by_ip = AsyncMock(return_value="atk-1")
|
||||
|
||||
async def upsert_observed_attachment(self, **kwargs):
|
||||
self.observed.append(kwargs)
|
||||
return "obs-uuid"
|
||||
|
||||
|
||||
class _StubBus:
|
||||
def __init__(self) -> None:
|
||||
self.published: list[dict] = []
|
||||
|
||||
async def connect(self):
|
||||
return None
|
||||
|
||||
async def close(self):
|
||||
return None
|
||||
|
||||
|
||||
class _StubProvider:
|
||||
name = "malwarebazaar"
|
||||
|
||||
def __init__(self, hits: set[str]):
|
||||
self._hits = hits
|
||||
|
||||
async def is_known_bad(self, sha256: str) -> bool:
|
||||
return sha256 in self._hits
|
||||
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def _reset_factory():
|
||||
intel_factory._reset_mal_hash_provider_for_testing()
|
||||
yield
|
||||
intel_factory._reset_mal_hash_provider_for_testing()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def patched_bus(monkeypatch):
|
||||
"""Patch out the ingester's bus singleton so publishes capture
|
||||
instead of going to the wire."""
|
||||
captured: list[dict] = []
|
||||
|
||||
async def _publish_safely(bus, topic, payload, *, event_type=None):
|
||||
captured.append({"topic": topic, "payload": payload, "event_type": event_type})
|
||||
|
||||
def _get_bus(client_name=""):
|
||||
return _StubBus()
|
||||
|
||||
from decnet.web import ingester as mod
|
||||
monkeypatch.setattr(mod, "publish_safely", _publish_safely)
|
||||
monkeypatch.setattr(mod, "get_bus", _get_bus)
|
||||
return captured
|
||||
|
||||
|
||||
def _log_data() -> dict:
|
||||
return {
|
||||
"attacker_ip": "203.0.113.5",
|
||||
"decky": "decky-uuid",
|
||||
"service": "smtp",
|
||||
}
|
||||
|
||||
|
||||
def _fields(*, attachments: list[dict] | None) -> dict:
|
||||
return {
|
||||
"msg_id": "<m1@x>",
|
||||
"subject": "Test",
|
||||
"from_hdr": "atk@evil.example",
|
||||
"mail_from": "atk@evil.example",
|
||||
"return_path": "atk@evil.example",
|
||||
"rcpt_to": "victim@corp.example",
|
||||
"x_mailer": "Outlook",
|
||||
"dkim_signed": 0,
|
||||
"spf_pass": 0,
|
||||
"urls_json": "[]",
|
||||
"attachments_json": json.dumps(attachments) if attachments is not None else "[]",
|
||||
"attachment_count": len(attachments) if attachments else 0,
|
||||
"body_simhash": "0123456789abcdef",
|
||||
"body_base64_bytes": 0,
|
||||
"html_smuggling": 0,
|
||||
"stored_as": "/spool/m1.eml",
|
||||
"sha256": "f" * 64,
|
||||
}
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_known_bad_attachment_sets_mal_hash_match_true(patched_bus, monkeypatch):
|
||||
bad = "a" * 64
|
||||
clean = "b" * 64
|
||||
|
||||
def _factory():
|
||||
return _StubProvider(hits={bad})
|
||||
|
||||
monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)
|
||||
from decnet.web import ingester as mod
|
||||
monkeypatch.setattr(
|
||||
"decnet.intel.factory.get_mal_hash_provider", _factory,
|
||||
)
|
||||
|
||||
repo = _StubRepo()
|
||||
await mod._publish_email_received(
|
||||
repo, _log_data(),
|
||||
_fields(attachments=[
|
||||
{"sha256": bad, "extension": "docx"},
|
||||
{"sha256": clean, "extension": "pdf"},
|
||||
]),
|
||||
)
|
||||
|
||||
assert len(patched_bus) == 1
|
||||
payload = patched_bus[0]["payload"]
|
||||
assert payload["mal_hash_match"] is True
|
||||
assert payload["attachment_sha256s"] == [bad, clean]
|
||||
|
||||
# Both hashes recorded with their verdicts.
|
||||
by_hash = {o["sha256"]: o for o in repo.observed}
|
||||
assert by_hash[bad]["mal_hash_match"] is True
|
||||
assert by_hash[bad]["mal_hash_match_provider"] == "malwarebazaar"
|
||||
assert by_hash[clean]["mal_hash_match"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_clean_attachments_sets_mal_hash_match_false(patched_bus, monkeypatch):
|
||||
clean = "c" * 64
|
||||
|
||||
def _factory():
|
||||
return _StubProvider(hits=set())
|
||||
|
||||
monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)
|
||||
monkeypatch.setattr(
|
||||
"decnet.intel.factory.get_mal_hash_provider", _factory,
|
||||
)
|
||||
|
||||
from decnet.web import ingester as mod
|
||||
repo = _StubRepo()
|
||||
await mod._publish_email_received(
|
||||
repo, _log_data(),
|
||||
_fields(attachments=[{"sha256": clean, "extension": "pdf"}]),
|
||||
)
|
||||
|
||||
payload = patched_bus[0]["payload"]
|
||||
assert payload["mal_hash_match"] is False
|
||||
assert len(repo.observed) == 1
|
||||
assert repo.observed[0]["mal_hash_match"] is False
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_no_attachments_omits_mal_hash_match(patched_bus, monkeypatch):
|
||||
def _factory():
|
||||
return _StubProvider(hits=set())
|
||||
|
||||
monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)
|
||||
monkeypatch.setattr(
|
||||
"decnet.intel.factory.get_mal_hash_provider", _factory,
|
||||
)
|
||||
|
||||
from decnet.web import ingester as mod
|
||||
repo = _StubRepo()
|
||||
await mod._publish_email_received(
|
||||
repo, _log_data(), _fields(attachments=[]),
|
||||
)
|
||||
|
||||
payload = patched_bus[0]["payload"]
|
||||
assert "mal_hash_match" not in payload
|
||||
assert repo.observed == []
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_provider_unavailable_still_persists_hashes_without_verdict(
|
||||
patched_bus, monkeypatch,
|
||||
):
|
||||
"""If the provider factory returns None (intel disabled), the
|
||||
ingester must still write observations — DECNET is a platform; we
|
||||
keep the hashes regardless of whether anyone classified them."""
|
||||
def _factory():
|
||||
return None
|
||||
|
||||
monkeypatch.setattr(intel_factory, "get_mal_hash_provider", _factory)
|
||||
monkeypatch.setattr(
|
||||
"decnet.intel.factory.get_mal_hash_provider", _factory,
|
||||
)
|
||||
|
||||
from decnet.web import ingester as mod
|
||||
repo = _StubRepo()
|
||||
sha = "d" * 64
|
||||
await mod._publish_email_received(
|
||||
repo, _log_data(),
|
||||
_fields(attachments=[{"sha256": sha, "extension": "exe"}]),
|
||||
)
|
||||
|
||||
payload = patched_bus[0]["payload"]
|
||||
# No provider → False on the bus (everything checked = clean), and
|
||||
# the row lands with mal_hash_match=None (no verdict).
|
||||
assert payload["mal_hash_match"] is False
|
||||
assert len(repo.observed) == 1
|
||||
assert repo.observed[0]["mal_hash_match"] is None
|
||||
assert repo.observed[0]["mal_hash_match_provider"] is None
|
||||
187
tests/web/test_observed_attachments_repo.py
Normal file
187
tests/web/test_observed_attachments_repo.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""Repo tests for ``observed_attachments`` upsert (DEBT-046).
|
||||
|
||||
The table is the per-hash sibling of ``attacker_intel`` — every
|
||||
attachment hash crossing a decky lands here, with metadata accumulated
|
||||
across observations.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.web.db.sqlite.repository import SQLiteRepository
|
||||
|
||||
_HASH_A = "a" * 64
|
||||
_HASH_B = "b" * 64
|
||||
|
||||
|
||||
async def _make_repo(tmp_path) -> SQLiteRepository:
|
||||
r = SQLiteRepository(db_path=str(tmp_path / "obs.db"))
|
||||
await r.initialize()
|
||||
return r
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_first_observation_creates_row(tmp_path):
|
||||
repo = await _make_repo(tmp_path)
|
||||
uuid = await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A.upper(), # provider may pass mixed-case
|
||||
decky_uuid="d-1",
|
||||
attacker_uuid="atk-1",
|
||||
extension="DOCX",
|
||||
subject="Invoice",
|
||||
mal_hash_match=False,
|
||||
mal_hash_match_provider="malwarebazaar",
|
||||
)
|
||||
assert uuid
|
||||
|
||||
from decnet.web.db.models import ObservedAttachment
|
||||
from sqlalchemy import select
|
||||
async with repo._session() as session:
|
||||
row = (
|
||||
await session.execute(
|
||||
select(ObservedAttachment).where(
|
||||
ObservedAttachment.sha256 == _HASH_A,
|
||||
),
|
||||
)
|
||||
).scalar_one()
|
||||
assert row.sha256 == _HASH_A # lowercased
|
||||
assert row.observation_count == 1
|
||||
assert row.first_seen_decky_uuid == "d-1"
|
||||
assert row.first_seen_attacker_uuid == "atk-1"
|
||||
assert row.last_seen_attacker_uuid == "atk-1"
|
||||
assert row.extensions == ["docx"]
|
||||
assert row.first_subject == "Invoice"
|
||||
assert row.mal_hash_match is False
|
||||
assert row.mal_hash_match_provider == "malwarebazaar"
|
||||
assert row.mal_hash_match_at is not None
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_re_observation_increments_and_updates_last_seen(tmp_path):
|
||||
repo = await _make_repo(tmp_path)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d-1", attacker_uuid="atk-1",
|
||||
extension="docx", subject="Old subject",
|
||||
mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d-2", attacker_uuid="atk-2",
|
||||
extension="docx", subject="New subject",
|
||||
mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
|
||||
from decnet.web.db.models import ObservedAttachment
|
||||
from sqlalchemy import select
|
||||
async with repo._session() as session:
|
||||
row = (
|
||||
await session.execute(
|
||||
select(ObservedAttachment).where(
|
||||
ObservedAttachment.sha256 == _HASH_A,
|
||||
),
|
||||
)
|
||||
).scalar_one()
|
||||
assert row.observation_count == 2
|
||||
# First-seen anchors stay pinned; last-seen attacker rolls forward.
|
||||
assert row.first_seen_decky_uuid == "d-1"
|
||||
assert row.first_seen_attacker_uuid == "atk-1"
|
||||
assert row.last_seen_attacker_uuid == "atk-2"
|
||||
# Subject is the FIRST subject; not overwritten.
|
||||
assert row.first_subject == "Old subject"
|
||||
# Extension already known — no duplicate.
|
||||
assert row.extensions == ["docx"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_distinct_extension_appends_deduped(tmp_path):
|
||||
repo = await _make_repo(tmp_path)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
|
||||
extension="docx", subject=None,
|
||||
mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
|
||||
extension="DOC", # different ext, mixed case
|
||||
subject=None, mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
|
||||
extension="doc", # repeat → no-op
|
||||
subject=None, mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
|
||||
from decnet.web.db.models import ObservedAttachment
|
||||
from sqlalchemy import select
|
||||
async with repo._session() as session:
|
||||
row = (
|
||||
await session.execute(
|
||||
select(ObservedAttachment).where(
|
||||
ObservedAttachment.sha256 == _HASH_A,
|
||||
),
|
||||
)
|
||||
).scalar_one()
|
||||
assert sorted(row.extensions) == ["doc", "docx"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_true_is_sticky(tmp_path):
|
||||
"""Once any provider says True, subsequent None/False observations
|
||||
don't downgrade. A hash a feed later forgets is still a hash that
|
||||
feed once flagged."""
|
||||
repo = await _make_repo(tmp_path)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
|
||||
extension=None, subject=None,
|
||||
mal_hash_match=True, mal_hash_match_provider="malwarebazaar",
|
||||
)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
|
||||
extension=None, subject=None,
|
||||
mal_hash_match=False, mal_hash_match_provider="malwarebazaar",
|
||||
)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_A, decky_uuid="d", attacker_uuid="a",
|
||||
extension=None, subject=None,
|
||||
mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
|
||||
from decnet.web.db.models import ObservedAttachment
|
||||
from sqlalchemy import select
|
||||
async with repo._session() as session:
|
||||
row = (
|
||||
await session.execute(
|
||||
select(ObservedAttachment).where(
|
||||
ObservedAttachment.sha256 == _HASH_A,
|
||||
),
|
||||
)
|
||||
).scalar_one()
|
||||
assert row.mal_hash_match is True
|
||||
assert row.mal_hash_match_provider == "malwarebazaar"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_verdict_none_then_true_writes_through(tmp_path):
|
||||
repo = await _make_repo(tmp_path)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_B, decky_uuid="d", attacker_uuid="a",
|
||||
extension=None, subject=None,
|
||||
mal_hash_match=None, mal_hash_match_provider=None,
|
||||
)
|
||||
await repo.upsert_observed_attachment(
|
||||
sha256=_HASH_B, decky_uuid="d", attacker_uuid="a",
|
||||
extension=None, subject=None,
|
||||
mal_hash_match=True, mal_hash_match_provider="malwarebazaar",
|
||||
)
|
||||
|
||||
from decnet.web.db.models import ObservedAttachment
|
||||
from sqlalchemy import select
|
||||
async with repo._session() as session:
|
||||
row = (
|
||||
await session.execute(
|
||||
select(ObservedAttachment).where(
|
||||
ObservedAttachment.sha256 == _HASH_B,
|
||||
),
|
||||
)
|
||||
).scalar_one()
|
||||
assert row.mal_hash_match is True
|
||||
assert row.mal_hash_match_provider == "malwarebazaar"
|
||||
Reference in New Issue
Block a user