feat(intel,ingester): mal_hash feed + observed_attachments table (DEBT-046)
New MalHashProvider sibling ABC (decnet/intel/base.py) since SHA-256 is a different keyspace from IntelProvider's IPs. MalwareBazaarProvider mirrors FeodoProvider's bulk-feed shape: 24h refresh via _ensure_fresh / _refresh, in-memory set[str] of hex-lowercased hashes, set-membership lookup. Auth-keyed via DECNET_MALWAREBAZAAR_AUTH_KEY; absent key silent-no-ops the lane (single warning, no HTTP traffic). Per-hash observations persist to a new observed_attachments table. DECNET is a honeypot platform — every attachment hash an attacker delivers is intel, regardless of whether anyone classified it. Verdict is sticky: True never downgrades to False/None on subsequent observations. Out of scope: API surface, federation export, retention. Ingester _publish_email_received calls the provider for each attachment sha256, sets mal_hash_match on the bus payload (omitted entirely when the message had no attachments — keeps R0046's `is True` predicate silent on hash-less mail, matching pre-paydown behavior), and upserts the row regardless of provider availability.
This commit is contained in:
@@ -78,3 +78,33 @@ class IntelProvider(ABC):
|
||||
entire IP. Implementations should also respect
|
||||
``self._semaphore`` to bound in-flight calls.
|
||||
"""
|
||||
|
||||
|
||||
class MalHashProvider(ABC):
|
||||
"""Abstract bad-hash lookup provider.
|
||||
|
||||
Sibling to :class:`IntelProvider` — different keyspace (file SHA-256
|
||||
vs IP), different consumer (the email ingester at observation time,
|
||||
not the IP-keyed intel-worker fan-out). Kept as a separate ABC so
|
||||
the ``lookup(ip)`` semantics on ``IntelProvider`` stay honest.
|
||||
|
||||
Concrete impls today:
|
||||
|
||||
* :class:`decnet.intel.mal_hash.MalwareBazaarProvider` — bulk-feed
|
||||
shape mirroring :class:`decnet.intel.feodo.FeodoProvider`.
|
||||
|
||||
Future impls (paid VirusTotal subscription, in-house allowlist) plug
|
||||
in behind the same factory in :func:`decnet.intel.factory.get_mal_hash_provider`.
|
||||
"""
|
||||
|
||||
name: str
|
||||
|
||||
@abstractmethod
|
||||
async def is_known_bad(self, sha256: str) -> bool:
|
||||
"""Return whether *sha256* is on this provider's bad-hash list.
|
||||
|
||||
MUST NOT raise — return ``False`` on any error (the caller is the
|
||||
ingester, not a worker; an exception here would taint a totally
|
||||
unrelated bus payload). The provider is responsible for logging
|
||||
its own errors.
|
||||
"""
|
||||
|
||||
@@ -21,7 +21,7 @@ from __future__ import annotations
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from decnet.intel.base import IntelProvider
|
||||
from decnet.intel.base import IntelProvider, MalHashProvider
|
||||
|
||||
_KNOWN_PROVIDERS = ("greynoise", "abuseipdb", "feodo", "threatfox")
|
||||
|
||||
@@ -37,6 +37,40 @@ def _provider_list() -> list[str]:
|
||||
return [p.strip().lower() for p in raw.split(",") if p.strip()]
|
||||
|
||||
|
||||
_mal_hash_singleton: MalHashProvider | None = None
|
||||
_mal_hash_initialized: bool = False
|
||||
|
||||
|
||||
def get_mal_hash_provider() -> MalHashProvider | None:
|
||||
"""Return the configured malware-hash lookup provider singleton.
|
||||
|
||||
Sibling factory to :func:`get_intel_providers` — different keyspace
|
||||
(file SHA-256 vs IP), different consumer (the email ingester at
|
||||
observation time, not the IP-keyed intel-worker fan-out). Returns
|
||||
``None`` only if intel is disabled wholesale; otherwise returns a
|
||||
provider whose :meth:`is_known_bad` self-disables to a no-op when
|
||||
``DECNET_MALWAREBAZAAR_AUTH_KEY`` is unset, so the ingester never
|
||||
has to special-case "no provider configured."
|
||||
"""
|
||||
global _mal_hash_singleton, _mal_hash_initialized
|
||||
if _mal_hash_initialized:
|
||||
return _mal_hash_singleton
|
||||
_mal_hash_initialized = True
|
||||
if not _enabled():
|
||||
_mal_hash_singleton = None
|
||||
return None
|
||||
from decnet.intel.mal_hash import MalwareBazaarProvider
|
||||
_mal_hash_singleton = MalwareBazaarProvider()
|
||||
return _mal_hash_singleton
|
||||
|
||||
|
||||
def _reset_mal_hash_provider_for_testing() -> None:
|
||||
"""Test hook — drop the singleton so the next call re-reads env."""
|
||||
global _mal_hash_singleton, _mal_hash_initialized
|
||||
_mal_hash_singleton = None
|
||||
_mal_hash_initialized = False
|
||||
|
||||
|
||||
def get_intel_providers() -> List[IntelProvider]:
|
||||
"""Return the configured threat-intel providers.
|
||||
|
||||
|
||||
195
decnet/intel/mal_hash.py
Normal file
195
decnet/intel/mal_hash.py
Normal file
@@ -0,0 +1,195 @@
|
||||
"""MalwareBazaar bad-hash provider — bulk SHA-256 feed.
|
||||
|
||||
Mirrors :mod:`decnet.intel.feodo` for the refresh / TTL / set-membership
|
||||
shape, but operates on the SHA-256 keyspace instead of IPs and so
|
||||
implements :class:`decnet.intel.base.MalHashProvider` rather than
|
||||
:class:`IntelProvider`. Keep the two ABCs disjoint — see ``base.py``.
|
||||
|
||||
Endpoint: ``GET https://bazaar.abuse.ch/export/csv/full/`` with
|
||||
``Auth-Key: <key>`` header. Returns a ZIP'd CSV with one row per
|
||||
sample; the ``sha256_hash`` column is the natural key. ~900K rows ≈
|
||||
30 MB resident as a ``set[str]`` of hex-lowercased hashes.
|
||||
|
||||
Auth-key is read from ``DECNET_MALWAREBAZAAR_AUTH_KEY``. When unset,
|
||||
the provider logs one warning at first refresh attempt and disables
|
||||
itself for the process lifetime — :meth:`is_known_bad` returns ``False``
|
||||
without ever making a network call. The ingester treats that the same
|
||||
as "no opinion," so R0046's ``mal_hash_match`` lane stays absent on the
|
||||
bus payload (which is exactly what the predicate's ``is True`` check
|
||||
does today, so the silent-no-op is behaviorally identical to "lane not
|
||||
shipped yet").
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
import zipfile
|
||||
from typing import Optional
|
||||
|
||||
from decnet.intel.base import MalHashProvider
|
||||
from decnet.logging import get_logger
|
||||
from decnet.net.http import stealth_client
|
||||
|
||||
log = get_logger("intel.mal_hash")
|
||||
|
||||
_ENDPOINT = "https://bazaar.abuse.ch/export/csv/full/"
|
||||
_DEFAULT_REFRESH_S = 86_400.0 # 24h — feed is daily, no need to hammer
|
||||
_AUTH_KEY_ENV = "DECNET_MALWAREBAZAAR_AUTH_KEY"
|
||||
_REFRESH_INTERVAL_ENV = "DECNET_MAL_HASH_REFRESH_INTERVAL_S"
|
||||
|
||||
|
||||
def _read_refresh_interval() -> float:
|
||||
raw = os.environ.get(_REFRESH_INTERVAL_ENV)
|
||||
if raw is None:
|
||||
return _DEFAULT_REFRESH_S
|
||||
try:
|
||||
return float(raw)
|
||||
except ValueError:
|
||||
log.warning(
|
||||
"%s=%r not a float; falling back to default %.0f",
|
||||
_REFRESH_INTERVAL_ENV, raw, _DEFAULT_REFRESH_S,
|
||||
)
|
||||
return _DEFAULT_REFRESH_S
|
||||
|
||||
|
||||
class MalwareBazaarProvider(MalHashProvider):
|
||||
"""Bulk SHA-256 lookup against MalwareBazaar's full export."""
|
||||
|
||||
name = "malwarebazaar"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
auth_key: Optional[str] = None,
|
||||
refresh_interval_s: Optional[float] = None,
|
||||
) -> None:
|
||||
self._auth_key = auth_key or os.environ.get(_AUTH_KEY_ENV) or None
|
||||
self._refresh_interval_s = (
|
||||
refresh_interval_s
|
||||
if refresh_interval_s is not None
|
||||
else _read_refresh_interval()
|
||||
)
|
||||
self._known: set[str] = set()
|
||||
self._loaded_at: float = 0.0
|
||||
self._last_error: Optional[str] = None
|
||||
self._disabled_warned: bool = False
|
||||
|
||||
@property
|
||||
def disabled(self) -> bool:
|
||||
return self._auth_key is None
|
||||
|
||||
async def _refresh(self) -> Optional[str]:
|
||||
"""Refetch the bulk feed. Returns an error string or ``None``."""
|
||||
if self._auth_key is None:
|
||||
return "no auth key"
|
||||
try:
|
||||
async with stealth_client(timeout=60.0) as client:
|
||||
resp = await client.get(
|
||||
_ENDPOINT, headers={"Auth-Key": self._auth_key},
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return f"network: {exc}"
|
||||
if resp.status_code != 200:
|
||||
return f"HTTP {resp.status_code}"
|
||||
body = resp.content
|
||||
try:
|
||||
new_known = _parse_dump(body)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
return f"parse: {exc}"
|
||||
if not new_known:
|
||||
return "feed: empty"
|
||||
self._known = new_known
|
||||
self._loaded_at = time.monotonic()
|
||||
self._last_error = None
|
||||
log.info("malwarebazaar: refreshed bulk feed entries=%d", len(new_known))
|
||||
return None
|
||||
|
||||
async def _ensure_fresh(self) -> None:
|
||||
if self.disabled:
|
||||
if not self._disabled_warned:
|
||||
log.warning(
|
||||
"R0046 mal_hash_match disabled: %s unset",
|
||||
_AUTH_KEY_ENV,
|
||||
)
|
||||
self._disabled_warned = True
|
||||
return
|
||||
if (
|
||||
not self._known
|
||||
or (time.monotonic() - self._loaded_at) >= self._refresh_interval_s
|
||||
):
|
||||
err = await self._refresh()
|
||||
if err:
|
||||
self._last_error = err
|
||||
log.warning("malwarebazaar refresh failed: %s", err)
|
||||
|
||||
async def is_known_bad(self, sha256: str) -> bool:
|
||||
if self.disabled:
|
||||
return False
|
||||
try:
|
||||
await self._ensure_fresh()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
# Belt and braces: _ensure_fresh swallows refresh failures
|
||||
# but a bug in there shouldn't blow up the ingester payload.
|
||||
log.exception("malwarebazaar refresh raised: %s", exc)
|
||||
return False
|
||||
return sha256.lower() in self._known
|
||||
|
||||
|
||||
def _parse_dump(body: bytes) -> set[str]:
|
||||
"""Extract SHA-256 hashes from MalwareBazaar's full dump.
|
||||
|
||||
The endpoint returns a ZIP archive containing a single CSV with a
|
||||
``sha256_hash`` column. Some abuse.ch flavours of the same feed
|
||||
family ship plain CSV instead — handle both by sniffing the magic
|
||||
bytes. Hashes are lowercased; non-hex / wrong-length values are
|
||||
dropped (defense in depth — we set-membership-test by exact match).
|
||||
"""
|
||||
if body[:2] == b"PK":
|
||||
with zipfile.ZipFile(io.BytesIO(body)) as zf:
|
||||
csv_names = [n for n in zf.namelist() if n.lower().endswith(".csv")]
|
||||
if not csv_names:
|
||||
raise ValueError("zip has no .csv member")
|
||||
with zf.open(csv_names[0]) as fh:
|
||||
csv_bytes = fh.read()
|
||||
else:
|
||||
csv_bytes = body
|
||||
text = csv_bytes.decode("utf-8", errors="replace")
|
||||
return _extract_hashes(text)
|
||||
|
||||
|
||||
def _extract_hashes(text: str) -> set[str]:
|
||||
"""Pull the ``sha256_hash`` column out of MalwareBazaar's CSV.
|
||||
|
||||
The dump prefaces the table with ``#``-prefixed comment lines.
|
||||
Skip those, find the header row, locate the column, then read the
|
||||
rest. csv.reader handles the quoting (the ``signature`` column
|
||||
contains commas and is properly quoted in the dump).
|
||||
"""
|
||||
body_lines = [
|
||||
line for line in text.splitlines()
|
||||
if line and not line.lstrip().startswith("#")
|
||||
]
|
||||
if not body_lines:
|
||||
return set()
|
||||
reader = csv.reader(body_lines)
|
||||
header = next(reader, None)
|
||||
if not header:
|
||||
return set()
|
||||
norm = [h.strip().strip('"').lower() for h in header]
|
||||
try:
|
||||
col = norm.index("sha256_hash")
|
||||
except ValueError:
|
||||
# Fallback — first column is sha256 in every documented
|
||||
# variant; if the header naming changes upstream we still
|
||||
# capture something rather than silently emptying the set.
|
||||
col = 0
|
||||
out: set[str] = set()
|
||||
for row in reader:
|
||||
if len(row) <= col:
|
||||
continue
|
||||
cell = row[col].strip().strip('"').lower()
|
||||
if len(cell) == 64 and all(c in "0123456789abcdef" for c in cell):
|
||||
out.add(cell)
|
||||
return out
|
||||
Reference in New Issue
Block a user