SMTP message-level technique tagger per Appendix A.6: open relay abuse (rcpt_count + foreign From), mass phishing (rcpt_count + body simhash), phishing-kit X-Mailer, IDN/punycode URL, sender masquerade composite (From/Return-Path/DKIM/SPF), malicious attachment (macro / .lnk / .iso / .img / hash match), BEC subject+body composite, encoded payload in body. PII discipline (TTP_TAGGING.md §'Hard parts §6') is enforced at the lifter layer via _filter_evidence(): emitted TTPTag.evidence is restricted to the EmailEvidence-allowed allowlist (body_sha256; matched_headers, names only; rcpt_domain_set, domains only; attachment_sha256s; rcpt_count) plus PII-safe match discriminators (matched_kit, matched_trigger, matched_url_host, etc.). Raw addresses, raw body bytes, full URLs, and decoded base64 previews NEVER appear in evidence — defense-in-depth over the YAML evidence_fields hint. Tests: tests/ttp/test_email_lifter.py per-rule positive + negative + PII allowlist guard + state modulation. tests/ttp/rule_precision/test_email_rules.py xfail flipped to real precision (R0041-R0048 H-band ≥95%). Corpus rows updated to acknowledge that R0045 (masquerade) co-fires with R0041 / R0047 when the sender-masquerade signals are present alongside open-relay or BEC patterns — overlap is by design, not a precision bug.
89 lines
2.8 KiB
Python
89 lines
2.8 KiB
Python
"""R0041-R0048 — email cohort.
|
||
|
||
EmailLifter (E.3.12) consumes these by rule_id. The v0
|
||
:class:`RuleEngine` cannot parse SMTP envelopes, walk attachment
|
||
trees, or compose header / body / attachment signals — so these
|
||
rules are inert under the regex matcher.
|
||
|
||
Asserts each YAML compiles, none fire from the v0 engine, and a
|
||
per-rule precision check (≥95%) driven through EmailLifter (E.3.12).
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
from collections.abc import Callable
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
from decnet.ttp.impl.rule_engine import RuleEngine
|
||
from decnet.ttp.store.base import RuleState
|
||
from decnet.ttp.store.impl.filesystem import _parse_and_compile
|
||
from tests.ttp.rule_precision.conftest import CorpusRow, make_event
|
||
|
||
# Type of the ``corpus_loader`` argument taken by the tests in this module:
# it is called with a cohort name (here always "email") and returns that
# cohort's corpus rows.
CohortLoader = Callable[[str], list[CorpusRow]]
|
||
|
||
# Rule ids of the email cohort: R0041 through R0048 inclusive.
_RULE_IDS = [f"R{number:04d}" for number in range(41, 49)]
|
||
|
||
|
||
@pytest.mark.parametrize("rule_id", _RULE_IDS)
def test_rule_yaml_present(rule_id: str) -> None:
    """Check that the rule's YAML file exists and compiles under its own id.

    The file is looked up as ``rules/ttp/<rule_id>.yaml`` relative to the
    current working directory.
    """
    yaml_file = Path("rules/ttp").joinpath(f"{rule_id}.yaml")
    assert yaml_file.exists(), f"missing YAML: {yaml_file}"

    # Compile against a fresh RuleState and confirm the id round-trips.
    rule = _parse_and_compile(yaml_file, RuleState())
    assert rule.rule_id == rule_id
|
||
|
||
|
||
@pytest.mark.parametrize("rule_id", _RULE_IDS)
async def test_lifter_bound_inert_in_v0(
    rule_id: str,
    precision_engine: RuleEngine,
    corpus_loader: CohortLoader,
) -> None:
    """Check that the regex engine never emits an email-cohort rule.

    Every row of the "email" corpus is evaluated by ``precision_engine`` and
    the rule ids of all returned tags are collected; ``rule_id`` must not be
    among them. An empty corpus leaves the collection empty, so the test
    passes trivially in that case.
    """
    seen_rule_ids: set[str] = set()
    for corpus_row in corpus_loader("email"):
        event = make_event(corpus_row)
        for tag in await precision_engine.evaluate(event):
            seen_rule_ids.add(tag.rule_id)

    assert rule_id not in seen_rule_ids, (
        f"{rule_id} is lifter-bound but fired from the regex engine"
    )
|
||
|
||
|
||
def _build_lifter() -> "EmailLifter":
    """Build an EmailLifter loaded with the compiled email-cohort rules.

    Each ``rules/ttp/<rule_id>.yaml`` in ``_RULE_IDS`` is compiled against a
    fresh ``RuleState``. The compiled rules are handed to a ``StubRuleStore``
    that backs the lifter, and are also installed one by one into the
    lifter's private ``_index``.

    Returns:
        The configured ``EmailLifter`` instance.
    """
    # Imported lazily, inside the helper, exactly as the module does for
    # these two names.
    from decnet.ttp.impl.email_lifter import EmailLifter
    from tests.ttp._stub_store import StubRuleStore

    rules_dir = Path("rules/ttp")
    compiled_rules = [
        _parse_and_compile(rules_dir / f"{cohort_id}.yaml", RuleState())
        for cohort_id in _RULE_IDS
    ]

    email_lifter = EmailLifter(StubRuleStore(compiled=compiled_rules))
    # NOTE(review): presumably the constructor does not populate the index
    # from the store on its own, hence the explicit installs — confirm.
    for compiled_rule in compiled_rules:
        email_lifter._index.install(compiled_rule)
    return email_lifter
|
||
|
||
|
||
@pytest.mark.parametrize("rule_id", _RULE_IDS)
def test_email_rule_precision(
    rule_id: str,
    corpus_loader: CohortLoader,
) -> None:
    """E.3.12 — drive EmailLifter over the labelled corpus and assert
    per-rule precision. R0041–R0048 are all H-band (≥0.85) → ≥95%.

    The test is skipped when the "email" corpus is empty. Otherwise every
    row is tagged by a lifter from ``_build_lifter()``, the fired rule ids
    are recorded per row label, and ``precision_for`` computes the precision
    for ``rule_id``, which must be at least 0.95.
    """
    import asyncio

    from tests.ttp.rule_precision.conftest import precision_for

    rows = corpus_loader("email")
    if not rows:
        pytest.skip("no email corpus available")
    lifter = _build_lifter()

    async def _tag_corpus() -> dict[str, list[str]]:
        """Tag every corpus row in order and map row label -> fired rule ids."""
        # Rows are awaited one after another (no gather) so the lifter sees
        # them in corpus order, exactly as the per-row version did.
        # NOTE(review): results are keyed by row.label, so rows sharing a
        # label would overwrite each other — assumes labels are unique;
        # confirm against the corpus / precision_for.
        fired_by_label: dict[str, list[str]] = {}
        for row in rows:
            tags = await lifter.tag(make_event(row))
            fired_by_label[row.label] = [tag.rule_id for tag in tags]
        return fired_by_label

    # A single asyncio.run() for the whole corpus: asyncio.run() creates and
    # closes a fresh event loop on every call, so calling it per row paid
    # that setup/teardown once per corpus row for no benefit.
    fired = asyncio.run(_tag_corpus())

    precision, _tp, _fp = precision_for(rule_id, rows, fired)
    assert precision >= 0.95, (
        f"{rule_id} precision {precision:.2f} < 0.95 on email corpus"
    )
|