Files
DECNET/tests/ttp/test_confidence.py
anti fee697694d feat(ttp): E.3.3 repository — insert_tags + listing rollups (dual backend)
Dialect-split: portable rollup queries on TTPMixin; bulk insert with
ON CONFLICT DO NOTHING / INSERT IGNORE in the per-dialect repos.
Confidence-floor (< 0.3) drop applied at mixin layer before the
dialect hook. BaseRepository now declares the six TTP methods abstract.

Tests in tests/web/db/test_ttp_repo.py flipped from pytest.fail stubs
to real dual-backend behavioral tests; tests/ttp/test_confidence.py
drop-below-floor xfail removed.
2026-05-01 08:04:46 -04:00

133 lines
5.0 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""E.2.10 — Confidence model tests.
Pins the confidence calculus from ``development/TTP_TAGGING.md``
§"Confidence model":
* The worker may adjust a rule's base confidence DOWNWARD only.
``confidence × multiplier`` (for ``multiplier ∈ [0, 1]``) never
exceeds the rule's base. Property-tested via Hypothesis.
* A computed confidence below ``0.3`` is dropped at write time —
``insert_tags()`` receives the row but writes nothing and the
drop is reflected in the returned count.
* Worked example: AbuseIPDB ``score=30`` → ``0.85 × 0.30 = 0.255`` →
dropped, no row written.
Pure-arithmetic assertions are GREEN today, and so is the
``insert_tags`` floor pin (its end-to-end drop semantics are
exercised in ``tests/web/db/test_ttp_repo.py``). The `intel_lifter`
provider-score multiplier wiring still lives behind
``xfail(strict=True)`` until impl step E.3.10 lands.
"""
from __future__ import annotations
import pytest
from hypothesis import given, strategies as st
# Test-local copy of the confidence floor described in the module
# docstring: a computed confidence below this value is dropped at write
# time. ``test_floor_constant_pins_doc_value`` pins the literal, and
# ``test_below_floor_dropped_at_insert`` checks it against the
# repository-side constant.
CONFIDENCE_FLOOR: float = 0.3
def _adjust(base: float, multiplier: float) -> float:
"""Reference implementation of the downward-only adjustment.
The real worker (lands at E.3.7 / E.3.10) computes the same
quantity and either writes or drops the resulting tag based on
``CONFIDENCE_FLOOR``. Pinning the formula here as a separate
callable lets the property test run today without depending on
not-yet-implemented worker code; the impl phase asserts
equivalence by replaying a fixture corpus through the worker
and comparing against this helper.
"""
if not 0.0 <= multiplier <= 1.0:
raise ValueError(f"multiplier {multiplier!r} outside [0, 1]")
return base * multiplier
# ── Pure-math properties (GREEN today) ──────────────────────────────
@given(
    base=st.floats(min_value=0.0, max_value=1.0, allow_nan=False),
    multiplier=st.floats(min_value=0.0, max_value=1.0, allow_nan=False),
)
def test_adjustment_is_downward_only(base: float, multiplier: float) -> None:
    """Property: the adjusted confidence never exceeds the rule's base.

    Hypothesis draws both ``base`` and ``multiplier`` from ``[0, 1]``
    (NaN excluded). Multipliers model honeypot context (decky realism),
    repetition and identity coherence, none of which may raise a tag's
    confidence above the rule that emitted it. The check guards against
    a later change that "boosts" confidence through a multiplier > 1.
    """
    # Tiny tolerance so floating-point rounding cannot cause a false failure.
    fp_slack = 1e-12
    ceiling = base + fp_slack
    assert _adjust(base, multiplier) <= ceiling
@pytest.mark.parametrize(
    ("base", "multiplier", "expected"),
    [
        (0.85, 0.30, 0.255),  # worked example: AbuseIPDB score=30
        (1.0, 1.0, 1.0),  # both inputs at their maximum
        (0.6, 0.5, 0.3),  # lands exactly on the floor
        (0.5, 0.5, 0.25),  # lands under the floor
    ],
)
def test_known_inputs_match_worked_example(
    base: float,
    multiplier: float,
    expected: float,
) -> None:
    """Check ``_adjust`` against hand-computed products.

    The comparison goes through ``pytest.approx`` so binary
    floating-point representation error does not fail the test.
    """
    actual = _adjust(base, multiplier)
    assert actual == pytest.approx(expected)
def test_floor_constant_pins_doc_value() -> None:
    """Guard the documented floor of 0.3.

    Relaxing or tightening ``CONFIDENCE_FLOOR`` must be a deliberate
    change that updates the doc and this test in the same step.
    """
    documented_floor = 0.3
    assert CONFIDENCE_FLOOR == documented_floor
def test_invalid_multiplier_raises() -> None:
    """``_adjust`` rejects multipliers on either side of ``[0, 1]``."""
    # One value above the upper bound, then one below the lower bound.
    for out_of_range in (1.5, -0.1):
        with pytest.raises(ValueError):
            _adjust(0.85, out_of_range)
# ── Drop-below-0.3 (live) + provider multiplier (xfail until E.3.10) ──
def test_below_floor_dropped_at_insert() -> None:
    """Pin the floor that ``insert_tags`` filters on.

    Contract being guarded: ``insert_tags`` writes a row only when
    ``confidence ≥ 0.3``; below-floor rows are silently dropped and the
    returned int reflects the drop (``len(rows_in) - drops``).

    This pure-Python test covers two pieces of that contract:

    * the repository-side :data:`_CONFIDENCE_FLOOR` equals the value
      pinned in this module, and
    * the worked-example confidence (``0.85 × 0.30``) falls under it.

    The end-to-end I/O assertion lives in
    ``tests/web/db/test_ttp_repo.py`` (E.2.13), where the
    ``db_backends`` fixture is wired up.
    """
    # Imported inside the test so module collection does not depend on
    # the repository package.
    from decnet.web.db.sqlmodel_repo.ttp import _CONFIDENCE_FLOOR

    assert _CONFIDENCE_FLOOR == CONFIDENCE_FLOOR

    # Every computed value must sit below the floor, i.e. none of these
    # rows would pass the filter.
    sample_count = 5
    for _ in range(sample_count):
        computed = _adjust(0.85, 0.30)
        assert computed < CONFIDENCE_FLOOR
@pytest.mark.xfail(
    reason=(
        "impl phase E.3.10 — IntelLifter provider-score multiplier "
        "lands with the intel lifter implementation"
    ),
    strict=True,
)
def test_abuseipdb_score_30_dropped() -> None:
    """Worked example, end to end: AbuseIPDB score=30 → 0.255 → dropped.

    Intended behavior: the intel lifter scales its rule's base
    (``0.85``) by the provider score normalised into ``[0, 1]``
    (``30 / 100 = 0.30``). The product is under the floor, so no row is
    written.

    Why this is still a strict xfail: the intel lifter currently returns
    ``[]`` from an empty body, so "no row was written" would hold
    trivially. Proving that the worker COMPUTED ``0.255`` and then
    DROPPED it, rather than never computing anything, needs the real
    implementation from E.3.10.
    """
    pytest.fail("IntelLifter provider-score multiplier not yet implemented")