Files
DECNET/tests/profiler/behave_shell/test_intent_lexicon.py
anti f2b3393669 chore: relicense to AGPL-3.0-or-later and add SPDX headers
Replaces LICENSE (GPLv3 -> AGPLv3) and prepends
`SPDX-License-Identifier: AGPL-3.0-or-later` to every source file
across decnet/, decnet_web/, tests/, scripts/, and tools/.

Rationale: closes the GPLv3 ASP loophole so any party operating a
modified DECNET as a network service must offer their modified
source. Personal copyright (Samuel Paschuan) + inbound=outbound
contributions make a future unilateral relicense infeasible.

- LICENSE: full AGPL-3.0 text (gnu.org/licenses/agpl-3.0.txt)
- COPYRIGHT: project copyright notice
- tools/add_spdx_headers.py: idempotent header injector
  (shebang- and PEP 263-aware)

Touches 1565 source files (.py, .ts, .tsx, .js, .jsx, .css, .sh).
No behavior change; comments only.
2026-05-22 21:04:16 -04:00

149 lines
5.5 KiB
Python

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Step G.0: command-intent lexicon + lexical counter pass.
No primitive is emitted by this commit — it's the shared infrastructure
that G.1-G.8 read from. Tests cover:

* hash-set sanity (no precedence-corrupting overlaps)
* :func:`classify_intent` returns the correct registry label
* the typed-text counter pass increments lexical counters and tracks
  caps / bang runs
* paste-class events do NOT contribute to the typed counters
* PII regression: counters land on ctx, no raw text on observations
"""
from __future__ import annotations
import json
from decnet.profiler.behave_shell import build_context, extract_session
from decnet.profiler.behave_shell._intent import (
DESTRUCTIVE_TOKENS,
EXFIL_TOKENS,
INTENT_PRECEDENCE,
LATERAL_TOKENS,
LEXEME_MAX_LEN,
NEGATIVE_LEXEMES,
OBSCENITY_LEXEMES,
OPSEC_HISTORY_TOKENS,
PERSISTENCE_TOKENS,
POSITIVE_LEXEMES,
RECON_TOKENS,
classify_intent,
)
from decnet.profiler.behave_shell._parse import AsciinemaEvent, hash_token
def _typed(text: str, t0: float = 0.0, dt: float = 0.05) -> list[AsciinemaEvent]:
    """Expand ``text`` into one single-character event per keystroke.

    The character at position ``i`` is stamped ``t0 + i * dt`` and every
    event carries the ``"i"`` event code. An empty ``text`` yields an
    empty list.
    """
    keystrokes: list[AsciinemaEvent] = []
    for index, char in enumerate(text):
        keystrokes.append((t0 + index * dt, "i", char))
    return keystrokes
def test_intent_sets_disjoint_where_precedence_matters() -> None:
    """Token sets whose overlap would corrupt precedence stay disjoint.

    A recon-only token must never end up classified as ``destructive``
    (the high-precedence label). Overlap between sets is tolerated in
    general; only the pairs listed below are required to be disjoint.
    """
    # ``rm`` lives in the destructive set and also shows up in cleanup
    # contexts elsewhere, but recon must never pull in a destructive token.
    guarded_pairs = (
        (RECON_TOKENS, DESTRUCTIVE_TOKENS),
        (RECON_TOKENS, PERSISTENCE_TOKENS),
        (LATERAL_TOKENS, EXFIL_TOKENS),
    )
    for left, right in guarded_pairs:
        shared = left & right
        assert not shared
def test_classify_intent_returns_registry_labels() -> None:
    """One representative command per intent maps to its registry label."""
    expected_by_command = {
        "rm": "destructive",
        "crontab": "persistence",
        "curl": "exfil",
        "ssh": "lateral",
        "ls": "recon",
    }
    for command, label in expected_by_command.items():
        assert classify_intent(hash_token(command)) == label
def test_classify_intent_unknown_returns_none() -> None:
    """Commands outside every intent set classify as ``None``."""
    for command in ("vim", "nonsense_xyz"):
        verdict = classify_intent(hash_token(command))
        assert verdict is None
def test_lexicon_max_len_bounded() -> None:
    """The lexeme buffer limit fits every obscenity lexeme yet stays small."""
    longest_obscenity = max(map(len, OBSCENITY_LEXEMES))
    assert LEXEME_MAX_LEN >= longest_obscenity
    # Sanity ceiling: the lexicon holds single short word forms only.
    assert LEXEME_MAX_LEN < 32
def test_obscenity_counter_fires_on_typed_token() -> None:
    """Typing ``fuck `` (note the trailing boundary) bumps only
    ``obscenity_hits``; the positive and negative counters stay at zero."""
    ctx = build_context(_typed("fuck "), sid="g0-obs")
    counters = (ctx.obscenity_hits, ctx.positive_lex_hits, ctx.negative_lex_hits)
    assert counters == (1, 0, 0)
def test_lexeme_longest_match_fucking_counts_once() -> None:
    """``fucking`` belongs to the obscenity set and must register a single
    hit, not one for ``fuck`` plus another for ``fucking``."""
    keystrokes = _typed("fucking ")
    hits = build_context(keystrokes, sid="g0-long").obscenity_hits
    assert hits == 1
def test_positive_and_negative_counters() -> None:
    """A typed line with one positive and one negative lexeme bumps each
    counter exactly once."""
    ctx = build_context(_typed("nice work damn it "), sid="g0-mix")
    assert ctx.positive_lex_hits == 1  # "nice"
    assert ctx.negative_lex_hits == 1  # "damn"
def test_caps_run_max_tracks_longest_uppercase_streak() -> None:
    """``caps_run_max`` reaches at least the length of the longest typed
    uppercase word, and an uppercase obscenity still counts as a hit."""
    keystrokes = _typed("ok FUCK and OK ")
    ctx = build_context(keystrokes, sid="g0-caps")
    # "FUCK" is the longest uppercase streak in the input (4 characters).
    assert ctx.caps_run_max >= 4
    # Obscenity matching is case-folded, so the uppercase form still counts.
    assert ctx.obscenity_hits >= 1
def test_bang_run_max_tracks_longest_bang_streak() -> None:
    """``bang_run_max`` reports the longest ``!`` run: the ``!!!`` streak
    wins over the later ``!!``."""
    keystrokes = _typed("wait!!! no!!\n")
    ctx = build_context(keystrokes, sid="g0-bang")
    assert ctx.bang_run_max == 3
def test_paste_class_events_excluded_from_lex_counters() -> None:
    """Pasted text must NOT feed the lexical counters.

    Paste-class events mark the F.4 / G.0 boundary: the operator's own
    typing sits on one side, pasted text on the other. Here the whole
    phrase arrives as a single multi-character event.
    """
    pasted_chunk: AsciinemaEvent = (0.0, "i", "fuck and shit pasted in")
    ctx = build_context([pasted_chunk], sid="g0-paste")
    assert ctx.obscenity_hits == 0
    assert ctx.negative_lex_hits == 0
def test_no_lex_text_in_observation_values() -> None:
    """PII regression: no lexeme word form may show up in the ``value`` of
    any emitted observation.

    Only the values are inspected. Primitive names such as ``shell_type``
    legitimately contain ``hell``, so this guards the data, not the schema.
    """
    keystrokes = _typed("oh fuck this is broken damn ")
    observations = list(extract_session(keystrokes, sid="g0-pii"))
    banned_words = OBSCENITY_LEXEMES | NEGATIVE_LEXEMES | POSITIVE_LEXEMES
    for observation in observations:
        # Serialise so nested values are searched as well.
        serialized = json.dumps(observation.value)
        for word in banned_words:
            assert word not in serialized, (
                f"raw lexeme {word!r} leaked into observation value "
                f"for primitive {observation.primitive!r}: {observation.value!r}"
            )
def test_intent_precedence_destructive_outranks_recon() -> None:
    """``rm`` classifies as destructive even though the recon set covers
    file-system tools."""
    rm_hash = hash_token("rm")
    assert rm_hash in DESTRUCTIVE_TOKENS
    assert classify_intent(rm_hash) == "destructive"
    # Sanity: destructive is the first entry of the precedence tuple.
    top_entry = INTENT_PRECEDENCE[0]
    assert top_entry[0] == "destructive"
def test_opsec_history_tokens_populated() -> None:
    """The opsec-history set contains the ``hash_token`` values of
    ``history`` and ``unset``."""
    for command in ("history", "unset"):
        assert hash_token(command) in OPSEC_HISTORY_TOKENS