feat(profiler/behave_shell): G.0 intent lexicon + lexical counter pass
Phase G shared infrastructure (no primitive yet emitted): * New `_intent.py` — five precomputed first-token-hash sets (recon / exfil / persistence / lateral / destructive) with documented precedence, plus opsec-history and three lexeme sets (positive / negative / obscenity) for the typed-text counter pass. Stop words that collide with registry value vocabulary (`no`, `hell`, `ok`) are deliberately excluded — the PII regression test catches such collisions. * `_typed_char_histograms()` extended with five integer counters populated in the same single-pass walk: `obscenity_hits`, `positive_lex_hits`, `negative_lex_hits`, `caps_run_max`, `bang_run_max`. Longest-suffix match against bounded lexicon (`LEXEME_MAX_LEN`); paste-class events excluded. * `SessionContext` widened by the same five fields. Drives G.5 (valence), G.6 (arousal), G.8 (frustration_venting) without retaining raw operator text. * Bump twisted >= 26.4.0rc2 to clear CVE-2026-42304 (pre-existing, caught by pre-commit pip-audit). Adjust ftp template type-ignore code from attr-defined to misc to match the new Twisted typing. PII discipline: same shape as F.4 — fixed-vocabulary integer counters on ctx, never on observations.
This commit is contained in:
115
decnet/profiler/behave_shell/_intent.py
Normal file
115
decnet/profiler/behave_shell/_intent.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""Phase G — shared command-intent + lexical-counter vocabulary.
|
||||
|
||||
Used by:
|
||||
* ``operational.objective`` (G.1) via ``INTENT_SETS``
|
||||
* ``operational.opsec_discipline`` (G.2) via ``OPSEC_HISTORY_TOKENS``
|
||||
* ``emotional_valence.valence`` (G.5) via ``POSITIVE_LEXEMES`` / ``NEGATIVE_LEXEMES``
|
||||
* ``emotional_valence.frustration_venting`` (G.8) via ``OBSCENITY_LEXEMES``
|
||||
|
||||
All ``*_TOKENS`` frozensets contain ``hash_token()`` SHA256 hexes — the
|
||||
only PII-safe handle on a command's first token. Lexeme frozensets
|
||||
contain lowercased word forms (used by the typed-text counter pass in
|
||||
``_ctx.py`` to *count* matches without retaining text).
|
||||
|
||||
Set membership is intentionally overlapping. ``rm`` rides in
|
||||
``DESTRUCTIVE_TOKENS`` AND in the cleanup vocabulary; ``unset`` rides
|
||||
in ``OPSEC_HISTORY_TOKENS`` AND in cleanup. G.1's classifier resolves
|
||||
multi-membership by fixed precedence (see :data:`INTENT_PRECEDENCE`).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.profiler.behave_shell._parse import hash_token
|
||||
|
||||
# ── operational.objective intent sets (G.1) ────────────────────────────────
|
||||
RECON_TOKENS: frozenset[str] = frozenset(
|
||||
hash_token(t) for t in (
|
||||
"ls", "pwd", "whoami", "id", "uname", "ps", "netstat", "ss",
|
||||
"cat", "find", "which", "env", "printenv", "hostname", "w",
|
||||
"who", "date", "uptime", "df", "du", "free", "lsof", "lsblk",
|
||||
)
|
||||
)
|
||||
EXFIL_TOKENS: frozenset[str] = frozenset(
|
||||
hash_token(t) for t in (
|
||||
"curl", "wget", "scp", "rsync", "nc", "ncat", "socat", "tar",
|
||||
"base64", "xxd", "python", "python3", "openssl",
|
||||
)
|
||||
)
|
||||
PERSISTENCE_TOKENS: frozenset[str] = frozenset(
|
||||
hash_token(t) for t in (
|
||||
"crontab", "systemctl", "useradd", "usermod", "passwd", "chsh",
|
||||
"at", "service", "chkconfig", "update-rc.d", "authorized_keys",
|
||||
)
|
||||
)
|
||||
LATERAL_TOKENS: frozenset[str] = frozenset(
|
||||
hash_token(t) for t in (
|
||||
"ssh", "telnet", "rsh", "rlogin", "ftp", "sftp", "mosh",
|
||||
"kubectl", "docker", "psql", "mysql", "redis-cli",
|
||||
)
|
||||
)
|
||||
DESTRUCTIVE_TOKENS: frozenset[str] = frozenset(
|
||||
hash_token(t) for t in (
|
||||
"rm", "dd", "mkfs", "shred", "wipe", "kill", "pkill", "killall",
|
||||
"truncate", "fdisk",
|
||||
)
|
||||
)
|
||||
|
||||
# G.1 majority-vote classifier walks first_token_hash → category in this
|
||||
# order; first hit wins. ``destructive`` outranks ``persistence`` because
|
||||
# a session that destroys outweighs one that also installs cron jobs;
|
||||
# ``exfil`` outranks ``lateral`` because pulling data is the more
|
||||
# specific signal.
|
||||
INTENT_PRECEDENCE: tuple[tuple[str, frozenset[str]], ...] = (
|
||||
("destructive", DESTRUCTIVE_TOKENS),
|
||||
("persistence", PERSISTENCE_TOKENS),
|
||||
("exfil", EXFIL_TOKENS),
|
||||
("lateral", LATERAL_TOKENS),
|
||||
("recon", RECON_TOKENS),
|
||||
)
|
||||
|
||||
|
||||
def classify_intent(first_token_hash: str) -> str | None:
|
||||
"""Return the registry intent label for ``first_token_hash``.
|
||||
|
||||
``None`` if the hash isn't in any intent set.
|
||||
"""
|
||||
for label, hashes in INTENT_PRECEDENCE:
|
||||
if first_token_hash in hashes:
|
||||
return label
|
||||
return None
|
||||
|
||||
|
||||
# ── operational.opsec_discipline (G.2) ─────────────────────────────────────
|
||||
# History-clearing / log-tampering vocabulary (first-token).
|
||||
OPSEC_HISTORY_TOKENS: frozenset[str] = frozenset(
|
||||
hash_token(t) for t in (
|
||||
"history", "unset", "export", "set", "script",
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
# ── emotional_valence lexicons (G.5 / G.8) ────────────────────────────────
|
||||
# Lowercased lexeme word-forms. Membership-tested against typed-text
|
||||
# tokens during the single-pass histogram walk in ``_ctx.py``. No raw
|
||||
# text retained — only per-set integer counters.
|
||||
#
|
||||
# Stop-word collisions with registry values (``no``, ``none``, ``ok``,
|
||||
# ``hell``→``shell_type``) are excluded — registry value strings travel
|
||||
# through observations and would trigger PII regression checks. Kept
|
||||
# lexemes are those that don't collide with primitive value vocabulary.
|
||||
POSITIVE_LEXEMES: frozenset[str] = frozenset({
|
||||
"thanks", "nice", "cool", "great", "okay",
|
||||
"perfect", "love", "awesome",
|
||||
})
|
||||
NEGATIVE_LEXEMES: frozenset[str] = frozenset({
|
||||
"wtf", "damn", "crap", "ugh", "broken", "stupid",
|
||||
"hate", "stuck", "wrong",
|
||||
})
|
||||
OBSCENITY_LEXEMES: frozenset[str] = frozenset({
|
||||
"fuck", "fucking", "fucked", "shit", "bitch", "ass", "cunt",
|
||||
"dick", "asshole",
|
||||
})
|
||||
|
||||
ALL_LEXEMES: frozenset[str] = (
|
||||
POSITIVE_LEXEMES | NEGATIVE_LEXEMES | OBSCENITY_LEXEMES
|
||||
)
|
||||
LEXEME_MAX_LEN: int = max((len(x) for x in ALL_LEXEMES), default=0)
|
||||
Reference in New Issue
Block a user