DECNET/decnet/profiler/behave_shell/_intent.py

"""Phase G — shared command-intent + lexical-counter vocabulary.

Used by:
* ``operational.objective``        (G.1)  via ``INTENT_PRECEDENCE`` / ``classify_intent``
* ``operational.opsec_discipline`` (G.2)  via ``OPSEC_HISTORY_TOKENS``
* ``emotional_valence.valence``    (G.5)  via ``POSITIVE_LEXEMES`` / ``NEGATIVE_LEXEMES``
* ``emotional_valence.frustration_venting`` (G.8) via ``OBSCENITY_LEXEMES``

All ``*_TOKENS`` frozensets contain ``hash_token()`` SHA256 hexes — the
only PII-safe handle on a command's first token. Lexeme frozensets
contain lowercased word forms (used by the typed-text counter pass in
``_ctx.py`` to *count* matches without retaining text).

Set membership is intentionally overlapping. ``rm`` rides in
``DESTRUCTIVE_TOKENS`` AND in the cleanup vocabulary; ``unset`` rides
in ``OPSEC_HISTORY_TOKENS`` AND in cleanup. G.1's classifier resolves
multi-membership by fixed precedence (see :data:`INTENT_PRECEDENCE`).
"""
from __future__ import annotations

from decnet.profiler.behave_shell._parse import hash_token

# ── operational.objective intent sets (G.1) ────────────────────────────────
# Hashed first tokens of the commands counted as reconnaissance (mapped to
# the "recon" label in INTENT_PRECEDENCE). Only the hash_token() output is
# stored, never the command word itself.
RECON_TOKENS: frozenset[str] = frozenset(
    map(
        hash_token,
        (
            "ls", "pwd", "whoami", "id", "uname", "ps", "netstat", "ss",
            "cat", "find", "which", "env", "printenv", "hostname", "w",
            "who", "date", "uptime", "df", "du", "free", "lsof", "lsblk",
        ),
    )
)
# Hashed first tokens of the commands counted as data exfiltration (mapped
# to the "exfil" label in INTENT_PRECEDENCE).
EXFIL_TOKENS: frozenset[str] = frozenset(
    map(
        hash_token,
        (
            "curl", "wget", "scp", "rsync", "nc", "ncat", "socat", "tar",
            "base64", "xxd", "python", "python3", "openssl",
        ),
    )
)
# Hashed first tokens of the commands counted as persistence (mapped to the
# "persistence" label in INTENT_PRECEDENCE).
# NOTE(review): ``authorized_keys`` looks like a file name rather than a
# command — confirm it can actually appear as a first token.
PERSISTENCE_TOKENS: frozenset[str] = frozenset(
    map(
        hash_token,
        (
            "crontab", "systemctl", "useradd", "usermod", "passwd", "chsh",
            "at", "service", "chkconfig", "update-rc.d", "authorized_keys",
        ),
    )
)
# Hashed first tokens of the commands counted as lateral movement (mapped
# to the "lateral" label in INTENT_PRECEDENCE).
LATERAL_TOKENS: frozenset[str] = frozenset(
    map(
        hash_token,
        (
            "ssh", "telnet", "rsh", "rlogin", "ftp", "sftp", "mosh",
            "kubectl", "docker", "psql", "mysql", "redis-cli",
        ),
    )
)
# Hashed first tokens of the commands counted as destructive (mapped to the
# "destructive" label in INTENT_PRECEDENCE).
DESTRUCTIVE_TOKENS: frozenset[str] = frozenset(
    map(
        hash_token,
        (
            "rm", "dd", "mkfs", "shred", "wipe", "kill", "pkill", "killall",
            "truncate", "fdisk",
        ),
    )
)

# Lookup order used by the G.1 majority-vote classifier: a first_token_hash
# is tested against these sets from top to bottom and the first set that
# contains it decides the category. Ordering rationale:
#   * ``destructive`` precedes ``persistence`` — a session that destroys
#     outweighs one that also installs cron jobs;
#   * ``exfil`` precedes ``lateral`` — pulling data is the more specific
#     signal.
INTENT_PRECEDENCE: tuple[tuple[str, frozenset[str]], ...] = (
    ("destructive", DESTRUCTIVE_TOKENS),
    ("persistence", PERSISTENCE_TOKENS),
    ("exfil", EXFIL_TOKENS),
    ("lateral", LATERAL_TOKENS),
    ("recon", RECON_TOKENS),
)


def classify_intent(first_token_hash: str) -> str | None:
    """Map a first-token hash to its intent label.

    The sets in :data:`INTENT_PRECEDENCE` are consulted in order, so a hash
    that belongs to more than one set receives the label of the earliest
    entry.

    Returns ``None`` when no intent set contains ``first_token_hash``.
    """
    matching_labels = (
        label
        for label, members in INTENT_PRECEDENCE
        if first_token_hash in members
    )
    # Lazily evaluated: stops at the first matching entry.
    return next(matching_labels, None)


# ── operational.opsec_discipline (G.2) ─────────────────────────────────────
# First-token hashes of the history-clearing / log-tampering vocabulary.
OPSEC_HISTORY_TOKENS: frozenset[str] = frozenset(
    map(hash_token, ("history", "unset", "export", "set", "script"))
)


# ── emotional_valence lexicons (G.5 / G.8) ────────────────────────────────
# Lowercased lexeme word-forms. Membership-tested against typed-text
# tokens during the single-pass histogram walk in ``_ctx.py``. No raw
# text retained — only per-set integer counters.
#
# Stop-word collisions with registry values (``no``, ``none``, ``ok``,
# ``hell``→``shell_type``) are excluded — registry value strings travel
# through observations and would trigger PII regression checks. Kept
# lexemes are those that don't collide with primitive value vocabulary.
# Lowercased word forms counted as positive valence.
POSITIVE_LEXEMES: frozenset[str] = frozenset((
    "thanks",
    "nice",
    "cool",
    "great",
    "okay",
    "perfect",
    "love",
    "awesome",
))
# Lowercased word forms counted as negative valence.
NEGATIVE_LEXEMES: frozenset[str] = frozenset((
    "wtf",
    "damn",
    "crap",
    "ugh",
    "broken",
    "stupid",
    "hate",
    "stuck",
    "wrong",
))
# Lowercased word forms counted as obscenities.
OBSCENITY_LEXEMES: frozenset[str] = frozenset((
    "fuck",
    "fucking",
    "fucked",
    "shit",
    "bitch",
    "ass",
    "cunt",
    "dick",
    "asshole",
))

# Union of the three lexicons above.
ALL_LEXEMES: frozenset[str] = frozenset().union(
    POSITIVE_LEXEMES, NEGATIVE_LEXEMES, OBSCENITY_LEXEMES
)
# Length of the longest lexeme in ALL_LEXEMES (0 if it were empty).
LEXEME_MAX_LEN: int = max(map(len, ALL_LEXEMES), default=0)