feat(ttp): split bash CMD evidence into structured uid/user/src/pwd/cmd rows

The inspector was dumping the whole `CMD uid=0 user=root src=… pwd=… cmd=nmap -p- 192.168.1.0/24` syslog body into a single ``command_text`` blob. ANTI: "I'd like to separate the fields." Done — three layers work together:

1. Collector session aggregator: new `_parse_cmd_msg` splits the bash PROMPT_COMMAND msg into `{uid, user, src, pwd, command}`. The session-ended envelope's per-command dict now carries the structured fields, with `command_text` set to just the cmd= value (preserving embedded whitespace — `nmap -p- 1.2.3.0/24` etc.).
2. Rule engine: per-source_kind auxiliary evidence list (`_AUX_EVIDENCE_FIELDS`). For `command` events the engine automatically promotes uid/user/src/pwd into the persisted `evidence` dict on top of the rule's explicit `evidence_fields`. Engine-controlled, not per-rule — adding a new aux field is one line here, not a 30-rule YAML sweep, and rule authors can't accidentally drop it.
3. TTPInspector frontend: evidence renders as a structured `kvs` grid (UID / USER / SRC / PWD / CMD rows) instead of pretty-printed JSON. Primary-order list keeps shell fields at the top; everything else falls below alphabetically so unfamiliar evidence shapes still surface predictably.

Tests:
- session_aggregator pins the structured-fields emit (uid/user/src/pwd/command_text without "CMD" prefix, embedded whitespace preserved).
- rule_engine_tagger pins the aux-field auto-promotion + the no-`None`-leakage path when payload doesn't carry an aux key.
2026-05-02 03:20:53 -04:00
parent 84699f89da
commit d1c4a48963
6 changed files with 268 additions and 4 deletions
--- a/decnet/collector/worker.py
+++ b/decnet/collector/worker.py
@@ -151,6 +151,33 @@ _SESSION_AGG_TTL_SEC: float = _parse_float_env(
 )


+# Body of a bash PROMPT_COMMAND CMD line:
+#   ``CMD uid=0 user=root src=192.168.1.5 pwd=/root cmd=ls /var/www/html``
+# This regex matches the ``key=value`` pairs that precede ``cmd=`` (the
+# structured fields the inspector renders). The ``cmd=`` value may itself
+# contain spaces, so the parser keeps it whole and does NOT word-split it.
+_CMD_BODY_HEAD_KV_RE = re.compile(r'(\w+)=(\S+)')
+
+
+def _parse_cmd_msg(msg: str) -> dict[str, str]:
+    """Split a bash CMD msg body into ``{uid, user, src, pwd, command}``.
+
+    Returns the empty dict on a non-CMD msg. ``command`` carries the
+    full post-``cmd=`` rest, including any embedded whitespace —
+    tools like ``nmap -p- 192.168.1.0/24`` would otherwise lose
+    everything after the first space.
+    """
+    if not msg.startswith("CMD "):
+        return {}
+    head, sep, cmd_rest = msg[4:].partition("cmd=")
+    out: dict[str, str] = {}
+    for k, v in _CMD_BODY_HEAD_KV_RE.findall(head):
+        out[k] = v
+    if sep:
+        out["command"] = cmd_rest
+    return out
+
+
 def _parse_iso_ts(value: str) -> Optional[datetime]:
    """Best-effort ISO-8601 parse for parsed event timestamps.

@@ -252,18 +279,30 @@ class _SessionAggregator:
            if cmd_ts.timestamp() < cutoff_lo:
                continue
            cmd_fields = cmd_parsed.get("fields", {}) or {}
+            # Pull structured uid/user/src/pwd/command from the bash
+            # msg body. The inspector renders these as separate
+            # key/value rows, which is much friendlier than dumping
+            # the raw ``CMD uid=0 user=... cmd=...`` string into a
+            # single ``command_text`` blob.
+            parsed_kv = _parse_cmd_msg(str(cmd_parsed.get("msg", "")))
            cmd_text = (
                cmd_fields.get("command")
                or cmd_fields.get("cmd")
+                or parsed_kv.get("command")
                or cmd_parsed.get("msg", "")
            )
-            commands.append({
+            entry: dict[str, Any] = {
                "id": f"{sid}#{idx}" if sid else f"{attacker_ip}-{cmd_ts.isoformat()}",
                "command_text": str(cmd_text),
                "ts": cmd_ts.isoformat(),
                "decky": cmd_parsed.get("decky", ""),
                "service": cmd_parsed.get("service", ""),
-            })
+            }
+            for key in ("uid", "user", "src", "pwd"):
+                value = parsed_kv.get(key) or cmd_fields.get(key)
+                if value is not None:
+                    entry[key] = value
+            commands.append(entry)

        payload: dict[str, Any] = {
            "session_id": sid or None,
--- a/decnet/ttp/impl/rule_engine.py
+++ b/decnet/ttp/impl/rule_engine.py
@@ -296,6 +296,21 @@ _DEFAULT_MATCH_FIELD: dict[str, str] = {
 }


+# Per-``source_kind`` auxiliary evidence fields that the engine
+# auto-promotes onto every emitted tag, on top of the rule's
+# explicit ``evidence_fields`` list. The point is operator UX: when
+# a shell rule fires on ``cat /etc/shadow``, the inspector should
+# show *who* ran it (``user``), *where from* (``src``), *as whom*
+# (``uid``), and the working directory (``pwd``) — without forcing
+# every rule author to add the same four fields to every shell
+# rule's ``evidence_fields`` list. Engine-controlled, not per-rule:
+# adding a new aux field is a one-line edit here, not a 30-rule
+# YAML sweep.
+_AUX_EVIDENCE_FIELDS: dict[str, tuple[str, ...]] = {
+    "command": ("uid", "user", "src", "pwd"),
+}
+
+
 def _evaluate_rules(
    rules: list[CompiledRule], event: TaggerEvent,
 ) -> list[TTPTag]:
@@ -330,6 +345,12 @@ def _evaluate_rules(
                    for field in rule.evidence_fields
                    if field in event.payload
                }
+                # Engine-controlled auxiliary fields per source_kind —
+                # added on top of the rule's explicit list so the
+                # inspector always sees uid/user/src/pwd on shell tags.
+                for aux in _AUX_EVIDENCE_FIELDS.get(event.source_kind, ()):
+                    if aux in event.payload and aux not in evidence:
+                        evidence[aux] = event.payload.get(aux)
                out.append(TTPTag(
                    uuid=tag_uuid,
                    source_kind=event.source_kind,