fix(collector): unwrap double-wrapped RFC5424 around bash PROMPT_COMMAND

Honeypot SSH containers run `PROMPT_COMMAND` that calls
`logger --rfc5424 --msgid command -t bash "CMD …"`. The Docker-stdout
reader prepends an outer RFC5424 envelope (HOSTNAME=<decky>,
APP-NAME=1, MSGID=NIL) around that inner syslog line. Both the
collector parser (`parse_rfc5424`) and the correlation parser
(`parse_line`) saw the outer NIL MSGID and emitted `event_type="-"`
for every shell command — which:
  - kept `Attacker.commands` rows missing `command_text`
  - left R0001–R0030 (the pattern rule pack that matches shell
    commands) with no haystack
  - made `decnet.collector.log` show `event written … type=-`
    for the very lines that should be `type=command`

Both parsers now detect the inner-RFC5424 shape (`<TS> <HOST> <APP>
<PROCID> <MSGID> <rest>`) when the outer MSGID is NIL and the SD-arm
is also NIL, and re-extract HOSTNAME / APP-NAME / MSGID / remainder
from the body. The collector parser also recovers the post-SD msg
tail when the SD block isn't `relay@55555` (the bash CMD line carries
a `[timeQuality …]` block) so the kv-fallback can find `src_ip`.

Mirroring tests in tests/collector and tests/correlation pin both
the unwrap and the regression guard for non-double-wrapped lines.
This commit is contained in:
2026-05-02 02:32:21 -04:00
parent e08bfc4a73
commit d9d2a80573
4 changed files with 234 additions and 4 deletions

View File

@@ -145,6 +145,27 @@ _RFC5424_RE = re.compile(
r"(\S+) " # 4: MSGID (event_type)
r"(.+)$", # 5: SD element + optional MSG
)
# Honeypot SSH containers export a ``PROMPT_COMMAND`` that calls
# ``logger --rfc5424 --msgid command -p user.info -t bash "CMD …"``.
# That inner RFC 5424 line lands on the container's stdout, where the
# Docker stream reader prepends ANOTHER RFC 5424 envelope (PRI=14,
# HOSTNAME=<decky>, APP-NAME=1, MSGID=NIL). The outer parse therefore
# sees ``event_type == "-"`` while the real MSGID (``command``) is
# inside the body. We detect that case and re-extract the inner
# ``HOSTNAME APP-NAME PROCID MSGID rest`` so downstream consumers see
# ``event_type == "command"`` plus the real source hostname.
#
# Anchored on an ISO-8601 timestamp at the head of the body so we
# don't false-match free-form prose like "Connection from 1.2.3.4".
_INNER_RFC5424_RE = re.compile(
r"^(\d{4}-\d{2}-\d{2}T\S+)\s+" # 1: inner TIMESTAMP
r"(\S+)\s+" # 2: inner HOSTNAME
r"(\S+)\s+" # 3: inner APP-NAME
r"\S+\s+" # PROCID (NIL or PID)
r"(\S+)\s+" # 4: inner MSGID
r"(.+)$", # 5: inner SD/MSG remainder
)
_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL)
_PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip")
@@ -184,8 +205,23 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
ts_raw, decky, service, event_type, sd_rest = m.groups()
fields: dict[str, str] = {}
msg: str = ""
# Honeypot SSH PROMPT_COMMAND lines are double-wrapped (Docker
# stdout envelope around the inner ``logger --msgid command`` line).
# Outer MSGID is NIL; the real MSGID is inside the body. Detect
# the inner shape and re-extract HOSTNAME / APP-NAME / MSGID /
# remainder so downstream extraction sees the real header.
if event_type == "-" and sd_rest.startswith("-"):
body = sd_rest[1:].lstrip()
inner = _INNER_RFC5424_RE.match(body)
if inner is not None:
_i_ts, i_host, i_app, i_msgid, i_rest = inner.groups()
decky = i_host
service = i_app
event_type = i_msgid
sd_rest = i_rest
msg: str = ""
if sd_rest.startswith("-"):
msg = sd_rest[1:].lstrip()
elif sd_rest.startswith("["):
@@ -193,6 +229,11 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
if block:
for k, v in _PARAM_RE.findall(block.group(1)):
fields[k] = v.replace('\\"', '"').replace("\\\\", "\\").replace("\\]", "]")
# Always recover the post-SD message tail, even when the SD
# block isn't ``relay@55555`` (e.g. the ``timeQuality`` block
# syslog auto-emits on bash CMD lines). Without this the body
# of unwrapped PROMPT_COMMAND lines stays empty and the
# attacker_ip kv-fallback below has nothing to scan.
msg_match = re.search(r'\]\s+(.+)$', sd_rest)
if msg_match:
msg = msg_match.group(1).strip()

View File

@@ -32,6 +32,21 @@ _RFC5424_RE = re.compile(
r"(.+)$", # 5: SD element + optional MSG
)
# Honeypot SSH PROMPT_COMMAND lines arrive double-wrapped: the
# Docker-stdout collector envelope wraps the inner ``logger
# --rfc5424 --msgid command -t bash …`` line. Outer MSGID is NIL,
# real MSGID lives in the body. Mirrors the unwrap logic in
# ``decnet.collector.worker._INNER_RFC5424_RE`` — the two parsers
# read the same on-wire format.
_INNER_RFC5424_RE = re.compile(
r"^(\d{4}-\d{2}-\d{2}T\S+)\s+" # 1: inner TIMESTAMP
r"(\S+)\s+" # 2: inner HOSTNAME
r"(\S+)\s+" # 3: inner APP-NAME
r"\S+\s+" # PROCID (NIL or PID)
r"(\S+)\s+" # 4: inner MSGID
r"(.+)$", # 5: inner SD/MSG remainder
)
# Structured data block: [relay@55555 k="v" ...]
_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL)
@@ -121,6 +136,21 @@ def parse_line(line: str) -> LogEvent | None:
ts_raw, decky, service, event_type, sd_rest = m.groups()
# Unwrap double-wrapped Docker-stdout envelopes around bash
# PROMPT_COMMAND lines. See ``_INNER_RFC5424_RE`` and the matching
# logic in ``decnet.collector.worker.parse_rfc5424``. Must run
# before the decky/service NIL-guard below — the OUTER decky is
# the docker host, the inner header carries the real source.
if event_type == "-" and sd_rest.startswith("-"):
body = sd_rest[1:].lstrip()
inner = _INNER_RFC5424_RE.match(body)
if inner is not None:
_i_ts, i_host, i_app, i_msgid, i_rest = inner.groups()
decky = i_host
service = i_app
event_type = i_msgid
sd_rest = i_rest
if decky == "-" or service == "-":
return None

View File

@@ -0,0 +1,87 @@
"""Collector parser unwraps double-wrapped RFC5424 lines.
Honeypot SSH containers export a ``PROMPT_COMMAND`` that calls
``logger --rfc5424 --msgid command -p user.info -t bash "CMD …"``.
The Docker-stdout reader prepends an outer RFC 5424 envelope around
that inner syslog line. Outer MSGID is NIL, so without an unwrap step
every shell command lands as ``event_type="-"`` in the collector
output and the TTP rule pack never sees it.
"""
from __future__ import annotations
from decnet.collector.worker import parse_rfc5424
_DOUBLE_WRAPPED_CMD = (
"<14>1 2026-05-02T06:22:48.089309+00:00 omega-decky 1 - - - "
" 2026-05-02T06:22:48.089286+00:00 SRV-DELTA-77 bash - command "
"[timeQuality tzKnown=\"1\" isSynced=\"1\" syncAccuracy=\"326228\"] "
"CMD uid=0 user=root src=192.168.1.5 pwd=/root cmd=ls /var/www/html"
)
def test_double_wrapped_bash_cmd_extracts_inner_msgid() -> None:
p = parse_rfc5424(_DOUBLE_WRAPPED_CMD)
assert p is not None
assert p["event_type"] == "command"
# decky / service come from the INNER header — not the outer
# ``omega-decky`` / ``1`` Docker envelope fields.
assert p["decky"] == "SRV-DELTA-77"
assert p["service"] == "bash"
def test_double_wrapped_bash_cmd_recovers_attacker_ip_from_msg() -> None:
p = parse_rfc5424(_DOUBLE_WRAPPED_CMD)
assert p is not None
assert p["attacker_ip"] == "192.168.1.5"
def test_double_wrapped_bash_cmd_preserves_full_msg_body() -> None:
p = parse_rfc5424(_DOUBLE_WRAPPED_CMD)
assert p is not None
# The cmd= value lives at the tail of msg; downstream consumers
# (session aggregator, profiler) extract it from there.
assert "cmd=ls /var/www/html" in p["msg"]
def test_single_wrapped_relay_line_still_parses_unchanged() -> None:
"""Regression guard: non-double-wrapped lines must keep their shape."""
line = (
"<134>1 2026-05-02T06:00:25.453826+00:00 omega-decky smtp - "
"disconnect [relay@55555 src_ip=\"192.168.1.5\"]"
)
p = parse_rfc5424(line)
assert p is not None
assert p["event_type"] == "disconnect"
assert p["decky"] == "omega-decky"
assert p["service"] == "smtp"
assert p["attacker_ip"] == "192.168.1.5"
def test_outer_msgid_set_does_not_recurse() -> None:
"""When outer MSGID is real, leave it alone — no inner-header lookup."""
# Synthetic: outer MSGID=auth_attempt, body contains an
# accidental inner-shaped substring. We must NOT replace
# auth_attempt with anything from inside the body.
line = (
"<134>1 2026-05-02T06:22:48.089309+00:00 omega-decky auth-helper - "
"auth_attempt [relay@55555 username=\"root\" src_ip=\"192.168.1.5\"]"
)
p = parse_rfc5424(line)
assert p is not None
assert p["event_type"] == "auth_attempt"
assert p["decky"] == "omega-decky"
assert p["service"] == "auth-helper"
def test_outer_nil_msgid_with_non_inner_body_unchanged() -> None:
"""NIL-MSGID lines whose body isn't a wrapped RFC5424 line stay NIL."""
# The body is plain prose, not a `<TS> <HOST> <APP> <PROCID> <MSGID>` head.
line = (
"<14>1 2026-05-02T06:22:48.000000+00:00 host app - - - "
"Failed password for root from 192.168.1.5 port 42772 ssh2"
)
p = parse_rfc5424(line)
assert p is not None
assert p["event_type"] == "-"
assert p["attacker_ip"] == "192.168.1.5"

View File

@@ -0,0 +1,72 @@
"""Correlation parser unwraps double-wrapped RFC5424 lines.
Mirrors ``tests/collector/test_parse_rfc5424_double_wrap.py``. Both
parsers read the same on-wire format; the profiler's ``parse_line``
must agree with the collector's ``parse_rfc5424`` so that
``Attacker.commands`` rows carry the real ``command`` event_type
(not the outer Docker envelope's NIL MSGID).
"""
from __future__ import annotations
from decnet.correlation.parser import parse_line
_DOUBLE_WRAPPED_CMD = (
"<14>1 2026-05-02T06:22:48.089309+00:00 omega-decky 1 - - - "
" 2026-05-02T06:22:48.089286+00:00 SRV-DELTA-77 bash - command "
"[timeQuality tzKnown=\"1\" isSynced=\"1\" syncAccuracy=\"326228\"] "
"CMD uid=0 user=root src=192.168.1.5 pwd=/root cmd=ls /var/www/html"
)
def test_double_wrapped_bash_cmd_event_type_is_command() -> None:
e = parse_line(_DOUBLE_WRAPPED_CMD)
assert e is not None
assert e.event_type == "command"
def test_double_wrapped_bash_cmd_uses_inner_decky_and_service() -> None:
e = parse_line(_DOUBLE_WRAPPED_CMD)
assert e is not None
assert e.decky == "SRV-DELTA-77"
assert e.service == "bash"
def test_double_wrapped_bash_cmd_extracts_attacker_ip() -> None:
e = parse_line(_DOUBLE_WRAPPED_CMD)
assert e is not None
assert e.attacker_ip == "192.168.1.5"
def test_double_wrapped_bash_cmd_extracts_command_field() -> None:
"""The behavioral profiler reads ``fields['command']`` for shell
rules and the per-attacker ``commands`` rollup. Without it the
R0001R0030 pattern rules have no haystack."""
e = parse_line(_DOUBLE_WRAPPED_CMD)
assert e is not None
assert e.fields.get("command") == "ls /var/www/html"
def test_single_wrapped_line_unchanged() -> None:
line = (
"<134>1 2026-05-02T06:00:25.453826+00:00 omega-decky smtp - "
"disconnect [relay@55555 src_ip=\"192.168.1.5\"]"
)
e = parse_line(line)
assert e is not None
assert e.event_type == "disconnect"
assert e.decky == "omega-decky"
assert e.service == "smtp"
assert e.attacker_ip == "192.168.1.5"
def test_outer_msgid_set_does_not_recurse() -> None:
line = (
"<134>1 2026-05-02T06:22:48.089309+00:00 omega-decky auth-helper - "
"auth_attempt [relay@55555 username=\"root\" src_ip=\"192.168.1.5\"]"
)
e = parse_line(line)
assert e is not None
assert e.event_type == "auth_attempt"
assert e.decky == "omega-decky"
assert e.service == "auth-helper"