feat(attackers): scanned vs. interacted service bucketing on detail page

Adds a new card on AttackerDetail: SCANNED · N services | INTERACTED WITH · M services. Distinguishes port-scanners (N high, M=0) from actual engagement (M>0) at a glance — the analyst's first question when triaging a new attacker row.

Classifier lives in decnet/correlation/event_kinds.py, a single source of truth for the event-type vocabulary:
- INTERACTION_EVENT_TYPES — command-family (command/exec/query/...), SMTP engagement (mail_from/rcpt_to/message_accepted), file/payload activity (file_captured/upload/download_attempt/retr), pub/sub (publish/subscribe), recorded TTY sessions.
- NOISE_EVENT_TYPES — DECNET-internal (startup/shutdown/parse_error/unknown_*).
- Everything else defaults to scan. Conservative by design: new template verbs show up as "scanned" until explicitly promoted.

Bucket logic: a service is "interacted" if ≥1 of its events classifies as interaction; otherwise "scanned" if ≥1 scan event; noise-only services drop. Disjoint by construction.

Deliberate no-schema path: compute on-the-fly in the detail endpoint via SELECT DISTINCT service, event_type FROM logs. Small result set (tens of pairs per attacker), cost is trivial vs. the existing behavior/commands queries. Trade-off: one more DB round-trip per detail view in exchange for zero ALTER TABLE migration pain and immediate classifier-change feedback loop.

Profiler's _COMMAND_EVENT_TYPES stays as-is (strict subset of interactions that carry executable text), with a comment pointing at the new canonical module.

Closes DEVELOPMENT.md "Attacker Intelligence §Service-Level Behavioral Profiling — Services actively interacted with".
2026-04-24 17:12:20 -04:00
parent ce6b4a4174
commit 351a8939c3
8 changed files with 322 additions and 1 deletions
--- a/decnet/correlation/event_kinds.py
+++ b/decnet/correlation/event_kinds.py
@@ -0,0 +1,113 @@
+"""Classify RFC 5424 event_type strings as interaction vs. scan vs. noise.
+
+Used by:
+- The attacker detail endpoint to split services into "scanned" and
+  "interacted with" buckets, distinguishing port scanners from
+  attackers who actually engaged.
+- The profiler worker to filter command-family events when extracting
+  executed-command history.
+
+Classification is conservative: an unknown event_type defaults to
+``scan`` rather than ``interaction``. That way a new service template
+emitting a fresh verb shows up as "scanned" on the dashboard — visible
+but not over-credited. Adding it to ``INTERACTION_EVENT_TYPES`` is
+always a deliberate promotion.
+"""
+from __future__ import annotations
+
+from typing import Literal
+
+# Events that mean the attacker did something past reconnaissance —
+# executed a command, sent mail, uploaded a file, subscribed to a topic.
+# A service with ≥1 of these from a given attacker is "interacted with".
+INTERACTION_EVENT_TYPES: frozenset[str] = frozenset({
+    # Shell / command-family verbs — lifted from the profiler's original
+    # command-extraction frozenset; this module is now the source of
+    # truth for that vocabulary too.
+    "command",
+    "exec",
+    "query",
+    "input",
+    "shell_input",
+    "execute",
+    "run",
+    "sql_query",
+    "redis_command",
+    "ldap_search",
+    # SMTP engagement — once MAIL FROM / RCPT TO lands the attacker is
+    # trying to send mail, not just banner-grab.
+    # message_accepted is the DATA-commit moment.
+    # rcpt_denied presumably records a rejected RCPT TO, i.e. still an
+    # attempt to send — TODO confirm against the SMTP template.
+    "mail_from",
+    "rcpt_to",
+    "rcpt_denied",
+    "message_accepted",
+    # File / payload activity
+    "file_captured",
+    "upload",
+    "download_attempt",
+    "retr",  # FTP retrieve
+    # Pub/sub operational use (vs. mere connection)
+    "publish",
+    "subscribe",
+    # A recorded TTY session is always an interaction — sessrec only
+    # writes when there was PTY input.
+    "session_recorded",
+})
+
+
+# Events that are DECNET-internal or protocol-framework noise rather
+# than attacker-caused signal. Dropped from both buckets.
+NOISE_EVENT_TYPES: frozenset[str] = frozenset({
+    "startup",
+    "shutdown",
+    "config_error",
+    "parse_error",
+    # NOTE(review): if the unknown_* / protocol_error events are emitted
+    # in response to attacker traffic, a service seen only through them
+    # ends up in neither bucket (bucket_services drops noise-only
+    # services) — confirm that is the intended trade-off.
+    "unknown_packet",
+    "unknown_opcode",
+    "unknown_command",
+    "protocol_error",
+})
+
+
+# Closed set of labels returned by classify_event(). bucket_services()
+# ranks them interaction > scan > noise when merging a service's events.
+EventKind = Literal["interaction", "scan", "noise"]
+
+
+def classify_event(event_type: str) -> EventKind:
+    """Map one event_type string to ``"interaction"``, ``"noise"`` or ``"scan"``.
+
+    Membership in ``INTERACTION_EVENT_TYPES`` is checked first, then
+    ``NOISE_EVENT_TYPES``; a string found in neither set is a ``"scan"``.
+    Matching is plain set membership, so the string must match exactly.
+    """
+    if event_type in INTERACTION_EVENT_TYPES:
+        return "interaction"
+    return "noise" if event_type in NOISE_EVENT_TYPES else "scan"
+
+
+def bucket_services(
+    pairs: list[tuple[str, str]],
+) -> dict[str, list[str]]:
+    """Group distinct service names into scanned vs. interacted buckets.
+
+    A service lands in ``interacted`` if any of its events classifies as
+    interaction; otherwise in ``scanned`` if any event classifies as
+    scan. Services seen only through noise events appear in neither
+    bucket, so the two lists are disjoint.
+
+    Args:
+        pairs: ``(service, event_type)`` tuples — the shape the repo
+            returns from a ``SELECT DISTINCT service, event_type``
+            query. Consumed in a single pass, so any iterable works.
+            Pairs whose service is ``None`` (e.g. a NULL column value)
+            are skipped: they cannot be attributed to a service, and a
+            ``None`` mixed in with string names would make ``sorted``
+            raise ``TypeError``.
+
+    Returns:
+        ``{"interacted": [...], "scanned": [...]}``, each list sorted
+        ascending.
+    """
+    interacted: set[str] = set()
+    scanned: set[str] = set()
+    for service, event_type in pairs:
+        if service is None:
+            continue
+        kind = classify_event(event_type)
+        if kind == "interaction":
+            interacted.add(service)
+        elif kind == "scan":
+            scanned.add(service)
+        # "noise" contributes to neither bucket.
+    # Interaction outranks scan: a service with both kinds of event is
+    # reported only as interacted.
+    return {
+        "interacted": sorted(interacted),
+        "scanned": sorted(scanned - interacted),
+    }
--- a/decnet/profiler/worker.py
+++ b/decnet/profiler/worker.py
@@ -40,7 +40,13 @@ logger = get_logger("attacker_worker")
 _BATCH_SIZE = 500
 _STATE_KEY = "attacker_worker_cursor"

-# Event types that indicate active command/query execution (not just connection/scan)
+# Event types that indicate active command/query execution — the
+# shell-family subset of INTERACTION_EVENT_TYPES in
+# decnet/correlation/event_kinds.py. Kept here because this set is a
+# stricter filter (commands that carry text to extract, vs. interactions
+# like RCPT TO or file upload that don't). A test in
+# tests/profiler/ asserts it's a subset of the canonical interaction
+# set so they can't drift.
 _COMMAND_EVENT_TYPES = frozenset({
    "command", "exec", "query", "input", "shell_input",
    "execute", "run", "sql_query", "redis_command",
--- a/decnet/web/db/repository.py
+++ b/decnet/web/db/repository.py
@@ -247,6 +247,16 @@ class BaseRepository(ABC):
        """Return `session_recorded` log rows for this attacker, newest first."""
        pass

+    async def get_attacker_service_activity(
+        self, attacker_uuid: str
+    ) -> list[tuple[str, str]]:
+        """Return the distinct ``(service, event_type)`` pairs observed
+        for one attacker, for bucketing into scanned vs. interacted
+        services.
+
+        Deliberately not abstract: the default raises so that
+        non-SQLModel backends must opt in, while SQLModelRepository
+        overrides it with a cheap DISTINCT query.
+
+        Raises:
+            NotImplementedError: always, in this base implementation.
+                The message names the concrete backend class so the
+                missing override is easy to locate.
+        """
+        raise NotImplementedError(
+            f"{type(self).__name__} does not implement "
+            "get_attacker_service_activity()"
+        )
+
    @abstractmethod
    async def get_session_log(self, sid: str) -> Optional[dict[str, Any]]:
        """Look up the `session_recorded` Log row for a given session UUID."""
--- a/decnet/web/db/sqlmodel_repo.py
+++ b/decnet/web/db/sqlmodel_repo.py
@@ -881,6 +881,32 @@ class SQLModelRepository(BaseRepository):
            page = commands[offset: offset + limit]
            return {"total": total, "data": page}

+    async def get_attacker_service_activity(
+        self, attacker_uuid: str
+    ) -> list[tuple[str, str]]:
+        """Return distinct ``(service, event_type)`` pairs for an attacker.
+
+        Two steps inside one session: look up the attacker's IP by UUID,
+        then run ``SELECT DISTINCT service, event_type FROM logs WHERE
+        attacker_ip = :ip``. An unknown UUID (or an empty IP) yields an
+        empty list. The result set is expected to stay small — bounded
+        by services × event_types (tens, not thousands) — even for
+        attackers with long event streams. Callers pass the pairs to
+        `event_kinds.bucket_services` to split them into scanned vs.
+        interacted.
+        """
+        ip_lookup = select(Attacker.ip).where(Attacker.uuid == attacker_uuid)
+        async with self._session() as session:
+            attacker_ip = (await session.execute(ip_lookup)).scalar_one_or_none()
+            if not attacker_ip:
+                return []
+            pairs_query = (
+                select(Log.service, Log.event_type)
+                .where(Log.attacker_ip == attacker_ip)
+                .distinct()
+            )
+            result = await session.execute(pairs_query)
+            return [
+                (service, event_type) for service, event_type in result.all()
+            ]
+
    async def get_attacker_artifacts(self, uuid: str) -> list[dict[str, Any]]:
        """Return `file_captured` logs for the attacker identified by UUID.

--- a/decnet/web/router/attackers/api_get_attacker_detail.py
+++ b/decnet/web/router/attackers/api_get_attacker_detail.py
@@ -2,6 +2,7 @@ from typing import Any

 from fastapi import APIRouter, Depends, HTTPException

+from decnet.correlation.event_kinds import bucket_services
 from decnet.telemetry import traced as _traced
 from decnet.web.dependencies import require_viewer, repo

@@ -27,4 +28,10 @@ async def get_attacker_detail(
    if not attacker:
        raise HTTPException(status_code=404, detail="Attacker not found")
    attacker["behavior"] = await repo.get_attacker_behavior(uuid)
+    # Scanned vs. interacted-with — computed per-request from the log
+    # stream, not persisted. Cheap (DISTINCT bounded by service ×
+    # event_type cardinality), and changes to the classifier take effect
+    # immediately without a profiler re-tick.
+    pairs = await repo.get_attacker_service_activity(uuid)
+    attacker["service_activity"] = bucket_services(pairs)
    return attacker