"""Long-running TTP-tagging worker. E.3.14 of ``development/TTP_TAGGING.md``. Drains the bus topics declared in :data:`_TOPICS`, dispatches each event through the :class:`~decnet.ttp.factory.CompositeTagger`, persists the produced :class:`~decnet.web.db.models.ttp.TTPTag` rows via :meth:`BaseRepository.insert_tags`, and publishes the documented ``ttp.tagged`` + ``ttp.rule.fired.`` events — but *only* when ``insert_tags`` reported a non-zero rowcount, per the "loop-prevention invariant" in TTP_TAGGING.md §"Bus topics". Bus subscriptions are enumerated as the module-level constant :data:`_TOPICS` so E.2.12 can assert subscription wiring without invoking the loop. The constant is the *single source of truth* — the loop iterates over it; tests introspect it. The inner loop drains a shared ``asyncio.Queue`` populated by one task per topic. Each queued item is a ``(topic, Event)`` pair — the topic decides the lifter family (and therefore the ``source_kind``), the payload carries the per-event identifiers. Bus loss is tolerated: on transport error the per-topic pump task exits and the loop falls back to the poll interval, which still heartbeats and accepts a clean shutdown. """ from __future__ import annotations import asyncio import contextlib from collections.abc import Iterator from contextlib import contextmanager from typing import Any, Optional from decnet import telemetry as _telemetry from decnet.bus import topics as _topics from decnet.bus.base import BaseBus, Event from decnet.bus.factory import get_bus from decnet.bus.publish import ( run_control_listener_signal as _run_control_listener_signal, run_health_heartbeat as _run_health_heartbeat, ) from decnet.logging import get_logger from decnet.ttp.base import Tagger, TaggerEvent from decnet.ttp.factory import CompositeTagger, get_tagger from decnet.web.db.models.ttp import TTPTag from decnet.web.db.repository import BaseRepository log = get_logger("ttp.worker") _DEFAULT_POLL_SECS = 60.0 # Bus topics the worker subscribes to. Kept as a module-level constant # so E.2.12 can assert subscription wiring without invoking the loop — # the test introspects this tuple, the loop iterates it. The set # matches the design doc "Worker shape" section: session-ended primary # trigger, observed for low-latency rules, intel-enriched + identity # events for opportunistic re-tag, credential-reuse + email for the # dedicated lifters, and ``canary.>`` for fleet-wide canary triggers. _TOPICS: tuple[str, ...] = ( _topics.attacker(_topics.ATTACKER_SESSION_ENDED), _topics.attacker(_topics.ATTACKER_OBSERVED), _topics.attacker(_topics.ATTACKER_INTEL_ENRICHED), _topics.identity(_topics.IDENTITY_FORMED), _topics.identity(_topics.IDENTITY_MERGED), _topics.credential(_topics.CREDENTIAL_REUSE_DETECTED), _topics.email_topic(_topics.EMAIL_RECEIVED), # Canary triggers carry a per-token segment, so subscribe with the # multi-token wildcard rather than enumerating per-token. Pattern # validated against ``decnet.bus.topics.canary()``'s shape. f"{_topics.CANARY}.>", ) # Topic-segment → ``source_kind`` for the resulting TaggerEvent. We # match on a short token contained in the topic so wildcard topics # (``canary.{id}.triggered``) and per-event topics work uniformly. _TOPIC_SOURCE_KIND: tuple[tuple[str, str], ...] = ( ("session.ended", "session"), ("observed", "session"), ("intel.enriched", "intel"), ("identity.formed", "identity"), ("identity.merged", "identity"), ("reuse.detected", "credential"), ("email.received", "email"), ("canary.", "canary_fingerprint"), ) def _source_kind_for(topic: str) -> str | None: for fragment, kind in _TOPIC_SOURCE_KIND: if fragment in topic: return kind return None @contextmanager def _span(name: str, **attrs: Any) -> Iterator[Any]: """Tracing helper short-circuiting on ``DECNET_DEVELOPER_TRACING``. Same shape as the engine / store helpers — single attribute lookup when off, late-bound tracer when on so test monkeypatches reach us. """ if not _telemetry._ENABLED: yield None return tracer = _telemetry.get_tracer("ttp.worker") with tracer.start_as_current_span(name) as span: for key, value in attrs.items(): try: span.set_attribute(key, value) except (TypeError, ValueError): continue yield span def _build_event(topic: str, payload: dict[str, Any]) -> TaggerEvent | None: """Translate one bus payload into a :class:`TaggerEvent`. Returns ``None`` if the topic isn't one we know how to dispatch (defensive — :data:`_TOPICS` and :data:`_TOPIC_SOURCE_KIND` are kept in sync, but a wildcard subscription could in theory deliver a topic outside the table). ``source_id`` is the stable per-event identifier the repository uses for idempotency. We prefer the most-specific ID present in the payload so a replay of the same upstream event produces the same :func:`compute_tag_uuid` and the ``INSERT OR IGNORE`` write becomes a no-op the second time around. The order below is the same priority list the lifters use internally. """ source_kind = _source_kind_for(topic) if source_kind is None: return None source_id = ( payload.get("source_id") or payload.get("session_id") or payload.get("token_id") or payload.get("identity_uuid") or payload.get("credential_id") or payload.get("attacker_uuid") or payload.get("uuid") or topic ) return TaggerEvent( source_kind=source_kind, source_id=str(source_id), attacker_uuid=_str_or_none(payload.get("attacker_uuid")), identity_uuid=_str_or_none(payload.get("identity_uuid")), session_id=_str_or_none(payload.get("session_id")), decky_id=_str_or_none(payload.get("decky_id")), payload=dict(payload), ) def _str_or_none(value: Any) -> str | None: if value is None: return None return str(value) async def run_ttp_worker_loop( repo: BaseRepository, *, poll_interval_secs: float = _DEFAULT_POLL_SECS, tagger: Optional[Tagger] = None, shutdown: Optional[asyncio.Event] = None, bus: Optional[BaseBus] = None, ) -> None: """Run the TTP-tagging loop until cancelled. *tagger* defaults to :func:`decnet.ttp.factory.get_tagger`; tests pass a fake. *shutdown* is an optional external stop signal; the loop also exits cleanly on :class:`asyncio.CancelledError` and :class:`KeyboardInterrupt`. *bus* is an optional pre-wired bus; when omitted the worker calls :func:`get_bus` itself, falling back to poll-only when the bus is unavailable (typical dev box without a NATS daemon). """ if tagger is None: tagger = get_tagger() log.info( "ttp worker started tagger=%s poll_interval_secs=%s topics=%d", tagger.name, poll_interval_secs, len(_TOPICS), ) owned_bus = False queue: asyncio.Queue[tuple[str, Event] | None] = asyncio.Queue() pump_tasks: list[asyncio.Task[None]] = [] watch_tasks: list[asyncio.Task[None]] = [] heartbeat_task: Optional[asyncio.Task[None]] = None control_task: Optional[asyncio.Task[None]] = None # Hydrate per-lifter rule indexes. Each WatchableTagger # (CompositeTagger children + the RuleEngineTagger) owns its own # RuleIndex and drains store change events forever via # `watch_store`. Without these tasks every dispatch index stays # empty and no rule fires — the bus subscriptions work, the # pump tasks run, and tagger.tag() returns [] every call. Tasks # are independent of the bus, so this fan-out runs even in # poll-only mode. if isinstance(tagger, CompositeTagger): for watchable in tagger.iter_watchables(): watch_tasks.append(asyncio.create_task( _run_watch(watchable), )) try: if bus is None: try: candidate = get_bus(client_name="ttp") await candidate.connect() bus = candidate owned_bus = True except Exception as exc: # noqa: BLE001 log.warning( "ttp worker: bus unavailable, running in poll-only mode: %s", exc, ) bus = None if bus is not None: for pattern in _TOPICS: pump_tasks.append(asyncio.create_task( _pump(bus, queue, pattern), )) heartbeat_task = asyncio.create_task( _run_health_heartbeat(bus, "ttp"), ) control_task = asyncio.create_task( _run_control_listener_signal(bus, "ttp"), ) except Exception as exc: # noqa: BLE001 log.warning( "ttp worker: bus setup failed, running in poll-only mode: %s", exc, ) if shutdown is None: shutdown = asyncio.Event() try: while not shutdown.is_set(): try: item = await asyncio.wait_for( queue.get(), timeout=float(poll_interval_secs), ) except asyncio.TimeoutError: continue if item is None: continue topic, event = item await _process_event(topic, event, tagger, repo, bus) except (asyncio.CancelledError, KeyboardInterrupt): log.info("ttp worker stopped") finally: for task in pump_tasks: task.cancel() for task in watch_tasks: task.cancel() if heartbeat_task is not None: heartbeat_task.cancel() if control_task is not None: control_task.cancel() for task in pump_tasks: with contextlib.suppress(asyncio.CancelledError, Exception): await task for task in watch_tasks: with contextlib.suppress(asyncio.CancelledError, Exception): await task for opt in (heartbeat_task, control_task): if opt is None: continue with contextlib.suppress(asyncio.CancelledError, Exception): await opt if owned_bus and bus is not None: with contextlib.suppress(Exception): await bus.close() async def _process_event( topic: str, event: Event, tagger: Tagger, repo: BaseRepository, bus: BaseBus | None, ) -> None: """Dispatch one event through the tagger, persist, publish if new. Loop-prevention invariant: ``ttp.tagged`` is published ONLY when :meth:`BaseRepository.insert_tags` returned a non-zero count. A replay of the same upstream event hits the idempotent ``INSERT OR IGNORE`` and writes zero rows → publishes zero events. """ tagger_event = _build_event(topic, event.payload) if tagger_event is None: return with _span( "ttp.worker.tick", topic=topic, source_kind=tagger_event.source_kind, ): try: tags = await tagger.tag(tagger_event) except Exception: # noqa: BLE001 # Composite + TolerantTagger normally swallow per-lifter # blow-ups already; this is the worst-case backstop so a # single bad event can't take down the whole loop. log.exception( "ttp worker: tagger raised on topic=%r", topic, ) return if not tags: return try: inserted = await repo.insert_tags(tags) except Exception: # noqa: BLE001 log.exception( "ttp worker: insert_tags failed on topic=%r", topic, ) return if inserted <= 0: # Idempotent re-eval — the loop-prevention invariant # forbids publishing here. return if bus is not None: await _publish_tagged(bus, tags) async def _publish_tagged(bus: BaseBus, tags: list[TTPTag]) -> None: """Publish ``ttp.tagged`` + per-technique ``ttp.rule.fired.*``. ``ttp.tagged`` carries the deduped technique list so a SIEM subscriber can correlate without a DB read; per-technique fires are 1:1 with the technique IDs touched by this batch (deduped so a single batch produces one ``ttp.rule.fired.T1110`` even if three rules emitted T1110). """ if not tags: return techniques = sorted({t.technique_id for t in tags}) aggregate_payload: dict[str, Any] = { "attacker_uuid": tags[0].attacker_uuid, "identity_uuid": tags[0].identity_uuid, "session_id": tags[0].session_id, "tag_uuids": [t.uuid for t in tags], "techniques_added": techniques, } await bus.publish( _topics.ttp(_topics.TTP_TAGGED), aggregate_payload, event_type=_topics.TTP_TAGGED, ) for technique_id in techniques: per_tech_payload: dict[str, Any] = { "technique_id": technique_id, "tag_uuids": [t.uuid for t in tags if t.technique_id == technique_id], "attacker_uuid": tags[0].attacker_uuid, "identity_uuid": tags[0].identity_uuid, "session_id": tags[0].session_id, } await bus.publish( _topics.ttp_rule_fired(technique_id), per_tech_payload, event_type=_topics.TTP_RULE_FIRED, ) async def _run_watch(watchable: Any) -> None: """Drive one lifter's ``watch_store()`` coroutine forever. Mirrors :func:`_pump`'s tolerance contract: a transient store error logs and exits the watch task without taking the worker down. The main loop's poll-interval fallback continues to heartbeat; a subsequent worker restart re-runs the watch fan-out and rehydrates. """ name = getattr(watchable, "name", watchable.__class__.__name__) try: await watchable.watch_store() except asyncio.CancelledError: raise except Exception as exc: # noqa: BLE001 log.warning( "ttp worker: watch_store for %s died (%s); index will not " "hot-reload until next worker restart", name, exc, ) async def _pump( bus: BaseBus, queue: "asyncio.Queue[tuple[str, Event] | None]", pattern: str, ) -> None: """Forward every event matching *pattern* into *queue*. Survives transient subscriber errors by logging and exiting; the poll-interval fallback in the main loop keeps the worker alive until the next reconnect attempt. """ try: sub = bus.subscribe(pattern) async with sub: async for event in sub: await queue.put((event.topic, event)) except asyncio.CancelledError: raise except Exception as exc: # noqa: BLE001 log.warning( "ttp worker: subscriber for %s died (%s); falling back to poll", pattern, exc, ) __all__ = ["run_ttp_worker_loop", "_TOPICS"]