diff --git a/.env.example b/.env.example new file mode 100644 index 00000000..021cf9a3 --- /dev/null +++ b/.env.example @@ -0,0 +1,12 @@ +# API Options +DECNET_API_HOST=0.0.0.0 +DECNET_API_PORT=8000 +DECNET_JWT_SECRET=supersecretkey12345678901234567 +DECNET_INGEST_LOG_FILE=/var/log/decnet/decnet.log + +# Web Dashboard Options +DECNET_WEB_HOST=0.0.0.0 +DECNET_WEB_PORT=8080 +DECNET_ADMIN_USER=admin +DECNET_ADMIN_PASSWORD=admin +DECNET_DEVELOPER=False diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 97330d4a..1602429c 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -11,79 +11,131 @@ jobs: lint: name: Lint (ruff) runs-on: ubuntu-latest + if: github.ref == 'refs/heads/dev' steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" - run: pip install ruff - - run: ruff check . - - test: - name: Test (pytest) - runs-on: ubuntu-latest - strategy: - matrix: - python-version: ["3.11", "3.12"] - steps: - - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 - with: - python-version: ${{ matrix.python-version }} - - run: pip install -e . - - run: pytest tests/ -v --tb=short + - run: ruff check decnet/ bandit: name: SAST (bandit) runs-on: ubuntu-latest + if: github.ref == 'refs/heads/dev' steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" - run: pip install bandit - - run: bandit -r decnet/ -ll -x decnet/services/registry.py + - run: bandit -r decnet/ -ll -x decnet/services/registry.py,decnet/templates/ pip-audit: name: Dependency audit (pip-audit) runs-on: ubuntu-latest + if: github.ref == 'refs/heads/dev' steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 with: python-version: "3.11" - run: pip install pip-audit - - run: pip install -e . 
- - run: pip-audit --skip-editable + - run: pip install -e .[dev] + - run: pip-audit --skip-editable --ignore-vuln CVE-2025-65896 --ignore-vuln CVE-2026-3219 - open-pr: - name: Open PR to main + merge-to-testing: + name: Merge dev → testing runs-on: ubuntu-latest - needs: [lint, test, bandit, pip-audit] + needs: [lint, bandit, pip-audit] if: github.ref == 'refs/heads/dev' steps: - - name: Open PR via Gitea API + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.DECNET_PR_TOKEN }} + - name: Configure git run: | - echo "--- Checking for existing open PRs ---" - LIST_RESPONSE=$(curl -s \ - -H "Authorization: token ${{ secrets.DECNET_PR_TOKEN }}" \ - "https://git.resacachile.cl/api/v1/repos/anti/DECNET/pulls?state=open&head=anti:dev&base=main&limit=5") - echo "$LIST_RESPONSE" - EXISTING=$(echo "$LIST_RESPONSE" | python3 -c "import sys, json; print(len(json.load(sys.stdin)))") - echo "Open PRs found: $EXISTING" - if [ "$EXISTING" -gt "0" ]; then - echo "PR already open, skipping." - exit 0 - fi - echo "--- Creating PR ---" - CREATE_RESPONSE=$(curl -s -X POST \ - -H "Authorization: token ${{ secrets.DECNET_PR_TOKEN }}" \ - -H "Content-Type: application/json" \ - -d '{ - "title": "Auto PR: dev → main", - "head": "dev", - "base": "main", - "body": "All CI and security checks passed. Review and merge when ready." 
- }' \ - "https://git.resacachile.cl/api/v1/repos/anti/DECNET/pulls") - echo "$CREATE_RESPONSE" + git config user.name "DECNET CI" + git config user.email "ci@decnet.local" + - name: Merge dev into testing + run: | + git fetch origin testing + git checkout testing + git merge origin/dev --no-ff -m "ci: auto-merge dev → testing" + git push origin testing + + test-standard: + name: Test (Standard) + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/testing' + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - run: pip install -e .[dev] + - run: pytest + + test-live: + name: Test (Live) + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/testing' + needs: [test-standard] + services: + mysql: + image: mysql:8.0 + env: + MYSQL_ROOT_PASSWORD: root + MYSQL_DATABASE: decnet_test + ports: + - 3307:3306 + options: >- + --health-cmd="mysqladmin ping -h 127.0.0.1" + --health-interval=10s + --health-timeout=5s + --health-retries=5 + strategy: + matrix: + python-version: ["3.11"] + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + - run: pip install -e .[dev] + - run: pytest -m live + env: + DECNET_MYSQL_HOST: 127.0.0.1 + DECNET_MYSQL_PORT: 3307 + DECNET_MYSQL_USER: root + DECNET_MYSQL_PASSWORD: root + DECNET_MYSQL_DATABASE: decnet_test + + merge-to-main: + name: Merge testing → main + runs-on: ubuntu-latest + needs: [test-standard, test-live] + if: github.ref == 'refs/heads/testing' + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + token: ${{ secrets.DECNET_PR_TOKEN }} + - name: Configure git + run: | + git config user.name "DECNET CI" + git config user.email "ci@decnet.local" + - name: Merge testing into main + run: | + git fetch origin main + git checkout main + git merge origin/testing --no-ff -m "ci: auto-merge testing → main" || { + echo 
"CONFLICT: testing and main have diverged — manual resolution required" + exit 1 + } + git push origin main diff --git a/.gitea/workflows/pr.yml b/.gitea/workflows/pr.yml index b9426943..9c2a6779 100644 --- a/.gitea/workflows/pr.yml +++ b/.gitea/workflows/pr.yml @@ -30,5 +30,28 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} - - run: pip install -e . + - run: pip install -e .[dev] - run: pytest tests/ -v --tb=short + + bandit: + name: SAST (bandit) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: pip install bandit + - run: bandit -r decnet/ -ll -x decnet/services/registry.py + + pip-audit: + name: Dependency audit (pip-audit) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.11" + - run: pip install pip-audit + - run: pip install -e .[dev] + - run: pip-audit --skip-editable diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 49d88969..cbe6ec68 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -22,27 +22,42 @@ jobs: - uses: actions/checkout@v4 with: fetch-depth: 0 + token: ${{ secrets.DECNET_PR_TOKEN }} - - name: Extract version from pyproject.toml + - name: Configure git + run: | + git config user.name "DECNET CI" + git config user.email "ci@decnet.local" + + - name: Bump version and Tag id: version run: | - VERSION=$(python3 -c "import tomllib; f=open('pyproject.toml','rb'); d=tomllib.load(f); print(d['project']['version'])") - echo "version=$VERSION" >> $GITHUB_OUTPUT + # Calculate next version (v0.x) + LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") + NEXT_VER=$(python3 -c " + tag = '$LATEST_TAG'.lstrip('v') + parts = tag.split('.') + major = int(parts[0]) if parts[0] else 0 + minor = int(parts[1]) if len(parts) > 1 else 0 + print(f'{major}.{minor + 1}.0') + ") + + echo 
"Next version: $NEXT_VER (calculated from $LATEST_TAG)" + + # Update pyproject.toml + sed -i "s/^version = \".*\"/version = \"$NEXT_VER\"/" pyproject.toml + + git add pyproject.toml + git commit -m "chore: auto-release v$NEXT_VER [skip ci]" || echo "No changes to commit" + CHANGELOG=$(git log ${LATEST_TAG}..HEAD --oneline --no-decorate --no-merges) + git tag -a "v$NEXT_VER" -m "Auto-release v$NEXT_VER - - name: Create tag if not exists - id: tag - run: | - VERSION=${{ steps.version.outputs.version }} - if git rev-parse "v$VERSION" >/dev/null 2>&1; then - echo "Tag v$VERSION already exists, skipping." - echo "created=false" >> $GITHUB_OUTPUT - else - git config user.name "gitea-actions" - git config user.email "actions@git.resacachile.cl" - git tag -a "v$VERSION" -m "Release v$VERSION" - git push origin "v$VERSION" - echo "created=true" >> $GITHUB_OUTPUT - fi +Changes since $LATEST_TAG: +$CHANGELOG" + git push origin main --follow-tags + + echo "version=$NEXT_VER" >> $GITHUB_OUTPUT + echo "created=true" >> $GITHUB_OUTPUT docker: name: Build, scan & push ${{ matrix.service }} @@ -52,7 +67,7 @@ jobs: fail-fast: false matrix: service: - - cowrie + - conpot - docker_api - elasticsearch - ftp @@ -69,11 +84,12 @@ jobs: - postgres - rdp - redis - - real_ssh - sip - smb - smtp - snmp + - ssh + - telnet - tftp - vnc steps: @@ -99,13 +115,13 @@ jobs: cache-from: type=gha cache-to: type=gha,mode=max + - name: Install Trivy + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin + - name: Scan with Trivy - uses: aquasecurity/trivy-action@master - with: - image-ref: decnet-${{ matrix.service }}:scan - exit-code: "1" - severity: CRITICAL - ignore-unfixed: true + run: | + trivy image --exit-code 1 --severity CRITICAL --ignore-unfixed decnet-${{ matrix.service }}:scan - name: Push image if: success() diff --git a/.gitignore b/.gitignore index f432c66a..bc49506f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 
+1,11 @@ .venv/ +.venv*/ +docker-compose.yaml +.311/ +.3[0-9][0-9]/ +logs/ +.claude/* +CLAUDE.md __pycache__/ *.pyc *.pyo @@ -6,13 +13,64 @@ __pycache__/ dist/ build/ decnet-compose.yml +# Per-topology compose fragments emitted by `decnet topology deploy`. +decnet-topology-*-compose.yml +# Docker build context cache. +.docker/ decnet-state.json *.ini -.env +# tracked: Alembic CLI config (migrations live in decnet/web/db/migrations) +!alembic.ini decnet.log* *.loggy *.nmap linterfails.log -test-scan webmail windows1 +*.db +*.db-shm +*.db-wal +decnet.*.log +# Rotated copies (logrotate appends .1, .2, .gz...) — the existing +# decnet.*.log glob doesn't catch the suffix. +decnet.*.log.* +decnet.json +.env* +.env.local +.coverage +.hypothesis/ +profiles/* +tests/test_decnet.db* + +# Nested git clone of the wiki — not a submodule, just a local +# working copy so we can edit docs without a full round-trip. +wiki-checkout/ + +# Scratch test/debug outputs that leak from saved `pytest > hang.log` +# or `pytest > schem` redirections. +hang.log +schem +*.pytest.log + +# pydeps-style dependency graph dumps from local analysis runs. +deps.txt + +# Node modules vendored under decnet/canary/ for the obfuscator helper. +# The package.json is the source of truth; modules are reinstalled at +# build/deploy time. +node_modules/ +package-lock.json + +# TTP rule-precision corpus pulled from prod sqlite. Real attacker +# payloads — operator-only artifact. The synthetic ``seed_*.jsonl`` +# files alongside ARE committed and exercise the harness in CI. 
+tests/ttp/rule_precision/corpus/*.jsonl +!tests/ttp/rule_precision/corpus/seed_*.jsonl +threatfox-api.json + +# MITRE ATT&CK STIX bundle — 50 MB, fetched at runtime via attack_stix.py +enterprise-attack-*.json + +# pytest failure dump files +testfail +.phaseloop/ diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index 999ec8f2..00000000 --- a/CLAUDE.md +++ /dev/null @@ -1,57 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Commands - -```bash -# Install (dev) -pip install -e . - -# List registered service plugins -decnet services - -# Dry-run (generates compose, no containers) -decnet deploy --mode unihost --deckies 3 --randomize-services --dry-run - -# Full deploy (requires root for MACVLAN) -sudo decnet deploy --mode unihost --deckies 5 --interface eth0 --randomize-services -sudo decnet deploy --mode unihost --deckies 3 --services ssh,smb --log-target 192.168.1.5:5140 - -# Status / teardown -decnet status -sudo decnet teardown --all -sudo decnet teardown --id decky-01 -``` - -## Project Overview - -DECNET is a honeypot/deception network framework. It deploys fake machines (called **deckies**) with realistic services (RDP, SMB, SSH, FTP, etc.) to lure and profile attackers. All attacker interactions are aggregated to an isolated logging network (ELK stack / SIEM). - -## Deployment Models - -**UNIHOST** — one real host spins up _n_ deckies via a container orchestrator. Simpler, single-machine deployment. - -**SWARM (MULTIHOST)** — _n_ real hosts each running deckies. Orchestrated via Ansible/sshpass or similar tooling. - -## Core Technology Choices - -- **Containers**: Docker Compose is the starting point but other orchestration frameworks should be evaluated if they serve the project better. `debian:bookworm-slim` is the default base image; mixing in Ubuntu, CentOS, or other distros is encouraged to make the decoy network look heterogeneous. 
-- **Networking**: Deckies need to appear as real machines on the LAN (own MACs/IPs). MACVLAN and IPVLAN are candidates; the right driver depends on the host environment. WSL has known limitations — bare metal or a VM is preferred for testing. -- **Log pipeline**: Logstash → ELK stack → SIEM (isolated network, not reachable from decoy network) - -## Architecture Constraints - -- The decoy network must be reachable from the outside (attacker-facing). -- The logging/aggregation network must be isolated from the decoy network. -- A publicly accessible real server acts as the bridge between the two networks. -- Deckies should differ in exposed services and OS fingerprints to appear as a heterogeneous network. - -## Development and testing - -- For every new feature, pytests must me made. -- Pytest is the main testing framework in use. -- NEVER pass broken code to the user. - - Broken means: not running, not passing 100% tests, etc. -- After tests pass with 100%, always git commit your changes. -- NEVER add "Co-Authored-By" or any Claude attribution lines to git commit messages. diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 00000000..0d770e15 --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,17 @@ +DECNET - Deception Network +Copyright (C) 2026 Samuel Paschuan + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU Affero General Public License as +published by the Free Software Foundation, either version 3 of the +License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Affero General Public License for more details. + +You should have received a copy of the GNU Affero General Public +License along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ +SPDX-License-Identifier: AGPL-3.0-or-later diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md deleted file mode 100644 index 3d1143ae..00000000 --- a/DEVELOPMENT.md +++ /dev/null @@ -1,75 +0,0 @@ -# TODO - -This is a list of DEVELOPMENT TODOs. Features, development experience, usage, documentation, etcetera. - -## Core / Hardening - -- [ ] **Attacker fingerprinting** — Beyond IP logging: capture TLS JA3/JA4 hashes, TCP window sizes, User-Agent strings, SSH client banners, and tool signatures (nmap, masscan, Metasploit, Cobalt Strike). Build attacker profiles across sessions. -- [ ] **Canary tokens** — Embed canary URLs, fake AWS keys, fake API tokens, and honeydocs (PDF/DOCX with phone-home URLs) into decky filesystems. Fire an alert the moment one is used. -- [ ] **Tarpit mode** — Slow down attackers by making services respond extremely slowly (e.g., SSH that takes 60s to reject, HTTP that drip-feeds bytes). Wastes attacker time and resources. -- [ ] **Dynamic decky mutation** — Deckies that change their exposed services or OS fingerprint over time to confuse port-scan caching and appear more "alive." -- [ ] **Credential harvesting DB** — Every username/password attempt across all services lands in a queryable database. Expose via CLI (`decnet creds`) and flag reuse across deckies. -- [ ] **Session recording** — Full session capture for SSH/Telnet (keystroke logs, commands run, files downloaded). Cowrie already does this — surface it better in the CLI and correlation engine. -- [ ] **Payload capture** — Store every file uploaded or command executed by an attacker. Hash and auto-submit to VirusTotal or a local sandbox. - -## Detection & Intelligence - -- [ ] **Real-time alerting** — Webhook/Slack/Telegram notifications when an attacker hits a decky for the first time, crosses N deckies (lateral movement), or uses a known bad IP. -- [ ] **Threat intel enrichment** — Auto-lookup attacker IPs against AbuseIPDB, Shodan, GreyNoise, and AlienVault OTX. 
Tag known scanners vs. targeted attackers. -- [ ] **Attack campaign clustering** — Group attacker sessions by tooling signatures, timing patterns, and credential sets. Identify coordinated campaigns hitting multiple deckies. -- [ ] **GeoIP mapping** — Attacker origin on a world map. Correlate with ASN data to identify cloud exit nodes, VPNs, and Tor exits. -- [ ] **TTPs tagging** — Map observed attacker behaviors to MITRE ATT&CK techniques automatically. Tag events in the correlation engine. -- [ ] **Honeypot interaction scoring** — Score attackers on a scale: casual scanner vs. persistent targeted attacker, based on depth of interaction and commands run. - -## Dashboard & Visibility - -- [ ] **Web dashboard** — Real-time web UI showing live decky status, attacker activity, traversal graphs, and credential stats. Could be a simple FastAPI + HTMX or a full React app. -- [ ] **Pre-built Kibana/Grafana dashboards** — Ship dashboard JSON exports out of the box so ELK/Grafana deployments are plug-and-play. -- [ ] **CLI live feed** — `decnet watch` command: tail all decky logs in a unified, colored terminal stream (like `docker-compose logs -f` but prettier). -- [ ] **Traversal graph export** — Export attacker traversal graphs as DOT/Graphviz or JSON for visualization in external tools. -- [ ] **Daily digest** — Automated daily summary email/report: new attackers, top credentials tried, most-hit services. - -## Deployment & Infrastructure - -- [ ] **SWARM / multihost mode** — Full Ansible-based orchestration for deploying deckies across N real hosts. -- [ ] **Terraform/Pulumi provider** — Spin up cloud-hosted deckies on AWS/GCP/Azure with one command. Useful for internet-facing honeynets. -- [ ] **Auto-scaling** — When attack traffic increases, automatically spawn more deckies to absorb and log more activity. -- [ ] **Kubernetes deployment mode** — Run deckies as Kubernetes pods for environments already running k8s. 
-- [ ] **Proxmox/libvirt backend** — Full VM-based deckies instead of containers, for even more realistic OS fingerprints and behavior. Docker for speed; VMs for realism. -- [ ] **Raspberry Pi / ARM support** — Low-cost physical honeynets using RPis. Validate ARM image builds. -- [ ] **Decky health monitoring** — Watchdog that auto-restarts crashed deckies and alerts if a service goes dark. - -## Services & Realism - -- [ ] **HTTPS/TLS support** — HTTP honeypot with a self-signed or Let's Encrypt cert. Many real-world services use HTTPS; plain HTTP stands out. -- [ ] **Fake Active Directory** — A convincing fake AD/LDAP with fake users, groups, and GPOs. Attacker tools like BloodHound should get juicy (fake) data. -- [ ] **Fake file shares** — SMB/NFS shares pre-populated with enticing but fake files: "passwords.xlsx", "vpn_config.ovpn", "backup_keys.tar.gz". All instrumented to detect access. -- [ ] **Realistic web apps** — HTTP honeypot serving convincing fake apps: a fake WordPress, a fake phpMyAdmin, a fake Grafana login — all logging every interaction. -- [ ] **OT/ICS profiles** — Expand Conpot support: Modbus, DNP3, BACnet, EtherNet/IP. Convincing industrial control system decoys. -- [ ] **Printer/IoT archetypes** — Expand existing printer/camera archetypes with actual service emulation (IPP, ONVIF, WS-Discovery). -- [ ] **Service interaction depth** — Some services currently just log the connection. Deepen interaction: fake MySQL that accepts queries and returns realistic fake data, fake Redis that stores and retrieves dummy keys. - -## Developer Experience - -- [ ] **Plugin SDK docs** — Full documentation and an example plugin for adding custom services. Lower the barrier for community contributions. -- [ ] **Integration tests** — Full deploy/teardown cycle tests against a real Docker daemon (not just unit tests). -- [ ] **Per-service tests** — Each of the 29 service implementations deserves its own test coverage. 
-- [x] **CI/CD pipeline** — GitHub/Gitea Actions: run tests on push, lint, build Docker images, publish releases. - - ci.yaml contains several steps for the CI/CD pipeline. Mainly: - - Trivy checks for Docker containers. - - Ruff linting. - - Pytests. - - Bandit SAST. - - pip-audit. -- [ ] **Config validation CLI** — `decnet validate my.ini` to dry-check an INI config before deploying. -- [ ] **Config generator wizard** — `decnet wizard` interactive prompt to generate an INI config without writing one by hand. -- [ ] **Gitea Wiki** — Set up the repository wiki with structured docs across the following pages: - - **Home** — Project overview, goals, and navigation index. - - **Architecture** — UNIHOST vs SWARM models, the two-network design (decoy-facing vs isolated logging), MACVLAN/IPVLAN, log pipeline (Cowrie → Logstash → ELK → SIEM), WSL limitations. - - **General Usage** — What DECNET can do and how: deploying deckies, choosing services, using `--randomize-services`, reading status, tearing down. Archetypes explained (what they are, how they group services into realistic machine personas — e.g. a Windows workstation archetype exposes RDP+SMB+LDAP, a Linux server exposes SSH+FTP+MySQL). List of built-in archetypes. How to pick an archetype vs. manually specifying services. - - **Custom Services** — How the plugin registry works, anatomy of a service plugin, step-by-step guide to writing and registering a custom service, how to package it for reuse. - - **Configuration Reference** — Full INI config option breakdown, all CLI flags (`--mode`, `--deckies`, `--interface`, `--log-target`, `--randomize-services`, etc.), environment variables. - - **Deployment Guides** — UNIHOST quickstart (bare metal/VM), SWARM/multihost with Ansible (once implemented), cloud deployment via Terraform (once implemented), Raspberry Pi / ARM builds. - - **Service Reference** — Full table of all 29 services: port, protocol, base image, interaction depth, and any known fingerprint quirks. 
- - **Attacker Intelligence** — Credential harvesting (`decnet creds`), session recording playback, threat intel enrichment (AbuseIPDB, GreyNoise, Shodan, OTX), MITRE ATT&CK tagging, campaign clustering. - - **Operations** — Health monitoring, watchdog behavior, teardown procedures, log rotation, troubleshooting common issues. diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..be3f7b28 --- /dev/null +++ b/LICENSE @@ -0,0 +1,661 @@ + GNU AFFERO GENERAL PUBLIC LICENSE + Version 3, 19 November 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU Affero General Public License is a free, copyleft license for +software and other kinds of works, specifically designed to ensure +cooperation with the community in the case of network server software. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +our General Public Licenses are intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + Developers that use our General Public Licenses protect your rights +with two steps: (1) assert copyright on the software, and (2) offer +you this License which gives you legal permission to copy, distribute +and/or modify the software. 
+ + A secondary benefit of defending all users' freedom is that +improvements made in alternate versions of the program, if they +receive widespread use, become available for other developers to +incorporate. Many developers of free software are heartened and +encouraged by the resulting cooperation. However, in the case of +software used on network servers, this result may fail to come about. +The GNU General Public License permits making a modified version and +letting the public access it on a server without ever releasing its +source code to the public. + + The GNU Affero General Public License is designed specifically to +ensure that, in such cases, the modified source code becomes available +to the community. It requires the operator of a network server to +provide the source code of the modified version running there to the +users of that server. Therefore, public use of a modified version, on +a publicly accessible server, gives the public access to the source +code of the modified version. + + An older license, called the Affero General Public License and +published by Affero, was designed to accomplish similar goals. This is +a different license, not a version of the Affero GPL, but Affero has +released a new version of the Affero GPL which permits relicensing under +this license. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU Affero General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Remote Network Interaction; Use with the GNU General Public License. 
+ + Notwithstanding any other provision of this License, if you modify the +Program, your modified version must prominently offer all users +interacting with it remotely through a computer network (if your version +supports such interaction) an opportunity to receive the Corresponding +Source of your version by providing access to the Corresponding Source +from a network server at no charge, through some standard or customary +means of facilitating copying of software. This Corresponding Source +shall include the Corresponding Source for any work covered by version 3 +of the GNU General Public License that is incorporated pursuant to the +following paragraph. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the work with which it is combined will remain governed by version +3 of the GNU General Public License. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU Affero General Public License from time to time. Such new versions +will be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU Affero General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU Affero General Public License, you may choose any version ever published +by the Free Software Foundation. 
+ + If the Program specifies that a proxy can decide which future +versions of the GNU Affero General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. 
+ + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If your software can interact with users remotely through a computer +network, you should also make sure that it provides a way for users to +get its source. For example, if your program is a web application, its +interface could display a "Source" link that leads users to an archive +of the code.
There are many ways you could offer source, and different +solutions will be better for different programs; see section 13 for the +specific requirements. + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU AGPL, see +<https://www.gnu.org/licenses/>. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..49e853b5 --- /dev/null +++ b/Makefile @@ -0,0 +1,261 @@ +PYTEST := .311/bin/pytest +FAIL_FAST ?= 1 +NO_CACHE ?= 0 +ARGS := + +# addopts in pyproject.toml already provides -v -q -x -n 4 --dist load. +# Unit suites inherit that; special suites clear it with --override-ini. +UNIT_FLAGS := --timeout=30 --timeout-method=thread +SEQ_FLAGS := --override-ini="addopts=-v -x" -n logical --timeout=120 --timeout-method=thread +FUZZ_FLAGS := --override-ini="addopts=-v -x" -n logical -m fuzz \ + --ignore=tests/api/test_schemathesis.py \ + --ignore=tests/api/test_schemathesis_agent.py \ + --ignore=tests/api/test_schemathesis_swarm.py \ + --ignore=tests/api/test_schemathesis_ttp.py +SCHEMA_QUICK ?= 0 +SCHEMA_FLAGS := --override-ini="addopts=-v -x" -n 4 -m fuzz --timeout=600 --timeout-method=thread +BENCH_FLAGS := --override-ini="addopts=-v" -p no:xdist --benchmark-only -m bench + +# ── Unit suites (xdist, 30s timeout) ───────────────────────────────────────── + +.PHONY: test-core +test-core: + $(PYTEST) tests/core tests/config tests/factories tests/fixtures $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-web +test-web: + $(PYTEST) tests/web tests/services $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-db +test-db: + $(PYTEST) tests/db tests/vectorstore $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-bus +test-bus: + $(PYTEST) tests/bus tests/logging tests/telemetry $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-ttp +test-ttp: + $(PYTEST) tests/ttp $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-intel +test-intel: + $(PYTEST) tests/intel tests/asn tests/geoip $(UNIT_FLAGS) $(ARGS)
+ +.PHONY: test-analysis +test-analysis: + $(PYTEST) tests/clustering tests/correlation $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-infra +test-infra: + $(PYTEST) tests/agent tests/collector tests/sniffer tests/profiler $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-fleet +test-fleet: + $(PYTEST) tests/fleet tests/swarm tests/topology tests/orchestrator tests/deploy tests/updater $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-cli +test-cli: + $(PYTEST) tests/cli tests/engine tests/mutator tests/realism $(UNIT_FLAGS) $(ARGS) + +.PHONY: test-features +test-features: + $(PYTEST) tests/canary tests/artifacts tests/webhook tests/decky_io tests/prober $(UNIT_FLAGS) $(ARGS) + +# ── Go and React suites ─────────────────────────────────────────────────────── + +_GO_MODULES := \ + decnet/templates/_caddy_modules/decnetfp \ + decnet/templates/http/_caddy_modules/decnetfp \ + decnet/templates/https/_caddy_modules/decnetfp + +.PHONY: test-go +test-go: + @failed=""; \ + for mod in $(_GO_MODULES); do \ + echo "=== go test: $$mod ==="; \ + if (cd "$$mod" && go test ./...); then \ + echo "[PASS] $$mod"; \ + else \ + echo "[FAIL] $$mod"; \ + failed="$$failed $$mod"; \ + if [ "$(FAIL_FAST)" = "1" ]; then exit 1; fi; \ + fi; \ + done; \ + [ -z "$$failed" ] + +.PHONY: test-react +test-react: + cd decnet_web && npm run test:run $(ARGS) + +# ── Special suites (sequential, longer timeout) ─────────────────────────────── + +.PHONY: test-live +test-live: + $(PYTEST) tests/live -m live $(SEQ_FLAGS) $(ARGS) + +.PHONY: test-api +test-api: + $(PYTEST) tests/api $(SEQ_FLAGS) $(ARGS) + +.PHONY: test-stress +test-stress: + $(PYTEST) tests/stress -m stress $(SEQ_FLAGS) $(ARGS) + +.PHONY: test-service +test-service: + $(PYTEST) tests/service_testing $(SEQ_FLAGS) $(ARGS) + +.PHONY: test-fuzz +test-fuzz: + $(PYTEST) $(FUZZ_FLAGS) $(ARGS) + +.PHONY: test-schema +test-schema: + SCHEMA_QUICK=$(SCHEMA_QUICK) $(PYTEST) \ + tests/api/test_schemathesis.py \ + tests/api/test_schemathesis_agent.py \ + 
tests/api/test_schemathesis_swarm.py \ + tests/api/test_schemathesis_ttp.py \ + $(SCHEMA_FLAGS) $(ARGS) + +.PHONY: test-bench +test-bench: + $(PYTEST) tests/perf $(BENCH_FLAGS) $(ARGS) + +.PHONY: test-docker +test-docker: + DECNET_LIVE_DOCKER=1 $(PYTEST) tests/docker -m docker $(SEQ_FLAGS) $(ARGS) + +# ── Static analysis ─────────────────────────────────────────────────────────── + +.PHONY: test-mypy +test-mypy: + .311/bin/mypy decnet --ignore-missing-imports --no-error-summary + +.PHONY: test-bandit +test-bandit: + .311/bin/bandit -r decnet -c pyproject.toml + +.PHONY: test-vulture +test-vulture: + .311/bin/vulture decnet --min-confidence 80 + +.PHONY: test-pip-audit +test-pip-audit: + .311/bin/pip-audit + +# ── Composite: all suites ───────────────────────────────────────────────────── + +_ALL_SUITES := core web db bus ttp intel analysis infra fleet cli features \ + go react \ + live api schema stress service fuzz bench docker \ + mypy bandit vulture pip-audit + +.PHONY: test-all test +test-all test: + @failed=""; \ + for suite in $(_ALL_SUITES); do \ + echo ""; \ + echo "══════════════════════════ $$suite ══════════════════════════"; \ + if $(MAKE) --no-print-directory test-$$suite ARGS="$(ARGS)"; then \ + echo "[PASS] $$suite"; \ + else \ + echo "[FAIL] $$suite"; \ + failed="$$failed $$suite"; \ + if [ "$(FAIL_FAST)" = "1" ]; then \ + echo "Stopping at first failure. Use FAIL_FAST=0 to run all suites."; \ + exit 1; \ + fi; \ + fi; \ + done; \ + if [ -n "$$failed" ]; then \ + echo ""; \ + echo "Failed:$$failed"; \ + exit 1; \ + fi; \ + echo ""; \ + echo "All suites passed." 
+ +# ── Decky image pre-build ───────────────────────────────────────────────────── + +_DECKY_TEMPLATES := \ + conpot docker_api elasticsearch ftp http https imap k8s ldap \ + llmnr mongodb mqtt mssql mysql pop3 postgres rdp redis sip smb smtp \ + sniffer snmp ssh telnet tftp vnc + +.PHONY: build-all +build-all: + @failed=""; \ + for svc in $(_DECKY_TEMPLATES); do \ + echo ""; \ + echo "══════════════════════════ $$svc ══════════════════════════"; \ + _nc=""; \ + if [ "$(NO_CACHE)" = "1" ]; then _nc="--no-cache"; fi; \ + if DOCKER_BUILDKIT=1 docker build $$_nc \ + -t decnet/$$svc:latest \ + decnet/templates/$$svc; then \ + echo "[BUILT] $$svc"; \ + else \ + echo "[FAIL] $$svc"; \ + failed="$$failed $$svc"; \ + if [ "$(FAIL_FAST)" = "1" ]; then \ + echo "Stopping at first failure. Use FAIL_FAST=0 to build all."; \ + exit 1; \ + fi; \ + fi; \ + done; \ + if [ -n "$$failed" ]; then \ + echo ""; \ + echo "Failed:$$failed"; \ + exit 1; \ + fi; \ + echo ""; \ + echo "All decky images built." + +.PHONY: help +help: + @echo "Unit suites (xdist, 30s timeout):" + @echo " make test-core tests/core + config + factories + fixtures" + @echo " make test-web tests/web + services" + @echo " make test-db tests/db + vectorstore" + @echo " make test-bus tests/bus + logging + telemetry" + @echo " make test-ttp tests/ttp" + @echo " make test-intel tests/intel + asn + geoip" + @echo " make test-analysis tests/clustering + correlation" + @echo " make test-infra tests/agent + collector + sniffer + profiler" + @echo " make test-fleet tests/fleet + swarm + topology + orchestrator + deploy + updater" + @echo " make test-cli tests/cli + engine + mutator + realism" + @echo " make test-features tests/canary + artifacts + webhook + decky_io + prober" + @echo "" + @echo "Go / React suites:" + @echo " make test-go go test ./... 
in each Caddy module variant" + @echo " make test-react vitest run in decnet_web" + @echo "" + @echo "Special suites (sequential, 120s timeout):" + @echo " make test-live tests/live" + @echo " make test-api tests/api (schemathesis)" + @echo " make test-stress tests/stress" + @echo " make test-service tests/service_testing" + @echo " make test-schema schemathesis contract tests (-m fuzz, xdist logical)" + @echo " make test-schema SCHEMA_QUICK=1 same, capped at 100 examples per test" + @echo " make test-fuzz hypothesis fuzz (all normal dirs, -m fuzz, skips schemathesis files)" + @echo " make test-bench tests/perf" + @echo " make test-docker tests/docker (needs DECNET_LIVE_DOCKER=1)" + @echo "" + @echo "Static analysis:" + @echo " make test-mypy mypy type check on decnet/" + @echo " make test-bandit bandit security scan on decnet/" + @echo " make test-vulture vulture dead code scan (>=80% confidence)" + @echo " make test-pip-audit pip-audit dependency vulnerability scan" + @echo "" + @echo "Composites:" + @echo " make test-all ALL suites (unit + go + react + live + api + schema + fuzz + bench + stress + docker + static analysis)" + @echo " make test-all FAIL_FAST=0 same, report all failures instead of stopping" + @echo "" + @echo "Passthrough: make test-web ARGS='--lf -s'" + @echo "" + @echo "Decky images:" + @echo " make build-all build decnet/:latest for all 27 decky templates" + @echo " make build-all NO_CACHE=1 same, bypassing Docker layer cache" + @echo " make build-all FAIL_FAST=0 same, continue past failures" diff --git a/README.md b/README.md index b25e33d7..4898b65f 100644 --- a/README.md +++ b/README.md @@ -73,6 +73,7 @@ From the outside a decky looks identical to a real machine: its own MAC address, - Linux host (bare metal or VM — WSL has MACVLAN limitations) - Docker Engine 24+ - Python 3.11–3.13 (Python 3.14 is not yet supported — see [stress test notes](#stress-testing)) +- Node.js 18+ (required for canary token JS obfuscation) - Root / `sudo` for 
network setup (MACVLAN creation, host interface config) - NIC in promiscuous mode for MACVLAN (or use `--ipvlan` on WiFi) @@ -119,6 +120,15 @@ sudo decnet deploy --mode unihost --deckies 5 --interface eth0 --randomize-servi ### Start the API server and web dashboard +Recommended (systemd-managed): + +```bash +sudo .venv/bin/decnet init # first-time setup: writes systemd units +sudo systemctl start "decnet-*.service" # start all DECNET services +``` + +For development / quick runs, start the processes directly in the foreground: + ```bash decnet api start # REST API on :8000 decnet web start # Dashboard on :8080 @@ -230,11 +240,8 @@ The full command tree has grown significantly. Commands are gated by deployment | Command | Description | |---|---| | `decnet deploy` | Deploy deckies (unihost or swarm mode) | -| `decnet lifecycle start\|stop\|restart\|status` | Manage individual decky lifecycle | | `decnet teardown` | Stop and remove deckies | | `decnet status` | Print fleet state table | -| `decnet reconcile` | Reconcile desired fleet state with Docker reality | -| `decnet inventory` | List all known deckies and their metadata | #### `decnet deploy` flags @@ -257,38 +264,6 @@ The full command tree has grown significantly. 
Commands are gated by deployment | `--no-cache` | false | Force rebuild all images | | `--config` / `-c` | — | INI config file path | -### Services & intelligence - -| Command | Description | -|---|---| -| `decnet api start\|stop\|status` | Manage the REST API server | -| `decnet web start\|stop\|status` | Manage the web dashboard | -| `decnet bus start\|stop\|status` | Manage the service bus broker | -| `decnet workers start\|stop\|status` | Manage background workers | -| `decnet profiler start\|stop\|status` | Manage the attacker profiler worker | -| `decnet orchestrator start\|stop\|status` | Manage synthetic traffic injection | -| `decnet sniffer start\|stop\|status` | Manage passive packet capture | -| `decnet forwarder start\|stop\|status` | Manage syslog-over-TLS forwarder | -| `decnet listener start\|stop\|status` | Manage inbound syslog listener | - -### Topology, canary & intelligence - -| Command | Description | -|---|---| -| `decnet topology list\|create\|deploy\|teardown` | Manage MazeNET topologies | -| `decnet canary plant\|list\|revoke` | Manage canary tokens | -| `decnet ttp list\|tag\|export` | TTP tagging and STIX/MISP export | -| `decnet geoip lookup` | GeoIP + ASN enrichment | -| `decnet webhook list\|create\|delete\|test` | Manage alert webhooks | - -### Swarm & agent - -| Command | Description | -|---|---| -| `decnet swarm add\|remove\|list` | Manage swarm hosts | -| `decnet swarmctl deploy\|status\|teardown` | Control remote swarm agents | -| `decnet agent start\|stop\|status` | Run this host as an agent node | -| `decnet updater push\|rollback` | Push package updates to swarm agents | ### Utilities @@ -297,9 +272,6 @@ The full command tree has grown significantly. 
Commands are gated by deployment | `decnet services` | List all 25 registered honeypot service plugins | | `decnet distros` | List OS distro profiles | | `decnet archetypes` | List machine archetype profiles | -| `decnet db reset\|migrate` | Database operations | -| `decnet realism start\|stop` | Background LAN traffic generation | -| `decnet init` | Initialise a new DECNET deployment directory | --- @@ -307,8 +279,17 @@ The full command tree has grown significantly. Commands are gated by deployment ### Start +Recommended (systemd-managed): + +```bash +cp .env.example .env.local # edit JWT secret, ports, DB backend +sudo .venv/bin/decnet init # writes systemd units +sudo systemctl start "decnet-*.service" # starts API, workers, bus +``` + +For development / quick runs, start the processes directly in the foreground: + ```bash -cp .env.example .env.local # edit JWT secret, ports, DB backend decnet api start # :8000 decnet web start # :8080 ``` @@ -357,23 +338,17 @@ Set `DECNET_DB_BACKEND=mysql` and configure `DECNET_DB_*` env vars. DECNET supports multi-host deployments. One host runs as **master** (API + intelligence stack); others run as **agents** (decky engine only). -```bash -# On master -decnet swarm add --host 192.168.0.20 --name edge-01 --cert /path/to/cert.pem -decnet swarmctl deploy --host edge-01 --config mynet.ini +Swarm management is handled through the REST API (`/api/v1/swarm/`). On each agent host, initialise and start the agent service: +```bash # On agent host -decnet agent start +sudo .venv/bin/decnet init +sudo systemctl start decnet-agent.service ``` Agents authenticate to the master with per-host mTLS client certificates. The master verifies each agent's certificate fingerprint against `SwarmHost.client_cert_fingerprint` — CA-issued but not fingerprint-pinned is rejected. 
-Update distribution: + -```bash -decnet updater push --host edge-01 # push new package version -decnet updater rollback --host edge-01 # rollback to previous -``` +Package updates are distributed to agents via the REST API (`/api/v1/swarm/updater/`). --- @@ -382,7 +357,8 @@ decnet updater rollback --host edge-01 # rollback to previous When a host runs as an agent (`DECNET_MODE=agent`), the master-only commands and the full REST API are disabled. The agent exposes a minimal internal API for the master to drive topology operations, heartbeat, and log forwarding. ```bash -DECNET_MODE=agent decnet agent start +sudo DECNET_MODE=agent .venv/bin/decnet init +sudo systemctl start decnet-agent.service ``` Cross-host log forwarding uses RFC 5425 syslog-over-TLS on port 6514 with mutual TLS. Plaintext syslog is only permitted on loopback. @@ -449,14 +425,7 @@ Captured credentials from SSH, SMB, RDP, and web honeypots are deduplicated and MazeNET is DECNET's visual network-of-networks canvas. It lets you design multi-subnet deception environments, deploy them as live decky fleets, and observe attacker movement across segments. -```bash -decnet topology list -decnet topology create --name corp-lan --config mynet.ini -decnet topology deploy --id -decnet topology teardown --id -``` - -Topologies are designed in the web dashboard with a drag-and-drop canvas. Each node is either a **decky** (managed honeypot) or an **observed entity** (read-only attacker-pool node). Canvas positions persist per topology in the dashboard. +Topologies are managed through the REST API (`/api/v1/topology/`) and the web dashboard. Topologies are designed in the web dashboard with a drag-and-drop canvas. Each node is either a **decky** (managed honeypot) or an **observed entity** (read-only attacker-pool node). Canvas positions persist per topology in the dashboard. 
Topology mutations are async — the API returns immediately and the deployment status is polled via `GET /api/v1/topology/{id}/mutations/latest` or streamed via SSE. @@ -466,14 +435,13 @@ Topology mutations are async — the API returns immediately and the deployment Canary tokens are deception artefacts planted inside decky filesystems, emails, documents, and DNS responses. When triggered, they fire `canary.{token_id}.triggered` bus events and optionally call configured webhooks. -```bash -decnet canary plant --type url --decky decky-01 --label "corp-vpn-creds" -decnet canary plant --type dns --label "internal-share" -decnet canary list -decnet canary revoke --id -``` +Canary tokens are managed through the REST API (`/api/v1/canary/`) and the web dashboard. -Token types include: URL, DNS, document (PDF), image, email link. The `decnet canary-install-toolchain` command installs the Node.js tools used for obfuscated token generation. +Token types include: URL, DNS, document (PDF), image, email link. After `decnet init`, install the JS obfuscation toolchain once: + +```bash +decnet canary-install-toolchain +``` --- @@ -481,14 +449,7 @@ Token types include: URL, DNS, document (PDF), image, email link. The `decnet ca DECNET maps observed attacker behaviours to MITRE ATT&CK techniques using an inotify-backed rule store. Matched techniques are published as `ttp.tagged` bus events. -```bash -decnet ttp list # list techniques in loaded ATT&CK bundle -decnet ttp tag --attacker-id -decnet ttp export stix --attacker-id --output bundle.json -decnet ttp export misp --attacker-id --output event.json -``` - -Exports produce standard STIX 2.1 bundles and MISP events. DECNET uses the official MITRE ATT&CK STIX enterprise bundle and the CIRCL misp-stix converter. STIX custom extensions follow inter-DECNET round-trip semantics first; MISP/OpenCTI compatibility is secondary. +TTP tagging and exports are driven through the REST API (`/api/v1/ttp/`) and the web dashboard. 
Exports produce standard STIX 2.1 bundles and MISP events. DECNET uses the official MITRE ATT&CK STIX enterprise bundle and the CIRCL misp-stix converter. STIX custom extensions follow inter-DECNET round-trip semantics first; MISP/OpenCTI compatibility is secondary. --- diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 00000000..4535bbe9 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,147 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts. +# this is typically a path given in POSIX (e.g. forward slashes) +# format, relative to the token %(here)s which refers to the location of this +# ini file +script_location = %(here)s/decnet/web/db/migrations + +# template used to generate migration file names; The default value is %%(rev)s_%%(slug)s +# Uncomment the line below if you want the files to be prepended with date and time +# see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file +# for all available tokens +# file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s +# Or organize into date-based subdirectories (requires recursive_version_locations = true) +# file_template = %%(year)d/%%(month).2d/%%(day).2d_%%(hour).2d%%(minute).2d_%%(second).2d_%%(rev)s_%%(slug)s + +# sys.path path, will be prepended to sys.path if present. +# defaults to the current working directory. for multiple paths, the path separator +# is defined by "path_separator" below. +prepend_sys_path = . + + +# timezone to use when rendering the date within the migration file +# as well as the filename. +# If specified, requires the tzdata library which can be installed by adding +# `alembic[tz]` to the pip requirements. 
+# string value is passed to ZoneInfo() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; This defaults +# to <script_location>/versions. When using multiple version +# directories, initial revisions must be specified with --version-path. +# The path separator used here should be the separator specified by "path_separator" +# below. +# version_locations = %(here)s/bar:%(here)s/bat:%(here)s/alembic/versions + +# path_separator; This indicates what character is used to split lists of file +# paths, including version_locations and prepend_sys_path within configparser +# files such as alembic.ini. +# The default rendered in new alembic.ini files is "os", which uses os.pathsep +# to provide os-dependent path splitting. +# +# Note that in order to support legacy alembic.ini files, this default does NOT +# take place if path_separator is not present in alembic.ini. If this +# option is omitted entirely, fallback logic is as follows: +# +# 1. Parsing of the version_locations option falls back to using the legacy +# "version_path_separator" key, which if absent then falls back to the legacy +# behavior of splitting on spaces and/or commas. +# 2. Parsing of the prepend_sys_path option falls back to the legacy +# behavior of splitting on spaces, commas, or colons. +# +# Valid values for path_separator are: +# +# path_separator = : +# path_separator = ; +# path_separator = space +# path_separator = newline +# +# Use os.pathsep. Default configuration used for new projects. 
+path_separator = os + +# set to 'true' to search source files recursively +# in each "version_locations" directory +# new in Alembic version 1.10 +# recursive_version_locations = false + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +# NOTE: no sqlalchemy.url here on purpose. env.py selects the engine from +# DECNET_DB_TYPE (sqlite|mysql), mirroring decnet/web/db/factory.py. + + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks = black +# black.type = console_scripts +# black.entrypoint = black +# black.options = -l 79 REVISION_SCRIPT_FILENAME + +# lint with attempts to fix using "ruff" - use the module runner, against the "ruff" module +# hooks = ruff +# ruff.type = module +# ruff.module = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Alternatively, use the exec runner to execute a binary found on your PATH +# hooks = ruff +# ruff.type = exec +# ruff.executable = ruff +# ruff.options = check --fix REVISION_SCRIPT_FILENAME + +# Logging configuration. This is also consumed by the user-maintained +# env.py script only. 
+[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARNING +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARNING +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S diff --git a/bait/.gitkeep b/bait/.gitkeep new file mode 100644 index 00000000..e69de29b diff --git a/bait/README.md b/bait/README.md new file mode 100644 index 00000000..ef4dbd02 --- /dev/null +++ b/bait/README.md @@ -0,0 +1,5 @@ +# bait/ + +Default operator-supplied email seed for IMAP/POP3 deckies. Drop `*.eml` and/or `*.json` files here; the IMAP/POP3 services bind-mount this dir read-only at `/var/spool/decnet-emails/seed` when no per-decky `email_seed` is configured. Entries concatenate onto the hardcoded bait baseline (additive to realism-engine output, never replacing). + +JSON shape: list of dicts with required `from_addr`, `to_addr`, `subject`, `body`; optional `from_name`, `date`, `flags`. See `decnet/templates/imap/server.py` for the loader. diff --git a/decnet.ini.example b/decnet.ini.example new file mode 100644 index 00000000..21698964 --- /dev/null +++ b/decnet.ini.example @@ -0,0 +1,64 @@ +; /etc/decnet/decnet.ini — DECNET host configuration +; +; Copy to /etc/decnet/decnet.ini and edit. Values here seed os.environ at +; CLI startup via setdefault() — real env vars still win, so you can +; override any value on the shell without editing this file. +; +; A missing file is fine; every daemon has sensible defaults. The main +; reason to use this file is to skip typing the same flags on every +; `decnet` invocation and to pin a host's role via `mode`. 
+ +[decnet] +; mode = agent | master +; agent — worker host (runs `decnet agent`, `decnet forwarder`, `decnet updater`). +; Master-only commands (api, swarmctl, swarm, deploy, teardown, ...) +; are hidden from `decnet --help` and refuse to run. +; master — central server (runs `decnet api`, `decnet web`, `decnet swarmctl`, +; `decnet listener`). All commands visible. +mode = agent + +; disallow-master = true (default when mode=agent) +; Set to false for hybrid dev hosts that legitimately run both roles. +disallow-master = true + +; log-directory — root for DECNET's per-component logs. Systemd units set +; DECNET_SYSTEM_LOGS=<log-directory>/decnet.<component>.log so agent, forwarder, +; and engine each get their own file. The forwarder tails decnet.log. +log-directory = /var/log/decnet + + +; ─── Agent-only settings (read when mode=agent) ─────────────────────────── +[agent] +; Where the master's syslog-TLS listener lives. DECNET_SWARM_MASTER_HOST. +master-host = 192.168.1.50 +; Master listener port (RFC 5425 default 6514). DECNET_SWARM_SYSLOG_PORT. +swarm-syslog-port = 6514 +; Bind address/port for this worker's agent API (mTLS). +agent-port = 8765 +; Cert bundle dir — must contain ca.crt, worker.crt, worker.key from enroll. +; DECNET_AGENT_DIR — honored by the forwarder child as well. +agent-dir = /home/anti/.decnet/agent +; Updater cert bundle (required for `decnet updater`). +updater-dir = /home/anti/.decnet/updater + + +; ─── Master-only settings (read when mode=master) ───────────────────────── +[master] +; Main API (REST for the React dashboard). DECNET_API_HOST / _PORT. +api-host = 0.0.0.0 +api-port = 8000 +; React dev-server dashboard (`decnet web`). DECNET_WEB_HOST / _PORT. +web-host = 0.0.0.0 +web-port = 8080 +; Swarm controller (master-internal). DECNET_SWARMCTL_HOST isn't exposed +; under that name today — this block is the forward-compatible spelling. +; swarmctl-host = 127.0.0.1 +; swarmctl-port = 8770 +; Syslog-over-TLS listener bind address and port. 
DECNET_LISTENER_HOST and +; DECNET_SWARM_SYSLOG_PORT. The listener is auto-spawned by `decnet swarmctl`. +listener-host = 0.0.0.0 +swarm-syslog-port = 6514 +; Master CA dir (for enroll / swarm cert issuance). +; ca-dir = /home/anti/.decnet/ca +; JWT secret for the web API. MUST be set; 32+ bytes. Keep out of git. +; jwt-secret = REPLACE_ME_WITH_A_32_BYTE_SECRET diff --git a/decnet.tar b/decnet.tar new file mode 100644 index 00000000..02de619a Binary files /dev/null and b/decnet.tar differ diff --git a/decnet/__init__.py b/decnet/__init__.py index e69de29b..a6030a27 100644 --- a/decnet/__init__.py +++ b/decnet/__init__.py @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""DECNET — honeypot deception-network framework. + +This __init__ runs once, on the first `import decnet.*`. It seeds +os.environ from /etc/decnet/decnet.ini (if present) so that later +module-level reads in decnet.env pick up the INI values as if they had +been exported by the shell. Real env vars always win via setdefault(). + +Kept minimal on purpose — any heavier work belongs in a submodule. +""" +from decnet.config_ini import load_ini_config as _load_ini_config + +_load_ini_config() diff --git a/decnet/agent/__init__.py b/decnet/agent/__init__.py new file mode 100644 index 00000000..85e59e18 --- /dev/null +++ b/decnet/agent/__init__.py @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""DECNET worker agent — runs on every SWARM worker host. + +Exposes an mTLS-protected FastAPI service the master's SWARM controller +calls to deploy, mutate, and tear down deckies locally. The agent reuses +the existing `decnet.engine.deployer` code path unchanged, so a worker runs +deckies the same way `decnet deploy --mode unihost` does today. +""" diff --git a/decnet/agent/app.py b/decnet/agent/app.py new file mode 100644 index 00000000..fd0c3184 --- /dev/null +++ b/decnet/agent/app.py @@ -0,0 +1,366 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Worker-side FastAPI app. 
+ +Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started +with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any +client that cannot prove a cert signed by the DECNET CA is rejected before +reaching a handler. Once past the TLS handshake, all peers are trusted +equally (the only entity holding a CA-signed cert is the master +controller). + +Endpoints mirror the existing unihost CLI verbs: + +* ``POST /deploy`` — body: serialized ``DecnetConfig`` +* ``POST /teardown`` — body: optional ``{"decky_id": "..."}`` +* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}`` +* ``GET /status`` — deployment snapshot +* ``GET /health`` — liveness probe, does NOT require mTLS? No — mTLS + still required; master pings it with its cert. +""" +from __future__ import annotations + +import asyncio +import os +import pathlib +from contextlib import asynccontextmanager +from typing import Any, Optional + +from fastapi import FastAPI, HTTPException +from fastapi.responses import JSONResponse +from pydantic import BaseModel, Field + +import contextlib + +from decnet.agent import executor as _exec +from decnet.agent import heartbeat as _heartbeat +from decnet.agent import topology_ops as _topology_ops +from decnet.bus.factory import get_bus +from decnet.bus.publish import run_health_heartbeat +from decnet.swarm.pki import DEFAULT_AGENT_DIR +from decnet.agent.topology_store import AlreadyApplied, TopologyStore +from decnet.config import DecnetConfig +from decnet.logging import get_logger +from decnet.topology.validate import ValidationError + +log = get_logger("agent.app") + + +def _resolve_agent_dir() -> pathlib.Path: + env = os.environ.get("DECNET_AGENT_DIR") + if env: + return pathlib.Path(env) + system = pathlib.Path("/etc/decnet/agent") + if system.exists(): + return system + return DEFAULT_AGENT_DIR + + +# Module-level singleton. Created lazily on first use so tests can +# monkeypatch DECNET_AGENT_DIR before the store binds to a path. 
+_topology_store: Optional[TopologyStore] = None + + +def _store() -> TopologyStore: + global _topology_store + if _topology_store is None: + _topology_store = TopologyStore(_resolve_agent_dir() / "topology.db") + return _topology_store + + +_collector_task: Optional[asyncio.Task] = None + + +def _ensure_collector_started() -> None: + """Spawn the log collector on demand — called from /topology/apply + after a successful materialise. We must NOT start this in the + lifespan hook: the agent's boot invariant is "never touch docker + until master tells us to" (see tests/swarm/test_agent_no_auto_restore.py). + + The collector watches ``decnet.topology.service=true`` labels via + docker events, writing RFC 5424 lines to ``DECNET_AGENT_LOG_FILE`` + which the forwarder ships to the master over syslog-TLS. Idempotent: + subsequent calls while the task is still running are no-ops. + """ + global _collector_task + if _collector_task is not None and not _collector_task.done(): + return + from decnet.env import DECNET_AGENT_LOG_FILE + + try: + from decnet.collector.worker import log_collector_worker + except Exception: # noqa: BLE001 — docker may be unavailable on dev + log.warning( + "agent log collector not starting — collector worker import failed", + exc_info=True, + ) + return + _collector_task = asyncio.create_task( + log_collector_worker(DECNET_AGENT_LOG_FILE), + name="agent-log-collector", + ) + log.info("agent log collector started log_file=%s", DECNET_AGENT_LOG_FILE) + + +_bus_heartbeat_task: Optional[asyncio.Task] = None + + +@asynccontextmanager +async def _lifespan(app: FastAPI): + # Best-effort: if identity/bundle plumbing isn't configured (e.g. dev + # runs or non-enrolled hosts), heartbeat.start() is a silent no-op. + _heartbeat.start() + + # Host-local bus heartbeat (system.agent.health). 
Separate channel + # from the mTLS master-facing heartbeat above; this one lets peers on + # the same host (dashboard, updater) see the agent is alive without + # hitting its HTTPS endpoint. Bus-disabled path is a no-op loop. + bus = None + try: + bus = get_bus(client_name="agent") + await bus.connect() + except Exception as exc: # noqa: BLE001 + log.warning("agent: bus unavailable, skipping health heartbeat: %s", exc) + bus = None + + global _bus_heartbeat_task + _bus_heartbeat_task = asyncio.create_task( + run_health_heartbeat(bus, "agent"), + name="agent-bus-heartbeat", + ) + + try: + yield + finally: + await _heartbeat.stop() + if _bus_heartbeat_task is not None: + _bus_heartbeat_task.cancel() + with contextlib.suppress(asyncio.CancelledError, Exception): + await _bus_heartbeat_task + _bus_heartbeat_task = None + if bus is not None: + with contextlib.suppress(Exception): + await bus.close() + global _collector_task + if _collector_task is not None and not _collector_task.done(): + _collector_task.cancel() + try: + await _collector_task + except (asyncio.CancelledError, Exception): # noqa: BLE001 + pass + _collector_task = None + global _topology_store + if _topology_store is not None: + _topology_store.close() + _topology_store = None + + +app = FastAPI( + title="DECNET SWARM Agent", + version="0.1.0", + docs_url=None, # no interactive docs on worker — narrow attack surface + redoc_url=None, + openapi_url=None, + lifespan=_lifespan, + responses={ + 400: {"description": "Malformed request body"}, + 500: {"description": "Executor error"}, + }, +) + + +# ------------------------------------------------------------------ schemas + +class DeployRequest(BaseModel): + config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker") + dry_run: bool = False + no_cache: bool = False + + +class TeardownRequest(BaseModel): + decky_id: Optional[str] = None + + +class MutateRequest(BaseModel): + decky_id: str + services: list[str] + dry_run: 
# Strong references to in-flight background deploys. The event loop only
# keeps weak references to tasks, so a task nobody else references can be
# garbage-collected before it finishes; entries remove themselves when done.
_deploy_tasks: set[asyncio.Task] = set()


@app.post(
    "/deploy",
    status_code=202,
    responses={202: {"description": "Deploy accepted; runs in background; lifecycle deltas pushed via heartbeat"}},
)
async def deploy(req: DeployRequest) -> dict:
    """Spawn the deploy in the background and return 202 immediately.

    The master tracks per-decky completion via lifecycle deltas pushed on
    the next heartbeat (one immediate push on completion, plus the
    scheduled 30 s ticks as a fallback). Holding the request open across
    a multi-minute compose build was the previous source of the wizard
    API-hang.

    Returns:
        ``{"status": "accepted", "deckies": [...]}`` listing the decky
        names taken from the submitted config.
    """
    task = asyncio.create_task(
        _exec.deploy_async(req.config, dry_run=req.dry_run, no_cache=req.no_cache),
        name=f"deploy-{id(req)}",
    )
    # Hold the task until it completes so it cannot vanish mid-deploy.
    _deploy_tasks.add(task)
    task.add_done_callback(_deploy_tasks.discard)
    return {"status": "accepted", "deckies": [d.name for d in req.config.deckies]}
Fire-and-forget — returns 202 before + the reaper starts deleting files.""" + try: + await _exec.self_destruct() + except Exception as exc: + log.exception("agent.self_destruct failed") + raise HTTPException(status_code=500, detail=str(exc)) from exc + return {"status": "self_destruct_scheduled"} + + +# ------------------------------------------------------- topology endpoints + + +class ApplyTopologyRequest(BaseModel): + hydrated: dict[str, Any] = Field( + ..., description="Hydrated topology dict from master.persistence.hydrate()" + ) + version_hash: str = Field( + ..., description="Master's canonical_hash(hydrated); must match ours" + ) + + +class TeardownTopologyRequest(BaseModel): + topology_id: str = Field(..., description="Topology UUID to dismantle") + + +@app.post( + "/topology/apply", + responses={ + 400: {"description": "Malformed hydrated topology or hash mismatch"}, + 409: {"description": "A different topology is already applied"}, + 500: {"description": "Docker or compose raised while applying"}, + }, +) +async def topology_apply(req: ApplyTopologyRequest) -> dict: + store = _store() + try: + await _topology_ops.apply(req.hydrated, req.version_hash, store) + except _topology_ops.HashMismatch as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + except ValidationError as exc: + raise HTTPException(status_code=400, detail=str(exc)) from exc + except AlreadyApplied as exc: + raise HTTPException(status_code=409, detail=str(exc)) from exc + except Exception as exc: + log.exception("agent.topology_apply failed") + topology_id = (req.hydrated.get("topology") or {}).get("id") + if topology_id: + try: + store.record_error( + str(topology_id), str(exc)[:500], hydrated=req.hydrated, + ) + except Exception: # noqa: BLE001 — don't mask original failure + log.exception("failed to record apply error") + raise HTTPException(status_code=500, detail=str(exc)) from exc + _ensure_collector_started() + return {"status": "applied", "version_hash": 
# Strong references to in-flight background mutates. The event loop only
# keeps weak references to tasks, so a task nobody else references can be
# garbage-collected before it finishes; entries remove themselves when done.
_mutate_tasks: set[asyncio.Task] = set()


@app.post(
    "/mutate",
    status_code=202,
    responses={
        202: {"description": "Mutate accepted; runs in background; lifecycle delta pushed via heartbeat"},
        404: {"description": "No active deployment, or unknown decky_id (dry_run validation only)"},
    },
)
async def mutate(req: MutateRequest) -> Any:
    """Spawn the mutate in the background and return 202 immediately.

    Master tracks completion via a lifecycle delta pushed on the next
    heartbeat (immediate push on completion). ``dry_run`` is still
    handled inside the request — it validates against the worker's
    current state and returns the would-be services without spawning a
    task or touching docker, so the wizard's preview path stays cheap.

    Raises:
        HTTPException: 404 on a ``dry_run`` when there is no saved
            deployment state or ``decky_id`` is not part of it.
    """
    if req.dry_run:
        from decnet.config import load_state
        state = load_state()
        if state is None:
            raise HTTPException(
                status_code=404,
                detail="no active deployment on this worker",
            )
        cfg, _ = state
        decky = next((d for d in cfg.deckies if d.name == req.decky_id), None)
        if decky is None:
            raise HTTPException(
                status_code=404,
                detail=f"decky {req.decky_id!r} not found in worker state",
            )
        # Explicit 200: the route's default status code is 202.
        return JSONResponse(
            status_code=200,
            content={
                "status": "dry_run",
                "decky_id": req.decky_id,
                "services": list(req.services),
            },
        )

    task = asyncio.create_task(
        _exec.mutate_async(req.decky_id, list(req.services)),
        name=f"mutate-{req.decky_id}",
    )
    # Hold the task until it completes so it cannot vanish mid-mutate.
    _mutate_tasks.add(task)
    task.add_done_callback(_mutate_tasks.discard)
    return {
        "status": "accepted",
        "decky_id": req.decky_id,
        "services": list(req.services),
    }
def _relocalize(config: DecnetConfig) -> DecnetConfig:
    """Rewrite a master-built config to the worker's local network reality.

    The master populates ``interface``/``subnet``/``gateway`` from its own
    box before dispatching, which blows up the deployer on any worker whose
    NIC name differs (common in heterogeneous fleets — master on ``wlp6s0``,
    worker on ``enp0s3``). We always re-detect locally; if the worker sits
    on a different subnet than the master, decky IPs are re-allocated from
    the worker's subnet so they're actually reachable.

    Returns:
        A copy of *config* with the locally detected interface, subnet and
        gateway, and with re-addressed deckies when the subnets differ.

    Raises:
        RuntimeError: the allocator returned fewer addresses than there
            are deckies. Pairing them anyway would silently drop the
            deckies left without an address.
    """
    local_iface = detect_interface()
    local_subnet, local_gateway = detect_subnet(local_iface)
    local_host_ip = get_host_ip(local_iface)

    updates: dict[str, Any] = {
        "interface": local_iface,
        "subnet": local_subnet,
        "gateway": local_gateway,
    }

    # strict=False: accept subnets written with host bits set.
    master_net = IPv4Network(config.subnet, strict=False) if config.subnet else None
    local_net = IPv4Network(local_subnet, strict=False)
    if master_net is None or master_net != local_net:
        log.info(
            "agent.deploy subnet mismatch master=%s local=%s — re-allocating decky IPs",
            config.subnet, local_subnet,
        )
        fresh_ips = list(
            allocate_ips(
                subnet=local_subnet,
                gateway=local_gateway,
                host_ip=local_host_ip,
                count=len(config.deckies),
            )
        )
        # zip() stops at the shorter input, so check the count first.
        if len(fresh_ips) < len(config.deckies):
            raise RuntimeError(
                f"allocated {len(fresh_ips)} IPs from {local_subnet} "
                f"for {len(config.deckies)} deckies"
            )
        updates["deckies"] = [
            d.model_copy(update={"ip": ip})
            for d, ip in zip(config.deckies, fresh_ips)
        ]

    return config.model_copy(update=updates)
async def deploy_async(
    config: DecnetConfig, *, dry_run: bool = False, no_cache: bool = False,
) -> None:
    """Background-task body for /deploy.

    Runs the deploy, then pushes one lifecycle delta per decky to the
    master so it sees the terminal transition immediately instead of
    waiting for the next scheduled heartbeat. A deploy failure is logged
    and reported as ``failed`` deltas carrying the error text (capped at
    2000 characters); it is not re-raised from this coroutine.
    """
    from datetime import datetime, timezone
    from decnet.agent.heartbeat import push_lifecycle_delta

    def _delta(name: str, status: str, error: str | None = None) -> dict:
        # Key order mirrors the payload shape: error (if any) precedes
        # the completion timestamp.
        entry = {"decky_name": name, "operation": "deploy", "status": status}
        if error is not None:
            entry["error"] = error[:2000]
        entry["completed_at"] = datetime.now(timezone.utc).isoformat()
        return entry

    names = [d.name for d in config.deckies]
    failure: str | None = None
    try:
        await deploy(config, dry_run=dry_run, no_cache=no_cache)
    except Exception as exc:  # noqa: BLE001
        log.exception("agent.deploy_async failed")
        failure = f"{type(exc).__name__}: {exc}"

    outcome = "succeeded" if failure is None else "failed"
    await push_lifecycle_delta([_delta(name, outcome, failure) for name in names])
def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]:
    """Map decky_name → {"running": bool, "services": {svc: container_state}}.

    Queried so the master can tell, after a partial-failure deploy, which
    deckies actually came up instead of tainting the whole shard as failed.
    Best-effort: a docker error returns an empty map, not an exception.

    A service's container is looked up by the name
    ``<decky>-<service with "_" replaced by "-">``; a container that is not
    listed is reported as ``"absent"``. A decky counts as running only
    when it has at least one service and all of them are ``"running"``.
    """
    try:
        import docker  # local import — agent-only path
        client = docker.from_env()
        try:
            live = {
                c.name: c.status
                for c in client.containers.list(all=True, ignore_removed=True)
            }
        finally:
            # A client is created on every call; close it so its
            # connections are released rather than left for the GC.
            client.close()
    except Exception:  # pragma: no cover — defensive
        log.exception("_decky_runtime_states: docker query failed")
        return {}

    out: dict[str, dict[str, Any]] = {}
    for d in config.deckies:
        svc_states = {
            svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent")
            for svc in d.services
        }
        out[d.name] = {
            "running": bool(svc_states) and all(s == "running" for s in svc_states.values()),
            "services": svc_states,
        }
    return out
async def self_destruct() -> None:
    """Tear down deckies, then launch a detached reaper script.

    The reaper (``_REAPER_SCRIPT``) is written to a temp file and spawned
    without being waited on, so this coroutine returns right away and the
    HTTP response can drain before files start disappearing under the
    agent.
    """
    import os
    import shutil
    import subprocess  # nosec B404
    import tempfile

    # Best-effort teardown first — the reaper also runs docker stop, but
    # going through the deployer gives the host-macvlan/ipvlan helper a
    # chance to clean up routes cleanly.
    try:
        await asyncio.to_thread(_deployer.teardown, None)
        await asyncio.to_thread(clear_state)
    except Exception:
        log.exception("self_destruct: pre-reap teardown failed — reaper will force-stop containers")

    # The script lives under /tmp so it survives rm -rf /opt/decnet*.
    fd, path = tempfile.mkstemp(prefix="decnet-reaper-", suffix=".sh", dir="/tmp")  # nosec B108 — reaper must outlive /opt/decnet removal
    try:
        os.write(fd, _REAPER_SCRIPT.encode())
    finally:
        os.close(fd)
    os.chmod(path, 0o700)  # nosec B103 — root-owned reaper, needs exec

    # The reaper MUST run outside decnet-agent.service's cgroup: otherwise
    # `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper
    # included) before rm -rf completes. A new POSIX session alone does
    # not escape the cgroup, so when systemd-run is available the script
    # is launched through it as a transient unit (no --scope flag is
    # passed). Without systemd-run (non-systemd host / container) we fall
    # back to running bash directly.
    launcher = shutil.which("systemd-run")
    command = ["/bin/bash", path]
    if launcher:
        command = [
            launcher,
            "--collect",
            "--unit", f"decnet-reaper-{os.getpid()}",
            "--description", "DECNET agent self-destruct reaper",
            *command,
        ]

    subprocess.Popen(  # nosec B603
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        close_fds=True,
        start_new_session=True,
    )
    log.warning(
        "self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s",
        path, "systemd-run" if launcher else "popen",
    )
def _resolve_agent_dir() -> pathlib.Path:
    """Pick the directory holding the agent's cert bundle.

    Resolution order: a non-empty ``DECNET_AGENT_DIR`` environment
    variable, else ``/etc/decnet/agent`` if that path exists, else
    ``pki.DEFAULT_AGENT_DIR``.
    """
    import os

    override = os.environ.get("DECNET_AGENT_DIR")
    if override:
        return pathlib.Path(override)
    installed = pathlib.Path("/etc/decnet/agent")
    return installed if installed.exists() else pki.DEFAULT_AGENT_DIR
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
    """Send one heartbeat POST and log any response other than 204.

    A rejection is only logged, never raised, so the calling loop keeps
    ticking: 403 / 404 are terminal-ish, but an operator may re-enrol the
    host mid-session, and the loud warning lets prod ops spot
    cert-pinning drift.
    """
    payload = await _build_body(host_uuid, agent_version)
    response = await client.post(url, json=payload)
    if response.status_code != 204:
        log.warning(
            "heartbeat rejected status=%d body=%s",
            response.status_code, response.text[:200],
        )
async def push_lifecycle_delta(deltas: list[dict]) -> None:
    """Fire a one-off heartbeat POST carrying *deltas* in ``lifecycle``.

    Each delta has the shape ``{decky_name, operation, status, error?,
    completed_at?}``. The agent executor calls this when /deploy and
    /mutate finish, so the master sees the terminal transition at once
    rather than up to ``INTERVAL_S`` later.

    Does nothing for an empty list, when host identity is unconfigured,
    or when the worker SSL context cannot be built. Failures are logged
    and swallowed.

    NOTE(review): the scheduled tick builds its body without a
    ``lifecycle`` field, so a failed push is not re-sent from this side —
    presumably the master reconciles from the regular status snapshot;
    confirm.
    """
    from decnet.env import (
        DECNET_HOST_UUID,
        DECNET_MASTER_HOST,
        DECNET_SWARMCTL_PORT,
    )

    if not deltas:
        return
    if not (DECNET_HOST_UUID and DECNET_MASTER_HOST):
        log.debug("push_lifecycle_delta: identity unconfigured — skipping")
        return

    bundle_dir = _resolve_agent_dir()
    try:
        tls_context = build_worker_ssl_context(bundle_dir)
    except Exception:
        log.exception("push_lifecycle_delta: SSL context unavailable")
        return

    try:
        from decnet import __version__ as agent_version  # type: ignore[attr-defined]
    except Exception:
        agent_version = "unknown"

    endpoint = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat"
    try:
        async with httpx.AsyncClient(verify=tls_context, timeout=_TIMEOUT) as client:
            payload = await _build_body(
                DECNET_HOST_UUID, agent_version, lifecycle=deltas,
            )
            response = await client.post(endpoint, json=payload)
            accepted = response.status_code in (200, 204)
            if not accepted:
                log.warning(
                    "lifecycle delta push rejected status=%d body=%s",
                    response.status_code, response.text[:200],
                )
    except Exception:
        log.exception("push_lifecycle_delta failed — next scheduled tick will retry")
def run(host: str, port: int, agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR) -> int:
    """Launch the agent app under uvicorn with mTLS and wait for it to exit.

    Returns:
        ``2`` when *agent_dir* holds no worker bundle (nothing is started);
        otherwise the uvicorn child's exit status, or ``0`` if its process
        group was already gone when we tried to signal it after Ctrl+C.
    """
    if pki.load_worker_bundle(agent_dir) is None:
        print(
            f"[agent] No cert bundle at {agent_dir}. "
            f"Run `decnet swarm enroll` from the master first.",
            file=sys.stderr,
        )
        return 2

    tls_args = [
        "--ssl-keyfile", str(agent_dir / "worker.key"),
        "--ssl-certfile", str(agent_dir / "worker.crt"),
        "--ssl-ca-certs", str(agent_dir / "ca.crt"),
        # 2 == ssl.CERT_REQUIRED — clients MUST present a CA-signed cert.
        "--ssl-cert-reqs", "2",
    ]
    uvicorn_cmd = [
        sys.executable, "-m", "uvicorn", "decnet.agent.app:app",
        "--host", host,
        "--port", str(port),
        *tls_args,
    ]
    log.info("agent starting host=%s port=%d bundle=%s", host, port, agent_dir)

    # New session => uvicorn leads its own process group, so killpg() on
    # its pid reaches it and any workers it spawned.
    child = subprocess.Popen(uvicorn_cmd, start_new_session=True)  # nosec B603
    try:
        return child.wait()
    except KeyboardInterrupt:
        pass

    # Ctrl+C: ask the group to stop, give it 10 s, then force-kill.
    try:
        os.killpg(child.pid, signal.SIGTERM)
        try:
            return child.wait(timeout=10)
        except subprocess.TimeoutExpired:
            os.killpg(child.pid, signal.SIGKILL)
            return child.wait()
    except ProcessLookupError:
        return 0
The master-side ``deploy_topology`` always calls +``transition_status(repo, …)`` which is useless (and unreachable) on +an agent — here we operate purely on a hydrated dict + the local +:class:`TopologyStore`. + +v1 constraint: one topology per agent. A second apply for a different +``topology_id`` triggers an on-the-spot teardown of the predecessor +before the new apply proceeds — master is authoritative. +""" +from __future__ import annotations + +import asyncio +import subprocess # nosec B404 +from typing import Any + +import docker + +from decnet.agent.topology_store import ( + TopologyStore, + observed, +) +from decnet.engine.deployer import ( + _compose, + _compose_with_retry, + _teardown_order, + _topology_compose_path, + _topology_compose_project, +) +from decnet.logging import get_logger +from decnet.network import create_bridge_network, remove_bridge_network +from decnet.topology.compose import ( + _network_name as _topology_network_name, + write_topology_compose, +) +from decnet.topology.hashing import canonical_hash +from decnet.topology.validate import ( + ValidationError, + errors as _validation_errors, + validate as _validate_topology, +) + +log = get_logger("agent.topology_ops") + + +class HashMismatch(RuntimeError): + """Raised when the master-provided version_hash doesn't match what we + hash locally — suggests serialisation drift. 
def _topology_id(hydrated: dict[str, Any]) -> str:
    """Return ``hydrated["topology"]["id"]`` as a string.

    Raises:
        ValueError: the ``topology`` entry is missing or empty, or its
            ``id`` is missing or falsy.
    """
    identifier = (hydrated.get("topology") or {}).get("id")
    if identifier:
        return str(identifier)
    raise ValueError("hydrated topology missing topology.id")
+ + Sync/blocking — callers must dispatch via asyncio.to_thread. + + ``--always-recreate-deps`` keeps service containers' netns shares + fresh: every decky service joins its base's netns via + ``network_mode: container:``, and that share is bound at + service start time. If a base is recreated (e.g. when ``ports:`` + changes after toggling ``forwards_l3``) but compose decides the + services are unchanged, the services keep a stale netns FD + pointing at the destroyed base — they end up in an empty + namespace with only ``lo``, and external traffic hits a closed + port on the live base. Forcing dependents to recreate alongside + the base is the cheapest way to make this race impossible. + """ + compose_path = _topology_compose_path(topology_id) + compose_project = _topology_compose_project(topology_id) + client = docker.from_env() + for lan in hydrated["lans"]: + net_name = _topology_network_name(topology_id, lan["name"]) + create_bridge_network(client, net_name, lan["subnet"], internal=not lan["is_dmz"]) + write_topology_compose(hydrated, compose_path) + _compose_with_retry( + "up", "--build", "-d", "--always-recreate-deps", + compose_file=compose_path, project=compose_project, + ) + + +async def apply( + hydrated: dict[str, Any], + version_hash: str, + store: TopologyStore, +) -> None: + """Materialise *hydrated* on this agent and record it in *store*. + + Raises: + HashMismatch: master and agent disagree on the canonical hash — + don't touch docker, fail the apply. + ValidationError: topology fails structural validation. + Any docker / compose error propagates up; the endpoint maps it + to 500 and records the message on the store row. 
def state(store: TopologyStore) -> dict[str, Any]:
    """Snapshot-plus-live-observation — the shape the heartbeat embeds.

    The four snapshot fields come from the store's current row and are
    all ``None`` when nothing is stored. ``observed`` holds the result of
    ``observed(docker.from_env())``, or ``{"error": <first 200 chars>}``
    when that call raises.
    """
    row = store.current()
    try:
        live = observed(docker.from_env())
    except Exception as exc:  # noqa: BLE001 — docker socket may be gone
        live = {"error": str(exc)[:200]}

    fields = ("topology_id", "applied_version_hash", "applied_at", "last_error")
    if row is None:
        snapshot: dict[str, Any] = dict.fromkeys(fields)
    else:
        snapshot = {field: getattr(row, field) for field in fields}
    snapshot["observed"] = live
    return snapshot
Whatever docker has after a restart + is what it has; the next heartbeat reports the truth and the + master decides whether to re-push. Same reason we don't sync + mutations from agent → master anywhere else: split-brain is worse + than temporary drift. +""" +from __future__ import annotations + +import json +import pathlib +import sqlite3 +import time +from dataclasses import dataclass +from typing import Any, Optional + + +class AlreadyApplied(RuntimeError): + """Raised when a different topology is already pinned to this agent.""" + + +@dataclass(frozen=True) +class AppliedRow: + topology_id: str + applied_version_hash: str + hydrated: dict[str, Any] + applied_at: int + last_error: Optional[str] + + +class TopologyStore: + """Single-row sqlite cache. Stdlib only, sync (called from endpoints).""" + + def __init__(self, db_path: pathlib.Path) -> None: + db_path.parent.mkdir(parents=True, exist_ok=True) + # check_same_thread=False: Starlette/FastAPI runs sync endpoint + # bodies on a worker thread distinct from where `app` is imported. + # The agent is single-process, so there's no real contention — + # sqlite's own connection lock is enough. 
+ self._conn = sqlite3.connect(str(db_path), check_same_thread=False) + self._conn.row_factory = sqlite3.Row + self._conn.execute( + "CREATE TABLE IF NOT EXISTS applied_topology (" + " topology_id TEXT PRIMARY KEY," + " applied_version_hash TEXT NOT NULL," + " hydrated_blob_json TEXT NOT NULL," + " applied_at INTEGER NOT NULL," + " last_error TEXT)" + ) + self._conn.commit() + + # ----------------------------------------------------------------- reads + + def current(self) -> Optional[AppliedRow]: + """Return the single applied topology, or ``None`` if idle.""" + row = self._conn.execute( + "SELECT topology_id, applied_version_hash, hydrated_blob_json," + " applied_at, last_error FROM applied_topology LIMIT 1" + ).fetchone() + if row is None: + return None + return AppliedRow( + topology_id=row["topology_id"], + applied_version_hash=row["applied_version_hash"], + hydrated=json.loads(row["hydrated_blob_json"]), + applied_at=int(row["applied_at"]), + last_error=row["last_error"], + ) + + # ---------------------------------------------------------------- writes + + def put( + self, + topology_id: str, + applied_version_hash: str, + hydrated: dict[str, Any], + ) -> None: + """Record an applied topology. + + If a *different* topology is already recorded, raises + :class:`AlreadyApplied`. Re-applying the same ``topology_id`` + just updates the hash + blob (idempotent re-push). 
+ """ + existing = self.current() + if existing is not None and existing.topology_id != topology_id: + raise AlreadyApplied( + f"agent already has topology {existing.topology_id!r}; " + f"cannot apply {topology_id!r}" + ) + self._conn.execute( + "INSERT INTO applied_topology" + " (topology_id, applied_version_hash, hydrated_blob_json," + " applied_at, last_error)" + " VALUES (?, ?, ?, ?, NULL)" + " ON CONFLICT(topology_id) DO UPDATE SET" + " applied_version_hash=excluded.applied_version_hash," + " hydrated_blob_json=excluded.hydrated_blob_json," + " applied_at=excluded.applied_at," + " last_error=NULL", + ( + topology_id, + applied_version_hash, + json.dumps(hydrated, sort_keys=True), + int(time.time()), + ), + ) + self._conn.commit() + + def record_error( + self, + topology_id: str, + message: str, + hydrated: Optional[dict[str, Any]] = None, + ) -> None: + """Attach a last-error message for *topology_id*. + + Upserts a marker row when no apply has yet succeeded for this + topology — that way a failure *during* the first materialise + (put() hasn't been reached) still surfaces via GET + /topology/state and the next heartbeat. The marker row uses an + empty ``applied_version_hash`` so master's heartbeat check sees + the hash mismatch and schedules a resync. + + If *hydrated* is provided it is stored so a later teardown can + still walk the LAN list — otherwise a partial deploy strands
+ """ + blob = json.dumps(hydrated, sort_keys=True) if hydrated else "{}" + self._conn.execute( + "INSERT INTO applied_topology" + " (topology_id, applied_version_hash, hydrated_blob_json," + " applied_at, last_error)" + " VALUES (?, '', ?, 0, ?)" + " ON CONFLICT(topology_id) DO UPDATE SET" + " last_error=excluded.last_error," + " hydrated_blob_json=CASE" + " WHEN applied_topology.hydrated_blob_json='{}'" + " THEN excluded.hydrated_blob_json" + " ELSE applied_topology.hydrated_blob_json END", + (topology_id, blob, message), + ) + self._conn.commit() + + def clear(self, topology_id: str) -> None: + """Remove the row for *topology_id* (post-teardown). + + No-op if the row doesn't exist — makes teardown idempotent. + """ + self._conn.execute( + "DELETE FROM applied_topology WHERE topology_id=?", + (topology_id,), + ) + self._conn.commit() + + def close(self) -> None: + self._conn.close() + + +# --------------------------------------------------- live docker observation + + +def observed(docker_client: Any) -> dict[str, Any]: + """Snapshot what docker is *actually* running on this agent. + + Returns a compact dict the heartbeat can ship so the master can + cross-check ``applied_version_hash`` against reality (a matching + hash with missing bridges is still drift). Best-effort: if docker + is unreachable we return an ``error`` marker rather than raising — + the agent still needs to heartbeat, and the master can treat + ``error`` as "unknown, re-push". 
+ """ + try: + bridges = [ + n.name + for n in docker_client.networks.list() + if n.attrs.get("Driver") == "bridge" + and n.name.startswith("decnet-topology-") + ] + containers = [ + c.name + for c in docker_client.containers.list(all=False) + if c.name.startswith("decnet-") + ] + return {"bridges": sorted(bridges), "containers": sorted(containers)} + except Exception as exc: # noqa: BLE001 — best-effort observation + return {"error": str(exc)[:200]} + + +__all__ = ["TopologyStore", "AppliedRow", "AlreadyApplied", "observed"] diff --git a/decnet/archetypes.py b/decnet/archetypes.py index e4145c8e..2ccc6830 100644 --- a/decnet/archetypes.py +++ b/decnet/archetypes.py @@ -1,3 +1,4 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later """ Machine archetype profiles for DECNET deckies. @@ -148,7 +149,7 @@ ARCHETYPES: dict[str, Archetype] = { slug="deaddeck", display_name="Deaddeck (Entry Point)", description="Internet-facing entry point with real interactive SSH — no honeypot emulation", - services=["real_ssh"], + services=["ssh"], preferred_distros=["debian", "ubuntu22"], nmap_os="linux", ), @@ -167,4 +168,4 @@ def all_archetypes() -> dict[str, Archetype]: def random_archetype() -> Archetype: - return random.choice(list(ARCHETYPES.values())) + return random.choice(list(ARCHETYPES.values())) # nosec B311 diff --git a/decnet/artifacts/__init__.py b/decnet/artifacts/__init__.py new file mode 100644 index 00000000..c37aa012 --- /dev/null +++ b/decnet/artifacts/__init__.py @@ -0,0 +1 @@ +"""Artifact storage helpers shared between the web router and TTP workers.""" diff --git a/decnet/artifacts/paths.py b/decnet/artifacts/paths.py new file mode 100644 index 00000000..4ef5f0b9 --- /dev/null +++ b/decnet/artifacts/paths.py @@ -0,0 +1,86 @@ +""" +Shared on-disk artifact path resolution. 
+ +Honeypot decoys (SSH, SMTP) farm captured payloads into a host-mounted +quarantine tree: + + /var/lib/decnet/artifacts/{decky}/{service}/{stored_as} + +Two callers need to translate ``(decky, stored_as, service)`` into a +concrete ``Path`` rooted under that tree: + +* The web router endpoint ``GET /api/v1/artifacts/{decky}/{stored_as}`` + (``decnet.web.router.artifacts.api_get_artifact``) — admin-gated + download for the dashboard. +* The TTP ``EmailLifter`` (``decnet.ttp.impl.email_lifter``), which + reads the stored ``.eml`` at tag-time so body-aware predicates + (R0047 BEC, R0048 macro) don't need raw body text on the bus. + +Both callers share the same validation rules and the same +defence-in-depth symlink-escape check; this module is the single +implementation. It is auth-agnostic — wrappers layer authentication +where appropriate (the router does ``require_admin``, the lifter does +not). +""" + +from __future__ import annotations + +import os +import re +from pathlib import Path + +# decky names come from the deployer — lowercase alnum plus hyphens. +_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$") + +# Services that own an artifacts subdir. Kept explicit so a caller +# can't pivot into arbitrary subpaths via a query string or bus payload. +_ALLOWED_SERVICES = frozenset({"ssh", "smtp"}) + +# stored_as is assembled by the capturing template as: +# ${ts}_${sha:0:12}_${base} +# where ts is ISO-8601 UTC (e.g. 2026-04-18T02:22:56Z), sha is 12 hex chars, +# and base is the original filename's basename. Keep the filename charset +# tight but allow common punctuation dropped files actually use. +_STORED_AS_RE = re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$" +) + +# Module-level so tests can monkeypatch. Override via env in production +# (the systemd unit sets this) — the prod path matches the bind mount +# declared in decnet/services/{ssh,smtp}.py. 
+ARTIFACTS_ROOT = Path( + os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts") +) + + +class ArtifactPathError(ValueError): + """Raised when (decky, stored_as, service) fails validation or escapes + the artifacts root. + + The router catches this and re-raises HTTPException(400). The lifter + catches it and treats the event as having no body available (no-tag). + """ + + +def resolve_artifact_path(decky: str, stored_as: str, service: str) -> Path: + """Validate inputs, resolve the on-disk path, and confirm it stays + inside the artifacts root. + + Raises :class:`ArtifactPathError` on any violation. Does NOT check + that the file exists — callers handle that distinctly (404 for the + router, no-tag for the lifter). + """ + if service not in _ALLOWED_SERVICES: + raise ArtifactPathError("invalid service") + if not _DECKY_RE.fullmatch(decky): + raise ArtifactPathError("invalid decky name") + if not _STORED_AS_RE.fullmatch(stored_as): + raise ArtifactPathError("invalid stored_as") + + root = ARTIFACTS_ROOT.resolve() + candidate = (root / decky / service / stored_as).resolve() + # defence-in-depth: even though the regexes reject `..`, make sure a + # symlink or weird filesystem state can't escape the root. + if root not in candidate.parents and candidate != root: + raise ArtifactPathError("path escapes artifacts root") + return candidate diff --git a/decnet/artifacts/shards.py b/decnet/artifacts/shards.py new file mode 100644 index 00000000..b8d06957 --- /dev/null +++ b/decnet/artifacts/shards.py @@ -0,0 +1,129 @@ +"""Shared asciinema shard helpers. + +Extracted from ``decnet/web/router/transcripts/api_get_transcript.py`` +so non-router callers (the BEHAVE-SHELL session-ended handler in +``decnet/profiler/worker.py``, the collector's session aggregator) +can resolve shard paths without crossing the layer boundary into the +FastAPI router. + +Functions here speak in :class:`ValueError` — callers that want HTTP +semantics translate at the boundary. 
The router wrappers keep their +existing ``HTTPException`` behaviour for backwards compatibility. + +PII boundary unchanged: shards live on disk; this module returns +:class:`pathlib.Path` pointers, never byte content. The ``_get_index`` +cache stores byte offsets only. +""" +from __future__ import annotations + +import os +import re +from collections import OrderedDict +from pathlib import Path + +ARTIFACTS_ROOT = Path( + os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"), +) + +_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$") +_SERVICE_RE = re.compile(r"^(ssh|telnet)$") +_SHARD_BASENAME_RE = re.compile(r"^sessions-\d{4}-\d{2}-\d{2}\.jsonl$") +_SID_LINE_RE = re.compile(rb'"sid"\s*:\s*"([a-f0-9-]{36})"') + +# (path, mtime_ns) → {sid: [(offset, length), ...]} +_INDEX_CACHE: "OrderedDict[tuple[str, int], dict[str, list[tuple[int, int]]]]" = ( + OrderedDict() +) +_CACHE_MAX = 32 + + +def validate_names(decky: str, service: str) -> None: + """Raise :class:`ValueError` if ``decky`` / ``service`` look forged.""" + if not _DECKY_RE.fullmatch(decky): + raise ValueError(f"invalid decky name: {decky!r}") + if not _SERVICE_RE.fullmatch(service): + raise ValueError(f"invalid service: {service!r}") + + +def resolve_shard(decky: str, service: str, shard_name: str) -> Path: + """Resolve ``ARTIFACTS_ROOT/{decky}/{service}/transcripts/{shard_name}`` + with escape-attempt detection. Raises :class:`ValueError` on + invalid inputs. 
+ """ + validate_names(decky, service) + if not _SHARD_BASENAME_RE.fullmatch(shard_name): + raise ValueError(f"invalid shard name: {shard_name!r}") + root = ARTIFACTS_ROOT.resolve() + candidate = (root / decky / service / "transcripts" / shard_name).resolve() + if root not in candidate.parents and candidate != root: + raise ValueError(f"path escapes artifacts root: {candidate}") + return candidate + + +def _build_index(path: Path) -> dict[str, list[tuple[int, int]]]: + index: dict[str, list[tuple[int, int]]] = {} + with path.open("rb") as f: + offset = 0 + for line in f: + length = len(line) + m = _SID_LINE_RE.search(line) + if m: + sid = m.group(1).decode("ascii") + index.setdefault(sid, []).append((offset, length)) + offset += length + return index + + +def get_index(path: Path) -> tuple[dict[str, list[tuple[int, int]]], int]: + """Return ``(sid → [(offset, length), …], file_size)``. + + Cached by ``(path, mtime_ns)``; rebuilt when the shard changes. + """ + st = path.stat() + key = (str(path), st.st_mtime_ns) + if key in _INDEX_CACHE: + _INDEX_CACHE.move_to_end(key) + return _INDEX_CACHE[key], st.st_size + index = _build_index(path) + _INDEX_CACHE[key] = index + _INDEX_CACHE.move_to_end(key) + while len(_INDEX_CACHE) > _CACHE_MAX: + _INDEX_CACHE.popitem(last=False) + return index, st.st_size + + +def find_shard_with_sid(decky: str, service: str, sid: str) -> Path | None: + """Scan every ``sessions-YYYY-MM-DD.jsonl`` under the decky's + transcripts dir until one claims this ``sid``. + + Newest shards first — most lookups are for recent sessions. Caches + the per-shard sid index, so repeated calls are ~free until the + shard's mtime changes. + + Returns ``None`` when nothing claims the sid OR when the + transcripts dir is missing / unreadable. Never raises on + filesystem-level errors — callers treat ``None`` as "skip". 
+ """ + validate_names(decky, service) + root = ARTIFACTS_ROOT.resolve() + transcripts_dir = (root / decky / service / "transcripts").resolve() + if root not in transcripts_dir.parents: + return None + try: + if not transcripts_dir.is_dir(): + return None + entries = list(transcripts_dir.iterdir()) + except (OSError, PermissionError): + return None + shards = sorted( + (p for p in entries if _SHARD_BASENAME_RE.fullmatch(p.name)), + reverse=True, + ) + for shard in shards: + try: + index, _size = get_index(shard) + except (OSError, PermissionError): + continue + if sid in index: + return shard + return None diff --git a/decnet/asn/__init__.py b/decnet/asn/__init__.py new file mode 100644 index 00000000..4dcd5fcc --- /dev/null +++ b/decnet/asn/__init__.py @@ -0,0 +1,93 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +IP-to-ASN enrichment — maps attacker IPs to BGP-announced AS numbers and +org names for attacker intelligence. + +Public surface mirrors :mod:`decnet.geoip` so callers can compose them: + +* :func:`get_lookup` — returns the singleton :class:`AsnLookup`. +* :func:`enrich_ip` — takes an IP string, returns + ``(asn_int, asn_name, bgp_prefix, provider_name)`` or ``(None, None, None, None)``. + +Provider selection goes through :func:`~decnet.asn.factory.get_provider` +(env ``DECNET_ASN_PROVIDER``, default ``iptoasn``). Direct imports of +concrete providers are forbidden — mirrors the ``get_bus`` / +``get_repository`` rule. +""" +from __future__ import annotations + +import os +import time +from typing import Optional, Tuple + +from decnet.asn.factory import get_provider +from decnet.asn.lookup import AsnLookup +from decnet.asn.paths import ASN_ROOT + +# 24 h — iptoasn refreshes daily. +REFRESH_INTERVAL_S = 86_400 + +_lookup: Optional[AsnLookup] = None +_provider_name: Optional[str] = None + + +def get_lookup(*, force_refresh: bool = False) -> AsnLookup: + """Return the cached :class:`AsnLookup`, building it on first use. 
+ + If the provider's data files are missing or older than + ``REFRESH_INTERVAL_S`` seconds, refresh before building. Pass + ``force_refresh=True`` to bypass the age check (used by a future + ``decnet asn refresh`` CLI command). + """ + global _lookup, _provider_name + provider = get_provider() + _provider_name = provider.name + + if force_refresh or _files_stale(provider): + provider.refresh() + _lookup = None # rebuild on next access + + if _lookup is None: + _lookup = provider.build_lookup() + return _lookup + + +def enrich_ip(ip: str) -> Tuple[Optional[int], Optional[str], Optional[str], Optional[str]]: + """Return ``(asn, as_name, bgp_prefix, provider_name)`` or ``(None, None, None, None)``. + + Never raises — any lookup failure collapses to all-None so the + caller (profiler) can upsert the attacker row regardless. + + ``DECNET_ASN_ENABLED=false`` short-circuits the whole path, useful + for tests / agent hosts / ops wanting to disable enrichment without + touching provider config. + """ + if os.environ.get("DECNET_ASN_ENABLED", "true").lower() == "false": + return (None, None, None, None) + try: + lookup = get_lookup() + info = lookup.asn(ip) + if info is None: + return (None, None, None, None) + return (info.asn, info.name or None, info.prefix, _provider_name or "unknown") + except Exception: + return (None, None, None, None) + + +def _files_stale(provider) -> bool: + """True when the provider has no fresh data on disk. + + Same semantics as :func:`decnet.geoip._files_stale`: a partial + cache still produces correct answers for the ranges it covers. 
+ """ + paths = provider.data_paths() + if not paths: + return True + now = time.time() + for p in paths: + if p.exists() and now - p.stat().st_mtime <= REFRESH_INTERVAL_S: + return False + return True + + +__all__ = ["get_lookup", "enrich_ip", "ASN_ROOT", "REFRESH_INTERVAL_S"] diff --git a/decnet/asn/base.py b/decnet/asn/base.py new file mode 100644 index 00000000..8cbc14e3 --- /dev/null +++ b/decnet/asn/base.py @@ -0,0 +1,34 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""ASN provider protocol — mirror of :mod:`decnet.geoip.base`. + +Concrete providers (e.g. :mod:`decnet.asn.iptoasn`) implement this. +Callers must go through :func:`decnet.asn.factory.get_provider`; never +import a concrete provider class directly. +""" +from __future__ import annotations + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Sequence + +from decnet.asn.lookup import AsnLookup + + +class Provider(ABC): + """Abstract IP→ASN data provider.""" + + #: Short tag written to ``Attacker.asn_source`` (e.g. ``'iptoasn'``). + name: str + + @abstractmethod + def refresh(self) -> None: + """Download / regenerate the provider's raw data files.""" + + @abstractmethod + def build_lookup(self) -> AsnLookup: + """Parse the on-disk data files and return a ready-to-query lookup.""" + + @abstractmethod + def data_paths(self) -> Sequence[Path]: + """Return the list of files this provider manages — used for staleness + detection. Order is not significant.""" diff --git a/decnet/asn/factory.py b/decnet/asn/factory.py new file mode 100644 index 00000000..eb156b49 --- /dev/null +++ b/decnet/asn/factory.py @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""ASN provider factory — mirror of :mod:`decnet.geoip.factory`. + +Dispatch key: ``DECNET_ASN_PROVIDER`` (default ``iptoasn``). Lazy +singleton. 
+""" +from __future__ import annotations + +import os +from typing import Optional + +from decnet.asn.base import Provider + +_cached: Optional[Provider] = None +_cached_key: Optional[str] = None + + +def get_provider() -> Provider: + """Return the configured :class:`Provider` singleton.""" + global _cached, _cached_key + key = os.environ.get("DECNET_ASN_PROVIDER", "iptoasn").lower() + if _cached is not None and _cached_key == key: + return _cached + + if key == "iptoasn": + from decnet.asn.iptoasn.provider import IptoasnProvider + provider: Provider = IptoasnProvider() + else: + raise ValueError(f"Unsupported ASN provider: {key!r}") + + _cached = provider + _cached_key = key + return provider + + +def reset_cache() -> None: + """Forget the singleton — tests swap providers via the env var.""" + global _cached, _cached_key + _cached = None + _cached_key = None diff --git a/decnet/asn/iptoasn/__init__.py b/decnet/asn/iptoasn/__init__.py new file mode 100644 index 00000000..163b5596 --- /dev/null +++ b/decnet/asn/iptoasn/__init__.py @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""iptoasn.com IP→ASN provider. + +Daily-refreshed gzipped TSV dump of the global BGP table, derived from +RIPE RIS. Released into the public domain by upstream — no attribution +required, no UA mandate, no terms to violate. + +Direct imports of :class:`IptoasnProvider` are discouraged — go through +:func:`decnet.asn.factory.get_provider`. +""" diff --git a/decnet/asn/iptoasn/fetch.py b/decnet/asn/iptoasn/fetch.py new file mode 100644 index 00000000..b4e05ace --- /dev/null +++ b/decnet/asn/iptoasn/fetch.py @@ -0,0 +1,64 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""iptoasn.com bulk dump download. + +One file: ``ip2asn-v4.tsv.gz``, ~5 MB compressed, refreshed daily. +Pulled over HTTPS with the same generic UA the geoip RIR fetcher uses +(stealth: never identify as DECNET — public-data scrapers correlated to +honeypot operator egress is the threat model). 
+""" +from __future__ import annotations + +import logging +import shutil +import urllib.request +from pathlib import Path +from typing import Tuple + +logger = logging.getLogger("decnet.asn.iptoasn.fetch") + +# Mirror the (name, url) tuple shape of geoip.rir.fetch so test +# harnesses can swap one for the other. +IPTOASN_SOURCES: Tuple[Tuple[str, str], ...] = ( + ("ip2asn-v4", "https://iptoasn.com/data/ip2asn-v4.tsv.gz"), +) + +# Generic UA — matches geoip.rir.fetch. iptoasn.com explicitly releases +# the data into the public domain and does NOT require an identifying UA, +# so we keep DECNET stealth instead of advertising. +_USER_AGENT = "Mozilla/5.0 (compatible; fetch/1.0)" +_TIMEOUT_S = 60 + + +def fetch_all(dest: Path) -> list[Path]: + """Download every iptoasn file into *dest*. Returns the written paths. + + Atomic per file: download to ``{name}.tsv.gz.tmp`` then rename. A + partial failure leaves the previous generation intact. + """ + dest.mkdir(parents=True, exist_ok=True) + written: list[Path] = [] + for name, url in IPTOASN_SOURCES: + target = dest / f"{name}.tsv.gz" + tmp = target.with_suffix(".gz.tmp") + try: + _download(url, tmp) + tmp.replace(target) + written.append(target) + logger.info( + "asn.iptoasn: fetched %s (%d bytes)", + name, target.stat().st_size, + ) + except Exception as exc: + logger.error( + "asn.iptoasn: fetch failed for %s (%s): %s", name, url, exc + ) + if tmp.exists(): + tmp.unlink(missing_ok=True) + # Keep any stale previous file — better outdated than empty. 
+ return written + + +def _download(url: str, dest: Path) -> None: + req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT}) + with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp, dest.open("wb") as fh: # nosec B310 — fixed https iptoasn URL + shutil.copyfileobj(resp, fh) diff --git a/decnet/asn/iptoasn/parse.py b/decnet/asn/iptoasn/parse.py new file mode 100644 index 00000000..468ab0ed --- /dev/null +++ b/decnet/asn/iptoasn/parse.py @@ -0,0 +1,79 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Parser for the iptoasn.com ``ip2asn-v4.tsv`` dump. + +Line shape (gzipped, one row per BGP-announced prefix):: + + 1.0.0.0\\t1.0.0.255\\t13335\\tUS\\tCLOUDFLARENET + +Fields: ``range_start``, ``range_end``, ``as_number``, ``country_code``, +``as_description``. Both range columns are dotted IPv4 strings (the dump +is IPv4-only — there's a separate ``ip2asn-v6.tsv.gz`` we don't pull). + +Rows skipped: + +* ``as_number == 0`` — iptoasn's sentinel for "unannounced" / private + / reserved space. Country may still be present (``"None"`` / two-letter + CC) but we don't care: the geoip module owns country, ASN owns BGP. +* Rows where either range column won't parse as IPv4. +* Rows with fewer than 3 tab-separated columns. +""" +from __future__ import annotations + +import gzip +import ipaddress +import logging +from pathlib import Path +from typing import Iterator + +from decnet.asn.lookup import AsnInfo, Range + +logger = logging.getLogger("decnet.asn.iptoasn.parse") + + +def parse_file(path: Path) -> Iterator[Range]: + """Yield ``(start_int, end_int_inclusive, AsnInfo)`` for every BGP row. + + Accepts a gzipped path (``*.tsv.gz``); plain TSV is also fine for + test harnesses that hand-craft small fixtures. 
+ """ + opener = gzip.open if path.suffix == ".gz" else open + with opener(path, "rt", encoding="utf-8", errors="replace") as fh: + for lineno, raw in enumerate(fh, 1): + line = raw.rstrip("\n") + if not line: + continue + parts = line.split("\t") + if len(parts) < 3: + continue + start_s, end_s, asn_s = parts[0], parts[1], parts[2] + # Description is the 5th column; iptoasn quotes nothing, + # but the field can contain stray whitespace. ``""`` when + # missing or unknown. + name = parts[4].strip() if len(parts) >= 5 else "" + + try: + asn = int(asn_s) + except ValueError: + logger.debug( + "asn.iptoasn: skipping malformed asn line %d in %s", + lineno, path.name, + ) + continue + # ASN 0 is iptoasn's sentinel for unannounced / private / + # reserved space. Skip — there's no useful enrichment to attach. + if asn == 0: + continue + + try: + start_int = int(ipaddress.IPv4Address(start_s)) + end_int = int(ipaddress.IPv4Address(end_s)) + except (ValueError, ipaddress.AddressValueError): + logger.debug( + "asn.iptoasn: skipping malformed addr line %d in %s", + lineno, path.name, + ) + continue + if end_int < start_int: + continue + + yield (start_int, end_int, AsnInfo(asn=asn, name=name)) diff --git a/decnet/asn/iptoasn/provider.py b/decnet/asn/iptoasn/provider.py new file mode 100644 index 00000000..f2323ea0 --- /dev/null +++ b/decnet/asn/iptoasn/provider.py @@ -0,0 +1,84 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""iptoasn provider — orchestrates fetch + parse into an :class:`AsnLookup`. + +Mirrors :class:`decnet.geoip.rir.provider.RirProvider` exactly: fetch, +build a pickled cache, invalidate when raw files are newer than the +cache.
+""" +from __future__ import annotations + +import logging +from pathlib import Path +from typing import Sequence + +from decnet.asn.base import Provider +from decnet.asn.iptoasn.fetch import IPTOASN_SOURCES, fetch_all +from decnet.asn.iptoasn.parse import parse_file +from decnet.asn.lookup import AsnLookup, Range +from decnet.asn.paths import ensure_root + +logger = logging.getLogger("decnet.asn.iptoasn.provider") + +# Pickled lookup cache — skips re-parsing the ~580k-row gz dump on every +# profiler restart. Rebuilt whenever any raw file is newer than the +# cache, see ``_cache_fresh``. +_CACHE_NAME = ".iptoasn_index.pkl" + + +class IptoasnProvider(Provider): + name = "iptoasn" + + def __init__(self) -> None: + self._root = ensure_root() + + # ---------- Provider interface ---------- + + def refresh(self) -> None: + logger.info("asn.iptoasn: refreshing dump into %s", self._root) + fetch_all(self._root) + cache = self._root / _CACHE_NAME + if cache.exists(): + cache.unlink(missing_ok=True) + + def build_lookup(self) -> AsnLookup: + cache = self._root / _CACHE_NAME + if self._cache_fresh(cache): + try: + lookup = AsnLookup.load(cache) + logger.debug( + "asn.iptoasn: loaded cached index (%d ranges)", + len(lookup), + ) + return lookup + except Exception as exc: + logger.warning( + "asn.iptoasn: cache load failed, rebuilding: %s", exc + ) + + ranges: list[Range] = [] + for path in self.data_paths(): + if not path.exists(): + continue + ranges.extend(parse_file(path)) + lookup = AsnLookup.from_ranges(ranges) + try: + lookup.save(cache) + except Exception as exc: + logger.warning("asn.iptoasn: cache save failed: %s", exc) + logger.info("asn.iptoasn: built index with %d ranges", len(lookup)) + return lookup + + def data_paths(self) -> Sequence[Path]: + return [self._root / f"{name}.tsv.gz" for name, _url in IPTOASN_SOURCES] + + # ---------- internals ---------- + + def _cache_fresh(self, cache: Path) -> bool: + """True when the pickle exists and is at least as new as 
@dataclass(frozen=True)
class AsnInfo:
    """One BGP-announced prefix's origin metadata."""

    asn: int
    name: str  # AS description / org name; "" if absent in the source data
    prefix: Optional[str] = None  # synthesized covering CIDR; set at lookup time, not at rest


Range = Tuple[int, int, AsnInfo]


def _synthesize_prefix(start_int: int, end_int: int, queried_int: int) -> Optional[str]:
    """Return the CIDR block of ``[start_int, end_int]`` that holds *queried_int*.

    The inclusive range is decomposed with
    :func:`ipaddress.summarize_address_range` and the first block that
    contains the queried address is returned as a string. ``None`` is
    returned when no block contains it, or when the range cannot be
    summarized (for example ``start_int > end_int``).
    """
    try:
        for net in ipaddress.summarize_address_range(
            ipaddress.IPv4Address(start_int), ipaddress.IPv4Address(end_int)
        ):
            if int(net.network_address) <= queried_int <= int(net.broadcast_address):
                return str(net)
    except (ValueError, TypeError):
        pass
    return None


@dataclass
class AsnLookup:
    """Indexed AS lookup over IPv4 ranges."""

    # Parallel arrays for bisect: _starts[i] is the start-IP of the i-th
    # range, _ends[i] its inclusive end, _infos[i] its AsnInfo.
    _starts: List[int]
    _ends: List[int]
    _infos: List[AsnInfo]

    @classmethod
    def from_ranges(cls, ranges: Iterable[Range]) -> "AsnLookup":
        """Build a lookup from ``(start, end_inclusive, AsnInfo)`` triples.

        Ranges are sorted by ``(start, end)``. When several ranges share
        the same start only one entry is kept: the one with the largest
        end and, among equal ends, the one supplied last (the sort is
        stable and later entries overwrite earlier ones).
        """
        sorted_ranges = sorted(ranges, key=lambda r: (r[0], r[1]))
        starts: List[int] = []
        ends: List[int] = []
        infos: List[AsnInfo] = []
        for start, end, info in sorted_ranges:
            if starts and starts[-1] == start:
                # Same start as the previous entry: overwrite it in place.
                ends[-1] = end
                infos[-1] = info
                continue
            starts.append(start)
            ends.append(end)
            infos.append(info)
        return cls(starts, ends, infos)

    def asn(self, ip: str) -> Optional[AsnInfo]:
        """Return the :class:`AsnInfo` for ``ip`` or ``None``.

        ``None`` on: IPv6, private/loopback/link-local/multicast/reserved/
        unspecified addresses, malformed strings, and IPs that fall
        outside every indexed range. On a hit, a fresh ``AsnInfo`` is
        returned whose ``prefix`` is synthesized from the matching range.
        """
        try:
            addr = ipaddress.ip_address(ip)
        except ValueError:
            return None
        if isinstance(addr, ipaddress.IPv6Address):
            return None
        if (
            addr.is_private
            or addr.is_loopback
            or addr.is_link_local
            or addr.is_multicast
            or addr.is_reserved
            or addr.is_unspecified
        ):
            return None

        n = int(addr)
        # Rightmost range whose start is <= n.
        idx = bisect.bisect_right(self._starts, n) - 1
        if idx < 0:
            return None
        if n <= self._ends[idx]:
            info = self._infos[idx]
            prefix = _synthesize_prefix(self._starts[idx], self._ends[idx], n)
            return AsnInfo(asn=info.asn, name=info.name, prefix=prefix)
        return None

    def __len__(self) -> int:
        """Return the number of indexed ranges."""
        return len(self._starts)

    # ---------- persistence ----------

    def save(self, path: Path) -> None:
        """Pickle the lookup to *path* via a temp file and atomic rename.

        The payload is written to ``<path>.tmp`` first and then renamed
        over *path*. If writing or renaming fails, the temp file is
        removed before the exception is re-raised, so a failed save never
        leaves a stale ``.tmp`` file next to the cache.
        """
        tmp = path.with_suffix(path.suffix + ".tmp")
        tmp.parent.mkdir(parents=True, exist_ok=True)
        try:
            with tmp.open("wb") as fh:
                pickle.dump(
                    {
                        "version": 1,
                        "starts": self._starts,
                        "ends": self._ends,
                        "infos": [(i.asn, i.name) for i in self._infos],
                    },
                    fh,
                    protocol=pickle.HIGHEST_PROTOCOL,
                )
            tmp.replace(path)
        except BaseException:
            # Don't leave a half-written temp file behind.
            tmp.unlink(missing_ok=True)
            raise

    @classmethod
    def load(cls, path: Path) -> "AsnLookup":
        """Load a pickled lookup from *path*.

        NOTE: this unpickles the file, so *path* must only ever point at
        a cache written by :meth:`save`; never feed it untrusted data.

        Raises:
            ValueError: the payload is not a dict, carries an unsupported
                ``version``, or its parallel arrays differ in length.
        """
        with path.open("rb") as fh:
            data = pickle.load(fh)  # nosec B301 — self-produced file under /var/lib/decnet
        if not isinstance(data, dict):
            raise ValueError(
                f"asn-lookup index is not a mapping: {type(data).__name__}"
            )
        if data.get("version") != 1:
            raise ValueError(
                f"unsupported asn-lookup index version: {data.get('version')!r}"
            )
        starts, ends = data["starts"], data["ends"]
        infos = [AsnInfo(asn=a, name=n) for a, n in data["infos"]]
        # asn() indexes the three arrays in parallel; refuse a truncated
        # or otherwise inconsistent cache up front.
        if not (len(starts) == len(ends) == len(infos)):
            raise ValueError(
                "asn-lookup index arrays differ in length: "
                f"{len(starts)}/{len(ends)}/{len(infos)}"
            )
        return cls(starts, ends, infos)
def ensure_root() -> Path:
    """Make sure the ``ASN_ROOT`` directory exists and hand it back.

    Missing parent directories are created along the way; a directory
    that already exists is left untouched.
    """
    root = ASN_ROOT
    root.mkdir(parents=True, exist_ok=True)
    return root
async def get_app_bus() -> BaseBus | None:
    """Return the shared, connected bus, or ``None`` when it is unavailable.

    The first successful call builds a client with :func:`get_bus`,
    awaits ``connect()`` and caches the instance; later calls return the
    cached object. When connecting raises, the failure time is recorded
    and every call made within ``_RETRY_BACKOFF`` seconds of it returns
    ``None`` without trying again. Once that window has elapsed the next
    call attempts a fresh connect.
    """
    global _shared, _last_failure_ts

    # Lock-free fast paths: already connected, or still backing off.
    if _shared is not None:
        return _shared
    since_failure = time.monotonic() - _last_failure_ts
    if since_failure < _RETRY_BACKOFF:
        return None

    async with _lock:
        # Re-check under the lock: another task may have connected (or
        # failed) while this one was waiting.
        if _shared is not None:
            return _shared
        since_failure = time.monotonic() - _last_failure_ts
        if since_failure < _RETRY_BACKOFF:
            return None
        try:
            fresh = get_bus(client_name="api")
            await fresh.connect()
        except Exception as exc:  # noqa: BLE001
            log.warning("app bus unavailable: %s", exc)
            _last_failure_ts = time.monotonic()
            return None
        _shared = fresh
        _last_failure_ts = 0.0
        return fresh
EVENT_SCHEMA_VERSION = 1


@dataclass(frozen=True)
class Event:
    """Immutable envelope carried by every bus message.

    ``topic`` and ``payload`` are required. ``type`` is an optional
    short discriminator that defaults to ``""``. ``v`` is the envelope
    schema version and defaults to ``EVENT_SCHEMA_VERSION``. ``ts``
    defaults to the current ``time.time()`` value and ``id`` to a random
    UUID4 hex string, so each freshly built event is stamped and
    uniquely identifiable.
    """

    topic: str
    payload: dict[str, Any]
    type: str = ""
    v: int = EVENT_SCHEMA_VERSION
    ts: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex)

    def to_dict(self) -> dict[str, Any]:
        """Return the wire-format dict for this event."""
        return dict(
            v=self.v,
            id=self.id,
            topic=self.topic,
            type=self.type,
            ts=self.ts,
            payload=self.payload,
        )

    @classmethod
    def from_dict(cls, topic: str, data: dict[str, Any]) -> "Event":
        """Rebuild an event from a wire-format dict.

        *topic* always comes from the caller; any ``topic`` key inside
        *data* is ignored. Missing or falsy ``payload``, ``type`` and
        ``id`` fall back to ``{}``, ``""`` and a new UUID4 hex string.
        A missing ``v`` falls back to ``EVENT_SCHEMA_VERSION`` and a
        missing ``ts`` to the current time; present values are coerced
        with ``int()`` / ``float()``.
        """
        wire_version = data.get("v", EVENT_SCHEMA_VERSION)
        wire_ts = data.get("ts", time.time())
        return cls(
            topic=topic,
            payload=data.get("payload") or {},
            type=data.get("type") or "",
            v=int(wire_version),
            ts=float(wire_ts),
            id=data.get("id") or uuid.uuid4().hex,
        )
def matches(pattern: str, topic: str) -> bool:
    """Tell whether *topic* is selected by the dot-separated *pattern*.

    Both strings are split on ``"."`` and compared token by token:

    * ``*`` accepts exactly one non-empty token;
    * ``>`` accepts everything from its position onward, provided at
      least one topic token is left there (``topology.>`` selects
      ``topology.abc.x`` but not ``topology``);
    * any other pattern token must equal the topic token.

    Without a ``>``, both sides must have the same number of tokens.
    """
    wanted = pattern.split(".")
    given = topic.split(".")
    given_count = len(given)

    for position, want in enumerate(wanted):
        if want == ">":
            # Tail wildcard: needs at least one token at this position.
            return position < given_count
        if position >= given_count:
            return False
        have = given[position]
        if want == "*":
            if have:
                continue
            return False
        if want != have:
            return False
    return len(wanted) == given_count
def get_bus(**kwargs: Any) -> BaseBus:
    """Build the bus transport chosen by the environment.

    ``DECNET_BUS_ENABLED`` set to ``"false"`` (any letter case) always
    yields a ``NullBus`` and ignores *kwargs*. Otherwise
    ``DECNET_BUS_TYPE`` (default ``"unix"``, compared lower-cased) picks
    the transport:

    * ``unix``: ``UnixSocketBus``; a ``socket_path`` keyword overrides
      the default path, and remaining keywords are forwarded.
    * ``fake``: ``FakeBus``; all keywords are forwarded.

    The transport is passed through ``_maybe_wrap_telemetry`` before it
    is returned.

    Raises:
        ValueError: ``DECNET_BUS_TYPE`` names an unknown transport.
    """
    env = os.environ
    if env.get("DECNET_BUS_ENABLED", "true").lower() == "false":
        from decnet.bus.fake import NullBus
        return NullBus()

    selected = env.get("DECNET_BUS_TYPE", "unix").lower()
    if selected not in ("unix", "fake"):
        raise ValueError(f"Unsupported bus type: {selected}")

    transport: BaseBus
    if selected == "fake":
        from decnet.bus.fake import FakeBus
        transport = FakeBus(**kwargs)
    else:
        from decnet.bus.unix_client import UnixSocketBus
        # An explicit (truthy) socket_path wins over the computed default.
        path = kwargs.pop("socket_path", None) or _default_socket_path()
        transport = UnixSocketBus(socket_path=path, **kwargs)

    return _maybe_wrap_telemetry(transport)
+ """ + try: + from decnet.telemetry import wrap_repository + except ImportError: + return bus + try: + return cast(BaseBus, wrap_repository(bus)) + except Exception: # pragma: no cover - defensive + return bus diff --git a/decnet/bus/fake.py b/decnet/bus/fake.py new file mode 100644 index 00000000..1766b524 --- /dev/null +++ b/decnet/bus/fake.py @@ -0,0 +1,184 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""In-process bus transports. + +* :class:`FakeBus` — real pub/sub semantics without touching a socket. Used + by unit tests and anywhere ``DECNET_BUS_TYPE=fake`` is set. Lets code + that depends on the bus be exercised entirely inside a single event loop, + matching the DECNET testing convention of not opening real network + sockets from unit tests. +* :class:`NullBus` — no-op. Returned by :func:`~decnet.bus.factory.get_bus` + when ``DECNET_BUS_ENABLED=false`` so workers can start cleanly in dev + environments where no bus daemon is running. Publishes are dropped; + subscriptions yield nothing and close cleanly. +""" +from __future__ import annotations + +import asyncio +from typing import Any, cast + +from decnet.bus.base import ( + BaseBus, + Event, + Subscription, + _CLOSE_SENTINEL, + matches, +) +from decnet.logging import get_logger + +log = get_logger("bus.fake") + +# Per-subscriber bounded queue: backpressure policy is drop-oldest so a slow +# consumer cannot stall publishers (the invariant — DB is the source of +# truth — makes dropped events acceptable). +_DEFAULT_QUEUE_SIZE = 1024 + + +# ─── FakeBus ───────────────────────────────────────────────────────────────── + + +class _FakeSubscription(Subscription): + """Subscription backed by an :class:`asyncio.Queue` fed from + :meth:`FakeBus.publish`. 
class FakeBus(BaseBus):
    """In-process pub/sub.

    Publishing walks the active subscriptions and enqueues the event on
    every one whose pattern matches the topic. A subscriber whose queue
    is full loses its oldest buffered item to make room, so a slow
    consumer never blocks a publisher.
    """

    def __init__(self, queue_size: int = _DEFAULT_QUEUE_SIZE) -> None:
        """Create a bus whose per-subscriber queues hold *queue_size* items."""
        self._queue_size = queue_size
        self._subs: list[_FakeSubscription] = []
        self._connected = False
        self._closed = False
        self._lock = asyncio.Lock()

    async def connect(self) -> None:
        """Mark the bus as connected; there is no transport to open."""
        self._connected = True

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Deliver a new event to every subscription matching *topic*.

        Raises:
            RuntimeError: the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("publish on closed bus")
        event = Event(topic=topic, payload=payload, type=event_type)
        async with self._lock:
            targets = [s for s in self._subs if matches(s.pattern, topic)]
        for sub in targets:
            _enqueue_drop_oldest(sub._queue, event)

    def subscribe(self, pattern: str) -> Subscription:
        """Register and return a subscription for *pattern*.

        Raises:
            RuntimeError: the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("subscribe on closed bus")
        queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=self._queue_size)
        sub = _FakeSubscription(self, pattern, queue)
        self._subs.append(sub)
        return sub

    def _unregister(self, sub: _FakeSubscription) -> None:
        """Forget *sub*; a subscription that is not registered is ignored."""
        try:
            self._subs.remove(sub)
        except ValueError:
            pass

    async def close(self) -> None:
        """Close the bus and wake every open subscription. Idempotent.

        Each subscriber queue receives the close sentinel so its
        iterator ends with ``StopAsyncIteration``. The sentinel must not
        be lost: a subscriber with a full queue would otherwise drain
        its backlog and then wait forever on an empty queue. When the
        queue is full, the oldest buffered item is dropped to make room,
        the same policy publishing uses.
        """
        if self._closed:
            return
        self._closed = True
        for sub in list(self._subs):
            _enqueue_drop_oldest(sub._queue, _CLOSE_SENTINEL)
        self._subs.clear()
+ """ + + async def connect(self) -> None: + return + + async def publish( + self, + topic: str, + payload: dict[str, Any], + *, + event_type: str = "", + ) -> None: + return + + def subscribe(self, pattern: str) -> Subscription: + return _NullSubscription(pattern) + + async def close(self) -> None: + return diff --git a/decnet/bus/protocol.py b/decnet/bus/protocol.py new file mode 100644 index 00000000..2ff6e68c --- /dev/null +++ b/decnet/bus/protocol.py @@ -0,0 +1,145 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Wire protocol for the DECNET bus UNIX-socket transport. + +Frame layout: + + []\\n # ASCII header, single line, no trailing space + <4-byte big-endian body length> + # orjson-serialized dict, or empty (length 0) + +Verbs: + +* ``HELLO `` — optional greeting, logged by server. Body empty. +* ``PUB `` — publisher → server. Body = payload dict. +* ``SUB `` — subscriber → server. Body empty. +* ``UNSUB `` — subscriber → server. Body empty. +* ``EVT `` — server → subscriber. Body = payload dict (wrapped + in an :class:`~decnet.bus.base.Event` envelope). +* ``BYE`` — either direction. Body empty. Graceful shutdown. + +Parsing rules: + +* The header is a single line terminated by ``\\n`` (LF). ``\\r`` is tolerated + but not required. +* Header tokens are whitespace-separated. The first token is the verb; + everything after is verb-specific. We split on the first space only so + topics / patterns with quoted content are not supported (they are not + needed — topic segments forbid whitespace per :mod:`decnet.bus.topics`). +* Maximum header length is 4096 bytes; maximum body length is 1 MiB. Beyond + those, the connection is dropped with a logged error. This is a honeypot + framework, not a general-purpose message broker; a malformed frame is + treated as hostile. 
MAX_HEADER_BYTES = 4096
MAX_BODY_BYTES = 1 * 1024 * 1024  # 1 MiB

# Verb constants (callers should reference these, not bare strings).
HELLO = "HELLO"
PUB = "PUB"
SUB = "SUB"
UNSUB = "UNSUB"
EVT = "EVT"
BYE = "BYE"

_VALID_VERBS = frozenset({HELLO, PUB, SUB, UNSUB, EVT, BYE})


class ProtocolError(Exception):
    """Malformed or oversized frame. Callers should close the connection."""


@dataclass(frozen=True)
class Frame:
    """A parsed frame. ``body`` is the raw (unparsed) body bytes — callers
    decide whether to decode it (the protocol does not know whether a
    given verb expects a dict body or an empty one).
    """

    verb: str
    args: str  # everything after the verb on the header line, trimmed
    body: bytes


async def read_frame(reader: asyncio.StreamReader) -> Frame | None:
    """Read one frame from *reader*.

    A frame is an ASCII header line ending in ``\\n`` (a preceding
    ``\\r`` is tolerated), a 4-byte big-endian body length, and that
    many body bytes. The header is split on the first space into the
    verb and its trimmed arguments.

    Returns:
        The parsed :class:`Frame`, or ``None`` on clean EOF before a new
        frame starts.

    Raises:
        ProtocolError: the stream ended mid-header, the header is empty,
            oversized or not valid ASCII, the verb is unknown, or the
            declared body length exceeds ``MAX_BODY_BYTES``.
        asyncio.IncompleteReadError: the stream ended after the header
            but before the length prefix or body was complete.
    """
    try:
        header = await reader.readuntil(b"\n")
    except asyncio.IncompleteReadError as exc:
        if not exc.partial:
            return None
        raise ProtocolError("connection closed mid-header") from exc
    except asyncio.LimitOverrunError as exc:
        raise ProtocolError("header exceeded buffer limit") from exc

    if len(header) > MAX_HEADER_BYTES:
        raise ProtocolError(f"header {len(header)} bytes exceeds max")

    try:
        line = header.rstrip(b"\r\n").decode("ascii", errors="strict")
    except UnicodeDecodeError as exc:
        # Report a non-ASCII header through the one exception type that
        # callers are told to handle for malformed frames.
        raise ProtocolError("header is not valid ASCII") from exc
    if not line:
        raise ProtocolError("empty header line")

    verb, _, args = line.partition(" ")
    if verb not in _VALID_VERBS:
        raise ProtocolError(f"unknown verb {verb!r}")

    length_bytes = await reader.readexactly(4)
    (body_len,) = struct.unpack(">I", length_bytes)
    if body_len > MAX_BODY_BYTES:
        raise ProtocolError(f"body length {body_len} exceeds max")

    body = await reader.readexactly(body_len) if body_len else b""
    return Frame(verb=verb, args=args.strip(), body=body)
async def publish_safely(
    bus: BaseBus | None,
    topic: str,
    payload: dict[str, Any],
    event_type: str = "",
) -> None:
    """Publish on *bus*, never letting a failure reach the caller.

    A ``None`` bus makes this a no-op. Otherwise the publish is awaited
    and any ``Exception`` it raises is logged as a warning and
    swallowed, so a worker that notifies through the bus cannot be
    taken down by it.
    """
    if bus is not None:
        try:
            await bus.publish(topic, payload, event_type=event_type)
        except Exception as exc:  # noqa: BLE001
            log.warning("bus publish failed topic=%s: %s", topic, exc)
async def run_health_heartbeat(
    bus: BaseBus | None,
    worker: str,
    *,
    interval: float = 30.0,
    extra: Callable[[], dict[str, Any]] | None = None,
) -> None:
    """Publish a health heartbeat for *worker* every *interval* seconds.

    The topic is computed once via ``_topics.system_health(worker)``.
    Each tick publishes ``{"worker": worker, "ts": time.time()}`` merged
    with whatever ``extra()`` returns, when *extra* is given. A failing
    ``extra()`` is logged at debug level and the heartbeat goes out
    without the extra keys. Publishing goes through
    :func:`publish_safely`, which treats a ``None`` bus as a no-op, so
    the loop then only sleeps.

    Runs until cancelled; the ``CancelledError`` is absorbed so awaiting
    the cancelled task ends without an exception.
    """
    topic = _topics.system_health(worker)
    try:
        while True:
            beat: dict[str, Any] = {"worker": worker, "ts": time.time()}
            if extra is not None:
                try:
                    beat.update(extra())
                except Exception as exc:  # noqa: BLE001
                    log.debug("heartbeat extra() failed worker=%s: %s", worker, exc)
            await publish_safely(bus, topic, beat, event_type=_topics.SYSTEM_HEALTH)
            await asyncio.sleep(interval)
    except asyncio.CancelledError:
        # Cancellation is the normal way to stop this loop.
        return
+ """ + if bus is None: + with contextlib.suppress(asyncio.CancelledError): + await shutdown.wait() + return + + topic = _topics.system_control(worker) + with contextlib.suppress(asyncio.CancelledError): + try: + async with bus.subscribe(topic) as sub: + async for event in sub: + payload = event.payload or {} + action = payload.get("action") + requested_by = payload.get("requested_by", "") + if action == _topics.WORKER_CONTROL_STOP: + log.info( + "control: stop requested worker=%s by=%s", + worker, requested_by, + ) + shutdown.set() + return + log.debug( + "control: ignoring unknown action worker=%s action=%r", + worker, action, + ) + except Exception as exc: # noqa: BLE001 + log.warning( + "control listener failed worker=%s: %s — shutdown via bus disabled", + worker, exc, + ) + + +async def run_control_listener_signal( + bus: BaseBus | None, + worker: str, +) -> None: + """Like :func:`run_control_listener` but signals the process on stop. + + Preferred for workers whose main loop is a blocking thread + (container-log tail, PTY read, scapy sniff) — wiring an + ``asyncio.Event`` through the thread boundary is error-prone, and + every DECNET worker already has systemd-equivalent SIGTERM cleanup. + A SIGTERM self-signal routes the stop through that same path + without inventing a second shutdown mechanism. + + Cancellation-safe. Never raises: a failed self-signal is logged + and the loop simply exits (admin can fall back to ``systemctl``). 
+ """ + if bus is None: + return + + topic = _topics.system_control(worker) + with contextlib.suppress(asyncio.CancelledError): + try: + async with bus.subscribe(topic) as sub: + async for event in sub: + payload = event.payload or {} + action = payload.get("action") + requested_by = payload.get("requested_by", "") + if action == _topics.WORKER_CONTROL_STOP: + log.info( + "control: stop requested worker=%s by=%s → SIGTERM self", + worker, requested_by, + ) + try: + os.kill(os.getpid(), signal.SIGTERM) + except Exception as exc: # noqa: BLE001 + log.warning( + "control: self-signal failed worker=%s: %s", + worker, exc, + ) + return + log.debug( + "control: ignoring unknown action worker=%s action=%r", + worker, action, + ) + except Exception as exc: # noqa: BLE001 + log.warning( + "control signal listener failed worker=%s: %s", + worker, exc, + ) diff --git a/decnet/bus/topics.py b/decnet/bus/topics.py new file mode 100644 index 00000000..3163b614 --- /dev/null +++ b/decnet/bus/topics.py @@ -0,0 +1,653 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Canonical topic hierarchy for the DECNET ServiceBus. + +Locked early so consumers can subscribe with stable wildcard patterns. +Adding new topic families is fine; **renaming** existing ones is a breaking +change for every subscriber and requires a coordinated rollout. 
+ +Token structure (NATS-style, dot-separated): + + topology.{topology_id}.mutation.{state} + topology.{topology_id}.status + decky.{decky_id}.state + decky.{decky_id}.traffic + orchestrator.traffic.{decky_id} + orchestrator.file.{decky_id} + orchestrator.email.{decky_id} + attacker.observed + attacker.scored + attacker.session.started + attacker.session.ended + attacker.observation.{primitive} + identity.formed + identity.observation.linked + identity.merged + identity.unmerged + identity.campaign.assigned + campaign.formed + campaign.identity.assigned + campaign.merged + campaign.unmerged + credential.captured + credential.reuse.detected + attribution.profile.state_changed + attribution.profile.multi_actor_suspected + canary.{token_id}.triggered + canary.{token_id}.placed + canary.{token_id}.revoked + system.log + system.bus.health + system.{worker}.health + email.received + ttp.tagged + ttp.rule.fired.{technique_id} + ttp.rule.suppressed + +Wildcards (per :func:`decnet.bus.base.matches`): + +* ``*`` matches exactly one token. +* ``>`` matches one-or-more trailing tokens (so ``topology.>`` matches + ``topology.abc.status`` but not the bare root ``topology``). +""" +from __future__ import annotations + +# ─── Root prefixes ─────────────────────────────────────────────────────────── + +TOPOLOGY = "topology" +DECKY = "decky" +ATTACKER = "attacker" +IDENTITY = "identity" +CAMPAIGN = "campaign" +SYSTEM = "system" +CREDENTIAL = "credential" +ATTRIBUTION = "attribution" +ORCHESTRATOR = "orchestrator" +CANARY = "canary" +SMTP = "smtp" +EMAIL = "email" +TTP = "ttp" + + +# ─── Leaf event-type constants (the last segment of each topic) ────────────── + +# Topology mutation lifecycle states — keep in sync with TopologyMutation.state +# in decnet/web/db/models.py; the bus topic mirrors the DB state machine. 
+MUTATION_ENQUEUED = "enqueued" +MUTATION_APPLYING = "applying" +MUTATION_APPLIED = "applied" +MUTATION_FAILED = "failed" + +# Topology-level status transitions (topology.{id}.status): fires when the +# topology row's status column changes (pending/deploying/active/degraded/failed). +TOPOLOGY_STATUS = "status" + +# Decky-level event types (third token, after the decky id). +DECKY_STATE = "state" +DECKY_TRAFFIC = "traffic" +# On-demand mutation request — published by the API/CLI/UI, consumed by +# the mutator's watch loop to force an immediate mutation of one decky +# without waiting for its scheduled interval. Underscored (not dotted) +# to stay a single NATS token so the builder's validator accepts it. +DECKY_MUTATE_REQUEST = "mutate_request" +# Mutation transition event — distinct from DECKY_STATE ("current +# shape") because a mutation is a *transition* that carries old/new +# services + trigger + timing. Correlator consumes these (via the +# syslog sidechannel too) to interleave substrate-change markers into +# attacker traversals. +DECKY_MUTATION = "mutation" +# Per-service add/remove on a deployed decky (live; no full redeploy). +# Payload carries ``decky_name``, ``service_name``, optional +# ``topology_id``, and ``services`` (the post-mutation list). Consumers +# that watch substrate shape (correlator, dashboard, profiler) reconcile +# off these without waiting for the next decnet-state.json snapshot. +DECKY_SERVICE_ADDED = "service_added" +DECKY_SERVICE_REMOVED = "service_removed" +# Per-service config change (the schema-driven Inspector form). Payload +# carries ``decky_name``, ``service_name``, optional ``topology_id``, +# ``service_config`` (the new validated dict), and ``recreated`` — true +# when the operator hit Apply (container was force-recreated to pick up +# the new env), false when they only hit Save (DB-only). +DECKY_SERVICE_CONFIG_CHANGED = "service_config_changed" +# Async deploy/mutate operation transitions +# (pending/running/succeeded/failed).
Payload: {lifecycle_id, operation, +# status, error?}. UI polling endpoint is the source of truth; this +# fires for live subscribers (dashboard, mutator-side audit, etc). +DECKY_LIFECYCLE = "lifecycle" + +# Attacker event types (second token under the ``attacker`` root). First +# sighting, session boundary transitions, and score-threshold crossings +# published by correlator + profiler. Consumers typically subscribe to +# the wildcard ``attacker.>``. +ATTACKER_OBSERVED = "observed" +ATTACKER_SCORED = "scored" +# Published once per successful active probe result (JARM/HASSH/TCPfp/ipv6_leak). +# Distinct from ``observed`` which is the correlator's first-sight signal — +# a fingerprint is additional evidence about an already-observed attacker. +# Known payload ``kind`` discriminators carried in this topic: +# "jarm" — JARM TLS server hash (prober) +# "hassh" — HASSHServer SSH key-exchange hash (prober) +# "tcpfp" — TCP/IP stack fingerprint hash (prober) +# "tls_cert" — leaf TLS certificate SHA-256 (prober) +# "ipv6_leak" — fe80:: link-local address observed via passive sniffer +# or active ICMPv6 solicitation (prober + sniffer); +# payload: {attacker_ip, addr, iid_kind, mac_oui, vector, +# on_iface, observed_at} +ATTACKER_FINGERPRINTED = "fingerprinted" +# Published when the prober observes a NEW hash for an +# (attacker_ip, port, probe_type) triple it has seen before — i.e. the +# attacker rotated their VPS, rebuilt their SSH server, swapped their +# TLS cert. Distinct from ``fingerprinted`` which fires on every probe +# result; ``fingerprint_rotated`` fires only on diff and carries both +# old_hash + new_hash. Producer: prober (via the rotation library); +# consumers: dashboard, forensics, attribution clustering. 
+ATTACKER_FINGERPRINT_ROTATED = "fingerprint_rotated" +ATTACKER_SESSION_STARTED = "session.started" +ATTACKER_SESSION_ENDED = "session.ended" +# Published by the ``decnet enrich`` worker after an enrichment pass +# succeeds for an attacker IP (one or more 3rd-party intel providers +# returned a verdict). Payload carries the aggregate verdict + per- +# provider summary so SIEM-bound webhooks don't need to re-query the DB. +ATTACKER_INTEL_ENRICHED = "intel.enriched" +# Per-primitive BEHAVE-SHELL observation. Full topic shape: +# attacker.observation.{primitive} +# e.g. ``attacker.observation.motor.input_modality``. Producer: +# ``decnet/profiler/behave_shell/`` (extractor library called from the +# profiler worker on ``attacker.session.ended``); consumers: dashboard +# SSE relay, attribution engine state machine, federation gossip +# (post-v0). See development/BEHAVE-INTEGRATION.md §"Bus topics" for +# the wire-format contract — the prefix is documentation + pattern +# match only; bus auth is socket file perms (DEBT-029 §2), not +# topic-level. The ``primitive`` segment MAY contain dots +# (``motor.shell_mastery.tab_completion``) — the same dotted-leaf +# rule that ``attacker.session.ended`` uses. +ATTACKER_OBSERVATION_PREFIX = "observation" + +# Identity-resolution event types (second/third tokens under ``identity``). +# Published by the (future) clusterer worker — see +# development/IDENTITY_RESOLUTION.md. Constants ship in this commit; +# no publishers exist yet, but consumers (webhook worker, dashboard +# SSE relay) can subscribe to ``identity.>`` from day one and receive +# events the instant the clusterer comes online.
+# +# identity.formed — clusterer creates a new identity from +# one or more observations +# identity.observation.linked — observation attached to an existing +# identity (or reattached from another) +# identity.merged — two identities collapsed; loser gets +# ``merged_into_uuid`` set, subscribers +# re-key cached references to the winner +# identity.unmerged — revocable-merge undo: contradicting +# evidence cleared ``merged_into_uuid`` +# and re-split observations. The +# resurrected side's UUID is the same +# as the prior loser, so subscribers +# that cached references to the loser +# during the merged interval can +# re-attach without a new lookup. +# +# ``identity.campaign.assigned`` is a campaign-clusterer event; its leaf +# is defined separately below as IDENTITY_CAMPAIGN_ASSIGNED. +IDENTITY_FORMED = "formed" +IDENTITY_OBSERVATION_LINKED = "observation.linked" +IDENTITY_MERGED = "merged" +IDENTITY_UNMERGED = "unmerged" +# Campaign-clusterer cross-family event — fires under ``identity.>`` so +# identity-stream subscribers (e.g. the IdentityDetail SSE client) get +# notified the moment an identity's ``campaign_id`` changes without +# having to subscribe to the campaign topic family. The same event +# fires under ``campaign.identity.assigned`` for campaign-side +# subscribers. +IDENTITY_CAMPAIGN_ASSIGNED = "campaign.assigned" + +# Campaign-clusterer event types (second/third tokens under +# ``campaign``). Mirror of the identity family at the layer above: +# campaigns group identities into operations, and the clusterer +# publishes the same form / link / merge / unmerge lifecycle.
+# +# campaign.formed — clusterer creates a new campaign from +# one or more identities +# campaign.identity.assigned — identity attached to an existing +# campaign (or reassigned from another) +# campaign.merged — two campaigns collapsed; loser gets +# ``merged_into_uuid`` set, subscribers +# re-key cached references to the winner +# campaign.unmerged — revocable-merge undo: contradicting +# evidence cleared ``merged_into_uuid`` +# and re-split identities +CAMPAIGN_FORMED = "formed" +CAMPAIGN_IDENTITY_ASSIGNED = "identity.assigned" +CAMPAIGN_MERGED = "merged" +CAMPAIGN_UNMERGED = "unmerged" + +# Credential event types (second/third tokens under ``credential``). +# ``credential.captured`` fires once per upserted Credential row — the +# correlator listens for it and runs the cred-reuse query in response, +# so reuse detection latency is sub-second after a fresh capture. +# ``credential.reuse.detected`` fires when the correlator inserts a new +# CredentialReuse row or grows an existing one (added decky/service/IP). +CREDENTIAL_CAPTURED = "captured" +CREDENTIAL_REUSE_DETECTED = "reuse.detected" + +# Attribution-engine event types (second/third tokens under +# ``attribution``). Published by the v0 attribution worker +# (``decnet.correlation.attribution_worker``) which subscribes to +# ``attacker.observation.>`` and runs the per-(identity, primitive) +# state machine. See ``development/ATTRIBUTION-ENGINE.md``. +# +# attribution.profile.state_changed — per-primitive state +# transition (e.g. +# stable → drifting). +# Payload: identity_uuid, +# primitive, old_state, +# new_state, current_value, +# confidence, +# observation_count, ts. +# attribution.profile.multi_actor_suspected — fires when ≥ 2 +# primitives flag the same +# identity as multi_actor +# concurrently. Cross- +# primitive correlator; +# single-primitive +# multi_actor is too noisy +# on its own. Payload: +# identity_uuid, primitives, +# evidence_summary, +# confidence, ts. 
+# +# These are *derived* signals — distinct from +# ``identity.*`` (clusterer lifecycle, IDENTITY_RESOLUTION.md) and +# ``attacker.observation.*`` (raw extractor envelopes, +# BEHAVE-INTEGRATION.md). The three families compose: observations feed +# the attribution engine, the engine emits derived state, the clusterer +# reads observations + state to form / merge identities. +ATTRIBUTION_PROFILE_PREFIX = "profile" +ATTRIBUTION_PROFILE_STATE_CHANGED = "profile.state_changed" +ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED = "profile.multi_actor_suspected" + +# Canary-token event types (third token under ``canary``). +# +# canary.{token_id}.placed — orchestrator/API successfully planted a +# canary artifact inside a decky's +# filesystem (or persisted a passive token +# that has no callback wiring). Lets +# dashboards reflect baseline coverage in +# real time without a DB poll. +# canary.{token_id}.triggered — ``decnet canary`` worker observed a +# callback hit (HTTP slug or DNS subdomain +# lookup) for the token. Payload carries +# ``src_ip``, ``user_agent``, ``request_path`` +# and any DNS qname so downstream +# consumers (correlator, webhook fanout) +# can attribute and forward without a +# follow-up DB read. +# canary.{token_id}.revoked — operator removed a token; planter unlinked +# the file (best-effort) and the row was +# marked ``revoked``. Subscribers may +# evict cached lookups by token id. +CANARY_PLACED = "placed" +CANARY_TRIGGERED = "triggered" +CANARY_REVOKED = "revoked" + +# Orchestrator event types (second token under ``orchestrator``). The +# orchestrator worker publishes one of these per synthetic action it +# drives against a decky — cheap inter-decky traffic and filesystem +# mutations whose role is to keep the honeypot from looking suspiciously +# static. Always nested with the destination decky uuid as the third +# token, so consumers can subscribe to a single decky's life-injection +# stream via ``orchestrator.*.{decky_id}``.
+ORCHESTRATOR_TRAFFIC = "traffic" +ORCHESTRATOR_FILE = "file" +# Emailgen — published by the ``decnet emailgen`` worker once per generated +# fake email delivered into a mail decky's maildir. Third token is the +# destination mail-decky uuid (the IMAP/POP3 host serving the mailbox), +# matching the ``orchestrator.*.{decky_id}`` subscription pattern. +ORCHESTRATOR_EMAIL = "email" + +# System event types. +SYSTEM_LOG = "log" +SYSTEM_BUS_HEALTH = "bus.health" +# Worker-health leaf — built per-worker as ``system.{worker}.health`` via +# :func:`system_health`. The leaf constant stays the same across workers; +# the worker name goes in the middle token. +SYSTEM_HEALTH = "health" +# Worker-control leaf — built per-worker as ``system.{worker}.control`` via +# :func:`system_control`. Admin-originated stop intents travel on this +# topic; each worker subscribes to its own. +SYSTEM_CONTROL = "control" + +# Control payload ``action`` values — the wire vocabulary. Only ``stop`` is +# handled in v1; ``start`` is reserved because a stopped worker has no +# subscriber, so starting requires external supervision (systemd). +WORKER_CONTROL_STOP = "stop" +WORKER_CONTROL_START = "start" + +# Webhook subscription-set changed — published by the CRUD router after any +# create / update / delete on WebhookSubscription so the webhook worker can +# reload its in-memory subscription list and re-subscribe to the new union +# of patterns. Payload is currently empty; consumers only need the signal. +WEBHOOK_SUBSCRIPTIONS_CHANGED = "system.webhook.subscriptions_changed" + +# Email-receipt event — fired by smtp / smtp-relay services on full-message +# receipt (envelope + headers + body + attachments captured). Single-token +# leaf so the bus tokenizer accepts it directly under the ``email`` root. +# Consumed by the TTP ``email_lifter`` for header / body-pattern / attachment +# rules.
PII rule (TTP_TAGGING.md "Hard parts §6"): payload carries hashes, +# counts, header names, and rcpt-domain sets — never rcpt addresses or body +# bytes. +EMAIL_RECEIVED = "received" + +# TTP-tagging event types (second/third tokens under ``ttp``). +# +# ttp.tagged — one or more new tags written. Published +# only when ``INSERT OR IGNORE`` wrote at +# least one new row; idempotent +# re-evaluations publish nothing +# (loop-prevention invariant — see +# TTP_TAGGING.md). +# ttp.rule.fired.{technique_id} — per-technique fan-out for SIEM +# consumers that subscribe to a single +# technique. Topic key is the parent +# technique; sub_technique is in the +# payload. Built via :func:`ttp_rule_fired`. +# ttp.rule.suppressed — rule fired but the tag was dropped +# (confidence below floor, rate-limited, +# or the rule's RuleState was disabled). +# Observability signal for the dashboard. +# +# Per-rule reload + state-change topics. Built via +# :func:`ttp_rule_reloaded` / :func:`ttp_rule_state`; SIEM consumers +# subscribe to ``ttp.rule.reloaded.>`` (every rule) or +# ``ttp.rule.reloaded.R0001`` (one rule) at their preferred granularity. +TTP_TAGGED = "tagged" +TTP_RULE_FIRED = "rule.fired" +TTP_RULE_SUPPRESSED = "rule.suppressed" +TTP_RULE_RELOADED = "rule.reloaded" +TTP_RULE_STATE = "rule.state" + + +# ─── Builders ──────────────────────────────────────────────────────────────── + +def topology_mutation(topology_id: str, state: str) -> str: + """Build ``topology..mutation.``. + + *state* should be one of the ``MUTATION_*`` constants. + """ + _reject_tokens(topology_id, state) + return f"{TOPOLOGY}.{topology_id}.mutation.{state}" + + +def topology_status(topology_id: str) -> str: + """Build ``topology..status``.""" + _reject_tokens(topology_id) + return f"{TOPOLOGY}.{topology_id}.{TOPOLOGY_STATUS}" + + +def decky(decky_id: str, event_type: str) -> str: + """Build ``decky..``. + + *event_type* is typically one of ``DECKY_STATE`` or ``DECKY_TRAFFIC``. 
+ """ + _reject_tokens(decky_id, event_type) + return f"{DECKY}.{decky_id}.{event_type}" + + +def decky_mutation(decky_id: str) -> str: + """Build ``decky..mutation``.""" + _reject_tokens(decky_id) + return f"{DECKY}.{decky_id}.{DECKY_MUTATION}" + + +def decky_lifecycle(decky_id: str) -> str: + """Build ``decky..lifecycle``.""" + _reject_tokens(decky_id) + return f"{DECKY}.{decky_id}.{DECKY_LIFECYCLE}" + + +def system(event_type: str) -> str: + """Build ``system.``. + + *event_type* may itself contain dots (e.g. ``bus.health``) — we don't + re-validate the already-constant leaves; this just prefixes. + """ + if not event_type: + raise ValueError("system topic requires a non-empty event_type") + return f"{SYSTEM}.{event_type}" + + +def credential(event_type: str) -> str: + """Build ``credential.``. + + *event_type* is typically one of :data:`CREDENTIAL_CAPTURED` or + :data:`CREDENTIAL_REUSE_DETECTED`. Dotted leaves + (``reuse.detected``) are permitted — same rationale as + :func:`system`. + """ + if not event_type: + raise ValueError("credential topic requires a non-empty event_type") + return f"{CREDENTIAL}.{event_type}" + + +def attacker(event_type: str) -> str: + """Build ``attacker.``. + + *event_type* is typically one of ``ATTACKER_OBSERVED``, + ``ATTACKER_SCORED``, ``ATTACKER_SESSION_STARTED``, + ``ATTACKER_SESSION_ENDED``. Dotted leaves (``session.started``) are + permitted — same rationale as :func:`system`. + """ + if not event_type: + raise ValueError("attacker topic requires a non-empty event_type") + return f"{ATTACKER}.{event_type}" + + +def attacker_observation(primitive: str) -> str: + """Build ``attacker.observation.``. + + *primitive* is the fully-qualified BEHAVE-SHELL primitive path + (e.g. ``motor.input_modality``, + ``cognitive.feedback_loop_engagement``, + ``motor.shell_mastery.tab_completion``). 
Dotted primitives are + permitted — this matches the format + ``behave_shell.spec.event_adapter.event_topic_for`` produces + upstream, and DECNET's bus admits the dotted leaf the same way + :func:`attacker` does for ``session.started``. + + Empty string is rejected so a downstream typo doesn't ship as + ``attacker.observation.``. + """ + if not primitive: + raise ValueError( + "attacker_observation topic requires a non-empty primitive", + ) + return f"{ATTACKER}.{ATTACKER_OBSERVATION_PREFIX}.{primitive}" + + +def attribution(event_type: str) -> str: + """Build ``attribution.``. + + *event_type* is typically one of + :data:`ATTRIBUTION_PROFILE_STATE_CHANGED` or + :data:`ATTRIBUTION_PROFILE_MULTI_ACTOR_SUSPECTED` — both contain a + dot (``profile.state_changed``) which is permitted under the same + "trailing dotted leaf" rule that ``attacker.session.started`` uses. + """ + if not event_type: + raise ValueError("attribution topic requires a non-empty event_type") + return f"{ATTRIBUTION}.{event_type}" + + +def campaign(event_type: str) -> str: + """Build ``campaign.``. + + *event_type* is typically one of :data:`CAMPAIGN_FORMED`, + :data:`CAMPAIGN_IDENTITY_ASSIGNED`, :data:`CAMPAIGN_MERGED`, or + :data:`CAMPAIGN_UNMERGED`. Dotted leaves (``identity.assigned``) + are permitted — same rationale as :func:`system`. + """ + if not event_type: + raise ValueError("campaign topic requires a non-empty event_type") + return f"{CAMPAIGN}.{event_type}" + + +def identity(event_type: str) -> str: + """Build ``identity.``. + + *event_type* is typically one of :data:`IDENTITY_FORMED`, + :data:`IDENTITY_OBSERVATION_LINKED`, :data:`IDENTITY_MERGED`, or + :data:`IDENTITY_UNMERGED`. Dotted leaves (``observation.linked``) + are permitted — same rationale as :func:`system`. 
+ """ + if not event_type: + raise ValueError("identity topic requires a non-empty event_type") + return f"{IDENTITY}.{event_type}" + + +def orchestrator(event_type: str, decky_id: str) -> str: + """Build ``orchestrator..``. + + *event_type* should be one of :data:`ORCHESTRATOR_TRAFFIC` or + :data:`ORCHESTRATOR_FILE`. The destination decky is always the + third token so per-decky subscribers can use + ``orchestrator.*.``. + """ + _reject_tokens(event_type, decky_id) + return f"{ORCHESTRATOR}.{event_type}.{decky_id}" + + +def canary(token_id: str, event_type: str) -> str: + """Build ``canary..``. + + *event_type* should be one of :data:`CANARY_PLACED`, + :data:`CANARY_TRIGGERED`, or :data:`CANARY_REVOKED`. The token id + is always the second token so per-token subscribers can use + ``canary..>`` and fleet-wide consumers (webhook fanout, + correlator) use ``canary.>``. + """ + _reject_tokens(token_id, event_type) + return f"{CANARY}.{token_id}.{event_type}" + + +def system_health(worker: str) -> str: + """Build ``system..health``. + + Worker-health heartbeats live as a nested leaf under ``system`` so + consumers can subscribe to ``system.*.health`` for every worker at + once, or to ``system.mutator.health`` for a single one. *worker* is + validated as a regular segment — no dots, wildcards, or whitespace. + """ + _reject_tokens(worker) + return f"{SYSTEM}.{worker}.{SYSTEM_HEALTH}" + + +def system_control(worker: str) -> str: + """Build ``system..control``. + + Admin-originated stop (and, eventually, start) intents are published + here; the worker in question subscribes to its own address and reacts. + Payload shape:: + + {"action": "stop", "requested_by": "", "ts": } + + *action* must be one of :data:`WORKER_CONTROL_STOP` / + :data:`WORKER_CONTROL_START`; any other value is ignored by the + listener. Same segment rules as :func:`system_health`. 
+ """ + _reject_tokens(worker) + return f"{SYSTEM}.{worker}.{SYSTEM_CONTROL}" + + +def smtp(event_type: str) -> str: + """Build ``smtp.``. + + *event_type* may contain dots (e.g. ``probe.pending``). + """ + if not event_type: + raise ValueError("smtp topic requires a non-empty event_type") + return f"{SMTP}.{event_type}" + + +def email_topic(event_type: str) -> str: + """Build ``email.``. + + Named ``email_topic`` rather than ``email`` to avoid shadowing the + Python ``email`` stdlib package at import sites that pull both. + *event_type* is typically :data:`EMAIL_RECEIVED`. + """ + if not event_type: + raise ValueError("email topic requires a non-empty event_type") + return f"{EMAIL}.{event_type}" + + +def ttp(event_type: str) -> str: + """Build ``ttp.``. + + *event_type* is typically one of :data:`TTP_TAGGED`, + :data:`TTP_RULE_FIRED`, or :data:`TTP_RULE_SUPPRESSED`. Dotted + leaves (``rule.fired``) are permitted — same rationale as + :func:`system`. For per-technique fan-out use + :func:`ttp_rule_fired`. + """ + if not event_type: + raise ValueError("ttp topic requires a non-empty event_type") + return f"{TTP}.{event_type}" + + +def ttp_rule_fired(technique_id: str) -> str: + """Build ``ttp.rule.fired.``. + + Per-technique fan-out: SIEM subscribers can listen on + ``ttp.rule.fired.>`` for everything, ``ttp.rule.fired.T1110`` for + one technique. *technique_id* is validated as a single segment — + sub-techniques like ``T1110.001`` are rejected because they would + split into two tokens. The topic key is the parent technique; + ``sub_technique_id`` lives in the payload. + """ + _reject_tokens(technique_id) + return f"{TTP}.rule.fired.{technique_id}" + + +def ttp_rule_reloaded(rule_id: str) -> str: + """Build ``ttp.rule.reloaded.``. + + Per-rule fan-out fired by the :class:`~decnet.ttp.store.base.RuleStore` + when a rule's *definition* changes (YAML edit on the filesystem + backend, ``ttp_rule`` row update on the database backend). 
One event + per per-rule edit — never batched (the "incremental, never batched" + property in TTP_TAGGING.md §"Bus topics" inherits its granularity + from :meth:`RuleStore.subscribe_changes`). + + Subscribers: ``ttp.rule.reloaded.>`` for every rule, + ``ttp.rule.reloaded.R0001`` for one. *rule_id* is validated as a + single segment. + """ + _reject_tokens(rule_id) + return f"{TTP}.{TTP_RULE_RELOADED}.{rule_id}" + + +def ttp_rule_state(rule_id: str) -> str: + """Build ``ttp.rule.state.``. + + Per-rule fan-out fired by the :class:`~decnet.ttp.store.base.RuleStore` + when a rule's *operational state* changes (operator hits the disable + button, an ``expires_at`` TTL fires and auto-reverts the state). + *rule_id* is validated as a single segment. + """ + _reject_tokens(rule_id) + return f"{TTP}.{TTP_RULE_STATE}.{rule_id}" + + +def _reject_tokens(*parts: str) -> None: + """Reject topic segments that would break NATS-style tokenization. + + Dots, wildcards, whitespace, and empty strings in a *segment* would + silently corrupt the hierarchy (e.g. ``topology.a.b.status`` for a + ``topology_id`` of ``"a.b"``). Raise early at the builder instead of + shipping a malformed topic to the wire. + """ + for p in parts: + if not p: + raise ValueError("topic segment must not be empty") + if "." in p or "*" in p or ">" in p or any(c.isspace() for c in p): + raise ValueError( + f"topic segment {p!r} may not contain '.', '*', '>', or whitespace" + ) diff --git a/decnet/bus/unix_client.py b/decnet/bus/unix_client.py new file mode 100644 index 00000000..bb7c2792 --- /dev/null +++ b/decnet/bus/unix_client.py @@ -0,0 +1,269 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""UNIX-socket client — :class:`UnixSocketBus` implementation of :class:`BaseBus`. + +Holds one open socket to the local :class:`~decnet.bus.unix_server.BusServer`. +Operations: + +* :meth:`publish` writes a single ``PUB`` frame and returns; no ack. 
class UnixSocketBus(BaseBus):
    """Client handle for a local :class:`BusServer`.

    One instance per process typically; multiple instances simply open
    multiple sockets to the same server.  Connection is lazy — the first
    :meth:`connect` (or any publish/subscribe call via ``async with``)
    opens the socket.
    """

    def __init__(
        self,
        socket_path: pathlib.Path | str,
        *,
        client_name: str | None = None,
    ) -> None:
        """Record connection parameters; no I/O happens here.

        Args:
            socket_path: Filesystem path of the server's UNIX socket.
            client_name: Name sent in the ``HELLO`` frame.  Defaults to
                ``decnet-bus-client[<pid>]``.
        """
        self._path = pathlib.Path(socket_path)
        self._client_name = client_name or f"decnet-bus-client[{os.getpid()}]"
        self._reader: asyncio.StreamReader | None = None
        self._writer: asyncio.StreamWriter | None = None
        self._reader_task: asyncio.Task[None] | None = None
        self._subs: list[_UnixSubscription] = []
        self._lock = asyncio.Lock()
        self._write_lock = asyncio.Lock()
        self._closed = False
        # Strong references to the fire-and-forget SUB tasks scheduled by
        # subscribe().  The event loop only holds weak references to tasks,
        # so without this set a pending SUB could be garbage-collected
        # before it ever reaches the wire.
        self._background_tasks: set[asyncio.Future[None]] = set()
        # Sticky flag: the first publish-on-closed-bus call logs at
        # WARNING so operators see that a publish was dropped; subsequent
        # calls on the same instance log at DEBUG only to prevent a
        # log flood when stream threads drain after close.  The bus is
        # critical infra, so the first warning is non-negotiable.
        self._closed_publish_warned = False

    # ─── Lifecycle ──────────────────────────────────────────────────────────

    async def connect(self) -> None:
        """Open the socket, send ``HELLO`` and start the reader task.

        Safe to call repeatedly and concurrently; only the first caller
        actually connects.

        Raises:
            RuntimeError: if the bus has already been closed.
            OSError: whatever ``asyncio.open_unix_connection`` or the
                handshake write raises (missing socket, refused, ...).
        """
        # The cheap unlocked check fast-paths the already-connected case,
        # but the actual connect must hold ``_lock`` so two coroutines
        # racing on a fresh bus (e.g. concurrent publish()/subscribe() both
        # lazily calling connect()) can't each open a socket and spawn a
        # reader task — the loser would orphan a live FD and an uncancelled
        # reader loop that close() never reaps.
        if self._writer is not None:
            return
        async with self._lock:
            # Re-check under the lock: a racing caller may have connected
            # while we awaited the lock.
            if self._writer is not None:
                return
            if self._closed:
                raise RuntimeError("connect on closed bus")
            reader, writer = await asyncio.open_unix_connection(str(self._path))
            try:
                # Handshake on the local writer and publish it on ``self``
                # only afterwards: a failed HELLO must not leave a
                # half-connected bus (writer set, no reader task) that every
                # later connect() would mistake for a live connection.
                async with self._write_lock:
                    writer.write(
                        protocol.encode(protocol.HELLO, args=self._client_name)
                    )
                    await writer.drain()
            except BaseException:
                with contextlib.suppress(Exception):
                    writer.close()
                raise
            self._reader, self._writer = reader, writer
            self._reader_task = asyncio.create_task(self._reader_loop())
            log.debug("bus.client: connected to %s as %s", self._path, self._client_name)

    async def close(self) -> None:
        """Tear the connection down and end every subscription.  Idempotent."""
        if self._closed:
            return
        self._closed = True

        # Best-effort BYE — we don't care if it fails.
        if self._writer is not None and not self._writer.is_closing():
            with contextlib.suppress(Exception):
                await self._send(protocol.encode(protocol.BYE))

        if self._reader_task is not None:
            self._reader_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await self._reader_task
            self._reader_task = None

        if self._writer is not None:
            with contextlib.suppress(Exception):
                self._writer.close()
                await self._writer.wait_closed()
            self._writer = None
        self._reader = None

        # Wake every subscription so `async for` exits.
        for sub in list(self._subs):
            self._wake(sub)
        self._subs.clear()

    # ─── Pub/Sub ────────────────────────────────────────────────────────────

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Send one event to the server, connecting lazily if needed.

        Publishing on a closed bus, or losing the connection mid-send, is
        logged and swallowed rather than raised.  Errors raised while
        *establishing* the connection still propagate.
        """
        if self._closed:
            # Degrade gracefully: the DB is the source of truth, the bus
            # is only the notification layer.  Raising here made every
            # caller via publish_safely flood the logs once per stream
            # line during shutdown races.  First drop warns loudly;
            # subsequent drops on the same instance are DEBUG-only.
            if not self._closed_publish_warned:
                self._closed_publish_warned = True
                log.warning(
                    "bus.client: publish on closed bus dropped topic=%s "
                    "(further drops on this instance logged at DEBUG)",
                    topic,
                )
            else:
                log.debug("bus.client: publish on closed bus dropped topic=%s", topic)
            return
        if self._writer is None:
            await self.connect()
        body = Event(topic=topic, payload=payload, type=event_type).to_dict()
        try:
            await self._send(protocol.encode(protocol.PUB, args=topic, body=body))
        except ConnectionError as exc:  # BrokenPipeError is a subclass
            # Bus loss is a logged warning, never a publisher crash.  The
            # DB-as-source-of-truth invariant means the work is already
            # persisted; the missing event is just a missed notification.
            log.warning("bus.client: publish failed: %s", exc)

    def subscribe(self, pattern: str) -> Subscription:
        """Register a local subscription and schedule its ``SUB`` frame.

        Must be called with a running event loop.

        Raises:
            RuntimeError: if the bus has already been closed.
        """
        if self._closed:
            raise RuntimeError("subscribe on closed bus")
        queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=_INBOUND_QUEUE_SIZE)
        sub = _UnixSubscription(self, pattern, queue)
        self._subs.append(sub)
        # Schedule the SUB frame asynchronously so subscribe() stays sync,
        # matching the BaseBus signature.  The caller will shortly `async
        # with` / `async for` the subscription, which will run the event
        # loop and pick this task up.  Keep a strong reference until the
        # task finishes so it cannot be collected while still pending.
        task = asyncio.ensure_future(self._send_sub(pattern))
        self._background_tasks.add(task)
        task.add_done_callback(self._background_tasks.discard)
        return sub

    async def _send_sub(self, pattern: str) -> None:
        """Connect if needed and send ``SUB``; failures are only logged."""
        try:
            if self._writer is None:
                await self.connect()
            await self._send(protocol.encode(protocol.SUB, args=pattern))
        except Exception as exc:  # pragma: no cover - network paths in live tests
            log.warning("bus.client: SUB %s failed: %s", pattern, exc)

    async def _unregister(self, sub: _UnixSubscription) -> None:
        """Drop *sub* locally and send ``UNSUB`` when it was the last user."""
        try:
            self._subs.remove(sub)
        except ValueError:
            return
        # Tell the server we no longer want events for this pattern if no
        # other local subscription still wants it.
        if not any(s.pattern == sub.pattern for s in self._subs):
            with contextlib.suppress(Exception):
                await self._send(protocol.encode(protocol.UNSUB, args=sub.pattern))

    # ─── Internal I/O ───────────────────────────────────────────────────────

    async def _send(self, frame_bytes: bytes) -> None:
        """Write one encoded frame, serialised by ``_write_lock``.

        Raises:
            ConnectionError: if the bus is not connected.
        """
        if self._writer is None:
            raise ConnectionError("bus.client: not connected")
        async with self._write_lock:
            self._writer.write(frame_bytes)
            await self._writer.drain()

    async def _reader_loop(self) -> None:
        """Read frames until EOF/BYE/error and dispatch every ``EVT``."""
        if self._reader is None:
            return
        try:
            while True:
                frame = await protocol.read_frame(self._reader)
                if frame is None:
                    break
                if frame.verb != protocol.EVT:
                    # Clients only ever legitimately receive EVT (or BYE).
                    if frame.verb == protocol.BYE:
                        break
                    log.warning("bus.client: unexpected verb from server: %s", frame.verb)
                    continue
                topic = frame.args
                data = protocol.decode_body(frame.body) if frame.body else {}
                event = Event.from_dict(topic, data)
                self._dispatch(event)
        except protocol.ProtocolError as exc:
            log.warning("bus.client: protocol error: %s", exc)
        except (asyncio.IncompleteReadError, ConnectionError):
            pass
        except asyncio.CancelledError:
            raise
        except Exception:  # pragma: no cover
            log.exception("bus.client: reader loop crashed")
        finally:
            # Server-side close — wake every subscription.
            for sub in list(self._subs):
                self._wake(sub)

    def _dispatch(self, event: Event) -> None:
        """Enqueue *event* on every local subscription whose pattern matches."""
        for sub in self._subs:
            if matches(sub.pattern, event.topic):
                _enqueue_event_drop_oldest(sub._queue, event)

    @staticmethod
    def _wake(sub: _UnixSubscription) -> None:
        """Put the close sentinel on *sub*'s queue, evicting if necessary.

        A plain ``put_nowait`` loses the sentinel when the queue is full,
        and the consumer would then drain the backlog and block forever.
        The bus is drop-oldest everywhere else, so the oldest pending event
        is discarded to make room.  Calling this twice for the same
        subscription is harmless: the second sentinel is never read.
        """
        queue = sub._queue
        while True:
            try:
                queue.put_nowait(_CLOSE_SENTINEL)
                return
            except asyncio.QueueFull:
                try:
                    queue.get_nowait()
                except asyncio.QueueEmpty:
                    return
class BusServer:
    """Serve a UNIX-socket bus on *socket_path*.

    Lifecycle: construct → :meth:`start` → :meth:`serve_forever` (or rely
    on :meth:`start` returning once bound) → :meth:`close` for teardown.
    Safe to :meth:`close` multiple times.
    """

    def __init__(
        self,
        socket_path: pathlib.Path | str,
        *,
        group: str | None = _DEFAULT_GROUP,
        mode: int = _SOCKET_MODE,
    ) -> None:
        """Store configuration; nothing is bound until :meth:`start`.

        Args:
            socket_path: Where the listening socket file is created.
            group: POSIX group to give the socket to (best effort);
                ``None`` leaves the group untouched.
            mode: Permission bits applied to the socket file.
        """
        self._path = pathlib.Path(socket_path)
        self._group = group
        self._mode = mode
        self._server: asyncio.base_events.Server | None = None
        self._connections: set[_Connection] = set()
        self._closed = False

    # ─── Lifecycle ──────────────────────────────────────────────────────────

    async def start(self) -> None:
        """Bind the socket and begin accepting connections.

        Removes any stale socket file at *socket_path* first (common case:
        the previous worker crashed without cleaning up).  The parent
        directory must already exist; we do NOT create it blindly because
        the chosen directory (typically ``/run/decnet``) may require
        systemd ``RuntimeDirectory=`` to set up.

        Raises:
            FileNotFoundError: if the socket's parent directory is missing.
        """
        if self._server is not None:
            return

        parent = self._path.parent
        if not parent.exists():
            raise FileNotFoundError(
                f"bus socket parent directory {parent} does not exist; "
                f"create it with systemd RuntimeDirectory= or mkdir"
            )

        # Clean up a stale socket from a previous crash.  If a live server
        # is actually listening there, ``bind()`` below will fail — we do
        # not try to detect live vs. stale ourselves.
        with contextlib.suppress(FileNotFoundError):
            if self._path.is_socket():
                self._path.unlink()

        self._server = await asyncio.start_unix_server(
            self._handle_connection, path=str(self._path),
        )
        _chmod_and_chown(self._path, self._mode, self._group)
        log.info("bus.server: listening on %s (mode=%o group=%s)",
                 self._path, self._mode, self._group or "")

    async def serve_forever(self) -> None:
        """Accept connections until cancelled.

        Raises:
            RuntimeError: if :meth:`start` has not been called.
        """
        if self._server is None:
            raise RuntimeError("BusServer not started")
        async with self._server:
            await self._server.serve_forever()

    async def close(self) -> None:
        """Stop listening, drop every client and remove the socket file."""
        if self._closed:
            return
        self._closed = True

        server, self._server = self._server, None
        if server is not None:
            # Stop accepting new connections; existing ones stay open.
            server.close()

        # Drain every live connection BEFORE waiting on the listener:
        # since Python 3.12 ``Server.wait_closed()`` also waits for all
        # active connections to finish, so awaiting it first would block
        # for as long as any client stays connected.
        for conn in list(self._connections):
            await self._close_connection(conn)
        self._connections.clear()

        if server is not None:
            with contextlib.suppress(Exception):
                await server.wait_closed()

        with contextlib.suppress(FileNotFoundError):
            self._path.unlink()
        log.info("bus.server: closed")

    # ─── Internal publish fan-out ───────────────────────────────────────────

    async def publish(self, topic: str, payload: dict[str, Any], event_type: str = "") -> None:
        """Server-side publish helper — used by the worker to emit
        ``system.bus.health`` heartbeats without opening a client loop."""
        event = Event(topic=topic, payload=payload, type=event_type)
        self._fanout(event)

    # ─── Connection handler ─────────────────────────────────────────────────

    async def _handle_connection(
        self,
        reader: asyncio.StreamReader,
        writer: asyncio.StreamWriter,
    ) -> None:
        """Per-connection entry point handed to ``start_unix_server``."""
        conn = _Connection(writer=writer)
        self._connections.add(conn)
        writer_task = asyncio.create_task(self._writer_loop(conn))
        try:
            await self._reader_loop(conn, reader)
        except protocol.ProtocolError as exc:
            log.warning("bus.server: protocol error from %s: %s", conn.peer_name, exc)
        except (asyncio.IncompleteReadError, ConnectionError) as exc:
            log.debug("bus.server: %s disconnected: %s", conn.peer_name, exc)
        except Exception:  # pragma: no cover - defensive
            log.exception("bus.server: unhandled error in connection")
        finally:
            # Do the synchronous teardown first.  If this handler is
            # cancelled while awaiting the socket close below, the writer
            # task has still been cancelled and the connection is already
            # out of the registry — nothing is leaked.
            self._connections.discard(conn)
            writer_task.cancel()
            await self._close_connection(conn)
            with contextlib.suppress(asyncio.CancelledError):
                await writer_task

    async def _reader_loop(
        self, conn: _Connection, reader: asyncio.StreamReader,
    ) -> None:
        """Read and dispatch frames until EOF or ``BYE``."""
        while True:
            frame = await protocol.read_frame(reader)
            if frame is None:
                return
            await self._dispatch(conn, frame)
            if frame.verb == protocol.BYE:
                return

    async def _dispatch(self, conn: _Connection, frame: protocol.Frame) -> None:
        """Apply one client frame to *conn*'s state or fan it out.

        Raises:
            protocol.ProtocolError: for ``SUB``/``PUB`` without an argument
                and for any verb a client must not send.
        """
        if frame.verb == protocol.HELLO:
            conn.peer_name = frame.args or conn.peer_name
            log.debug("bus.server: HELLO from %s", conn.peer_name)
            return
        if frame.verb == protocol.SUB:
            pattern = frame.args
            if not pattern:
                raise protocol.ProtocolError("SUB requires a pattern")
            conn.patterns.add(pattern)
            log.debug("bus.server: %s SUB %s", conn.peer_name, pattern)
            return
        if frame.verb == protocol.UNSUB:
            conn.patterns.discard(frame.args)
            return
        if frame.verb == protocol.PUB:
            topic = frame.args
            if not topic:
                raise protocol.ProtocolError("PUB requires a topic")
            data = protocol.decode_body(frame.body) if frame.body else {}
            event = Event(
                topic=topic,
                payload=data.get("payload", {}) or {},
                type=data.get("type", "") or "",
            )
            self._fanout(event, origin=conn)
            return
        if frame.verb == protocol.BYE:
            return
        # EVT is server-to-client only; receiving one is a protocol violation.
        raise protocol.ProtocolError(f"unexpected verb {frame.verb!r} from client")

    def _fanout(self, event: Event, *, origin: _Connection | None = None) -> None:
        """Enqueue *event* as an EVT frame on every matching connection.

        We do NOT deliver back to the originating connection (a publisher
        does not receive its own event).  Encoding happens once per event,
        not once per subscriber.
        """
        try:
            frame_bytes = protocol.encode(
                protocol.EVT, args=event.topic, body=event.to_dict(),
            )
        except protocol.ProtocolError:
            log.exception("bus.server: failed to encode EVT for topic=%s", event.topic)
            return

        for conn in self._connections:
            if conn is origin or conn.closed:
                continue
            if not any(matches(p, event.topic) for p in conn.patterns):
                continue
            _enqueue_drop_oldest(conn.outbound, frame_bytes, event.topic)

    async def _writer_loop(self, conn: _Connection) -> None:
        """Serialize writes onto *conn*'s socket.

        One writer task per connection so a slow peer only blocks its own
        queue, not the fan-out loop.  The queue is bounded with drop-oldest
        policy applied at enqueue time (see :func:`_enqueue_drop_oldest`).
        """
        try:
            while not conn.closed:
                data = await conn.outbound.get()
                conn.writer.write(data)
                await conn.writer.drain()
        except (ConnectionError, BrokenPipeError):
            log.debug("bus.server: %s writer: peer closed", conn.peer_name)
        except asyncio.CancelledError:
            pass
        except Exception:  # pragma: no cover - defensive
            log.exception("bus.server: writer loop crashed for %s", conn.peer_name)

    async def _close_connection(self, conn: _Connection) -> None:
        """Mark *conn* closed and close its socket.  Idempotent."""
        if conn.closed:
            return
        conn.closed = True
        with contextlib.suppress(Exception):
            conn.writer.close()
            await conn.writer.wait_closed()
def _enqueue_drop_oldest(
    queue: "asyncio.Queue[bytes]", data: bytes, topic: str,
) -> None:
    """Enqueue *data* without ever blocking, evicting old frames if needed.

    Counterpart of :func:`decnet.bus.fake._enqueue_drop_oldest` for raw
    frame bytes.  While the queue is full, the oldest pending frame is
    discarded (one WARNING per discarded frame, tagged with *topic*) and
    the insert is retried.  If the queue reports full yet has nothing to
    evict, *data* itself is dropped silently.
    """
    stored = False
    while not stored:
        try:
            queue.put_nowait(data)
        except asyncio.QueueFull:
            try:
                queue.get_nowait()
            except asyncio.QueueEmpty:
                # Nothing to evict; give up on this frame.
                break
            log.warning("bus.server: subscriber queue full, dropped event topic=%s", topic)
        else:
            stored = True
async def bus_worker(
    socket_path: str | pathlib.Path,
    *,
    group: str | None = "decnet",
    heartbeat_interval: int = HEARTBEAT_INTERVAL_SEC,
) -> None:
    """Run the bus server until cancelled, signalled, or the server dies.

    The parent directory of *socket_path* should already exist (systemd's
    ``RuntimeDirectory=decnet`` handles this in prod).  The only case in
    which it is created here is the one ``_ensure_parent`` allows; any
    other missing parent raises ``FileNotFoundError``.

    Args:
        socket_path: Path of the UNIX socket to listen on.
        group: Group passed through to :class:`BusServer`.
        heartbeat_interval: Seconds between ``system.bus.health`` events.

    Raises:
        FileNotFoundError: if the socket's parent directory is missing.
        Exception: whatever made ``serve_forever()`` fail, re-raised after
            cleanup so the process supervisor sees a failure instead of a
            worker that idles with a dead listener.
    """
    path = pathlib.Path(socket_path)
    _ensure_parent(path)

    server = BusServer(path, group=group)
    await server.start()
    log.info("bus.worker: pid=%d socket=%s", os.getpid(), path)

    stop_event = asyncio.Event()
    _install_signal_handlers(stop_event)

    heartbeat_task = asyncio.create_task(_heartbeat_loop(server, heartbeat_interval))
    serve_task = asyncio.create_task(server.serve_forever())
    stop_task = asyncio.create_task(stop_event.wait())

    serve_error: BaseException | None = None
    try:
        # Wake on whichever comes first: a shutdown signal, or the serve
        # task ending on its own.  Waiting on the event alone would hang
        # forever if the listener crashed.
        done, _ = await asyncio.wait(
            {stop_task, serve_task}, return_when=asyncio.FIRST_COMPLETED,
        )
        if stop_task in done:
            log.info("bus.worker: shutdown signal received")
        else:
            if not serve_task.cancelled():
                serve_error = serve_task.exception()
            log.error("bus.worker: server task exited unexpectedly: %r", serve_error)
    finally:
        background = (stop_task, heartbeat_task, serve_task)
        for task in background:
            task.cancel()
        for task in background:
            try:
                await task
            except (asyncio.CancelledError, Exception):  # noqa: BLE001 - draining shutdown
                pass
        await server.close()
        log.info("bus.worker: stopped")

    if serve_error is not None:
        raise serve_error
def _ensure_parent(path: pathlib.Path) -> None:
    """Make sure the directory that will contain *path* exists.

    An existing parent is left alone.  A missing parent is created (with
    any missing ancestors) only when it lies somewhere under the current
    user's home directory — the dev-box case such as ``~/.decnet``.
    System locations like ``/run/decnet`` are never created here: that is
    systemd's job, and silently creating them as the wrong user would
    cause permission confusion later.

    NOTE(review): the previous comment said only ``~/.decnet`` itself is
    auto-created, but the check has always been "anywhere under ``$HOME``".
    That behaviour is preserved; tighten it deliberately if the narrower
    rule was the intent.

    Raises:
        FileNotFoundError: if the parent is missing and not under ``$HOME``.
    """
    parent = path.parent
    if parent.exists():
        return
    # Purely lexical containment test, same semantics as ``relative_to``
    # but without an exception whose traceback would get chained onto the
    # FileNotFoundError below.
    if not parent.is_relative_to(pathlib.Path.home()):
        raise FileNotFoundError(
            f"bus socket parent {parent} does not exist; create it first"
        )
    parent.mkdir(parents=True, exist_ok=True)
@dataclass
class CanaryContext:
    """Inputs every generator/instrumenter needs to embed a working callback.

    ``callback_token`` is the unique slug; it appears verbatim in HTTP
    URLs (``<http_base>/c/<callback_token>``) and as the leftmost DNS
    label of the callback hostname, so a single slug resolves to a single
    :class:`CanaryToken` row regardless of which path the attacker
    tripped.

    NOTE(review): the placeholders in the two examples above were lost
    from the original text.  The HTTP form matches what the generators in
    this package build; the exact DNS name layout beyond "slug is the
    leftmost label" could not be recovered — confirm against the DNS
    worker.

    ``http_base`` and ``dns_zone`` come from the canary worker's
    public-facing config (``DECNET_CANARY_HTTP_BASE``,
    ``DECNET_CANARY_DNS_ZONE``).  When DNS isn't deployed,
    ``dns_zone`` is empty and instrumenters that only have a DNS
    surface (e.g. an artifact whose only realistic embed point is a
    hostname) raise.
    """

    # Unique slug identifying the token; embedded verbatim in callbacks.
    callback_token: str
    # Documented as having no trailing slash, but generators still strip
    # one defensively because operators may configure it either way.
    http_base: str  # e.g. "https://canary.example.test" — no trailing slash
    dns_zone: str = ""  # e.g. "canary.example.test"; "" disables DNS embeds
    persona: str = "linux"  # "linux" | "windows" — drives default username, path style
+ + Always raw bytes — the planter base64-encodes for the wire so + binary blobs (DOCX/PNG/PDF) survive ``docker exec sh -c`` safely. + """ + + mode: int = 0o600 + """Unix file mode. Defaults to ``0600`` because most realistic + canary placements (``~/.aws/credentials``, ``.env``, ``id_rsa``) + are operator-only. Honeydocs in user docs folders should pass + ``0o644``. + """ + + mtime_offset: int = 0 + """Seconds relative to *now* for the planted file's mtime. + + Negative values backdate the file so it doesn't look like it + appeared the moment the decky was deployed. ``-86400 * 90`` (90 + days ago) is a common choice for ``honeydoc`` artifacts; ``0`` + means "stamp it now," which is fine for ``aws_creds``-like files + that would plausibly be touched recently. + """ + + instrumenter: Optional[str] = None + """Identifier of the instrumenter that produced this artifact (for + upload-driven tokens). Mirrored into ``CanaryToken.instrumenter``. + Mutually exclusive with :attr:`generator`. + """ + + generator: Optional[str] = None + """Identifier of the generator that produced this artifact (for + synthesised tokens). Mirrored into ``CanaryToken.generator``. + Mutually exclusive with :attr:`instrumenter`. + """ + + notes: list[str] = field(default_factory=list) + """Human-readable notes about the embedding (e.g. "DOCX: injected + 1×1 remote image at relsId rId99"). Surfaced in the API + ``preview`` response so the operator sees what we did before + planting. Never leaked to the attacker-facing surface. + """ + + fingerprint_nonce: Optional[str] = None + """Per-mint HMAC nonce for fingerprint canaries; ``None`` for everything + else. Cultivator reads this and persists it on ``CanaryToken.fingerprint_nonce`` + so the worker can validate incoming ``?k=`` params. 
class CanaryInstrumenter(ABC):
    """Mutates an operator-uploaded blob to embed a callback.

    Concrete subclasses set :attr:`name` and :attr:`mime_prefixes` and
    implement :meth:`instrument`.
    """

    name: str  #: short tag — matches ``CanaryToken.instrumenter``

    #: MIME prefixes this instrumenter handles.  The factory uses these
    #: to dispatch by sniffed content-type.  Sub-string match against
    #: the prefix list (e.g. ``("application/pdf",)`` or
    #: ``("text/",)``).
    #: NOTE(review): "sub-string match" and "prefix" are not the same
    #: test — confirm which one the factory actually performs.
    #: The empty default means no MIME type is claimed.
    mime_prefixes: tuple[str, ...] = ()

    @abstractmethod
    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return the mutated bytes with the callback embedded.

        Args:
            blob: The uploaded bytes to mutate.
            ctx: Callback parameters (slug, HTTP base, DNS zone, persona).
            target_path: Presumably the in-container path the artifact is
                destined for — TODO confirm against the callers.

        MUST raise :class:`InstrumenterRejectedError` when the blob
        can't be safely mutated (corrupt zip, encrypted PDF, etc.) so
        the API can surface a 400 with the specific reason rather than
        silently shipping the original bytes.
        """
The orchestrator's planner picks a +``canary_*`` :class:`~decnet.realism.taxonomy.ContentClass` 1–3% of +the time on file ticks; this module turns that pick into a +:class:`~decnet.canary.base.CanaryArtifact` (bytes the SSH driver +plants) plus a persisted :class:`~decnet.web.db.models.CanaryToken` +row so the canary worker recognises the slug when an attacker trips +it. + +What this is NOT: it doesn't pick *when* canaries fire — that's the +realism planner's job. It doesn't decide *where* on the filesystem +the canary lands beyond what realism naming + persona conventions +already produce. It's a thin bytes-and-row factory bolted onto the +realism contract. + +Stealth (per ``feedback_stealth.md``): we never leak the +``DECNET`` literal into anything that survives to the planted file. +The underlying generators are already stealth-clean; this wrapper +must not undo that. +""" +from __future__ import annotations + +import os +import secrets as _secrets +from datetime import datetime, timezone +from typing import Any, Optional + +from decnet.canary.base import CanaryArtifact, CanaryContext +from decnet.canary.factory import get_generator +from decnet.logging import get_logger +from decnet.realism.personas import login_for +from decnet.realism.taxonomy import ContentClass, Plan + +log = get_logger("canary.cultivator") + + +# realism content_class → canary generator name. Mirrors +# :data:`decnet.canary.factory.KNOWN_GENERATORS`. 
+_CLASS_TO_GENERATOR: dict[ContentClass, str] = { + ContentClass.CANARY_AWS_CREDS: "aws_creds", + ContentClass.CANARY_ENV_FILE: "env_file", + ContentClass.CANARY_GIT_CONFIG: "git_config", + ContentClass.CANARY_SSH_KEY: "ssh_key", + ContentClass.CANARY_HONEYDOC: "honeydoc", + ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx", + ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf", + ContentClass.CANARY_MYSQL_DUMP: "mysql_dump", + ContentClass.CANARY_FINGERPRINT_HTML: "fingerprint_html", + ContentClass.CANARY_FINGERPRINT_SVG: "fingerprint_svg", +} + + +# Generator → CanaryKind. The trip surface (HTTP slug callback / DNS +# resolution / passive bait) determines how the canary worker matches +# an attacker callback to this token. Aligned with +# :data:`decnet.web.db.models.canary.CanaryKind`. +_GENERATOR_TO_KIND: dict[str, str] = { + "aws_creds": "aws_passive", # no embedded callback; passive bait + "env_file": "http", + "git_config": "http", + "honeydoc": "http", + "honeydoc_docx": "http", + "honeydoc_pdf": "http", + "ssh_key": "dns", # trip is DNS resolution of host comment + "mysql_dump": "dns", # trip is DNS resolution of subdomain + "fingerprint_html": "http", # obfuscated JS beacons GET /c/ + "fingerprint_svg": "http", # same, embedded inside SVG + + +""" + + +_ROW_POOL = ( + ("ny-app-01.corp.local", "k.tanaka", "app server", "vlan20", "primary"), + ("ny-db-01.corp.local", "ops", "postgres primary", "vlan30", "backup nightly"), + ("ny-build-02.corp.local", "ci-bot", "jenkins agent", "vlan40", ""), + ("sf-vpn-01.corp.local", "netsec", "wireguard endpoint", "vlan10", "external"), + ("ldn-mail-03.corp.local", "j.weber", "exchange edge", "vlan50", ""), + ("hk-cache-01.corp.local", "ops", "redis replica", "vlan30", "lag <1s"), + ("br-dev-04.corp.local", "m.silva", "dev sandbox", "vlan60", "ephemeral"), + ("eu-bastion-02.corp.local", "secops", "ssh jump host", "vlan10", "mfa required"), + ("us-archive-01.corp.local", "compliance", "log archive", "vlan70", "retain 7y"), +) 
class FingerprintSvgGenerator(CanaryGenerator):
    """Build a diagram-style SVG carrying the browser-fingerprinting script."""

    name = "fingerprint_svg"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Render the SVG for ``ctx.callback_token``.

        Everything that varies (region label, version number, review date,
        mint UUID, nonce) is derived from the callback token.  The returned
        artifact has an empty ``path`` and carries the nonce in
        ``fingerprint_nonce``.
        """
        token = ctx.callback_token

        # Script payload first, in the same order as before: uuid → nonce → JS.
        mint_uuid = _mint_uuid_for(token)
        nonce = nonce_for(token, mint_uuid)
        script = render_fingerprint_js(
            callback_token=token,
            http_base=ctx.http_base,
            mint_uuid=mint_uuid,
            nonce=nonce,
        )

        # Cosmetic labels for the diagram.
        region_index = _stable_int(token, "reg") % len(_REGIONS)
        version = (_stable_int(token, "ver") % 6) + 1
        review_day = (_stable_int(token, "day") % 28) + 1
        review_date = f"2026-03-{review_day:02d}"

        svg_text = _DIAGRAM_TEMPLATE.format(
            region=_REGIONS[region_index],
            ver=version,
            review=review_date,
            payload=script,
        )

        beacon_url = f"{ctx.http_base.rstrip('/')}/c/{token}"
        artifact_notes = [
            f"obfuscated fingerprinter beacons={beacon_url}",
            f"mint_uuid={mint_uuid}",
        ]
        return CanaryArtifact(
            path="",
            content=svg_text.encode("utf-8"),
            mode=0o644,
            mtime_offset=-86400 * 30,
            generator=self.name,
            fingerprint_nonce=nonce,
            notes=artifact_notes,
        )
The slug ends up in the URL path: + + [remote "origin"] + url = https://canary.example.test/c//repo.git +""" +from __future__ import annotations + +from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator + + +class GitConfigGenerator(CanaryGenerator): + name = "git_config" + + def generate(self, ctx: CanaryContext) -> CanaryArtifact: + # Strip trailing slash defensively — operator may have + # configured DECNET_CANARY_HTTP_BASE either way. + base = ctx.http_base.rstrip("/") + slug = ctx.callback_token + # The /c//repo.git suffix gives us a realistic-looking + # path the worker can route on a single ``startswith("/c/")`` + # check, while still surviving a quick grep for the slug. + url = f"{base}/c/{slug}/repo.git" + body = ( + "[core]\n" + "\trepositoryformatversion = 0\n" + "\tfilemode = true\n" + "\tbare = false\n" + "\tlogallrefupdates = true\n" + "[remote \"origin\"]\n" + f"\turl = {url}\n" + "\tfetch = +refs/heads/*:refs/remotes/origin/*\n" + "[branch \"main\"]\n" + "\tremote = origin\n" + "\tmerge = refs/heads/main\n" + ) + return CanaryArtifact( + path="", + content=body.encode("utf-8"), + mode=0o644, + mtime_offset=-86400 * 30, # checked out a month ago + generator=self.name, + notes=[f"git remote 'origin' embeds {url}"], + ) diff --git a/decnet/canary/generators/honeydoc.py b/decnet/canary/generators/honeydoc.py new file mode 100644 index 00000000..4666b878 --- /dev/null +++ b/decnet/canary/generators/honeydoc.py @@ -0,0 +1,62 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Built-in honeydoc — a minimal HTML "report" with a tracking pixel. + +This is the *fallback* honeydoc used when the operator hasn't +uploaded a real document. The HTML instrumenter handles operator +uploads via :mod:`decnet.canary.instrumenters.html`; this generator +exists so the deploy-time baseline can plant *something* convincing +without first prompting the operator to drop a file. 
+ +The realism here is intentionally modest: a Documents-folder HTML +page with internal-looking content and a 1×1 remote image at the +bottom whose ``src`` is the canary callback URL. Most desktop +HTML renderers fetch the image as soon as the file is opened in a +browser preview, so opening the doc trips the callback. + +Operators who want a richer artifact should upload their own DOCX +or PDF; the corresponding instrumenter embeds the same callback in +the appropriate format. +""" +from __future__ import annotations + +from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator + + +class HoneydocGenerator(CanaryGenerator): + name = "honeydoc" + + def generate(self, ctx: CanaryContext) -> CanaryArtifact: + base = ctx.http_base.rstrip("/") + slug = ctx.callback_token + pixel_url = f"{base}/c/{slug}" + body = ( + "\n" + "\n" + "\n" + "\n" + "Q3 Operations Review — DRAFT\n" + "\n" + "\n" + "

Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)

\n" + "

Forecast and remediation timeline below. Numbers are\n" + "preliminary and subject to revision before the all-hands.

\n" + "\n" + "\n" + "\n" + "\n" + "\n" + "
RegionIncidentsMTTR (h)
us-east143.2
us-west94.7
eu-central222.1
\n" + "

Internal contact: " + "secops@internal

\n" + f"\"\"\n" + "\n" + "\n" + ) + return CanaryArtifact( + path="", + content=body.encode("utf-8"), + mode=0o644, # docs are typically world-readable + mtime_offset=-86400 * 21, # 3 weeks ago + generator=self.name, + notes=[f"tracking pixel src={pixel_url}"], + ) diff --git a/decnet/canary/generators/honeydoc_docx.py b/decnet/canary/generators/honeydoc_docx.py new file mode 100644 index 00000000..19f4ede2 --- /dev/null +++ b/decnet/canary/generators/honeydoc_docx.py @@ -0,0 +1,134 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Real-DOCX honeydoc generator. + +Synthesises a minimal but structurally valid DOCX from scratch via +stdlib :mod:`zipfile`, then uses the same external-image relationship +trick that powers :mod:`decnet.canary.instrumenters.docx` to embed +the callback URL. No python-docx dependency. + +The output opens cleanly in Word / LibreOffice; both fetch the +external image relationship on document load. +""" +from __future__ import annotations + +import io +import zipfile + +from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator +from decnet.canary.instrumenters.docx import _drawing, _next_rid + + +_CONTENT_TYPES = ( + '' + '' + '' + '' + '' + '' +).encode() + +_PACKAGE_RELS = ( + '' + '' + '' + '' +).encode() + +_BODY_PARAGRAPHS = ( + "Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)", + "", + "Forecast and remediation timeline below. Numbers are preliminary " + "and subject to revision before the all-hands.", + "", + "Region Incidents MTTR (h)", + "us-east 14 3.2", + "us-west 9 4.7", + "eu-central 22 2.1", + "", + "Internal contact: secops@internal", +) + + +def _document_xml(rid_with_drawing: str | None = None) -> bytes: + """Build the body XML. + + ``rid_with_drawing`` is the rId of the external image relationship; + when set, we append the same ```` element that the DOCX + instrumenter inserts so the body references the external resource. 
+ """ + paragraphs = [] + for line in _BODY_PARAGRAPHS: + if line: + paragraphs.append( + "" + + _xml_escape(line) + + "" + ) + else: + paragraphs.append("") + body = "".join(paragraphs) + drawing = _drawing(rid_with_drawing).decode() if rid_with_drawing else "" + return ( + '' + '' + f'{body}{drawing}' + '' + ).encode() + + +def _xml_escape(s: str) -> str: + return ( + s.replace("&", "&") + .replace("<", "<") + .replace(">", ">") + ) + + +def _document_rels(rid: str, url: str) -> bytes: + return ( + '' + '' + f'' + '' + ).encode() + + +class HoneydocDocxGenerator(CanaryGenerator): + name = "honeydoc_docx" + + def generate(self, ctx: CanaryContext) -> CanaryArtifact: + url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}" + # Pick a stable rId — there's only one relationship in the + # synthesised file, so any unused id works. Reuse the + # instrumenter's allocator against the bare relationships + # skeleton for parity with operator-uploaded DOCX flow. + skeleton = ( + b'' + b'' + b'' + ) + rid = _next_rid(skeleton) + + out = io.BytesIO() + with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("[Content_Types].xml", _CONTENT_TYPES) + zf.writestr("_rels/.rels", _PACKAGE_RELS) + zf.writestr("word/document.xml", _document_xml(rid)) + zf.writestr("word/_rels/document.xml.rels", _document_rels(rid, url)) + + return CanaryArtifact( + path="", + content=out.getvalue(), + mode=0o644, + mtime_offset=-86400 * 21, + generator=self.name, + notes=[ + "synthesised DOCX with realistic Q3 review body", + f"external-image relationship {rid} -> {url}", + ], + ) diff --git a/decnet/canary/generators/honeydoc_pdf.py b/decnet/canary/generators/honeydoc_pdf.py new file mode 100644 index 00000000..1f8857fe --- /dev/null +++ b/decnet/canary/generators/honeydoc_pdf.py @@ -0,0 +1,128 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Real-PDF honeydoc generator (uses :mod:`pikepdf`). 
+ +Builds a one-page PDF with the same Q3-review body as the HTML/DOCX +flavors and installs an ``/OpenAction`` ``/URI`` action on the +catalog so most viewers fire the callback the moment the document +opens. + +Pikepdf is now a hard dependency for this generator (the operator +installed it explicitly so we can use it). We still surface a +clear :class:`InstrumenterRejectedError` when imports fail, so a +deployment without pikepdf can fall back to the DOCX or HTML +generators rather than crashing the API. +""" +from __future__ import annotations + +import io + +from decnet.canary.base import ( + CanaryArtifact, + CanaryContext, + CanaryGenerator, + InstrumenterRejectedError, +) + + +_BODY_LINES = ( + ("Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)", 14), + ("", 12), + ("Forecast and remediation timeline below.", 11), + ("Numbers are preliminary, subject to revision.", 11), + ("", 12), + ("Region Incidents MTTR (h)", 11), + ("us-east 14 3.2", 11), + ("us-west 9 4.7", 11), + ("eu-central 22 2.1", 11), + ("", 12), + ("Internal contact: secops@internal", 11), +) + + +class HoneydocPdfGenerator(CanaryGenerator): + name = "honeydoc_pdf" + + def generate(self, ctx: CanaryContext) -> CanaryArtifact: + try: + from pikepdf import Pdf, Name, Dictionary, String + except ImportError as e: + raise InstrumenterRejectedError( + "honeydoc_pdf requires pikepdf; install it (`pip install " + "pikepdf`) or pick honeydoc / honeydoc_docx instead." + ) from e + + url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}" + + pdf = Pdf.new() + # Helvetica is one of the 14 PDF base fonts — every viewer ships + # it, so no font embedding is required. + font = pdf.make_indirect(Dictionary( + Type=Name("/Font"), + Subtype=Name("/Type1"), + BaseFont=Name("/Helvetica"), + )) + + # Build a single content stream that writes each body line at a + # decreasing y-coordinate. 
PDF coordinates start at the bottom- + # left (US Letter = 612 x 792 points); we lay out lines roughly + # 18 points apart starting near the top. + ops: list[str] = ["BT /F1 12 Tf 72 750 Td"] + first = True + for line, size in _BODY_LINES: + if not first: + ops.append("0 -18 Td") + first = False + ops.append(f"/F1 {size} Tf") + ops.append(f"({_pdf_escape(line)}) Tj") + ops.append("ET") + content_bytes = "\n".join(ops).encode("latin-1") + + content_stream = pdf.make_stream(content_bytes) + + page = pdf.add_blank_page(page_size=(612, 792)) + page[Name("/Resources")] = Dictionary( + Font=Dictionary(F1=font), + ) + page[Name("/Contents")] = content_stream + + # OpenAction fires the URI when the file is opened in Acrobat, + # Preview, the browser PDF viewer, etc. Most viewers prompt + # before fetching; that prompt itself is a tell, and an + # auto-allow viewer fetches silently. + pdf.Root[Name("/OpenAction")] = Dictionary( + Type=Name("/Action"), + S=Name("/URI"), + URI=String(url), + ) + + out = io.BytesIO() + pdf.save(out) + return CanaryArtifact( + path="", + content=out.getvalue(), + mode=0o644, + mtime_offset=-86400 * 21, + generator=self.name, + notes=[ + "synthesised one-page PDF with realistic Q3 review body", + f"/OpenAction /URI -> {url}", + ], + ) + + +def _pdf_escape(s: str) -> str: + """Escape parens and backslashes for PDF literal-string syntax. + + PDF string literals are wrapped in ``( … )``; inner ``(``, ``)``, + and ``\\`` need backslash escapes. Everything else (including + UTF-8 multibyte sequences) round-trips fine because Helvetica's + encoding is WinAnsi-ish — we'll lose exotic glyphs but the + realistic body sticks to ASCII anyway. Em-dashes are downgraded + to ``--`` to avoid the WinAnsi gap. 
+ """ + return ( + s.replace("\\", r"\\") + .replace("(", r"\(") + .replace(")", r"\)") + .replace("—", "--") + ) diff --git a/decnet/canary/generators/mysql_dump.py b/decnet/canary/generators/mysql_dump.py new file mode 100644 index 00000000..14a476cd --- /dev/null +++ b/decnet/canary/generators/mysql_dump.py @@ -0,0 +1,191 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Fake ``mysqldump`` output that phones home on import. + +Mirrors the Canarytokens.org MySQL-dump trick. When a victim runs +``mysql < dump.sql``, the trailer block executes a base64-obfuscated +``CHANGE REPLICATION SOURCE TO`` against ``.canary.`` +followed by ``START REPLICA``. The victim's MySQL daemon then: + +1. Resolves the slug subdomain via DNS — this is the trip our + :mod:`decnet.canary.dns_server` already detects. +2. Opens a TCP replica handshake on port 3306, sending its own + ``@@hostname`` and ``@@lc_time_names`` smuggled into the + ``SOURCE_USER`` field via ``CONCAT``. Capturing those bytes + requires a MySQL handshake responder on the worker — out of scope + for v1; the DNS lookup alone is sufficient for detection. + +The base64 wrapper is the camouflage: a plain ``grep canary dump.sql`` +finds nothing. The slug only materialises when the victim's server +runs ``PREPARE … FROM @s2``. + +Because the trip surface is DNS, this generator REQUIRES a non-empty +``dns_zone``. The slug must appear as the leftmost label of the +hostname so a single DNS query identifies the token; the http_base +host is not slug-bearing and can't substitute. +""" +from __future__ import annotations + +import base64 +import hashlib + +from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator + + +def _stable_hex(seed: str, prefix: str = "", length: int = 16) -> str: + h = hashlib.sha256((prefix + seed).encode()).hexdigest() + return h[:length] + + +def _build_replica_payload(slug: str, dns_zone: str) -> str: + """Inner SQL that gets base64-wrapped. 
+ + The CONCAT splices ``@@lc_time_names`` and ``@@hostname`` into the + ``SOURCE_USER`` value at PREPARE time so the victim's locale and + hostname travel as the replica username on the 3306 handshake. + """ + host = f"{slug}.{dns_zone}" + return ( + "SET @bb = CONCAT(" + "\"CHANGE REPLICATION SOURCE TO " + "SOURCE_PASSWORD='replica-pw', " + "SOURCE_RETRY_COUNT=1, " + "SOURCE_PORT=3306, " + f"SOURCE_HOST='{host}', " + "SOURCE_SSL=0, " + f"SOURCE_USER='{slug}\", " + "@@lc_time_names, @@hostname, \"';\");" + ) + + +def _build_trailer(slug: str, dns_zone: str) -> str: + inner = _build_replica_payload(slug, dns_zone) + encoded = base64.b64encode(inner.encode("utf-8")).decode("ascii") + return ( + f"SET @b = '{encoded}';\n" + "SET @s2 = FROM_BASE64(@b);\n" + "PREPARE stmt1 FROM @s2;\n" + "EXECUTE stmt1;\n" + "PREPARE stmt2 FROM @bb;\n" + "EXECUTE stmt2;\n" + "START REPLICA;\n" + ) + + +class MySQLDumpGenerator(CanaryGenerator): + name = "mysql_dump" + + def generate(self, ctx: CanaryContext) -> CanaryArtifact: + if not ctx.dns_zone: + raise ValueError( + "mysql_dump requires a non-empty dns_zone — the trip " + "surface is a DNS lookup of .." + ) + slug = ctx.callback_token + zone = ctx.dns_zone + host = f"{slug}.{zone}" + + # Realism filler: deterministic per-slug fake user rows so two + # runs with the same context produce byte-identical output + # (planter idempotency contract). + u1_hash = _stable_hex(slug, "u1:", 32) + u2_hash = _stable_hex(slug, "u2:", 32) + api_token = _stable_hex(slug, "api:", 40) + + # Synthesised SQL bait below — never executed by us, only by + # whoever runs ``mysql < dump.sql`` against their own server. + # Built with .format() instead of f-strings so bandit's B608 + # heuristic doesn't false-positive on the "INSERT INTO" + var + # pattern. 
+ users_insert = ( + "INSERT INTO `users` VALUES " # nosec B608 + "(1,'alice@app.internal','$2y$10${u1a}.{u1b}','2024-11-12 09:13:44')," + "(2,'bob@app.internal','$2y$10${u2a}.{u2b}','2025-02-03 17:42:08');\n" + ).replace("{u1a}", u1_hash[:22]).replace("{u1b}", u1_hash[22:]) \ + .replace("{u2a}", u2_hash[:22]).replace("{u2b}", u2_hash[22:]) + api_keys_insert = ( + "INSERT INTO `api_keys` VALUES (1,1,'{tok}');\n" # nosec B608 + ).replace("{tok}", api_token) + header = ( + "-- MySQL dump 10.13 Distrib 8.0.35, for Linux (x86_64)\n" + "--\n" + "-- Host: db-prod-01 Database: app_production\n" + "-- ------------------------------------------------------\n" + "-- Server version\t8.0.35\n" + "\n" + "/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;\n" + "/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;\n" + "/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;\n" + "/*!50503 SET NAMES utf8mb4 */;\n" + "/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;\n" + "/*!40103 SET TIME_ZONE='+00:00' */;\n" + "/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;\n" + "/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;\n" + "/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;\n" + "/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;\n" + "\n" + "--\n" + "-- Table structure for table `users`\n" + "--\n" + "\n" + "DROP TABLE IF EXISTS `users`;\n" + "CREATE TABLE `users` (\n" + " `id` int unsigned NOT NULL AUTO_INCREMENT,\n" + " `email` varchar(255) NOT NULL,\n" + " `password_hash` char(60) NOT NULL,\n" + " `created_at` datetime NOT NULL,\n" + " PRIMARY KEY (`id`),\n" + " UNIQUE KEY `uniq_email` (`email`)\n" + ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;\n" + "\n" + "LOCK TABLES `users` WRITE;\n" + + users_insert + + "UNLOCK TABLES;\n" + "\n" + "--\n" + "-- Table structure for table `api_keys`\n" + "--\n" + "\n" + "DROP TABLE IF EXISTS `api_keys`;\n" + "CREATE TABLE 
`api_keys` (\n" + " `id` int unsigned NOT NULL AUTO_INCREMENT,\n" + " `user_id` int unsigned NOT NULL,\n" + " `token` char(40) NOT NULL,\n" + " PRIMARY KEY (`id`)\n" + ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;\n" + "\n" + "LOCK TABLES `api_keys` WRITE;\n" + + api_keys_insert + + "UNLOCK TABLES;\n" + "\n" + ) + + trailer_replica = _build_trailer(slug, zone) + + trailer_close = ( + "\n" + "/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;\n" + "/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;\n" + "/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;\n" + "/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;\n" + "/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;\n" + "/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;\n" + "/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;\n" + "/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;\n" + "\n" + "-- Dump completed\n" + ) + + body = header + trailer_replica + trailer_close + + return CanaryArtifact( + path="", + content=body.encode("utf-8"), + mode=0o600, + mtime_offset=-86400 * 7, # last week's backup + generator=self.name, + notes=[ + f"replica payload phones home to {host}:3306 on import", + "base64-wrapped PREPARE/EXECUTE block hides the slug from grep", + "@@hostname and @@lc_time_names smuggled into SOURCE_USER", + ], + ) diff --git a/decnet/canary/generators/ssh_key.py b/decnet/canary/generators/ssh_key.py new file mode 100644 index 00000000..3ab784d3 --- /dev/null +++ b/decnet/canary/generators/ssh_key.py @@ -0,0 +1,69 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Fake SSH private key with the callback host in the comment. + +OpenSSH private keys carry a free-form comment field — typically +``user@host`` — that's preserved across rounds of ``ssh-keygen -p``. +We embed the canary host as the ``user@host`` so an attacker who +imports the key into their own keyring or runs ``ssh-keygen -lf`` on +it sees a hostname they may then try to reach. 
+ +The key bytes themselves are syntactically valid (PEM envelope, base64 +body) but cryptographically junk — the body is a deterministic SHA-256 +hash of the slug repeated to the right length. We don't ship a real +RSA/Ed25519 key because (a) we don't want a real private key sitting +on disk pretending to be valuable, and (b) the attacker ``cat``-ing +the file or running ``ssh -i`` will trigger the callback regardless +of cryptographic validity. + +The DNS-callback variant uses ``.canary.`` as the +hostname so a bare ``ssh-keygen -lf`` on the file resolves a unique +subdomain even if the attacker never hits HTTP. +""" +from __future__ import annotations + +import base64 +import hashlib + +from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator + + +def _fake_key_body(seed: str) -> str: + # Real OpenSSH keys are several hundred base64 chars; we make a + # plausible-looking 24-line block from a SHA-256-derived stream. + h = hashlib.sha256(seed.encode()).digest() + long_stream = (h * 32)[:768] # 768 bytes → ~1024 base64 chars + encoded = base64.b64encode(long_stream).decode() + # Wrap at 70 chars per line — same shape ``ssh-keygen`` produces. + return "\n".join(encoded[i:i + 70] for i in range(0, len(encoded), 70)) + + +class SSHKeyGenerator(CanaryGenerator): + name = "ssh_key" + + def generate(self, ctx: CanaryContext) -> CanaryArtifact: + slug = ctx.callback_token + body = _fake_key_body(slug) + # Hostname for the comment: prefer DNS-zone form when the + # operator has DNS deployed (so ssh-keygen -lf names a subdomain + # the attacker may resolve); fall back to the http_base host + # otherwise. 
+ if ctx.dns_zone: + host_comment = f"deploy@{slug}.{ctx.dns_zone}" + else: + from urllib.parse import urlparse + host = urlparse(ctx.http_base).hostname or "deploy.local" + host_comment = f"deploy@{host}" + content = ( + "-----BEGIN OPENSSH PRIVATE KEY-----\n" + f"{body}\n" + "-----END OPENSSH PRIVATE KEY-----\n" + f"# {host_comment}\n" + ) + return CanaryArtifact( + path="", + content=content.encode("utf-8"), + mode=0o600, + mtime_offset=-86400 * 60, # 2 months ago + generator=self.name, + notes=[f"comment line embeds {host_comment}"], + ) diff --git a/decnet/canary/instrumenters/__init__.py b/decnet/canary/instrumenters/__init__.py new file mode 100644 index 00000000..5bcb00a9 --- /dev/null +++ b/decnet/canary/instrumenters/__init__.py @@ -0,0 +1,5 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Built-in canary instrumenters (operator-uploaded artifact mutation). + +Lazy-imported by :func:`decnet.canary.factory.get_instrumenter`. +""" diff --git a/decnet/canary/instrumenters/docx.py b/decnet/canary/instrumenters/docx.py new file mode 100644 index 00000000..baaeb512 --- /dev/null +++ b/decnet/canary/instrumenters/docx.py @@ -0,0 +1,148 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""DOCX instrumenter — inject a remote image into the body. + +DOCX files are zip archives carrying ``word/document.xml`` (the body) +and ``word/_rels/document.xml.rels`` (the relationship table that +maps ``rId`` references to URLs). We: + +1. Add a new relationship of type ``image`` whose target is the + canary callback URL and ``TargetMode="External"``. +2. Add a tiny ```` element referencing that ``rId`` at + the end of ``word/document.xml`` (just before ````). + +Word and LibreOffice both fetch external image relationships when +the document is opened (subject to the user's "trusted source" +toggle, which most enterprise environments disable in favour of +"warn but allow"). 
+ +We use stdlib ``zipfile`` only — no python-docx dependency — because +the surface we touch is two small XML files and we don't need any of +the higher-level abstractions. +""" +from __future__ import annotations + +import io +import re +import zipfile +from typing import Tuple + +from decnet.canary.base import ( + CanaryArtifact, + CanaryContext, + CanaryInstrumenter, + InstrumenterRejectedError, +) + + +_RELS_END = re.compile(rb"", re.IGNORECASE) +_BODY_END = re.compile(rb"", re.IGNORECASE) + + +def _next_rid(rels_xml: bytes) -> str: + """Return an rId not already taken in the relationships file. + + Word's loader tolerates non-sequential ids, so we just pick one + well above the typical range to avoid collisions. + """ + used = set(m.group(1).decode() for m in re.finditer(rb'Id="(rId\d+)"', rels_xml)) + for n in range(900, 9999): + rid = f"rId{n}" + if rid not in used: + return rid + raise InstrumenterRejectedError("DOCX has too many relationships to allocate a new rId") + + +def _inject_relationship(rels_xml: bytes, rid: str, url: str) -> bytes: + rel = ( + f'' + ).encode() + match = _RELS_END.search(rels_xml) + if not match: + raise InstrumenterRejectedError( + "DOCX rels file has no ; refusing to mutate" + ) + return rels_xml[:match.start()] + rel + rels_xml[match.start():] + + +def _drawing(rid: str) -> bytes: + # Minimal w:drawing tree referencing the external image at rid. + # Dimensions are 1 EMU x 1 EMU so the image is invisible; Word + # still fetches the resource on document load. 
+ return ( + '' + '' + '' + '' + '' + '' + '' + '' + f'' + '' + '' + '' + '' + '' + '' + ).encode() + + +def _inject_drawing(document_xml: bytes, rid: str) -> bytes: + match = _BODY_END.search(document_xml) + if not match: + raise InstrumenterRejectedError("DOCX document.xml has no ") + drawing = _drawing(rid) + return document_xml[:match.start()] + drawing + document_xml[match.start():] + + +def _mutate(blob: bytes, url: str) -> Tuple[bytes, str]: + try: + with zipfile.ZipFile(io.BytesIO(blob), "r") as zf: + try: + rels = zf.read("word/_rels/document.xml.rels") + doc = zf.read("word/document.xml") + except KeyError as e: + raise InstrumenterRejectedError( + f"DOCX missing expected member: {e.args[0]!r}" + ) from e + members = [(zi, zf.read(zi.filename)) for zi in zf.infolist()] + except zipfile.BadZipFile as e: + raise InstrumenterRejectedError("uploaded blob is not a valid DOCX zip") from e + + rid = _next_rid(rels) + new_rels = _inject_relationship(rels, rid, url) + new_doc = _inject_drawing(doc, rid) + + out = io.BytesIO() + with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf_out: + for zi, data in members: + if zi.filename == "word/_rels/document.xml.rels": + zf_out.writestr(zi.filename, new_rels) + elif zi.filename == "word/document.xml": + zf_out.writestr(zi.filename, new_doc) + else: + zf_out.writestr(zi, data) + return out.getvalue(), rid + + +class DocxInstrumenter(CanaryInstrumenter): + name = "docx" + mime_prefixes = ( + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + + def instrument( + self, blob: bytes, ctx: CanaryContext, *, target_path: str, + ) -> CanaryArtifact: + url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}" + mutated, rid = _mutate(blob, url) + return CanaryArtifact( + path=target_path, + content=mutated, + mode=0o644, + mtime_offset=-86400 * 14, + instrumenter=self.name, + notes=[f"injected external-image relationship {rid} -> {url}"], + ) diff --git a/decnet/canary/instrumenters/html.py 
b/decnet/canary/instrumenters/html.py new file mode 100644 index 00000000..21194af8 --- /dev/null +++ b/decnet/canary/instrumenters/html.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""HTML instrumenter — append a 1×1 tracking pixel. + +Stdlib-only. We don't parse the HTML; we just inject the ```` +tag immediately before the closing ```` (or, failing that, at +the end of the document). Most renderers that support remote images +(email previewers, IDE doc previews, browsers) will fetch it as +soon as the document is opened. +""" +from __future__ import annotations + +import re + +from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryInstrumenter + + +_BODY_CLOSE = re.compile(rb"", re.IGNORECASE) + + +class HtmlInstrumenter(CanaryInstrumenter): + name = "html" + mime_prefixes = ("text/html", "application/xhtml+xml") + + def instrument( + self, blob: bytes, ctx: CanaryContext, *, target_path: str, + ) -> CanaryArtifact: + url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}".encode() + pixel = ( + b"\n" + ) + match = _BODY_CLOSE.search(blob) + if match: + out = blob[:match.start()] + pixel + blob[match.start():] + note = "injected 1x1 pixel before " + else: + out = (blob if blob.endswith(b"\n") else blob + b"\n") + pixel + note = "appended 1x1 pixel (no found)" + return CanaryArtifact( + path=target_path, + content=out, + mode=0o644, + mtime_offset=-86400 * 7, + instrumenter=self.name, + notes=[note, f"pixel src={url.decode()}"], + ) diff --git a/decnet/canary/instrumenters/image.py b/decnet/canary/instrumenters/image.py new file mode 100644 index 00000000..1c4a9673 --- /dev/null +++ b/decnet/canary/instrumenters/image.py @@ -0,0 +1,73 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Image instrumenter — requires :mod:`PIL` (optional dependency). 
+ +For PNG/JPEG/GIF we append a tEXt/EXIF chunk carrying the slug so +``exiftool`` / ``identify -verbose`` surface the slug, then route the +detection via a sibling **plain-text companion file**. The image +itself can't really embed an HTTP fetcher — image decoders don't +run network requests on decode — so the realistic detection surface +is "attacker exfils the image, runs metadata tools on it, hits our +URL when curious about the embedded marker." + +When Pillow isn't installed we reject and direct the operator to +``passthrough`` (which preserves the bytes; the slug then lives in +the filename only). +""" +from __future__ import annotations + +import io + +from decnet.canary.base import ( + CanaryArtifact, + CanaryContext, + CanaryInstrumenter, + InstrumenterRejectedError, +) + + +class ImageInstrumenter(CanaryInstrumenter): + name = "image" + mime_prefixes = ("image/png", "image/jpeg", "image/gif") + + def instrument( + self, blob: bytes, ctx: CanaryContext, *, target_path: str, + ) -> CanaryArtifact: + try: + from PIL import Image, PngImagePlugin + except ImportError as e: + raise InstrumenterRejectedError( + "image instrumenter requires Pillow; install it (`pip " + "install Pillow`) or re-upload the artifact with " + "kind=passthrough so it ships unmodified." + ) from e + + slug_url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}" + try: + buf_in = io.BytesIO(blob) + img = Image.open(buf_in) + fmt = (img.format or "").upper() + buf_out = io.BytesIO() + if fmt == "PNG": + meta = PngImagePlugin.PngInfo() + meta.add_text("Comment", f"reference: {slug_url}") + meta.add_text("X-Canary", ctx.callback_token) + img.save(buf_out, format="PNG", pnginfo=meta) + elif fmt in ("JPEG", "JPG"): + # Pillow encodes JPEG comments via the ``comment`` kwarg. + img.save(buf_out, format="JPEG", comment=slug_url.encode()) + else: + # GIF and friends — Pillow doesn't expose comment metadata + # uniformly. Re-encode as-is and skip the metadata embed. 
+ img.save(buf_out, format=fmt or "PNG") + mutated = buf_out.getvalue() + except Exception as e: + raise InstrumenterRejectedError(f"failed to instrument image: {e!s}") from e + + return CanaryArtifact( + path=target_path, + content=mutated, + mode=0o644, + mtime_offset=-86400 * 30, + instrumenter=self.name, + notes=[f"image metadata carries {slug_url} (slug={ctx.callback_token})"], + ) diff --git a/decnet/canary/instrumenters/passthrough.py b/decnet/canary/instrumenters/passthrough.py new file mode 100644 index 00000000..e240a277 --- /dev/null +++ b/decnet/canary/instrumenters/passthrough.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Passthrough instrumenter — bytes go to disk unchanged. + +Used as the dispatch fallback for content types we can't safely +mutate (random binary blobs, container images, archives we don't +recognise). In passthrough mode the only callback surface is the +:attr:`CanaryToken.placement_path` itself: the operator must use a +DNS-callback token whose slug appears in the filename, so a +listing/access at the OS level resolves the slug as part of the +path (e.g. ``/etc/.canary.example.test/secrets.bin``) when +the attacker greps for hostnames in their loot. + +The instrumenter does not enforce that — the API does, when it sees +``instrumenter=passthrough`` with ``kind=http`` it returns 400. 
class PassthroughInstrumenter(CanaryInstrumenter):
    """Instrumenter that ships the uploaded bytes exactly as received."""

    name = "passthrough"
    mime_prefixes = ()  # dispatched by fallback in pick_instrumenter_for_mime

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Wrap *blob* in a :class:`CanaryArtifact` without modifying it.

        ``ctx`` is accepted for interface parity and is not read here.
        """
        one_week_back = -86400 * 7
        note = (
            "passthrough: bytes unchanged — only DNS-callback tokens "
            "trip detection (slug must live in the placement path)"
        )
        artifact = CanaryArtifact(
            path=target_path,
            content=blob,
            mode=0o644,
            mtime_offset=one_week_back,
            instrumenter=self.name,
            notes=[note],
        )
        return artifact
class PdfInstrumenter(CanaryInstrumenter):
    """Install a document-open URI action pointing at the callback URL."""

    name = "pdf"
    mime_prefixes = ("application/pdf",)

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return *blob* re-saved by pikepdf with ``/OpenAction`` set.

        Raises :class:`InstrumenterRejectedError` when pikepdf is missing
        or when opening / saving the document fails for any reason.
        """
        try:
            import pikepdf
        except ImportError as e:
            raise InstrumenterRejectedError(
                "PDF instrumenter requires pikepdf; install it (`pip "
                "install pikepdf`) or re-upload the artifact with "
                "kind=passthrough so it ships unmodified."
            ) from e

        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        try:
            import io

            sink = io.BytesIO()
            with pikepdf.open(io.BytesIO(blob)) as pdf:
                # URI action fired when the document is opened. Viewers
                # that prompt first are acceptable; viewers that
                # auto-allow fetch the URL silently.
                open_action = pikepdf.Dictionary(
                    Type=pikepdf.Name("/Action"),
                    S=pikepdf.Name("/URI"),
                    URI=pikepdf.String(url),
                )
                pdf.Root[pikepdf.Name("/OpenAction")] = open_action
                pdf.save(sink)
            mutated = sink.getvalue()
        except Exception as e:
            raise InstrumenterRejectedError(
                f"failed to instrument PDF: {e!s}"
            ) from e

        return CanaryArtifact(
            path=target_path,
            content=mutated,
            mode=0o644,
            mtime_offset=-86400 * 14,
            instrumenter=self.name,
            notes=[f"installed /OpenAction /URI -> {url}"],
        )
_SLASH_HINTS = (b"//", b"function ", b"const ", b"let ", b"var ")
_SEMI_HINTS = (b"[default]", b"[section]", b"\n[")


def _comment_prefix(blob: bytes) -> bytes:
    """Guess the line-comment prefix for *blob* from its first 512 bytes.

    Returns ``b"; "`` when an INI-style hint is present, ``b"// "`` when
    a JavaScript-style hint is present, and ``b"# "`` otherwise.
    """
    head = blob[:512]
    if any(h in head for h in _SEMI_HINTS):
        return b"; "
    # A URL scheme separator ("https://") is not a comment marker. Without
    # this, any YAML / shell / .env file that mentions a URL would get a
    # "// ..." line appended, which is not a comment in those syntaxes.
    code_view = head.replace(b"://", b":")
    if any(h in code_view for h in _SLASH_HINTS):
        return b"// "
    # Default to # — the most common comment glyph across config files
    # we'd plausibly canary.
    return b"# "
# Candidate relationship parts, in order of preference.
_RELS_PATHS = (
    "xl/_rels/workbook.xml.rels",
    "xl/_rels/sharedStrings.xml.rels",
)


def _mutate(blob: bytes, url: str) -> Tuple[bytes, str, str]:
    """Inject an external relationship to *url* into an XLSX archive.

    Returns ``(new_zip_bytes, relationship_id, rels_member_name)``.
    Raises :class:`InstrumenterRejectedError` when *blob* is not a zip or
    contains none of the parts listed in ``_RELS_PATHS``.
    """
    try:
        with zipfile.ZipFile(io.BytesIO(blob), "r") as zf:
            members = [(zi, zf.read(zi.filename)) for zi in zf.infolist()]
    except zipfile.BadZipFile as e:
        raise InstrumenterRejectedError("uploaded blob is not a valid XLSX zip") from e

    # Pick by declared preference, not by archive order: when both parts
    # exist, the workbook relationships file must win regardless of
    # where the zip writer happened to place it.
    present = {zi.filename for zi, _ in members}
    target_rels = next((p for p in _RELS_PATHS if p in present), None)
    if not target_rels:
        raise InstrumenterRejectedError(
            "XLSX has no workbook relationships file to mutate"
        )

    out_members = []
    rid = ""
    for zi, data in members:
        if zi.filename == target_rels:
            rid = _next_rid(data)
            data = _inject_relationship(data, rid, url)
        out_members.append((zi, data))

    out = io.BytesIO()
    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf_out:
        # Passing the original ZipInfo keeps each member's name and
        # recorded metadata in the rewritten archive.
        for zi, data in out_members:
            zf_out.writestr(zi, data)
    return out.getvalue(), rid, target_rels


class XlsxInstrumenter(CanaryInstrumenter):
    """Add an external-image relationship to a workbook's ``.rels`` part."""

    name = "xlsx"
    mime_prefixes = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return the mutated workbook as a :class:`CanaryArtifact`.

        Propagates :class:`InstrumenterRejectedError` from :func:`_mutate`.
        """
        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        mutated, rid, target_rels = _mutate(blob, url)
        return CanaryArtifact(
            path=target_path,
            content=mutated,
            mode=0o644,
            mtime_offset=-86400 * 14,
            instrumenter=self.name,
            notes=[
                f"injected external-image relationship {rid} into "
                f"{target_rels} -> {url}",
            ],
        )
_HELPER = Path(__file__).parent / "_obfuscate_helper.js"
_PAYLOAD = Path(__file__).parent / "fingerprint_payload.js"

# Node binary path. Honor DECNET_NODE_BIN so deployments can pin a
# specific runtime; default to PATH lookup.
_NODE_BIN = os.environ.get("DECNET_NODE_BIN", "node")

# Hard timeout for the obfuscator subprocess. Real runs on the
# fingerprint payload sit well under 5s on a dev box.
_TIMEOUT_S = 30


class ObfuscatorError(RuntimeError):
    """Raised when the Node helper fails or returns empty output."""


class FingerprintSecretMissing(RuntimeError):
    """Raised when ``DECNET_CANARY_FINGERPRINT_SECRET`` is unset.

    Fingerprint canaries embed a per-mint nonce derived from this
    server-side secret; without it the worker cannot validate incoming
    fingerprint beacons, so we fail loud at mint time rather than ship
    a defeatable canary.
    """


_FINGERPRINT_SECRET_ENV = "DECNET_CANARY_FINGERPRINT_SECRET"  # nosec B105 — this is an env var name, not a hardcoded password


def nonce_for(callback_token: str, mint_uuid: str) -> str:
    """Compute the per-mint fingerprint nonce.

    HMAC-SHA256 keyed on the server-side master secret, message is
    ``callback_token + "|" + mint_uuid``, truncated to the first 16 hex
    characters (64 bits).

    Raises :class:`FingerprintSecretMissing` when the secret env var is
    unset or empty.
    """
    secret = os.environ.get(_FINGERPRINT_SECRET_ENV, "")
    if not secret:
        raise FingerprintSecretMissing(
            f"{_FINGERPRINT_SECRET_ENV} is unset; fingerprint canaries cannot mint"
        )
    msg = f"{callback_token}|{mint_uuid}".encode("utf-8")
    return hmac.new(secret.encode("utf-8"), msg, hashlib.sha256).hexdigest()[:16]


def _seed_from_token(callback_token: str) -> int:
    """Derive a 31-bit non-negative seed from the callback token.

    Takes the first four bytes of the token's SHA-256 digest as a
    big-endian integer and clears the top bit.
    """
    h = hashlib.sha256(callback_token.encode("utf-8")).digest()
    return int.from_bytes(h[:4], "big") & 0x7FFFFFFF


def _config_from_seed(seed: int) -> dict[str, Any]:
    """Build a deterministic, per-mint obfuscator config from *seed*.

    Individual bits of the seed select the string-array encoding, the
    two thresholds, the split-string chunk length and three boolean
    transforms, so the same seed always yields the same dict.
    """
    bits = seed
    encodings = ("base64", "rc4")
    string_array_encoding = [encodings[bits & 1]]
    control_flow_threshold = 0.5 + ((bits >> 1) & 0xFF) / 512.0  # 0.5 .. ~1.0
    dead_code_threshold = 0.2 + ((bits >> 9) & 0xFF) / 512.0     # 0.2 .. ~0.7
    transform_object_keys = bool((bits >> 17) & 1)
    numbers_to_expressions = bool((bits >> 18) & 1)
    simplify = bool((bits >> 19) & 1)
    return {
        "compact": True,
        "seed": seed,
        "controlFlowFlattening": True,
        "controlFlowFlatteningThreshold": round(control_flow_threshold, 3),
        "deadCodeInjection": True,
        "deadCodeInjectionThreshold": round(dead_code_threshold, 3),
        "stringArray": True,
        "stringArrayEncoding": string_array_encoding,
        "stringArrayThreshold": 1,
        "stringArrayRotate": True,
        "stringArrayShuffle": True,
        "splitStrings": True,
        "splitStringsChunkLength": 4 + (bits & 7),
        "transformObjectKeys": transform_object_keys,
        "numbersToExpressions": numbers_to_expressions,
        "simplify": simplify,
        "selfDefending": False,  # breaks SVG embed; not worth the cost
        "renameGlobals": False,
        "identifierNamesGenerator": "mangled-shuffled",
    }


def obfuscate(code: str, *, callback_token: str) -> str:
    """Obfuscate *code* deterministically per *callback_token*.

    Sends ``{"code", "options"}`` as JSON on the helper's stdin and
    returns its stdout.

    Raises :class:`ObfuscatorError` if Node cannot be started, times
    out, exits non-zero, or returns empty output.
    """
    seed = _seed_from_token(callback_token)
    options = _config_from_seed(seed)
    payload = json.dumps({"code": code, "options": options})
    try:
        proc = subprocess.run(  # nosec B603 — argv-form, no shell, fixed helper path; payload is JSON on stdin, not in argv
            [_NODE_BIN, str(_HELPER)],
            input=payload, capture_output=True, text=True,
            # Pin the pipe encoding: the locale default would make the
            # returned text depend on the host environment.
            encoding="utf-8",
            timeout=_TIMEOUT_S, check=False,
        )
    except FileNotFoundError as e:
        raise ObfuscatorError(f"node binary not found: {_NODE_BIN!r}") from e
    except subprocess.TimeoutExpired as e:
        raise ObfuscatorError("javascript-obfuscator timed out") from e
    except OSError as e:
        # e.g. the configured binary exists but is not executable.
        raise ObfuscatorError(f"failed to launch node {_NODE_BIN!r}: {e!s}") from e
    if proc.returncode != 0:
        raise ObfuscatorError(
            f"javascript-obfuscator failed rc={proc.returncode} "
            f"stderr={proc.stderr.strip()[:400]}"
        )
    out = proc.stdout
    if not out.strip():
        raise ObfuscatorError("javascript-obfuscator returned empty output")
    return out


def render_fingerprint_js(
    *, callback_token: str, http_base: str, mint_uuid: str, nonce: str,
) -> str:
    """Build the obfuscated fingerprint JS for a single mint.

    Substitutes ``{{BEACON_URL}}``, ``{{MINT_UUID}}``, and
    ``{{MINT_NONCE}}`` in the payload template, then runs the result
    through :func:`obfuscate` keyed on the callback token.
    """
    template = _PAYLOAD.read_text(encoding="utf-8")
    beacon = f"{http_base.rstrip('/')}/c/{callback_token}"
    src = (
        template
        .replace("{{BEACON_URL}}", beacon)
        .replace("{{MINT_UUID}}", mint_uuid)
        .replace("{{MINT_NONCE}}", nonce)
    )
    return obfuscate(src, callback_token=callback_token)
_LINUX_DEFAULTS: dict[str, str] = {
    "git_config": "/home/{user}/.git/config",
    "env_file": "/home/{user}/.env",
    "ssh_key": "/home/{user}/.ssh/id_rsa",
    "aws_creds": "/home/{user}/.aws/credentials",
    "honeydoc": "/home/{user}/Documents/quarterly_report.html",
    "honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
    "honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
    "fingerprint_html": "/home/{user}/Documents/asset_directory.html",
    "fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
}

_WINDOWS_DEFAULTS: dict[str, str] = {
    "git_config": "/home/{user}/AppData/Local/Programs/Git/etc/gitconfig",
    "env_file": "/home/{user}/Desktop/prod.env",
    "ssh_key": "/home/{user}/.ssh/id_rsa",  # OpenSSH on Windows uses the same path
    "aws_creds": "/home/{user}/.aws/credentials",
    "honeydoc": "/home/{user}/Documents/quarterly_report.html",
    "honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
    "honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
    "fingerprint_html": "/home/{user}/Documents/asset_directory.html",
    "fingerprint_svg": "/home/{user}/Documents/network_topology.svg",
}


def default_user(persona: str) -> str:
    """Return the default username for *persona*.

    ``"windows"`` maps to ``DEFAULT_WINDOWS_USER``; every other value
    maps to ``DEFAULT_LINUX_USER``.
    """
    return DEFAULT_WINDOWS_USER if persona == "windows" else DEFAULT_LINUX_USER


def default_path_for(generator: str, persona: str = "linux") -> str:
    """Resolve the default placement path for a synthesized token.

    Looks *generator* up in the Windows table when ``persona`` is
    ``"windows"`` and in the Linux table otherwise, then expands
    ``{user}``. Unknown generators resolve to ``/tmp/<generator>.canary``.
    """
    table = _WINDOWS_DEFAULTS if persona == "windows" else _LINUX_DEFAULTS
    template = table.get(generator)
    if not template:
        # Unknown generator — fall back to a generic /tmp drop so the
        # planter still has somewhere to write.
        return f"/tmp/{generator}.canary"  # nosec B108 — placement inside attacker-facing decoy container, not host /tmp
    return template.format(user=default_user(persona))


def normalize_placement(path: str) -> str:
    """Validate an operator-supplied placement path.

    Rejects empty or relative paths, NUL bytes, CR/LF, and any ``..``
    path segment. No other characters are filtered here. Returns *path*
    unchanged when valid; raises :class:`ValueError` otherwise.
    """
    if not path or not path.startswith("/"):
        raise ValueError("placement_path must be absolute (start with '/')")
    if "\x00" in path:
        raise ValueError("placement_path may not contain NUL")
    if "\n" in path or "\r" in path:
        raise ValueError("placement_path may not contain newlines")
    # Compare whole segments: a substring test for "../" also rejects
    # harmless names that merely end in dots (e.g. "/srv/backup../x").
    if ".." in path.split("/"):
        raise ValueError("placement_path may not contain '..' segments")
    return path
async def _publish(
    bus: Optional[BaseBus], topic: str, payload: dict[str, Any],
) -> None:
    """Best-effort publish — never raises.

    With a caller-supplied ``bus`` the payload is published on it and
    the bus is left open. With ``bus=None`` a bus is obtained from
    :func:`get_bus`, connected, used once and closed again. Any
    exception is logged as a warning and swallowed.
    """
    try:
        if bus is not None:
            await bus.publish(topic, payload)
            return
        target = get_bus()
        await target.connect()
        try:
            await target.publish(topic, payload)
        finally:
            # Close the bus we opened even when publish() raises;
            # otherwise every failed publish leaks a connection.
            await target.close()
    except Exception as e:  # noqa: BLE001
        log.warning("canary bus publish failed topic=%s err=%s", topic, e)
async def revoke(
    decky_name: str,
    placement_path: str,
    *,
    token_uuid: str,
    repo: Optional[BaseRepository] = None,
    publish: bool = True,
    bus: Optional[BaseBus] = None,
    container: Optional[str] = None,
) -> tuple[bool, Optional[str]]:
    """Delete a planted artifact and mark its token revoked.

    Returns the ``(success, error_or_none)`` pair reported by
    :func:`delete_file_from_container`. When ``repo`` is given the token
    state is set to ``revoked`` whether or not the delete succeeded,
    with the delete error recorded on failure. When ``publish`` is True
    a ``revoked`` event is published in both cases as well.
    """
    if container:
        target_container = container
    else:
        target_container = _container_for(decky_name)

    outcome = await delete_file_from_container(target_container, placement_path)
    success, error = outcome

    if repo is not None:
        recorded_error = None if success else error
        await repo.update_canary_token_state(token_uuid, "revoked", recorded_error)

    if publish:
        event = {
            "token_id": token_uuid,
            "decky_name": decky_name,
            "placement_path": placement_path,
        }
        await _publish(bus, topics.canary(token_uuid, topics.CANARY_REVOKED), event)

    return success, error
async def seed_baseline(
    decky_name: str,
    repo: BaseRepository,
    *,
    persona: str = "linux",
    created_by: str = "system",
    bus: Optional[BaseBus] = None,
    container: Optional[str] = None,
) -> list[dict[str, Any]]:
    """Plant the configured baseline canary set on one decky.

    For each generator name from :func:`_baseline_set`: generate the
    artifact, persist a token row, then call :func:`plant`. Unknown
    generators and generators that raise while generating are logged
    and skipped, so one bad generator does not stop the rest.

    Returns one dict per token row created, whether or not planting
    succeeded.
    """
    from uuid import uuid4

    out: list[dict[str, Any]] = []
    for gen_name in _baseline_set():
        try:
            generator = get_generator(gen_name)
        except ValueError:
            log.warning("canary.seed_baseline: unknown generator %r — skipping", gen_name)
            continue
        slug = token_urlsafe(16)
        ctx = _ctx_for(slug)
        try:
            artifact = generator.generate(ctx)
        except Exception as e:  # noqa: BLE001
            # Best-effort contract: a generator that cannot produce its
            # artifact must not abort the remaining baseline tokens.
            log.warning(
                "canary.seed_baseline: generator %r failed — skipping err=%s",
                gen_name, e,
            )
            continue
        artifact.path = default_path_for(gen_name, persona)
        kind = "aws_passive" if gen_name == "aws_creds" else "http"
        # Persist first so plant() has a row to update. The row starts
        # as "planted"; plant() rewrites it to "failed" on error.
        token_uuid = str(uuid4())
        await repo.create_canary_token({
            "uuid": token_uuid,
            "kind": kind,
            "decky_name": decky_name,
            "blob_uuid": None,
            "instrumenter": None,
            "generator": gen_name,
            "placement_path": artifact.path,
            "callback_token": slug,
            "secret_seed": slug,
            "created_by": created_by,
            "state": "planted",  # optimistic — plant() flips to failed on error
        })
        await plant(
            decky_name, artifact,
            token_uuid=token_uuid, repo=repo, publish=True, bus=bus,
            container=container,
        )
        out.append({
            "token_uuid": token_uuid, "generator": gen_name, "kind": kind,
            "callback_token": slug, "placement_path": artifact.path,
        })
    return out
def blob_dir() -> Path:
    """Return the on-disk root for canary blobs.

    Reads ``DECNET_CANARY_BLOB_DIR`` on every call, defaulting to
    ``/var/lib/decnet/canary/blobs``. This function does not create the
    directory; :func:`write_blob` does on first write.
    """
    raw = os.environ.get("DECNET_CANARY_BLOB_DIR", "/var/lib/decnet/canary/blobs")
    return Path(raw)


def _path_for(sha256: str) -> Path:
    """Map a digest to ``<root>/<aa>/<bb>/<digest>``.

    Raises :class:`ValueError` when *sha256* is shorter than 4 chars.
    """
    # Two-level fan-out (``ab/cd/abcd...``) keeps any one directory
    # from accumulating thousands of entries on busy fleets.
    if len(sha256) < 4:
        raise ValueError("sha256 must be at least 4 chars")
    root = blob_dir()
    return root / sha256[:2] / sha256[2:4] / sha256


def write_blob(content: bytes) -> Tuple[str, Path, int]:
    """Persist ``content`` under its sha256 path.

    Idempotent: if the target file already exists nothing is written.
    Returns ``(sha256, path, size_bytes)``.
    """
    sha = hashlib.sha256(content).hexdigest()
    target = _path_for(sha)
    target.parent.mkdir(parents=True, exist_ok=True)
    if not target.exists():
        # Write to a sibling temp file, then rename into place so a
        # reader never sees a half-written blob. The temp name is unique
        # per writer: with a shared ".part" name, two concurrent uploads
        # of the same bytes could truncate each other's file and publish
        # a partial blob.
        tmp = target.with_name(
            f"{target.name}.{os.getpid()}.{os.urandom(4).hex()}.part"
        )
        try:
            tmp.write_bytes(content)
            os.replace(tmp, target)
        except BaseException:
            # Don't leave temp files behind on a failed write.
            tmp.unlink(missing_ok=True)
            raise
    return sha, target, len(content)


def read_blob(sha256: str) -> bytes:
    """Return the stored bytes for *sha256*.

    Raises :class:`FileNotFoundError` when no file exists at the
    digest's path.
    """
    return _path_for(sha256).read_bytes()


def unlink_blob(sha256: str) -> bool:
    """Delete the on-disk bytes for ``sha256``.

    Returns True if a file was removed, False if it was already gone.
    """
    target = _path_for(sha256)
    try:
        target.unlink()
    except FileNotFoundError:
        return False
    return True
+""" +from __future__ import annotations + +import asyncio +import base64 +import binascii +import json +import os +import time +import uuid +from datetime import datetime, timezone +from typing import Any, Optional + +from fastapi import FastAPI, Request, Response + +from decnet.bus import topics +from decnet.bus.base import BaseBus +from decnet.bus.factory import get_bus +from decnet.canary.dns_server import CanaryDNSProtocol, DNSQuery +from decnet.logging import get_logger +from decnet.web.db.factory import get_repository +from decnet.web.db.repository import BaseRepository + +log = get_logger("canary.worker") + +# 1×1 transparent GIF — public-domain canonical bytes. Returning the +# same image every time is fine: the body has no information the +# attacker shouldn't see, and image clients cache it. +_TRANSPARENT_GIF = bytes.fromhex( + "47494638396101000100800100000000ffffff21f90401000001002c00000000010001000002024401003b" +) + + +# Namespace used by fingerprint generators to derive mint UUID. +# Must stay in sync with fingerprint_html._MINT_NAMESPACE. +_MINT_NAMESPACE = uuid.UUID("a3f7c821-9d1e-4b6a-8c2d-1e4f9a7b3c5d") + +# In-memory per-(token_uuid, src_ip) rate limiter for fingerprint persists. +# Maps (token_uuid, src_ip) -> list of monotonic timestamps. +# Not shared across worker restarts or processes — acceptable for MVP. 
_FP_RATE_WINDOW_S = 60
_FP_RATE_LIMIT = 30
_fp_rate_buckets: dict[tuple[str, str], list[float]] = {}
# Monotonic timestamp of the last stale-bucket sweep. Without a sweep the
# dict keeps one entry per (token, src_ip) pair ever seen, and src_ip can be
# attacker-chosen (the HTTP path derives it from X-Forwarded-For), so the
# map would otherwise grow for the lifetime of the process.
_fp_rate_last_sweep = 0.0


def _fp_rate_sweep(cutoff: float) -> None:
    """Evict every bucket whose newest timestamp is at or before *cutoff*.

    Timestamps are appended in monotonic order, so the last element of a
    bucket is its newest. A bucket evicted here would have been filtered
    down to an empty list on its next use anyway, so eviction never changes
    an allow/deny decision.
    """
    stale = [
        key for key, stamps in _fp_rate_buckets.items()
        if not stamps or stamps[-1] <= cutoff
    ]
    for key in stale:
        del _fp_rate_buckets[key]


def _fp_rate_allowed(token_uuid: str, src_ip: str) -> bool:
    """Sliding-window rate limit for fingerprint persists (Layer D).

    Allows at most ``_FP_RATE_LIMIT`` hits per ``_FP_RATE_WINDOW_S`` seconds
    for each ``(token_uuid, src_ip)`` pair.

    Args:
        token_uuid: UUID of the canary token that was hit.
        src_ip: Source address attributed to the hit.

    Returns:
        True if the hit is within budget (and has been counted), False if
        the pair has exhausted its budget for the current window.
    """
    global _fp_rate_last_sweep

    key = (token_uuid, src_ip)
    now = time.monotonic()
    cutoff = now - _FP_RATE_WINDOW_S

    # Amortised cleanup: at most one full pass per window, so the cost per
    # call stays O(1) on average while memory is bounded to the pairs seen
    # in roughly the last two windows.
    if now - _fp_rate_last_sweep >= _FP_RATE_WINDOW_S:
        _fp_rate_sweep(cutoff)
        _fp_rate_last_sweep = now

    bucket = [t for t in _fp_rate_buckets.get(key, ()) if t > cutoff]
    allowed = len(bucket) < _FP_RATE_LIMIT
    if allowed:
        bucket.append(now)
    _fp_rate_buckets[key] = bucket
    return allowed


def _is_valid_fp_shape(fp: dict) -> bool:
    """Layer B — structural sanity check on a decoded fingerprint blob.

    A blob passes when ``mint`` is a non-empty string and at least three of
    the known section keys hold dict values.
    """
    if not isinstance(fp.get("mint"), str) or not fp["mint"]:
        return False
    known_keys = {"nav", "scr", "tz", "cv", "gl", "au", "ft", "rtc"}
    present = sum(1 for k in known_keys if isinstance(fp.get(k), dict))
    return present >= 3


def _http_base() -> str:
    """Return ``DECNET_CANARY_HTTP_BASE`` without trailing slashes."""
    return os.environ.get("DECNET_CANARY_HTTP_BASE", "http://localhost:8088").rstrip("/")


def _dns_zone() -> str:
    """Return ``DECNET_CANARY_DNS_ZONE`` lower-cased, with leading/trailing dots stripped.

    An empty string means no zone is configured.
    """
    return os.environ.get("DECNET_CANARY_DNS_ZONE", "").strip(".").lower()


def _http_port() -> int:
    """Return the HTTP listener port (``DECNET_CANARY_HTTP_PORT``, default 8088).

    Raises:
        ValueError: if the variable is set to a non-integer value.
    """
    return int(os.environ.get("DECNET_CANARY_HTTP_PORT", "8088"))


def _dns_port() -> int:
    """Return the DNS listener port (``DECNET_CANARY_DNS_PORT``, default 5353).

    Raises:
        ValueError: if the variable is set to a non-integer value.
    """
    # Default 5353 (mDNS-ish, non-privileged) — operators pin :53 via
    # NAT or a CAP_NET_BIND_SERVICE-enabled unit.
    return int(os.environ.get("DECNET_CANARY_DNS_PORT", "5353"))


def _dns_bind() -> str:
    """Return the DNS bind address (``DECNET_CANARY_DNS_BIND``, default all interfaces)."""
    return os.environ.get("DECNET_CANARY_DNS_BIND", "0.0.0.0")  # nosec B104 — attacker-facing decoy listener, internet exposure is the design


def _http_bind() -> str:
    """Return the HTTP bind address (``DECNET_CANARY_HTTP_BIND``, default all interfaces)."""
    return os.environ.get("DECNET_CANARY_HTTP_BIND", "0.0.0.0")  # nosec B104 — same rationale


# ---------------------------- HTTP surface --------------------------------


def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
    """Construct the FastAPI app.

    Disables docs / openapi / redoc — operators query the canary
    surface via the *main* DECNET API, never directly. Anyone hitting
    these paths is either misconfigured or scanning for a honeypot.

    Args:
        repo: Repository handed to ``_record_hit`` for token lookup and
            trigger persistence.
        bus: Bus handed to ``_record_hit`` for event publication.

    Returns:
        The configured app exposing ``GET /c/{slug}`` and ``GET /``.
    """
    # NOTE(review): paths other than "/" and "/c/{slug}" fall through to the
    # framework's default not-found handler — confirm that its response body
    # does not fingerprint the stack.
    app = FastAPI(
        title="",  # don't leak "DECNET" in OpenAPI
        docs_url=None, redoc_url=None, openapi_url=None,
    )

    @app.middleware("http")
    async def _stealth_headers(request: Request, call_next):
        response: Response = await call_next(request)
        # Strip the uvicorn / starlette banner; replace with a
        # generic Server line that matches what most CDNs return.
        response.headers["Server"] = "nginx"
        # Don't leak request id / process id headers.
        if "x-process-time" in response.headers:
            del response.headers["x-process-time"]
        return response

    @app.get("/c/{slug}")
    async def callback(slug: str, request: Request) -> Response:
        raw_nonce = request.query_params.get("k")
        fp_meta, parsed_fp = _extract_fingerprint(request.query_params)
        # Fingerprint metadata (``_fp_*`` keys) rides along with the real
        # request headers in the persisted trigger row.
        merged_headers = dict(request.headers)
        if fp_meta:
            merged_headers.update(fp_meta)
        await _record_hit(
            repo, bus,
            slug=slug,
            src_ip=_client_ip(request),
            user_agent=request.headers.get("user-agent"),
            request_path=str(request.url.path),
            dns_qname=None,
            raw_headers=merged_headers,
            parsed_fp=parsed_fp,
            raw_nonce=raw_nonce,
        )
        # Always 200 with a tiny image so the attacker's client sees
        # a "success" — same return regardless of whether the slug is
        # known. Stealth: do NOT distinguish unknown vs known via
        # status code or response body.
        return Response(content=_TRANSPARENT_GIF, media_type="image/gif")

    @app.get("/")
    async def root() -> Response:
        # Bare root returns a generic 404. The decoy posture: pretend
        # to be an empty static-file host that just happens to resolve
        # /c/{slug} when it matches.
        return Response(status_code=404)

    return app


# Per-chunk size cap. Real fingerprints fit in one ~3KB GET; honest
# overflow is handled via chunking (s/i/n + d). Anything larger than
# this on a single request is junk, so we drop it instead of letting an
# attacker inflate a trigger row indefinitely.
+_FP_CHUNK_MAX = 8 * 1024 + + +def _extract_fingerprint(qp: Any) -> tuple[dict[str, Any], Optional[dict]]: + """Decode fingerprint-payload query params into (meta_dict, parsed_fp). + + The obfuscated browser payload may send three shapes on ``GET /c/``: + + * ``?o=1`` — bare-open beacon, fired before fingerprinting starts. + * ``?d=`` — single-shot fingerprint dump. + * ``?s=&i=&n=&d=`` — chunked dump. + + Returns a tuple of: + - ``meta`` — flat dict with ``_fp_*`` keys to merge into raw_headers. + - ``parsed_fp`` — the decoded fingerprint dict for validation, or ``None`` + when there's no ``?d=`` or decoding fails. + """ + out: dict[str, Any] = {} + parsed_fp: Optional[dict] = None + if not qp: + return out, parsed_fp + o = qp.get("o") if hasattr(qp, "get") else None + if o: + out["_fp_open"] = "1" + d = qp.get("d") if hasattr(qp, "get") else None + if not d: + return out, parsed_fp + if len(d) > _FP_CHUNK_MAX: + out["_fp_oversize"] = "1" + return out, parsed_fp + + sid = qp.get("s") + idx = qp.get("i") + total = qp.get("n") + if sid and idx and total: + out["_fp_sid"] = sid + out["_fp_idx"] = idx + out["_fp_total"] = total + out["_fp_chunk"] = d + return out, parsed_fp + + # Single-shot: decode and pass back as parsed_fp; validation runs in + # _record_hit after token lookup so we have the stored nonce at hand. + try: + padded = d + "=" * (-len(d) % 4) + raw = base64.urlsafe_b64decode(padded.encode("ascii")) + parsed = json.loads(raw.decode("utf-8")) + except (binascii.Error, ValueError, UnicodeDecodeError): + out["_fp_decode_error"] = "1" + return out, parsed_fp + if isinstance(parsed, dict): + parsed_fp = parsed + else: + out["_fp_decode_error"] = "1" + return out, parsed_fp + + +def _client_ip(request: Request) -> str: + # Honor X-Forwarded-For if the operator deployed behind a reverse + # proxy. Take the leftmost address in the chain; everything after + # is upstream-proxy noise. 
+ fwd = request.headers.get("x-forwarded-for") + if fwd: + return fwd.split(",", 1)[0].strip() + if request.client: + return request.client.host + return "0.0.0.0" # nosec B104 — sentinel for "unknown remote" + + +# ---------------------------- shared persistence ------------------------- + + +async def _record_hit( + repo: BaseRepository, + bus: BaseBus, + *, + slug: str, + src_ip: str, + user_agent: Optional[str], + request_path: Optional[str], + dns_qname: Optional[str], + raw_headers: Optional[dict], + parsed_fp: Optional[dict] = None, + raw_nonce: Optional[str] = None, +) -> None: + """Resolve slug -> token, persist a trigger, publish on the bus. + + Unknown slugs are silently swallowed: returning the same response + for known and unknown slugs is the stealth posture, and persisting + every random scan would clutter the DB. + + When *parsed_fp* is present (single-shot fingerprint decode succeeded), + it is validated through four layers before being merged into raw_headers: + A) nonce match against CanaryToken.fingerprint_nonce, + B) structural shape check, + C) mint UUID consistency, + D) per-(token, IP) rate limit. + Each failure drops the structured ``_fp`` and sets a ``_fp_*_invalid`` flag. + The trigger row always lands regardless — the GET hit is itself forensic. 
+ """ + token = await repo.get_canary_token_by_slug(slug) + if token is None: + return + + final_headers: dict[str, Any] = dict(raw_headers or {}) + + if parsed_fp is not None: + stored_nonce: Optional[str] = token.get("fingerprint_nonce") + + # Layer A — nonce + if stored_nonce is not None and raw_nonce != stored_nonce: + final_headers["_fp_invalid_nonce"] = "1" + parsed_fp = None + + # Layer B — shape (only when nonce passed or no nonce enforced) + if parsed_fp is not None and not _is_valid_fp_shape(parsed_fp): + final_headers["_fp_invalid_shape"] = "1" + parsed_fp = None + + # Layer C — mint UUID consistency + if parsed_fp is not None: + expected_mint = str(uuid.uuid5(_MINT_NAMESPACE, slug)) + if parsed_fp.get("mint") != expected_mint: + final_headers["_fp_invalid_mint"] = "1" + parsed_fp = None + + # Layer D — rate limit + if parsed_fp is not None and not _fp_rate_allowed(token["uuid"], src_ip): + final_headers["_fp_rate_limited"] = "1" + parsed_fp = None + + if parsed_fp is not None: + final_headers["_fp"] = parsed_fp + + trigger_id = await repo.record_canary_trigger({ + "token_uuid": token["uuid"], + "occurred_at": datetime.now(timezone.utc), + "src_ip": src_ip, + "user_agent": user_agent, + "request_path": request_path, + "dns_qname": dns_qname, + "raw_headers": final_headers, + }) + try: + await bus.publish( + topics.canary(token["uuid"], topics.CANARY_TRIGGERED), + { + "token_id": token["uuid"], + "trigger_id": trigger_id, + "decky_name": token["decky_name"], + "src_ip": src_ip, + "user_agent": user_agent, + "request_path": request_path, + "dns_qname": dns_qname, + }, + ) + except Exception as e: # noqa: BLE001 — best effort + log.warning("canary.triggered publish failed slug=%s err=%s", slug, e) + + # Auto-deregister fingerprint canaries after the first valid fingerprint + # is collected. Slug goes dark; the stealth posture means the attacker + # sees the same 200 + GIF on the next hit — nothing reveals the revocation. 
+ # Guard: only fingerprint tokens have a non-NULL fingerprint_nonce; plain + # http/dns canaries are NOT auto-revoked. + if parsed_fp is not None and token.get("fingerprint_nonce") is not None: + try: + await repo.update_canary_token_state(token["uuid"], "revoked") + await bus.publish( + topics.canary(token["uuid"], topics.CANARY_REVOKED), + {"token_id": token["uuid"], "trigger_id": trigger_id, + "reason": "fingerprint_collected"}, + ) + except Exception as e: # noqa: BLE001 — trigger row already landed; best effort + log.warning("canary.deregister failed token=%s err=%s", token["uuid"], e) + + +# ---------------------------- DNS surface -------------------------------- + + +async def _start_dns_server( + repo: BaseRepository, bus: BaseBus, *, loop: asyncio.AbstractEventLoop, +) -> Optional[asyncio.DatagramTransport]: + zone = _dns_zone() + if not zone: + log.info("canary.dns disabled (DECNET_CANARY_DNS_ZONE unset)") + return None + + async def _hook(slug: str, query: DNSQuery, src_ip: str) -> None: + await _record_hit( + repo, bus, + slug=slug, src_ip=src_ip, user_agent=None, + request_path=None, dns_qname=query.qname, + raw_headers=None, + ) + + transport, _proto = await loop.create_datagram_endpoint( + lambda: CanaryDNSProtocol(zone, _hook), + local_addr=(_dns_bind(), _dns_port()), + ) + log.info("canary.dns listening zone=%s port=%d", zone, _dns_port()) + return transport + + +# ---------------------------- entry point -------------------------------- + + +async def run() -> None: + """Worker entry point — kicked off by ``decnet canary``.""" + import uvicorn + + repo = get_repository() + await repo.initialize() + bus = get_bus() + await bus.connect() + + app = _build_app(repo, bus) + config = uvicorn.Config( + app, + host=_http_bind(), + port=_http_port(), + log_level="warning", + access_log=False, # stealth: no per-request lines + server_header=False, # we set Server: nginx in middleware + ) + server = uvicorn.Server(config) + loop = 
asyncio.get_running_loop() + dns_transport = await _start_dns_server(repo, bus, loop=loop) + try: + await server.serve() + finally: + if dns_transport is not None: + dns_transport.close() + await bus.close() + + +def main() -> None: + """CLI entry point — synchronous wrapper for ``asyncio.run``.""" + asyncio.run(run()) diff --git a/decnet/cli.py b/decnet/cli.py deleted file mode 100644 index 9d3de7d4..00000000 --- a/decnet/cli.py +++ /dev/null @@ -1,461 +0,0 @@ -""" -DECNET CLI — entry point for all commands. - -Usage: - decnet deploy --mode unihost --deckies 5 --randomize-services - decnet status - decnet teardown [--all | --id decky-01] - decnet services -""" - -import random -from typing import Optional - -import typer -from rich.console import Console -from rich.table import Table - -from decnet.archetypes import Archetype, all_archetypes, get_archetype -from decnet.config import ( - DeckyConfig, - DecnetConfig, - random_hostname, -) -from decnet.distros import all_distros, get_distro, random_distro -from decnet.ini_loader import IniConfig, load_ini -from decnet.network import detect_interface, detect_subnet, allocate_ips, get_host_ip -from decnet.services.registry import all_services - -app = typer.Typer( - name="decnet", - help="Deploy a deception network of honeypot deckies on your LAN.", - no_args_is_help=True, -) -console = Console() - -def _all_service_names() -> list[str]: - """Return all registered service names from the live plugin registry.""" - return sorted(all_services().keys()) - - -def _resolve_distros( - distros_explicit: list[str] | None, - randomize_distros: bool, - n: int, - archetype: Archetype | None = None, -) -> list[str]: - """Return a list of n distro slugs based on CLI flags or archetype preference.""" - if distros_explicit: - return [distros_explicit[i % len(distros_explicit)] for i in range(n)] - if randomize_distros: - return [random_distro().slug for _ in range(n)] - if archetype: - pool = archetype.preferred_distros - return 
[pool[i % len(pool)] for i in range(n)] - # Default: cycle through all distros to maximize heterogeneity - slugs = list(all_distros().keys()) - return [slugs[i % len(slugs)] for i in range(n)] - - -def _build_deckies( - n: int, - ips: list[str], - services_explicit: list[str] | None, - randomize_services: bool, - distros_explicit: list[str] | None = None, - randomize_distros: bool = False, - archetype: Archetype | None = None, -) -> list[DeckyConfig]: - deckies = [] - used_combos: set[frozenset] = set() - distro_slugs = _resolve_distros(distros_explicit, randomize_distros, n, archetype) - - for i, ip in enumerate(ips): - name = f"decky-{i + 1:02d}" - distro = get_distro(distro_slugs[i]) - hostname = random_hostname(distro.slug) - - if services_explicit: - svc_list = services_explicit - elif archetype: - svc_list = list(archetype.services) - elif randomize_services: - svc_pool = _all_service_names() - attempts = 0 - while True: - count = random.randint(1, min(3, len(svc_pool))) - chosen = frozenset(random.sample(svc_pool, count)) - attempts += 1 - if chosen not in used_combos or attempts > 20: - break - svc_list = list(chosen) - used_combos.add(chosen) - else: - typer.echo("Error: provide --services, --archetype, or --randomize-services.", err=True) - raise typer.Exit(1) - - deckies.append( - DeckyConfig( - name=name, - ip=ip, - services=svc_list, - distro=distro.slug, - base_image=distro.image, - build_base=distro.build_base, - hostname=hostname, - archetype=archetype.slug if archetype else None, - nmap_os=archetype.nmap_os if archetype else "linux", - ) - ) - return deckies - - -def _build_deckies_from_ini( - ini: IniConfig, - subnet_cidr: str, - gateway: str, - host_ip: str, - randomize: bool, -) -> list[DeckyConfig]: - """Build DeckyConfig list from an IniConfig, auto-allocating missing IPs.""" - from ipaddress import IPv4Address, IPv4Network - - explicit_ips: set[IPv4Address] = { - IPv4Address(s.ip) for s in ini.deckies if s.ip - } - - net = 
IPv4Network(subnet_cidr, strict=False) - reserved = { - net.network_address, - net.broadcast_address, - IPv4Address(gateway), - IPv4Address(host_ip), - } | explicit_ips - - auto_pool = (str(addr) for addr in net.hosts() if addr not in reserved) - - deckies: list[DeckyConfig] = [] - for spec in ini.deckies: - # Resolve archetype (if any) — explicit services/distro override it - arch: Archetype | None = None - if spec.archetype: - try: - arch = get_archetype(spec.archetype) - except ValueError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - # Distro: archetype preferred list → random → global cycle - distro_pool = arch.preferred_distros if arch else list(all_distros().keys()) - distro = get_distro(distro_pool[len(deckies) % len(distro_pool)]) - hostname = random_hostname(distro.slug) - - ip = spec.ip or next(auto_pool, None) - if ip is None: - raise RuntimeError( - f"Not enough free IPs in {subnet_cidr} while assigning IP for '{spec.name}'." - ) - - if spec.services: - known = set(_all_service_names()) - unknown = [s for s in spec.services if s not in known] - if unknown: - console.print( - f"[red]Unknown service(s) in [{spec.name}]: {unknown}. " - f"Available: {_all_service_names()}[/]" - ) - raise typer.Exit(1) - svc_list = spec.services - elif arch: - svc_list = list(arch.services) - elif randomize: - svc_pool = _all_service_names() - count = random.randint(1, min(3, len(svc_pool))) - svc_list = random.sample(svc_pool, count) - else: - console.print( - f"[red]Decky '[{spec.name}]' has no services= in config. 
" - "Add services=, archetype=, or use --randomize-services.[/]" - ) - raise typer.Exit(1) - - # nmap_os priority: explicit INI key > archetype default > "linux" - resolved_nmap_os = spec.nmap_os or (arch.nmap_os if arch else "linux") - deckies.append(DeckyConfig( - name=spec.name, - ip=ip, - services=svc_list, - distro=distro.slug, - base_image=distro.image, - build_base=distro.build_base, - hostname=hostname, - archetype=arch.slug if arch else None, - service_config=spec.service_config, - nmap_os=resolved_nmap_os, - )) - return deckies - - -@app.command() -def deploy( - mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"), - deckies: Optional[int] = typer.Option(None, "--deckies", "-n", help="Number of deckies to deploy (required without --config)", min=1), - interface: Optional[str] = typer.Option(None, "--interface", "-i", help="Host NIC (auto-detected if omitted)"), - subnet: Optional[str] = typer.Option(None, "--subnet", help="LAN subnet CIDR (auto-detected if omitted)"), - ip_start: Optional[str] = typer.Option(None, "--ip-start", help="First decky IP (auto if omitted)"), - services: Optional[str] = typer.Option(None, "--services", help="Comma-separated services, e.g. ssh,smb,rdp"), - randomize_services: bool = typer.Option(False, "--randomize-services", help="Assign random services to each decky"), - distro: Optional[str] = typer.Option(None, "--distro", help="Comma-separated distro slugs, e.g. debian,ubuntu22,rocky9"), - randomize_distros: bool = typer.Option(False, "--randomize-distros", help="Assign a random distro to each decky"), - log_target: Optional[str] = typer.Option(None, "--log-target", help="Forward logs to ip:port (e.g. 192.168.1.5:5140)"), - log_file: Optional[str] = typer.Option(None, "--log-file", help="Write RFC 5424 syslog to this path inside containers (e.g. /var/log/decnet/decnet.log)"), - archetype_name: Optional[str] = typer.Option(None, "--archetype", "-a", help="Machine archetype slug (e.g. 
linux-server, windows-workstation)"), - dry_run: bool = typer.Option(False, "--dry-run", help="Generate compose file without starting containers"), - no_cache: bool = typer.Option(False, "--no-cache", help="Force rebuild all images, ignoring Docker layer cache"), - ipvlan: bool = typer.Option(False, "--ipvlan", help="Use IPvlan L2 instead of MACVLAN (required on WiFi interfaces)"), - config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Path to INI config file"), -) -> None: - """Deploy deckies to the LAN.""" - if mode not in ("unihost", "swarm"): - console.print("[red]--mode must be 'unihost' or 'swarm'[/]") - raise typer.Exit(1) - - # ------------------------------------------------------------------ # - # Config-file path # - # ------------------------------------------------------------------ # - if config_file: - try: - ini = load_ini(config_file) - except FileNotFoundError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - # CLI flags override INI values when explicitly provided - iface = interface or ini.interface or detect_interface() - subnet_cidr = subnet or ini.subnet - effective_gateway = ini.gateway - if subnet_cidr is None: - subnet_cidr, effective_gateway = detect_subnet(iface) - elif effective_gateway is None: - _, effective_gateway = detect_subnet(iface) - - host_ip = get_host_ip(iface) - console.print(f"[dim]Config:[/] {config_file} [dim]Interface:[/] {iface} " - f"[dim]Subnet:[/] {subnet_cidr} [dim]Gateway:[/] {effective_gateway} " - f"[dim]Host IP:[/] {host_ip}") - - # Register bring-your-own services from INI before validation - if ini.custom_services: - from decnet.custom_service import CustomService - from decnet.services.registry import register_custom_service - for cs in ini.custom_services: - register_custom_service( - CustomService( - name=cs.name, - image=cs.image, - exec_cmd=cs.exec_cmd, - ports=cs.ports, - ) - ) - - effective_log_target = log_target or ini.log_target - effective_log_file = log_file - 
decky_configs = _build_deckies_from_ini( - ini, subnet_cidr, effective_gateway, host_ip, randomize_services - ) - # ------------------------------------------------------------------ # - # Classic CLI path # - # ------------------------------------------------------------------ # - else: - if deckies is None: - console.print("[red]--deckies is required when --config is not used.[/]") - raise typer.Exit(1) - - services_list = [s.strip() for s in services.split(",")] if services else None - if services_list: - known = set(_all_service_names()) - unknown = [s for s in services_list if s not in known] - if unknown: - console.print(f"[red]Unknown service(s): {unknown}. Available: {_all_service_names()}[/]") - raise typer.Exit(1) - - # Resolve archetype if provided - arch: Archetype | None = None - if archetype_name: - try: - arch = get_archetype(archetype_name) - except ValueError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - if not services_list and not randomize_services and not arch: - console.print("[red]Specify --services, --archetype, or --randomize-services.[/]") - raise typer.Exit(1) - - iface = interface or detect_interface() - if subnet is None: - subnet_cidr, effective_gateway = detect_subnet(iface) - else: - subnet_cidr = subnet - _, effective_gateway = detect_subnet(iface) - - host_ip = get_host_ip(iface) - console.print(f"[dim]Interface:[/] {iface} [dim]Subnet:[/] {subnet_cidr} " - f"[dim]Gateway:[/] {effective_gateway} [dim]Host IP:[/] {host_ip}") - - distros_list = [d.strip() for d in distro.split(",")] if distro else None - if distros_list: - try: - for slug in distros_list: - get_distro(slug) - except ValueError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - ips = allocate_ips(subnet_cidr, effective_gateway, host_ip, deckies, ip_start) - decky_configs = _build_deckies( - deckies, ips, services_list, randomize_services, - distros_explicit=distros_list, randomize_distros=randomize_distros, - archetype=arch, - ) - 
effective_log_target = log_target - effective_log_file = log_file - - config = DecnetConfig( - mode=mode, - interface=iface, - subnet=subnet_cidr, - gateway=effective_gateway, - deckies=decky_configs, - log_target=effective_log_target, - log_file=effective_log_file, - ipvlan=ipvlan, - ) - - if effective_log_target and not dry_run: - from decnet.logging.forwarder import probe_log_target - if not probe_log_target(effective_log_target): - console.print(f"[yellow]Warning: log target {effective_log_target} is unreachable. " - "Logs will be lost if it stays down.[/]") - - from decnet.deployer import deploy as _deploy - _deploy(config, dry_run=dry_run, no_cache=no_cache) - - -@app.command() -def status() -> None: - """Show running deckies and their status.""" - from decnet.deployer import status as _status - _status() - - -@app.command() -def teardown( - all_: bool = typer.Option(False, "--all", help="Tear down all deckies and remove network"), - id_: Optional[str] = typer.Option(None, "--id", help="Tear down a specific decky by name"), -) -> None: - """Stop and remove deckies.""" - if not all_ and not id_: - console.print("[red]Specify --all or --id .[/]") - raise typer.Exit(1) - - from decnet.deployer import teardown as _teardown - _teardown(decky_id=id_) - - -@app.command(name="services") -def list_services() -> None: - """List all registered honeypot service plugins.""" - svcs = all_services() - table = Table(title="Available Services", show_lines=True) - table.add_column("Name", style="bold cyan") - table.add_column("Ports") - table.add_column("Image") - for name, svc in sorted(svcs.items()): - table.add_row(name, ", ".join(str(p) for p in svc.ports), svc.default_image) - console.print(table) - - -@app.command(name="distros") -def list_distros() -> None: - """List all available OS distro profiles for deckies.""" - table = Table(title="Available Distro Profiles", show_lines=True) - table.add_column("Slug", style="bold cyan") - table.add_column("Display Name") - 
table.add_column("Docker Image", style="dim") - for slug, profile in sorted(all_distros().items()): - table.add_row(slug, profile.display_name, profile.image) - console.print(table) - - -@app.command(name="correlate") -def correlate( - log_file: Optional[str] = typer.Option(None, "--log-file", "-f", help="Path to DECNET syslog file to analyse"), - min_deckies: int = typer.Option(2, "--min-deckies", "-m", help="Minimum number of distinct deckies an IP must touch to be reported"), - output: str = typer.Option("table", "--output", "-o", help="Output format: table | json | syslog"), - emit_syslog: bool = typer.Option(False, "--emit-syslog", help="Also print traversal events as RFC 5424 lines (for SIEM piping)"), -) -> None: - """Analyse logs for cross-decky traversals and print the attacker movement graph.""" - import sys - import json as _json - from pathlib import Path - from decnet.correlation.engine import CorrelationEngine - - engine = CorrelationEngine() - - if log_file: - path = Path(log_file) - if not path.exists(): - console.print(f"[red]Log file not found: {log_file}[/]") - raise typer.Exit(1) - engine.ingest_file(path) - elif not sys.stdin.isatty(): - for line in sys.stdin: - engine.ingest(line) - else: - console.print("[red]Provide --log-file or pipe log data via stdin.[/]") - raise typer.Exit(1) - - traversals = engine.traversals(min_deckies) - - if output == "json": - console.print_json(_json.dumps(engine.report_json(min_deckies), indent=2)) - elif output == "syslog": - for line in engine.traversal_syslog_lines(min_deckies): - typer.echo(line) - else: - if not traversals: - console.print( - f"[yellow]No traversals detected " - f"(min_deckies={min_deckies}, events_indexed={engine.events_indexed}).[/]" - ) - else: - console.print(engine.report_table(min_deckies)) - console.print( - f"[dim]Parsed {engine.lines_parsed} lines · " - f"indexed {engine.events_indexed} events · " - f"{len(engine.all_attackers())} unique IPs · " - f"[bold]{len(traversals)}[/] 
traversal(s)[/]" - ) - - if emit_syslog: - for line in engine.traversal_syslog_lines(min_deckies): - typer.echo(line) - - -@app.command(name="archetypes") -def list_archetypes() -> None: - """List all machine archetype profiles.""" - table = Table(title="Machine Archetypes", show_lines=True) - table.add_column("Slug", style="bold cyan") - table.add_column("Display Name") - table.add_column("Default Services", style="green") - table.add_column("Description", style="dim") - for slug, arch in sorted(all_archetypes().items()): - table.add_row( - slug, - arch.display_name, - ", ".join(arch.services), - arch.description, - ) - console.print(table) diff --git a/decnet/cli/__init__.py b/decnet/cli/__init__.py new file mode 100644 index 00000000..51e0ba41 --- /dev/null +++ b/decnet/cli/__init__.py @@ -0,0 +1,92 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +DECNET CLI — entry point for all commands. + +Usage: + decnet deploy --mode unihost --deckies 5 --randomize-services + decnet status + decnet teardown [--all | --id decky-01] + decnet services + +Layout: each command module exports ``register(app)`` which attaches its +commands to the passed Typer app. ``__init__.py`` builds the root app, +calls every module's ``register`` in order, then runs the master-only +gate. The gate must fire LAST so it sees the fully-populated dispatch +table before filtering. +""" + +from __future__ import annotations + +import typer + +from . 
import ( + agent, + api, + bus, + canary, + db, + deploy, + forwarder, + geoip, + init, + inventory, + lifecycle, + listener, + orchestrator, + profiler, + realism, + reconciler, + sniffer, + swarm, + swarmctl, + topology, + ttp, + updater, + web, + webhook, + workers, +) +from .gating import _gate_commands_by_mode +from .utils import console as console, log as log + +app = typer.Typer( + name="decnet", + help="Deploy a deception network of honeypot deckies on your LAN.", + no_args_is_help=True, +) + +# Order matches the old flat layout so `decnet --help` reads the same. +for _mod in ( + api, swarmctl, agent, updater, listener, forwarder, + swarm, + deploy, lifecycle, workers, inventory, + web, profiler, orchestrator, realism, reconciler, sniffer, db, + topology, bus, geoip, init, webhook, canary, ttp, +): + _mod.register(app) + +_gate_commands_by_mode(app) + +# Backwards-compat re-exports. Tests and third-party tooling import these +# directly from ``decnet.cli``; the refactor must keep them resolvable. +from .db import _db_reset_mysql_async # noqa: E402,F401 +from .gating import ( # noqa: E402,F401 + MASTER_ONLY_COMMANDS, + MASTER_ONLY_GROUPS, + _agent_mode_active, + _require_master_mode, +) +from .utils import ( # noqa: E402,F401 + _daemonize, + _http_request, + _is_running, + _kill_all_services, + _pid_dir, + _service_registry, + _spawn_detached, + _swarmctl_base_url, +) + + +if __name__ == "__main__": # pragma: no cover + app() diff --git a/decnet/cli/agent.py b/decnet/cli/agent.py new file mode 100644 index 00000000..53aae048 --- /dev/null +++ b/decnet/cli/agent.py @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations + +import os +import pathlib as _pathlib +import sys as _sys +from typing import Optional + +import typer + +from . 
import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def agent( + port: int = typer.Option(8765, "--port", help="Port for the worker agent"), + host: str = typer.Option("0.0.0.0", "--host", help="Bind address for the worker agent"), # nosec B104 + agent_dir: Optional[str] = typer.Option(None, "--agent-dir", help="Worker cert bundle dir (default: ~/.decnet/agent, expanded under the running user's HOME — set this when running as sudo/root)"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + no_forwarder: bool = typer.Option(False, "--no-forwarder", help="Do not auto-spawn the log forwarder alongside the agent"), + ) -> None: + """Run the DECNET SWARM worker agent (requires a cert bundle in ~/.decnet/agent/). + + By default, `decnet agent` auto-spawns `decnet forwarder` as a fully- + detached sibling process so worker logs start flowing to the master + without a second manual invocation. The forwarder survives agent + restarts and crashes — if it dies on its own, restart it manually + with `decnet forwarder --daemon …`. Pass --no-forwarder to skip. 
+ """ + from decnet.agent import server as _agent_server + from decnet.env import DECNET_SWARM_MASTER_HOST, DECNET_AGENT_LOG_FILE + from decnet.swarm import pki as _pki + + resolved_dir = _pathlib.Path(agent_dir) if agent_dir else _pki.DEFAULT_AGENT_DIR + + if daemon: + log.info("agent daemonizing host=%s port=%d", host, port) + _utils._daemonize() + + if not no_forwarder and DECNET_SWARM_MASTER_HOST: + fw_argv = [ + _sys.executable, "-m", "decnet", "forwarder", + "--master-host", DECNET_SWARM_MASTER_HOST, + "--master-port", str(int(os.environ.get("DECNET_SWARM_SYSLOG_PORT", "6514"))), + "--agent-dir", str(resolved_dir), + "--log-file", str(DECNET_AGENT_LOG_FILE), + "--daemon", + ] + try: + pid = _utils._spawn_detached(fw_argv, _utils._pid_dir() / "forwarder.pid") + log.info("agent auto-spawned forwarder pid=%d master=%s", pid, DECNET_SWARM_MASTER_HOST) + console.print(f"[dim]Auto-spawned forwarder (pid {pid}) → {DECNET_SWARM_MASTER_HOST}.[/]") + except Exception as e: # noqa: BLE001 + log.warning("agent could not auto-spawn forwarder: %s", e) + console.print(f"[yellow]forwarder auto-spawn skipped: {e}[/]") + elif not no_forwarder: + log.info("agent skipping forwarder auto-spawn (DECNET_SWARM_MASTER_HOST unset)") + + log.info("agent command invoked host=%s port=%d dir=%s", host, port, resolved_dir) + console.print(f"[green]Starting DECNET worker agent on {host}:{port} (mTLS)...[/]") + rc = _agent_server.run(host, port, agent_dir=resolved_dir) + if rc != 0: + raise typer.Exit(rc) diff --git a/decnet/cli/api.py b/decnet/cli/api.py new file mode 100644 index 00000000..1a6c4a3b --- /dev/null +++ b/decnet/cli/api.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations + +import os +import signal +import subprocess # nosec B404 +import sys + +import typer + +from decnet.env import DECNET_API_HOST, DECNET_API_PORT, DECNET_INGEST_LOG_FILE + +from . 
import utils as _utils +from .gating import _require_master_mode +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def api( + port: int = typer.Option(DECNET_API_PORT, "--port", help="Port for the backend API"), + host: str = typer.Option(DECNET_API_HOST, "--host", help="Host IP for the backend API"), + log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Path to the DECNET log file to monitor"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + workers: int = typer.Option(1, "--workers", "-w", min=1, help="Number of uvicorn worker processes"), + ) -> None: + """Run the DECNET API and Web Dashboard in standalone mode.""" + _require_master_mode("api") + if daemon: + log.info("API daemonizing host=%s port=%d workers=%d", host, port, workers) + _utils._daemonize() + + log.info("API command invoked host=%s port=%d workers=%d", host, port, workers) + console.print(f"[green]Starting DECNET API on {host}:{port} (workers={workers})...[/]") + _env: dict[str, str] = os.environ.copy() + _env["DECNET_INGEST_LOG_FILE"] = str(log_file) + _cmd = [sys.executable, "-m", "uvicorn", "decnet.web.api:app", + "--host", host, "--port", str(port), "--workers", str(workers)] + try: + proc = subprocess.Popen(_cmd, env=_env, start_new_session=True) # nosec B603 B404 + try: + proc.wait() + except KeyboardInterrupt: + try: + os.killpg(proc.pid, signal.SIGTERM) + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + proc.wait() + except ProcessLookupError: + pass + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start API. 
Ensure 'uvicorn' is installed in the current environment.[/]") diff --git a/decnet/cli/bus.py b/decnet/cli/bus.py new file mode 100644 index 00000000..74b79ba5 --- /dev/null +++ b/decnet/cli/bus.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations + +import typer + +from . import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command(name="bus") + def bus_cmd( + socket_path: str = typer.Option( + None, "--socket", "-s", + help="UNIX socket path (defaults to DECNET_BUS_SOCKET env var, " + "then /run/decnet/bus.sock, then ~/.decnet/bus.sock).", + ), + group: str = typer.Option( + "decnet", "--group", "-g", + help="POSIX group to chown the socket to (falls back to process " + "group if the named group does not exist).", + ), + heartbeat: int = typer.Option( + 10, "--heartbeat", "-H", + help="Seconds between system.bus.health heartbeat events.", + ), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process."), + ) -> None: + """Run the DECNET ServiceBus worker (host-local UNIX-socket pub/sub).""" + import asyncio + from decnet.bus.factory import _default_socket_path + from decnet.bus.worker import bus_worker + + resolved = socket_path or _default_socket_path() + + if daemon: + log.info("bus daemonizing socket=%s", resolved) + _utils._daemonize() + + log.info("bus starting socket=%s group=%s heartbeat=%ds", resolved, group, heartbeat) + console.print(f"[bold cyan]Bus starting[/] (socket: {resolved}, heartbeat: {heartbeat}s)") + + try: + asyncio.run(bus_worker(resolved, group=group, heartbeat_interval=heartbeat)) + except KeyboardInterrupt: + console.print("\n[yellow]Bus stopped.[/]") diff --git a/decnet/cli/canary.py b/decnet/cli/canary.py new file mode 100644 index 00000000..fa9873bd --- /dev/null +++ b/decnet/cli/canary.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""``decnet canary`` — HTTP + DNS 
callback receiver for canary tokens. + +Two entry points share this module: + +* ``decnet canary`` — runs the worker process. Mirrors the shape of + :mod:`decnet.cli.webhook`. Invoked by the ``decnet-canary.service`` + systemd unit so its argv must stay stable. +* ``decnet canary-install-toolchain`` — provisions the Node side of + the fingerprint-canary obfuscator. Idempotent; safe to call from + the API service unit's ``ExecStartPre``. + +Not master-only — any host that hosts deckies can run its own +canary worker (the bus events stay local; the webhook worker on +each host fans them out to SIEMs independently per the design +in ``development/let-s-move-to-the-enumerated-pike.md``). +""" +from __future__ import annotations + +import shutil +import subprocess # nosec B404 — npm exec is the whole point of the toolchain installer +from pathlib import Path + +import typer + +from . import utils as _utils +from .utils import console, log + +_TOOLCHAIN_TIMEOUT_S = 180 + + +def register(app: typer.Typer) -> None: + @app.command(name="canary") + def canary_cmd( + daemon: bool = typer.Option( + False, "--daemon", "-d", help="Detach to background as a daemon process", + ), + ) -> None: + """Run the canary HTTP + DNS callback receiver.""" + import asyncio + + from decnet.canary.worker import run + + if daemon: + log.info("canary daemonizing") + _utils._daemonize() + + log.info("canary starting") + console.print("[bold cyan]Canary callback receiver starting[/]") + + try: + asyncio.run(run()) + except KeyboardInterrupt: + console.print("\n[yellow]Canary worker stopped.[/]") + + @app.command(name="canary-install-toolchain") + def canary_install_toolchain( + npm_bin: str = typer.Option( + "npm", "--npm-bin", help="Path to the npm executable. Defaults to PATH lookup.", + ), + ) -> None: + """Install the Node-side toolchain used by fingerprint canaries. 
+ + Runs ``npm install --omit=dev`` under the installed ``decnet/canary/`` + directory so the obfuscator's helper script can ``require()`` + ``javascript-obfuscator`` at mint time. Requires Node >= 18. + + Idempotent: re-running on an already-installed tree is fast + (npm short-circuits when ``node_modules/`` is up-to-date). + """ + import decnet.canary as _canary_pkg + canary_dir = Path(_canary_pkg.__file__).resolve().parent + if not (canary_dir / "package.json").is_file(): + console.print( + f"[red]canary package.json not found under {canary_dir}; " + "wheel may be missing the JS toolchain payload.[/]" + ) + raise typer.Exit(code=2) + if shutil.which(npm_bin) is None: + console.print( + f"[red]npm executable {npm_bin!r} not found on PATH. " + "Install Node >= 18 and re-run.[/]" + ) + raise typer.Exit(code=2) + console.print( + f"[cyan]installing canary toolchain[/] in {canary_dir}", + ) + try: + proc = subprocess.run( # nosec B603 — argv-form, no shell, fixed cwd, npm_bin checked above + [npm_bin, "install", "--omit=dev", "--no-fund", "--no-audit"], + cwd=str(canary_dir), + capture_output=True, text=True, + timeout=_TOOLCHAIN_TIMEOUT_S, check=False, + ) + except subprocess.TimeoutExpired: + console.print("[red]npm install timed out after 3 minutes[/]") + raise typer.Exit(code=3) from None + if proc.returncode != 0: + console.print( + f"[red]npm install failed rc={proc.returncode}[/]\n" + f"{proc.stderr.strip()}" + ) + raise typer.Exit(code=proc.returncode) + console.print("[green]canary toolchain ready[/]") diff --git a/decnet/cli/db.py b/decnet/cli/db.py new file mode 100644 index 00000000..ae4b325e --- /dev/null +++ b/decnet/cli/db.py @@ -0,0 +1,142 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations + +from typing import Optional + +import typer +from rich.table import Table + +from .utils import console, log + + +def _decnet_tables() -> tuple[str, ...]: + """Every DECNET-managed table, ordered child-first for DROP safety. 
+ + Source is ``SQLModel.metadata.sorted_tables`` — the same registry that + drives ``create_all`` — so adding a new model automatically enrolls + its table in ``db-reset`` with no manual step. (Previous hardcoded + list drifted multiple times; ``webhook_subscriptions`` / + ``session_profile`` / ``smtp_targets`` all got missed.) + + ``sorted_tables`` returns parent-first (topological order that makes + ``CREATE`` safe). For ``DROP`` we need the reverse: children first, + so FK constraints drop before their parents. ``SET FOREIGN_KEY_CHECKS + = 0`` below makes this order-insensitive for MySQL, but the reverse + order keeps the code honest for any backend that doesn't support + disabling the FK check. + """ + from sqlmodel import SQLModel + # Importing the models package registers every table on SQLModel.metadata. + import decnet.web.db.models # noqa: F401 + + return tuple( + t.name for t in reversed(SQLModel.metadata.sorted_tables) + ) + + +async def _db_reset_mysql_async(dsn: str, mode: str, confirm: bool) -> None: + """Inspect + (optionally) wipe a MySQL database. 
Pulled out of the CLI + wrapper so tests can drive it without spawning a Typer runner.""" + from urllib.parse import urlparse + from sqlalchemy import text + from sqlalchemy.ext.asyncio import create_async_engine + + db_name = urlparse(dsn).path.lstrip("/") or "(default)" + engine = create_async_engine(dsn) + tables = _decnet_tables() + try: + rows: dict[str, int] = {} + async with engine.connect() as conn: + for tbl in tables: + try: + result = await conn.execute(text(f"SELECT COUNT(*) FROM `{tbl}`")) # nosec B608 + rows[tbl] = result.scalar() or 0 + except Exception: # noqa: BLE001 — ProgrammingError for missing table varies by driver + rows[tbl] = -1 + + summary = Table(title=f"DECNET MySQL reset — database `{db_name}` (mode={mode})") + summary.add_column("Table", style="cyan") + summary.add_column("Rows", justify="right") + for tbl, count in rows.items(): + summary.add_row(tbl, "[dim]missing[/]" if count < 0 else f"{count:,}") + console.print(summary) + + if not confirm: + console.print( + "[yellow]Dry-run only. Re-run with [bold]--i-know-what-im-doing[/] " + "to actually execute.[/]" + ) + return + + async with engine.begin() as conn: + await conn.execute(text("SET FOREIGN_KEY_CHECKS = 0")) + for tbl in tables: + if rows.get(tbl, -1) < 0: + continue + if mode == "truncate": + await conn.execute(text(f"TRUNCATE TABLE `{tbl}`")) + console.print(f"[green]✓ TRUNCATE {tbl}[/]") + else: + await conn.execute(text(f"DROP TABLE `{tbl}`")) + console.print(f"[green]✓ DROP TABLE {tbl}[/]") + await conn.execute(text("SET FOREIGN_KEY_CHECKS = 1")) + + console.print(f"[bold green]Done. Database `{db_name}` reset ({mode}).[/]") + finally: + await engine.dispose() + + +def register(app: typer.Typer) -> None: + @app.command(name="db-reset") + def db_reset( + i_know: bool = typer.Option( + False, + "--i-know-what-im-doing", + help="Required to actually execute. 
Without it, the command runs in dry-run mode.", + ), + mode: str = typer.Option( + "truncate", + "--mode", + help="truncate (wipe rows, keep schema) | drop-tables (DROP TABLE for each DECNET table)", + ), + url: Optional[str] = typer.Option( + None, + "--url", + help="Override DECNET_DB_URL for this invocation (e.g. when cleanup needs admin creds).", + ), + ) -> None: + """Wipe the MySQL database used by the DECNET dashboard. + + Destructive. Runs dry by default — pass --i-know-what-im-doing to commit. + Only supported against MySQL; refuses to operate on SQLite. + """ + import asyncio + import os + + if mode not in ("truncate", "drop-tables"): + console.print(f"[red]Invalid --mode '{mode}'. Expected: truncate | drop-tables.[/]") + raise typer.Exit(2) + + db_type = os.environ.get("DECNET_DB_TYPE", "sqlite").lower() + if db_type != "mysql": + console.print( + f"[red]db-reset is MySQL-only (DECNET_DB_TYPE='{db_type}'). " + f"For SQLite, just delete the decnet.db file.[/]" + ) + raise typer.Exit(2) + + dsn = url or os.environ.get("DECNET_DB_URL") + if not dsn: + from decnet.web.db.mysql.database import build_mysql_url + try: + dsn = build_mysql_url() + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(2) from e + + log.info("db-reset invoked mode=%s confirm=%s", mode, i_know) + try: + asyncio.run(_db_reset_mysql_async(dsn, mode=mode, confirm=i_know)) + except Exception as e: # noqa: BLE001 + console.print(f"[red]db-reset failed: {e}[/]") + raise typer.Exit(1) from e diff --git a/decnet/cli/deploy.py b/decnet/cli/deploy.py new file mode 100644 index 00000000..821ba5f2 --- /dev/null +++ b/decnet/cli/deploy.py @@ -0,0 +1,308 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations + +from typing import Optional + +import typer +from rich.table import Table + +from decnet.archetypes import Archetype, get_archetype +from decnet.config import DecnetConfig +from decnet.distros import get_distro +from decnet.env import 
DECNET_API_HOST, DECNET_INGEST_LOG_FILE +from decnet.fleet import all_service_names, build_deckies, build_deckies_from_ini +from decnet.ini_loader import load_ini +from decnet.network import detect_interface, detect_subnet, allocate_ips, get_host_ip + +from . import utils as _utils +from .gating import _require_master_mode +from .utils import console, log + + +def _deploy_swarm(config: "DecnetConfig", *, dry_run: bool, no_cache: bool) -> None: + """Shard deckies round-robin across enrolled workers and POST to swarmctl.""" + base = _utils._swarmctl_base_url(None) + resp = _utils._http_request("GET", base + "/swarm/hosts?host_status=enrolled") + enrolled = resp.json() + resp2 = _utils._http_request("GET", base + "/swarm/hosts?host_status=active") + active = resp2.json() + workers = [*enrolled, *active] + if not workers: + console.print("[red]No enrolled workers — run `decnet swarm enroll ...` first.[/]") + raise typer.Exit(1) + + assigned: list = [] + for idx, d in enumerate(config.deckies): + target = workers[idx % len(workers)] + assigned.append(d.model_copy(update={"host_uuid": target["uuid"]})) + config = config.model_copy(update={"deckies": assigned}) + + body = {"config": config.model_dump(mode="json"), "dry_run": dry_run, "no_cache": no_cache} + console.print(f"[cyan]Dispatching {len(config.deckies)} deckies across {len(workers)} worker(s)...[/]") + resp3 = _utils._http_request("POST", base + "/swarm/deploy", json_body=body, timeout=900.0) + results = resp3.json().get("results", []) + + table = Table(title="SWARM deploy results") + for col in ("worker", "host_uuid", "ok", "detail"): + table.add_column(col) + any_failed = False + for r in results: + ok = bool(r.get("ok")) + if not ok: + any_failed = True + detail = r.get("detail") + if isinstance(detail, dict): + detail = detail.get("status") or "ok" + table.add_row( + str(r.get("host_name") or ""), + str(r.get("host_uuid") or ""), + "[green]yes[/]" if ok else "[red]no[/]", + str(detail)[:80], + ) + 
console.print(table) + if any_failed: + raise typer.Exit(1) + + +def register(app: typer.Typer) -> None: + @app.command() + def deploy( + mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"), + deckies: Optional[int] = typer.Option(None, "--deckies", "-n", help="Number of deckies to deploy (required without --config)", min=1), + interface: Optional[str] = typer.Option(None, "--interface", "-i", help="Host NIC (auto-detected if omitted)"), + subnet: Optional[str] = typer.Option(None, "--subnet", help="LAN subnet CIDR (auto-detected if omitted)"), + ip_start: Optional[str] = typer.Option(None, "--ip-start", help="First decky IP (auto if omitted)"), + services: Optional[str] = typer.Option(None, "--services", help="Comma-separated services, e.g. ssh,smb,rdp"), + randomize_services: bool = typer.Option(False, "--randomize-services", help="Assign random services to each decky"), + distro: Optional[str] = typer.Option(None, "--distro", help="Comma-separated distro slugs, e.g. debian,ubuntu22,rocky9"), + randomize_distros: bool = typer.Option(False, "--randomize-distros", help="Assign a random distro to each decky"), + log_file: Optional[str] = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Host path for the collector to write RFC 5424 logs (e.g. /var/log/decnet/decnet.log)"), + archetype_name: Optional[str] = typer.Option(None, "--archetype", "-a", help="Machine archetype slug (e.g. 
linux-server, windows-workstation)"), + mutate_interval: Optional[int] = typer.Option(30, "--mutate-interval", help="Automatically rotate services every N minutes"), + dry_run: bool = typer.Option(False, "--dry-run", help="Generate compose file without starting containers"), + no_cache: bool = typer.Option(False, "--no-cache", help="Force rebuild all images, ignoring Docker layer cache"), + parallel: bool = typer.Option(False, "--parallel", help="Build all images concurrently (enables BuildKit, separates build from up)"), + ipvlan: bool = typer.Option(False, "--ipvlan", help="Use IPvlan L2 instead of MACVLAN (required on WiFi interfaces)"), + config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Path to INI config file"), + api: bool = typer.Option(False, "--api", help="Start the FastAPI backend to ingest and serve logs"), + api_port: int = typer.Option(8000, "--api-port", help="Port for the backend API"), + daemon: bool = typer.Option(False, "--daemon", help="Detach to background as a daemon process"), + ) -> None: + """Deploy deckies to the LAN.""" + import os + import subprocess # nosec B404 + import sys + from pathlib import Path as _Path + + _require_master_mode("deploy") + if daemon: + log.info("deploy daemonizing mode=%s deckies=%s", mode, deckies) + _utils._daemonize() + + log.info("deploy command invoked mode=%s deckies=%s dry_run=%s", mode, deckies, dry_run) + if mode not in ("unihost", "swarm"): + console.print("[red]--mode must be 'unihost' or 'swarm'[/]") + raise typer.Exit(1) + + if config_file: + try: + ini = load_ini(config_file) + except FileNotFoundError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + iface = interface or ini.interface or detect_interface() + subnet_cidr = subnet or ini.subnet + effective_gateway = ini.gateway + if subnet_cidr is None: + subnet_cidr, effective_gateway = detect_subnet(iface) + elif effective_gateway is None: + _, effective_gateway = detect_subnet(iface) + + host_ip = 
get_host_ip(iface) + console.print(f"[dim]Config:[/] {config_file} [dim]Interface:[/] {iface} " + f"[dim]Subnet:[/] {subnet_cidr} [dim]Gateway:[/] {effective_gateway} " + f"[dim]Host IP:[/] {host_ip}") + + if ini.custom_services: + from decnet.custom_service import CustomService + from decnet.services.registry import register_custom_service + for cs in ini.custom_services: + register_custom_service( + CustomService( + name=cs.name, + image=cs.image, + exec_cmd=cs.exec_cmd, + ports=cs.ports, + ) + ) + + effective_log_file = log_file + try: + decky_configs = build_deckies_from_ini( + ini, subnet_cidr, effective_gateway, host_ip, randomize_services, cli_mutate_interval=mutate_interval + ) + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + else: + if deckies is None: + console.print("[red]--deckies is required when --config is not used.[/]") + raise typer.Exit(1) + + services_list = [s.strip() for s in services.split(",")] if services else None + if services_list: + known = set(all_service_names()) + unknown = [s for s in services_list if s not in known] + if unknown: + console.print(f"[red]Unknown service(s): {unknown}. 
Available: {all_service_names()}[/]") + raise typer.Exit(1) + + arch: Archetype | None = None + if archetype_name: + try: + arch = get_archetype(archetype_name) + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + if not services_list and not randomize_services and not arch: + console.print("[red]Specify --services, --archetype, or --randomize-services.[/]") + raise typer.Exit(1) + + iface = interface or detect_interface() + if subnet is None: + subnet_cidr, effective_gateway = detect_subnet(iface) + else: + subnet_cidr = subnet + _, effective_gateway = detect_subnet(iface) + + host_ip = get_host_ip(iface) + console.print(f"[dim]Interface:[/] {iface} [dim]Subnet:[/] {subnet_cidr} " + f"[dim]Gateway:[/] {effective_gateway} [dim]Host IP:[/] {host_ip}") + + distros_list = [d.strip() for d in distro.split(",")] if distro else None + if distros_list: + try: + for slug in distros_list: + get_distro(slug) + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + ips = allocate_ips(subnet_cidr, effective_gateway, host_ip, deckies, ip_start) + decky_configs = build_deckies( + deckies, ips, services_list, randomize_services, + distros_explicit=distros_list, randomize_distros=randomize_distros, + archetype=arch, mutate_interval=mutate_interval, + ) + effective_log_file = log_file + + if api and not effective_log_file: + effective_log_file = os.path.join(os.getcwd(), "decnet.log") + console.print(f"[cyan]API mode enabled: defaulting log-file to {effective_log_file}[/]") + + config = DecnetConfig( + mode=mode, + interface=iface, + subnet=subnet_cidr, + gateway=effective_gateway, + deckies=decky_configs, + log_file=effective_log_file, + ipvlan=ipvlan, + mutate_interval=mutate_interval, + ) + + log.debug("deploy: config built deckies=%d interface=%s subnet=%s", len(config.deckies), config.interface, config.subnet) + + if mode == "swarm": + _deploy_swarm(config, dry_run=dry_run, no_cache=no_cache) + if dry_run: + 
log.info("deploy: swarm dry-run complete, no workers dispatched") + else: + log.info("deploy: swarm deployment complete deckies=%d", len(config.deckies)) + return + + from decnet.engine import deploy as _deploy + _deploy(config, dry_run=dry_run, no_cache=no_cache, parallel=parallel) + if dry_run: + log.info("deploy: dry-run complete, no containers started") + else: + log.info("deploy: deployment complete deckies=%d", len(config.deckies)) + + if mutate_interval is not None and not dry_run: + console.print(f"[green]Starting DECNET Mutator watcher in the background (interval: {mutate_interval}m)...[/]") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "mutate", "--watch"], + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start mutator watcher.[/]") + + if effective_log_file and not dry_run and not api: + _collector_err = _Path(effective_log_file).with_suffix(".collector.log") + console.print(f"[bold cyan]Starting log collector[/] → {effective_log_file}") + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "collect", "--log-file", str(effective_log_file)], + stdin=subprocess.DEVNULL, + stdout=open(_collector_err, "a"), + stderr=subprocess.STDOUT, + start_new_session=True, + ) + + if api and not dry_run: + console.print(f"[green]Starting DECNET API on port {api_port}...[/]") + _env: dict[str, str] = os.environ.copy() + _env["DECNET_INGEST_LOG_FILE"] = str(effective_log_file or "") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "uvicorn", "decnet.web.api:app", "--host", DECNET_API_HOST, "--port", str(api_port)], + env=_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT + ) + console.print(f"[dim]API running at http://{DECNET_API_HOST}:{api_port}[/]") + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start API. 
Ensure 'uvicorn' is installed in the current environment.[/]") + + if effective_log_file and not dry_run: + console.print("[bold cyan]Starting DECNET-PROBER[/] (auto-discovers attackers from log stream)") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "probe", "--daemon", "--log-file", str(effective_log_file)], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start DECNET-PROBER.[/]") + + if effective_log_file and not dry_run: + console.print("[bold cyan]Starting DECNET-PROFILER[/] (builds attacker profiles from log stream)") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "profiler", "--daemon"], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start DECNET-PROFILER.[/]") + + if effective_log_file and not dry_run: + console.print("[bold cyan]Starting DECNET-SNIFFER[/] (passive network capture)") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "sniffer", "--daemon", "--log-file", str(effective_log_file)], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start DECNET-SNIFFER.[/]") diff --git a/decnet/cli/forwarder.py b/decnet/cli/forwarder.py new file mode 100644 index 00000000..1b932917 --- /dev/null +++ b/decnet/cli/forwarder.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +from __future__ import annotations + +import asyncio +import pathlib +import signal +from typing import Optional + +import typer + +from decnet.env import DECNET_INGEST_LOG_FILE + +from . 
import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def forwarder( + master_host: Optional[str] = typer.Option(None, "--master-host", help="Master listener hostname/IP (default: $DECNET_SWARM_MASTER_HOST)"), + master_port: int = typer.Option(6514, "--master-port", help="Master listener TCP port (RFC 5425 default 6514)"), + log_file: Optional[str] = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Local RFC 5424 file to tail and forward"), + agent_dir: Optional[str] = typer.Option(None, "--agent-dir", help="Worker cert bundle dir (default: ~/.decnet/agent)"), + state_db: Optional[str] = typer.Option(None, "--state-db", help="Forwarder offset SQLite path (default: /forwarder.db)"), + poll_interval: float = typer.Option(0.5, "--poll-interval", help="Seconds between log file stat checks"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Run the worker-side syslog-over-TLS forwarder (RFC 5425, mTLS to master:6514).""" + from decnet.env import DECNET_SWARM_MASTER_HOST + from decnet.swarm import pki + from decnet.swarm.log_forwarder import ForwarderConfig, run_forwarder + + resolved_host = master_host or DECNET_SWARM_MASTER_HOST + if not resolved_host: + console.print("[red]--master-host is required (or set DECNET_SWARM_MASTER_HOST).[/]") + raise typer.Exit(2) + + resolved_agent_dir = pathlib.Path(agent_dir) if agent_dir else pki.DEFAULT_AGENT_DIR + if not (resolved_agent_dir / "worker.crt").exists(): + console.print(f"[red]No worker cert bundle at {resolved_agent_dir} — enroll from the master first.[/]") + raise typer.Exit(2) + + if not log_file: + console.print("[red]--log-file is required.[/]") + raise typer.Exit(2) + + cfg = ForwarderConfig( + log_path=pathlib.Path(log_file), + master_host=resolved_host, + master_port=master_port, + agent_dir=resolved_agent_dir, + state_db=pathlib.Path(state_db) if state_db else None, + 
) + + if daemon: + log.info("forwarder daemonizing master=%s:%d log=%s", resolved_host, master_port, log_file) + _utils._daemonize() + + log.info("forwarder command invoked master=%s:%d log=%s", resolved_host, master_port, log_file) + console.print(f"[green]Starting DECNET forwarder → {resolved_host}:{master_port} (mTLS)...[/]") + + async def _main() -> None: + stop = asyncio.Event() + loop = asyncio.get_running_loop() + for sig in (signal.SIGTERM, signal.SIGINT): + try: + loop.add_signal_handler(sig, stop.set) + except (NotImplementedError, RuntimeError): # pragma: no cover + pass + await run_forwarder(cfg, poll_interval=poll_interval, stop_event=stop) + + try: + asyncio.run(_main()) + except KeyboardInterrupt: + pass diff --git a/decnet/cli/gating.py b/decnet/cli/gating.py new file mode 100644 index 00000000..667dc7da --- /dev/null +++ b/decnet/cli/gating.py @@ -0,0 +1,78 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""Role-based CLI gating. + +MAINTAINERS: when you add a new Typer command (or add_typer group) that is +master-only, register its name in MASTER_ONLY_COMMANDS / MASTER_ONLY_GROUPS +below. The gate is the only thing that: + (a) hides the command from `decnet --help` on worker hosts, and + (b) prevents a misconfigured worker from invoking master-side logic. +Forgetting to register a new command is a role-boundary bug. Grep for +MASTER_ONLY when touching command registration. + +Worker-legitimate commands (NOT in these sets): agent, updater, forwarder, +status, collect, probe, sniffer. Agents run deckies locally and should be +able to inspect them + run the per-host microservices (collector streams +container logs, prober characterizes attackers hitting this host, sniffer +captures traffic). Mutator and Profiler stay master-only: the mutator +orchestrates respawns across the swarm; the profiler rebuilds attacker +profiles against the master DB (no per-host DB exists). 
+""" + +from __future__ import annotations + +import os + +import typer + +from .utils import console + +MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({ + "api", "swarmctl", "deploy", "redeploy", "teardown", + "mutate", "listener", "profiler", + "services", "distros", "correlate", "archetypes", "web", + "db-reset", "init", "webhook", "clusterer", "campaign-clusterer", + # `ttp` runs on agents — local SMTP decoys persist .eml files into the + # agent's artifacts tree and the EmailLifter disk-reaches them in-process + # (DEBT-047). `ttp-backfill` stays master-only: it walks the master DB. + "ttp-backfill", +}) +MASTER_ONLY_GROUPS: frozenset[str] = frozenset( + {"swarm", "topology", "geoip", "realism"} +) + + +def _agent_mode_active() -> bool: + """True when the host is configured as an agent AND master commands are + disallowed (the default for agents). Workers overriding this explicitly + set DECNET_DISALLOW_MASTER=false to opt into hybrid use.""" + mode = os.environ.get("DECNET_MODE", "master").lower() + disallow = os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true" + return mode == "agent" and disallow + + +def _require_master_mode(command_name: str) -> None: + """Defence-in-depth: called at the top of every master-only command body. + + The registration-time gate in _gate_commands_by_mode() already hides + these commands from Typer's dispatch table, but this check protects + against direct function imports (e.g. 
from tests or third-party tools) + that would bypass Typer entirely.""" + if _agent_mode_active(): + console.print( + f"[red]`decnet {command_name}` is a master-only command; this host " + f"is configured as an agent (DECNET_MODE=agent).[/]" + ) + raise typer.Exit(1) + + +def _gate_commands_by_mode(_app: typer.Typer) -> None: + if not _agent_mode_active(): + return + _app.registered_commands = [ + c for c in _app.registered_commands + if (c.name or (c.callback.__name__ if c.callback else "")) not in MASTER_ONLY_COMMANDS + ] + _app.registered_groups = [ + g for g in _app.registered_groups + if g.name not in MASTER_ONLY_GROUPS + ] diff --git a/decnet/cli/geoip.py b/decnet/cli/geoip.py new file mode 100644 index 00000000..3675c35e --- /dev/null +++ b/decnet/cli/geoip.py @@ -0,0 +1,60 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""GeoIP CLI — refresh and lookup subcommands (master-only). + +Usage:: + + decnet geoip refresh # re-download RIR files and rebuild the index + decnet geoip lookup 8.8.8.8 # one-shot IP -> country dump +""" +from __future__ import annotations + +import typer + +from .gating import _require_master_mode +from .utils import console, log + +_group = typer.Typer( + name="geoip", + help="GeoIP provider management (master only).", + no_args_is_help=True, +) + + +@_group.command("refresh") +def _refresh() -> None: + """Force re-download of the GeoIP provider data and rebuild the index.""" + _require_master_mode("geoip refresh") + from decnet.geoip import get_lookup + from decnet.geoip.factory import get_provider + + provider = get_provider() + log.info("geoip: forcing refresh via %s provider", provider.name) + console.print(f"[bold cyan]Refreshing {provider.name} GeoIP data…[/]") + try: + lookup = get_lookup(force_refresh=True) + except Exception as exc: # noqa: BLE001 + console.print(f"[red]refresh failed: {exc}[/]") + raise typer.Exit(1) from exc + console.print( + f"[green]OK[/] {provider.name} index rebuilt " + f"({len(lookup)} ranges)." 
+ ) + + +@_group.command("lookup") +def _lookup( + ip: str = typer.Argument(..., help="IP address to resolve."), +) -> None: + """Print the country code for an IP (or 'unknown').""" + _require_master_mode("geoip lookup") + from decnet.geoip import enrich_ip + + cc, source = enrich_ip(ip) + if cc is None: + console.print(f"{ip} [yellow]unknown[/]") + raise typer.Exit(0) + console.print(f"{ip} [green]cc={cc}[/] source={source}") + + +def register(app: typer.Typer) -> None: + app.add_typer(_group, name="geoip") diff --git a/decnet/cli/init.py b/decnet/cli/init.py new file mode 100644 index 00000000..897bdfd4 --- /dev/null +++ b/decnet/cli/init.py @@ -0,0 +1,864 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" +`decnet init` — one-shot master-host bootstrap. + +Idempotent: running it twice is a no-op on already-configured items. +Takes a freshly ``pip install``'d DECNET and turns it into a ready-to- +run master host: creates the ``decnet`` system user/group, installs +the systemd units + polkit rule + tmpfiles.d entry, seeds the +directory layout, drops a placeholder config, and starts the +``decnet.target`` grouping unit. + +Requires root. Uses ``subprocess.run`` (never ``shell=True``) for every +privileged call so the full argv surface is auditable. +""" +from __future__ import annotations + +import grp +import hashlib +import os +import pwd +import shutil +import subprocess # nosec B404 +import sys +from pathlib import Path +from typing import Callable, List, Optional + +import typer +from jinja2 import Environment, FileSystemLoader, StrictUndefined + +import decnet as _decnet_pkg +from .gating import _require_master_mode +from .utils import console, log + + +_CONFIG_PLACEHOLDER = """\ +# /etc/decnet/decnet.ini — DECNET host config. +# +# Every key is OPTIONAL. Absent keys fall through to env-var defaults +# defined in decnet/env.py. 
Real env vars always win over this file +# (precedence: env > INI > default), so systemd EnvironmentFile= and +# one-off `DECNET_FOO=bar decnet ...` invocations always take effect. +# +# Secrets (JWT, admin password, DB password) intentionally DO NOT +# live here. Put them in /opt/decnet/.env.local or the systemd +# EnvironmentFile= — never in a group-readable INI. + +[decnet] +# DECNET-service user/group as configured at `decnet init` time. +# Resolved to a uid/gid on each host at deploy time via pwd.getpwnam, +# so the same user name can have different numeric uids on master vs +# agents without breaking artifact ownership. +api-user = {api_user} +api-group = {api_group} +# mode = master # or "agent" + +# [api] +# host = 127.0.0.1 +# port = 8000 + +# [web] +# host = 127.0.0.1 +# port = 8080 +# admin-user = admin +# cors-origins = http://localhost:8080 # comma-separated + +# [database] +# type = sqlite # or "mysql" +# url = mysql+asyncmy://user@host:3306/decnet # if set, wins over host/port/name/user +# host = localhost +# port = 3306 +# name = decnet +# user = decnet + +# [bus] +# enabled = true +# type = unix # or "fake" +# socket = /run/decnet/bus.sock +# group = decnet + +# [swarm] +# master-host = 10.0.0.1 +# syslog-port = 6514 +# swarmctl-port = 8770 +# swarmctl-host = 127.0.0.1 + +# [logging] +# system-log = /var/log/decnet/decnet.system.log +# ingest-log = /var/log/decnet/decnet.log +# agent-log = /var/log/decnet/agent.log + +# [ingester] +# batch-size = 100 +# batch-max-wait-ms = 250 + +# [tracing] +# enabled = false +# otel-endpoint = http://localhost:4317 + +# [agent] +# Managed by the enroll bundle — do NOT edit by hand on an agent host. +""" + + +def _deploy_root() -> Path: + """Resolve the on-disk ``deploy/`` directory of the installed package. + + Editable install (``pip install -e .``): sibling of the ``decnet`` + package at repo root. Wheel installs aren't supported yet — the + error message tells the operator to use an editable install. 
+ """ + root = Path(_decnet_pkg.__file__).resolve().parent.parent / "deploy" + if not (root / "decnet.target").is_file(): + raise RuntimeError( + f"cannot locate deploy/ directory (looked at {root}); " + "are you on a wheel install that didn't bundle deploy/? " + "use `pip install -e .` from a git checkout" + ) + return root + + +def _sha256(path: Path) -> str: + h = hashlib.sha256() + h.update(path.read_bytes()) + return h.hexdigest() + + +def _run(argv: List[str], *, dry_run: bool) -> None: + if dry_run: + console.print(f" [dim]would run:[/] {' '.join(argv)}") + return + log.info("init: exec %s", argv) + subprocess.run(argv, check=True) # nosec B603 + + +def _step(label: str, action: Callable[[], str]) -> bool: + """Run ``action``, print a checklist line. + + The callable returns the human-readable outcome verb: + ``"ok"`` → ``[ OK ]