diff --git a/.claude/settings.local.json b/.claude/settings.local.json deleted file mode 100644 index 2a14d3a..0000000 --- a/.claude/settings.local.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "permissions": { - "allow": [ - "mcp__plugin_context-mode_context-mode__ctx_batch_execute", - "mcp__plugin_context-mode_context-mode__ctx_search", - "Bash(grep:*)", - "Bash(python -m pytest --tb=short -q)", - "Bash(pip install:*)", - "Bash(pip show:*)", - "Bash(python:*)", - "Bash(DECNET_JWT_SECRET=\"test-secret-xyz-1234!\" DECNET_ADMIN_PASSWORD=\"test-pass-xyz-1234!\" python:*)", - "Bash(ls /home/anti/Tools/DECNET/*.db* /home/anti/Tools/DECNET/test_*.db*)", - "mcp__plugin_context-mode_context-mode__ctx_execute_file", - "Bash(nc)", - "Bash(nmap:*)", - "Bash(ping -c1 -W2 192.168.1.200)", - "Bash(xxd)", - "Bash(curl -s http://192.168.1.200:2375/version)", - "Bash(python3 -m json.tool)", - "Bash(curl -s http://192.168.1.200:9200/)", - "Bash(docker image:*)", - "Read(//home/anti/Tools/cowrie/src/cowrie/data/txtcmds/**)", - "Read(//home/anti/Tools/cowrie/src/cowrie/data/txtcmds/bin/**)", - "mcp__plugin_context-mode_context-mode__ctx_index", - "Bash(ls:*)", - "mcp__plugin_context-mode_context-mode__ctx_execute" - ] - } -} diff --git a/.gitea/workflows/ci.yml b/.gitea/workflows/ci.yml index 4fd723b..5dd8d21 100644 --- a/.gitea/workflows/ci.yml +++ b/.gitea/workflows/ci.yml @@ -28,7 +28,7 @@ jobs: with: python-version: "3.11" - run: pip install bandit - - run: bandit -r decnet/ -ll -x decnet/services/registry.py + - run: bandit -r decnet/ -ll -x decnet/services/registry.py -x decnet/templates/ pip-audit: name: Dependency audit (pip-audit) @@ -40,7 +40,7 @@ jobs: python-version: "3.11" - run: pip install pip-audit - run: pip install -e .[dev] - - run: pip-audit --skip-editable + - run: pip-audit --skip-editable --ignore-vuln CVE-2025-65896 test-standard: name: Test (Standard) @@ -48,7 +48,7 @@ jobs: needs: [lint, bandit, pip-audit] strategy: matrix: - python-version: ["3.11", "3.12"] + 
python-version: ["3.11"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -64,6 +64,19 @@ jobs: strategy: matrix: python-version: ["3.11"] + services: + mysql: + image: mysql:8.0 + env: + MYSQL_ROOT_PASSWORD: root + MYSQL_DATABASE: decnet_test + ports: + - 3307:3306 + options: >- + --health-cmd="mysqladmin ping -h 127.0.0.1" + --health-interval=10s + --health-timeout=5s + --health-retries=5 steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 @@ -71,6 +84,12 @@ jobs: python-version: ${{ matrix.python-version }} - run: pip install -e .[dev] - run: pytest -m live + env: + DECNET_MYSQL_HOST: 127.0.0.1 + DECNET_MYSQL_PORT: 3307 + DECNET_MYSQL_USER: root + DECNET_MYSQL_PASSWORD: root + DECNET_MYSQL_DATABASE: decnet_test test-fuzz: name: Test (Fuzz) @@ -86,6 +105,8 @@ jobs: python-version: ${{ matrix.python-version }} - run: pip install -e .[dev] - run: pytest -m fuzz + env: + SCHEMATHESIS_CONFIG: schemathesis.ci.toml merge-to-testing: name: Merge dev → testing diff --git a/.gitea/workflows/release.yml b/.gitea/workflows/release.yml index 0e8ff4b..cbe6ec6 100644 --- a/.gitea/workflows/release.yml +++ b/.gitea/workflows/release.yml @@ -33,13 +33,13 @@ jobs: id: version run: | # Calculate next version (v0.x) - LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0") + LATEST_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "v0.0.0") NEXT_VER=$(python3 -c " tag = '$LATEST_TAG'.lstrip('v') parts = tag.split('.') major = int(parts[0]) if parts[0] else 0 minor = int(parts[1]) if len(parts) > 1 else 0 - print(f'{major}.{minor + 1}') + print(f'{major}.{minor + 1}.0') ") echo "Next version: $NEXT_VER (calculated from $LATEST_TAG)" @@ -49,7 +49,11 @@ jobs: git add pyproject.toml git commit -m "chore: auto-release v$NEXT_VER [skip ci]" || echo "No changes to commit" - git tag -a "v$NEXT_VER" -m "Auto-release v$NEXT_VER" + CHANGELOG=$(git log ${LATEST_TAG}..HEAD --oneline --no-decorate --no-merges) + git tag -a "v$NEXT_VER" 
-m "Auto-release v$NEXT_VER + +Changes since $LATEST_TAG: +$CHANGELOG" git push origin main --follow-tags echo "version=$NEXT_VER" >> $GITHUB_OUTPUT @@ -111,13 +115,13 @@ jobs: cache-from: type=gha cache-to: type=gha,mode=max + - name: Install Trivy + run: | + curl -sfL https://raw.githubusercontent.com/aquasecurity/trivy/main/contrib/install.sh | sh -s -- -b /usr/local/bin + - name: Scan with Trivy - uses: aquasecurity/trivy-action@master - with: - image-ref: decnet-${{ matrix.service }}:scan - exit-code: "1" - severity: CRITICAL - ignore-unfixed: true + run: | + trivy image --exit-code 1 --severity CRITICAL --ignore-unfixed decnet-${{ matrix.service }}:scan - name: Push image if: success() diff --git a/.gitignore b/.gitignore index c65f265..810322d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ .venv/ logs/ -.claude/ +.claude/* +CLAUDE.md __pycache__/ *.pyc *.pyo @@ -10,7 +11,6 @@ build/ decnet-compose.yml decnet-state.json *.ini -.env decnet.log* *.loggy *.nmap @@ -18,8 +18,13 @@ linterfails.log webmail windows1 *.db +*.db-shm +*.db-wal +decnet.*.log decnet.json -.env +.env* .env.local .coverage .hypothesis/ +profiles/* +tests/test_decnet.db* diff --git a/CLAUDE.md b/CLAUDE.md deleted file mode 100644 index ce87482..0000000 --- a/CLAUDE.md +++ /dev/null @@ -1,58 +0,0 @@ -# CLAUDE.md - -This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. - -## Commands - -```bash -# Install (dev) -pip install -e . 
- -# List registered service plugins -decnet services - -# Dry-run (generates compose, no containers) -decnet deploy --mode unihost --deckies 3 --randomize-services --dry-run - -# Full deploy (requires root for MACVLAN) -sudo decnet deploy --mode unihost --deckies 5 --interface eth0 --randomize-services -sudo decnet deploy --mode unihost --deckies 3 --services ssh,smb --log-target 192.168.1.5:5140 - -# Status / teardown -decnet status -sudo decnet teardown --all -sudo decnet teardown --id decky-01 -``` - -## Project Overview - -DECNET is a honeypot/deception network framework. It deploys fake machines (called **deckies**) with realistic services (RDP, SMB, SSH, FTP, etc.) to lure and profile attackers. All attacker interactions are aggregated to an isolated logging network (ELK stack / SIEM). - -## Deployment Models - -**UNIHOST** — one real host spins up _n_ deckies via a container orchestrator. Simpler, single-machine deployment. - -**SWARM (MULTIHOST)** — _n_ real hosts each running deckies. Orchestrated via Ansible/sshpass or similar tooling. - -## Core Technology Choices - -- **Containers**: Docker Compose is the starting point but other orchestration frameworks should be evaluated if they serve the project better. `debian:bookworm-slim` is the default base image; mixing in Ubuntu, CentOS, or other distros is encouraged to make the decoy network look heterogeneous. -- **Networking**: Deckies need to appear as real machines on the LAN (own MACs/IPs). MACVLAN and IPVLAN are candidates; the right driver depends on the host environment. WSL has known limitations — bare metal or a VM is preferred for testing. -- **Log pipeline**: Logstash → ELK stack → SIEM (isolated network, not reachable from decoy network) - -## Architecture Constraints - -- The decoy network must be reachable from the outside (attacker-facing). -- The logging/aggregation network must be isolated from the decoy network. 
-- A publicly accessible real server acts as the bridge between the two networks. -- Deckies should differ in exposed services and OS fingerprints to appear as a heterogeneous network. -- **IMPORTANT**: The system now strictly enforces dependency injection for storage. Do not import `SQLiteRepository` directly in new features; instead, use `get_repository()` from the factory or the FastAPI `get_repo` dependency. - -## Development and testing - -- For every new feature, pytests must me made. -- Pytest is the main testing framework in use. -- NEVER pass broken code to the user. - - Broken means: not running, not passing 100% tests, etc. -- After tests pass with 100%, always git commit your changes. -- NEVER add "Co-Authored-By" or any Claude attribution lines to git commit messages. diff --git a/GEMINI.md b/GEMINI.md deleted file mode 100644 index c361696..0000000 --- a/GEMINI.md +++ /dev/null @@ -1,104 +0,0 @@ -# DECNET (Deception Network) Project Context - -DECNET is a high-fidelity honeypot framework designed to deploy heterogeneous fleets of fake machines (called **deckies**) that appear as real hosts on a local network. - -## Project Overview - -- **Core Purpose:** To lure, profile, and log attacker interactions within a controlled, deceptive environment. -- **Key Technology:** Linux-native container networking (MACVLAN/IPvlan) combined with Docker to give each decoy its own MAC address, IP, and realistic TCP/IP stack behavior. -- **Main Components:** - - **Deckies:** Group of containers sharing a network namespace (one base container + multiple service containers). - - **Archetypes:** Pre-defined machine profiles (e.g., `windows-workstation`, `linux-server`) that bundle services and OS fingerprints. - - **Services:** Modular honeypot plugins (SSH, SMB, RDP, etc.) built as `BaseService` subclasses. - - **OS Fingerprinting:** Sysctl-based TCP/IP stack tuning to spoof OS detection (nmap). 
- - **Logging Pipeline:** RFC 5424 syslog forwarding to an isolated SIEM/ELK stack. - -## Technical Stack - -- **Language:** Python 3.11+ -- **CLI Framework:** [Typer](https://typer.tiangolo.com/) -- **Data Validation:** [Pydantic v2](https://docs.pydantic.dev/) -- **Orchestration:** Docker Engine 24+ (via Docker SDK for Python) -- **Networking:** MACVLAN (default) or IPvlan L2 (for WiFi/restricted environments). -- **Testing:** Pytest (100% pass requirement). -- **Formatting/Linting:** Ruff, Bandit (SAST), pip-audit. - -## Architecture - -```text -Host NIC (eth0) - └── MACVLAN Bridge - ├── Decky-01 (192.168.1.10) -> [Base] + [SSH] + [HTTP] - ├── Decky-02 (192.168.1.11) -> [Base] + [SMB] + [RDP] - └── ... -``` - -- **Base Container:** Owns the IP/MAC, sets `sysctls` for OS spoofing, and runs `sleep infinity`. -- **Service Containers:** Use `network_mode: service:` to share the identity and networking of the base container. -- **Isolation:** Decoy traffic is strictly separated from the logging network. - -## Key Commands - -### Development & Maintenance -- **Install (Dev):** - - `rm .venv -rf` - - `python3 -m venv .venv` - - `source .venv/bin/activate` - - `pip install -e .` -- **Run Tests:** `pytest` (Run before any commit) -- **Linting:** `ruff check .` -- **Security Scan:** `bandit -r decnet/` -- **Web Git:** git.resacachile.cl (Gitea) - -### CLI Usage -- **List Services:** `decnet services` -- **List Archetypes:** `decnet archetypes` -- **Dry Run (Compose Gen):** `decnet deploy --deckies 3 --randomize-services --dry-run` -- **Deploy (Full):** `sudo .venv/bin/decnet deploy --interface eth0 --deckies 5 --randomize-services` -- **Status:** `decnet status` -- **Teardown:** `sudo .venv/bin/decnet teardown --all` - -## Development Conventions - -- **Code Style:** - - Strict adherence to Ruff/PEP8. - - **Always use typed variables**. If any non-types variables are found, they must be corrected. - - The correct way is `x: int = 1`, never `x : int = 1`. 
- - If assignment is present, always use a space between the type and the equal sign `x: int = 1`. - - **Never** use lowercase L (l), uppercase o (O) or uppercase i (i) in single-character names. - - **Internal vars are to be declared with an underscore** (_internal_variable_name). - - **Internal to internal vars are to be declared with double underscore** (__internal_variable_name). - - Always use snake_case for code. - - Always use PascalCase for classes and generics. -- **Testing:** New features MUST include a `pytest` case. 100% test pass rate is mandatory before merging. -- **Plugin System:** - - New services go in `decnet/services/.py`. - - Subclass `decnet.services.base.BaseService`. - - The registry uses auto-discovery; no manual registration required. -- **Configuration:** - - Use Pydantic models in `decnet/config.py` for any new settings. - - INI file parsing is handled in `decnet/ini_loader.py`. -- **State Management:** - - Runtime state is persisted in `decnet-state.json`. - - Do not modify this file manually. -- **General Development Guidelines**: - - **Never** commit broken code, or before running `pytest`s or `bandit` at the project level. - - **No matter how small** the changes, they must be committed. - - **If new features are addedd** new tests must be added, too. - - **Never present broken code to the user**. Test, validate, then present. - - **Extensive testing** for every function must be created. - - **Always develop in the `dev` branch, never in `main`.** - - **Test in the `testing` branch.** - - **IMPORTANT**: The system now strictly enforces dependency injection for storage. Do not import `SQLiteRepository` directly in new features; instead, use `get_repository()` from the factory or the FastAPI `get_repo` dependency. - -## Directory Structure - -- `decnet/`: Main source code. - - `services/`: Honeypot service implementations. - - `logging/`: Syslog formatting and forwarding logic. 
- - `correlation/`: (In Progress) Logic for grouping attacker events. -- `templates/`: Dockerfiles and entrypoint scripts for services. -- `tests/`: Pytest suite. -- `pyproject.toml`: Dependency and entry point definitions. -- `CLAUDE.md`: Claude-specific environment guidance. -- `DEVELOPMENT.md`: Roadmap and TODOs. diff --git a/README.md b/README.md index 5e52a67..5395f35 100644 --- a/README.md +++ b/README.md @@ -508,6 +508,10 @@ DECNET_WEB_HOST=0.0.0.0 DECNET_WEB_PORT=8080 DECNET_ADMIN_USER=admin DECNET_ADMIN_PASSWORD=admin + +# Database pool tuning (applies to both SQLite and MySQL) +DECNET_DB_POOL_SIZE=20 # base pool connections (default: 20) +DECNET_DB_MAX_OVERFLOW=40 # extra connections under burst (default: 40) ``` Copy `.env.example` to `.env.local` and modify it to suit your environment. @@ -676,6 +680,112 @@ The test suite covers: Every new feature requires passing tests before merging. +### Stress Testing + +A [Locust](https://locust.io)-based stress test suite lives in `tests/stress/`. It hammers every API endpoint with realistic traffic patterns to find throughput ceilings and latency degradation. 
+ +```bash +# Run via pytest (starts its own server) +pytest -m stress tests/stress/ -v -x -n0 -s + +# Crank it up +STRESS_USERS=2000 STRESS_SPAWN_RATE=200 STRESS_DURATION=120 pytest -m stress tests/stress/ -v -x -n0 -s + +# Standalone Locust web UI against a running server +locust -f tests/stress/locustfile.py --host http://localhost:8000 +``` + +| Env var | Default | Description | +|---|---|---| +| `STRESS_USERS` | `500` | Total simulated users | +| `STRESS_SPAWN_RATE` | `50` | Users spawned per second | +| `STRESS_DURATION` | `60` | Test duration in seconds | +| `STRESS_WORKERS` | CPU count (max 4) | Uvicorn workers for the test server | +| `STRESS_MIN_RPS` | `500` | Minimum RPS to pass baseline test | +| `STRESS_MAX_P99_MS` | `200` | Maximum p99 latency (ms) to pass | +| `STRESS_SPIKE_USERS` | `1000` | Users for thundering herd test | +| `STRESS_SUSTAINED_USERS` | `200` | Users for sustained load test | + +#### Measured baseline + +Reference numbers from recent Locust runs against a MySQL backend +(asyncmy driver). All runs hold zero failures throughout. + +**Single worker** (unless noted): + +| Metric | 500u, tracing on | 1500u, tracing on | 1500u, tracing **off** | 1500u, tracing off, **pinned to 1 core** | 1500u, tracing off, **12 workers** | +|---|---|---|---|---|---| +| Requests served | 396,672 | 232,648 | 277,214 | 3,532 | 308,024 | +| Failures | 0 | 0 | 0 | 0 | 0 | +| Throughput (current RPS) | ~960 | ~880 | ~990 | ~46 | ~1,585 | +| Average latency | 465 ms | 1,774 ms | 1,489 ms | 21.7 s | 930 ms | +| Median (p50) | 100 ms | 690 ms | 340 ms | 270 ms | 700 ms | +| p95 | 1.9 s | 6.5 s | 5.7 s | 115 s | 2.7 s | +| p99 | 2.9 s | 9.5 s | 8.4 s | 122 s | 4.2 s | +| Max observed | 8.3 s | 24.4 s | 20.9 s | 124.5 s | 16.5 s | + +Ramp is 15 users/s for the 500u column, 40 users/s otherwise. 
+ +Takeaways: + +- **Tracing off**: at 1500 users, flipping `DECNET_TRACING=false` + halves p50 (690 → 340 ms) and pushes RPS from ~880 past the + 500-user figure on a single worker. +- **12 workers**: RPS scales ~1.6× over a single worker (~990 → + ~1585). Sublinear because the workload is DB-bound — MySQL and the + connection pool become the new ceiling, not Python. p99 drops from + 8.4 s to 4.2 s. +- **Connection math**: (`DECNET_DB_POOL_SIZE=20` + `DECNET_DB_MAX_OVERFLOW=40`) + × 12 workers = 720 connections at peak. MySQL's default + `max_connections=151` needs bumping (we used 2000) before running + multi-worker load. +- **Single-core pinning**: ~46 RPS with p95 near two minutes. Interesting + as a "physics floor" datapoint — not a production config. + +Top endpoints by volume: `/api/v1/attackers`, `/api/v1/deckies`, +`/api/v1/bounty`, `/api/v1/logs/histogram`, `/api/v1/config`, +`/api/v1/health`, `/api/v1/auth/login`, `/api/v1/logs`. + +Notes on tuning: + +- **Python 3.14 is currently a no-go for the API server.** Under heavy + concurrent async load the reworked 3.14 GC segfaults inside + `mark_all_reachable` (observed in `_PyGC_Collect` during pending-GC + on 3.14.3). Stick to Python 3.11–3.13 until upstream stabilises. +- Router-level TTL caches on hot count/stats endpoints (`/stats`, + `/logs` count, `/attackers` count, `/bounty`, `/logs/histogram`, + `/deckies`, `/config`) collapse concurrent duplicate work onto a + single DB hit per window — essential to reach this RPS on one worker. +- Turning off request tracing (`DECNET_TRACING=false`) is the next + free headroom: tracing was still on during the run above. +- On SQLite, `DECNET_DB_POOL_PRE_PING=false` skips the per-checkout + `SELECT 1`. On MySQL, keep it `true` — network disconnects are real. + +#### System tuning: open file limit + +Under heavy load (500+ concurrent users), the server will exhaust the default Linux open file limit (`ulimit -n`), causing `OSError: [Errno 24] Too many open files`.
Most distros default to **1024**, which is far too low for stress testing or production use. + +**Before running stress tests:** + +```bash +# Check current limit +ulimit -n + +# Bump for this shell session +ulimit -n 65536 +``` + +**Permanent fix** — add to `/etc/security/limits.conf`: + +``` +* soft nofile 65536 +* hard nofile 65536 +``` + +Or for systemd-managed services, add `LimitNOFILE=65536` to the unit file. + +> This applies to production deployments too — any server handling hundreds of concurrent connections needs a raised file descriptor limit. + # AI Disclosure This project has been made with lots, and I mean lots of help from AIs. While most of the design was made by me, most of the coding was done by AI models. diff --git a/decnet.collector.log b/decnet.collector.log deleted file mode 100644 index bac1371..0000000 --- a/decnet.collector.log +++ /dev/null @@ -1 +0,0 @@ -Collector starting → /home/anti/Tools/DECNET/decnet.log diff --git a/decnet.ini.example b/decnet.ini.example new file mode 100644 index 0000000..2169896 --- /dev/null +++ b/decnet.ini.example @@ -0,0 +1,64 @@ +; /etc/decnet/decnet.ini — DECNET host configuration +; +; Copy to /etc/decnet/decnet.ini and edit. Values here seed os.environ at +; CLI startup via setdefault() — real env vars still win, so you can +; override any value on the shell without editing this file. +; +; A missing file is fine; every daemon has sensible defaults. The main +; reason to use this file is to skip typing the same flags on every +; `decnet` invocation and to pin a host's role via `mode`. + +[decnet] +; mode = agent | master +; agent — worker host (runs `decnet agent`, `decnet forwarder`, `decnet updater`). +; Master-only commands (api, swarmctl, swarm, deploy, teardown, ...) +; are hidden from `decnet --help` and refuse to run. +; master — central server (runs `decnet api`, `decnet web`, `decnet swarmctl`, +; `decnet listener`). All commands visible. 
+mode = agent + +; disallow-master = true (default when mode=agent) +; Set to false for hybrid dev hosts that legitimately run both roles. +disallow-master = true + +; log-directory — root for DECNET's per-component logs. Systemd units set +; DECNET_SYSTEM_LOGS=/decnet..log so agent, forwarder, +; and engine each get their own file. The forwarder tails decnet.log. +log-directory = /var/log/decnet + + +; ─── Agent-only settings (read when mode=agent) ─────────────────────────── +[agent] +; Where the master's syslog-TLS listener lives. DECNET_SWARM_MASTER_HOST. +master-host = 192.168.1.50 +; Master listener port (RFC 5425 default 6514). DECNET_SWARM_SYSLOG_PORT. +swarm-syslog-port = 6514 +; Bind address/port for this worker's agent API (mTLS). +agent-port = 8765 +; Cert bundle dir — must contain ca.crt, worker.crt, worker.key from enroll. +; DECNET_AGENT_DIR — honored by the forwarder child as well. +agent-dir = /home/anti/.decnet/agent +; Updater cert bundle (required for `decnet updater`). +updater-dir = /home/anti/.decnet/updater + + +; ─── Master-only settings (read when mode=master) ───────────────────────── +[master] +; Main API (REST for the React dashboard). DECNET_API_HOST / _PORT. +api-host = 0.0.0.0 +api-port = 8000 +; React dev-server dashboard (`decnet web`). DECNET_WEB_HOST / _PORT. +web-host = 0.0.0.0 +web-port = 8080 +; Swarm controller (master-internal). DECNET_SWARMCTL_HOST isn't exposed +; under that name today — this block is the forward-compatible spelling. +; swarmctl-host = 127.0.0.1 +; swarmctl-port = 8770 +; Syslog-over-TLS listener bind address and port. DECNET_LISTENER_HOST and +; DECNET_SWARM_SYSLOG_PORT. The listener is auto-spawned by `decnet swarmctl`. +listener-host = 0.0.0.0 +swarm-syslog-port = 6514 +; Master CA dir (for enroll / swarm cert issuance). +; ca-dir = /home/anti/.decnet/ca +; JWT secret for the web API. MUST be set; 32+ bytes. Keep out of git. 
+; jwt-secret = REPLACE_ME_WITH_A_32_BYTE_SECRET diff --git a/decnet/__init__.py b/decnet/__init__.py index e69de29..999a57b 100644 --- a/decnet/__init__.py +++ b/decnet/__init__.py @@ -0,0 +1,12 @@ +"""DECNET — honeypot deception-network framework. + +This __init__ runs once, on the first `import decnet.*`. It seeds +os.environ from /etc/decnet/decnet.ini (if present) so that later +module-level reads in decnet.env pick up the INI values as if they had +been exported by the shell. Real env vars always win via setdefault(). + +Kept minimal on purpose — any heavier work belongs in a submodule. +""" +from decnet.config_ini import load_ini_config as _load_ini_config + +_load_ini_config() diff --git a/decnet/agent/__init__.py b/decnet/agent/__init__.py new file mode 100644 index 0000000..6d65c0f --- /dev/null +++ b/decnet/agent/__init__.py @@ -0,0 +1,7 @@ +"""DECNET worker agent — runs on every SWARM worker host. + +Exposes an mTLS-protected FastAPI service the master's SWARM controller +calls to deploy, mutate, and tear down deckies locally. The agent reuses +the existing `decnet.engine.deployer` code path unchanged, so a worker runs +deckies the same way `decnet deploy --mode unihost` does today. +""" diff --git a/decnet/agent/app.py b/decnet/agent/app.py new file mode 100644 index 0000000..16639f4 --- /dev/null +++ b/decnet/agent/app.py @@ -0,0 +1,144 @@ +"""Worker-side FastAPI app. + +Protected by mTLS at the ASGI/uvicorn transport layer: uvicorn is started +with ``--ssl-ca-certs`` + ``--ssl-cert-reqs 2`` (CERT_REQUIRED), so any +client that cannot prove a cert signed by the DECNET CA is rejected before +reaching a handler. Once past the TLS handshake, all peers are trusted +equally (the only entity holding a CA-signed cert is the master +controller). 
+ +Endpoints mirror the existing unihost CLI verbs: + +* ``POST /deploy`` — body: serialized ``DecnetConfig`` +* ``POST /teardown`` — body: optional ``{"decky_id": "..."}`` +* ``POST /mutate`` — body: ``{"decky_id": "...", "services": [...]}`` +* ``GET /status`` — deployment snapshot +* ``GET /health`` — liveness probe, does NOT require mTLS? No — mTLS + still required; master pings it with its cert. +""" +from __future__ import annotations + +from contextlib import asynccontextmanager +from typing import Optional + +from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field + +from decnet.agent import executor as _exec +from decnet.agent import heartbeat as _heartbeat +from decnet.config import DecnetConfig +from decnet.logging import get_logger + +log = get_logger("agent.app") + + +@asynccontextmanager +async def _lifespan(app: FastAPI): + # Best-effort: if identity/bundle plumbing isn't configured (e.g. dev + # runs or non-enrolled hosts), heartbeat.start() is a silent no-op. 
+ _heartbeat.start() + try: + yield + finally: + await _heartbeat.stop() + + +app = FastAPI( + title="DECNET SWARM Agent", + version="0.1.0", + docs_url=None, # no interactive docs on worker — narrow attack surface + redoc_url=None, + openapi_url=None, + lifespan=_lifespan, + responses={ + 400: {"description": "Malformed request body"}, + 500: {"description": "Executor error"}, + }, +) + + +# ------------------------------------------------------------------ schemas + +class DeployRequest(BaseModel): + config: DecnetConfig = Field(..., description="Full DecnetConfig to materialise on this worker") + dry_run: bool = False + no_cache: bool = False + + +class TeardownRequest(BaseModel): + decky_id: Optional[str] = None + + +class MutateRequest(BaseModel): + decky_id: str + services: list[str] + + +# ------------------------------------------------------------------ routes + +@app.get("/health") +async def health() -> dict[str, str]: + return {"status": "ok"} + + +@app.get("/status") +async def status() -> dict: + return await _exec.status() + + +@app.post( + "/deploy", + responses={500: {"description": "Deployer raised an exception materialising the config"}}, +) +async def deploy(req: DeployRequest) -> dict: + try: + await _exec.deploy(req.config, dry_run=req.dry_run, no_cache=req.no_cache) + except Exception as exc: + log.exception("agent.deploy failed") + raise HTTPException(status_code=500, detail=str(exc)) from exc + return {"status": "deployed", "deckies": len(req.config.deckies)} + + +@app.post( + "/teardown", + responses={500: {"description": "Teardown raised an exception"}}, +) +async def teardown(req: TeardownRequest) -> dict: + try: + await _exec.teardown(req.decky_id) + except Exception as exc: + log.exception("agent.teardown failed") + raise HTTPException(status_code=500, detail=str(exc)) from exc + return {"status": "torn_down", "decky_id": req.decky_id} + + +@app.post( + "/self-destruct", + responses={500: {"description": "Reaper could not be 
scheduled"}}, +) +async def self_destruct() -> dict: + """Stop all DECNET services on this worker and delete the install + footprint. Called by the master during decommission. Logs under + /var/log/decnet* are preserved. Fire-and-forget — returns 200 before + the reaper starts deleting files.""" + try: + await _exec.self_destruct() + except Exception as exc: + log.exception("agent.self_destruct failed") + raise HTTPException(status_code=500, detail=str(exc)) from exc + return {"status": "self_destruct_scheduled"} + + +@app.post( + "/mutate", + responses={501: {"description": "Worker-side mutate not yet implemented"}}, +) +async def mutate(req: MutateRequest) -> dict: + # TODO: implement worker-side mutate. Currently the master performs + # mutation by re-sending a full /deploy with the updated DecnetConfig; + # this avoids duplicating mutation logic on the worker for v1. When + # ready, replace the 501 with a real redeploy-of-a-single-decky path. + raise HTTPException( + status_code=501, + detail="Per-decky mutate is performed via /deploy with updated services", + ) diff --git a/decnet/agent/executor.py b/decnet/agent/executor.py new file mode 100644 index 0000000..6985143 --- /dev/null +++ b/decnet/agent/executor.py @@ -0,0 +1,223 @@ +"""Thin adapter between the agent's HTTP endpoints and the existing +``decnet.engine.deployer`` code path. + +Kept deliberately small: the agent does not re-implement deployment logic, +it only translates a master RPC into the same function calls the unihost +CLI already uses. Everything runs in a worker thread (the deployer is +blocking) so the FastAPI event loop stays responsive.
+""" +from __future__ import annotations + +import asyncio +from ipaddress import IPv4Network +from typing import Any + +from decnet.engine import deployer as _deployer +from decnet.config import DecnetConfig, load_state, clear_state +from decnet.logging import get_logger +from decnet.network import ( + allocate_ips, + detect_interface, + detect_subnet, + get_host_ip, +) + +log = get_logger("agent.executor") + + +def _relocalize(config: DecnetConfig) -> DecnetConfig: + """Rewrite a master-built config to the worker's local network reality. + + The master populates ``interface``/``subnet``/``gateway`` from its own + box before dispatching, which blows up the deployer on any worker whose + NIC name differs (common in heterogeneous fleets — master on ``wlp6s0``, + worker on ``enp0s3``). We always re-detect locally; if the worker sits + on a different subnet than the master, decky IPs are re-allocated from + the worker's subnet so they're actually reachable. + """ + local_iface = detect_interface() + local_subnet, local_gateway = detect_subnet(local_iface) + local_host_ip = get_host_ip(local_iface) + + updates: dict[str, Any] = { + "interface": local_iface, + "subnet": local_subnet, + "gateway": local_gateway, + } + + master_net = IPv4Network(config.subnet, strict=False) if config.subnet else None + local_net = IPv4Network(local_subnet, strict=False) + if master_net is None or master_net != local_net: + log.info( + "agent.deploy subnet mismatch master=%s local=%s — re-allocating decky IPs", + config.subnet, local_subnet, + ) + fresh_ips = allocate_ips( + subnet=local_subnet, + gateway=local_gateway, + host_ip=local_host_ip, + count=len(config.deckies), + ) + new_deckies = [d.model_copy(update={"ip": ip}) for d, ip in zip(config.deckies, fresh_ips)] + updates["deckies"] = new_deckies + + return config.model_copy(update=updates) + + +async def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False) -> None: + """Run the blocking deployer off-loop. 
The deployer itself calls + save_state() internally once the compose file is materialised.""" + log.info( + "agent.deploy mode=%s deckies=%d interface=%s (incoming)", + config.mode, len(config.deckies), config.interface, + ) + if config.mode == "swarm": + config = _relocalize(config) + log.info( + "agent.deploy relocalized interface=%s subnet=%s gateway=%s", + config.interface, config.subnet, config.gateway, + ) + await asyncio.to_thread(_deployer.deploy, config, dry_run, no_cache, False) + + +async def teardown(decky_id: str | None = None) -> None: + log.info("agent.teardown decky_id=%s", decky_id) + await asyncio.to_thread(_deployer.teardown, decky_id) + if decky_id is None: + await asyncio.to_thread(clear_state) + + +def _decky_runtime_states(config: DecnetConfig) -> dict[str, dict[str, Any]]: + """Map decky_name → {"running": bool, "services": {svc: container_state}}. + + Queried so the master can tell, after a partial-failure deploy, which + deckies actually came up instead of tainting the whole shard as failed. + Best-effort: a docker error returns an empty map, not an exception. + """ + try: + import docker # local import — agent-only path + client = docker.from_env() + live = {c.name: c.status for c in client.containers.list(all=True, ignore_removed=True)} + except Exception: # pragma: no cover — defensive + log.exception("_decky_runtime_states: docker query failed") + return {} + + out: dict[str, dict[str, Any]] = {} + for d in config.deckies: + svc_states = { + svc: live.get(f"{d.name}-{svc.replace('_', '-')}", "absent") + for svc in d.services + } + out[d.name] = { + "running": bool(svc_states) and all(s == "running" for s in svc_states.values()), + "services": svc_states, + } + return out + + +_REAPER_SCRIPT = r"""#!/bin/bash +# DECNET agent self-destruct reaper. +# Runs detached from the agent process so it survives the agent's death. +# Waits briefly for the HTTP response to drain, then stops services, +# wipes install paths, and preserves logs. 
+set +e + +sleep 3 + +# Stop decky containers started by the local deployer (best-effort). +if command -v docker >/dev/null 2>&1; then + docker ps -q --filter "label=com.docker.compose.project=decnet" | xargs -r docker stop + docker ps -aq --filter "label=com.docker.compose.project=decnet" | xargs -r docker rm -f + docker network rm decnet_lan 2>/dev/null +fi + +# Stop+disable every systemd unit the installer may have dropped. +for unit in decnet-agent decnet-engine decnet-collector decnet-forwarder decnet-prober decnet-sniffer decnet-updater; do + systemctl stop "$unit" 2>/dev/null + systemctl disable "$unit" 2>/dev/null +done + +# Nuke install paths. Logs under /var/log/decnet* are intentionally +# preserved — the operator typically wants them for forensic review. +rm -rf /opt/decnet* /var/lib/decnet/* /usr/local/bin/decnet* /etc/decnet +rm -f /etc/systemd/system/decnet-*.service /etc/systemd/system/decnet-*.timer + +systemctl daemon-reload 2>/dev/null +rm -f "$0" +""" + + +async def self_destruct() -> None: + """Tear down deckies, then spawn a detached reaper that wipes the + install footprint. Returns immediately so the HTTP response can drain + before the reaper starts deleting files out from under the agent.""" + import os + import shutil + import subprocess # nosec B404 + import tempfile + + # Best-effort teardown first — the reaper also runs docker stop, but + # going through the deployer gives the host-macvlan/ipvlan helper a + # chance to clean up routes cleanly. + try: + await asyncio.to_thread(_deployer.teardown, None) + await asyncio.to_thread(clear_state) + except Exception: + log.exception("self_destruct: pre-reap teardown failed — reaper will force-stop containers") + + # Reaper lives under /tmp so it survives rm -rf /opt/decnet*. 
+ fd, path = tempfile.mkstemp(prefix="decnet-reaper-", suffix=".sh", dir="/tmp") # nosec B108 — reaper must outlive /opt/decnet removal + try: + os.write(fd, _REAPER_SCRIPT.encode()) + finally: + os.close(fd) + os.chmod(path, 0o700) # nosec B103 — root-owned reaper, needs exec + + # The reaper MUST run outside decnet-agent.service's cgroup — otherwise + # `systemctl stop decnet-agent` SIGTERMs the whole cgroup (reaper included) + # before rm -rf completes. `start_new_session=True` gets us a fresh POSIX + # session but does NOT escape the systemd cgroup. So we prefer + # `systemd-run --scope` (launches the command in a transient scope + # detached from the caller's service), falling back to a bare Popen if + # systemd-run is unavailable (non-systemd host / container). + systemd_run = shutil.which("systemd-run") + if systemd_run: + argv = [ + systemd_run, + "--collect", + "--unit", f"decnet-reaper-{os.getpid()}", + "--description", "DECNET agent self-destruct reaper", + "/bin/bash", path, + ] + spawn_kwargs = {"start_new_session": True} + else: + argv = ["/bin/bash", path] + spawn_kwargs = {"start_new_session": True} + + subprocess.Popen( # nosec B603 + argv, + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.DEVNULL, + close_fds=True, + **spawn_kwargs, + ) + log.warning( + "self_destruct: reaper spawned path=%s via=%s — agent will die in ~3s", + path, "systemd-run" if systemd_run else "popen", + ) + + +async def status() -> dict[str, Any]: + state = await asyncio.to_thread(load_state) + if state is None: + return {"deployed": False, "deckies": []} + config, _compose_path = state + runtime = await asyncio.to_thread(_decky_runtime_states, config) + return { + "deployed": True, + "mode": config.mode, + "compose_path": str(_compose_path), + "deckies": [d.model_dump() for d in config.deckies], + "runtime": runtime, + } diff --git a/decnet/agent/heartbeat.py b/decnet/agent/heartbeat.py new file mode 100644 index 0000000..bbc00aa --- /dev/null +++ 
b/decnet/agent/heartbeat.py @@ -0,0 +1,134 @@ +"""Agent → master liveness heartbeat loop. + +Every ``INTERVAL_S`` seconds the worker posts ``executor.status()`` to +``POST /swarm/heartbeat`` over mTLS. The master pins the +presented client cert's SHA-256 against the ``SwarmHost`` row for the +claimed ``host_uuid``; a match refreshes ``last_heartbeat`` + each +``DeckyShard``'s snapshot + runtime state. + +Identity comes from ``/etc/decnet/decnet.ini`` (seeded by the enroll +bundle) — specifically ``DECNET_HOST_UUID`` and ``DECNET_MASTER_HOST``. +The worker's existing ``~/.decnet/agent/`` bundle (or +``/etc/decnet/agent/``) provides the mTLS client cert. + +Started/stopped via the agent FastAPI app's lifespan. If identity +plumbing is missing (pre-enrollment dev runs) the loop logs at DEBUG and +declines to start — callers don't have to guard it. +""" +from __future__ import annotations + +import asyncio +import pathlib +from typing import Optional + +import httpx + +from decnet.agent import executor as _exec +from decnet.logging import get_logger +from decnet.swarm import pki +from decnet.swarm.log_forwarder import build_worker_ssl_context + +log = get_logger("agent.heartbeat") + +INTERVAL_S = 30.0 +_TIMEOUT = httpx.Timeout(connect=5.0, read=10.0, write=5.0, pool=5.0) + +_task: Optional[asyncio.Task] = None + + +def _resolve_agent_dir() -> pathlib.Path: + """Match the agent-dir resolution order used by the agent server: + DECNET_AGENT_DIR env, else /etc/decnet/agent (production install), + else ~/.decnet/agent (dev).""" + import os + env = os.environ.get("DECNET_AGENT_DIR") + if env: + return pathlib.Path(env) + system = pathlib.Path("/etc/decnet/agent") + if system.exists(): + return system + return pki.DEFAULT_AGENT_DIR + + +async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None: + snap = await _exec.status() + resp = await client.post( + url, + json={ + "host_uuid": host_uuid, + "agent_version": agent_version, + "status": 
snap, + }, + ) + # 403 / 404 are terminal-ish — we still keep looping because an + # operator may re-enrol the host mid-session, but we log loudly so + # prod ops can spot cert-pinning drift. + if resp.status_code == 204: + return + log.warning( + "heartbeat rejected status=%d body=%s", + resp.status_code, resp.text[:200], + ) + + +async def _loop(url: str, host_uuid: str, agent_version: str, ssl_ctx) -> None: + log.info("heartbeat loop starting url=%s host_uuid=%s interval=%ss", + url, host_uuid, INTERVAL_S) + async with httpx.AsyncClient(verify=ssl_ctx, timeout=_TIMEOUT) as client: + while True: + try: + await _tick(client, url, host_uuid, agent_version) + except asyncio.CancelledError: + raise + except Exception: + log.exception("heartbeat tick failed — will retry in %ss", INTERVAL_S) + await asyncio.sleep(INTERVAL_S) + + +def start() -> Optional[asyncio.Task]: + """Kick off the background heartbeat task. No-op if identity is + unconfigured (dev mode) — the caller doesn't need to check.""" + global _task + from decnet.env import ( + DECNET_HOST_UUID, + DECNET_MASTER_HOST, + DECNET_SWARMCTL_PORT, + ) + + if _task is not None and not _task.done(): + return _task + if not DECNET_HOST_UUID or not DECNET_MASTER_HOST: + log.debug("heartbeat not starting — DECNET_HOST_UUID or DECNET_MASTER_HOST unset") + return None + + agent_dir = _resolve_agent_dir() + try: + ssl_ctx = build_worker_ssl_context(agent_dir) + except Exception: + log.exception("heartbeat not starting — worker SSL context unavailable at %s", agent_dir) + return None + + try: + from decnet import __version__ as _v + agent_version = _v + except Exception: + agent_version = "unknown" + + url = f"https://{DECNET_MASTER_HOST}:{DECNET_SWARMCTL_PORT}/swarm/heartbeat" + _task = asyncio.create_task( + _loop(url, DECNET_HOST_UUID, agent_version, ssl_ctx), + name="agent-heartbeat", + ) + return _task + + +async def stop() -> None: + global _task + if _task is None: + return + _task.cancel() + try: + await _task + 
except (asyncio.CancelledError, Exception): + pass + _task = None diff --git a/decnet/agent/server.py b/decnet/agent/server.py new file mode 100644 index 0000000..663bc35 --- /dev/null +++ b/decnet/agent/server.py @@ -0,0 +1,70 @@ +"""Worker-agent uvicorn launcher. + +Starts ``decnet.agent.app:app`` over HTTPS with mTLS enforcement. The +worker must already have a bundle in ``~/.decnet/agent/`` (delivered by +``decnet swarm enroll`` from the master); if it does not, we refuse to +start — unauthenticated agents are not a supported mode. +""" +from __future__ import annotations + +import os +import pathlib +import signal +import subprocess # nosec B404 +import sys + +from decnet.logging import get_logger +from decnet.swarm import pki + +log = get_logger("agent.server") + + +def run(host: str, port: int, agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR) -> int: + bundle = pki.load_worker_bundle(agent_dir) + if bundle is None: + print( + f"[agent] No cert bundle at {agent_dir}. " + f"Run `decnet swarm enroll` from the master first.", + file=sys.stderr, + ) + return 2 + + keyfile = agent_dir / "worker.key" + certfile = agent_dir / "worker.crt" + cafile = agent_dir / "ca.crt" + + cmd = [ + sys.executable, + "-m", + "uvicorn", + "decnet.agent.app:app", + "--host", + host, + "--port", + str(port), + "--ssl-keyfile", + str(keyfile), + "--ssl-certfile", + str(certfile), + "--ssl-ca-certs", + str(cafile), + # 2 == ssl.CERT_REQUIRED — clients MUST present a CA-signed cert. + "--ssl-cert-reqs", + "2", + ] + log.info("agent starting host=%s port=%d bundle=%s", host, port, agent_dir) + # Own process group for clean Ctrl+C / SIGTERM propagation to uvicorn + # workers (same pattern as `decnet api`). 
+ proc = subprocess.Popen(cmd, start_new_session=True) # nosec B603 + try: + return proc.wait() + except KeyboardInterrupt: + try: + os.killpg(proc.pid, signal.SIGTERM) + try: + return proc.wait(timeout=10) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + return proc.wait() + except ProcessLookupError: + return 0 diff --git a/decnet/cli.py b/decnet/cli.py deleted file mode 100644 index 91415e5..0000000 --- a/decnet/cli.py +++ /dev/null @@ -1,478 +0,0 @@ -""" -DECNET CLI — entry point for all commands. - -Usage: - decnet deploy --mode unihost --deckies 5 --randomize-services - decnet status - decnet teardown [--all | --id decky-01] - decnet services -""" - -import signal -from typing import Optional - -import typer -from rich.console import Console -from rich.table import Table - -from decnet.env import ( - DECNET_API_HOST, - DECNET_API_PORT, - DECNET_INGEST_LOG_FILE, - DECNET_WEB_HOST, - DECNET_WEB_PORT, -) -from decnet.archetypes import Archetype, all_archetypes, get_archetype -from decnet.config import ( - DecnetConfig, -) -from decnet.distros import all_distros, get_distro -from decnet.fleet import all_service_names, build_deckies, build_deckies_from_ini -from decnet.ini_loader import load_ini -from decnet.network import detect_interface, detect_subnet, allocate_ips, get_host_ip -from decnet.services.registry import all_services - -app = typer.Typer( - name="decnet", - help="Deploy a deception network of honeypot deckies on your LAN.", - no_args_is_help=True, -) -console = Console() - - -def _kill_api() -> None: - """Find and kill any running DECNET API (uvicorn) or mutator processes.""" - import psutil - import os - - _killed: bool = False - for _proc in psutil.process_iter(['pid', 'name', 'cmdline']): - try: - _cmd = _proc.info['cmdline'] - if not _cmd: - continue - if "uvicorn" in _cmd and "decnet.web.api:app" in _cmd: - console.print(f"[yellow]Stopping DECNET API (PID {_proc.info['pid']})...[/]") - os.kill(_proc.info['pid'], 
signal.SIGTERM) - _killed = True - elif "decnet.cli" in _cmd and "mutate" in _cmd and "--watch" in _cmd: - console.print(f"[yellow]Stopping DECNET Mutator Watcher (PID {_proc.info['pid']})...[/]") - os.kill(_proc.info['pid'], signal.SIGTERM) - _killed = True - except (psutil.NoSuchProcess, psutil.AccessDenied): - continue - - if _killed: - console.print("[green]Background processes stopped.[/]") - - -@app.command() -def api( - port: int = typer.Option(DECNET_API_PORT, "--port", help="Port for the backend API"), - host: str = typer.Option(DECNET_API_HOST, "--host", help="Host IP for the backend API"), - log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Path to the DECNET log file to monitor"), -) -> None: - """Run the DECNET API and Web Dashboard in standalone mode.""" - import subprocess # nosec B404 - import sys - import os - - console.print(f"[green]Starting DECNET API on {host}:{port}...[/]") - _env: dict[str, str] = os.environ.copy() - _env["DECNET_INGEST_LOG_FILE"] = str(log_file) - try: - subprocess.run( # nosec B603 B404 - [sys.executable, "-m", "uvicorn", "decnet.web.api:app", "--host", host, "--port", str(port)], - env=_env - ) - except KeyboardInterrupt: - pass - except (FileNotFoundError, subprocess.SubprocessError): - console.print("[red]Failed to start API. 
Ensure 'uvicorn' is installed in the current environment.[/]") - - -@app.command() -def deploy( - mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"), - deckies: Optional[int] = typer.Option(None, "--deckies", "-n", help="Number of deckies to deploy (required without --config)", min=1), - interface: Optional[str] = typer.Option(None, "--interface", "-i", help="Host NIC (auto-detected if omitted)"), - subnet: Optional[str] = typer.Option(None, "--subnet", help="LAN subnet CIDR (auto-detected if omitted)"), - ip_start: Optional[str] = typer.Option(None, "--ip-start", help="First decky IP (auto if omitted)"), - services: Optional[str] = typer.Option(None, "--services", help="Comma-separated services, e.g. ssh,smb,rdp"), - randomize_services: bool = typer.Option(False, "--randomize-services", help="Assign random services to each decky"), - distro: Optional[str] = typer.Option(None, "--distro", help="Comma-separated distro slugs, e.g. debian,ubuntu22,rocky9"), - randomize_distros: bool = typer.Option(False, "--randomize-distros", help="Assign a random distro to each decky"), - log_file: Optional[str] = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Host path for the collector to write RFC 5424 logs (e.g. /var/log/decnet/decnet.log)"), - archetype_name: Optional[str] = typer.Option(None, "--archetype", "-a", help="Machine archetype slug (e.g. 
linux-server, windows-workstation)"), - mutate_interval: Optional[int] = typer.Option(30, "--mutate-interval", help="Automatically rotate services every N minutes"), - dry_run: bool = typer.Option(False, "--dry-run", help="Generate compose file without starting containers"), - no_cache: bool = typer.Option(False, "--no-cache", help="Force rebuild all images, ignoring Docker layer cache"), - parallel: bool = typer.Option(False, "--parallel", help="Build all images concurrently (enables BuildKit, separates build from up)"), - ipvlan: bool = typer.Option(False, "--ipvlan", help="Use IPvlan L2 instead of MACVLAN (required on WiFi interfaces)"), - config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Path to INI config file"), - api: bool = typer.Option(False, "--api", help="Start the FastAPI backend to ingest and serve logs"), - api_port: int = typer.Option(8000, "--api-port", help="Port for the backend API"), -) -> None: - """Deploy deckies to the LAN.""" - import os - if mode not in ("unihost", "swarm"): - console.print("[red]--mode must be 'unihost' or 'swarm'[/]") - raise typer.Exit(1) - - # ------------------------------------------------------------------ # - # Config-file path # - # ------------------------------------------------------------------ # - if config_file: - try: - ini = load_ini(config_file) - except FileNotFoundError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - iface = interface or ini.interface or detect_interface() - subnet_cidr = subnet or ini.subnet - effective_gateway = ini.gateway - if subnet_cidr is None: - subnet_cidr, effective_gateway = detect_subnet(iface) - elif effective_gateway is None: - _, effective_gateway = detect_subnet(iface) - - host_ip = get_host_ip(iface) - console.print(f"[dim]Config:[/] {config_file} [dim]Interface:[/] {iface} " - f"[dim]Subnet:[/] {subnet_cidr} [dim]Gateway:[/] {effective_gateway} " - f"[dim]Host IP:[/] {host_ip}") - - if ini.custom_services: - from 
decnet.custom_service import CustomService - from decnet.services.registry import register_custom_service - for cs in ini.custom_services: - register_custom_service( - CustomService( - name=cs.name, - image=cs.image, - exec_cmd=cs.exec_cmd, - ports=cs.ports, - ) - ) - - effective_log_file = log_file - try: - decky_configs = build_deckies_from_ini( - ini, subnet_cidr, effective_gateway, host_ip, randomize_services, cli_mutate_interval=mutate_interval - ) - except ValueError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - # ------------------------------------------------------------------ # - # Classic CLI path # - # ------------------------------------------------------------------ # - else: - if deckies is None: - console.print("[red]--deckies is required when --config is not used.[/]") - raise typer.Exit(1) - - services_list = [s.strip() for s in services.split(",")] if services else None - if services_list: - known = set(all_service_names()) - unknown = [s for s in services_list if s not in known] - if unknown: - console.print(f"[red]Unknown service(s): {unknown}. 
Available: {all_service_names()}[/]") - raise typer.Exit(1) - - arch: Archetype | None = None - if archetype_name: - try: - arch = get_archetype(archetype_name) - except ValueError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - if not services_list and not randomize_services and not arch: - console.print("[red]Specify --services, --archetype, or --randomize-services.[/]") - raise typer.Exit(1) - - iface = interface or detect_interface() - if subnet is None: - subnet_cidr, effective_gateway = detect_subnet(iface) - else: - subnet_cidr = subnet - _, effective_gateway = detect_subnet(iface) - - host_ip = get_host_ip(iface) - console.print(f"[dim]Interface:[/] {iface} [dim]Subnet:[/] {subnet_cidr} " - f"[dim]Gateway:[/] {effective_gateway} [dim]Host IP:[/] {host_ip}") - - distros_list = [d.strip() for d in distro.split(",")] if distro else None - if distros_list: - try: - for slug in distros_list: - get_distro(slug) - except ValueError as e: - console.print(f"[red]{e}[/]") - raise typer.Exit(1) - - ips = allocate_ips(subnet_cidr, effective_gateway, host_ip, deckies, ip_start) - decky_configs = build_deckies( - deckies, ips, services_list, randomize_services, - distros_explicit=distros_list, randomize_distros=randomize_distros, - archetype=arch, mutate_interval=mutate_interval, - ) - effective_log_file = log_file - - if api and not effective_log_file: - effective_log_file = os.path.join(os.getcwd(), "decnet.log") - console.print(f"[cyan]API mode enabled: defaulting log-file to {effective_log_file}[/]") - - config = DecnetConfig( - mode=mode, - interface=iface, - subnet=subnet_cidr, - gateway=effective_gateway, - deckies=decky_configs, - log_file=effective_log_file, - ipvlan=ipvlan, - mutate_interval=mutate_interval, - ) - - from decnet.engine import deploy as _deploy - _deploy(config, dry_run=dry_run, no_cache=no_cache, parallel=parallel) - - if mutate_interval is not None and not dry_run: - import subprocess # nosec B404 - import sys - 
console.print(f"[green]Starting DECNET Mutator watcher in the background (interval: {mutate_interval}m)...[/]") - try: - subprocess.Popen( # nosec B603 - [sys.executable, "-m", "decnet.cli", "mutate", "--watch"], - stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT, - start_new_session=True, - ) - except (FileNotFoundError, subprocess.SubprocessError): - console.print("[red]Failed to start mutator watcher.[/]") - - if effective_log_file and not dry_run and not api: - import subprocess # nosec B404 - import sys - from pathlib import Path as _Path - _collector_err = _Path(effective_log_file).with_suffix(".collector.log") - console.print(f"[bold cyan]Starting log collector[/] → {effective_log_file}") - subprocess.Popen( # nosec B603 - [sys.executable, "-m", "decnet.cli", "collect", "--log-file", str(effective_log_file)], - stdin=subprocess.DEVNULL, - stdout=open(_collector_err, "a"), # nosec B603 - stderr=subprocess.STDOUT, - start_new_session=True, - ) - - if api and not dry_run: - import subprocess # nosec B404 - import sys - console.print(f"[green]Starting DECNET API on port {api_port}...[/]") - _env: dict[str, str] = os.environ.copy() - _env["DECNET_INGEST_LOG_FILE"] = str(effective_log_file or "") - try: - subprocess.Popen( # nosec B603 - [sys.executable, "-m", "uvicorn", "decnet.web.api:app", "--host", DECNET_API_HOST, "--port", str(api_port)], - env=_env, - stdout=subprocess.DEVNULL, - stderr=subprocess.STDOUT - ) - console.print(f"[dim]API running at http://{DECNET_API_HOST}:{api_port}[/]") - except (FileNotFoundError, subprocess.SubprocessError): - console.print("[red]Failed to start API. 
Ensure 'uvicorn' is installed in the current environment.[/]") - - -@app.command() -def collect( - log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", "-f", help="Path to write RFC 5424 syslog lines and .json records"), -) -> None: - """Stream Docker logs from all running decky service containers to a log file.""" - import asyncio - from decnet.collector import log_collector_worker - console.print(f"[bold cyan]Collector starting[/] → {log_file}") - asyncio.run(log_collector_worker(log_file)) - - -@app.command() -def mutate( - watch: bool = typer.Option(False, "--watch", "-w", help="Run continuously and mutate deckies according to their interval"), - decky_name: Optional[str] = typer.Option(None, "--decky", "-d", help="Force mutate a specific decky immediately"), - force_all: bool = typer.Option(False, "--all", help="Force mutate all deckies immediately"), -) -> None: - """Manually trigger or continuously watch for decky mutation.""" - import asyncio - from decnet.mutator import mutate_decky, mutate_all, run_watch_loop - from decnet.web.dependencies import repo - - async def _run() -> None: - await repo.initialize() - if watch: - await run_watch_loop(repo) - elif decky_name: - await mutate_decky(decky_name, repo) - elif force_all: - await mutate_all(force=True, repo=repo) - else: - await mutate_all(force=False, repo=repo) - - asyncio.run(_run()) - - -@app.command() -def status() -> None: - """Show running deckies and their status.""" - from decnet.engine import status as _status - _status() - - -@app.command() -def teardown( - all_: bool = typer.Option(False, "--all", help="Tear down all deckies and remove network"), - id_: Optional[str] = typer.Option(None, "--id", help="Tear down a specific decky by name"), -) -> None: - """Stop and remove deckies.""" - if not all_ and not id_: - console.print("[red]Specify --all or --id .[/]") - raise typer.Exit(1) - - from decnet.engine import teardown as _teardown - _teardown(decky_id=id_) - - if all_: - 
_kill_api() - - -@app.command(name="services") -def list_services() -> None: - """List all registered honeypot service plugins.""" - svcs = all_services() - table = Table(title="Available Services", show_lines=True) - table.add_column("Name", style="bold cyan") - table.add_column("Ports") - table.add_column("Image") - for name, svc in sorted(svcs.items()): - table.add_row(name, ", ".join(str(p) for p in svc.ports), svc.default_image) - console.print(table) - - -@app.command(name="distros") -def list_distros() -> None: - """List all available OS distro profiles for deckies.""" - table = Table(title="Available Distro Profiles", show_lines=True) - table.add_column("Slug", style="bold cyan") - table.add_column("Display Name") - table.add_column("Docker Image", style="dim") - for slug, profile in sorted(all_distros().items()): - table.add_row(slug, profile.display_name, profile.image) - console.print(table) - - -@app.command(name="correlate") -def correlate( - log_file: Optional[str] = typer.Option(None, "--log-file", "-f", help="Path to DECNET syslog file to analyse"), - min_deckies: int = typer.Option(2, "--min-deckies", "-m", help="Minimum number of distinct deckies an IP must touch to be reported"), - output: str = typer.Option("table", "--output", "-o", help="Output format: table | json | syslog"), - emit_syslog: bool = typer.Option(False, "--emit-syslog", help="Also print traversal events as RFC 5424 lines (for SIEM piping)"), -) -> None: - """Analyse logs for cross-decky traversals and print the attacker movement graph.""" - import sys - import json as _json - from pathlib import Path - from decnet.correlation.engine import CorrelationEngine - - engine = CorrelationEngine() - - if log_file: - path = Path(log_file) - if not path.exists(): - console.print(f"[red]Log file not found: {log_file}[/]") - raise typer.Exit(1) - engine.ingest_file(path) - elif not sys.stdin.isatty(): - for line in sys.stdin: - engine.ingest(line) - else: - console.print("[red]Provide 
--log-file or pipe log data via stdin.[/]") - raise typer.Exit(1) - - traversals = engine.traversals(min_deckies) - - if output == "json": - console.print_json(_json.dumps(engine.report_json(min_deckies), indent=2)) - elif output == "syslog": - for line in engine.traversal_syslog_lines(min_deckies): - typer.echo(line) - else: - if not traversals: - console.print( - f"[yellow]No traversals detected " - f"(min_deckies={min_deckies}, events_indexed={engine.events_indexed}).[/]" - ) - else: - console.print(engine.report_table(min_deckies)) - console.print( - f"[dim]Parsed {engine.lines_parsed} lines · " - f"indexed {engine.events_indexed} events · " - f"{len(engine.all_attackers())} unique IPs · " - f"[bold]{len(traversals)}[/] traversal(s)[/]" - ) - - if emit_syslog: - for line in engine.traversal_syslog_lines(min_deckies): - typer.echo(line) - - -@app.command(name="archetypes") -def list_archetypes() -> None: - """List all machine archetype profiles.""" - table = Table(title="Machine Archetypes", show_lines=True) - table.add_column("Slug", style="bold cyan") - table.add_column("Display Name") - table.add_column("Default Services", style="green") - table.add_column("Description", style="dim") - for slug, arch in sorted(all_archetypes().items()): - table.add_row( - slug, - arch.display_name, - ", ".join(arch.services), - arch.description, - ) - console.print(table) - - -@app.command(name="web") -def serve_web( - web_port: int = typer.Option(DECNET_WEB_PORT, "--web-port", help="Port to serve the DECNET Web Dashboard"), - host: str = typer.Option(DECNET_WEB_HOST, "--host", help="Host IP to serve the Web Dashboard"), -) -> None: - """Serve the DECNET Web Dashboard frontend.""" - import http.server - import socketserver - from pathlib import Path - - dist_dir = Path(__file__).parent.parent / "decnet_web" / "dist" - - if not dist_dir.exists(): - console.print(f"[red]Frontend build not found at {dist_dir}. 
Make sure you run 'npm run build' inside 'decnet_web'.[/]") - raise typer.Exit(1) - - class SPAHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): - def do_GET(self): - path = self.translate_path(self.path) - if not Path(path).exists() or Path(path).is_dir(): - self.path = "/index.html" - return super().do_GET() - - import os - os.chdir(dist_dir) - - with socketserver.TCPServer((host, web_port), SPAHTTPRequestHandler) as httpd: - console.print(f"[green]Serving DECNET Web Dashboard on http://{host}:{web_port}[/]") - try: - httpd.serve_forever() - except KeyboardInterrupt: - console.print("\n[dim]Shutting down dashboard server.[/]") - -if __name__ == '__main__': # pragma: no cover - app() diff --git a/decnet/cli/__init__.py b/decnet/cli/__init__.py new file mode 100644 index 0000000..e2976c8 --- /dev/null +++ b/decnet/cli/__init__.py @@ -0,0 +1,80 @@ +""" +DECNET CLI — entry point for all commands. + +Usage: + decnet deploy --mode unihost --deckies 5 --randomize-services + decnet status + decnet teardown [--all | --id decky-01] + decnet services + +Layout: each command module exports ``register(app)`` which attaches its +commands to the passed Typer app. ``__init__.py`` builds the root app, +calls every module's ``register`` in order, then runs the master-only +gate. The gate must fire LAST so it sees the fully-populated dispatch +table before filtering. +""" + +from __future__ import annotations + +import typer + +from . import ( + agent, + api, + db, + deploy, + forwarder, + inventory, + lifecycle, + listener, + profiler, + sniffer, + swarm, + swarmctl, + updater, + web, + workers, +) +from .gating import _gate_commands_by_mode +from .utils import console as console, log as log + +app = typer.Typer( + name="decnet", + help="Deploy a deception network of honeypot deckies on your LAN.", + no_args_is_help=True, +) + +# Order matches the old flat layout so `decnet --help` reads the same. 
+for _mod in ( + api, swarmctl, agent, updater, listener, forwarder, + swarm, + deploy, lifecycle, workers, inventory, + web, profiler, sniffer, db, +): + _mod.register(app) + +_gate_commands_by_mode(app) + +# Backwards-compat re-exports. Tests and third-party tooling import these +# directly from ``decnet.cli``; the refactor must keep them resolvable. +from .db import _db_reset_mysql_async # noqa: E402,F401 +from .gating import ( # noqa: E402,F401 + MASTER_ONLY_COMMANDS, + MASTER_ONLY_GROUPS, + _agent_mode_active, + _require_master_mode, +) +from .utils import ( # noqa: E402,F401 + _daemonize, + _http_request, + _is_running, + _kill_all_services, + _pid_dir, + _service_registry, + _spawn_detached, + _swarmctl_base_url, +) + + +if __name__ == "__main__": # pragma: no cover + app() diff --git a/decnet/cli/agent.py b/decnet/cli/agent.py new file mode 100644 index 0000000..ae89a46 --- /dev/null +++ b/decnet/cli/agent.py @@ -0,0 +1,64 @@ +from __future__ import annotations + +import os +import pathlib as _pathlib +import sys as _sys +from typing import Optional + +import typer + +from . import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def agent( + port: int = typer.Option(8765, "--port", help="Port for the worker agent"), + host: str = typer.Option("0.0.0.0", "--host", help="Bind address for the worker agent"), # nosec B104 + agent_dir: Optional[str] = typer.Option(None, "--agent-dir", help="Worker cert bundle dir (default: ~/.decnet/agent, expanded under the running user's HOME — set this when running as sudo/root)"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + no_forwarder: bool = typer.Option(False, "--no-forwarder", help="Do not auto-spawn the log forwarder alongside the agent"), + ) -> None: + """Run the DECNET SWARM worker agent (requires a cert bundle in ~/.decnet/agent/). 
+ + By default, `decnet agent` auto-spawns `decnet forwarder` as a fully- + detached sibling process so worker logs start flowing to the master + without a second manual invocation. The forwarder survives agent + restarts and crashes — if it dies on its own, restart it manually + with `decnet forwarder --daemon …`. Pass --no-forwarder to skip. + """ + from decnet.agent import server as _agent_server + from decnet.env import DECNET_SWARM_MASTER_HOST, DECNET_INGEST_LOG_FILE + from decnet.swarm import pki as _pki + + resolved_dir = _pathlib.Path(agent_dir) if agent_dir else _pki.DEFAULT_AGENT_DIR + + if daemon: + log.info("agent daemonizing host=%s port=%d", host, port) + _utils._daemonize() + + if not no_forwarder and DECNET_SWARM_MASTER_HOST: + fw_argv = [ + _sys.executable, "-m", "decnet", "forwarder", + "--master-host", DECNET_SWARM_MASTER_HOST, + "--master-port", str(int(os.environ.get("DECNET_SWARM_SYSLOG_PORT", "6514"))), + "--agent-dir", str(resolved_dir), + "--log-file", str(DECNET_INGEST_LOG_FILE), + "--daemon", + ] + try: + pid = _utils._spawn_detached(fw_argv, _utils._pid_dir() / "forwarder.pid") + log.info("agent auto-spawned forwarder pid=%d master=%s", pid, DECNET_SWARM_MASTER_HOST) + console.print(f"[dim]Auto-spawned forwarder (pid {pid}) → {DECNET_SWARM_MASTER_HOST}.[/]") + except Exception as e: # noqa: BLE001 + log.warning("agent could not auto-spawn forwarder: %s", e) + console.print(f"[yellow]forwarder auto-spawn skipped: {e}[/]") + elif not no_forwarder: + log.info("agent skipping forwarder auto-spawn (DECNET_SWARM_MASTER_HOST unset)") + + log.info("agent command invoked host=%s port=%d dir=%s", host, port, resolved_dir) + console.print(f"[green]Starting DECNET worker agent on {host}:{port} (mTLS)...[/]") + rc = _agent_server.run(host, port, agent_dir=resolved_dir) + if rc != 0: + raise typer.Exit(rc) diff --git a/decnet/cli/api.py b/decnet/cli/api.py new file mode 100644 index 0000000..80f88cd --- /dev/null +++ b/decnet/cli/api.py @@ -0,0 +1,53 
@@ +from __future__ import annotations + +import os +import signal +import subprocess # nosec B404 +import sys + +import typer + +from decnet.env import DECNET_API_HOST, DECNET_API_PORT, DECNET_INGEST_LOG_FILE + +from . import utils as _utils +from .gating import _require_master_mode +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def api( + port: int = typer.Option(DECNET_API_PORT, "--port", help="Port for the backend API"), + host: str = typer.Option(DECNET_API_HOST, "--host", help="Host IP for the backend API"), + log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Path to the DECNET log file to monitor"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + workers: int = typer.Option(1, "--workers", "-w", min=1, help="Number of uvicorn worker processes"), + ) -> None: + """Run the DECNET API and Web Dashboard in standalone mode.""" + _require_master_mode("api") + if daemon: + log.info("API daemonizing host=%s port=%d workers=%d", host, port, workers) + _utils._daemonize() + + log.info("API command invoked host=%s port=%d workers=%d", host, port, workers) + console.print(f"[green]Starting DECNET API on {host}:{port} (workers={workers})...[/]") + _env: dict[str, str] = os.environ.copy() + _env["DECNET_INGEST_LOG_FILE"] = str(log_file) + _cmd = [sys.executable, "-m", "uvicorn", "decnet.web.api:app", + "--host", host, "--port", str(port), "--workers", str(workers)] + try: + proc = subprocess.Popen(_cmd, env=_env, start_new_session=True) # nosec B603 B404 + try: + proc.wait() + except KeyboardInterrupt: + try: + os.killpg(proc.pid, signal.SIGTERM) + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + proc.wait() + except ProcessLookupError: + pass + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start API. 
from __future__ import annotations

from typing import Optional

import typer
from rich.table import Table

from .utils import console, log


_DB_RESET_TABLES: tuple[str, ...] = (
    # Drop order: FK children before their parents.
    # - attacker_behavior FK-references attackers.
    # - decky_shards FK-references swarm_hosts.
    "attacker_behavior",
    "attackers",
    "logs",
    "bounty",
    "state",
    "users",
    "decky_shards",
    "swarm_hosts",
)


async def _db_reset_mysql_async(dsn: str, mode: str, confirm: bool) -> None:
    """Inspect and (when confirmed) wipe a MySQL database.

    Kept separate from the Typer wrapper so tests can invoke it directly
    without spinning up a CLI runner.
    """
    from urllib.parse import urlparse
    from sqlalchemy import text
    from sqlalchemy.ext.asyncio import create_async_engine

    db_name = urlparse(dsn).path.lstrip("/") or "(default)"
    engine = create_async_engine(dsn)
    try:
        # Phase 1: read-only survey of row counts per known table.
        counts: dict[str, int] = {}
        async with engine.connect() as conn:
            for table in _DB_RESET_TABLES:
                try:
                    res = await conn.execute(text(f"SELECT COUNT(*) FROM `{table}`"))  # nosec B608
                    counts[table] = res.scalar() or 0
                except Exception:  # noqa: BLE001 — ProgrammingError for missing table varies by driver
                    counts[table] = -1  # sentinel: table absent (or unreadable)

        summary = Table(title=f"DECNET MySQL reset — database `{db_name}` (mode={mode})")
        summary.add_column("Table", style="cyan")
        summary.add_column("Rows", justify="right")
        for table, n in counts.items():
            summary.add_row(table, "[dim]missing[/]" if n < 0 else f"{n:,}")
        console.print(summary)

        if not confirm:
            console.print(
                "[yellow]Dry-run only. Re-run with [bold]--i-know-what-im-doing[/] "
                "to actually execute.[/]"
            )
            return

        # Phase 2: destructive pass, inside one transaction. FK checks are
        # disabled so TRUNCATE/DROP order cannot trip referential constraints.
        async with engine.begin() as conn:
            await conn.execute(text("SET FOREIGN_KEY_CHECKS = 0"))
            for table in _DB_RESET_TABLES:
                if counts.get(table, -1) < 0:
                    continue  # table didn't exist during the survey — nothing to do
                if mode == "truncate":
                    await conn.execute(text(f"TRUNCATE TABLE `{table}`"))
                    console.print(f"[green]✓ TRUNCATE {table}[/]")
                else:
                    await conn.execute(text(f"DROP TABLE `{table}`"))
                    console.print(f"[green]✓ DROP TABLE {table}[/]")
            await conn.execute(text("SET FOREIGN_KEY_CHECKS = 1"))

        console.print(f"[bold green]Done. Database `{db_name}` reset ({mode}).[/]")
    finally:
        await engine.dispose()


def register(app: typer.Typer) -> None:
    """Register the `decnet db-reset` command on the root Typer app."""

    @app.command(name="db-reset")
    def db_reset(
        i_know: bool = typer.Option(
            False,
            "--i-know-what-im-doing",
            help="Required to actually execute. Without it, the command runs in dry-run mode.",
        ),
        mode: str = typer.Option(
            "truncate",
            "--mode",
            help="truncate (wipe rows, keep schema) | drop-tables (DROP TABLE for each DECNET table)",
        ),
        url: Optional[str] = typer.Option(
            None,
            "--url",
            help="Override DECNET_DB_URL for this invocation (e.g. when cleanup needs admin creds).",
        ),
    ) -> None:
        """Wipe the MySQL database used by the DECNET dashboard.

        Destructive. Runs dry by default — pass --i-know-what-im-doing to commit.
        Only supported against MySQL; refuses to operate on SQLite.
        """
        import asyncio
        import os

        # Guard 1: only the two documented modes are accepted.
        if mode not in ("truncate", "drop-tables"):
            console.print(f"[red]Invalid --mode '{mode}'. Expected: truncate | drop-tables.[/]")
            raise typer.Exit(2)

        # Guard 2: refuse to run against anything but MySQL.
        db_type = os.environ.get("DECNET_DB_TYPE", "sqlite").lower()
        if db_type != "mysql":
            console.print(
                f"[red]db-reset is MySQL-only (DECNET_DB_TYPE='{db_type}'). "
                f"For SQLite, just delete the decnet.db file.[/]"
            )
            raise typer.Exit(2)

        # DSN resolution: --url beats $DECNET_DB_URL beats the assembled default.
        dsn = url or os.environ.get("DECNET_DB_URL")
        if not dsn:
            from decnet.web.db.mysql.database import build_mysql_url
            try:
                dsn = build_mysql_url()
            except ValueError as e:
                console.print(f"[red]{e}[/]")
                raise typer.Exit(2) from e

        log.info("db-reset invoked mode=%s confirm=%s", mode, i_know)
        try:
            asyncio.run(_db_reset_mysql_async(dsn, mode=mode, confirm=i_know))
        except Exception as e:  # noqa: BLE001
            console.print(f"[red]db-reset failed: {e}[/]")
            raise typer.Exit(1) from e
import utils as _utils +from .gating import _require_master_mode +from .utils import console, log + + +def _deploy_swarm(config: "DecnetConfig", *, dry_run: bool, no_cache: bool) -> None: + """Shard deckies round-robin across enrolled workers and POST to swarmctl.""" + base = _utils._swarmctl_base_url(None) + resp = _utils._http_request("GET", base + "/swarm/hosts?host_status=enrolled") + enrolled = resp.json() + resp2 = _utils._http_request("GET", base + "/swarm/hosts?host_status=active") + active = resp2.json() + workers = [*enrolled, *active] + if not workers: + console.print("[red]No enrolled workers — run `decnet swarm enroll ...` first.[/]") + raise typer.Exit(1) + + assigned: list = [] + for idx, d in enumerate(config.deckies): + target = workers[idx % len(workers)] + assigned.append(d.model_copy(update={"host_uuid": target["uuid"]})) + config = config.model_copy(update={"deckies": assigned}) + + body = {"config": config.model_dump(mode="json"), "dry_run": dry_run, "no_cache": no_cache} + console.print(f"[cyan]Dispatching {len(config.deckies)} deckies across {len(workers)} worker(s)...[/]") + resp3 = _utils._http_request("POST", base + "/swarm/deploy", json_body=body, timeout=900.0) + results = resp3.json().get("results", []) + + table = Table(title="SWARM deploy results") + for col in ("worker", "host_uuid", "ok", "detail"): + table.add_column(col) + any_failed = False + for r in results: + ok = bool(r.get("ok")) + if not ok: + any_failed = True + detail = r.get("detail") + if isinstance(detail, dict): + detail = detail.get("status") or "ok" + table.add_row( + str(r.get("host_name") or ""), + str(r.get("host_uuid") or ""), + "[green]yes[/]" if ok else "[red]no[/]", + str(detail)[:80], + ) + console.print(table) + if any_failed: + raise typer.Exit(1) + + +def register(app: typer.Typer) -> None: + @app.command() + def deploy( + mode: str = typer.Option("unihost", "--mode", "-m", help="Deployment mode: unihost | swarm"), + deckies: Optional[int] = 
typer.Option(None, "--deckies", "-n", help="Number of deckies to deploy (required without --config)", min=1), + interface: Optional[str] = typer.Option(None, "--interface", "-i", help="Host NIC (auto-detected if omitted)"), + subnet: Optional[str] = typer.Option(None, "--subnet", help="LAN subnet CIDR (auto-detected if omitted)"), + ip_start: Optional[str] = typer.Option(None, "--ip-start", help="First decky IP (auto if omitted)"), + services: Optional[str] = typer.Option(None, "--services", help="Comma-separated services, e.g. ssh,smb,rdp"), + randomize_services: bool = typer.Option(False, "--randomize-services", help="Assign random services to each decky"), + distro: Optional[str] = typer.Option(None, "--distro", help="Comma-separated distro slugs, e.g. debian,ubuntu22,rocky9"), + randomize_distros: bool = typer.Option(False, "--randomize-distros", help="Assign a random distro to each decky"), + log_file: Optional[str] = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Host path for the collector to write RFC 5424 logs (e.g. /var/log/decnet/decnet.log)"), + archetype_name: Optional[str] = typer.Option(None, "--archetype", "-a", help="Machine archetype slug (e.g. 
linux-server, windows-workstation)"), + mutate_interval: Optional[int] = typer.Option(30, "--mutate-interval", help="Automatically rotate services every N minutes"), + dry_run: bool = typer.Option(False, "--dry-run", help="Generate compose file without starting containers"), + no_cache: bool = typer.Option(False, "--no-cache", help="Force rebuild all images, ignoring Docker layer cache"), + parallel: bool = typer.Option(False, "--parallel", help="Build all images concurrently (enables BuildKit, separates build from up)"), + ipvlan: bool = typer.Option(False, "--ipvlan", help="Use IPvlan L2 instead of MACVLAN (required on WiFi interfaces)"), + config_file: Optional[str] = typer.Option(None, "--config", "-c", help="Path to INI config file"), + api: bool = typer.Option(False, "--api", help="Start the FastAPI backend to ingest and serve logs"), + api_port: int = typer.Option(8000, "--api-port", help="Port for the backend API"), + daemon: bool = typer.Option(False, "--daemon", help="Detach to background as a daemon process"), + ) -> None: + """Deploy deckies to the LAN.""" + import os + import subprocess # nosec B404 + import sys + from pathlib import Path as _Path + + _require_master_mode("deploy") + if daemon: + log.info("deploy daemonizing mode=%s deckies=%s", mode, deckies) + _utils._daemonize() + + log.info("deploy command invoked mode=%s deckies=%s dry_run=%s", mode, deckies, dry_run) + if mode not in ("unihost", "swarm"): + console.print("[red]--mode must be 'unihost' or 'swarm'[/]") + raise typer.Exit(1) + + if config_file: + try: + ini = load_ini(config_file) + except FileNotFoundError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + iface = interface or ini.interface or detect_interface() + subnet_cidr = subnet or ini.subnet + effective_gateway = ini.gateway + if subnet_cidr is None: + subnet_cidr, effective_gateway = detect_subnet(iface) + elif effective_gateway is None: + _, effective_gateway = detect_subnet(iface) + + host_ip = 
get_host_ip(iface) + console.print(f"[dim]Config:[/] {config_file} [dim]Interface:[/] {iface} " + f"[dim]Subnet:[/] {subnet_cidr} [dim]Gateway:[/] {effective_gateway} " + f"[dim]Host IP:[/] {host_ip}") + + if ini.custom_services: + from decnet.custom_service import CustomService + from decnet.services.registry import register_custom_service + for cs in ini.custom_services: + register_custom_service( + CustomService( + name=cs.name, + image=cs.image, + exec_cmd=cs.exec_cmd, + ports=cs.ports, + ) + ) + + effective_log_file = log_file + try: + decky_configs = build_deckies_from_ini( + ini, subnet_cidr, effective_gateway, host_ip, randomize_services, cli_mutate_interval=mutate_interval + ) + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + else: + if deckies is None: + console.print("[red]--deckies is required when --config is not used.[/]") + raise typer.Exit(1) + + services_list = [s.strip() for s in services.split(",")] if services else None + if services_list: + known = set(all_service_names()) + unknown = [s for s in services_list if s not in known] + if unknown: + console.print(f"[red]Unknown service(s): {unknown}. 
Available: {all_service_names()}[/]") + raise typer.Exit(1) + + arch: Archetype | None = None + if archetype_name: + try: + arch = get_archetype(archetype_name) + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + if not services_list and not randomize_services and not arch: + console.print("[red]Specify --services, --archetype, or --randomize-services.[/]") + raise typer.Exit(1) + + iface = interface or detect_interface() + if subnet is None: + subnet_cidr, effective_gateway = detect_subnet(iface) + else: + subnet_cidr = subnet + _, effective_gateway = detect_subnet(iface) + + host_ip = get_host_ip(iface) + console.print(f"[dim]Interface:[/] {iface} [dim]Subnet:[/] {subnet_cidr} " + f"[dim]Gateway:[/] {effective_gateway} [dim]Host IP:[/] {host_ip}") + + distros_list = [d.strip() for d in distro.split(",")] if distro else None + if distros_list: + try: + for slug in distros_list: + get_distro(slug) + except ValueError as e: + console.print(f"[red]{e}[/]") + raise typer.Exit(1) + + ips = allocate_ips(subnet_cidr, effective_gateway, host_ip, deckies, ip_start) + decky_configs = build_deckies( + deckies, ips, services_list, randomize_services, + distros_explicit=distros_list, randomize_distros=randomize_distros, + archetype=arch, mutate_interval=mutate_interval, + ) + effective_log_file = log_file + + if api and not effective_log_file: + effective_log_file = os.path.join(os.getcwd(), "decnet.log") + console.print(f"[cyan]API mode enabled: defaulting log-file to {effective_log_file}[/]") + + config = DecnetConfig( + mode=mode, + interface=iface, + subnet=subnet_cidr, + gateway=effective_gateway, + deckies=decky_configs, + log_file=effective_log_file, + ipvlan=ipvlan, + mutate_interval=mutate_interval, + ) + + log.debug("deploy: config built deckies=%d interface=%s subnet=%s", len(config.deckies), config.interface, config.subnet) + + if mode == "swarm": + _deploy_swarm(config, dry_run=dry_run, no_cache=no_cache) + if dry_run: + 
log.info("deploy: swarm dry-run complete, no workers dispatched") + else: + log.info("deploy: swarm deployment complete deckies=%d", len(config.deckies)) + return + + from decnet.engine import deploy as _deploy + _deploy(config, dry_run=dry_run, no_cache=no_cache, parallel=parallel) + if dry_run: + log.info("deploy: dry-run complete, no containers started") + else: + log.info("deploy: deployment complete deckies=%d", len(config.deckies)) + + if mutate_interval is not None and not dry_run: + console.print(f"[green]Starting DECNET Mutator watcher in the background (interval: {mutate_interval}m)...[/]") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "mutate", "--watch"], + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start mutator watcher.[/]") + + if effective_log_file and not dry_run and not api: + _collector_err = _Path(effective_log_file).with_suffix(".collector.log") + console.print(f"[bold cyan]Starting log collector[/] → {effective_log_file}") + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "collect", "--log-file", str(effective_log_file)], + stdin=subprocess.DEVNULL, + stdout=open(_collector_err, "a"), + stderr=subprocess.STDOUT, + start_new_session=True, + ) + + if api and not dry_run: + console.print(f"[green]Starting DECNET API on port {api_port}...[/]") + _env: dict[str, str] = os.environ.copy() + _env["DECNET_INGEST_LOG_FILE"] = str(effective_log_file or "") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "uvicorn", "decnet.web.api:app", "--host", DECNET_API_HOST, "--port", str(api_port)], + env=_env, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT + ) + console.print(f"[dim]API running at http://{DECNET_API_HOST}:{api_port}[/]") + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start API. 
Ensure 'uvicorn' is installed in the current environment.[/]") + + if effective_log_file and not dry_run: + console.print("[bold cyan]Starting DECNET-PROBER[/] (auto-discovers attackers from log stream)") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "probe", "--daemon", "--log-file", str(effective_log_file)], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start DECNET-PROBER.[/]") + + if effective_log_file and not dry_run: + console.print("[bold cyan]Starting DECNET-PROFILER[/] (builds attacker profiles from log stream)") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "profiler", "--daemon"], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start DECNET-PROFILER.[/]") + + if effective_log_file and not dry_run: + console.print("[bold cyan]Starting DECNET-SNIFFER[/] (passive network capture)") + try: + subprocess.Popen( # nosec B603 + [sys.executable, "-m", "decnet.cli", "sniffer", "--daemon", "--log-file", str(effective_log_file)], + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.STDOUT, + start_new_session=True, + ) + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start DECNET-SNIFFER.[/]") diff --git a/decnet/cli/forwarder.py b/decnet/cli/forwarder.py new file mode 100644 index 0000000..b736fd8 --- /dev/null +++ b/decnet/cli/forwarder.py @@ -0,0 +1,74 @@ +from __future__ import annotations + +import asyncio +import pathlib +import signal +from typing import Optional + +import typer + +from decnet.env import DECNET_INGEST_LOG_FILE + +from . 
import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def forwarder( + master_host: Optional[str] = typer.Option(None, "--master-host", help="Master listener hostname/IP (default: $DECNET_SWARM_MASTER_HOST)"), + master_port: int = typer.Option(6514, "--master-port", help="Master listener TCP port (RFC 5425 default 6514)"), + log_file: Optional[str] = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", help="Local RFC 5424 file to tail and forward"), + agent_dir: Optional[str] = typer.Option(None, "--agent-dir", help="Worker cert bundle dir (default: ~/.decnet/agent)"), + state_db: Optional[str] = typer.Option(None, "--state-db", help="Forwarder offset SQLite path (default: /forwarder.db)"), + poll_interval: float = typer.Option(0.5, "--poll-interval", help="Seconds between log file stat checks"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Run the worker-side syslog-over-TLS forwarder (RFC 5425, mTLS to master:6514).""" + from decnet.env import DECNET_SWARM_MASTER_HOST + from decnet.swarm import pki + from decnet.swarm.log_forwarder import ForwarderConfig, run_forwarder + + resolved_host = master_host or DECNET_SWARM_MASTER_HOST + if not resolved_host: + console.print("[red]--master-host is required (or set DECNET_SWARM_MASTER_HOST).[/]") + raise typer.Exit(2) + + resolved_agent_dir = pathlib.Path(agent_dir) if agent_dir else pki.DEFAULT_AGENT_DIR + if not (resolved_agent_dir / "worker.crt").exists(): + console.print(f"[red]No worker cert bundle at {resolved_agent_dir} — enroll from the master first.[/]") + raise typer.Exit(2) + + if not log_file: + console.print("[red]--log-file is required.[/]") + raise typer.Exit(2) + + cfg = ForwarderConfig( + log_path=pathlib.Path(log_file), + master_host=resolved_host, + master_port=master_port, + agent_dir=resolved_agent_dir, + state_db=pathlib.Path(state_db) if state_db else None, + 
) + + if daemon: + log.info("forwarder daemonizing master=%s:%d log=%s", resolved_host, master_port, log_file) + _utils._daemonize() + + log.info("forwarder command invoked master=%s:%d log=%s", resolved_host, master_port, log_file) + console.print(f"[green]Starting DECNET forwarder → {resolved_host}:{master_port} (mTLS)...[/]") + + async def _main() -> None: + stop = asyncio.Event() + loop = asyncio.get_running_loop() + for sig in (signal.SIGTERM, signal.SIGINT): + try: + loop.add_signal_handler(sig, stop.set) + except (NotImplementedError, RuntimeError): # pragma: no cover + pass + await run_forwarder(cfg, poll_interval=poll_interval, stop_event=stop) + + try: + asyncio.run(_main()) + except KeyboardInterrupt: + pass diff --git a/decnet/cli/gating.py b/decnet/cli/gating.py new file mode 100644 index 0000000..5c15352 --- /dev/null +++ b/decnet/cli/gating.py @@ -0,0 +1,71 @@ +"""Role-based CLI gating. + +MAINTAINERS: when you add a new Typer command (or add_typer group) that is +master-only, register its name in MASTER_ONLY_COMMANDS / MASTER_ONLY_GROUPS +below. The gate is the only thing that: + (a) hides the command from `decnet --help` on worker hosts, and + (b) prevents a misconfigured worker from invoking master-side logic. +Forgetting to register a new command is a role-boundary bug. Grep for +MASTER_ONLY when touching command registration. + +Worker-legitimate commands (NOT in these sets): agent, updater, forwarder, +status, collect, probe, sniffer. Agents run deckies locally and should be +able to inspect them + run the per-host microservices (collector streams +container logs, prober characterizes attackers hitting this host, sniffer +captures traffic). Mutator and Profiler stay master-only: the mutator +orchestrates respawns across the swarm; the profiler rebuilds attacker +profiles against the master DB (no per-host DB exists). 
+""" + +from __future__ import annotations + +import os + +import typer + +from .utils import console + +MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({ + "api", "swarmctl", "deploy", "redeploy", "teardown", + "mutate", "listener", "profiler", + "services", "distros", "correlate", "archetypes", "web", + "db-reset", +}) +MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"}) + + +def _agent_mode_active() -> bool: + """True when the host is configured as an agent AND master commands are + disallowed (the default for agents). Workers overriding this explicitly + set DECNET_DISALLOW_MASTER=false to opt into hybrid use.""" + mode = os.environ.get("DECNET_MODE", "master").lower() + disallow = os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true" + return mode == "agent" and disallow + + +def _require_master_mode(command_name: str) -> None: + """Defence-in-depth: called at the top of every master-only command body. + + The registration-time gate in _gate_commands_by_mode() already hides + these commands from Typer's dispatch table, but this check protects + against direct function imports (e.g. 
from tests or third-party tools) + that would bypass Typer entirely.""" + if _agent_mode_active(): + console.print( + f"[red]`decnet {command_name}` is a master-only command; this host " + f"is configured as an agent (DECNET_MODE=agent).[/]" + ) + raise typer.Exit(1) + + +def _gate_commands_by_mode(_app: typer.Typer) -> None: + if not _agent_mode_active(): + return + _app.registered_commands = [ + c for c in _app.registered_commands + if (c.name or c.callback.__name__) not in MASTER_ONLY_COMMANDS + ] + _app.registered_groups = [ + g for g in _app.registered_groups + if g.name not in MASTER_ONLY_GROUPS + ] diff --git a/decnet/cli/inventory.py b/decnet/cli/inventory.py new file mode 100644 index 0000000..fb75c36 --- /dev/null +++ b/decnet/cli/inventory.py @@ -0,0 +1,52 @@ +from __future__ import annotations + +import typer +from rich.table import Table + +from decnet.archetypes import all_archetypes +from decnet.distros import all_distros +from decnet.services.registry import all_services + +from .utils import console + + +def register(app: typer.Typer) -> None: + @app.command(name="services") + def list_services() -> None: + """List all registered honeypot service plugins.""" + svcs = all_services() + table = Table(title="Available Services", show_lines=True) + table.add_column("Name", style="bold cyan") + table.add_column("Ports") + table.add_column("Image") + for name, svc in sorted(svcs.items()): + table.add_row(name, ", ".join(str(p) for p in svc.ports), svc.default_image) + console.print(table) + + @app.command(name="distros") + def list_distros() -> None: + """List all available OS distro profiles for deckies.""" + table = Table(title="Available Distro Profiles", show_lines=True) + table.add_column("Slug", style="bold cyan") + table.add_column("Display Name") + table.add_column("Docker Image", style="dim") + for slug, profile in sorted(all_distros().items()): + table.add_row(slug, profile.display_name, profile.image) + console.print(table) + + 
from __future__ import annotations

import typer
from rich.table import Table

from decnet.archetypes import all_archetypes
from decnet.distros import all_distros
from decnet.services.registry import all_services

from .utils import console


def register(app: typer.Typer) -> None:
    """Register the read-only inventory commands (services / distros / archetypes)."""

    @app.command(name="services")
    def list_services() -> None:
        """List all registered honeypot service plugins."""
        table = Table(title="Available Services", show_lines=True)
        for heading, style in (("Name", "bold cyan"), ("Ports", None), ("Image", None)):
            table.add_column(heading, style=style)
        for svc_name, svc in sorted(all_services().items()):
            port_list = ", ".join(str(p) for p in svc.ports)
            table.add_row(svc_name, port_list, svc.default_image)
        console.print(table)

    @app.command(name="distros")
    def list_distros() -> None:
        """List all available OS distro profiles for deckies."""
        table = Table(title="Available Distro Profiles", show_lines=True)
        for heading, style in (("Slug", "bold cyan"), ("Display Name", None), ("Docker Image", "dim")):
            table.add_column(heading, style=style)
        for slug, profile in sorted(all_distros().items()):
            table.add_row(slug, profile.display_name, profile.image)
        console.print(table)

    @app.command(name="archetypes")
    def list_archetypes() -> None:
        """List all machine archetype profiles."""
        table = Table(title="Machine Archetypes", show_lines=True)
        for heading, style in (
            ("Slug", "bold cyan"),
            ("Display Name", None),
            ("Default Services", "green"),
            ("Description", "dim"),
        ):
            table.add_column(heading, style=style)
        for slug, arch in sorted(all_archetypes().items()):
            svc_list = ", ".join(arch.services)
            table.add_row(slug, arch.display_name, svc_list, arch.description)
        console.print(table)
"[green]relaunched[/]") + relaunched += 1 + except (FileNotFoundError, subprocess.SubprocessError) as exc: + table.add_row(name, "[red]DOWN[/]", "—", f"[red]failed: {exc}[/]") + + console.print(table) + if relaunched: + console.print(f"[green]{relaunched} service(s) relaunched.[/]") + else: + console.print("[green]All services running.[/]") + + @app.command() + def status() -> None: + """Show running deckies and their status.""" + log.info("status command invoked") + from decnet.engine import status as _status + _status() + + registry = _utils._service_registry(str(DECNET_INGEST_LOG_FILE)) + if _agent_mode_active(): + registry = [r for r in registry if r[0] not in {"Mutator", "Profiler", "API"}] + svc_table = Table(title="DECNET Services", show_lines=True) + svc_table.add_column("Service", style="bold cyan") + svc_table.add_column("Status") + svc_table.add_column("PID", style="dim") + + for name, match_fn, _launch_args in registry: + pid = _utils._is_running(match_fn) + if pid is not None: + svc_table.add_row(name, "[green]UP[/]", str(pid)) + else: + svc_table.add_row(name, "[red]DOWN[/]", "—") + + console.print(svc_table) + + @app.command() + def teardown( + all_: bool = typer.Option(False, "--all", help="Tear down all deckies and remove network"), + id_: Optional[str] = typer.Option(None, "--id", help="Tear down a specific decky by name"), + ) -> None: + """Stop and remove deckies.""" + _require_master_mode("teardown") + if not all_ and not id_: + console.print("[red]Specify --all or --id .[/]") + raise typer.Exit(1) + + log.info("teardown command invoked all=%s id=%s", all_, id_) + from decnet.engine import teardown as _teardown + _teardown(decky_id=id_) + log.info("teardown complete all=%s id=%s", all_, id_) + + if all_: + _utils._kill_all_services() diff --git a/decnet/cli/listener.py b/decnet/cli/listener.py new file mode 100644 index 0000000..d95a362 --- /dev/null +++ b/decnet/cli/listener.py @@ -0,0 +1,57 @@ +from __future__ import annotations + +import 
asyncio +import pathlib +import signal +from typing import Optional + +import typer + +from . import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def listener( + bind_host: str = typer.Option("0.0.0.0", "--host", help="Bind address for the master syslog-TLS listener"), # nosec B104 + bind_port: int = typer.Option(6514, "--port", help="Listener TCP port (RFC 5425 default 6514)"), + log_path: Optional[str] = typer.Option(None, "--log-path", help="RFC 5424 forensic sink (default: ./master.log)"), + json_path: Optional[str] = typer.Option(None, "--json-path", help="Parsed-JSON ingest sink (default: ./master.json)"), + ca_dir: Optional[str] = typer.Option(None, "--ca-dir", help="DECNET CA dir (default: ~/.decnet/ca)"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Run the master-side syslog-over-TLS listener (RFC 5425, mTLS).""" + from decnet.swarm import pki + from decnet.swarm.log_listener import ListenerConfig, run_listener + + resolved_ca_dir = pathlib.Path(ca_dir) if ca_dir else pki.DEFAULT_CA_DIR + resolved_log = pathlib.Path(log_path) if log_path else pathlib.Path("master.log") + resolved_json = pathlib.Path(json_path) if json_path else pathlib.Path("master.json") + + cfg = ListenerConfig( + log_path=resolved_log, json_path=resolved_json, + bind_host=bind_host, bind_port=bind_port, ca_dir=resolved_ca_dir, + ) + + if daemon: + log.info("listener daemonizing host=%s port=%d", bind_host, bind_port) + _utils._daemonize() + + log.info("listener command invoked host=%s port=%d", bind_host, bind_port) + console.print(f"[green]Starting DECNET log listener on {bind_host}:{bind_port} (mTLS)...[/]") + + async def _main() -> None: + stop = asyncio.Event() + loop = asyncio.get_running_loop() + for sig in (signal.SIGTERM, signal.SIGINT): + try: + loop.add_signal_handler(sig, stop.set) + except (NotImplementedError, RuntimeError): # 
from __future__ import annotations

import typer

from . import utils as _utils
from .utils import console, log


def register(app: typer.Typer) -> None:
    """Register the `decnet profiler` command on the root Typer app."""

    @app.command(name="profiler")
    def profiler_cmd(
        interval: int = typer.Option(30, "--interval", "-i", help="Seconds between profile rebuild cycles"),
        daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
    ) -> None:
        """Run the attacker profiler as a standalone microservice."""
        import asyncio
        from decnet.profiler import attacker_profile_worker
        from decnet.web.dependencies import repo

        if daemon:
            log.info("profiler daemonizing interval=%d", interval)
            _utils._daemonize()

        log.info("profiler starting interval=%d", interval)
        console.print(f"[bold cyan]Profiler starting[/] (interval: {interval}s)")

        async def _serve() -> None:
            # The repository must be initialized before the worker touches the DB.
            await repo.initialize()
            await attacker_profile_worker(repo, interval=interval)

        try:
            asyncio.run(_serve())
        except KeyboardInterrupt:
            console.print("\n[yellow]Profiler stopped.[/]")
import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command(name="sniffer") + def sniffer_cmd( + log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", "-f", help="Path to write captured syslog + JSON records"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Run the network sniffer as a standalone microservice.""" + import asyncio + from decnet.sniffer import sniffer_worker + + if daemon: + log.info("sniffer daemonizing log_file=%s", log_file) + _utils._daemonize() + + log.info("sniffer starting log_file=%s", log_file) + console.print(f"[bold cyan]Sniffer starting[/] → {log_file}") + + try: + asyncio.run(sniffer_worker(log_file)) + except KeyboardInterrupt: + console.print("\n[yellow]Sniffer stopped.[/]") diff --git a/decnet/cli/swarm.py b/decnet/cli/swarm.py new file mode 100644 index 0000000..8a0157e --- /dev/null +++ b/decnet/cli/swarm.py @@ -0,0 +1,346 @@ +"""`decnet swarm ...` — master-side operator commands (HTTP to local swarmctl).""" + +from __future__ import annotations + +from typing import Optional + +import typer +from rich.table import Table + +from . import utils as _utils +from .utils import console + + +def register(app: typer.Typer) -> None: + swarm_app = typer.Typer( + name="swarm", + help="Manage swarm workers (enroll, list, decommission). 
Requires `decnet swarmctl` running.", + no_args_is_help=True, + ) + app.add_typer(swarm_app, name="swarm") + + @swarm_app.command("enroll") + def swarm_enroll( + name: str = typer.Option(..., "--name", help="Short hostname for the worker (also the cert CN)"), + address: str = typer.Option(..., "--address", help="IP or DNS the master uses to reach the worker"), + agent_port: int = typer.Option(8765, "--agent-port", help="Worker agent TCP port"), + sans: Optional[str] = typer.Option(None, "--sans", help="Comma-separated extra SANs for the worker cert"), + notes: Optional[str] = typer.Option(None, "--notes", help="Free-form operator notes"), + out_dir: Optional[str] = typer.Option(None, "--out-dir", help="Write the bundle (ca.crt/worker.crt/worker.key) to this dir for scp"), + updater: bool = typer.Option(False, "--updater", help="Also issue an updater-identity cert (CN=updater@) for the remote self-updater"), + url: Optional[str] = typer.Option(None, "--url", help="Override swarm controller URL (default: 127.0.0.1:8770)"), + ) -> None: + """Issue a mTLS bundle for a new worker and register it in the swarm.""" + import pathlib as _pathlib + + body: dict = {"name": name, "address": address, "agent_port": agent_port} + if sans: + body["sans"] = [s.strip() for s in sans.split(",") if s.strip()] + if notes: + body["notes"] = notes + if updater: + body["issue_updater_bundle"] = True + + resp = _utils._http_request("POST", _utils._swarmctl_base_url(url) + "/swarm/enroll", json_body=body) + data = resp.json() + + console.print(f"[green]Enrolled worker:[/] {data['name']} " + f"[dim]uuid=[/]{data['host_uuid']} " + f"[dim]fingerprint=[/]{data['fingerprint']}") + if data.get("updater"): + console.print(f"[green] + updater identity[/] " + f"[dim]fingerprint=[/]{data['updater']['fingerprint']}") + + if out_dir: + target = _pathlib.Path(out_dir).expanduser() + target.mkdir(parents=True, exist_ok=True) + (target / "ca.crt").write_text(data["ca_cert_pem"]) + (target / 
"worker.crt").write_text(data["worker_cert_pem"]) + (target / "worker.key").write_text(data["worker_key_pem"]) + for leaf in ("worker.key",): + try: + (target / leaf).chmod(0o600) + except OSError: + pass + console.print(f"[cyan]Agent bundle written to[/] {target}") + + if data.get("updater"): + upd_target = target.parent / f"{target.name}-updater" + upd_target.mkdir(parents=True, exist_ok=True) + (upd_target / "ca.crt").write_text(data["ca_cert_pem"]) + (upd_target / "updater.crt").write_text(data["updater"]["updater_cert_pem"]) + (upd_target / "updater.key").write_text(data["updater"]["updater_key_pem"]) + try: + (upd_target / "updater.key").chmod(0o600) + except OSError: + pass + console.print(f"[cyan]Updater bundle written to[/] {upd_target}") + console.print("[dim]Ship the agent dir to ~/.decnet/agent/ and the updater dir to ~/.decnet/updater/ on the worker.[/]") + else: + console.print("[dim]Ship this directory to the worker at ~/.decnet/agent/ (or wherever `decnet agent --agent-dir` points).[/]") + else: + console.print("[yellow]No --out-dir given — bundle PEMs are in the JSON response; persist them before leaving this shell.[/]") + + @swarm_app.command("list") + def swarm_list( + host_status: Optional[str] = typer.Option(None, "--status", help="Filter by status (enrolled|active|unreachable|decommissioned)"), + url: Optional[str] = typer.Option(None, "--url", help="Override swarm controller URL"), + ) -> None: + """List enrolled workers.""" + q = f"?host_status={host_status}" if host_status else "" + resp = _utils._http_request("GET", _utils._swarmctl_base_url(url) + "/swarm/hosts" + q) + rows = resp.json() + if not rows: + console.print("[dim]No workers enrolled.[/]") + return + table = Table(title="DECNET swarm workers") + for col in ("name", "address", "port", "status", "last heartbeat", "enrolled"): + table.add_column(col) + for r in rows: + table.add_row( + r.get("name") or "", + r.get("address") or "", + str(r.get("agent_port") or ""), + 
r.get("status") or "", + str(r.get("last_heartbeat") or "—"), + str(r.get("enrolled_at") or "—"), + ) + console.print(table) + + @swarm_app.command("check") + def swarm_check( + url: Optional[str] = typer.Option(None, "--url", help="Override swarm controller URL"), + json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of a table"), + ) -> None: + """Actively probe every enrolled worker and refresh status + last_heartbeat.""" + resp = _utils._http_request("POST", _utils._swarmctl_base_url(url) + "/swarm/check", timeout=60.0) + payload = resp.json() + results = payload.get("results", []) + + if json_out: + console.print_json(data=payload) + return + + if not results: + console.print("[dim]No workers enrolled.[/]") + return + + table = Table(title="DECNET swarm check") + for col in ("name", "address", "reachable", "detail"): + table.add_column(col) + for r in results: + reachable = r.get("reachable") + mark = "[green]yes[/]" if reachable else "[red]no[/]" + detail = r.get("detail") + detail_str = "—" + if isinstance(detail, dict): + detail_str = detail.get("status") or ", ".join(f"{k}={v}" for k, v in detail.items()) + elif detail is not None: + detail_str = str(detail) + table.add_row( + r.get("name") or "", + r.get("address") or "", + mark, + detail_str, + ) + console.print(table) + + @swarm_app.command("update") + def swarm_update( + host: Optional[str] = typer.Option(None, "--host", help="Target worker (name or UUID). Omit with --all."), + all_hosts: bool = typer.Option(False, "--all", help="Push to every enrolled worker."), + include_self: bool = typer.Option(False, "--include-self", help="Also push to each updater's /update-self after a successful agent update."), + root: Optional[str] = typer.Option(None, "--root", help="Source tree to tar (default: CWD)."), + exclude: list[str] = typer.Option([], "--exclude", help="Additional exclude glob. 
Repeatable."), + updater_port: int = typer.Option(8766, "--updater-port", help="Port the workers' updater listens on."), + dry_run: bool = typer.Option(False, "--dry-run", help="Build the tarball and print stats; no network."), + url: Optional[str] = typer.Option(None, "--url", help="Override swarm controller URL."), + ) -> None: + """Push the current working tree to workers' self-updaters (with auto-rollback on failure).""" + import asyncio + import pathlib as _pathlib + + from decnet.swarm.tar_tree import tar_working_tree, detect_git_sha + from decnet.swarm.updater_client import UpdaterClient + + if not (host or all_hosts): + console.print("[red]Supply --host or --all.[/]") + raise typer.Exit(2) + if host and all_hosts: + console.print("[red]--host and --all are mutually exclusive.[/]") + raise typer.Exit(2) + + base = _utils._swarmctl_base_url(url) + resp = _utils._http_request("GET", base + "/swarm/hosts") + rows = resp.json() + if host: + targets = [r for r in rows if r.get("name") == host or r.get("uuid") == host] + if not targets: + console.print(f"[red]No enrolled worker matching '{host}'.[/]") + raise typer.Exit(1) + else: + targets = [r for r in rows if r.get("status") != "decommissioned"] + if not targets: + console.print("[dim]No targets.[/]") + return + + tree_root = _pathlib.Path(root) if root else _pathlib.Path.cwd() + sha = detect_git_sha(tree_root) + console.print(f"[dim]Tarring[/] {tree_root} [dim]sha={sha or '(not a git repo)'}[/]") + tarball = tar_working_tree(tree_root, extra_excludes=exclude) + console.print(f"[dim]Tarball size:[/] {len(tarball):,} bytes") + + if dry_run: + console.print("[yellow]--dry-run: not pushing.[/]") + for t in targets: + console.print(f" would push to [cyan]{t.get('name')}[/] at {t.get('address')}:{updater_port}") + return + + async def _push_one(h: dict) -> dict: + name = h.get("name") or h.get("uuid") + out: dict = {"name": name, "address": h.get("address"), "agent": None, "self": None} + try: + async with 
UpdaterClient(h, updater_port=updater_port) as u: + r = await u.update(tarball, sha=sha) + out["agent"] = {"status": r.status_code, "body": r.json() if r.content else {}} + if r.status_code == 200 and include_self: + rs = await u.update_self(tarball, sha=sha) + out["self"] = {"status": rs.status_code, "body": rs.json() if rs.content else {}} + except Exception as exc: # noqa: BLE001 + out["error"] = f"{type(exc).__name__}: {exc}" + return out + + async def _push_all() -> list[dict]: + return await asyncio.gather(*(_push_one(t) for t in targets)) + + results = asyncio.run(_push_all()) + + table = Table(title="DECNET swarm update") + for col in ("host", "address", "agent", "self", "detail"): + table.add_column(col) + any_failure = False + for r in results: + agent = r.get("agent") or {} + selff = r.get("self") or {} + err = r.get("error") + if err: + any_failure = True + table.add_row(r["name"], r.get("address") or "", "[red]error[/]", "—", err) + continue + a_status = agent.get("status") + if a_status == 200: + agent_cell = "[green]updated[/]" + elif a_status == 409: + agent_cell = "[yellow]rolled-back[/]" + any_failure = True + else: + agent_cell = f"[red]{a_status}[/]" + any_failure = True + if not include_self: + self_cell = "—" + elif selff.get("status") == 200 or selff.get("status") is None: + self_cell = "[green]ok[/]" if selff else "[dim]skipped[/]" + else: + self_cell = f"[red]{selff.get('status')}[/]" + detail = "" + body = agent.get("body") or {} + if isinstance(body, dict): + detail = body.get("release", {}).get("sha") or body.get("detail", {}).get("error") or "" + table.add_row(r["name"], r.get("address") or "", agent_cell, self_cell, str(detail)[:80]) + console.print(table) + + if any_failure: + raise typer.Exit(1) + + @swarm_app.command("deckies") + def swarm_deckies( + host: Optional[str] = typer.Option(None, "--host", help="Filter by worker name or UUID"), + state: Optional[str] = typer.Option(None, "--state", help="Filter by shard state 
(pending|running|failed|torn_down)"), + url: Optional[str] = typer.Option(None, "--url", help="Override swarm controller URL"), + json_out: bool = typer.Option(False, "--json", help="Emit JSON instead of a table"), + ) -> None: + """List deployed deckies across the swarm with their owning worker host.""" + base = _utils._swarmctl_base_url(url) + + host_uuid: Optional[str] = None + if host: + resp = _utils._http_request("GET", base + "/swarm/hosts") + rows = resp.json() + match = next((r for r in rows if r.get("uuid") == host or r.get("name") == host), None) + if match is None: + console.print(f"[red]No enrolled worker matching '{host}'.[/]") + raise typer.Exit(1) + host_uuid = match["uuid"] + + query = [] + if host_uuid: + query.append(f"host_uuid={host_uuid}") + if state: + query.append(f"state={state}") + path = "/swarm/deckies" + ("?" + "&".join(query) if query else "") + + resp = _utils._http_request("GET", base + path) + rows = resp.json() + + if json_out: + console.print_json(data=rows) + return + + if not rows: + console.print("[dim]No deckies deployed.[/]") + return + + table = Table(title="DECNET swarm deckies") + for col in ("decky", "host", "address", "state", "services"): + table.add_column(col) + for r in rows: + services = ",".join(r.get("services") or []) or "—" + state_val = r.get("state") or "pending" + colored = { + "running": f"[green]{state_val}[/]", + "failed": f"[red]{state_val}[/]", + "pending": f"[yellow]{state_val}[/]", + "torn_down": f"[dim]{state_val}[/]", + }.get(state_val, state_val) + table.add_row( + r.get("decky_name") or "", + r.get("host_name") or "", + r.get("host_address") or "", + colored, + services, + ) + console.print(table) + + @swarm_app.command("decommission") + def swarm_decommission( + name: Optional[str] = typer.Option(None, "--name", help="Worker hostname"), + uuid: Optional[str] = typer.Option(None, "--uuid", help="Worker UUID (skip lookup)"), + url: Optional[str] = typer.Option(None, "--url", help="Override swarm 
controller URL"), + yes: bool = typer.Option(False, "--yes", "-y", help="Skip interactive confirmation"), + ) -> None: + """Remove a worker from the swarm (cascades decky shard rows).""" + if not (name or uuid): + console.print("[red]Supply --name or --uuid.[/]") + raise typer.Exit(2) + + base = _utils._swarmctl_base_url(url) + target_uuid = uuid + target_name = name + if target_uuid is None: + resp = _utils._http_request("GET", base + "/swarm/hosts") + rows = resp.json() + match = next((r for r in rows if r.get("name") == name), None) + if match is None: + console.print(f"[red]No enrolled worker named '{name}'.[/]") + raise typer.Exit(1) + target_uuid = match["uuid"] + target_name = match.get("name") or target_name + + if not yes: + confirm = typer.confirm(f"Decommission worker {target_name!r} ({target_uuid})?", default=False) + if not confirm: + console.print("[dim]Aborted.[/]") + raise typer.Exit(0) + + _utils._http_request("DELETE", f"{base}/swarm/hosts/{target_uuid}") + console.print(f"[green]Decommissioned {target_name or target_uuid}.[/]") diff --git a/decnet/cli/swarmctl.py b/decnet/cli/swarmctl.py new file mode 100644 index 0000000..687823c --- /dev/null +++ b/decnet/cli/swarmctl.py @@ -0,0 +1,104 @@ +from __future__ import annotations + +import os +import signal +import subprocess # nosec B404 +import sys +from typing import Optional + +import typer + +from . 
import utils as _utils +from .gating import _require_master_mode +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def swarmctl( + port: int = typer.Option(8770, "--port", help="Port for the swarm controller"), + host: str = typer.Option("127.0.0.1", "--host", help="Bind address for the swarm controller"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + no_listener: bool = typer.Option(False, "--no-listener", help="Do not auto-spawn the syslog-TLS listener alongside swarmctl"), + tls: bool = typer.Option(False, "--tls", help="Serve over HTTPS with mTLS (required for cross-host worker heartbeats)"), + cert: Optional[str] = typer.Option(None, "--cert", help="BYOC: path to TLS server cert (PEM). Auto-issues from the DECNET CA if omitted."), + key: Optional[str] = typer.Option(None, "--key", help="BYOC: path to TLS server private key (PEM)."), + client_ca: Optional[str] = typer.Option(None, "--client-ca", help="CA bundle used to verify worker client certs. Defaults to the DECNET CA."), + ) -> None: + """Run the DECNET SWARM controller (master-side, separate process from `decnet api`). + + By default, `decnet swarmctl` auto-spawns `decnet listener` as a fully- + detached sibling process so the master starts accepting forwarder + connections on 6514 without a second manual invocation. The listener + survives swarmctl restarts and crashes — if it dies on its own, + restart it manually with `decnet listener --daemon …`. Pass + --no-listener to skip. + + Pass ``--tls`` to serve over HTTPS with mutual-TLS enforcement. By + default the server cert is auto-issued from the DECNET CA under + ``~/.decnet/swarmctl/`` so enrolled workers (which already ship that + CA's ``ca.crt``) trust it out of the box. BYOC via ``--cert``/``--key`` + if you need a publicly-trusted or externally-managed cert. 
+ """ + _require_master_mode("swarmctl") + if daemon: + log.info("swarmctl daemonizing host=%s port=%d", host, port) + _utils._daemonize() + + if not no_listener: + listener_host = os.environ.get("DECNET_LISTENER_HOST", "0.0.0.0") # nosec B104 + listener_port = int(os.environ.get("DECNET_SWARM_SYSLOG_PORT", "6514")) + lst_argv = [ + sys.executable, "-m", "decnet", "listener", + "--host", listener_host, + "--port", str(listener_port), + "--daemon", + ] + try: + pid = _utils._spawn_detached(lst_argv, _utils._pid_dir() / "listener.pid") + log.info("swarmctl auto-spawned listener pid=%d bind=%s:%d", + pid, listener_host, listener_port) + console.print(f"[dim]Auto-spawned listener (pid {pid}) on {listener_host}:{listener_port}.[/]") + except Exception as e: # noqa: BLE001 + log.warning("swarmctl could not auto-spawn listener: %s", e) + console.print(f"[yellow]listener auto-spawn skipped: {e}[/]") + + log.info("swarmctl command invoked host=%s port=%d tls=%s", host, port, tls) + scheme = "https" if tls else "http" + console.print(f"[green]Starting DECNET SWARM controller on {scheme}://{host}:{port}...[/]") + _cmd = [sys.executable, "-m", "uvicorn", "decnet.web.swarm_api:app", + "--host", host, "--port", str(port)] + if tls: + from decnet.swarm import pki as _pki + if cert and key: + cert_path, key_path = cert, key + elif cert or key: + console.print("[red]--cert and --key must be provided together.[/]") + raise typer.Exit(code=2) + else: + auto_cert, auto_key, _auto_ca = _pki.ensure_swarmctl_cert(host) + cert_path, key_path = str(auto_cert), str(auto_key) + console.print(f"[dim]Auto-issued swarmctl server cert → {cert_path}[/]") + ca_path = client_ca or str(_pki.DEFAULT_CA_DIR / "ca.crt") + _cmd += [ + "--ssl-keyfile", key_path, + "--ssl-certfile", cert_path, + "--ssl-ca-certs", ca_path, + "--ssl-cert-reqs", "2", + ] + try: + proc = subprocess.Popen(_cmd, start_new_session=True) # nosec B603 B404 + try: + proc.wait() + except KeyboardInterrupt: + try: + 
os.killpg(proc.pid, signal.SIGTERM) + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + proc.wait() + except ProcessLookupError: + pass + except (FileNotFoundError, subprocess.SubprocessError): + console.print("[red]Failed to start swarmctl. Ensure 'uvicorn' is installed in the current environment.[/]") diff --git a/decnet/cli/updater.py b/decnet/cli/updater.py new file mode 100644 index 0000000..20c9ac4 --- /dev/null +++ b/decnet/cli/updater.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import pathlib as _pathlib +from typing import Optional + +import typer + +from . import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def updater( + port: int = typer.Option(8766, "--port", help="Port for the self-updater daemon"), + host: str = typer.Option("0.0.0.0", "--host", help="Bind address for the updater"), # nosec B104 + updater_dir: Optional[str] = typer.Option(None, "--updater-dir", help="Updater cert bundle dir (default: ~/.decnet/updater)"), + install_dir: Optional[str] = typer.Option(None, "--install-dir", help="Release install root (default: /opt/decnet)"), + agent_dir: Optional[str] = typer.Option(None, "--agent-dir", help="Worker agent cert bundle (for local /health probes; default: ~/.decnet/agent)"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Run the DECNET self-updater (requires a bundle in ~/.decnet/updater/).""" + from decnet.swarm import pki as _pki + from decnet.updater import server as _upd_server + + resolved_updater = _pathlib.Path(updater_dir) if updater_dir else _upd_server.DEFAULT_UPDATER_DIR + resolved_install = _pathlib.Path(install_dir) if install_dir else _pathlib.Path("/opt/decnet") + resolved_agent = _pathlib.Path(agent_dir) if agent_dir else _pki.DEFAULT_AGENT_DIR + + if daemon: + log.info("updater daemonizing host=%s port=%d", 
host, port) + _utils._daemonize() + + log.info( + "updater command invoked host=%s port=%d updater_dir=%s install_dir=%s", + host, port, resolved_updater, resolved_install, + ) + console.print(f"[green]Starting DECNET self-updater on {host}:{port} (mTLS)...[/]") + rc = _upd_server.run( + host, port, + updater_dir=resolved_updater, + install_dir=resolved_install, + agent_dir=resolved_agent, + ) + if rc != 0: + raise typer.Exit(rc) diff --git a/decnet/cli/utils.py b/decnet/cli/utils.py new file mode 100644 index 0000000..1954173 --- /dev/null +++ b/decnet/cli/utils.py @@ -0,0 +1,177 @@ +"""Shared CLI helpers: console, logger, process management, swarm HTTP client. + +Submodules reference these as ``from . import utils`` then ``utils.foo(...)`` +so tests can patch ``decnet.cli.utils.`` and have every caller see it. +""" + +from __future__ import annotations + +import os +import signal +import subprocess # nosec B404 +import sys +from pathlib import Path +from typing import Optional + +import typer +from rich.console import Console + +from decnet.logging import get_logger +from decnet.env import DECNET_API_HOST, DECNET_API_PORT, DECNET_INGEST_LOG_FILE + +log = get_logger("cli") +console = Console() + + +def _daemonize() -> None: + """Fork the current process into a background daemon (Unix double-fork).""" + if os.fork() > 0: + raise SystemExit(0) + os.setsid() + if os.fork() > 0: + raise SystemExit(0) + sys.stdout = open(os.devnull, "w") # noqa: SIM115 + sys.stderr = open(os.devnull, "w") # noqa: SIM115 + sys.stdin = open(os.devnull, "r") # noqa: SIM115 + + +def _pid_dir() -> Path: + """Return the writable PID directory. + + /opt/decnet when it exists and is writable (production), else + ~/.decnet (dev). 
The directory is created if needed.""" + candidates = [Path("/opt/decnet"), Path.home() / ".decnet"] + for path in candidates: + try: + path.mkdir(parents=True, exist_ok=True) + if os.access(path, os.W_OK): + return path + except (PermissionError, OSError): + continue + return Path("/tmp") # nosec B108 + + +def _spawn_detached(argv: list[str], pid_file: Path) -> int: + """Spawn a DECNET subcommand as a fully-independent sibling process. + + The parent does NOT wait() on this child. start_new_session=True puts + the child in its own session so SIGHUP on parent exit doesn't kill it; + stdin/stdout/stderr go to /dev/null so the launching shell can close + without EIO on the child. close_fds=True prevents inherited sockets + from pinning ports we're trying to rebind. + + This is deliberately NOT a supervisor — we fire-and-forget. If the + child dies, the operator restarts it manually via its own subcommand. + """ + if pid_file.exists(): + try: + existing = int(pid_file.read_text().strip()) + os.kill(existing, 0) + return existing + except (ValueError, ProcessLookupError, PermissionError, OSError): + pass # stale pid_file — fall through and spawn + + with open(os.devnull, "rb") as dn_in, open(os.devnull, "ab") as dn_out: + proc = subprocess.Popen( # nosec B603 + argv, + stdin=dn_in, stdout=dn_out, stderr=dn_out, + start_new_session=True, close_fds=True, + ) + pid_file.parent.mkdir(parents=True, exist_ok=True) + pid_file.write_text(f"{proc.pid}\n") + return proc.pid + + +def _is_running(match_fn) -> int | None: + """Return PID of a running DECNET process matching ``match_fn(cmdline)``, or None.""" + import psutil + + for proc in psutil.process_iter(["pid", "cmdline"]): + try: + cmd = proc.info["cmdline"] + if cmd and match_fn(cmd): + return proc.info["pid"] + except (psutil.NoSuchProcess, psutil.AccessDenied): + continue + return None + + +def _service_registry(log_file: str) -> list[tuple[str, callable, list[str]]]: + """Return the microservice registry for health-check 
and relaunch. + + On agents these run as systemd units invoking /usr/local/bin/decnet, + which doesn't include "decnet.cli" in its cmdline. On master dev boxes + they're launched via `python -m decnet.cli`. Match either form — cmd + is a list of argv tokens, so substring-check the joined string. + """ + _py = sys.executable + + def _matches(sub: str, extras: tuple[str, ...] = ()): + def _check(cmd) -> bool: + joined = " ".join(cmd) if not isinstance(cmd, str) else cmd + if "decnet" not in joined: + return False + if sub not in joined: + return False + return all(e in joined for e in extras) + return _check + + return [ + ("Collector", _matches("collect"), + [_py, "-m", "decnet.cli", "collect", "--daemon", "--log-file", log_file]), + ("Mutator", _matches("mutate", ("--watch",)), + [_py, "-m", "decnet.cli", "mutate", "--daemon", "--watch"]), + ("Prober", _matches("probe"), + [_py, "-m", "decnet.cli", "probe", "--daemon", "--log-file", log_file]), + ("Profiler", _matches("profiler"), + [_py, "-m", "decnet.cli", "profiler", "--daemon"]), + ("Sniffer", _matches("sniffer"), + [_py, "-m", "decnet.cli", "sniffer", "--daemon", "--log-file", log_file]), + ("API", + lambda cmd: "uvicorn" in cmd and "decnet.web.api:app" in cmd, + [_py, "-m", "uvicorn", "decnet.web.api:app", + "--host", DECNET_API_HOST, "--port", str(DECNET_API_PORT)]), + ] + + +def _kill_all_services() -> None: + """Find and kill all running DECNET microservice processes.""" + registry = _service_registry(str(DECNET_INGEST_LOG_FILE)) + killed = 0 + for name, match_fn, _launch_args in registry: + pid = _is_running(match_fn) + if pid is not None: + console.print(f"[yellow]Stopping {name} (PID {pid})...[/]") + os.kill(pid, signal.SIGTERM) + killed += 1 + + if killed: + console.print(f"[green]{killed} background process(es) stopped.[/]") + else: + console.print("[dim]No DECNET services were running.[/]") + + +_DEFAULT_SWARMCTL_URL = "http://127.0.0.1:8770" + + +def _swarmctl_base_url(url: Optional[str]) -> str: + 
return url or os.environ.get("DECNET_SWARMCTL_URL", _DEFAULT_SWARMCTL_URL) + + +def _http_request(method: str, url: str, *, json_body: Optional[dict] = None, timeout: float = 30.0): + """Tiny sync wrapper around httpx; avoids leaking async into the CLI.""" + import httpx + try: + resp = httpx.request(method, url, json=json_body, timeout=timeout) + except httpx.HTTPError as exc: + console.print(f"[red]Could not reach swarm controller at {url}: {exc}[/]") + console.print("[dim]Is `decnet swarmctl` running?[/]") + raise typer.Exit(2) + if resp.status_code >= 400: + try: + detail = resp.json().get("detail", resp.text) + except Exception: # nosec B110 + detail = resp.text + console.print(f"[red]{method} {url} failed: {resp.status_code} — {detail}[/]") + raise typer.Exit(1) + return resp diff --git a/decnet/cli/web.py b/decnet/cli/web.py new file mode 100644 index 0000000..a7abb7b --- /dev/null +++ b/decnet/cli/web.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import typer + +from decnet.env import DECNET_API_PORT, DECNET_WEB_HOST, DECNET_WEB_PORT + +from . import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command(name="web") + def serve_web( + web_port: int = typer.Option(DECNET_WEB_PORT, "--web-port", help="Port to serve the DECNET Web Dashboard"), + host: str = typer.Option(DECNET_WEB_HOST, "--host", help="Host IP to serve the Web Dashboard"), + api_port: int = typer.Option(DECNET_API_PORT, "--api-port", help="Port the DECNET API is listening on"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Serve the DECNET Web Dashboard frontend. + + Proxies /api/* requests to the API server so the frontend can use + relative URLs (/api/v1/...) with no CORS configuration required. 
+ """ + import http.client + import http.server + import os + import socketserver + from pathlib import Path + + dist_dir = Path(__file__).resolve().parent.parent.parent / "decnet_web" / "dist" + + if not dist_dir.exists(): + console.print(f"[red]Frontend build not found at {dist_dir}. Make sure you run 'npm run build' inside 'decnet_web'.[/]") + raise typer.Exit(1) + + if daemon: + log.info("web daemonizing host=%s port=%d api_port=%d", host, web_port, api_port) + _utils._daemonize() + + _api_port = api_port + + class SPAHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): + def do_GET(self): + if self.path.startswith("/api/"): + self._proxy("GET") + return + path = self.translate_path(self.path) + if not Path(path).exists() or Path(path).is_dir(): + self.path = "/index.html" + return super().do_GET() + + def do_POST(self): + if self.path.startswith("/api/"): + self._proxy("POST") + return + self.send_error(405) + + def do_PUT(self): + if self.path.startswith("/api/"): + self._proxy("PUT") + return + self.send_error(405) + + def do_DELETE(self): + if self.path.startswith("/api/"): + self._proxy("DELETE") + return + self.send_error(405) + + def _proxy(self, method: str) -> None: + content_length = int(self.headers.get("Content-Length", 0)) + body = self.rfile.read(content_length) if content_length else None + + forward = {k: v for k, v in self.headers.items() + if k.lower() not in ("host", "connection")} + + try: + conn = http.client.HTTPConnection("127.0.0.1", _api_port, timeout=120) + conn.request(method, self.path, body=body, headers=forward) + resp = conn.getresponse() + + self.send_response(resp.status) + for key, val in resp.getheaders(): + if key.lower() not in ("connection", "transfer-encoding"): + self.send_header(key, val) + self.end_headers() + + content_type = resp.getheader("Content-Type", "") + if "text/event-stream" in content_type: + conn.sock.settimeout(None) + + _read = getattr(resp, "read1", resp.read) + while True: + chunk = _read(4096) + 
if not chunk: + break + self.wfile.write(chunk) + self.wfile.flush() + except Exception as exc: + log.warning("web proxy error %s %s: %s", method, self.path, exc) + self.send_error(502, f"API proxy error: {exc}") + finally: + try: + conn.close() + except Exception: # nosec B110 — best-effort conn cleanup + pass + + def log_message(self, fmt: str, *args: object) -> None: + log.debug("web %s", fmt % args) + + os.chdir(dist_dir) + + socketserver.TCPServer.allow_reuse_address = True + with socketserver.ThreadingTCPServer((host, web_port), SPAHTTPRequestHandler) as httpd: + console.print(f"[green]Serving DECNET Web Dashboard on http://{host}:{web_port}[/]") + console.print(f"[dim]Proxying /api/* → http://127.0.0.1:{_api_port}[/]") + try: + httpd.serve_forever() + except KeyboardInterrupt: + console.print("\n[dim]Shutting down dashboard server.[/]") diff --git a/decnet/cli/workers.py b/decnet/cli/workers.py new file mode 100644 index 0000000..cc0ba52 --- /dev/null +++ b/decnet/cli/workers.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +from typing import Optional + +import typer + +from decnet.env import DECNET_INGEST_LOG_FILE + +from . 
import utils as _utils +from .utils import console, log + + +def register(app: typer.Typer) -> None: + @app.command() + def probe( + log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", "-f", help="Path for RFC 5424 syslog + .json output (reads attackers from .json, writes results to both)"), + interval: int = typer.Option(300, "--interval", "-i", help="Seconds between probe cycles (default: 300)"), + timeout: float = typer.Option(5.0, "--timeout", help="Per-probe TCP timeout in seconds"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background (used by deploy, no console output)"), + ) -> None: + """Fingerprint attackers (JARM + HASSH + TCP/IP stack) discovered in the log stream.""" + import asyncio + from decnet.prober import prober_worker + + if daemon: + log.info("probe daemonizing log_file=%s interval=%d", log_file, interval) + _utils._daemonize() + asyncio.run(prober_worker(log_file, interval=interval, timeout=timeout)) + return + + log.info("probe command invoked log_file=%s interval=%d", log_file, interval) + console.print(f"[bold cyan]DECNET-PROBER[/] watching {log_file} for attackers (interval: {interval}s)") + console.print("[dim]Press Ctrl+C to stop[/]") + try: + asyncio.run(prober_worker(log_file, interval=interval, timeout=timeout)) + except KeyboardInterrupt: + console.print("\n[yellow]DECNET-PROBER stopped.[/]") + + @app.command() + def collect( + log_file: str = typer.Option(DECNET_INGEST_LOG_FILE, "--log-file", "-f", help="Path to write RFC 5424 syslog lines and .json records"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Stream Docker logs from all running decky service containers to a log file.""" + import asyncio + from decnet.collector import log_collector_worker + + if daemon: + log.info("collect daemonizing log_file=%s", log_file) + _utils._daemonize() + + log.info("collect command invoked log_file=%s", log_file) + 
console.print(f"[bold cyan]Collector starting[/] → {log_file}") + asyncio.run(log_collector_worker(log_file)) + + @app.command() + def mutate( + watch: bool = typer.Option(False, "--watch", "-w", help="Run continuously and mutate deckies according to their interval"), + decky_name: Optional[str] = typer.Option(None, "--decky", help="Force mutate a specific decky immediately"), + force_all: bool = typer.Option(False, "--all", help="Force mutate all deckies immediately"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Manually trigger or continuously watch for decky mutation.""" + import asyncio + from decnet.mutator import mutate_decky, mutate_all, run_watch_loop + from decnet.web.dependencies import repo + + if daemon: + log.info("mutate daemonizing watch=%s", watch) + _utils._daemonize() + + async def _run() -> None: + await repo.initialize() + if watch: + await run_watch_loop(repo) + elif decky_name: + await mutate_decky(decky_name, repo) + elif force_all: + await mutate_all(force=True, repo=repo) + else: + await mutate_all(force=False, repo=repo) + + asyncio.run(_run()) + + @app.command(name="correlate") + def correlate( + log_file: Optional[str] = typer.Option(None, "--log-file", "-f", help="Path to DECNET syslog file to analyse"), + min_deckies: int = typer.Option(2, "--min-deckies", "-m", help="Minimum number of distinct deckies an IP must touch to be reported"), + output: str = typer.Option("table", "--output", "-o", help="Output format: table | json | syslog"), + emit_syslog: bool = typer.Option(False, "--emit-syslog", help="Also print traversal events as RFC 5424 lines (for SIEM piping)"), + daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"), + ) -> None: + """Analyse logs for cross-decky traversals and print the attacker movement graph.""" + import sys + import json as _json + from pathlib import Path + from 
decnet.correlation.engine import CorrelationEngine + + if daemon: + log.info("correlate daemonizing log_file=%s", log_file) + _utils._daemonize() + + engine = CorrelationEngine() + + if log_file: + path = Path(log_file) + if not path.exists(): + console.print(f"[red]Log file not found: {log_file}[/]") + raise typer.Exit(1) + engine.ingest_file(path) + elif not sys.stdin.isatty(): + for line in sys.stdin: + engine.ingest(line) + else: + console.print("[red]Provide --log-file or pipe log data via stdin.[/]") + raise typer.Exit(1) + + traversals = engine.traversals(min_deckies) + + if output == "json": + console.print_json(_json.dumps(engine.report_json(min_deckies), indent=2)) + elif output == "syslog": + for line in engine.traversal_syslog_lines(min_deckies): + typer.echo(line) + else: + if not traversals: + console.print( + f"[yellow]No traversals detected " + f"(min_deckies={min_deckies}, events_indexed={engine.events_indexed}).[/]" + ) + else: + console.print(engine.report_table(min_deckies)) + console.print( + f"[dim]Parsed {engine.lines_parsed} lines · " + f"indexed {engine.events_indexed} events · " + f"{len(engine.all_attackers())} unique IPs · " + f"[bold]{len(traversals)}[/] traversal(s)[/]" + ) + + if emit_syslog: + for line in engine.traversal_syslog_lines(min_deckies): + typer.echo(line) diff --git a/decnet/collector/worker.py b/decnet/collector/worker.py index 69e2c6b..3234afc 100644 --- a/decnet/collector/worker.py +++ b/decnet/collector/worker.py @@ -8,13 +8,100 @@ The ingester tails the .json file; rsyslog can consume the .log file independent import asyncio import json -import logging +import os import re +import threading +import time +from concurrent.futures import ThreadPoolExecutor from datetime import datetime from pathlib import Path from typing import Any, Optional -logger = logging.getLogger("decnet.collector") +from decnet.logging import get_logger +from decnet.telemetry import traced as _traced, get_tracer as _get_tracer, inject_context as 
_inject_ctx + +logger = get_logger("collector") + +# ─── Ingestion rate limiter ─────────────────────────────────────────────────── +# +# Rationale: connection-lifecycle events (connect/disconnect/accept/close) are +# emitted once per TCP connection. During a portscan or credential-stuffing +# run, a single attacker can generate hundreds of these per second from the +# honeypot services themselves — each becoming a tiny WAL-write transaction +# through the ingester, starving reads until the queue drains. +# +# The collector still writes every line to the raw .log file (forensic record +# for rsyslog/SIEM). Only the .json path — which feeds SQLite — is deduped. +# +# Dedup key: (attacker_ip, decky, service, event_type) +# Window: DECNET_COLLECTOR_RL_WINDOW_SEC seconds (default 1.0) +# Scope: DECNET_COLLECTOR_RL_EVENT_TYPES comma list +# (default: connect,disconnect,connection,accept,close) +# Events outside that set bypass the limiter untouched. + +def _parse_float_env(name: str, default: float) -> float: + raw = os.environ.get(name) + if raw is None: + return default + try: + value = float(raw) + except ValueError: + logger.warning("collector: invalid %s=%r, using default %s", name, raw, default) + return default + return max(0.0, value) + + +_RL_WINDOW_SEC: float = _parse_float_env("DECNET_COLLECTOR_RL_WINDOW_SEC", 1.0) +_RL_EVENT_TYPES: frozenset[str] = frozenset( + t.strip() + for t in os.environ.get( + "DECNET_COLLECTOR_RL_EVENT_TYPES", + "connect,disconnect,connection,accept,close", + ).split(",") + if t.strip() +) +_RL_MAX_ENTRIES: int = 10_000 + +_rl_lock: threading.Lock = threading.Lock() +_rl_last: dict[tuple[str, str, str, str], float] = {} + + +def _should_ingest(parsed: dict[str, Any]) -> bool: + """ + Return True if this parsed event should be written to the JSON ingestion + stream. Rate-limited connection-lifecycle events return False when another + event with the same (attacker_ip, decky, service, event_type) was emitted + inside the dedup window. 
+ """ + event_type = parsed.get("event_type", "") + if _RL_WINDOW_SEC <= 0.0 or event_type not in _RL_EVENT_TYPES: + return True + key = ( + parsed.get("attacker_ip", "Unknown"), + parsed.get("decky", ""), + parsed.get("service", ""), + event_type, + ) + now = time.monotonic() + with _rl_lock: + last = _rl_last.get(key, 0.0) + if now - last < _RL_WINDOW_SEC: + return False + _rl_last[key] = now + # Opportunistic GC: when the map grows past the cap, drop entries older + # than 60 windows (well outside any realistic in-flight dedup range). + if len(_rl_last) > _RL_MAX_ENTRIES: + cutoff = now - (_RL_WINDOW_SEC * 60.0) + stale = [k for k, t in _rl_last.items() if t < cutoff] + for k in stale: + del _rl_last[k] + return True + + +def _reset_rate_limiter() -> None: + """Test-only helper — clear dedup state between test cases.""" + with _rl_lock: + _rl_last.clear() # ─── RFC 5424 parser ────────────────────────────────────────────────────────── @@ -23,13 +110,22 @@ _RFC5424_RE = re.compile( r"(\S+) " # 1: TIMESTAMP r"(\S+) " # 2: HOSTNAME (decky name) r"(\S+) " # 3: APP-NAME (service) - r"- " # PROCID always NILVALUE + r"\S+ " # PROCID — NILVALUE ("-") for syslog_bridge emitters, + # real PID for native syslog callers like sshd/sudo + # routed through rsyslog. Accept both; we don't consume it. r"(\S+) " # 4: MSGID (event_type) r"(.+)$", # 5: SD element + optional MSG ) -_SD_BLOCK_RE = re.compile(r'\[decnet@55555\s+(.*?)\]', re.DOTALL) +_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL) _PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"') -_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "ip") +_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip") + +# Free-form `key=value` pairs in the MSG body. Used for lines that bypass the +# syslog_bridge SD format — e.g. the SSH container's PROMPT_COMMAND which +# calls `logger -t bash "CMD uid=0 user=root src=1.2.3.4 pwd=/root cmd=…"`. 
+# Values run until the next whitespace, so `cmd=…` at end-of-line is preserved +# as one unit; we only care about IP-shaped fields here anyway. +_MSG_KV_RE = re.compile(r'(\w+)=(\S+)') def parse_rfc5424(line: str) -> Optional[dict[str, Any]]: @@ -64,6 +160,19 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]: attacker_ip = fields[fname] break + # Fallback for plain `logger` callers that don't use SD params (notably + # the SSH container's bash PROMPT_COMMAND: `logger -t bash "CMD … src=IP …"`). + # Scan the MSG body for IP-shaped `key=value` tokens ONLY — don't fold + # them into `fields`, because the frontend's parseEventBody already + # renders kv pairs from the msg and doubling them up produces noisy + # duplicate pills. This keeps attacker attribution working without + # changing the shape of `fields` for non-SD lines. + if attacker_ip == "Unknown" and msg: + for k, v in _MSG_KV_RE.findall(msg): + if k in _IP_FIELDS: + attacker_ip = v + break + try: ts_formatted = datetime.fromisoformat(ts_raw).strftime("%Y-%m-%d %H:%M:%S") except ValueError: @@ -115,34 +224,78 @@ def is_service_event(attrs: dict) -> bool: # ─── Blocking stream worker (runs in a thread) ──────────────────────────────── +def _reopen_if_needed(path: Path, fh: Optional[Any]) -> Any: + """Return fh if it still points to the same inode as path; otherwise close + fh and open a fresh handle. Handles the file being deleted (manual rm) or + rotated (logrotate rename + create).""" + try: + if fh is not None and os.fstat(fh.fileno()).st_ino == os.stat(path).st_ino: + return fh + except OSError: + pass + # File gone or inode changed — close stale handle and open a new one. 
+ if fh is not None: + try: + fh.close() + except Exception: # nosec B110 — best-effort file handle cleanup + pass + path.parent.mkdir(parents=True, exist_ok=True) + return open(path, "a", encoding="utf-8") + + +@_traced("collector.stream_container") def _stream_container(container_id: str, log_path: Path, json_path: Path) -> None: """Stream logs from one container and append to the host log files.""" import docker # type: ignore[import] + lf: Optional[Any] = None + jf: Optional[Any] = None try: client = docker.from_env() container = client.containers.get(container_id) log_stream = container.logs(stream=True, follow=True, stdout=True, stderr=False) buf = "" - with ( - open(log_path, "a", encoding="utf-8") as lf, - open(json_path, "a", encoding="utf-8") as jf, - ): - for chunk in log_stream: - buf += chunk.decode("utf-8", errors="replace") - while "\n" in buf: - line, buf = buf.split("\n", 1) - line = line.rstrip() - if not line: - continue - lf.write(line + "\n") - lf.flush() - parsed = parse_rfc5424(line) - if parsed: - jf.write(json.dumps(parsed) + "\n") - jf.flush() + for chunk in log_stream: + buf += chunk.decode("utf-8", errors="replace") + while "\n" in buf: + line, buf = buf.split("\n", 1) + line = line.rstrip() + if not line: + continue + lf = _reopen_if_needed(log_path, lf) + lf.write(line + "\n") + lf.flush() + parsed = parse_rfc5424(line) + if parsed: + if _should_ingest(parsed): + _tracer = _get_tracer("collector") + with _tracer.start_as_current_span("collector.event") as _span: + _span.set_attribute("decky", parsed.get("decky", "")) + _span.set_attribute("service", parsed.get("service", "")) + _span.set_attribute("event_type", parsed.get("event_type", "")) + _span.set_attribute("attacker_ip", parsed.get("attacker_ip", "")) + _inject_ctx(parsed) + logger.debug("collector: event written decky=%s type=%s", parsed.get("decky"), parsed.get("event_type")) + jf = _reopen_if_needed(json_path, jf) + jf.write(json.dumps(parsed) + "\n") + jf.flush() + else: + 
logger.debug( + "collector: rate-limited decky=%s service=%s type=%s attacker=%s", + parsed.get("decky"), parsed.get("service"), + parsed.get("event_type"), parsed.get("attacker_ip"), + ) + else: + logger.debug("collector: malformed RFC5424 line snippet=%r", line[:80]) except Exception as exc: - logger.debug("Log stream ended for container %s: %s", container_id, exc) + logger.debug("collector: log stream ended container_id=%s reason=%s", container_id, exc) + finally: + for fh in (lf, jf): + if fh is not None: + try: + fh.close() + except Exception: # nosec B110 — best-effort file handle cleanup + pass # ─── Async collector ────────────────────────────────────────────────────────── @@ -164,15 +317,26 @@ async def log_collector_worker(log_file: str) -> None: active: dict[str, asyncio.Task[None]] = {} loop = asyncio.get_running_loop() + # Dedicated thread pool so long-running container log streams don't + # saturate the default asyncio executor and starve short-lived + # to_thread() calls elsewhere (e.g. load_state in the web API). 
+ collector_pool = ThreadPoolExecutor( + max_workers=64, thread_name_prefix="decnet-collector", + ) + def _spawn(container_id: str, container_name: str) -> None: if container_id not in active or active[container_id].done(): active[container_id] = asyncio.ensure_future( - asyncio.to_thread(_stream_container, container_id, log_path, json_path), + loop.run_in_executor( + collector_pool, _stream_container, + container_id, log_path, json_path, + ), loop=loop, ) - logger.info("Collecting logs from container: %s", container_name) + logger.info("collector: streaming container=%s", container_name) try: + logger.info("collector started log_path=%s", log_path) client = docker.from_env() for container in client.containers.list(): @@ -190,11 +354,15 @@ async def log_collector_worker(log_file: str) -> None: if cid and is_service_event(attrs): loop.call_soon_threadsafe(_spawn, cid, name) - await asyncio.to_thread(_watch_events) + await loop.run_in_executor(collector_pool, _watch_events) except asyncio.CancelledError: + logger.info("collector shutdown requested cancelling %d tasks", len(active)) for task in active.values(): task.cancel() + collector_pool.shutdown(wait=False) raise except Exception as exc: - logger.error("Collector error: %s", exc) + logger.error("collector error: %s", exc) + finally: + collector_pool.shutdown(wait=False) diff --git a/decnet/composer.py b/decnet/composer.py index 973762e..d789615 100644 --- a/decnet/composer.py +++ b/decnet/composer.py @@ -64,6 +64,8 @@ def generate_compose(config: DecnetConfig) -> dict: # --- Service containers: share base network namespace --- for svc_name in decky.services: svc = get_service(svc_name) + if svc.fleet_singleton: + continue svc_cfg = decky.service_config.get(svc_name, {}) fragment = svc.compose_fragment(decky.name, service_cfg=svc_cfg) diff --git a/decnet/config.py b/decnet/config.py index f07c682..b0f1e9f 100644 --- a/decnet/config.py +++ b/decnet/config.py @@ -48,23 +48,53 @@ class 
Rfc5424Formatter(logging.Formatter): msg = record.getMessage() if record.exc_info: msg += "\n" + self.formatException(record.exc_info) + app = getattr(record, "decnet_component", self._app) return ( - f"<{prival}>1 {ts} {self._hostname} {self._app}" + f"<{prival}>1 {ts} {self._hostname} {app}" f" {os.getpid()} {record.name} - {msg}" ) def _configure_logging(dev: bool) -> None: - """Install the RFC 5424 handler on the root logger (idempotent).""" + """Install RFC 5424 handlers on the root logger (idempotent). + + Always adds a StreamHandler (stderr). Also adds a RotatingFileHandler + writing to DECNET_SYSTEM_LOGS (default: decnet.system.log in $PWD) so + all microservice daemons — which redirect stderr to /dev/null — still + produce readable logs. File handler is skipped under pytest. + """ + from decnet.logging.inode_aware_handler import InodeAwareRotatingFileHandler + root = logging.getLogger() - # Avoid adding duplicate handlers on re-import (e.g. during testing) + # Guard: if our StreamHandler is already installed, all handlers are set. if any(isinstance(h, logging.StreamHandler) and isinstance(h.formatter, Rfc5424Formatter) for h in root.handlers): return - handler = logging.StreamHandler() - handler.setFormatter(Rfc5424Formatter()) + + fmt = Rfc5424Formatter() root.setLevel(logging.DEBUG if dev else logging.INFO) - root.addHandler(handler) + + stream_handler = logging.StreamHandler() + stream_handler.setFormatter(fmt) + root.addHandler(stream_handler) + + # Skip the file handler during pytest runs to avoid polluting the test cwd. 
+ _in_pytest = any(k.startswith("PYTEST") for k in os.environ) + if not _in_pytest: + _log_path = os.environ.get("DECNET_SYSTEM_LOGS", "decnet.system.log") + file_handler = InodeAwareRotatingFileHandler( + _log_path, + mode="a", + maxBytes=10 * 1024 * 1024, # 10 MB + backupCount=5, + encoding="utf-8", + ) + file_handler.setFormatter(fmt) + root.addHandler(file_handler) + # Drop root ownership when invoked via sudo so non-root follow-up + # commands (e.g. `decnet api` after `sudo decnet deploy`) can append. + from decnet.privdrop import chown_to_invoking_user + chown_to_invoking_user(_log_path) _dev = os.environ.get("DECNET_DEVELOPER", "").lower() == "true" diff --git a/decnet/config_ini.py b/decnet/config_ini.py new file mode 100644 index 0000000..6a914e2 --- /dev/null +++ b/decnet/config_ini.py @@ -0,0 +1,90 @@ +"""Parse /etc/decnet/decnet.ini and seed os.environ defaults. + +The INI file is a convenience layer on top of the existing DECNET_* env +vars. It never overrides an explicit environment variable (uses +os.environ.setdefault). Call load_ini_config() once, very early, before +any decnet.env import, so env.py picks up the seeded values as if they +had been exported by the shell. + +Shape:: + + [decnet] + mode = agent # or "master" + log-directory = /var/log/decnet + disallow-master = true + + [agent] + master-host = 192.168.1.50 + master-port = 8770 + agent-port = 8765 + agent-dir = /home/anti/.decnet/agent + ... + + [master] + api-host = 0.0.0.0 + swarmctl-port = 8770 + listener-port = 6514 + ... + +Only the section matching `mode` is loaded. The other section is +ignored silently so an agent host never reads master secrets (and +vice versa). Keys are converted to SCREAMING_SNAKE_CASE and prefixed +with ``DECNET_`` — e.g. ``master-host`` → ``DECNET_MASTER_HOST``. 
+""" +from __future__ import annotations + +import configparser +import os +from pathlib import Path +from typing import Optional + + +DEFAULT_CONFIG_PATH = Path("/etc/decnet/decnet.ini") + +# The [decnet] section keys are role-agnostic and always exported. +_COMMON_KEYS = frozenset({"mode", "disallow-master", "log-directory"}) + + +def _key_to_env(key: str) -> str: + return "DECNET_" + key.replace("-", "_").upper() + + +def load_ini_config(path: Optional[Path] = None) -> Optional[Path]: + """Seed os.environ defaults from the DECNET INI file. + + Returns the path that was actually loaded (so callers can log it), or + None if no file was read. Missing file is a no-op — callers fall back + to env vars / CLI flags / hardcoded defaults. + + Precedence: real os.environ > INI > defaults. Real env vars are never + overwritten because we use setdefault(). + """ + if path is None: + override = os.environ.get("DECNET_CONFIG") + path = Path(override) if override else DEFAULT_CONFIG_PATH + + if not path.is_file(): + return None + + parser = configparser.ConfigParser() + parser.read(path) + + # [decnet] first — mode/disallow-master/log-directory. These seed the + # mode decision for the section selection below. + if parser.has_section("decnet"): + for key, value in parser.items("decnet"): + os.environ.setdefault(_key_to_env(key), value) + + mode = os.environ.get("DECNET_MODE", "master").lower() + if mode not in ("agent", "master"): + raise ValueError( + f"decnet.ini: [decnet] mode must be 'agent' or 'master', got '{mode}'" + ) + + # Role-specific section. 
+ section = mode + if parser.has_section(section): + for key, value in parser.items(section): + os.environ.setdefault(_key_to_env(key), value) + + return path diff --git a/decnet/correlation/engine.py b/decnet/correlation/engine.py index 1f9f748..198d544 100644 --- a/decnet/correlation/engine.py +++ b/decnet/correlation/engine.py @@ -33,6 +33,7 @@ from decnet.logging.syslog_formatter import ( SEVERITY_WARNING, format_rfc5424, ) +from decnet.telemetry import traced as _traced, get_tracer as _get_tracer class CorrelationEngine: @@ -64,6 +65,7 @@ class CorrelationEngine: self.events_indexed += 1 return event + @_traced("correlation.ingest_file") def ingest_file(self, path: Path) -> int: """ Parse every line of *path* and index it. @@ -73,12 +75,18 @@ class CorrelationEngine: with open(path) as fh: for line in fh: self.ingest(line) + _tracer = _get_tracer("correlation") + with _tracer.start_as_current_span("correlation.ingest_file.summary") as _span: + _span.set_attribute("lines_parsed", self.lines_parsed) + _span.set_attribute("events_indexed", self.events_indexed) + _span.set_attribute("unique_ips", len(self._events)) return self.events_indexed # ------------------------------------------------------------------ # # Query # # ------------------------------------------------------------------ # + @_traced("correlation.traversals") def traversals(self, min_deckies: int = 2) -> list[AttackerTraversal]: """ Return all attackers that touched at least *min_deckies* distinct @@ -135,6 +143,7 @@ class CorrelationEngine: ) return table + @_traced("correlation.report_json") def report_json(self, min_deckies: int = 2) -> dict: """Serialisable dict representation of all traversals.""" return { @@ -147,6 +156,7 @@ class CorrelationEngine: "traversals": [t.to_dict() for t in self.traversals(min_deckies)], } + @_traced("correlation.traversal_syslog_lines") def traversal_syslog_lines(self, min_deckies: int = 2) -> list[str]: """ Emit one RFC 5424 syslog line per detected traversal. 
diff --git a/decnet/correlation/parser.py b/decnet/correlation/parser.py index e457254..4aae381 100644 --- a/decnet/correlation/parser.py +++ b/decnet/correlation/parser.py @@ -6,7 +6,7 @@ the fields needed for cross-decky correlation: attacker IP, decky name, service, event type, and timestamp. Log format (produced by decnet.logging.syslog_formatter): - 1 TIMESTAMP HOSTNAME APP-NAME - MSGID [decnet@55555 k1="v1" k2="v2"] [MSG] + 1 TIMESTAMP HOSTNAME APP-NAME - MSGID [relay@55555 k1="v1" k2="v2"] [MSG] The attacker IP may appear under several field names depending on service: src_ip — ftp, smtp, http, most services @@ -31,14 +31,14 @@ _RFC5424_RE = re.compile( r"(.+)$", # 5: SD element + optional MSG ) -# Structured data block: [decnet@55555 k="v" ...] -_SD_BLOCK_RE = re.compile(r'\[decnet@55555\s+(.*?)\]', re.DOTALL) +# Structured data block: [relay@55555 k="v" ...] +_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL) # Individual param: key="value" (with escaped chars inside value) _PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"') # Field names to probe for attacker IP, in priority order -_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "ip") +_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip") @dataclass diff --git a/decnet/engine/deployer.py b/decnet/engine/deployer.py index 3f03c63..c788158 100644 --- a/decnet/engine/deployer.py +++ b/decnet/engine/deployer.py @@ -11,6 +11,8 @@ import docker from rich.console import Console from rich.table import Table +from decnet.logging import get_logger +from decnet.telemetry import traced as _traced from decnet.config import DecnetConfig, clear_state, load_state, save_state from decnet.composer import write_compose from decnet.network import ( @@ -26,13 +28,14 @@ from decnet.network import ( teardown_host_macvlan, ) +log = get_logger("engine") console = Console() COMPOSE_FILE = Path("decnet-compose.yml") -_CANONICAL_LOGGING = 
Path(__file__).parent.parent.parent / "templates" / "decnet_logging.py" +_CANONICAL_LOGGING = Path(__file__).parent.parent / "templates" / "syslog_bridge.py" def _sync_logging_helper(config: DecnetConfig) -> None: - """Copy the canonical decnet_logging.py into every active template build context.""" + """Copy the canonical syslog_bridge.py into every active template build context.""" from decnet.services.registry import get_service seen: set[Path] = set() for decky in config.deckies: @@ -44,16 +47,32 @@ def _sync_logging_helper(config: DecnetConfig) -> None: if ctx is None or ctx in seen: continue seen.add(ctx) - dest = ctx / "decnet_logging.py" + dest = ctx / "syslog_bridge.py" if not dest.exists() or dest.read_bytes() != _CANONICAL_LOGGING.read_bytes(): shutil.copy2(_CANONICAL_LOGGING, dest) def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None: import os - cmd = ["docker", "compose", "-f", str(compose_file), *args] + # -p decnet pins the compose project name. Without it, docker compose + # derives the project from basename($PWD); when a daemon (systemd) runs + # with WorkingDirectory=/ that basename is empty and compose aborts with + # "project name must not be empty". + cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args] merged = {**os.environ, **(env or {})} - subprocess.run(cmd, check=True, env=merged) # nosec B603 + result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603 + if result.stdout: + print(result.stdout, end="") + if result.returncode != 0: + # Docker emits the useful detail ("Address already in use", which IP, + # which port) on stderr. Surface it to the structured log so the + # agent's journal carries it — without this the upstream traceback + # just shows the exit code. 
+ if result.stderr: + log.error("docker compose %s failed: %s", " ".join(args), result.stderr.strip()) + raise subprocess.CalledProcessError( + result.returncode, cmd, result.stdout, result.stderr + ) _PERMANENT_ERRORS = ( @@ -65,6 +84,7 @@ _PERMANENT_ERRORS = ( ) +@_traced("engine.compose_with_retry") def _compose_with_retry( *args: str, compose_file: Path = COMPOSE_FILE, @@ -75,7 +95,11 @@ def _compose_with_retry( """Run a docker compose command, retrying on transient failures.""" import os last_exc: subprocess.CalledProcessError | None = None - cmd = ["docker", "compose", "-f", str(compose_file), *args] + # -p decnet pins the compose project name. Without it, docker compose + # derives the project from basename($PWD); when a daemon (systemd) runs + # with WorkingDirectory=/ that basename is empty and compose aborts with + # "project name must not be empty". + cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args] merged = {**os.environ, **(env or {})} for attempt in range(1, retries + 1): result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603 @@ -102,15 +126,21 @@ def _compose_with_retry( else: if result.stderr: console.print(f"[red]{result.stderr.strip()}[/]") + log.error("docker compose %s failed after %d attempts: %s", + " ".join(args), retries, result.stderr.strip()) raise last_exc +@_traced("engine.deploy") def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False, parallel: bool = False) -> None: + log.info("deployment started n_deckies=%d interface=%s subnet=%s dry_run=%s", len(config.deckies), config.interface, config.subnet, dry_run) + log.debug("deploy: deckies=%s", [d.name for d in config.deckies]) client = docker.from_env() ip_list = [d.ip for d in config.deckies] decky_range = ips_to_range(ip_list) host_ip = get_host_ip(config.interface) + log.debug("deploy: ip_range=%s host_ip=%s", decky_range, host_ip) net_driver = "IPvlan L2" if config.ipvlan else "MACVLAN" 
console.print(f"[bold cyan]Creating {net_driver} network[/] ({MACVLAN_NETWORK_NAME}) on {config.interface}") @@ -140,11 +170,21 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False, console.print(f"[bold cyan]Compose file written[/] → {compose_path}") if dry_run: + log.info("deployment dry-run complete compose_path=%s", compose_path) console.print("[yellow]Dry run — no containers started.[/]") return save_state(config, compose_path) + # Pre-up cleanup: a prior half-failed `up` can leave containers still + # holding the IPs/ports this run wants, which surfaces as the recurring + # "Address already in use" from Docker's IPAM. Best-effort — ignore + # failure (e.g. nothing to tear down on a clean host). + try: + _compose("down", "--remove-orphans", compose_file=compose_path) + except subprocess.CalledProcessError: + log.debug("pre-up cleanup: compose down failed (likely nothing to remove)") + build_env = {"DOCKER_BUILDKIT": "1"} if parallel else {} console.print("[bold cyan]Building images and starting deckies...[/]") @@ -161,12 +201,16 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False, _compose_with_retry("build", "--no-cache", compose_file=compose_path) _compose_with_retry("up", "--build", "-d", compose_file=compose_path) + log.info("deployment complete n_deckies=%d", len(config.deckies)) _print_status(config) +@_traced("engine.teardown") def teardown(decky_id: str | None = None) -> None: + log.info("teardown requested decky_id=%s", decky_id or "all") state = load_state() if state is None: + log.warning("teardown: no active deployment found") console.print("[red]No active deployment found (no decnet-state.json).[/]") return @@ -174,10 +218,14 @@ def teardown(decky_id: str | None = None) -> None: client = docker.from_env() if decky_id: - svc_names = [f"{decky_id}-{svc}" for svc in [d.services for d in config.deckies if d.name == decky_id]] - if not svc_names: + decky = next((d for d in config.deckies if 
d.name == decky_id), None) + if decky is None: console.print(f"[red]Decky '{decky_id}' not found in current deployment.[/]") return + svc_names = [f"{decky_id}-{svc}" for svc in decky.services] + if not svc_names: + log.warning("teardown: decky %s has no services to stop", decky_id) + return _compose("stop", *svc_names, compose_file=compose_path) _compose("rm", "-f", *svc_names, compose_file=compose_path) else: @@ -193,6 +241,7 @@ def teardown(decky_id: str | None = None) -> None: clear_state() net_driver = "IPvlan" if config.ipvlan else "MACVLAN" + log.info("teardown complete all deckies removed network_driver=%s", net_driver) console.print(f"[green]All deckies torn down. {net_driver} network removed.[/]") diff --git a/decnet/env.py b/decnet/env.py index eb57d3d..90cf221 100644 --- a/decnet/env.py +++ b/decnet/env.py @@ -6,9 +6,14 @@ from dotenv import load_dotenv # Calculate absolute path to the project root _ROOT: Path = Path(__file__).parent.parent.absolute() -# Load .env.local first, then fallback to .env +# Load .env.local first, then fallback to .env. +# Also check CWD so deployments that install into site-packages (e.g. the +# self-updater's release slots) can ship a per-host .env.local at the +# process's working directory without having to edit site-packages. load_dotenv(_ROOT / ".env.local") load_dotenv(_ROOT / ".env") +load_dotenv(Path.cwd() / ".env.local") +load_dotenv(Path.cwd() / ".env") def _port(name: str, default: int) -> int: @@ -40,30 +45,109 @@ def _require_env(name: str) -> str: f"Environment variable '{name}' is set to an insecure default ('{value}'). " f"Choose a strong, unique value before starting DECNET." ) + if name == "DECNET_JWT_SECRET" and len(value) < 32: + _developer = os.environ.get("DECNET_DEVELOPER", "False").lower() == "true" + if not _developer: + raise ValueError( + f"DECNET_JWT_SECRET is too short ({len(value)} bytes). " + f"Use at least 32 characters to satisfy HS256 requirements (RFC 7518 §3.2)." 
+ ) return value +# System logging — all microservice daemons append here. +DECNET_SYSTEM_LOGS: str = os.environ.get("DECNET_SYSTEM_LOGS", "decnet.system.log") + +# Set to "true" to embed the profiler inside the API process. +# Leave unset (default) when the standalone `decnet profiler --daemon` is +# running — embedding both produces two workers sharing the same DB cursor, +# which causes events to be skipped or processed twice. +DECNET_EMBED_PROFILER: bool = os.environ.get("DECNET_EMBED_PROFILER", "").lower() == "true" + +# Set to "true" to embed the MACVLAN sniffer inside the API process. +# Leave unset (default) when the standalone `decnet sniffer --daemon` is +# running (which `decnet deploy` always does). Embedding both produces two +# workers sniffing the same interface — duplicated events and wasted CPU. +DECNET_EMBED_SNIFFER: bool = os.environ.get("DECNET_EMBED_SNIFFER", "").lower() == "true" + +# Set to "true" to mount the Pyinstrument ASGI middleware on the FastAPI app. +# Produces per-request HTML flamegraphs under ./profiles/. Off by default so +# production and normal dev runs pay zero profiling overhead. +DECNET_PROFILE_REQUESTS: bool = os.environ.get("DECNET_PROFILE_REQUESTS", "").lower() == "true" +DECNET_PROFILE_DIR: str = os.environ.get("DECNET_PROFILE_DIR", "profiles") + # API Options -DECNET_API_HOST: str = os.environ.get("DECNET_API_HOST", "0.0.0.0") # nosec B104 +DECNET_API_HOST: str = os.environ.get("DECNET_API_HOST", "127.0.0.1") DECNET_API_PORT: int = _port("DECNET_API_PORT", 8000) -DECNET_JWT_SECRET: str = _require_env("DECNET_JWT_SECRET") +# DECNET_JWT_SECRET is resolved lazily via module __getattr__ so that agent / +# updater / swarmctl subcommands (which never touch auth) can start without +# the master's JWT secret being present in the environment. 
DECNET_INGEST_LOG_FILE: str | None = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log") +# SWARM log pipeline — RFC 5425 syslog-over-TLS between worker forwarders +# and the master listener. Plaintext syslog across hosts is forbidden. +DECNET_SWARM_SYSLOG_PORT: int = _port("DECNET_SWARM_SYSLOG_PORT", 6514) +DECNET_SWARM_MASTER_HOST: str | None = os.environ.get("DECNET_SWARM_MASTER_HOST") + +# Worker-side identity + swarmctl locator, seeded by the enroll bundle's +# /etc/decnet/decnet.ini ([agent] host-uuid / master-host / swarmctl-port). +# The agent heartbeat loop uses these to self-identify to the master. +DECNET_HOST_UUID: str | None = os.environ.get("DECNET_HOST_UUID") +DECNET_MASTER_HOST: str | None = os.environ.get("DECNET_MASTER_HOST") +DECNET_SWARMCTL_PORT: int = _port("DECNET_SWARMCTL_PORT", 8770) + +# Ingester batching: how many log rows to accumulate per commit, and the +# max wait (ms) before flushing a partial batch. Larger batches reduce +# SQLite write-lock contention; the timeout keeps latency bounded during +# low-traffic periods. +DECNET_BATCH_SIZE: int = int(os.environ.get("DECNET_BATCH_SIZE", "100")) +DECNET_BATCH_MAX_WAIT_MS: int = int(os.environ.get("DECNET_BATCH_MAX_WAIT_MS", "250")) + # Web Dashboard Options -DECNET_WEB_HOST: str = os.environ.get("DECNET_WEB_HOST", "0.0.0.0") # nosec B104 +DECNET_WEB_HOST: str = os.environ.get("DECNET_WEB_HOST", "127.0.0.1") DECNET_WEB_PORT: int = _port("DECNET_WEB_PORT", 8080) DECNET_ADMIN_USER: str = os.environ.get("DECNET_ADMIN_USER", "admin") DECNET_ADMIN_PASSWORD: str = os.environ.get("DECNET_ADMIN_PASSWORD", "admin") DECNET_DEVELOPER: bool = os.environ.get("DECNET_DEVELOPER", "False").lower() == "true" +# Host role — seeded by /etc/decnet/decnet.ini or exported directly. +# "master" = the central server (api, web, swarmctl, listener). +# "agent" = a worker node (agent, forwarder, updater). Workers gate their +# Typer CLI to hide master-only commands (see decnet/cli.py). 
+DECNET_MODE: str = os.environ.get("DECNET_MODE", "master").lower() +# When mode=agent, hide master-only Typer commands. Set to "false" for dual- +# role dev hosts where a single machine plays both sides. +DECNET_DISALLOW_MASTER: bool = ( + os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true" +) + +# Tracing — set to "true" to enable OpenTelemetry distributed tracing. +# Separate from DECNET_DEVELOPER so tracing can be toggled independently. +DECNET_DEVELOPER_TRACING: bool = os.environ.get("DECNET_DEVELOPER_TRACING", "").lower() == "true" +DECNET_OTEL_ENDPOINT: str = os.environ.get("DECNET_OTEL_ENDPOINT", "http://localhost:4317") + # Database Options DECNET_DB_TYPE: str = os.environ.get("DECNET_DB_TYPE", "sqlite").lower() DECNET_DB_URL: Optional[str] = os.environ.get("DECNET_DB_URL") +# MySQL component vars (used only when DECNET_DB_URL is not set) +DECNET_DB_HOST: str = os.environ.get("DECNET_DB_HOST", "localhost") +DECNET_DB_PORT: int = _port("DECNET_DB_PORT", 3306) if os.environ.get("DECNET_DB_PORT") else 3306 +DECNET_DB_NAME: str = os.environ.get("DECNET_DB_NAME", "decnet") +DECNET_DB_USER: str = os.environ.get("DECNET_DB_USER", "decnet") +DECNET_DB_PASSWORD: Optional[str] = os.environ.get("DECNET_DB_PASSWORD") # CORS — comma-separated list of allowed origins for the web dashboard API. # Defaults to the configured web host/port. Override with DECNET_CORS_ORIGINS if needed. 
# Example: DECNET_CORS_ORIGINS=http://192.168.1.50:9090,https://dashboard.example.com -_web_hostname: str = "localhost" if DECNET_WEB_HOST in ("0.0.0.0", "127.0.0.1", "::") else DECNET_WEB_HOST # nosec B104 +_WILDCARD_ADDRS = {"0.0.0.0", "127.0.0.1", "::"} # nosec B104 — comparison only, not a bind +_web_hostname: str = "localhost" if DECNET_WEB_HOST in _WILDCARD_ADDRS else DECNET_WEB_HOST _cors_default: str = f"http://{_web_hostname}:{DECNET_WEB_PORT}" _cors_raw: str = os.environ.get("DECNET_CORS_ORIGINS", _cors_default) DECNET_CORS_ORIGINS: list[str] = [o.strip() for o in _cors_raw.split(",") if o.strip()] + + +def __getattr__(name: str) -> str: + """Lazy resolution for secrets only the master web/api process needs.""" + if name == "DECNET_JWT_SECRET": + return _require_env("DECNET_JWT_SECRET") + raise AttributeError(f"module 'decnet.env' has no attribute {name!r}") diff --git a/decnet/fleet.py b/decnet/fleet.py index 01a38c4..f41dbee 100644 --- a/decnet/fleet.py +++ b/decnet/fleet.py @@ -17,8 +17,11 @@ from decnet.services.registry import all_services def all_service_names() -> list[str]: - """Return all registered service names from the live plugin registry.""" - return sorted(all_services().keys()) + """Return all registered per-decky service names (excludes fleet singletons).""" + return sorted( + name for name, svc in all_services().items() + if not svc.fleet_singleton + ) def resolve_distros( diff --git a/decnet/logging/__init__.py b/decnet/logging/__init__.py index e69de29..73f6102 100644 --- a/decnet/logging/__init__.py +++ b/decnet/logging/__init__.py @@ -0,0 +1,92 @@ +""" +DECNET application logging helpers. + +Usage: + from decnet.logging import get_logger + log = get_logger("engine") # APP-NAME in RFC 5424 output becomes "engine" + +The returned logger propagates to the root logger (configured in config.py with +Rfc5424Formatter), so level control via DECNET_DEVELOPER still applies globally. 
+ +When ``DECNET_DEVELOPER_TRACING`` is active, every LogRecord is enriched with +``otel_trace_id`` and ``otel_span_id`` from the current OTEL span context. +This lets you correlate log lines with Jaeger traces — click a log entry and +jump straight to the span that produced it. +""" + +from __future__ import annotations + +import logging + + +class _ComponentFilter(logging.Filter): + """Injects *decnet_component* onto every LogRecord so Rfc5424Formatter can + use it as the RFC 5424 APP-NAME field instead of the hardcoded "decnet".""" + + def __init__(self, component: str) -> None: + super().__init__() + self.component = component + + def filter(self, record: logging.LogRecord) -> bool: + record.decnet_component = self.component # type: ignore[attr-defined] + return True + + +class _TraceContextFilter(logging.Filter): + """Injects ``otel_trace_id`` and ``otel_span_id`` onto every LogRecord + from the active OTEL span context. + + Installed once by ``enable_trace_context()`` on the root ``decnet`` logger + so all child loggers inherit the enrichment via propagation. + + When no span is active, both fields are set to ``"0"`` (cheap string + comparison downstream, no None-checks needed). + """ + + def filter(self, record: logging.LogRecord) -> bool: + try: + from opentelemetry import trace + span = trace.get_current_span() + ctx = span.get_span_context() + if ctx and ctx.trace_id: + record.otel_trace_id = format(ctx.trace_id, "032x") # type: ignore[attr-defined] + record.otel_span_id = format(ctx.span_id, "016x") # type: ignore[attr-defined] + else: + record.otel_trace_id = "0" # type: ignore[attr-defined] + record.otel_span_id = "0" # type: ignore[attr-defined] + except Exception: + record.otel_trace_id = "0" # type: ignore[attr-defined] + record.otel_span_id = "0" # type: ignore[attr-defined] + return True + + +_trace_filter_installed: bool = False + + +def enable_trace_context() -> None: + """Install the OTEL trace-context filter on the root ``decnet`` logger. 
+ + Called once from ``decnet.telemetry.setup_tracing()`` after the + TracerProvider is initialised. Safe to call multiple times (idempotent). + """ + global _trace_filter_installed + if _trace_filter_installed: + return + root = logging.getLogger("decnet") + root.addFilter(_TraceContextFilter()) + _trace_filter_installed = True + + +def get_logger(component: str) -> logging.Logger: + """Return a named logger that self-identifies as *component* in RFC 5424. + + Valid components: cli, engine, api, mutator, collector. + + The logger is named ``decnet.`` and propagates normally, so the + root handler (Rfc5424Formatter + level gate from DECNET_DEVELOPER) handles + output. Calling this function multiple times for the same component is safe. + """ + logger = logging.getLogger(f"decnet.{component}") + if not any(isinstance(f, _ComponentFilter) for f in logger.filters): + logger.addFilter(_ComponentFilter(component)) + return logger diff --git a/decnet/logging/file_handler.py b/decnet/logging/file_handler.py index 50a83d1..e806c39 100644 --- a/decnet/logging/file_handler.py +++ b/decnet/logging/file_handler.py @@ -13,29 +13,37 @@ import logging.handlers import os from pathlib import Path +from decnet.logging.inode_aware_handler import InodeAwareRotatingFileHandler +from decnet.privdrop import chown_to_invoking_user, chown_tree_to_invoking_user +from decnet.telemetry import traced as _traced + _LOG_FILE_ENV = "DECNET_LOG_FILE" _DEFAULT_LOG_FILE = "/var/log/decnet/decnet.log" _MAX_BYTES = 10 * 1024 * 1024 # 10 MB _BACKUP_COUNT = 5 -_handler: logging.handlers.RotatingFileHandler | None = None +_handler: InodeAwareRotatingFileHandler | None = None _logger: logging.Logger | None = None -def _get_logger() -> logging.Logger: +@_traced("logging.init_file_handler") +def _init_file_handler() -> logging.Logger: + """One-time initialisation of the rotating file handler.""" global _handler, _logger - if _logger is not None: - return _logger log_path = Path(os.environ.get(_LOG_FILE_ENV, 
_DEFAULT_LOG_FILE)) log_path.parent.mkdir(parents=True, exist_ok=True) + # When running under sudo, hand the parent dir back to the invoking user + # so a subsequent non-root `decnet api` can also write to it. + chown_tree_to_invoking_user(log_path.parent) - _handler = logging.handlers.RotatingFileHandler( + _handler = InodeAwareRotatingFileHandler( log_path, maxBytes=_MAX_BYTES, backupCount=_BACKUP_COUNT, encoding="utf-8", ) + chown_to_invoking_user(log_path) _handler.setFormatter(logging.Formatter("%(message)s")) _logger = logging.getLogger("decnet.syslog") @@ -46,6 +54,12 @@ def _get_logger() -> logging.Logger: return _logger +def _get_logger() -> logging.Logger: + if _logger is not None: + return _logger + return _init_file_handler() + + def write_syslog(line: str) -> None: """Write a single RFC 5424 syslog line to the rotating log file.""" try: diff --git a/decnet/logging/forwarder.py b/decnet/logging/forwarder.py index 9ddbd07..31b0e1e 100644 --- a/decnet/logging/forwarder.py +++ b/decnet/logging/forwarder.py @@ -11,6 +11,8 @@ shared utilities for validating and parsing the log_target string. import socket +from decnet.telemetry import traced as _traced + def parse_log_target(log_target: str) -> tuple[str, int]: """ @@ -23,6 +25,7 @@ def parse_log_target(log_target: str) -> tuple[str, int]: return parts[0], int(parts[1]) +@_traced("logging.probe_log_target") def probe_log_target(log_target: str, timeout: float = 2.0) -> bool: """ Return True if the log target is reachable (TCP connect succeeds). diff --git a/decnet/logging/inode_aware_handler.py b/decnet/logging/inode_aware_handler.py new file mode 100644 index 0000000..3a7aad7 --- /dev/null +++ b/decnet/logging/inode_aware_handler.py @@ -0,0 +1,60 @@ +""" +RotatingFileHandler that detects external deletion or rotation. + +Stdlib ``RotatingFileHandler`` holds an open file descriptor for the +lifetime of the handler. 
If the target file is deleted (``rm``) or +rotated out (``logrotate`` without ``copytruncate``), the handler keeps +writing to the now-orphaned inode until its own size-based rotation +finally triggers — silently losing every line in between. + +Stdlib ``WatchedFileHandler`` solves exactly this problem but doesn't +rotate by size. This subclass combines both: before each emit we stat +the configured path and compare its inode/device to the currently open +file; on mismatch we close and reopen. + +Cheap: one ``os.stat`` per log record. Matches the pattern used by +``decnet/collector/worker.py:_reopen_if_needed``. +""" +from __future__ import annotations + +import logging +import logging.handlers +import os + + +class InodeAwareRotatingFileHandler(logging.handlers.RotatingFileHandler): + """RotatingFileHandler that reopens the target on external rotation/deletion.""" + + def _should_reopen(self) -> bool: + if self.stream is None: + return True + try: + disk_stat = os.stat(self.baseFilename) + except FileNotFoundError: + return True + except OSError: + return False + try: + open_stat = os.fstat(self.stream.fileno()) + except OSError: + return True + return (disk_stat.st_ino != open_stat.st_ino + or disk_stat.st_dev != open_stat.st_dev) + + def emit(self, record: logging.LogRecord) -> None: + if self._should_reopen(): + try: + if self.stream is not None: + self.close() + except Exception: # nosec B110 + pass + try: + self.stream = self._open() + except OSError: + # A logging handler MUST NOT crash its caller. If we can't + # reopen (e.g. file is root-owned after `sudo decnet deploy` + # and the current process is non-root), defer to the stdlib + # error path, which just prints a traceback to stderr. 
+ self.handleError(record) + return + super().emit(record) diff --git a/decnet/logging/syslog_formatter.py b/decnet/logging/syslog_formatter.py index 6d43244..5745bba 100644 --- a/decnet/logging/syslog_formatter.py +++ b/decnet/logging/syslog_formatter.py @@ -5,7 +5,7 @@ Produces fully-compliant syslog messages: 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG Facility: local0 (16) -PEN for structured data: decnet@55555 +PEN for structured data: relay@55555 """ from __future__ import annotations @@ -16,7 +16,7 @@ from typing import Any FACILITY_LOCAL0 = 16 NILVALUE = "-" -_SD_ID = "decnet@55555" +_SD_ID = "relay@55555" SEVERITY_INFO = 6 SEVERITY_WARNING = 4 diff --git a/decnet/models.py b/decnet/models.py index 1db29f2..ed5f955 100644 --- a/decnet/models.py +++ b/decnet/models.py @@ -99,6 +99,9 @@ class DeckyConfig(BaseModel): mutate_interval: int | None = None # automatic rotation interval in minutes last_mutated: float = 0.0 # timestamp of last mutation last_login_attempt: float = 0.0 # timestamp of most recent interaction + # SWARM: the SwarmHost.uuid that runs this decky. None in unihost mode + # so existing state files deserialize unchanged. 
+ host_uuid: str | None = None @field_validator("services") @classmethod diff --git a/decnet/mutator/engine.py b/decnet/mutator/engine.py index 6d97e23..0e4a925 100644 --- a/decnet/mutator/engine.py +++ b/decnet/mutator/engine.py @@ -14,22 +14,28 @@ from decnet.fleet import all_service_names from decnet.composer import write_compose from decnet.config import DeckyConfig, DecnetConfig from decnet.engine import _compose_with_retry +from decnet.logging import get_logger +from decnet.telemetry import traced as _traced from pathlib import Path import anyio import asyncio from decnet.web.db.repository import BaseRepository +log = get_logger("mutator") console = Console() +@_traced("mutator.mutate_decky") async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool: """ Perform an Intra-Archetype Shuffle for a specific decky. Returns True if mutation succeeded, False otherwise. """ + log.debug("mutate_decky: start decky=%s", decky_name) state_dict = await repo.get_state("deployment") if state_dict is None: + log.error("mutate_decky: no active deployment found in database") console.print("[red]No active deployment found in database.[/]") return False @@ -73,25 +79,30 @@ async def mutate_decky(decky_name: str, repo: BaseRepository) -> bool: # Still writes files for Docker to use write_compose(config, compose_path) + log.info("mutation applied decky=%s services=%s", decky_name, ",".join(decky.services)) console.print(f"[cyan]Mutating '{decky_name}' to services: {', '.join(decky.services)}[/]") try: # Wrap blocking call in thread await anyio.to_thread.run_sync(_compose_with_retry, "up", "-d", "--remove-orphans", compose_path) except Exception as e: + log.error("mutation failed decky=%s error=%s", decky_name, e) console.print(f"[red]Failed to mutate '{decky_name}': {e}[/]") return False return True +@_traced("mutator.mutate_all") async def mutate_all(repo: BaseRepository, force: bool = False) -> None: """ Check all deckies and mutate those that are due. 
If force=True, mutates all deckies regardless of schedule. """ + log.debug("mutate_all: start force=%s", force) state_dict = await repo.get_state("deployment") if state_dict is None: + log.error("mutate_all: no active deployment found") console.print("[red]No active deployment found.[/]") return @@ -116,15 +127,21 @@ async def mutate_all(repo: BaseRepository, force: bool = False) -> None: mutated_count += 1 if mutated_count == 0 and not force: + log.debug("mutate_all: no deckies due for mutation") console.print("[dim]No deckies are due for mutation.[/]") + else: + log.info("mutate_all: complete mutated_count=%d", mutated_count) +@_traced("mutator.watch_loop") async def run_watch_loop(repo: BaseRepository, poll_interval_secs: int = 10) -> None: """Run an infinite loop checking for deckies that need mutation.""" + log.info("mutator watch loop started poll_interval_secs=%d", poll_interval_secs) console.print(f"[green]DECNET Mutator Watcher started (polling every {poll_interval_secs}s).[/]") try: while True: await mutate_all(force=False, repo=repo) await asyncio.sleep(poll_interval_secs) except KeyboardInterrupt: + log.info("mutator watch loop stopped") console.print("\n[dim]Mutator watcher stopped.[/]") diff --git a/decnet/network.py b/decnet/network.py index aa88432..17b0527 100644 --- a/decnet/network.py +++ b/decnet/network.py @@ -126,22 +126,57 @@ def allocate_ips( # Docker MACVLAN network # --------------------------------------------------------------------------- -def create_macvlan_network( +def _ensure_network( client: docker.DockerClient, + *, + driver: str, interface: str, subnet: str, gateway: str, ip_range: str, + extra_options: dict | None = None, ) -> None: - """Create the MACVLAN Docker network. 
No-op if it already exists.""" - existing = [n.name for n in client.networks.list()] - if MACVLAN_NETWORK_NAME in existing: - return + """Create the decnet docker network with ``driver``, replacing any + existing network of the same name that was built with a different driver. + + Why the replace-on-driver-mismatch: macvlan and ipvlan slaves can't + coexist on the same parent interface. If an earlier run left behind a + macvlan-driver network and we're now asked for ipvlan (or vice versa), + short-circuiting on name alone leaves Docker attaching new containers + to the old driver and the host NIC ends up EBUSY on the next port + create. So: when driver disagrees, disconnect everything and DROP it. + """ + options = {"parent": interface} + if extra_options: + options.update(extra_options) + + for net in client.networks.list(names=[MACVLAN_NETWORK_NAME]): + if net.attrs.get("Driver") == driver: + # Same driver — but if the IPAM pool drifted (different subnet, + # gateway, or ip-range than this deploy asks for), reusing it + # hands out addresses from the old pool and we race the real LAN. + # Compare and rebuild on mismatch. + pools = (net.attrs.get("IPAM") or {}).get("Config") or [] + cur = pools[0] if pools else {} + if ( + cur.get("Subnet") == subnet + and cur.get("Gateway") == gateway + and cur.get("IPRange") == ip_range + ): + return # right driver AND matching pool, leave it alone + # Driver mismatch OR IPAM drift — tear it down. Disconnect any live + # containers first so `remove()` doesn't refuse with ErrNetworkInUse. 
+ for cid in (net.attrs.get("Containers") or {}): + try: + net.disconnect(cid, force=True) + except docker.errors.APIError: + pass + net.remove() client.networks.create( name=MACVLAN_NETWORK_NAME, - driver="macvlan", - options={"parent": interface}, + driver=driver, + options=options, ipam=docker.types.IPAMConfig( driver="default", pool_configs=[ @@ -155,6 +190,21 @@ def create_macvlan_network( ) +def create_macvlan_network( + client: docker.DockerClient, + interface: str, + subnet: str, + gateway: str, + ip_range: str, +) -> None: + """Create the MACVLAN Docker network, replacing an ipvlan-driver one of + the same name if necessary (parent-NIC can't host both drivers).""" + _ensure_network( + client, driver="macvlan", interface=interface, + subnet=subnet, gateway=gateway, ip_range=ip_range, + ) + + def create_ipvlan_network( client: docker.DockerClient, interface: str, @@ -162,25 +212,12 @@ def create_ipvlan_network( gateway: str, ip_range: str, ) -> None: - """Create an IPvlan L2 Docker network. No-op if it already exists.""" - existing = [n.name for n in client.networks.list()] - if MACVLAN_NETWORK_NAME in existing: - return - - client.networks.create( - name=MACVLAN_NETWORK_NAME, - driver="ipvlan", - options={"parent": interface, "ipvlan_mode": "l2"}, - ipam=docker.types.IPAMConfig( - driver="default", - pool_configs=[ - docker.types.IPAMPool( - subnet=subnet, - gateway=gateway, - iprange=ip_range, - ) - ], - ), + """Create an IPvlan L2 Docker network, replacing a macvlan-driver one of + the same name if necessary (parent-NIC can't host both drivers).""" + _ensure_network( + client, driver="ipvlan", interface=interface, + subnet=subnet, gateway=gateway, ip_range=ip_range, + extra_options={"ipvlan_mode": "l2"}, ) @@ -204,10 +241,14 @@ def _require_root() -> None: def setup_host_macvlan(interface: str, host_macvlan_ip: str, decky_ip_range: str) -> None: """ Create a macvlan interface on the host so the deployer can reach deckies. 
- Idempotent — skips steps that are already done. + Idempotent — skips steps that are already done. Drops a stale ipvlan + host-helper first: the two drivers can share a parent NIC on paper but + leaving the opposite helper in place is just cruft after a driver swap. """ _require_root() + _run(["ip", "link", "del", HOST_IPVLAN_IFACE], check=False) + # Check if interface already exists result = _run(["ip", "link", "show", HOST_MACVLAN_IFACE], check=False) if result.returncode != 0: @@ -227,10 +268,14 @@ def teardown_host_macvlan(decky_ip_range: str) -> None: def setup_host_ipvlan(interface: str, host_ipvlan_ip: str, decky_ip_range: str) -> None: """ Create an IPvlan interface on the host so the deployer can reach deckies. - Idempotent — skips steps that are already done. + Idempotent — skips steps that are already done. Drops a stale macvlan + host-helper first so a prior macvlan deploy doesn't leave its slave + dangling on the parent NIC after the driver swap. """ _require_root() + _run(["ip", "link", "del", HOST_MACVLAN_IFACE], check=False) + result = _run(["ip", "link", "show", HOST_IPVLAN_IFACE], check=False) if result.returncode != 0: _run(["ip", "link", "add", HOST_IPVLAN_IFACE, "link", interface, "type", "ipvlan", "mode", "l2"]) diff --git a/decnet/privdrop.py b/decnet/privdrop.py new file mode 100644 index 0000000..0403335 --- /dev/null +++ b/decnet/privdrop.py @@ -0,0 +1,67 @@ +""" +Helpers for dropping root ownership on files created during privileged +operations (e.g. `sudo decnet deploy` needs root for MACVLAN, but its log +files should be owned by the invoking user so a subsequent non-root +`decnet api` can append to them). + +When sudo invokes a process, it sets SUDO_UID / SUDO_GID in the +environment to the original user's IDs. We use those to chown files +back after creation. 
+""" +from __future__ import annotations + +import os +from pathlib import Path +from typing import Optional + + +def _sudo_ids() -> Optional[tuple[int, int]]: + """Return (uid, gid) of the sudo-invoking user, or None when the + process was not launched via sudo / the env vars are missing.""" + raw_uid = os.environ.get("SUDO_UID") + raw_gid = os.environ.get("SUDO_GID") + if not raw_uid or not raw_gid: + return None + try: + return int(raw_uid), int(raw_gid) + except ValueError: + return None + + +def chown_to_invoking_user(path: str | os.PathLike[str]) -> None: + """Best-effort chown of *path* to the sudo-invoking user. + + No-op when: + * not running as root (nothing to drop), + * not launched via sudo (no SUDO_UID/SUDO_GID), + * the path does not exist, + * chown fails (logged-only — never raises). + """ + if os.geteuid() != 0: + return + ids = _sudo_ids() + if ids is None: + return + uid, gid = ids + p = Path(path) + if not p.exists(): + return + try: + os.chown(p, uid, gid) + except OSError: + # Best-effort; a failed chown is not fatal to logging. + pass + + +def chown_tree_to_invoking_user(root: str | os.PathLike[str]) -> None: + """Apply :func:`chown_to_invoking_user` to *root* and every file/dir + beneath it. Used for parent directories that we just created with + ``mkdir(parents=True)`` as root.""" + if os.geteuid() != 0 or _sudo_ids() is None: + return + root_path = Path(root) + if not root_path.exists(): + return + chown_to_invoking_user(root_path) + for entry in root_path.rglob("*"): + chown_to_invoking_user(entry) diff --git a/decnet/prober/__init__.py b/decnet/prober/__init__.py new file mode 100644 index 0000000..52a2051 --- /dev/null +++ b/decnet/prober/__init__.py @@ -0,0 +1,13 @@ +""" +DECNET-PROBER — standalone active network probing service. + +Runs as a detached host-level process (no container). Sends crafted TLS +probes to discover C2 frameworks and other attacker infrastructure via +JARM fingerprinting. 
Results are written as RFC 5424 syslog + JSON to the +same log file the collector uses, so the existing ingestion pipeline picks +them up automatically. +""" + +from decnet.prober.worker import prober_worker + +__all__ = ["prober_worker"] diff --git a/decnet/prober/hassh.py b/decnet/prober/hassh.py new file mode 100644 index 0000000..ef1999a --- /dev/null +++ b/decnet/prober/hassh.py @@ -0,0 +1,252 @@ +""" +HASSHServer — SSH server fingerprinting via KEX_INIT algorithm ordering. + +Connects to an SSH server, completes the version exchange, captures the +server's SSH_MSG_KEXINIT message, and hashes the server-to-client algorithm +fields (kex, encryption, MAC, compression) into a 32-character MD5 digest. + +This is the *server* variant of HASSH (HASSHServer). It fingerprints what +the server *offers*, which identifies the SSH implementation (OpenSSH, +Paramiko, libssh, Cobalt Strike SSH, etc.). + +Stdlib only (socket, struct, hashlib) plus decnet.telemetry for tracing (zero-cost when disabled). +""" + +from __future__ import annotations + +import hashlib +import socket +import struct +from typing import Any + +from decnet.telemetry import traced as _traced + +# SSH protocol constants +_SSH_MSG_KEXINIT = 20 +_KEX_INIT_COOKIE_LEN = 16 +_KEX_INIT_NAME_LISTS = 10 # 10 name-list fields in KEX_INIT + +# Blend in as a normal OpenSSH client +_CLIENT_BANNER = b"SSH-2.0-OpenSSH_9.6\r\n" + +# Max bytes to read for server banner +_MAX_BANNER_LEN = 256 + +# Max bytes for a single SSH packet (KEX_INIT is typically < 2KB) +_MAX_PACKET_LEN = 35000 + + +# ─── SSH connection + KEX_INIT capture ────────────────────────────────────── + +@_traced("prober.hassh_ssh_connect") +def _ssh_connect( + host: str, + port: int, + timeout: float, +) -> tuple[str, bytes] | None: + """ + TCP connect, exchange version strings, read server's KEX_INIT. + + Returns (server_banner, kex_init_payload) or None on failure. + The kex_init_payload starts at the SSH_MSG_KEXINIT type byte. 
+ """ + sock = None + try: + sock = socket.create_connection((host, port), timeout=timeout) + sock.settimeout(timeout) + + # 1. Read server banner (line ending \r\n or \n) + banner = _read_banner(sock) + if banner is None or not banner.startswith("SSH-"): + return None + + # 2. Send our client version string + sock.sendall(_CLIENT_BANNER) + + # 3. Read the server's first binary packet (should be KEX_INIT) + payload = _read_ssh_packet(sock) + if payload is None or len(payload) < 1: + return None + + if payload[0] != _SSH_MSG_KEXINIT: + return None + + return (banner, payload) + + except (OSError, socket.timeout, TimeoutError, ConnectionError): + return None + finally: + if sock is not None: + try: + sock.close() + except OSError: + pass + + +def _read_banner(sock: socket.socket) -> str | None: + """Read the SSH version banner line from the socket.""" + buf = b"" + while len(buf) < _MAX_BANNER_LEN: + try: + byte = sock.recv(1) + except (OSError, socket.timeout, TimeoutError): + return None + if not byte: + return None + buf += byte + if buf.endswith(b"\n"): + break + + try: + return buf.decode("utf-8", errors="replace").rstrip("\r\n") + except Exception: + return None + + +def _read_ssh_packet(sock: socket.socket) -> bytes | None: + """ + Read a single SSH binary packet and return its payload. 
+ + SSH binary packet format: + uint32 packet_length (not including itself or MAC) + byte padding_length + byte[] payload (packet_length - padding_length - 1) + byte[] padding + """ + header = _recv_exact(sock, 4) + if header is None: + return None + + packet_length = struct.unpack("!I", header)[0] + if packet_length < 2 or packet_length > _MAX_PACKET_LEN: + return None + + rest = _recv_exact(sock, packet_length) + if rest is None: + return None + + padding_length = rest[0] + payload_length = packet_length - padding_length - 1 + if payload_length < 1 or payload_length > len(rest) - 1: + return None + + return rest[1 : 1 + payload_length] + + +def _recv_exact(sock: socket.socket, n: int) -> bytes | None: + """Read exactly n bytes from socket, or None on failure.""" + buf = b"" + while len(buf) < n: + try: + chunk = sock.recv(n - len(buf)) + except (OSError, socket.timeout, TimeoutError): + return None + if not chunk: + return None + buf += chunk + return buf + + +# ─── KEX_INIT parsing ────────────────────────────────────────────────────── + +def _parse_kex_init(payload: bytes) -> dict[str, str] | None: + """ + Parse SSH_MSG_KEXINIT payload and extract the 10 name-list fields. + + Payload layout: + byte SSH_MSG_KEXINIT (20) + byte[16] cookie + 10 × name-list: + uint32 length + byte[] utf-8 string (comma-separated algorithm names) + bool first_kex_packet_follows + uint32 reserved + + Returns dict with keys: kex_algorithms, server_host_key_algorithms, + encryption_client_to_server, encryption_server_to_client, + mac_client_to_server, mac_server_to_client, + compression_client_to_server, compression_server_to_client, + languages_client_to_server, languages_server_to_client. 
+ """ + if len(payload) < 1 + _KEX_INIT_COOKIE_LEN + 4: + return None + + offset = 1 + _KEX_INIT_COOKIE_LEN # skip type byte + cookie + + field_names = [ + "kex_algorithms", + "server_host_key_algorithms", + "encryption_client_to_server", + "encryption_server_to_client", + "mac_client_to_server", + "mac_server_to_client", + "compression_client_to_server", + "compression_server_to_client", + "languages_client_to_server", + "languages_server_to_client", + ] + + fields: dict[str, str] = {} + for name in field_names: + if offset + 4 > len(payload): + return None + length = struct.unpack("!I", payload[offset : offset + 4])[0] + offset += 4 + if offset + length > len(payload): + return None + fields[name] = payload[offset : offset + length].decode( + "utf-8", errors="replace" + ) + offset += length + + return fields + + +# ─── HASSH computation ────────────────────────────────────────────────────── + +def _compute_hassh(kex: str, enc: str, mac: str, comp: str) -> str: + """ + Compute HASSHServer hash: MD5 of "kex;enc_s2c;mac_s2c;comp_s2c". + + Returns 32-character lowercase hex digest. + """ + raw = f"{kex};{enc};{mac};{comp}" + return hashlib.md5(raw.encode("utf-8"), usedforsecurity=False).hexdigest() + + +# ─── Public API ───────────────────────────────────────────────────────────── + +@_traced("prober.hassh_server") +def hassh_server( + host: str, + port: int, + timeout: float = 5.0, +) -> dict[str, Any] | None: + """ + Connect to an SSH server and compute its HASSHServer fingerprint. + + Returns a dict with the hash, banner, and raw algorithm fields, + or None if the host is not running an SSH server on the given port. 
+ """ + result = _ssh_connect(host, port, timeout) + if result is None: + return None + + banner, payload = result + fields = _parse_kex_init(payload) + if fields is None: + return None + + kex = fields["kex_algorithms"] + enc = fields["encryption_server_to_client"] + mac = fields["mac_server_to_client"] + comp = fields["compression_server_to_client"] + + return { + "hassh_server": _compute_hassh(kex, enc, mac, comp), + "banner": banner, + "kex_algorithms": kex, + "encryption_s2c": enc, + "mac_s2c": mac, + "compression_s2c": comp, + } diff --git a/decnet/prober/jarm.py b/decnet/prober/jarm.py new file mode 100644 index 0000000..7cd1502 --- /dev/null +++ b/decnet/prober/jarm.py @@ -0,0 +1,506 @@ +""" +JARM TLS fingerprinting — pure stdlib implementation. + +JARM sends 10 crafted TLS ClientHello packets to a target, each varying +TLS version, cipher suite order, extensions, and ALPN values. The +ServerHello responses are parsed and hashed to produce a 62-character +fingerprint that identifies the TLS server implementation. + +Reference: https://github.com/salesforce/jarm + +Only DECNET import is decnet.telemetry for tracing (zero-cost when disabled). 
+""" + +from __future__ import annotations + +import hashlib +import socket +import struct +import time +from typing import Any + +from decnet.telemetry import traced as _traced + +# ─── Constants ──────────────────────────────────────────────────────────────── + +JARM_EMPTY_HASH = "0" * 62 + +_INTER_PROBE_DELAY = 0.1 # seconds between probes to avoid IDS triggers + +# TLS version bytes +_TLS_1_0 = b"\x03\x01" +_TLS_1_1 = b"\x03\x02" +_TLS_1_2 = b"\x03\x03" +_TLS_1_3 = b"\x03\x03" # TLS 1.3 uses 0x0303 in record layer + +# TLS record types +_CONTENT_HANDSHAKE = 0x16 +_HANDSHAKE_CLIENT_HELLO = 0x01 +_HANDSHAKE_SERVER_HELLO = 0x02 + +# Extension types +_EXT_SERVER_NAME = 0x0000 +_EXT_EC_POINT_FORMATS = 0x000B +_EXT_SUPPORTED_GROUPS = 0x000A +_EXT_SESSION_TICKET = 0x0023 +_EXT_ENCRYPT_THEN_MAC = 0x0016 +_EXT_EXTENDED_MASTER_SECRET = 0x0017 +_EXT_SIGNATURE_ALGORITHMS = 0x000D +_EXT_SUPPORTED_VERSIONS = 0x002B +_EXT_PSK_KEY_EXCHANGE_MODES = 0x002D +_EXT_KEY_SHARE = 0x0033 +_EXT_ALPN = 0x0010 +_EXT_PADDING = 0x0015 + +# ─── Cipher suite lists per JARM spec ──────────────────────────────────────── + +# Forward cipher order (standard) +_CIPHERS_FORWARD = [ + 0x0016, 0x0033, 0x0067, 0xC09E, 0xC0A2, 0x009E, 0x0039, 0x006B, + 0xC09F, 0xC0A3, 0x009F, 0x0045, 0x00BE, 0x0088, 0x00C4, 0x009A, + 0xC008, 0xC009, 0xC023, 0xC0AC, 0xC0AE, 0xC02B, 0xC00A, 0xC024, + 0xC0AD, 0xC0AF, 0xC02C, 0xC072, 0xC073, 0xCCA8, 0x1301, 0x1302, + 0x1303, 0xC013, 0xC014, 0xC02F, 0x009C, 0xC02E, 0x002F, 0x0035, + 0x000A, 0x0005, 0x0004, +] + +# Reverse cipher order +_CIPHERS_REVERSE = list(reversed(_CIPHERS_FORWARD)) + +# TLS 1.3-only ciphers +_CIPHERS_TLS13 = [0x1301, 0x1302, 0x1303] + +# Middle-out cipher order (interleaved from center) +def _middle_out(lst: list[int]) -> list[int]: + result: list[int] = [] + mid = len(lst) // 2 + for i in range(mid + 1): + if mid + i < len(lst): + result.append(lst[mid + i]) + if mid - i >= 0 and mid - i != mid + i: + result.append(lst[mid - i]) + return result + 
+_CIPHERS_MIDDLE_OUT = _middle_out(_CIPHERS_FORWARD) + +# Rare/uncommon extensions cipher list +_CIPHERS_RARE = [ + 0x0016, 0x0033, 0xC011, 0xC012, 0x0067, 0xC09E, 0xC0A2, 0x009E, + 0x0039, 0x006B, 0xC09F, 0xC0A3, 0x009F, 0x0045, 0x00BE, 0x0088, + 0x00C4, 0x009A, 0xC008, 0xC009, 0xC023, 0xC0AC, 0xC0AE, 0xC02B, + 0xC00A, 0xC024, 0xC0AD, 0xC0AF, 0xC02C, 0xC072, 0xC073, 0xCCA8, + 0x1301, 0x1302, 0x1303, 0xC013, 0xC014, 0xC02F, 0x009C, 0xC02E, + 0x002F, 0x0035, 0x000A, 0x0005, 0x0004, +] + + +# ─── Probe definitions ──────────────────────────────────────────────────────── + +# Each probe: (tls_version, cipher_list, tls13_support, alpn, extensions_style) +# tls_version: record-layer version bytes +# cipher_list: which cipher suite ordering to use +# tls13_support: whether to include TLS 1.3 extensions (supported_versions, key_share, psk) +# alpn: ALPN protocol string or None +# extensions_style: "standard", "rare", or "no_extensions" + +_PROBE_CONFIGS: list[dict[str, Any]] = [ + # 0: TLS 1.2 forward + {"version": _TLS_1_2, "ciphers": _CIPHERS_FORWARD, "tls13": False, "alpn": None, "style": "standard"}, + # 1: TLS 1.2 reverse + {"version": _TLS_1_2, "ciphers": _CIPHERS_REVERSE, "tls13": False, "alpn": None, "style": "standard"}, + # 2: TLS 1.1 forward + {"version": _TLS_1_1, "ciphers": _CIPHERS_FORWARD, "tls13": False, "alpn": None, "style": "standard"}, + # 3: TLS 1.3 forward + {"version": _TLS_1_2, "ciphers": _CIPHERS_FORWARD, "tls13": True, "alpn": "h2", "style": "standard"}, + # 4: TLS 1.3 reverse + {"version": _TLS_1_2, "ciphers": _CIPHERS_REVERSE, "tls13": True, "alpn": "h2", "style": "standard"}, + # 5: TLS 1.3 invalid (advertise 1.3 support but no key_share) + {"version": _TLS_1_2, "ciphers": _CIPHERS_FORWARD, "tls13": "no_key_share", "alpn": None, "style": "standard"}, + # 6: TLS 1.3 middle-out + {"version": _TLS_1_2, "ciphers": _CIPHERS_MIDDLE_OUT, "tls13": True, "alpn": None, "style": "standard"}, + # 7: TLS 1.0 forward + {"version": _TLS_1_0, "ciphers": 
_CIPHERS_FORWARD, "tls13": False, "alpn": None, "style": "standard"}, + # 8: TLS 1.2 middle-out + {"version": _TLS_1_2, "ciphers": _CIPHERS_MIDDLE_OUT, "tls13": False, "alpn": None, "style": "standard"}, + # 9: TLS 1.2 with rare extensions + {"version": _TLS_1_2, "ciphers": _CIPHERS_RARE, "tls13": False, "alpn": "http/1.1", "style": "rare"}, +] + + +# ─── Extension builders ────────────────────────────────────────────────────── + +def _ext(ext_type: int, data: bytes) -> bytes: + return struct.pack("!HH", ext_type, len(data)) + data + + +def _ext_sni(host: str) -> bytes: + host_bytes = host.encode("ascii") + # ServerNameList: length(2) + ServerName: type(1) + length(2) + name + sni_data = struct.pack("!HBH", len(host_bytes) + 3, 0, len(host_bytes)) + host_bytes + return _ext(_EXT_SERVER_NAME, sni_data) + + +def _ext_supported_groups() -> bytes: + groups = [0x0017, 0x0018, 0x0019, 0x001D, 0x0100, 0x0101] # secp256r1, secp384r1, secp521r1, x25519, ffdhe2048, ffdhe3072 + data = struct.pack("!H", len(groups) * 2) + b"".join(struct.pack("!H", g) for g in groups) + return _ext(_EXT_SUPPORTED_GROUPS, data) + + +def _ext_ec_point_formats() -> bytes: + formats = b"\x00" # uncompressed only + return _ext(_EXT_EC_POINT_FORMATS, struct.pack("B", len(formats)) + formats) + + +def _ext_signature_algorithms() -> bytes: + algos = [ + 0x0401, 0x0501, 0x0601, # RSA PKCS1 SHA256/384/512 + 0x0201, # RSA PKCS1 SHA1 + 0x0403, 0x0503, 0x0603, # ECDSA SHA256/384/512 + 0x0203, # ECDSA SHA1 + 0x0804, 0x0805, 0x0806, # RSA-PSS SHA256/384/512 + ] + data = struct.pack("!H", len(algos) * 2) + b"".join(struct.pack("!H", a) for a in algos) + return _ext(_EXT_SIGNATURE_ALGORITHMS, data) + + +def _ext_supported_versions_13() -> bytes: + versions = [0x0304, 0x0303] # TLS 1.3, 1.2 + data = struct.pack("B", len(versions) * 2) + b"".join(struct.pack("!H", v) for v in versions) + return _ext(_EXT_SUPPORTED_VERSIONS, data) + + +def _ext_psk_key_exchange_modes() -> bytes: + return 
_ext(_EXT_PSK_KEY_EXCHANGE_MODES, b"\x01\x01")  # psk_dhe_ke
+
+
+def _ext_key_share() -> bytes:
+    # x25519 key share with 32 zero bytes — a fixed value keeps the probe
+    # bytes deterministic, which is required for a stable JARM fingerprint
+    key_data = b"\x00" * 32
+    entry = struct.pack("!HH", 0x001D, 32) + key_data  # x25519 group
+    data = struct.pack("!H", len(entry)) + entry
+    return _ext(_EXT_KEY_SHARE, data)
+
+
+def _ext_alpn(protocol: str) -> bytes:
+    proto_bytes = protocol.encode("ascii")
+    proto_entry = struct.pack("B", len(proto_bytes)) + proto_bytes
+    data = struct.pack("!H", len(proto_entry)) + proto_entry
+    return _ext(_EXT_ALPN, data)
+
+
+def _ext_session_ticket() -> bytes:
+    return _ext(_EXT_SESSION_TICKET, b"")
+
+
+def _ext_encrypt_then_mac() -> bytes:
+    return _ext(_EXT_ENCRYPT_THEN_MAC, b"")
+
+
+def _ext_extended_master_secret() -> bytes:
+    return _ext(_EXT_EXTENDED_MASTER_SECRET, b"")
+
+
+def _ext_padding(target_length: int, current_length: int) -> bytes:
+    pad_needed = target_length - current_length - 4  # 4 bytes for ext type + length
+    if pad_needed < 0:
+        return b""
+    return _ext(_EXT_PADDING, b"\x00" * pad_needed)
+
+
+# ─── ClientHello builder ─────────────────────────────────────────────────────
+
+def _build_client_hello(probe_index: int, host: str = "localhost") -> bytes:
+    """
+    Construct one of 10 JARM-specified ClientHello packets.
+
+    Args:
+        probe_index: 0-9, selects the probe configuration
+        host: target hostname for SNI extension
+
+    Returns:
+        Complete TLS record bytes ready to send on the wire.
+ """ + cfg = _PROBE_CONFIGS[probe_index] + version: bytes = cfg["version"] + ciphers: list[int] = cfg["ciphers"] + tls13 = cfg["tls13"] + alpn: str | None = cfg["alpn"] + + # Random (32 bytes) + random_bytes = b"\x00" * 32 + + # Session ID (32 bytes, all zeros) + session_id = b"\x00" * 32 + + # Cipher suites + cipher_bytes = b"".join(struct.pack("!H", c) for c in ciphers) + cipher_data = struct.pack("!H", len(cipher_bytes)) + cipher_bytes + + # Compression methods (null only) + compression = b"\x01\x00" + + # Extensions + extensions = b"" + extensions += _ext_sni(host) + extensions += _ext_supported_groups() + extensions += _ext_ec_point_formats() + extensions += _ext_session_ticket() + extensions += _ext_encrypt_then_mac() + extensions += _ext_extended_master_secret() + extensions += _ext_signature_algorithms() + + if tls13 == True: # noqa: E712 + extensions += _ext_supported_versions_13() + extensions += _ext_psk_key_exchange_modes() + extensions += _ext_key_share() + elif tls13 == "no_key_share": + extensions += _ext_supported_versions_13() + extensions += _ext_psk_key_exchange_modes() + # Intentionally omit key_share + + if alpn: + extensions += _ext_alpn(alpn) + + ext_data = struct.pack("!H", len(extensions)) + extensions + + # ClientHello body + body = ( + version # client_version (2) + + random_bytes # random (32) + + struct.pack("B", len(session_id)) + session_id # session_id + + cipher_data # cipher_suites + + compression # compression_methods + + ext_data # extensions + ) + + # Handshake header: type(1) + length(3) + handshake = struct.pack("B", _HANDSHAKE_CLIENT_HELLO) + struct.pack("!I", len(body))[1:] + body + + # TLS record header: type(1) + version(2) + length(2) + record = struct.pack("B", _CONTENT_HANDSHAKE) + _TLS_1_0 + struct.pack("!H", len(handshake)) + handshake + + return record + + +# ─── ServerHello parser ────────────────────────────────────────────────────── + +def _parse_server_hello(data: bytes) -> str: + """ + Extract cipher suite and 
TLS version from a ServerHello response. + + Returns a pipe-delimited string "cipher|version|extensions" that forms + one component of the JARM hash, or "|||" on parse failure. + """ + try: + if len(data) < 6: + return "|||" + + # TLS record header + if data[0] != _CONTENT_HANDSHAKE: + return "|||" + + struct.unpack_from("!H", data, 1)[0] # record_version (unused) + record_len = struct.unpack_from("!H", data, 3)[0] + hs = data[5: 5 + record_len] + + if len(hs) < 4: + return "|||" + + # Handshake header + if hs[0] != _HANDSHAKE_SERVER_HELLO: + return "|||" + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + + if len(body) < 34: + return "|||" + + pos = 0 + # Server version + server_version = struct.unpack_from("!H", body, pos)[0] + pos += 2 + + # Random (32 bytes) + pos += 32 + + # Session ID + if pos >= len(body): + return "|||" + sid_len = body[pos] + pos += 1 + sid_len + + # Cipher suite + if pos + 2 > len(body): + return "|||" + cipher = struct.unpack_from("!H", body, pos)[0] + pos += 2 + + # Compression method + if pos >= len(body): + return "|||" + pos += 1 + + # Parse extensions for supported_versions (to detect actual TLS 1.3) + actual_version = server_version + extensions_str = "" + if pos + 2 <= len(body): + ext_total = struct.unpack_from("!H", body, pos)[0] + pos += 2 + ext_end = pos + ext_total + ext_types: list[str] = [] + while pos + 4 <= ext_end and pos + 4 <= len(body): + ext_type = struct.unpack_from("!H", body, pos)[0] + ext_len = struct.unpack_from("!H", body, pos + 2)[0] + ext_types.append(f"{ext_type:04x}") + + if ext_type == _EXT_SUPPORTED_VERSIONS and ext_len >= 2: + actual_version = struct.unpack_from("!H", body, pos + 4)[0] + + pos += 4 + ext_len + extensions_str = "-".join(ext_types) + + version_str = _version_to_str(actual_version) + cipher_str = f"{cipher:04x}" + + return f"{cipher_str}|{version_str}|{extensions_str}" + + except Exception: + return "|||" + + +def _version_to_str(version: int) -> str: + 
return { + 0x0304: "tls13", + 0x0303: "tls12", + 0x0302: "tls11", + 0x0301: "tls10", + 0x0300: "ssl30", + }.get(version, f"{version:04x}") + + +# ─── Probe sender ──────────────────────────────────────────────────────────── + +@_traced("prober.jarm_send_probe") +def _send_probe(host: str, port: int, hello: bytes, timeout: float = 5.0) -> bytes | None: + """ + Open a TCP connection, send the ClientHello, and read the ServerHello. + + Returns raw response bytes or None on any failure. + """ + try: + sock = socket.create_connection((host, port), timeout=timeout) + try: + sock.sendall(hello) + sock.settimeout(timeout) + response = b"" + while True: + chunk = sock.recv(1484) + if not chunk: + break + response += chunk + # We only need the first TLS record (ServerHello) + if len(response) >= 5: + record_len = struct.unpack_from("!H", response, 3)[0] + if len(response) >= 5 + record_len: + break + return response if response else None + finally: + sock.close() + except (OSError, socket.error, socket.timeout): + return None + + +# ─── JARM hash computation ─────────────────────────────────────────────────── + +def _compute_jarm(responses: list[str]) -> str: + """ + Compute the final 62-character JARM hash from 10 probe response strings. + + The first 30 characters are the raw cipher/version concatenation. + The remaining 32 characters are a truncated SHA256 of the extensions. 
+ """ + if all(r == "|||" for r in responses): + return JARM_EMPTY_HASH + + # Build the fuzzy hash + raw_parts: list[str] = [] + ext_parts: list[str] = [] + + for r in responses: + parts = r.split("|") + if len(parts) >= 3 and parts[0] != "": + cipher = parts[0] + version = parts[1] + extensions = parts[2] if len(parts) > 2 else "" + + # Map version to single char + ver_char = { + "tls13": "d", "tls12": "c", "tls11": "b", + "tls10": "a", "ssl30": "0", + }.get(version, "0") + + raw_parts.append(f"{cipher}{ver_char}") + ext_parts.append(extensions) + else: + raw_parts.append("000") + ext_parts.append("") + + # First 30 chars: cipher(4) + version(1) = 5 chars * 10 probes = 50... no + # JARM spec: first part is c|v per probe joined, then SHA256 of extensions + # Actual format: each response contributes 3 chars (cipher_first2 + ver_char) + # to the first 30, then all extensions hashed for the remaining 32. + + fuzzy_raw = "" + for r in responses: + parts = r.split("|") + if len(parts) >= 3 and parts[0] != "": + cipher = parts[0] # 4-char hex + version = parts[1] + ver_char = { + "tls13": "d", "tls12": "c", "tls11": "b", + "tls10": "a", "ssl30": "0", + }.get(version, "0") + fuzzy_raw += f"{cipher[0:2]}{ver_char}" + else: + fuzzy_raw += "000" + + # fuzzy_raw is 30 chars (3 * 10) + ext_str = ",".join(ext_parts) + ext_hash = hashlib.sha256(ext_str.encode()).hexdigest()[:32] + + return fuzzy_raw + ext_hash + + +# ─── Public API ────────────────────────────────────────────────────────────── + +@_traced("prober.jarm_hash") +def jarm_hash(host: str, port: int, timeout: float = 5.0) -> str: + """ + Compute the JARM fingerprint for a TLS server. + + Sends 10 crafted ClientHello packets and hashes the responses. + + Args: + host: target IP or hostname + port: target port + timeout: per-probe TCP timeout in seconds + + Returns: + 62-character JARM hash string, or all-zeros on total failure. 
+ """ + responses: list[str] = [] + + for i in range(10): + hello = _build_client_hello(i, host=host) + raw = _send_probe(host, port, hello, timeout=timeout) + if raw is not None: + parsed = _parse_server_hello(raw) + responses.append(parsed) + else: + responses.append("|||") + + if i < 9: + time.sleep(_INTER_PROBE_DELAY) + + return _compute_jarm(responses) diff --git a/decnet/prober/tcpfp.py b/decnet/prober/tcpfp.py new file mode 100644 index 0000000..a9c0b82 --- /dev/null +++ b/decnet/prober/tcpfp.py @@ -0,0 +1,227 @@ +""" +TCP/IP stack fingerprinting via SYN-ACK analysis. + +Sends a crafted TCP SYN packet to a target host:port, captures the +SYN-ACK response, and extracts OS/tool-identifying characteristics: +TTL, window size, DF bit, MSS, window scale, SACK support, timestamps, +and TCP options ordering. + +Uses scapy for packet crafting and parsing. Requires root/CAP_NET_RAW. +""" + +from __future__ import annotations + +import hashlib +import random +from typing import Any + +from decnet.telemetry import traced as _traced + +# Lazy-import scapy to avoid breaking non-root usage of HASSH/JARM. +# The actual import happens inside functions that need it. + +# ─── TCP option short codes ───────────────────────────────────────────────── + +_OPT_CODES: dict[str, str] = { + "MSS": "M", + "WScale": "W", + "SAckOK": "S", + "SAck": "S", + "Timestamp": "T", + "NOP": "N", + "EOL": "E", + "AltChkSum": "A", + "AltChkSumOpt": "A", + "UTO": "U", +} + + +# ─── Packet construction ─────────────────────────────────────────────────── + +@_traced("prober.tcpfp_send_syn") +def _send_syn( + host: str, + port: int, + timeout: float, +) -> Any | None: + """ + Craft a TCP SYN with common options and send it. Returns the + SYN-ACK response packet or None on timeout/failure. 
+ """ + from scapy.all import IP, TCP, conf, sr1 + + # Suppress scapy's noisy output + conf.verb = 0 + + src_port = random.randint(49152, 65535) # nosec B311 — ephemeral port, not crypto + + pkt = ( + IP(dst=host) + / TCP( + sport=src_port, + dport=port, + flags="S", + options=[ + ("MSS", 1460), + ("NOP", None), + ("WScale", 7), + ("NOP", None), + ("NOP", None), + ("Timestamp", (0, 0)), + ("SAckOK", b""), + ("EOL", None), + ], + ) + ) + + try: + resp = sr1(pkt, timeout=timeout, verbose=0) + except (OSError, PermissionError): + return None + + if resp is None: + return None + + # Verify it's a SYN-ACK (flags == 0x12) + from scapy.all import TCP as TCPLayer + if not resp.haslayer(TCPLayer): + return None + if resp[TCPLayer].flags != 0x12: # SYN-ACK + return None + + # Send RST to clean up half-open connection + _send_rst(host, port, src_port, resp) + + return resp + + +def _send_rst( + host: str, + dport: int, + sport: int, + resp: Any, +) -> None: + """Send RST to clean up the half-open connection.""" + try: + from scapy.all import IP, TCP, send + rst = ( + IP(dst=host) + / TCP( + sport=sport, + dport=dport, + flags="R", + seq=resp.ack, + ) + ) + send(rst, verbose=0) + except Exception: # nosec B110 — best-effort RST cleanup + pass + + +# ─── Response parsing ─────────────────────────────────────────────────────── + +def _parse_synack(resp: Any) -> dict[str, Any]: + """ + Extract fingerprint fields from a scapy SYN-ACK response packet. 
+ """ + from scapy.all import IP, TCP + + ip_layer = resp[IP] + tcp_layer = resp[TCP] + + # IP fields + ttl = ip_layer.ttl + df_bit = 1 if (ip_layer.flags & 0x2) else 0 # DF = bit 1 + ip_id = ip_layer.id + + # TCP fields + window_size = tcp_layer.window + + # Parse TCP options + mss = 0 + window_scale = -1 + sack_ok = 0 + timestamp = 0 + options_order = _extract_options_order(tcp_layer.options) + + for opt_name, opt_value in tcp_layer.options: + if opt_name == "MSS": + mss = opt_value + elif opt_name == "WScale": + window_scale = opt_value + elif opt_name in ("SAckOK", "SAck"): + sack_ok = 1 + elif opt_name == "Timestamp": + timestamp = 1 + + return { + "ttl": ttl, + "window_size": window_size, + "df_bit": df_bit, + "ip_id": ip_id, + "mss": mss, + "window_scale": window_scale, + "sack_ok": sack_ok, + "timestamp": timestamp, + "options_order": options_order, + } + + +def _extract_options_order(options: list[tuple[str, Any]]) -> str: + """ + Map scapy TCP option tuples to a short-code string. + + E.g. [("MSS", 1460), ("NOP", None), ("WScale", 7)] → "M,N,W" + """ + codes = [] + for opt_name, _ in options: + code = _OPT_CODES.get(opt_name, "?") + codes.append(code) + return ",".join(codes) + + +# ─── Fingerprint computation ─────────────────────────────────────────────── + +def _compute_fingerprint(fields: dict[str, Any]) -> tuple[str, str]: + """ + Compute fingerprint raw string and SHA256 hash from parsed fields. + + Returns (raw_string, hash_hex_32). 
+ """ + raw = ( + f"{fields['ttl']}:{fields['window_size']}:{fields['df_bit']}:" + f"{fields['mss']}:{fields['window_scale']}:{fields['sack_ok']}:" + f"{fields['timestamp']}:{fields['options_order']}" + ) + h = hashlib.sha256(raw.encode("utf-8")).hexdigest()[:32] + return raw, h + + +# ─── Public API ───────────────────────────────────────────────────────────── + +@_traced("prober.tcp_fingerprint") +def tcp_fingerprint( + host: str, + port: int, + timeout: float = 5.0, +) -> dict[str, Any] | None: + """ + Send a TCP SYN to host:port and fingerprint the SYN-ACK response. + + Returns a dict with the hash, raw fingerprint string, and individual + fields, or None if no SYN-ACK was received. + + Requires root/CAP_NET_RAW. + """ + resp = _send_syn(host, port, timeout) + if resp is None: + return None + + fields = _parse_synack(resp) + raw, h = _compute_fingerprint(fields) + + return { + "tcpfp_hash": h, + "tcpfp_raw": raw, + **fields, + } diff --git a/decnet/prober/worker.py b/decnet/prober/worker.py new file mode 100644 index 0000000..07e0aa0 --- /dev/null +++ b/decnet/prober/worker.py @@ -0,0 +1,478 @@ +""" +DECNET-PROBER standalone worker. + +Runs as a detached host-level process. Discovers attacker IPs by tailing the +collector's JSON log file, then fingerprints them via multiple active probes: +- JARM (TLS server fingerprinting) +- HASSHServer (SSH server fingerprinting) +- TCP/IP stack fingerprinting (OS/tool identification) + +Results are written as RFC 5424 syslog + JSON to the same log files. + +Target discovery is fully automatic — every unique attacker IP seen in the +log stream gets probed. No manual target list required. + +Tech debt: writing directly to the collector's log files couples the +prober to the collector's file format. A future refactor should introduce +a shared log-sink abstraction. 
+""" + +from __future__ import annotations + +import asyncio +import json +import re +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from decnet.logging import get_logger +from decnet.prober.hassh import hassh_server +from decnet.prober.jarm import JARM_EMPTY_HASH, jarm_hash +from decnet.prober.tcpfp import tcp_fingerprint +from decnet.telemetry import traced as _traced + +logger = get_logger("prober") + +# ─── Default ports per probe type ─────────────────────────────────────────── + +# JARM: common C2 callback / TLS server ports +DEFAULT_PROBE_PORTS: list[int] = [ + 443, 8443, 8080, 4443, 50050, 2222, 993, 995, 8888, 9001, +] + +# HASSHServer: common SSH server ports +DEFAULT_SSH_PORTS: list[int] = [22, 2222, 22222, 2022] + +# TCP/IP stack: probe on ports commonly open on attacker machines. +# Wide spread gives the best chance of a SYN-ACK for TTL/fingerprint extraction. +DEFAULT_TCPFP_PORTS: list[int] = [22, 80, 443, 8080, 8443, 445, 3389] + +# ─── RFC 5424 formatting (inline, mirrors templates/*/decnet_logging.py) ───── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_SEVERITY_INFO = 6 +_SEVERITY_WARNING = 4 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + + +def _sd_escape(value: str) -> str: + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return "-" + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def _syslog_line( + event_type: str, + severity: int = _SEVERITY_INFO, + msg: str | None = None, + **fields: Any, +) -> str: + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = datetime.now(timezone.utc).isoformat() + hostname = "decnet-prober" + appname = "prober" + msgid = (event_type or "-")[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {hostname} {appname} - {msgid} {sd}{message}" + + 
+# ─── RFC 5424 parser (subset of collector's, for JSON generation) ───────────── + +_RFC5424_RE = re.compile( + r"^<\d+>1 " + r"(\S+) " # 1: TIMESTAMP + r"(\S+) " # 2: HOSTNAME + r"(\S+) " # 3: APP-NAME + r"- " # PROCID + r"(\S+) " # 4: MSGID (event_type) + r"(.+)$", # 5: SD + MSG +) +_SD_BLOCK_RE = re.compile(r'\[relay@55555\s+(.*?)\]', re.DOTALL) +_PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"') +_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "ip", "target_ip") + + +def _parse_to_json(line: str) -> dict[str, Any] | None: + m = _RFC5424_RE.match(line) + if not m: + return None + ts_raw, decky, service, event_type, sd_rest = m.groups() + + fields: dict[str, str] = {} + msg = "" + + if sd_rest.startswith("["): + block = _SD_BLOCK_RE.search(sd_rest) + if block: + for k, v in _PARAM_RE.findall(block.group(1)): + fields[k] = v.replace('\\"', '"').replace("\\\\", "\\").replace("\\]", "]") + msg_match = re.search(r'\]\s+(.+)$', sd_rest) + if msg_match: + msg = msg_match.group(1).strip() + + attacker_ip = "Unknown" + for fname in _IP_FIELDS: + if fname in fields: + attacker_ip = fields[fname] + break + + try: + ts_formatted = datetime.fromisoformat(ts_raw).strftime("%Y-%m-%d %H:%M:%S") + except ValueError: + ts_formatted = ts_raw + + return { + "timestamp": ts_formatted, + "decky": decky, + "service": service, + "event_type": event_type, + "attacker_ip": attacker_ip, + "fields": fields, + "msg": msg, + "raw_line": line, + } + + +# ─── Log writer ────────────────────────────────────────────────────────────── + +def _write_event( + log_path: Path, + json_path: Path, + event_type: str, + severity: int = _SEVERITY_INFO, + msg: str | None = None, + **fields: Any, +) -> None: + line = _syslog_line(event_type, severity=severity, msg=msg, **fields) + + with open(log_path, "a", encoding="utf-8") as f: + f.write(line + "\n") + f.flush() + + parsed = _parse_to_json(line) + if parsed: + with open(json_path, "a", encoding="utf-8") as f: + f.write(json.dumps(parsed) + 
"\n") + f.flush() + + +# ─── Target discovery from log stream ──────────────────────────────────────── + +@_traced("prober.discover_attackers") +def _discover_attackers(json_path: Path, position: int) -> tuple[set[str], int]: + """ + Read new JSON log lines from the given position and extract unique + attacker IPs. Returns (new_ips, new_position). + + Only considers IPs that are not "Unknown" and come from events that + indicate real attacker interaction (not prober's own events). + """ + new_ips: set[str] = set() + + if not json_path.exists(): + return new_ips, position + + size = json_path.stat().st_size + if size < position: + position = 0 # file rotated + + if size == position: + return new_ips, position + + with open(json_path, "r", encoding="utf-8", errors="replace") as f: + f.seek(position) + while True: + line = f.readline() + if not line: + break + if not line.endswith("\n"): + break # partial line + + try: + record = json.loads(line.strip()) + except json.JSONDecodeError: + position = f.tell() + continue + + # Skip our own events + if record.get("service") == "prober": + position = f.tell() + continue + + ip = record.get("attacker_ip", "Unknown") + if ip != "Unknown" and ip: + new_ips.add(ip) + + position = f.tell() + + return new_ips, position + + +# ─── Probe cycle ───────────────────────────────────────────────────────────── + +@_traced("prober.probe_cycle") +def _probe_cycle( + targets: set[str], + probed: dict[str, dict[str, set[int]]], + jarm_ports: list[int], + ssh_ports: list[int], + tcpfp_ports: list[int], + log_path: Path, + json_path: Path, + timeout: float = 5.0, +) -> None: + """ + Probe all known attacker IPs with JARM, HASSH, and TCP/IP fingerprinting. 
+ + Args: + targets: set of attacker IPs to probe + probed: dict mapping IP -> {probe_type -> set of ports already probed} + jarm_ports: TLS ports for JARM fingerprinting + ssh_ports: SSH ports for HASSHServer fingerprinting + tcpfp_ports: ports for TCP/IP stack fingerprinting + log_path: RFC 5424 log file + json_path: JSON log file + timeout: per-probe TCP timeout + """ + for ip in sorted(targets): + ip_probed = probed.setdefault(ip, {}) + + # Phase 1: JARM (TLS fingerprinting) + _jarm_phase(ip, ip_probed, jarm_ports, log_path, json_path, timeout) + + # Phase 2: HASSHServer (SSH fingerprinting) + _hassh_phase(ip, ip_probed, ssh_ports, log_path, json_path, timeout) + + # Phase 3: TCP/IP stack fingerprinting + _tcpfp_phase(ip, ip_probed, tcpfp_ports, log_path, json_path, timeout) + + +@_traced("prober.jarm_phase") +def _jarm_phase( + ip: str, + ip_probed: dict[str, set[int]], + ports: list[int], + log_path: Path, + json_path: Path, + timeout: float, +) -> None: + """JARM-fingerprint an IP on the given TLS ports.""" + done = ip_probed.setdefault("jarm", set()) + for port in ports: + if port in done: + continue + try: + h = jarm_hash(ip, port, timeout=timeout) + done.add(port) + if h == JARM_EMPTY_HASH: + continue + _write_event( + log_path, json_path, + "jarm_fingerprint", + target_ip=ip, + target_port=str(port), + jarm_hash=h, + msg=f"JARM {ip}:{port} = {h}", + ) + logger.info("prober: JARM %s:%d = %s", ip, port, h) + except Exception as exc: + done.add(port) + _write_event( + log_path, json_path, + "prober_error", + severity=_SEVERITY_WARNING, + target_ip=ip, + target_port=str(port), + error=str(exc), + msg=f"JARM probe failed for {ip}:{port}: {exc}", + ) + logger.warning("prober: JARM probe failed %s:%d: %s", ip, port, exc) + + +@_traced("prober.hassh_phase") +def _hassh_phase( + ip: str, + ip_probed: dict[str, set[int]], + ports: list[int], + log_path: Path, + json_path: Path, + timeout: float, +) -> None: + """HASSHServer-fingerprint an IP on the given SSH 
ports.""" + done = ip_probed.setdefault("hassh", set()) + for port in ports: + if port in done: + continue + try: + result = hassh_server(ip, port, timeout=timeout) + done.add(port) + if result is None: + continue + _write_event( + log_path, json_path, + "hassh_fingerprint", + target_ip=ip, + target_port=str(port), + hassh_server_hash=result["hassh_server"], + ssh_banner=result["banner"], + kex_algorithms=result["kex_algorithms"], + encryption_s2c=result["encryption_s2c"], + mac_s2c=result["mac_s2c"], + compression_s2c=result["compression_s2c"], + msg=f"HASSH {ip}:{port} = {result['hassh_server']}", + ) + logger.info("prober: HASSH %s:%d = %s", ip, port, result["hassh_server"]) + except Exception as exc: + done.add(port) + _write_event( + log_path, json_path, + "prober_error", + severity=_SEVERITY_WARNING, + target_ip=ip, + target_port=str(port), + error=str(exc), + msg=f"HASSH probe failed for {ip}:{port}: {exc}", + ) + logger.warning("prober: HASSH probe failed %s:%d: %s", ip, port, exc) + + +@_traced("prober.tcpfp_phase") +def _tcpfp_phase( + ip: str, + ip_probed: dict[str, set[int]], + ports: list[int], + log_path: Path, + json_path: Path, + timeout: float, +) -> None: + """TCP/IP stack fingerprint an IP on the given ports.""" + done = ip_probed.setdefault("tcpfp", set()) + for port in ports: + if port in done: + continue + try: + result = tcp_fingerprint(ip, port, timeout=timeout) + done.add(port) + if result is None: + continue + _write_event( + log_path, json_path, + "tcpfp_fingerprint", + target_ip=ip, + target_port=str(port), + tcpfp_hash=result["tcpfp_hash"], + tcpfp_raw=result["tcpfp_raw"], + ttl=str(result["ttl"]), + window_size=str(result["window_size"]), + df_bit=str(result["df_bit"]), + mss=str(result["mss"]), + window_scale=str(result["window_scale"]), + sack_ok=str(result["sack_ok"]), + timestamp=str(result["timestamp"]), + options_order=result["options_order"], + msg=f"TCPFP {ip}:{port} = {result['tcpfp_hash']}", + ) + logger.info("prober: TCPFP 
%s:%d = %s", ip, port, result["tcpfp_hash"]) + except Exception as exc: + done.add(port) + _write_event( + log_path, json_path, + "prober_error", + severity=_SEVERITY_WARNING, + target_ip=ip, + target_port=str(port), + error=str(exc), + msg=f"TCPFP probe failed for {ip}:{port}: {exc}", + ) + logger.warning("prober: TCPFP probe failed %s:%d: %s", ip, port, exc) + + +# ─── Main worker ───────────────────────────────────────────────────────────── + +@_traced("prober.worker") +async def prober_worker( + log_file: str, + interval: int = 300, + timeout: float = 5.0, + ports: list[int] | None = None, + ssh_ports: list[int] | None = None, + tcpfp_ports: list[int] | None = None, +) -> None: + """ + Main entry point for the standalone prober process. + + Discovers attacker IPs automatically by tailing the JSON log file, + then fingerprints each IP via JARM, HASSH, and TCP/IP stack probes. + + Args: + log_file: base path for log files (RFC 5424 to .log, JSON to .json) + interval: seconds between probe cycles + timeout: per-probe TCP timeout + ports: JARM TLS ports (defaults to DEFAULT_PROBE_PORTS) + ssh_ports: HASSH SSH ports (defaults to DEFAULT_SSH_PORTS) + tcpfp_ports: TCP fingerprint ports (defaults to DEFAULT_TCPFP_PORTS) + """ + jarm_ports = ports or DEFAULT_PROBE_PORTS + hassh_ports = ssh_ports or DEFAULT_SSH_PORTS + tcp_ports = tcpfp_ports or DEFAULT_TCPFP_PORTS + + all_ports_str = ( + f"jarm={','.join(str(p) for p in jarm_ports)} " + f"ssh={','.join(str(p) for p in hassh_ports)} " + f"tcpfp={','.join(str(p) for p in tcp_ports)}" + ) + + log_path = Path(log_file) + json_path = log_path.with_suffix(".json") + log_path.parent.mkdir(parents=True, exist_ok=True) + + logger.info( + "prober started interval=%ds %s log=%s", + interval, all_ports_str, log_path, + ) + + _write_event( + log_path, json_path, + "prober_startup", + interval=str(interval), + probe_ports=all_ports_str, + msg=f"DECNET-PROBER started, interval {interval}s, {all_ports_str}", + ) + + known_attackers: 
set[str] = set() + probed: dict[str, dict[str, set[int]]] = {} # IP -> {type -> ports} + log_position: int = 0 + + while True: + # Discover new attacker IPs from the log stream + new_ips, log_position = await asyncio.to_thread( + _discover_attackers, json_path, log_position, + ) + + if new_ips - known_attackers: + fresh = new_ips - known_attackers + known_attackers.update(fresh) + logger.info( + "prober: discovered %d new attacker(s), total=%d", + len(fresh), len(known_attackers), + ) + + if known_attackers: + await asyncio.to_thread( + _probe_cycle, known_attackers, probed, + jarm_ports, hassh_ports, tcp_ports, + log_path, json_path, timeout, + ) + + await asyncio.sleep(interval) diff --git a/decnet/profiler/__init__.py b/decnet/profiler/__init__.py new file mode 100644 index 0000000..138ce0e --- /dev/null +++ b/decnet/profiler/__init__.py @@ -0,0 +1,5 @@ +"""DECNET profiler — standalone attacker profile builder worker.""" + +from decnet.profiler.worker import attacker_profile_worker + +__all__ = ["attacker_profile_worker"] diff --git a/decnet/profiler/behavioral.py b/decnet/profiler/behavioral.py new file mode 100644 index 0000000..38fc8db --- /dev/null +++ b/decnet/profiler/behavioral.py @@ -0,0 +1,602 @@ +""" +Behavioral and timing analysis for DECNET attacker profiles. + +Consumes the chronological `LogEvent` stream already built by +`decnet.correlation.engine.CorrelationEngine` and derives per-IP metrics: + + - Inter-event timing statistics (mean / median / stdev / min / max) + - Coefficient-of-variation (jitter metric) + - Beaconing vs. interactive vs. scanning vs. brute_force vs. slow_scan + classification + - Tool attribution against known C2 frameworks (Cobalt Strike, Sliver, + Havoc, Mythic) using default beacon/jitter profiles — returns a list, + since multiple tools can be in use simultaneously + - Header-based tool detection (Nmap NSE, Gophish, Nikto, sqlmap, etc.) 
+ from HTTP request events + - Recon → exfil phase sequencing (latency between the last recon event + and the first exfil-like event) + - OS / TCP fingerprint + retransmit rollup from sniffer-emitted events, + with TTL-based fallback when p0f returns no match + +Pure-Python; no external dependencies. All functions are safe to call from +both sync and async contexts. +""" + +from __future__ import annotations + +import json +import re +import statistics +from collections import Counter +from typing import Any + +from decnet.correlation.parser import LogEvent +from decnet.telemetry import traced as _traced, get_tracer as _get_tracer + +# ─── Event-type taxonomy ──────────────────────────────────────────────────── + +# Sniffer-emitted packet events that feed into fingerprint rollup. +_SNIFFER_SYN_EVENT: str = "tcp_syn_fingerprint" +_SNIFFER_FLOW_EVENT: str = "tcp_flow_timing" +# Prober-emitted active-probe result (SYN-ACK fingerprint of attacker machine). +_PROBER_TCPFP_EVENT: str = "tcpfp_fingerprint" + +# Canonical initial TTL for each coarse OS bucket. Used to derive hop +# distance when only the observed TTL is available (prober path). +_INITIAL_TTL: dict[str, int] = { + "linux": 64, + "windows": 128, + "embedded": 255, +} + +# Events that signal "recon" phase (scans, probes, auth attempts). +_RECON_EVENT_TYPES: frozenset[str] = frozenset({ + "scan", "connection", "banner", "probe", + "login_attempt", "auth", "auth_failure", +}) + +# Events that signal "exfil" / action-on-objective phase. +_EXFIL_EVENT_TYPES: frozenset[str] = frozenset({ + "download", "upload", "file_transfer", "data_exfil", + "command", "exec", "query", "shell_input", +}) + +# Fields carrying payload byte counts (for "large payload" detection). +_PAYLOAD_SIZE_FIELDS: tuple[str, ...] = ("bytes", "size", "content_length") + +# ─── C2 tool attribution signatures (beacon timing) ───────────────────────── +# +# Each entry lists the default beacon cadence profile of a popular C2. 
+# A profile *matches* an attacker when: +# - mean inter-event time is within ±`interval_tolerance` seconds, AND +# - jitter (cv = stdev / mean) is within ±`jitter_tolerance` +# +# Multiple matches are all returned (attacker may run multiple implants). + +_TOOL_SIGNATURES: tuple[dict[str, Any], ...] = ( + { + "name": "cobalt_strike", + "interval_s": 60.0, + "interval_tolerance_s": 8.0, + "jitter_cv": 0.20, + "jitter_tolerance": 0.05, + }, + { + "name": "sliver", + "interval_s": 60.0, + "interval_tolerance_s": 10.0, + "jitter_cv": 0.30, + "jitter_tolerance": 0.08, + }, + { + "name": "havoc", + "interval_s": 45.0, + "interval_tolerance_s": 8.0, + "jitter_cv": 0.10, + "jitter_tolerance": 0.03, + }, + { + "name": "mythic", + "interval_s": 30.0, + "interval_tolerance_s": 6.0, + "jitter_cv": 0.15, + "jitter_tolerance": 0.03, + }, +) + +# ─── Header-based tool signatures ─────────────────────────────────────────── +# +# Scanned against HTTP `request` events. `pattern` is a case-insensitive +# substring (or a regex anchored with ^ if it starts with that character). +# `header` is matched case-insensitively against the event's headers dict. + +_HEADER_TOOL_SIGNATURES: tuple[dict[str, str], ...] 
= ( + {"name": "nmap", "header": "user-agent", "pattern": "Nmap Scripting Engine"}, + {"name": "gophish", "header": "x-mailer", "pattern": "gophish"}, + {"name": "nikto", "header": "user-agent", "pattern": "Nikto"}, + {"name": "sqlmap", "header": "user-agent", "pattern": "sqlmap"}, + {"name": "nuclei", "header": "user-agent", "pattern": "Nuclei"}, + {"name": "masscan", "header": "user-agent", "pattern": "masscan"}, + {"name": "zgrab", "header": "user-agent", "pattern": "zgrab"}, + {"name": "metasploit", "header": "user-agent", "pattern": "Metasploit"}, + {"name": "curl", "header": "user-agent", "pattern": "^curl/"}, + {"name": "python_requests", "header": "user-agent", "pattern": "python-requests"}, + {"name": "gobuster", "header": "user-agent", "pattern": "gobuster"}, + {"name": "dirbuster", "header": "user-agent", "pattern": "DirBuster"}, + {"name": "hydra", "header": "user-agent", "pattern": "hydra"}, + {"name": "wfuzz", "header": "user-agent", "pattern": "Wfuzz"}, +) + +# ─── TTL → coarse OS bucket (fallback when p0f returns nothing) ───────────── + +def _os_from_ttl(ttl_str: str | None) -> str | None: + """Derive a coarse OS guess from observed TTL when p0f has no match.""" + if not ttl_str: + return None + try: + ttl = int(ttl_str) + except (TypeError, ValueError): + return None + if 55 <= ttl <= 70: + return "linux" + if 115 <= ttl <= 135: + return "windows" + if 235 <= ttl <= 255: + return "embedded" + return None + + +# ─── Timing stats ─────────────────────────────────────────────────────────── + +@_traced("profiler.timing_stats") +def timing_stats(events: list[LogEvent]) -> dict[str, Any]: + """ + Compute inter-arrival-time statistics across *events* (sorted by ts). + + Returns a dict with: + mean_iat_s, median_iat_s, stdev_iat_s, min_iat_s, max_iat_s, cv, + event_count, duration_s + + For n < 2 events the interval-based fields are None/0. 
+ """ + if not events: + return { + "event_count": 0, + "duration_s": 0.0, + "mean_iat_s": None, + "median_iat_s": None, + "stdev_iat_s": None, + "min_iat_s": None, + "max_iat_s": None, + "cv": None, + } + + sorted_events = sorted(events, key=lambda e: e.timestamp) + duration_s = (sorted_events[-1].timestamp - sorted_events[0].timestamp).total_seconds() + + if len(sorted_events) < 2: + return { + "event_count": len(sorted_events), + "duration_s": round(duration_s, 3), + "mean_iat_s": None, + "median_iat_s": None, + "stdev_iat_s": None, + "min_iat_s": None, + "max_iat_s": None, + "cv": None, + } + + iats = [ + (sorted_events[i].timestamp - sorted_events[i - 1].timestamp).total_seconds() + for i in range(1, len(sorted_events)) + ] + # Exclude spuriously-negative (clock-skew) intervals. + iats = [v for v in iats if v >= 0] + if not iats: + return { + "event_count": len(sorted_events), + "duration_s": round(duration_s, 3), + "mean_iat_s": None, + "median_iat_s": None, + "stdev_iat_s": None, + "min_iat_s": None, + "max_iat_s": None, + "cv": None, + } + + mean = statistics.fmean(iats) + median = statistics.median(iats) + stdev = statistics.pstdev(iats) if len(iats) > 1 else 0.0 + cv = (stdev / mean) if mean > 0 else None + + return { + "event_count": len(sorted_events), + "duration_s": round(duration_s, 3), + "mean_iat_s": round(mean, 3), + "median_iat_s": round(median, 3), + "stdev_iat_s": round(stdev, 3), + "min_iat_s": round(min(iats), 3), + "max_iat_s": round(max(iats), 3), + "cv": round(cv, 4) if cv is not None else None, + } + + +# ─── Behavior classification ──────────────────────────────────────────────── + +@_traced("profiler.classify_behavior") +def classify_behavior(stats: dict[str, Any], services_count: int) -> str: + """ + Coarse behavior bucket: + beaconing | interactive | scanning | brute_force | slow_scan | mixed | unknown + + Heuristics (evaluated in priority order): + * `scanning` — ≥ 3 services touched OR mean IAT < 2 s, ≥ 3 events + * `brute_force` — 
1 service, n ≥ 8, mean IAT < 5 s, CV < 0.6 + * `beaconing` — CV < 0.35, mean IAT ≥ 5 s, ≥ 4 events + * `slow_scan` — ≥ 2 services, mean IAT ≥ 10 s, ≥ 4 events + * `interactive` — mean IAT < 5 s AND CV ≥ 0.5, ≥ 6 events + * `mixed` — catch-all for sessions with enough data + * `unknown` — too few data points + """ + n = stats.get("event_count") or 0 + mean = stats.get("mean_iat_s") + cv = stats.get("cv") + + if n < 3 or mean is None: + return "unknown" + + # Slow scan / low-and-slow: multiple services with long gaps. + # Must be checked before generic scanning so slow multi-service sessions + # don't get mis-bucketed as a fast sweep. + if services_count >= 2 and mean >= 10.0 and n >= 4: + return "slow_scan" + + # Scanning: broad service sweep (multi-service) or very rapid single-service bursts. + if n >= 3 and ( + (services_count >= 3 and mean < 10.0) + or (services_count >= 2 and mean < 2.0) + ): + return "scanning" + + # Brute force: hammering one service rapidly and repeatedly. + if services_count == 1 and n >= 8 and mean < 5.0 and cv is not None and cv < 0.6: + return "brute_force" + + # Beaconing: regular cadence over multiple events. + if cv is not None and cv < 0.35 and mean >= 5.0 and n >= 4: + return "beaconing" + + # Interactive: short but irregular bursts (human or tool with think time). + if cv is not None and cv >= 0.5 and mean < 5.0 and n >= 6: + return "interactive" + + return "mixed" + + +# ─── C2 tool attribution (beacon timing) ──────────────────────────────────── + +def guess_tools(mean_iat_s: float | None, cv: float | None) -> list[str]: + """ + Match (mean_iat, cv) against known C2 default beacon profiles. + + Returns a list of all matching tool names (may be empty). Multiple + matches are all returned because an attacker can run several implants. 
+    """
+    if mean_iat_s is None or cv is None:
+        return []
+
+    hits: list[str] = []
+    for sig in _TOOL_SIGNATURES:
+        if abs(mean_iat_s - sig["interval_s"]) > sig["interval_tolerance_s"]:
+            continue
+        if abs(cv - sig["jitter_cv"]) > sig["jitter_tolerance"]:
+            continue
+        hits.append(sig["name"])
+
+    return hits
+
+
+# Deprecated single-string alias for guess_tools(), kept so old callers still
+# compile. Returns the hit only when exactly one tool matches, else None.
+def guess_tool(mean_iat_s: float | None, cv: float | None) -> str | None:
+    """Deprecated: use guess_tools() instead."""
+    hits = guess_tools(mean_iat_s, cv)
+    if len(hits) == 1:
+        return hits[0]
+    return None
+
+
+# ─── Header-based tool detection ──────────────────────────────────────────
+
+@_traced("profiler.detect_tools_from_headers")
+def detect_tools_from_headers(events: list[LogEvent]) -> list[str]:
+    """
+    Scan HTTP `request` events for tool-identifying headers.
+
+    Checks User-Agent, X-Mailer, and other headers case-insensitively
+    against `_HEADER_TOOL_SIGNATURES`. Returns a deduplicated list of
+    matched tool names in detection order.
+    """
+    found: list[str] = []
+    seen: set[str] = set()
+
+    for e in events:
+        if e.event_type != "request":
+            continue
+
+        raw_headers = e.fields.get("headers")
+        if not raw_headers:
+            continue
+
+        # headers may arrive as a JSON string, a Python-repr string (legacy),
+        # or a dict already (in-memory / test paths).
+        if isinstance(raw_headers, str):
+            try:
+                headers: dict[str, str] = json.loads(raw_headers)
+            except (json.JSONDecodeError, ValueError):
+                # Backward-compat: events written before the JSON-encode fix
+                # were serialized as Python repr via str(dict). ast.literal_eval
+                # handles that safely (no arbitrary code execution).
+ try: + import ast as _ast + _parsed = _ast.literal_eval(raw_headers) + if isinstance(_parsed, dict): + headers = _parsed + else: + continue + except Exception: # nosec B112 — skip unparseable header values + continue + elif isinstance(raw_headers, dict): + headers = raw_headers + else: + continue + + # Normalise header keys to lowercase for matching. + lc_headers: dict[str, str] = {k.lower(): str(v) for k, v in headers.items()} + + for sig in _HEADER_TOOL_SIGNATURES: + name = sig["name"] + if name in seen: + continue + value = lc_headers.get(sig["header"]) + if value is None: + continue + pattern = sig["pattern"] + if pattern.startswith("^"): + if re.match(pattern, value, re.IGNORECASE): + found.append(name) + seen.add(name) + else: + if pattern.lower() in value.lower(): + found.append(name) + seen.add(name) + + return found + + +# ─── Phase sequencing ─────────────────────────────────────────────────────── + +@_traced("profiler.phase_sequence") +def phase_sequence(events: list[LogEvent]) -> dict[str, Any]: + """ + Derive recon→exfil phase transition info. 
+ + Returns: + recon_end_ts : ISO timestamp of last recon-class event (or None) + exfil_start_ts : ISO timestamp of first exfil-class event (or None) + exfil_latency_s : seconds between them (None if not both present) + large_payload_count: count of events whose *fields* report a payload + ≥ 1 MiB (heuristic for bulk data transfer) + """ + recon_end = None + exfil_start = None + large_payload_count = 0 + + for e in sorted(events, key=lambda x: x.timestamp): + if e.event_type in _RECON_EVENT_TYPES: + recon_end = e.timestamp + elif e.event_type in _EXFIL_EVENT_TYPES and exfil_start is None: + exfil_start = e.timestamp + + for fname in _PAYLOAD_SIZE_FIELDS: + raw = e.fields.get(fname) + if raw is None: + continue + try: + if int(raw) >= 1_048_576: + large_payload_count += 1 + break + except (TypeError, ValueError): + continue + + latency: float | None = None + if recon_end is not None and exfil_start is not None and exfil_start >= recon_end: + latency = round((exfil_start - recon_end).total_seconds(), 3) + + return { + "recon_end_ts": recon_end.isoformat() if recon_end else None, + "exfil_start_ts": exfil_start.isoformat() if exfil_start else None, + "exfil_latency_s": latency, + "large_payload_count": large_payload_count, + } + + +# ─── Sniffer rollup (OS fingerprint + retransmits) ────────────────────────── + +@_traced("profiler.sniffer_rollup") +def sniffer_rollup(events: list[LogEvent]) -> dict[str, Any]: + """ + Roll up sniffer-emitted `tcp_syn_fingerprint` and `tcp_flow_timing` + events into a per-attacker summary. + + OS guess priority: + 1. Modal p0f label from os_guess field (if not "unknown"/empty). + 2. TTL-based coarse bucket (linux / windows / embedded) as fallback. + Hop distance: median of non-zero reported values only. 
+ """ + os_guesses: list[str] = [] + ttl_values: list[str] = [] + hops: list[int] = [] + tcp_fp: dict[str, Any] | None = None + retransmits = 0 + + for e in events: + if e.event_type == _SNIFFER_SYN_EVENT: + og = e.fields.get("os_guess") + if og and og != "unknown": + os_guesses.append(og) + + # Collect raw TTL for fallback OS derivation. + ttl_raw = e.fields.get("ttl") or e.fields.get("initial_ttl") + if ttl_raw: + ttl_values.append(ttl_raw) + + # Only include hop distances that are valid and non-zero. + hop_raw = e.fields.get("hop_distance") + if hop_raw: + try: + hop_val = int(hop_raw) + if hop_val > 0: + hops.append(hop_val) + except (TypeError, ValueError): + pass + + # Keep the latest fingerprint snapshot. + tcp_fp = { + "window": _int_or_none(e.fields.get("window")), + "wscale": _int_or_none(e.fields.get("wscale")), + "mss": _int_or_none(e.fields.get("mss")), + "options_sig": e.fields.get("options_sig", ""), + "has_sack": e.fields.get("has_sack") == "true", + "has_timestamps": e.fields.get("has_timestamps") == "true", + } + + elif e.event_type == _SNIFFER_FLOW_EVENT: + try: + retransmits += int(e.fields.get("retransmits", "0")) + except (TypeError, ValueError): + pass + + elif e.event_type == _PROBER_TCPFP_EVENT: + # Active-probe result: prober sent SYN to attacker, got SYN-ACK back. + # Field names differ from the passive sniffer (different emitter). + ttl_raw = e.fields.get("ttl") + if ttl_raw: + ttl_values.append(ttl_raw) + + # Derive hop distance from observed TTL vs canonical initial TTL. + os_hint = _os_from_ttl(ttl_raw) + if os_hint: + initial = _INITIAL_TTL.get(os_hint) + if initial: + try: + hop_val = initial - int(ttl_raw) + if hop_val > 0: + hops.append(hop_val) + except (TypeError, ValueError): + pass + + # Prober uses window_size/window_scale/options_order instead of + # the sniffer's window/wscale/options_sig. 
+ tcp_fp = { + "window": _int_or_none(e.fields.get("window_size")), + "wscale": _int_or_none(e.fields.get("window_scale")), + "mss": _int_or_none(e.fields.get("mss")), + "options_sig": e.fields.get("options_order", ""), + "has_sack": e.fields.get("sack_ok") == "1", + "has_timestamps": e.fields.get("timestamp") == "1", + } + + # Mode for the OS bucket — most frequently observed label. + os_guess: str | None = None + if os_guesses: + os_guess = Counter(os_guesses).most_common(1)[0][0] + else: + # TTL-based fallback: use the most common observed TTL value. + if ttl_values: + modal_ttl = Counter(ttl_values).most_common(1)[0][0] + os_guess = _os_from_ttl(modal_ttl) + + # Median hop distance (robust to the occasional weird TTL). + hop_distance: int | None = None + if hops: + hop_distance = int(statistics.median(hops)) + + return { + "os_guess": os_guess, + "hop_distance": hop_distance, + "tcp_fingerprint": tcp_fp or {}, + "retransmit_count": retransmits, + } + + +def _int_or_none(v: Any) -> int | None: + if v is None or v == "": + return None + try: + return int(v) + except (TypeError, ValueError): + return None + + +# ─── Composite: build the full AttackerBehavior record ────────────────────── + +@_traced("profiler.build_behavior_record") +def build_behavior_record(events: list[LogEvent]) -> dict[str, Any]: + """ + Build the dict to persist in the `attacker_behavior` table. + + Callers (profiler worker) pre-serialize JSON-typed fields; we do the + JSON encoding here to keep the repo layer schema-agnostic. + """ + # Timing stats are computed across *all* events (not filtered), because + # a C2 beacon often reuses the same "connection" event_type on each + # check-in. Filtering would throw that signal away. + stats = timing_stats(events) + services = {e.service for e in events} + behavior = classify_behavior(stats, len(services)) + rollup = sniffer_rollup(events) + phase = phase_sequence(events) + + # Combine beacon-timing tool matches with header-based detections. 
+ beacon_tools = guess_tools(stats.get("mean_iat_s"), stats.get("cv")) + header_tools = detect_tools_from_headers(events) + all_tools: list[str] = list(dict.fromkeys(beacon_tools + header_tools)) # dedup, preserve order + + # Promote TCP-level scanner identification to tool_guesses. + # p0f fingerprints nmap from the TCP handshake alone — this fires even + # when no HTTP service is present, making it far more reliable than the + # header-based path for raw port scans. + if rollup["os_guess"] == "nmap" and "nmap" not in all_tools: + all_tools.insert(0, "nmap") + + # Beacon-specific projection: only surface interval/jitter when we've + # classified the flow as beaconing (otherwise these numbers are noise). + beacon_interval_s: float | None = None + beacon_jitter_pct: float | None = None + if behavior == "beaconing": + beacon_interval_s = stats.get("mean_iat_s") + cv = stats.get("cv") + beacon_jitter_pct = round(cv * 100, 2) if cv is not None else None + + _tracer = _get_tracer("profiler") + with _tracer.start_as_current_span("profiler.behavior_summary") as _span: + _span.set_attribute("behavior_class", behavior) + _span.set_attribute("os_guess", rollup["os_guess"] or "unknown") + _span.set_attribute("tool_count", len(all_tools)) + _span.set_attribute("event_count", stats.get("event_count", 0)) + if all_tools: + _span.set_attribute("tools", ",".join(all_tools)) + + return { + "os_guess": rollup["os_guess"], + "hop_distance": rollup["hop_distance"], + "tcp_fingerprint": json.dumps(rollup["tcp_fingerprint"]), + "retransmit_count": rollup["retransmit_count"], + "behavior_class": behavior, + "beacon_interval_s": beacon_interval_s, + "beacon_jitter_pct": beacon_jitter_pct, + "tool_guesses": json.dumps(all_tools), + "timing_stats": json.dumps(stats), + "phase_sequence": json.dumps(phase), + } diff --git a/decnet/profiler/worker.py b/decnet/profiler/worker.py new file mode 100644 index 0000000..3abaf8e --- /dev/null +++ b/decnet/profiler/worker.py @@ -0,0 +1,215 @@ +""" 
+Attacker profile builder — incremental background worker. + +Maintains a persistent CorrelationEngine and a log-ID cursor across cycles. +On cold start (first cycle or process restart), performs one full build from +all stored logs. Subsequent cycles fetch only new logs via the cursor, +ingest them into the existing engine, and rebuild profiles for affected IPs +only. + +Complexity per cycle: O(new_logs + affected_ips) instead of O(total_logs²). +""" + +from __future__ import annotations + +import asyncio +import json +from dataclasses import dataclass, field +from datetime import datetime, timezone +from typing import Any + +from decnet.correlation.engine import CorrelationEngine +from decnet.correlation.parser import LogEvent +from decnet.logging import get_logger +from decnet.profiler.behavioral import build_behavior_record +from decnet.telemetry import traced as _traced, get_tracer as _get_tracer +from decnet.web.db.repository import BaseRepository + +logger = get_logger("attacker_worker") + +_BATCH_SIZE = 500 +_STATE_KEY = "attacker_worker_cursor" + +# Event types that indicate active command/query execution (not just connection/scan) +_COMMAND_EVENT_TYPES = frozenset({ + "command", "exec", "query", "input", "shell_input", + "execute", "run", "sql_query", "redis_command", +}) + +# Fields that carry the executed command/query text +_COMMAND_FIELDS = ("command", "query", "input", "line", "sql", "cmd") + + +@dataclass +class _WorkerState: + engine: CorrelationEngine = field(default_factory=CorrelationEngine) + last_log_id: int = 0 + initialized: bool = False + + +async def attacker_profile_worker(repo: BaseRepository, *, interval: int = 30) -> None: + """Periodically updates the Attacker table incrementally. 
Designed to run as an asyncio Task.""" + logger.info("attacker profile worker started interval=%ds", interval) + state = _WorkerState() + _saved_cursor = await repo.get_state(_STATE_KEY) + if _saved_cursor: + state.last_log_id = _saved_cursor.get("last_log_id", 0) + state.initialized = True + logger.info("attacker worker: resumed from cursor last_log_id=%d", state.last_log_id) + while True: + await asyncio.sleep(interval) + try: + await _incremental_update(repo, state) + except Exception as exc: + logger.error("attacker worker: update failed: %s", exc) + + +@_traced("profiler.incremental_update") +async def _incremental_update(repo: BaseRepository, state: _WorkerState) -> None: + was_cold = not state.initialized + affected_ips: set[str] = set() + + while True: + batch = await repo.get_logs_after_id(state.last_log_id, limit=_BATCH_SIZE) + if not batch: + break + + for row in batch: + event = state.engine.ingest(row["raw_line"]) + if event and event.attacker_ip: + affected_ips.add(event.attacker_ip) + state.last_log_id = row["id"] + + await asyncio.sleep(0) # yield to event loop after each batch + + if len(batch) < _BATCH_SIZE: + break + + state.initialized = True + + if not affected_ips: + await repo.set_state(_STATE_KEY, {"last_log_id": state.last_log_id}) + return + + await _update_profiles(repo, state, affected_ips) + await repo.set_state(_STATE_KEY, {"last_log_id": state.last_log_id}) + + if was_cold: + logger.info("attacker worker: cold start rebuilt %d profiles", len(affected_ips)) + else: + logger.info("attacker worker: updated %d profiles (incremental)", len(affected_ips)) + + +@_traced("profiler.update_profiles") +async def _update_profiles( + repo: BaseRepository, + state: _WorkerState, + ips: set[str], +) -> None: + traversal_map = {t.attacker_ip: t for t in state.engine.traversals(min_deckies=2)} + bounties_map = await repo.get_bounties_for_ips(ips) + + _tracer = _get_tracer("profiler") + for ip in ips: + events = state.engine._events.get(ip, []) + if 
not events: + continue + + with _tracer.start_as_current_span("profiler.process_ip") as _span: + _span.set_attribute("attacker_ip", ip) + _span.set_attribute("event_count", len(events)) + + traversal = traversal_map.get(ip) + bounties = bounties_map.get(ip, []) + commands = _extract_commands_from_events(events) + + record = _build_record(ip, events, traversal, bounties, commands) + attacker_uuid = await repo.upsert_attacker(record) + + _span.set_attribute("is_traversal", traversal is not None) + _span.set_attribute("bounty_count", len(bounties)) + _span.set_attribute("command_count", len(commands)) + + # Behavioral / fingerprint rollup lives in a sibling table so failures + # here never block the core attacker profile upsert. + try: + behavior = build_behavior_record(events) + await repo.upsert_attacker_behavior(attacker_uuid, behavior) + except Exception as exc: + _span.record_exception(exc) + logger.error("attacker worker: behavior upsert failed for %s: %s", ip, exc) + + +def _build_record( + ip: str, + events: list[LogEvent], + traversal: Any, + bounties: list[dict[str, Any]], + commands: list[dict[str, Any]], +) -> dict[str, Any]: + services = sorted({e.service for e in events}) + deckies = ( + traversal.deckies + if traversal + else _first_contact_deckies(events) + ) + fingerprints = [b for b in bounties if b.get("bounty_type") == "fingerprint"] + credential_count = sum(1 for b in bounties if b.get("bounty_type") == "credential") + + return { + "ip": ip, + "first_seen": min(e.timestamp for e in events), + "last_seen": max(e.timestamp for e in events), + "event_count": len(events), + "service_count": len(services), + "decky_count": len({e.decky for e in events}), + "services": json.dumps(services), + "deckies": json.dumps(deckies), + "traversal_path": traversal.path if traversal else None, + "is_traversal": traversal is not None, + "bounty_count": len(bounties), + "credential_count": credential_count, + "fingerprints": json.dumps(fingerprints), + "commands": 
json.dumps(commands), + "updated_at": datetime.now(timezone.utc), + } + + +def _first_contact_deckies(events: list[LogEvent]) -> list[str]: + """Return unique deckies in first-contact order (for non-traversal attackers).""" + seen: list[str] = [] + for e in sorted(events, key=lambda x: x.timestamp): + if e.decky not in seen: + seen.append(e.decky) + return seen + + +def _extract_commands_from_events(events: list[LogEvent]) -> list[dict[str, Any]]: + """ + Extract executed commands from LogEvent objects. + + Works directly on LogEvent.fields (already a dict), so no JSON parsing needed. + """ + commands: list[dict[str, Any]] = [] + for event in events: + if event.event_type not in _COMMAND_EVENT_TYPES: + continue + + cmd_text: str | None = None + for key in _COMMAND_FIELDS: + val = event.fields.get(key) + if val: + cmd_text = str(val) + break + + if not cmd_text: + continue + + commands.append({ + "service": event.service, + "decky": event.decky, + "command": cmd_text, + "timestamp": event.timestamp.isoformat(), + }) + + return commands diff --git a/decnet/services/base.py b/decnet/services/base.py index 17c2e20..2f7936f 100644 --- a/decnet/services/base.py +++ b/decnet/services/base.py @@ -13,6 +13,7 @@ class BaseService(ABC): name: str # unique slug, e.g. 
"ssh", "smb" ports: list[int] # ports this service listens on inside the container default_image: str # Docker image tag, or "build" if a Dockerfile is needed + fleet_singleton: bool = False # True = runs once fleet-wide, not per-decky @abstractmethod def compose_fragment( diff --git a/decnet/services/conpot.py b/decnet/services/conpot.py index 643eac6..5eacff6 100644 --- a/decnet/services/conpot.py +++ b/decnet/services/conpot.py @@ -32,4 +32,4 @@ class ConpotService(BaseService): } def dockerfile_context(self): - return Path(__file__).parent.parent.parent / "templates" / "conpot" + return Path(__file__).parent.parent / "templates" / "conpot" diff --git a/decnet/services/docker_api.py b/decnet/services/docker_api.py index 4cc8e89..d4db39c 100644 --- a/decnet/services/docker_api.py +++ b/decnet/services/docker_api.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "docker_api" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "docker_api" class DockerAPIService(BaseService): diff --git a/decnet/services/elasticsearch.py b/decnet/services/elasticsearch.py index 146cfca..d4bb65c 100644 --- a/decnet/services/elasticsearch.py +++ b/decnet/services/elasticsearch.py @@ -2,7 +2,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "elasticsearch" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "elasticsearch" class ElasticsearchService(BaseService): diff --git a/decnet/services/ftp.py b/decnet/services/ftp.py index d034c81..0a1cafe 100644 --- a/decnet/services/ftp.py +++ b/decnet/services/ftp.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "ftp" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "ftp" class FTPService(BaseService): 
diff --git a/decnet/services/http.py b/decnet/services/http.py index 28e2d47..56928de 100644 --- a/decnet/services/http.py +++ b/decnet/services/http.py @@ -2,7 +2,7 @@ import json from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "http" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "http" class HTTPService(BaseService): diff --git a/decnet/services/https.py b/decnet/services/https.py new file mode 100644 index 0000000..3c6735a --- /dev/null +++ b/decnet/services/https.py @@ -0,0 +1,59 @@ +import json +from pathlib import Path +from decnet.services.base import BaseService + +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "https" + + +class HTTPSService(BaseService): + name = "https" + ports = [443] + default_image = "build" + + def compose_fragment( + self, + decky_name: str, + log_target: str | None = None, + service_cfg: dict | None = None, + ) -> dict: + cfg = service_cfg or {} + fragment: dict = { + "build": {"context": str(TEMPLATES_DIR)}, + "container_name": f"{decky_name}-https", + "restart": "unless-stopped", + "environment": { + "NODE_NAME": decky_name, + }, + } + if log_target: + fragment["environment"]["LOG_TARGET"] = log_target + + # Optional persona overrides — only injected when explicitly set + if "server_header" in cfg: + fragment["environment"]["SERVER_HEADER"] = cfg["server_header"] + if "response_code" in cfg: + fragment["environment"]["RESPONSE_CODE"] = str(cfg["response_code"]) + if "fake_app" in cfg: + fragment["environment"]["FAKE_APP"] = cfg["fake_app"] + if "extra_headers" in cfg: + val = cfg["extra_headers"] + fragment["environment"]["EXTRA_HEADERS"] = ( + json.dumps(val) if isinstance(val, dict) else val + ) + if "custom_body" in cfg: + fragment["environment"]["CUSTOM_BODY"] = cfg["custom_body"] + if "files" in cfg: + files_path = str(Path(cfg["files"]).resolve()) + fragment["environment"]["FILES_DIR"] = "/opt/html_files" 
+ fragment.setdefault("volumes", []).append(f"{files_path}:/opt/html_files:ro") + if "tls_cert" in cfg: + fragment["environment"]["TLS_CERT"] = cfg["tls_cert"] + if "tls_key" in cfg: + fragment["environment"]["TLS_KEY"] = cfg["tls_key"] + if "tls_cn" in cfg: + fragment["environment"]["TLS_CN"] = cfg["tls_cn"] + + return fragment + + def dockerfile_context(self) -> Path | None: + return TEMPLATES_DIR diff --git a/decnet/services/imap.py b/decnet/services/imap.py index cf8d09f..902f57c 100644 --- a/decnet/services/imap.py +++ b/decnet/services/imap.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "imap" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "imap" class IMAPService(BaseService): diff --git a/decnet/services/k8s.py b/decnet/services/k8s.py index b5b3f24..32cc56d 100644 --- a/decnet/services/k8s.py +++ b/decnet/services/k8s.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "k8s" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "k8s" class KubernetesAPIService(BaseService): diff --git a/decnet/services/ldap.py b/decnet/services/ldap.py index 48db9f5..76eaa2d 100644 --- a/decnet/services/ldap.py +++ b/decnet/services/ldap.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "ldap" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "ldap" class LDAPService(BaseService): diff --git a/decnet/services/llmnr.py b/decnet/services/llmnr.py index 9dd4bc7..4319737 100644 --- a/decnet/services/llmnr.py +++ b/decnet/services/llmnr.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "llmnr" +TEMPLATES_DIR = 
Path(__file__).parent.parent / "templates" / "llmnr" class LLMNRService(BaseService): diff --git a/decnet/services/mongodb.py b/decnet/services/mongodb.py index 4dcad69..397faaf 100644 --- a/decnet/services/mongodb.py +++ b/decnet/services/mongodb.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "mongodb" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "mongodb" class MongoDBService(BaseService): diff --git a/decnet/services/mqtt.py b/decnet/services/mqtt.py index e85e14c..60d134f 100644 --- a/decnet/services/mqtt.py +++ b/decnet/services/mqtt.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "mqtt" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "mqtt" class MQTTService(BaseService): diff --git a/decnet/services/mssql.py b/decnet/services/mssql.py index 9658325..46b262d 100644 --- a/decnet/services/mssql.py +++ b/decnet/services/mssql.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "mssql" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "mssql" class MSSQLService(BaseService): diff --git a/decnet/services/mysql.py b/decnet/services/mysql.py index f8d15da..deb5b50 100644 --- a/decnet/services/mysql.py +++ b/decnet/services/mysql.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "mysql" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "mysql" class MySQLService(BaseService): diff --git a/decnet/services/pop3.py b/decnet/services/pop3.py index 5caba08..58e33ad 100644 --- a/decnet/services/pop3.py +++ b/decnet/services/pop3.py @@ -1,7 +1,7 @@ from pathlib import Path from 
decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "pop3" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "pop3" class POP3Service(BaseService): diff --git a/decnet/services/postgres.py b/decnet/services/postgres.py index 1dbcfa3..8a75ded 100644 --- a/decnet/services/postgres.py +++ b/decnet/services/postgres.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "postgres" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "postgres" class PostgresService(BaseService): diff --git a/decnet/services/rdp.py b/decnet/services/rdp.py index 7c9ac48..26057ff 100644 --- a/decnet/services/rdp.py +++ b/decnet/services/rdp.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "rdp" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "rdp" class RDPService(BaseService): diff --git a/decnet/services/redis.py b/decnet/services/redis.py index 263823c..b6c9b5c 100644 --- a/decnet/services/redis.py +++ b/decnet/services/redis.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "redis" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "redis" class RedisService(BaseService): diff --git a/decnet/services/sip.py b/decnet/services/sip.py index 0d50f65..0566539 100644 --- a/decnet/services/sip.py +++ b/decnet/services/sip.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "sip" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "sip" class SIPService(BaseService): diff --git a/decnet/services/smb.py b/decnet/services/smb.py index da96971..f6a43ca 100644 --- 
a/decnet/services/smb.py +++ b/decnet/services/smb.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "smb" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "smb" class SMBService(BaseService): diff --git a/decnet/services/smtp.py b/decnet/services/smtp.py index 3e616b5..364b38b 100644 --- a/decnet/services/smtp.py +++ b/decnet/services/smtp.py @@ -2,7 +2,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "smtp" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "smtp" class SMTPService(BaseService): diff --git a/decnet/services/smtp_relay.py b/decnet/services/smtp_relay.py index 7656e19..7144db5 100644 --- a/decnet/services/smtp_relay.py +++ b/decnet/services/smtp_relay.py @@ -4,7 +4,7 @@ from decnet.services.base import BaseService # Reuses the same template as the smtp service — only difference is # SMTP_OPEN_RELAY=1 in the environment, which enables the open relay persona. -_TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "smtp" +_TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "smtp" class SMTPRelayService(BaseService): diff --git a/decnet/services/sniffer.py b/decnet/services/sniffer.py new file mode 100644 index 0000000..5a12ea6 --- /dev/null +++ b/decnet/services/sniffer.py @@ -0,0 +1,41 @@ +from pathlib import Path +from decnet.services.base import BaseService + +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "sniffer" + + +class SnifferService(BaseService): + """ + Passive network sniffer deployed alongside deckies on the MACVLAN. + + Captures TLS handshakes in promiscuous mode and extracts JA3/JA3S hashes + plus connection metadata. Requires NET_RAW + NET_ADMIN capabilities. + No inbound ports — purely passive. 
+ """ + + name = "sniffer" + ports: list[int] = [] + default_image = "build" + fleet_singleton = True + + def compose_fragment( + self, + decky_name: str, + log_target: str | None = None, + service_cfg: dict | None = None, + ) -> dict: + fragment: dict = { + "build": {"context": str(TEMPLATES_DIR)}, + "container_name": f"{decky_name}-sniffer", + "restart": "unless-stopped", + "cap_add": ["NET_RAW", "NET_ADMIN"], + "environment": { + "NODE_NAME": decky_name, + }, + } + if log_target: + fragment["environment"]["LOG_TARGET"] = log_target + return fragment + + def dockerfile_context(self) -> Path | None: + return TEMPLATES_DIR diff --git a/decnet/services/snmp.py b/decnet/services/snmp.py index 613b426..0e67ce8 100644 --- a/decnet/services/snmp.py +++ b/decnet/services/snmp.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "snmp" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "snmp" class SNMPService(BaseService): diff --git a/decnet/services/ssh.py b/decnet/services/ssh.py index db2ce54..f721f82 100644 --- a/decnet/services/ssh.py +++ b/decnet/services/ssh.py @@ -2,7 +2,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "ssh" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "ssh" class SSHService(BaseService): @@ -32,16 +32,28 @@ class SSHService(BaseService): cfg = service_cfg or {} env: dict = { "SSH_ROOT_PASSWORD": cfg.get("password", "admin"), + # NODE_NAME is the authoritative decky identifier for log + # attribution — matches the host path used for the artifacts + # bind mount below. The container hostname (optionally overridden + # via SSH_HOSTNAME) is cosmetic and may differ to keep the + # decoy looking heterogeneous. 
+ "NODE_NAME": decky_name, } if "hostname" in cfg: env["SSH_HOSTNAME"] = cfg["hostname"] + # File-catcher quarantine: bind-mount a per-decky host dir so attacker + # drops (scp/sftp/wget) are mirrored out-of-band for forensic analysis. + # The in-container path masquerades as systemd-coredump so `mount`/`df` + # from inside the container looks benign. + quarantine_host = f"/var/lib/decnet/artifacts/{decky_name}/ssh" return { "build": {"context": str(TEMPLATES_DIR)}, "container_name": f"{decky_name}-ssh", "restart": "unless-stopped", "cap_add": ["NET_BIND_SERVICE"], "environment": env, + "volumes": [f"{quarantine_host}:/var/lib/systemd/coredump:rw"], } def dockerfile_context(self) -> Path: diff --git a/decnet/services/telnet.py b/decnet/services/telnet.py index f022fac..81ee798 100644 --- a/decnet/services/telnet.py +++ b/decnet/services/telnet.py @@ -2,7 +2,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "telnet" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "telnet" class TelnetService(BaseService): diff --git a/decnet/services/tftp.py b/decnet/services/tftp.py index 17ddd4c..a51ba7f 100644 --- a/decnet/services/tftp.py +++ b/decnet/services/tftp.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "tftp" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "tftp" class TFTPService(BaseService): diff --git a/decnet/services/vnc.py b/decnet/services/vnc.py index 63cfdee..0c5834e 100644 --- a/decnet/services/vnc.py +++ b/decnet/services/vnc.py @@ -1,7 +1,7 @@ from pathlib import Path from decnet.services.base import BaseService -TEMPLATES_DIR = Path(__file__).parent.parent.parent / "templates" / "vnc" +TEMPLATES_DIR = Path(__file__).parent.parent / "templates" / "vnc" class VNCService(BaseService): diff --git a/decnet/sniffer/__init__.py 
b/decnet/sniffer/__init__.py new file mode 100644 index 0000000..4428ea1 --- /dev/null +++ b/decnet/sniffer/__init__.py @@ -0,0 +1,11 @@ +""" +Fleet-wide MACVLAN sniffer microservice. + +Runs as a single host-side background task (not per-decky) that sniffs +all TLS traffic on the MACVLAN interface, extracts fingerprints, and +feeds events into the existing log pipeline. +""" + +from decnet.sniffer.worker import sniffer_worker + +__all__ = ["sniffer_worker"] diff --git a/decnet/sniffer/fingerprint.py b/decnet/sniffer/fingerprint.py new file mode 100644 index 0000000..cdc8455 --- /dev/null +++ b/decnet/sniffer/fingerprint.py @@ -0,0 +1,1166 @@ +""" +TLS fingerprinting engine for the fleet-wide MACVLAN sniffer. + +Extracted from templates/sniffer/server.py. All pure-Python TLS parsing, +JA3/JA3S/JA4/JA4S/JA4L computation, session tracking, and dedup logic +lives here. The packet callback is parameterized to accept an IP-to-decky +mapping and a write function, so it works for fleet-wide sniffing. +""" + +from __future__ import annotations + +import hashlib +import struct +import time +from typing import Any, Callable + +from decnet.prober.tcpfp import _extract_options_order +from decnet.sniffer.p0f import guess_os, hop_distance, initial_ttl +from decnet.sniffer.syslog import SEVERITY_INFO, SEVERITY_WARNING, syslog_line +from decnet.telemetry import traced as _traced, get_tracer as _get_tracer + +# ─── Constants ─────────────────────────────────────────────────────────────── + +SERVICE_NAME: str = "sniffer" + +_SESSION_TTL: float = 60.0 +_DEDUP_TTL: float = 300.0 + +# Inactivity after which a TCP flow is considered closed and its timing +# summary is flushed as an event. 
+_FLOW_IDLE_TIMEOUT: float = 120.0 + +_GREASE: frozenset[int] = frozenset(0x0A0A + i * 0x1010 for i in range(16)) + +_TLS_RECORD_HANDSHAKE: int = 0x16 +_TLS_HT_CLIENT_HELLO: int = 0x01 +_TLS_HT_SERVER_HELLO: int = 0x02 +_TLS_HT_CERTIFICATE: int = 0x0B + +_EXT_SNI: int = 0x0000 +_EXT_SUPPORTED_GROUPS: int = 0x000A +_EXT_EC_POINT_FORMATS: int = 0x000B +_EXT_SIGNATURE_ALGORITHMS: int = 0x000D +_EXT_ALPN: int = 0x0010 +_EXT_SESSION_TICKET: int = 0x0023 +_EXT_SUPPORTED_VERSIONS: int = 0x002B +_EXT_PRE_SHARED_KEY: int = 0x0029 +_EXT_EARLY_DATA: int = 0x002A + +_TCP_SYN: int = 0x02 +_TCP_ACK: int = 0x10 +_TCP_FIN: int = 0x01 +_TCP_RST: int = 0x04 + + +# ─── TCP option extraction for passive fingerprinting ─────────────────────── + +def _extract_tcp_fingerprint(tcp_options: list) -> dict[str, Any]: + """ + Extract MSS, window-scale, SACK, timestamp flags, and the options order + signature from a scapy TCP options list. + """ + mss = 0 + wscale: int | None = None + sack_ok = False + has_ts = False + for opt_name, opt_value in tcp_options or []: + if opt_name == "MSS": + mss = opt_value + elif opt_name == "WScale": + wscale = opt_value + elif opt_name in ("SAckOK", "SAck"): + sack_ok = True + elif opt_name == "Timestamp": + has_ts = True + options_sig = _extract_options_order(tcp_options or []) + return { + "mss": mss, + "wscale": wscale, + "sack_ok": sack_ok, + "has_timestamps": has_ts, + "options_sig": options_sig, + } + + +# ─── GREASE helpers ────────────────────────────────────────────────────────── + +def _is_grease(value: int) -> bool: + return value in _GREASE + + +def _filter_grease(values: list[int]) -> list[int]: + return [v for v in values if not _is_grease(v)] + + +# ─── TLS parsers ───────────────────────────────────────────────────────────── + +@_traced("sniffer.parse_client_hello") +def _parse_client_hello(data: bytes) -> dict[str, Any] | None: + try: + if len(data) < 6: + return None + if data[0] != _TLS_RECORD_HANDSHAKE: + return None + record_len = 
struct.unpack_from("!H", data, 3)[0] + if len(data) < 5 + record_len: + return None + + hs = data[5:] + if hs[0] != _TLS_HT_CLIENT_HELLO: + return None + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + if len(body) < 34: + return None + + pos = 0 + tls_version = struct.unpack_from("!H", body, pos)[0] + pos += 2 + pos += 32 # Random + + session_id_len = body[pos] + session_id = body[pos + 1: pos + 1 + session_id_len] + pos += 1 + session_id_len + + cs_len = struct.unpack_from("!H", body, pos)[0] + pos += 2 + cipher_suites = [ + struct.unpack_from("!H", body, pos + i * 2)[0] + for i in range(cs_len // 2) + ] + pos += cs_len + + comp_len = body[pos] + pos += 1 + comp_len + + extensions: list[int] = [] + supported_groups: list[int] = [] + ec_point_formats: list[int] = [] + signature_algorithms: list[int] = [] + supported_versions: list[int] = [] + sni: str = "" + alpn: list[str] = [] + has_session_ticket_data: bool = False + has_pre_shared_key: bool = False + has_early_data: bool = False + + if pos + 2 <= len(body): + ext_total = struct.unpack_from("!H", body, pos)[0] + pos += 2 + ext_end = pos + ext_total + + while pos + 4 <= ext_end: + ext_type = struct.unpack_from("!H", body, pos)[0] + ext_len = struct.unpack_from("!H", body, pos + 2)[0] + ext_data = body[pos + 4: pos + 4 + ext_len] + pos += 4 + ext_len + + if not _is_grease(ext_type): + extensions.append(ext_type) + + if ext_type == _EXT_SNI and len(ext_data) > 5: + sni = ext_data[5:].decode("ascii", errors="replace") + + elif ext_type == _EXT_SUPPORTED_GROUPS and len(ext_data) >= 2: + grp_len = struct.unpack_from("!H", ext_data, 0)[0] + supported_groups = [ + struct.unpack_from("!H", ext_data, 2 + i * 2)[0] + for i in range(grp_len // 2) + ] + + elif ext_type == _EXT_EC_POINT_FORMATS and len(ext_data) >= 1: + pf_len = ext_data[0] + ec_point_formats = list(ext_data[1: 1 + pf_len]) + + elif ext_type == _EXT_ALPN and len(ext_data) >= 2: + proto_list_len = 
struct.unpack_from("!H", ext_data, 0)[0] + ap = 2 + while ap < 2 + proto_list_len: + plen = ext_data[ap] + alpn.append(ext_data[ap + 1: ap + 1 + plen].decode("ascii", errors="replace")) + ap += 1 + plen + + elif ext_type == _EXT_SIGNATURE_ALGORITHMS and len(ext_data) >= 2: + sa_len = struct.unpack_from("!H", ext_data, 0)[0] + signature_algorithms = [ + struct.unpack_from("!H", ext_data, 2 + i * 2)[0] + for i in range(sa_len // 2) + ] + + elif ext_type == _EXT_SUPPORTED_VERSIONS and len(ext_data) >= 1: + sv_len = ext_data[0] + supported_versions = [ + struct.unpack_from("!H", ext_data, 1 + i * 2)[0] + for i in range(sv_len // 2) + ] + + elif ext_type == _EXT_SESSION_TICKET: + has_session_ticket_data = len(ext_data) > 0 + + elif ext_type == _EXT_PRE_SHARED_KEY: + has_pre_shared_key = True + + elif ext_type == _EXT_EARLY_DATA: + has_early_data = True + + filtered_ciphers = _filter_grease(cipher_suites) + filtered_groups = _filter_grease(supported_groups) + filtered_sig_algs = _filter_grease(signature_algorithms) + filtered_versions = _filter_grease(supported_versions) + + return { + "tls_version": tls_version, + "cipher_suites": filtered_ciphers, + "extensions": extensions, + "supported_groups": filtered_groups, + "ec_point_formats": ec_point_formats, + "signature_algorithms": filtered_sig_algs, + "supported_versions": filtered_versions, + "sni": sni, + "alpn": alpn, + "session_id": session_id, + "has_session_ticket_data": has_session_ticket_data, + "has_pre_shared_key": has_pre_shared_key, + "has_early_data": has_early_data, + } + + except Exception: + return None + + +@_traced("sniffer.parse_server_hello") +def _parse_server_hello(data: bytes) -> dict[str, Any] | None: + try: + if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE: + return None + + hs = data[5:] + if hs[0] != _TLS_HT_SERVER_HELLO: + return None + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + if len(body) < 35: + return None + + pos = 0 + tls_version = 
struct.unpack_from("!H", body, pos)[0] + pos += 2 + pos += 32 # Random + + session_id_len = body[pos] + pos += 1 + session_id_len + + if pos + 2 > len(body): + return None + + cipher_suite = struct.unpack_from("!H", body, pos)[0] + pos += 2 + pos += 1 # Compression method + + extensions: list[int] = [] + selected_version: int | None = None + alpn: str = "" + + if pos + 2 <= len(body): + ext_total = struct.unpack_from("!H", body, pos)[0] + pos += 2 + ext_end = pos + ext_total + while pos + 4 <= ext_end: + ext_type = struct.unpack_from("!H", body, pos)[0] + ext_len = struct.unpack_from("!H", body, pos + 2)[0] + ext_data = body[pos + 4: pos + 4 + ext_len] + pos += 4 + ext_len + if not _is_grease(ext_type): + extensions.append(ext_type) + + if ext_type == _EXT_SUPPORTED_VERSIONS and len(ext_data) >= 2: + selected_version = struct.unpack_from("!H", ext_data, 0)[0] + + elif ext_type == _EXT_ALPN and len(ext_data) >= 2: + proto_list_len = struct.unpack_from("!H", ext_data, 0)[0] + if proto_list_len > 0 and len(ext_data) >= 4: + plen = ext_data[2] + alpn = ext_data[3: 3 + plen].decode("ascii", errors="replace") + + return { + "tls_version": tls_version, + "cipher_suite": cipher_suite, + "extensions": extensions, + "selected_version": selected_version, + "alpn": alpn, + } + + except Exception: + return None + + +@_traced("sniffer.parse_certificate") +def _parse_certificate(data: bytes) -> dict[str, Any] | None: + try: + if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE: + return None + + hs = data[5:] + if hs[0] != _TLS_HT_CERTIFICATE: + return None + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + if len(body) < 3: + return None + + certs_len = struct.unpack_from("!I", b"\x00" + body[0:3])[0] + if certs_len == 0: + return None + + pos = 3 + if pos + 3 > len(body): + return None + cert_len = struct.unpack_from("!I", b"\x00" + body[pos:pos + 3])[0] + pos += 3 + if pos + cert_len > len(body): + return None + + cert_der = body[pos: pos 
+ cert_len] + return _parse_x509_der(cert_der) + + except Exception: + return None + + +# ─── Minimal DER/ASN.1 X.509 parser ───────────────────────────────────────── + +def _der_read_tag_len(data: bytes, pos: int) -> tuple[int, int, int]: + tag = data[pos] + pos += 1 + length_byte = data[pos] + pos += 1 + if length_byte & 0x80: + num_bytes = length_byte & 0x7F + length = int.from_bytes(data[pos: pos + num_bytes], "big") + pos += num_bytes + else: + length = length_byte + return tag, pos, length + + +def _der_read_sequence(data: bytes, pos: int) -> tuple[int, int]: + tag, content_start, length = _der_read_tag_len(data, pos) + return content_start, length + + +def _der_read_oid(data: bytes, pos: int, length: int) -> str: + if length < 1: + return "" + first = data[pos] + oid_parts = [str(first // 40), str(first % 40)] + val = 0 + for i in range(1, length): + b = data[pos + i] + val = (val << 7) | (b & 0x7F) + if not (b & 0x80): + oid_parts.append(str(val)) + val = 0 + return ".".join(oid_parts) + + +def _der_extract_cn(data: bytes, start: int, length: int) -> str: + pos = start + end = start + length + while pos < end: + set_tag, set_start, set_len = _der_read_tag_len(data, pos) + if set_tag != 0x31: + break + set_end = set_start + set_len + attr_pos = set_start + while attr_pos < set_end: + seq_tag, seq_start, seq_len = _der_read_tag_len(data, attr_pos) + if seq_tag != 0x30: + break + oid_tag, oid_start, oid_len = _der_read_tag_len(data, seq_start) + if oid_tag == 0x06: + oid = _der_read_oid(data, oid_start, oid_len) + if oid == "2.5.4.3": + val_tag, val_start, val_len = _der_read_tag_len(data, oid_start + oid_len) + return data[val_start: val_start + val_len].decode("utf-8", errors="replace") + attr_pos = seq_start + seq_len + pos = set_end + return "" + + +def _der_extract_name_str(data: bytes, start: int, length: int) -> str: + parts: list[str] = [] + pos = start + end = start + length + oid_names = { + "2.5.4.3": "CN", + "2.5.4.6": "C", + "2.5.4.7": "L", + 
"2.5.4.8": "ST", + "2.5.4.10": "O", + "2.5.4.11": "OU", + } + while pos < end: + set_tag, set_start, set_len = _der_read_tag_len(data, pos) + if set_tag != 0x31: + break + set_end = set_start + set_len + attr_pos = set_start + while attr_pos < set_end: + seq_tag, seq_start, seq_len = _der_read_tag_len(data, attr_pos) + if seq_tag != 0x30: + break + oid_tag, oid_start, oid_len = _der_read_tag_len(data, seq_start) + if oid_tag == 0x06: + oid = _der_read_oid(data, oid_start, oid_len) + val_tag, val_start, val_len = _der_read_tag_len(data, oid_start + oid_len) + val = data[val_start: val_start + val_len].decode("utf-8", errors="replace") + name = oid_names.get(oid, oid) + parts.append(f"{name}={val}") + attr_pos = seq_start + seq_len + pos = set_end + return ", ".join(parts) + + +def _parse_x509_der(cert_der: bytes) -> dict[str, Any] | None: + try: + outer_start, outer_len = _der_read_sequence(cert_der, 0) + tbs_tag, tbs_start, tbs_len = _der_read_tag_len(cert_der, outer_start) + tbs_end = tbs_start + tbs_len + pos = tbs_start + + if cert_der[pos] == 0xA0: + _, v_start, v_len = _der_read_tag_len(cert_der, pos) + pos = v_start + v_len + + _, sn_start, sn_len = _der_read_tag_len(cert_der, pos) + pos = sn_start + sn_len + + _, sa_start, sa_len = _der_read_tag_len(cert_der, pos) + pos = sa_start + sa_len + + issuer_tag, issuer_start, issuer_len = _der_read_tag_len(cert_der, pos) + issuer_str = _der_extract_name_str(cert_der, issuer_start, issuer_len) + issuer_cn = _der_extract_cn(cert_der, issuer_start, issuer_len) + pos = issuer_start + issuer_len + + val_tag, val_start, val_len = _der_read_tag_len(cert_der, pos) + nb_tag, nb_start, nb_len = _der_read_tag_len(cert_der, val_start) + not_before = cert_der[nb_start: nb_start + nb_len].decode("ascii", errors="replace") + na_tag, na_start, na_len = _der_read_tag_len(cert_der, nb_start + nb_len) + not_after = cert_der[na_start: na_start + na_len].decode("ascii", errors="replace") + pos = val_start + val_len + + subj_tag, 
subj_start, subj_len = _der_read_tag_len(cert_der, pos) + subject_cn = _der_extract_cn(cert_der, subj_start, subj_len) + subject_str = _der_extract_name_str(cert_der, subj_start, subj_len) + + self_signed = (issuer_cn == subject_cn) and subject_cn != "" + + pos = subj_start + subj_len + sans: list[str] = _extract_sans(cert_der, pos, tbs_end) + + return { + "subject_cn": subject_cn, + "subject": subject_str, + "issuer": issuer_str, + "issuer_cn": issuer_cn, + "not_before": not_before, + "not_after": not_after, + "self_signed": self_signed, + "sans": sans, + } + + except Exception: + return None + + +def _extract_sans(cert_der: bytes, pos: int, end: int) -> list[str]: + sans: list[str] = [] + try: + if pos >= end: + return sans + spki_tag, spki_start, spki_len = _der_read_tag_len(cert_der, pos) + pos = spki_start + spki_len + + while pos < end: + tag = cert_der[pos] + if tag == 0xA3: + _, ext_wrap_start, ext_wrap_len = _der_read_tag_len(cert_der, pos) + _, exts_start, exts_len = _der_read_tag_len(cert_der, ext_wrap_start) + epos = exts_start + eend = exts_start + exts_len + while epos < eend: + ext_tag, ext_start, ext_len = _der_read_tag_len(cert_der, epos) + ext_end = ext_start + ext_len + oid_tag, oid_start, oid_len = _der_read_tag_len(cert_der, ext_start) + if oid_tag == 0x06: + oid = _der_read_oid(cert_der, oid_start, oid_len) + if oid == "2.5.29.17": + vpos = oid_start + oid_len + if vpos < ext_end and cert_der[vpos] == 0x01: + _, bs, bl = _der_read_tag_len(cert_der, vpos) + vpos = bs + bl + if vpos < ext_end: + os_tag, os_start, os_len = _der_read_tag_len(cert_der, vpos) + if os_tag == 0x04: + sans = _parse_san_sequence(cert_der, os_start, os_len) + epos = ext_end + break + else: + _, skip_start, skip_len = _der_read_tag_len(cert_der, pos) + pos = skip_start + skip_len + except Exception: # nosec B110 — DER parse errors return partial results + pass + return sans + + +def _parse_san_sequence(data: bytes, start: int, length: int) -> list[str]: + names: list[str] 
= [] + try: + seq_tag, seq_start, seq_len = _der_read_tag_len(data, start) + pos = seq_start + end = seq_start + seq_len + while pos < end: + tag = data[pos] + _, val_start, val_len = _der_read_tag_len(data, pos) + context_tag = tag & 0x1F + if context_tag == 2: + names.append(data[val_start: val_start + val_len].decode("ascii", errors="replace")) + elif context_tag == 7 and val_len == 4: + names.append(".".join(str(b) for b in data[val_start: val_start + val_len])) + pos = val_start + val_len + except Exception: # nosec B110 — SAN parse errors return partial results + pass + return names + + +# ─── JA3 / JA3S ───────────────────────────────────────────────────────────── + +def _tls_version_str(version: int) -> str: + return { + 0x0301: "TLS 1.0", + 0x0302: "TLS 1.1", + 0x0303: "TLS 1.2", + 0x0304: "TLS 1.3", + 0x0200: "SSL 2.0", + 0x0300: "SSL 3.0", + }.get(version, f"0x{version:04x}") + + +@_traced("sniffer.ja3") +def _ja3(ch: dict[str, Any]) -> tuple[str, str]: + parts = [ + str(ch["tls_version"]), + "-".join(str(c) for c in ch["cipher_suites"]), + "-".join(str(e) for e in ch["extensions"]), + "-".join(str(g) for g in ch["supported_groups"]), + "-".join(str(p) for p in ch["ec_point_formats"]), + ] + ja3_str = ",".join(parts) + return ja3_str, hashlib.md5(ja3_str.encode(), usedforsecurity=False).hexdigest() + + +@_traced("sniffer.ja3s") +def _ja3s(sh: dict[str, Any]) -> tuple[str, str]: + parts = [ + str(sh["tls_version"]), + str(sh["cipher_suite"]), + "-".join(str(e) for e in sh["extensions"]), + ] + ja3s_str = ",".join(parts) + return ja3s_str, hashlib.md5(ja3s_str.encode(), usedforsecurity=False).hexdigest() + + +# ─── JA4 / JA4S ───────────────────────────────────────────────────────────── + +def _ja4_version(ch: dict[str, Any]) -> str: + versions = ch.get("supported_versions", []) + if versions: + best = max(versions) + else: + best = ch["tls_version"] + return { + 0x0304: "13", + 0x0303: "12", + 0x0302: "11", + 0x0301: "10", + 0x0300: "s3", + 0x0200: "s2", 
+ }.get(best, "00") + + +def _ja4_alpn_tag(alpn_list: list[str] | str) -> str: + if isinstance(alpn_list, str): + proto = alpn_list + elif alpn_list: + proto = alpn_list[0] + else: + return "00" + if not proto: + return "00" + if len(proto) == 1: + return proto[0] + proto[0] + return proto[0] + proto[-1] + + +def _sha256_12(text: str) -> str: + return hashlib.sha256(text.encode()).hexdigest()[:12] + + +@_traced("sniffer.ja4") +def _ja4(ch: dict[str, Any]) -> str: + proto = "t" + ver = _ja4_version(ch) + sni_flag = "d" if ch.get("sni") else "i" + cs_count = min(len(ch["cipher_suites"]), 99) + ext_count = min(len(ch["extensions"]), 99) + alpn_tag = _ja4_alpn_tag(ch.get("alpn", [])) + section_a = f"{proto}{ver}{sni_flag}{cs_count:02d}{ext_count:02d}{alpn_tag}" + sorted_cs = sorted(ch["cipher_suites"]) + section_b = _sha256_12(",".join(str(c) for c in sorted_cs)) + sorted_ext = sorted(ch["extensions"]) + sorted_sa = sorted(ch.get("signature_algorithms", [])) + ext_str = ",".join(str(e) for e in sorted_ext) + sa_str = ",".join(str(s) for s in sorted_sa) + combined = f"{ext_str}_{sa_str}" if sa_str else ext_str + section_c = _sha256_12(combined) + return f"{section_a}_{section_b}_{section_c}" + + +@_traced("sniffer.ja4s") +def _ja4s(sh: dict[str, Any]) -> str: + proto = "t" + selected = sh.get("selected_version") + if selected: + ver = {0x0304: "13", 0x0303: "12", 0x0302: "11", 0x0301: "10", + 0x0300: "s3", 0x0200: "s2"}.get(selected, "00") + else: + ver = {0x0304: "13", 0x0303: "12", 0x0302: "11", 0x0301: "10", + 0x0300: "s3", 0x0200: "s2"}.get(sh["tls_version"], "00") + ext_count = min(len(sh["extensions"]), 99) + alpn_tag = _ja4_alpn_tag(sh.get("alpn", "")) + section_a = f"{proto}{ver}{ext_count:02d}{alpn_tag}" + sorted_ext = sorted(sh["extensions"]) + inner = f"{sh['cipher_suite']},{','.join(str(e) for e in sorted_ext)}" + section_b = _sha256_12(inner) + return f"{section_a}_{section_b}" + + +# ─── JA4L (latency) 
───────────────────────────────────────────────────────── + +def _ja4l( + key: tuple[str, int, str, int], + tcp_rtt: dict[tuple[str, int, str, int], dict[str, Any]], +) -> dict[str, Any] | None: + return tcp_rtt.get(key) + + +# ─── Session resumption ───────────────────────────────────────────────────── + +@_traced("sniffer.session_resumption_info") +def _session_resumption_info(ch: dict[str, Any]) -> dict[str, Any]: + mechanisms: list[str] = [] + if ch.get("has_session_ticket_data"): + mechanisms.append("session_ticket") + if ch.get("has_pre_shared_key"): + mechanisms.append("psk") + if ch.get("has_early_data"): + mechanisms.append("early_data_0rtt") + if ch.get("session_id") and len(ch["session_id"]) > 0: + mechanisms.append("session_id") + return { + "resumption_attempted": len(mechanisms) > 0, + "mechanisms": mechanisms, + } + + +# ─── Sniffer engine (stateful, one instance per worker) ───────────────────── + +class SnifferEngine: + """ + Stateful TLS fingerprinting engine. Tracks sessions, TCP RTTs, + and dedup state. Thread-safe only when called from a single thread + (the scapy sniff thread). + """ + + def __init__( + self, + ip_to_decky: dict[str, str], + write_fn: Callable[[str], None], + dedup_ttl: float = 300.0, + ): + self._ip_to_decky = ip_to_decky + self._write_fn = write_fn + self._dedup_ttl = dedup_ttl + + self._sessions: dict[tuple[str, int, str, int], dict[str, Any]] = {} + self._session_ts: dict[tuple[str, int, str, int], float] = {} + self._tcp_syn: dict[tuple[str, int, str, int], dict[str, Any]] = {} + self._tcp_rtt: dict[tuple[str, int, str, int], dict[str, Any]] = {} + + # Per-flow timing aggregator. Key: (src_ip, src_port, dst_ip, dst_port). + # Flow direction is client→decky; reverse packets are associated back + # to the forward flow so we can track retransmits and inter-arrival. 
+ self._flows: dict[tuple[str, int, str, int], dict[str, Any]] = {} + self._flow_last_cleanup: float = 0.0 + self._FLOW_CLEANUP_INTERVAL: float = 30.0 + + self._dedup_cache: dict[tuple[str, str, str], float] = {} + self._dedup_last_cleanup: float = 0.0 + self._DEDUP_CLEANUP_INTERVAL: float = 60.0 + + def update_ip_map(self, ip_to_decky: dict[str, str]) -> None: + self._ip_to_decky = ip_to_decky + + def _resolve_decky(self, src_ip: str, dst_ip: str) -> str | None: + """Map a packet to a decky name. Returns None if neither IP is a known decky.""" + if dst_ip in self._ip_to_decky: + return self._ip_to_decky[dst_ip] + if src_ip in self._ip_to_decky: + return self._ip_to_decky[src_ip] + return None + + def _cleanup_sessions(self) -> None: + now = time.monotonic() + stale = [k for k, ts in self._session_ts.items() if now - ts > _SESSION_TTL] + for k in stale: + self._sessions.pop(k, None) + self._session_ts.pop(k, None) + stale_syn = [k for k, v in self._tcp_syn.items() + if now - v.get("time", 0) > _SESSION_TTL] + for k in stale_syn: + self._tcp_syn.pop(k, None) + stale_rtt = [k for k, _ in self._tcp_rtt.items() + if k not in self._sessions and k not in self._session_ts] + for k in stale_rtt: + self._tcp_rtt.pop(k, None) + + def _dedup_key_for(self, event_type: str, fields: dict[str, Any]) -> str: + if event_type == "tls_client_hello": + return fields.get("ja3", "") + "|" + fields.get("ja4", "") + if event_type == "tls_session": + return (fields.get("ja3", "") + "|" + fields.get("ja3s", "") + + "|" + fields.get("ja4", "") + "|" + fields.get("ja4s", "")) + if event_type == "tls_certificate": + return fields.get("subject_cn", "") + "|" + fields.get("issuer", "") + if event_type == "tcp_syn_fingerprint": + # Dedupe per (OS signature, options layout). One event per unique + # stack profile from this attacker IP per dedup window. 
+ return fields.get("os_guess", "") + "|" + fields.get("options_sig", "") + if event_type == "tcp_flow_timing": + # Dedup per (attacker_ip, decky_port) — src_port is deliberately + # excluded so a port scanner rotating source ports only produces + # one timing event per dedup window. Behavior cadence doesn't + # need per-ephemeral-port fidelity. + return fields.get("dst_ip", "") + "|" + fields.get("dst_port", "") + return fields.get("mechanisms", fields.get("resumption", "")) + + def _is_duplicate(self, event_type: str, fields: dict[str, Any]) -> bool: + if self._dedup_ttl <= 0: + return False + now = time.monotonic() + if now - self._dedup_last_cleanup > self._DEDUP_CLEANUP_INTERVAL: + stale = [k for k, ts in self._dedup_cache.items() if now - ts > self._dedup_ttl] + for k in stale: + del self._dedup_cache[k] + self._dedup_last_cleanup = now + src_ip = fields.get("src_ip", "") + fp = self._dedup_key_for(event_type, fields) + cache_key = (src_ip, event_type, fp) + last_seen = self._dedup_cache.get(cache_key) + if last_seen is not None and now - last_seen < self._dedup_ttl: + return True + self._dedup_cache[cache_key] = now + return False + + def _log(self, node_name: str, event_type: str, severity: int = SEVERITY_INFO, **fields: Any) -> None: + if self._is_duplicate(event_type, fields): + return + line = syslog_line(SERVICE_NAME, node_name, event_type, severity=severity, **fields) + self._write_fn(line) + + # ── Flow tracking (per-TCP-4-tuple timing + retransmits) ──────────────── + + def _flow_key( + self, + src_ip: str, + src_port: int, + dst_ip: str, + dst_port: int, + ) -> tuple[str, int, str, int]: + """ + Canonicalize a packet to the *client→decky* direction so forward and + reverse packets share one flow record. + """ + if dst_ip in self._ip_to_decky: + return (src_ip, src_port, dst_ip, dst_port) + # Otherwise src is the decky, flip. 
+ return (dst_ip, dst_port, src_ip, src_port) + + def _update_flow( + self, + flow_key: tuple[str, int, str, int], + now: float, + seq: int, + payload_len: int, + direction_forward: bool, + ) -> None: + """Record one packet into the flow aggregator.""" + flow = self._flows.get(flow_key) + if flow is None: + flow = { + "start": now, + "last": now, + "packets": 0, + "bytes": 0, + "iat_sum": 0.0, + "iat_min": float("inf"), + "iat_max": 0.0, + "iat_count": 0, + "forward_seqs": set(), + "retransmits": 0, + "emitted": False, + } + self._flows[flow_key] = flow + + if flow["packets"] > 0: + iat = now - flow["last"] + if iat >= 0: + flow["iat_sum"] += iat + flow["iat_count"] += 1 + if iat < flow["iat_min"]: + flow["iat_min"] = iat + if iat > flow["iat_max"]: + flow["iat_max"] = iat + + flow["last"] = now + flow["packets"] += 1 + flow["bytes"] += payload_len + + # Retransmit detection: a forward-direction packet with payload whose + # sequence number we've already seen is a retransmit. Empty SYN/ACKs + # are excluded because they share seq legitimately. + if direction_forward and payload_len > 0: + if seq in flow["forward_seqs"]: + flow["retransmits"] += 1 + else: + flow["forward_seqs"].add(seq) + + def _flush_flow( + self, + flow_key: tuple[str, int, str, int], + node_name: str, + ) -> None: + """Emit one `tcp_flow_timing` event for *flow_key* and drop its state. + + Trivial flows (scan probes: 1–2 packets, sub-second duration) are + dropped silently — they add noise to the log pipeline without carrying + usable behavioral signal (beacon cadence, exfil timing, retransmits + are all meaningful only on longer-lived flows). + """ + flow = self._flows.pop(flow_key, None) + if flow is None or flow.get("emitted"): + return + flow["emitted"] = True + + # Skip uninteresting flows — keep the log pipeline from being flooded + # by short-lived scan probes. 
+ duration = flow["last"] - flow["start"] + if flow["packets"] < 4 and flow["retransmits"] == 0 and duration < 1.0: + return + + src_ip, src_port, dst_ip, dst_port = flow_key + iat_count = flow["iat_count"] + mean_iat_ms = round((flow["iat_sum"] / iat_count) * 1000, 2) if iat_count else 0.0 + min_iat_ms = round(flow["iat_min"] * 1000, 2) if iat_count else 0.0 + max_iat_ms = round(flow["iat_max"] * 1000, 2) if iat_count else 0.0 + duration_s = round(duration, 3) + + self._log( + node_name, + "tcp_flow_timing", + src_ip=src_ip, + src_port=str(src_port), + dst_ip=dst_ip, + dst_port=str(dst_port), + packets=str(flow["packets"]), + bytes=str(flow["bytes"]), + duration_s=str(duration_s), + mean_iat_ms=str(mean_iat_ms), + min_iat_ms=str(min_iat_ms), + max_iat_ms=str(max_iat_ms), + retransmits=str(flow["retransmits"]), + ) + + def flush_all_flows(self) -> None: + """ + Flush every tracked flow (emit `tcp_flow_timing` events) and drop + state. Safe to call from outside the sniff thread; used during + shutdown and in tests. + """ + for key in list(self._flows.keys()): + decky = self._ip_to_decky.get(key[2]) + if decky: + self._flush_flow(key, decky) + else: + self._flows.pop(key, None) + + def _flush_idle_flows(self) -> None: + """Flush any flow whose last packet was more than _FLOW_IDLE_TIMEOUT ago.""" + now = time.monotonic() + if now - self._flow_last_cleanup < self._FLOW_CLEANUP_INTERVAL: + return + self._flow_last_cleanup = now + stale: list[tuple[str, int, str, int]] = [ + k for k, f in self._flows.items() + if now - f["last"] > _FLOW_IDLE_TIMEOUT + ] + for key in stale: + decky = self._ip_to_decky.get(key[2]) + if decky: + self._flush_flow(key, decky) + else: + self._flows.pop(key, None) + + def on_packet(self, pkt: Any) -> None: + """Process a single scapy packet. 
Called from the sniff thread.""" + try: + from scapy.layers.inet import IP, TCP + except ImportError: + return + + if not (pkt.haslayer(IP) and pkt.haslayer(TCP)): + return + + ip = pkt[IP] + tcp = pkt[TCP] + + src_ip: str = ip.src + dst_ip: str = ip.dst + src_port: int = tcp.sport + dst_port: int = tcp.dport + flags: int = tcp.flags.value if hasattr(tcp.flags, 'value') else int(tcp.flags) + + # Skip traffic not involving any decky + node_name = self._resolve_decky(src_ip, dst_ip) + if node_name is None: + return + + now = time.monotonic() + + # Per-flow timing aggregation (covers all TCP traffic, not just TLS) + flow_key = self._flow_key(src_ip, src_port, dst_ip, dst_port) + direction_forward = (flow_key[0] == src_ip and flow_key[1] == src_port) + tcp_payload_len = len(bytes(tcp.payload)) + self._update_flow( + flow_key, + now=now, + seq=int(tcp.seq), + payload_len=tcp_payload_len, + direction_forward=direction_forward, + ) + self._flush_idle_flows() + + # TCP SYN tracking for JA4L + passive SYN fingerprint + if flags & _TCP_SYN and not (flags & _TCP_ACK): + key = (src_ip, src_port, dst_ip, dst_port) + self._tcp_syn[key] = {"time": now, "ttl": ip.ttl} + + # Emit passive OS fingerprint on the *client* SYN. Only do this + # when the destination is a known decky, i.e. we're seeing an + # attacker's initial packet. 
+ if dst_ip in self._ip_to_decky: + _tracer = _get_tracer("sniffer") + with _tracer.start_as_current_span("sniffer.tcp_syn_fingerprint") as _span: + _span.set_attribute("attacker_ip", src_ip) + _span.set_attribute("dst_port", dst_port) + tcp_fp = _extract_tcp_fingerprint(list(tcp.options or [])) + os_label = guess_os( + ttl=ip.ttl, + window=int(tcp.window), + mss=tcp_fp["mss"], + wscale=tcp_fp["wscale"], + options_sig=tcp_fp["options_sig"], + ) + _span.set_attribute("os_guess", os_label) + target_node = self._ip_to_decky[dst_ip] + self._log( + target_node, + "tcp_syn_fingerprint", + src_ip=src_ip, + src_port=str(src_port), + dst_ip=dst_ip, + dst_port=str(dst_port), + ttl=str(ip.ttl), + initial_ttl=str(initial_ttl(ip.ttl)), + hop_distance=str(hop_distance(ip.ttl)), + window=str(int(tcp.window)), + mss=str(tcp_fp["mss"]), + wscale=("" if tcp_fp["wscale"] is None else str(tcp_fp["wscale"])), + options_sig=tcp_fp["options_sig"], + has_sack=str(tcp_fp["sack_ok"]).lower(), + has_timestamps=str(tcp_fp["has_timestamps"]).lower(), + os_guess=os_label, + ) + + elif flags & _TCP_SYN and flags & _TCP_ACK: + rev_key = (dst_ip, dst_port, src_ip, src_port) + syn_data = self._tcp_syn.pop(rev_key, None) + if syn_data: + rtt_ms = round((now - syn_data["time"]) * 1000, 2) + self._tcp_rtt[rev_key] = { + "rtt_ms": rtt_ms, + "client_ttl": syn_data["ttl"], + } + + # Flush flow on FIN/RST (terminal packets). 
+ if flags & (_TCP_FIN | _TCP_RST): + decky = self._ip_to_decky.get(flow_key[2]) + if decky: + self._flush_flow(flow_key, decky) + + payload = bytes(tcp.payload) + if not payload: + return + + if payload[0] != _TLS_RECORD_HANDSHAKE: + return + + # ClientHello + ch = _parse_client_hello(payload) + if ch is not None: + _tracer = _get_tracer("sniffer") + with _tracer.start_as_current_span("sniffer.tls_client_hello") as _span: + _span.set_attribute("attacker_ip", src_ip) + _span.set_attribute("dst_port", dst_port) + self._cleanup_sessions() + + key = (src_ip, src_port, dst_ip, dst_port) + ja3_str, ja3_hash = _ja3(ch) + ja4_hash = _ja4(ch) + resumption = _session_resumption_info(ch) + rtt_data = _ja4l(key, self._tcp_rtt) + + _span.set_attribute("ja3", ja3_hash) + _span.set_attribute("ja4", ja4_hash) + _span.set_attribute("sni", ch["sni"] or "") + + self._sessions[key] = { + "ja3": ja3_hash, + "ja3_str": ja3_str, + "ja4": ja4_hash, + "tls_version": ch["tls_version"], + "cipher_suites": ch["cipher_suites"], + "extensions": ch["extensions"], + "signature_algorithms": ch.get("signature_algorithms", []), + "supported_versions": ch.get("supported_versions", []), + "sni": ch["sni"], + "alpn": ch["alpn"], + "resumption": resumption, + } + self._session_ts[key] = time.monotonic() + + log_fields: dict[str, Any] = { + "src_ip": src_ip, + "src_port": str(src_port), + "dst_ip": dst_ip, + "dst_port": str(dst_port), + "ja3": ja3_hash, + "ja4": ja4_hash, + "tls_version": _tls_version_str(ch["tls_version"]), + "sni": ch["sni"] or "", + "alpn": ",".join(ch["alpn"]), + "raw_ciphers": "-".join(str(c) for c in ch["cipher_suites"]), + "raw_extensions": "-".join(str(e) for e in ch["extensions"]), + } + + if resumption["resumption_attempted"]: + log_fields["resumption"] = ",".join(resumption["mechanisms"]) + + if rtt_data: + log_fields["ja4l_rtt_ms"] = str(rtt_data["rtt_ms"]) + log_fields["ja4l_client_ttl"] = str(rtt_data["client_ttl"]) + + # Resolve node for the *destination* (the decky being 
attacked) + target_node = self._ip_to_decky.get(dst_ip, node_name) + self._log(target_node, "tls_client_hello", **log_fields) + return + + # ServerHello + sh = _parse_server_hello(payload) + if sh is not None: + _tracer = _get_tracer("sniffer") + with _tracer.start_as_current_span("sniffer.tls_server_hello") as _span: + _span.set_attribute("attacker_ip", dst_ip) + rev_key = (dst_ip, dst_port, src_ip, src_port) + ch_data = self._sessions.pop(rev_key, None) + self._session_ts.pop(rev_key, None) + + ja3s_str, ja3s_hash = _ja3s(sh) + ja4s_hash = _ja4s(sh) + + _span.set_attribute("ja3s", ja3s_hash) + _span.set_attribute("ja4s", ja4s_hash) + + fields: dict[str, Any] = { + "src_ip": dst_ip, + "src_port": str(dst_port), + "dst_ip": src_ip, + "dst_port": str(src_port), + "ja3s": ja3s_hash, + "ja4s": ja4s_hash, + "tls_version": _tls_version_str(sh["tls_version"]), + } + + if ch_data: + fields["ja3"] = ch_data["ja3"] + fields["ja4"] = ch_data.get("ja4", "") + fields["sni"] = ch_data["sni"] or "" + fields["alpn"] = ",".join(ch_data["alpn"]) + fields["raw_ciphers"] = "-".join(str(c) for c in ch_data["cipher_suites"]) + fields["raw_extensions"] = "-".join(str(e) for e in ch_data["extensions"]) + if ch_data.get("resumption", {}).get("resumption_attempted"): + fields["resumption"] = ",".join(ch_data["resumption"]["mechanisms"]) + + rtt_data = self._tcp_rtt.pop(rev_key, None) + if rtt_data: + fields["ja4l_rtt_ms"] = str(rtt_data["rtt_ms"]) + fields["ja4l_client_ttl"] = str(rtt_data["client_ttl"]) + + # Server response — resolve by src_ip (the decky responding) + target_node = self._ip_to_decky.get(src_ip, node_name) + self._log(target_node, "tls_session", severity=SEVERITY_WARNING, **fields) + return + + # Certificate (TLS 1.2 only) + cert = _parse_certificate(payload) + if cert is not None: + _tracer = _get_tracer("sniffer") + with _tracer.start_as_current_span("sniffer.tls_certificate") as _span: + _span.set_attribute("subject_cn", cert["subject_cn"]) + 
_span.set_attribute("self_signed", cert["self_signed"]) + rev_key = (dst_ip, dst_port, src_ip, src_port) + ch_data = self._sessions.get(rev_key) + + cert_fields: dict[str, Any] = { + "src_ip": dst_ip, + "src_port": str(dst_port), + "dst_ip": src_ip, + "dst_port": str(src_port), + "subject_cn": cert["subject_cn"], + "issuer": cert["issuer"], + "self_signed": str(cert["self_signed"]).lower(), + "not_before": cert["not_before"], + "not_after": cert["not_after"], + } + if cert["sans"]: + cert_fields["sans"] = ",".join(cert["sans"]) + if ch_data: + cert_fields["sni"] = ch_data.get("sni", "") + + target_node = self._ip_to_decky.get(src_ip, node_name) + self._log(target_node, "tls_certificate", **cert_fields) diff --git a/decnet/sniffer/p0f.py b/decnet/sniffer/p0f.py new file mode 100644 index 0000000..88cceca --- /dev/null +++ b/decnet/sniffer/p0f.py @@ -0,0 +1,238 @@ +""" +Passive OS fingerprinting (p0f-lite) for the DECNET sniffer. + +Pure-Python lookup module. Given the values of an incoming TCP SYN packet +(TTL, window, MSS, window-scale, and TCP option ordering), returns a coarse +OS bucket (linux / windows / macos_ios / freebsd / openbsd / nmap / unknown) +plus derived hop distance and inferred initial TTL. + +Rationale +--------- +Full p0f v3 distinguishes several dozen OS/tool profiles by combining dozens +of low-level quirks (OLEN, WSIZE, EOL padding, PCLASS, quirks, payload class). +For DECNET we only need a coarse bucket — enough to tag an attacker as +"linux beacon" vs "windows interactive" vs "active scan". The curated +table below covers default stacks that dominate real-world attacker traffic. + +References (public p0f v3 DB, nmap-os-db, and Mozilla OS Fingerprint table): + https://github.com/p0f/p0f/blob/master/p0f.fp + +No external dependencies. +""" + +from __future__ import annotations + +from decnet.telemetry import traced as _traced + +# ─── TTL → initial TTL bucket ─────────────────────────────────────────────── + +# Common "hop 0" TTLs. 
Packets decrement TTL once per hop, so we round up +# the observed TTL to the nearest known starting value. +_TTL_BUCKETS: tuple[int, ...] = (32, 64, 128, 255) + + +def initial_ttl(ttl: int) -> int: + """ + Round *ttl* up to the nearest known initial-TTL bucket. + + A SYN with TTL=59 was almost certainly emitted by a Linux/BSD host + (initial 64) five hops away; TTL=120 by a Windows host (initial 128) + eight hops away. + """ + for bucket in _TTL_BUCKETS: + if ttl <= bucket: + return bucket + return 255 + + +def hop_distance(ttl: int) -> int: + """ + Estimate hops between the attacker and the sniffer based on TTL. + + Upper-bounded at 64 (anything further has most likely been mangled + by a misconfigured firewall or a TTL-spoofing NAT). + """ + dist = initial_ttl(ttl) - ttl + if dist < 0: + return 0 + if dist > 64: + return 64 + return dist + + +# ─── OS signature table (TTL bucket, window, MSS, wscale, option-order) ───── + +# Each entry is a set of loose predicates. If all predicates match, the +# OS label is returned. First-match wins. `None` means "don't care". +# +# The option signatures use the short-code alphabet from +# decnet/prober/tcpfp.py :: _OPT_CODES (M=MSS, N=NOP, W=WScale, +# T=Timestamp, S=SAckOK, E=EOL). + +_SIGNATURES: tuple[tuple[dict, str], ...] = ( + # ── nmap -sS / -sT default probe ─────────────────────────────────────── + # nmap crafts very distinctive SYNs: tiny window (1024/4096/etc.), full + # option set including WScale=10 and SAckOK. Match these first so they + # don't get misclassified as Linux. 
+ ( + { + "ttl_bucket": 64, + "window_in": {1024, 2048, 3072, 4096, 31337, 32768, 65535}, + "mss": 1460, + "wscale": 10, + "options": "M,W,T,S,S", + }, + "nmap", + ), + ( + { + "ttl_bucket": 64, + "window_in": {1024, 2048, 3072, 4096, 31337, 32768, 65535}, + "options_starts_with": "M,W,T,S", + }, + "nmap", + ), + # ── macOS / iOS default SYN (match before Linux — shares TTL 64) ────── + # TTL 64, window 65535, MSS 1460, WScale 6, specific option order + # M,N,W,N,N,T,S,E (Darwin signature with EOL padding). + ( + { + "ttl_bucket": 64, + "window": 65535, + "wscale": 6, + "options": "M,N,W,N,N,T,S,E", + }, + "macos_ios", + ), + ( + { + "ttl_bucket": 64, + "window_in": {65535}, + "wscale_in": {5, 6}, + "has_timestamps": True, + "options_ends_with": "E", + }, + "macos_ios", + ), + # ── FreeBSD default SYN (TTL 64, no EOL) ─────────────────────────────── + ( + { + "ttl_bucket": 64, + "window": 65535, + "wscale": 6, + "has_sack": True, + "has_timestamps": True, + "options_no_eol": True, + }, + "freebsd", + ), + # ── Linux (kernel 3.x – 6.x) default SYN ─────────────────────────────── + # TTL 64, window 29200 / 64240 / 65535, MSS 1460, WScale 7, full options. 
+ ( + { + "ttl_bucket": 64, + "window_min": 5000, + "wscale_in": {6, 7, 8, 9, 10, 11, 12, 13, 14}, + "has_sack": True, + "has_timestamps": True, + }, + "linux", + ), + # ── OpenBSD default SYN ───────────────────────────────────────────────── + # TTL 64, window 16384, WScale 3-6, MSS 1460 + ( + { + "ttl_bucket": 64, + "window_in": {16384, 16960}, + "wscale_in": {3, 4, 5, 6}, + }, + "openbsd", + ), + # ── Windows 10/11/Server default SYN ──────────────────────────────────── + # TTL 128, window 64240/65535, MSS 1460, WScale 8, SACK+TS + ( + { + "ttl_bucket": 128, + "window_min": 8192, + "wscale_in": {2, 6, 7, 8}, + "has_sack": True, + }, + "windows", + ), + # ── Windows 7/XP (legacy) ─────────────────────────────────────────────── + ( + { + "ttl_bucket": 128, + "window_in": {8192, 16384, 65535}, + }, + "windows", + ), + # ── Embedded / Cisco / network gear ───────────────────────────────────── + ( + { + "ttl_bucket": 255, + }, + "embedded", + ), +) + + +def _match_signature( + sig: dict, + ttl: int, + window: int, + mss: int, + wscale: int | None, + options_sig: str, +) -> bool: + """Evaluate every predicate in *sig* against the observed values.""" + tb = initial_ttl(ttl) + if "ttl_bucket" in sig and sig["ttl_bucket"] != tb: + return False + if "window" in sig and sig["window"] != window: + return False + if "window_in" in sig and window not in sig["window_in"]: + return False + if "window_min" in sig and window < sig["window_min"]: + return False + if "mss" in sig and sig["mss"] != mss: + return False + if "wscale" in sig and sig["wscale"] != wscale: + return False + if "wscale_in" in sig and wscale not in sig["wscale_in"]: + return False + if "has_sack" in sig: + if sig["has_sack"] != ("S" in options_sig): + return False + if "has_timestamps" in sig: + if sig["has_timestamps"] != ("T" in options_sig): + return False + if "options" in sig and sig["options"] != options_sig: + return False + if "options_starts_with" in sig and not 
options_sig.startswith(sig["options_starts_with"]): + return False + if "options_ends_with" in sig and not options_sig.endswith(sig["options_ends_with"]): + return False + if "options_no_eol" in sig and sig["options_no_eol"] and "E" in options_sig: + return False + return True + + +@_traced("sniffer.p0f_guess_os") +def guess_os( + ttl: int, + window: int, + mss: int = 0, + wscale: int | None = None, + options_sig: str = "", +) -> str: + """ + Return a coarse OS bucket for the given SYN characteristics. + + One of: "linux", "windows", "macos_ios", "freebsd", "openbsd", + "embedded", "nmap", "unknown". + """ + for sig, label in _SIGNATURES: + if _match_signature(sig, ttl, window, mss, wscale, options_sig): + return label + return "unknown" diff --git a/decnet/sniffer/syslog.py b/decnet/sniffer/syslog.py new file mode 100644 index 0000000..a32fd6d --- /dev/null +++ b/decnet/sniffer/syslog.py @@ -0,0 +1,71 @@ +""" +RFC 5424 syslog formatting and log-file writing for the fleet sniffer. + +Reuses the same wire format as templates/sniffer/decnet_logging.py so the +existing collector parser and ingester can consume events without changes. 
+""" + +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any + +from decnet.collector.worker import parse_rfc5424 +from decnet.telemetry import traced as _traced + +# ─── Constants (must match templates/sniffer/decnet_logging.py) ────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_INFO = 6 +SEVERITY_WARNING = 4 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + + +# ─── Formatter ─────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + msg: str | None = None, + **fields: Any, +) -> str: + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = datetime.now(timezone.utc).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +@_traced("sniffer.write_event") +def write_event(line: str, log_path: Path, json_path: Path) -> None: + """Append a syslog line to the raw log and its parsed JSON to the json log.""" + with open(log_path, "a", encoding="utf-8") as lf: + lf.write(line + "\n") + lf.flush() + parsed = parse_rfc5424(line) + if parsed: + with open(json_path, "a", encoding="utf-8") as jf: + jf.write(json.dumps(parsed) + "\n") + jf.flush() diff --git a/decnet/sniffer/worker.py b/decnet/sniffer/worker.py new file mode 100644 index 0000000..3e2f0bc --- /dev/null +++ b/decnet/sniffer/worker.py @@ -0,0 +1,176 @@ 
+""" +Fleet-wide MACVLAN sniffer worker. + +Runs as a single host-side async background task that sniffs all TLS +traffic on the MACVLAN host interface. Maps packets to deckies by IP +and feeds fingerprint events into the existing log pipeline. + +Modeled on decnet.collector.worker — same lifecycle pattern. +Fault-isolated: any exception is logged and the worker exits cleanly. +The API never depends on this worker being alive. +""" + +import asyncio +import os +import subprocess # nosec B404 — needed for interface checks +import threading +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path + +from decnet.logging import get_logger +from decnet.network import HOST_IPVLAN_IFACE, HOST_MACVLAN_IFACE +from decnet.sniffer.fingerprint import SnifferEngine +from decnet.sniffer.syslog import write_event +from decnet.telemetry import traced as _traced + +logger = get_logger("sniffer") + +_IP_MAP_REFRESH_INTERVAL: float = 60.0 + + +def _load_ip_to_decky() -> dict[str, str]: + """Build IP → decky-name mapping from decnet-state.json.""" + from decnet.config import load_state + state = load_state() + if state is None: + return {} + config, _ = state + mapping: dict[str, str] = {} + for decky in config.deckies: + mapping[decky.ip] = decky.name + return mapping + + +def _interface_exists(iface: str) -> bool: + """Check if a network interface exists on this host.""" + try: + result = subprocess.run( # nosec B603 B607 — hardcoded args + ["ip", "link", "show", iface], + capture_output=True, text=True, check=False, + ) + return result.returncode == 0 + except Exception: + return False + + +@_traced("sniffer.sniff_loop") +def _sniff_loop( + interface: str, + log_path: Path, + json_path: Path, + stop_event: threading.Event, +) -> None: + """Blocking sniff loop. 
Runs in a dedicated thread via asyncio.to_thread.""" + try: + from scapy.sendrecv import sniff + except ImportError: + logger.error("scapy not installed — sniffer cannot start") + return + + ip_map = _load_ip_to_decky() + if not ip_map: + logger.warning("sniffer: no deckies in state — nothing to sniff") + return + + def _write_fn(line: str) -> None: + write_event(line, log_path, json_path) + + engine = SnifferEngine(ip_to_decky=ip_map, write_fn=_write_fn) + + # Periodically refresh IP map in a background daemon thread + def _refresh_loop() -> None: + while not stop_event.is_set(): + stop_event.wait(_IP_MAP_REFRESH_INTERVAL) + if stop_event.is_set(): + break + try: + new_map = _load_ip_to_decky() + if new_map: + engine.update_ip_map(new_map) + except Exception as exc: + logger.debug("sniffer: ip map refresh failed: %s", exc) + + refresh_thread = threading.Thread(target=_refresh_loop, daemon=True) + refresh_thread.start() + + logger.info("sniffer: sniffing on interface=%s deckies=%d", interface, len(ip_map)) + + try: + sniff( + iface=interface, + filter="tcp", + prn=engine.on_packet, + store=False, + stop_filter=lambda pkt: stop_event.is_set(), + ) + except Exception as exc: + logger.error("sniffer: scapy sniff exited: %s", exc) + finally: + stop_event.set() + logger.info("sniffer: sniff loop ended") + + +@_traced("sniffer.worker") +async def sniffer_worker(log_file: str) -> None: + """ + Async entry point — started as asyncio.create_task in the API lifespan. + + Fully fault-isolated: catches all exceptions, logs them, and returns + cleanly. The API continues running regardless of sniffer state. + """ + try: + # Interface selection: explicit env override wins, otherwise probe + # both the MACVLAN and IPvlan host-side names since the driver + # choice is per-deploy (--ipvlan flag). 
+ env_iface = os.environ.get("DECNET_SNIFFER_IFACE") + if env_iface: + interface = env_iface + elif _interface_exists(HOST_MACVLAN_IFACE): + interface = HOST_MACVLAN_IFACE + elif _interface_exists(HOST_IPVLAN_IFACE): + interface = HOST_IPVLAN_IFACE + else: + logger.warning( + "sniffer: neither %s nor %s found — sniffer disabled " + "(fleet may not be deployed yet)", + HOST_MACVLAN_IFACE, HOST_IPVLAN_IFACE, + ) + return + + if not _interface_exists(interface): + logger.warning( + "sniffer: interface %s not found — sniffer disabled " + "(fleet may not be deployed yet)", interface, + ) + return + + log_path = Path(log_file) + json_path = log_path.with_suffix(".json") + log_path.parent.mkdir(parents=True, exist_ok=True) + + stop_event = threading.Event() + + # Dedicated thread pool so the long-running sniff loop doesn't + # occupy a slot in the default asyncio executor. + sniffer_pool = ThreadPoolExecutor( + max_workers=2, thread_name_prefix="decnet-sniffer", + ) + + try: + loop = asyncio.get_running_loop() + await loop.run_in_executor( + sniffer_pool, _sniff_loop, + interface, log_path, json_path, stop_event, + ) + except asyncio.CancelledError: + logger.info("sniffer: shutdown requested") + stop_event.set() + sniffer_pool.shutdown(wait=False) + raise + finally: + sniffer_pool.shutdown(wait=False) + + except asyncio.CancelledError: + raise + except Exception as exc: + logger.error("sniffer: worker failed — API continues without sniffing: %s", exc) diff --git a/decnet/swarm/__init__.py b/decnet/swarm/__init__.py new file mode 100644 index 0000000..b2c9c80 --- /dev/null +++ b/decnet/swarm/__init__.py @@ -0,0 +1,7 @@ +"""DECNET SWARM — multihost deployment subsystem. 
+ +Components: +* ``pki`` — X.509 CA + CSR signing used by all swarm mTLS channels +* ``client`` — master-side HTTP client that talks to remote workers +* ``log_forwarder``— worker-side syslog-over-TLS (RFC 5425) forwarder +""" diff --git a/decnet/swarm/client.py b/decnet/swarm/client.py new file mode 100644 index 0000000..6f16e8e --- /dev/null +++ b/decnet/swarm/client.py @@ -0,0 +1,200 @@ +"""Master-side HTTP client that talks to a worker's DECNET agent. + +All traffic is mTLS: the master presents a cert issued by its own CA (which +workers trust) and the master validates the worker's cert against the same +CA. In practice the "client cert" the master shows is just another cert +signed by itself — the master is both the CA and the sole control-plane +client. + +Usage: + + async with AgentClient(host) as agent: + await agent.deploy(config) + status = await agent.status() + +The ``host`` is a SwarmHost dict returned by the repository. +""" +from __future__ import annotations + +import pathlib +import ssl +from dataclasses import dataclass +from typing import Any, Optional + +import httpx + +from decnet.config import DecnetConfig +from decnet.logging import get_logger +from decnet.swarm import pki + +log = get_logger("swarm.client") + +# How long a single HTTP operation can take. Deploy is the long pole — +# docker compose up pulls images, builds contexts, etc. Tune via env in a +# later iteration if the default proves too short. +_TIMEOUT_DEPLOY = httpx.Timeout(connect=10.0, read=600.0, write=30.0, pool=5.0) +_TIMEOUT_CONTROL = httpx.Timeout(connect=5.0, read=15.0, write=5.0, pool=5.0) + + +@dataclass(frozen=True) +class MasterIdentity: + """Paths to the master's own mTLS client bundle. + + The master uses ONE master-client cert to talk to every worker. It is + signed by the DECNET CA (same CA that signs worker certs). Stored + under ``~/.decnet/ca/master/`` by ``ensure_master_identity``. 
+ """ + key_path: pathlib.Path + cert_path: pathlib.Path + ca_cert_path: pathlib.Path + + +def ensure_master_identity( + ca_dir: pathlib.Path = pki.DEFAULT_CA_DIR, +) -> MasterIdentity: + """Create (or load) the master's own client cert. + + Called once by the swarm controller on startup and by the CLI before + any master→worker call. Idempotent. + """ + ca = pki.ensure_ca(ca_dir) + master_dir = ca_dir / "master" + bundle = pki.load_worker_bundle(master_dir) + if bundle is None: + issued = pki.issue_worker_cert(ca, "decnet-master", ["127.0.0.1", "decnet-master"]) + pki.write_worker_bundle(issued, master_dir) + return MasterIdentity( + key_path=master_dir / "worker.key", + cert_path=master_dir / "worker.crt", + ca_cert_path=master_dir / "ca.crt", + ) + + +class AgentClient: + """Thin async wrapper around the worker agent's HTTP API.""" + + def __init__( + self, + host: dict[str, Any] | None = None, + *, + address: Optional[str] = None, + agent_port: Optional[int] = None, + identity: Optional[MasterIdentity] = None, + verify_hostname: bool = False, + ): + """Either pass a SwarmHost dict, or explicit address/port. + + ``verify_hostname`` stays False by default because the worker's + cert SAN is populated from the operator-supplied address list, not + from modern TLS hostname-verification semantics. The mTLS client + cert + CA pinning are what authenticate the peer. 
+ """ + if host is not None: + self._address = host["address"] + self._port = int(host.get("agent_port") or 8765) + self._host_uuid = host.get("uuid") + self._host_name = host.get("name") + else: + if address is None or agent_port is None: + raise ValueError( + "AgentClient requires either a host dict or address+agent_port" + ) + self._address = address + self._port = int(agent_port) + self._host_uuid = None + self._host_name = None + + self._identity = identity or ensure_master_identity() + self._verify_hostname = verify_hostname + self._client: Optional[httpx.AsyncClient] = None + + # --------------------------------------------------------------- lifecycle + + def _build_client(self, timeout: httpx.Timeout) -> httpx.AsyncClient: + # Build the SSL context manually — httpx.create_ssl_context layers on + # purpose/ALPN/default-CA logic that doesn't compose with private-CA + # mTLS in all combinations. A bare SSLContext is predictable. + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ctx.load_cert_chain( + str(self._identity.cert_path), str(self._identity.key_path) + ) + ctx.load_verify_locations(cafile=str(self._identity.ca_cert_path)) + ctx.verify_mode = ssl.CERT_REQUIRED + # Pin by CA + cert chain, not by DNS — workers enroll with arbitrary + # SANs (IPs, hostnames) and we don't want to force operators to keep + # those in sync with whatever URL the master happens to use. 
+ ctx.check_hostname = self._verify_hostname + return httpx.AsyncClient( + base_url=f"https://{self._address}:{self._port}", + verify=ctx, + timeout=timeout, + ) + + async def __aenter__(self) -> "AgentClient": + self._client = self._build_client(_TIMEOUT_CONTROL) + return self + + async def __aexit__(self, *exc: Any) -> None: + if self._client: + await self._client.aclose() + self._client = None + + def _require_client(self) -> httpx.AsyncClient: + if self._client is None: + raise RuntimeError("AgentClient used outside `async with` block") + return self._client + + # ----------------------------------------------------------------- RPCs + + async def health(self) -> dict[str, Any]: + resp = await self._require_client().get("/health") + resp.raise_for_status() + return resp.json() + + async def status(self) -> dict[str, Any]: + resp = await self._require_client().get("/status") + resp.raise_for_status() + return resp.json() + + async def deploy( + self, + config: DecnetConfig, + *, + dry_run: bool = False, + no_cache: bool = False, + ) -> dict[str, Any]: + body = { + "config": config.model_dump(mode="json"), + "dry_run": dry_run, + "no_cache": no_cache, + } + # Swap in a long-deploy timeout for this call only. 
+ old = self._require_client().timeout + self._require_client().timeout = _TIMEOUT_DEPLOY + try: + resp = await self._require_client().post("/deploy", json=body) + finally: + self._require_client().timeout = old + resp.raise_for_status() + return resp.json() + + async def teardown(self, decky_id: Optional[str] = None) -> dict[str, Any]: + resp = await self._require_client().post( + "/teardown", json={"decky_id": decky_id} + ) + resp.raise_for_status() + return resp.json() + + async def self_destruct(self) -> dict[str, Any]: + """Trigger the worker to stop services and wipe its install.""" + resp = await self._require_client().post("/self-destruct") + resp.raise_for_status() + return resp.json() + + # -------------------------------------------------------------- diagnostics + + def __repr__(self) -> str: + return ( + f"AgentClient(name={self._host_name!r}, " + f"address={self._address!r}, port={self._port})" + ) diff --git a/decnet/swarm/log_forwarder.py b/decnet/swarm/log_forwarder.py new file mode 100644 index 0000000..0a87343 --- /dev/null +++ b/decnet/swarm/log_forwarder.py @@ -0,0 +1,293 @@ +"""Worker-side syslog-over-TLS forwarder (RFC 5425). + +Runs alongside the worker agent. Tails the worker's local RFC 5424 log +file (written by the existing docker-collector) and ships each line to +the master's listener on TCP 6514 using octet-counted framing over mTLS. +Persists the last-forwarded byte offset in a tiny local SQLite so a +master crash never causes loss or duplication. + +Design constraints (from the plan, non-negotiable): +* transport MUST be TLS — plaintext syslog is never acceptable between + hosts; only loopback (decky → worker-local collector) may be plaintext; +* mTLS — the listener pins the worker cert against the DECNET CA, so only + enrolled workers can push logs; +* offset persistence MUST be transactional w.r.t. the send — we only + advance the offset after ``writer.drain()`` returns without error. 
+
+The forwarder is intentionally a standalone coroutine, not a worker
+inside the agent process. That keeps ``decnet agent`` crashes from
+losing the log tail, and vice versa.
+"""
+from __future__ import annotations
+
+import asyncio
+import os
+import pathlib
+import sqlite3
+import ssl
+from dataclasses import dataclass
+from typing import Optional
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+
+log = get_logger("swarm.forwarder")
+
+# RFC 5425 framing: "<MSG-LEN> <SYSLOG-MSG>".
+# The message itself is a standard RFC 5424 line (no trailing newline).
+_FRAME_SEP = b" "
+
+_INITIAL_BACKOFF = 1.0
+_MAX_BACKOFF = 30.0
+
+
+@dataclass(frozen=True)
+class ForwarderConfig:
+    log_path: pathlib.Path  # worker's RFC 5424 .log file
+    master_host: str
+    master_port: int = 6514
+    agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR
+    state_db: Optional[pathlib.Path] = None  # default: agent_dir / "forwarder.db"
+    # Max unacked bytes to keep in the local buffer when master is down.
+    # We bound the lag to avoid unbounded disk growth on catastrophic master
+    # outage — older lines are surfaced as a warning and dropped by advancing
+    # the offset.
+    max_lag_bytes: int = 128 * 1024 * 1024  # 128 MiB
+
+
+# ------------------------------------------------------------ offset storage
+
+
+class _OffsetStore:
+    """Single-row SQLite offset tracker. 
Stdlib only — no ORM, no async."""
+
+    def __init__(self, db_path: pathlib.Path) -> None:
+        db_path.parent.mkdir(parents=True, exist_ok=True)
+        self._conn = sqlite3.connect(str(db_path))
+        self._conn.execute(
+            "CREATE TABLE IF NOT EXISTS forwarder_offset ("
+            " key TEXT PRIMARY KEY, offset INTEGER NOT NULL)"
+        )
+        self._conn.commit()
+
+    def get(self, key: str = "default") -> int:
+        row = self._conn.execute(
+            "SELECT offset FROM forwarder_offset WHERE key=?", (key,)
+        ).fetchone()
+        return int(row[0]) if row else 0
+
+    def set(self, offset: int, key: str = "default") -> None:
+        self._conn.execute(
+            "INSERT INTO forwarder_offset(key, offset) VALUES(?, ?) "
+            "ON CONFLICT(key) DO UPDATE SET offset=excluded.offset",
+            (key, offset),
+        )
+        self._conn.commit()
+
+    def close(self) -> None:
+        self._conn.close()
+
+
+# ---------------------------------------------------------------- TLS setup
+
+
+def build_worker_ssl_context(agent_dir: pathlib.Path) -> ssl.SSLContext:
+    """Client-side mTLS context for the forwarder.
+
+    Worker presents its agent bundle (same cert used for the control-plane
+    HTTPS listener). The CA is the DECNET CA; we pin by CA, not hostname,
+    because workers reach masters by operator-supplied address.
+    """
+    bundle = pki.load_worker_bundle(agent_dir)
+    if bundle is None:
+        raise RuntimeError(
+            f"no worker bundle at {agent_dir} — enroll from the master first"
+        )
+    ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+    ctx.load_cert_chain(
+        certfile=str(agent_dir / "worker.crt"),
+        keyfile=str(agent_dir / "worker.key"),
+    )
+    ctx.load_verify_locations(cafile=str(agent_dir / "ca.crt"))
+    ctx.verify_mode = ssl.CERT_REQUIRED
+    ctx.check_hostname = False
+    return ctx
+
+
+# ----------------------------------------------------------- frame encoding
+
+
+def encode_frame(line: str) -> bytes:
+    """RFC 5425 octet-counted framing: ``"<N> <payload>"``.
+
+    ``N`` is the byte length of the payload that follows (after the space). 
+ """ + payload = line.rstrip("\n").encode("utf-8", errors="replace") + return f"{len(payload)}".encode("ascii") + _FRAME_SEP + payload + + +async def read_frame(reader: asyncio.StreamReader) -> Optional[bytes]: + """Read one octet-counted frame. Returns None on clean EOF.""" + # Read the ASCII length up to the first space. Bound the prefix so a + # malicious peer can't force us to buffer unbounded bytes before we know + # it's a valid frame. + prefix = b"" + while True: + c = await reader.read(1) + if not c: + return None if not prefix else b"" + if c == _FRAME_SEP: + break + if len(prefix) >= 10 or not c.isdigit(): + # RFC 5425 caps the length prefix at ~10 digits (< 4 GiB payload). + raise ValueError(f"invalid octet-count prefix: {prefix!r}") + prefix += c + n = int(prefix) + buf = await reader.readexactly(n) + return buf + + +# ----------------------------------------------------------------- main loop + + +async def _send_batch( + writer: asyncio.StreamWriter, + offset: int, + lines: list[tuple[int, str]], + store: _OffsetStore, +) -> int: + """Write every line as a frame, drain, then persist the last offset.""" + for _, line in lines: + writer.write(encode_frame(line)) + await writer.drain() + last_offset = lines[-1][0] + store.set(last_offset) + return last_offset + + +async def run_forwarder( + cfg: ForwarderConfig, + *, + poll_interval: float = 0.5, + stop_event: Optional[asyncio.Event] = None, +) -> None: + """Main forwarder loop. Run as a dedicated task. + + Stops when ``stop_event`` is set (used by tests and clean shutdown). + Exceptions trigger exponential backoff but are never fatal — the + forwarder is expected to outlive transient master/network failures. 
+ """ + state_db = cfg.state_db or (cfg.agent_dir / "forwarder.db") + store = _OffsetStore(state_db) + offset = store.get() + backoff = _INITIAL_BACKOFF + + log.info( + "forwarder start log=%s master=%s:%d offset=%d", + cfg.log_path, cfg.master_host, cfg.master_port, offset, + ) + + try: + while stop_event is None or not stop_event.is_set(): + try: + ctx = build_worker_ssl_context(cfg.agent_dir) + reader, writer = await asyncio.open_connection( + cfg.master_host, cfg.master_port, ssl=ctx + ) + log.info("forwarder connected master=%s:%d", cfg.master_host, cfg.master_port) + backoff = _INITIAL_BACKOFF + try: + offset = await _pump(cfg, store, writer, offset, poll_interval, stop_event) + finally: + writer.close() + try: + await writer.wait_closed() + except Exception: # nosec B110 — socket cleanup is best-effort + pass + # Keep reader alive until here to avoid "reader garbage + # collected" warnings on some Python builds. + del reader + except (OSError, ssl.SSLError, ConnectionError) as exc: + log.warning( + "forwarder disconnected: %s — retrying in %.1fs", exc, backoff + ) + try: + await asyncio.wait_for( + _sleep_unless_stopped(backoff, stop_event), timeout=backoff + 1 + ) + except asyncio.TimeoutError: + pass + backoff = min(_MAX_BACKOFF, backoff * 2) + finally: + store.close() + log.info("forwarder stopped offset=%d", offset) + + +async def _pump( + cfg: ForwarderConfig, + store: _OffsetStore, + writer: asyncio.StreamWriter, + offset: int, + poll_interval: float, + stop_event: Optional[asyncio.Event], +) -> int: + """Read new lines since ``offset`` and ship them until disconnect.""" + while stop_event is None or not stop_event.is_set(): + if not cfg.log_path.exists(): + await _sleep_unless_stopped(poll_interval, stop_event) + continue + + stat = cfg.log_path.stat() + if stat.st_size < offset: + # truncated/rotated — reset. 
+ log.warning("forwarder log rotated — resetting offset=0") + offset = 0 + store.set(0) + if stat.st_size - offset > cfg.max_lag_bytes: + # Catastrophic lag — skip ahead to cap local disk pressure. + skip_to = stat.st_size - cfg.max_lag_bytes + log.warning( + "forwarder lag %d > cap %d — dropping oldest %d bytes", + stat.st_size - offset, cfg.max_lag_bytes, skip_to - offset, + ) + offset = skip_to + store.set(offset) + + if stat.st_size == offset: + await _sleep_unless_stopped(poll_interval, stop_event) + continue + + batch: list[tuple[int, str]] = [] + with open(cfg.log_path, "r", encoding="utf-8", errors="replace") as f: + f.seek(offset) + while True: + line = f.readline() + if not line or not line.endswith("\n"): + break + offset_after = f.tell() + batch.append((offset_after, line.rstrip("\n"))) + if len(batch) >= 500: + break + if batch: + offset = await _send_batch(writer, offset, batch, store) + return offset + + +async def _sleep_unless_stopped( + seconds: float, stop_event: Optional[asyncio.Event] +) -> None: + if stop_event is None: + await asyncio.sleep(seconds) + return + try: + await asyncio.wait_for(stop_event.wait(), timeout=seconds) + except asyncio.TimeoutError: + pass + + +# Re-exported for CLI convenience +DEFAULT_PORT = 6514 + + +def default_master_host() -> Optional[str]: + return os.environ.get("DECNET_SWARM_MASTER_HOST") diff --git a/decnet/swarm/log_listener.py b/decnet/swarm/log_listener.py new file mode 100644 index 0000000..b3b4b39 --- /dev/null +++ b/decnet/swarm/log_listener.py @@ -0,0 +1,194 @@ +"""Master-side syslog-over-TLS listener (RFC 5425). + +Accepts mTLS-authenticated worker connections on TCP 6514, reads +octet-counted frames, parses each as an RFC 5424 line, and appends it to +the master's local ingest log files. The existing log_ingestion_worker +tails those files and inserts records into the master repo — worker +provenance is embedded in the parsed record's ``source_worker`` field. + +Design: +* TLS is mandatory. 
No plaintext fallback. A peer without a CA-signed + cert is rejected at the TLS handshake; nothing gets past the kernel. +* The listener never trusts the syslog HOSTNAME field for provenance — + that's attacker-supplied from the decky. The authoritative source is + the peer cert's CN, which the CA controlled at enrollment. +* Dropped connections are fine — the worker's forwarder holds the + offset and resumes from the same byte on reconnect. +""" +from __future__ import annotations + +import asyncio +import json +import pathlib +import ssl +from dataclasses import dataclass +from typing import Optional + +from cryptography import x509 +from cryptography.hazmat.primitives import serialization +from cryptography.x509.oid import NameOID + +from decnet.logging import get_logger +from decnet.swarm import pki +from decnet.swarm.log_forwarder import read_frame + +log = get_logger("swarm.listener") + + +@dataclass(frozen=True) +class ListenerConfig: + log_path: pathlib.Path # master's RFC 5424 .log (forensic sink) + json_path: pathlib.Path # master's .json (ingester tails this) + bind_host: str = "0.0.0.0" # nosec B104 — listener must bind publicly + bind_port: int = 6514 + ca_dir: pathlib.Path = pki.DEFAULT_CA_DIR + + +# --------------------------------------------------------- TLS context + + +def build_listener_ssl_context(ca_dir: pathlib.Path) -> ssl.SSLContext: + """Server-side mTLS context: master presents its master cert; clients + must present a cert signed by the DECNET CA.""" + master_dir = ca_dir / "master" + ca_cert = master_dir / "ca.crt" + cert = master_dir / "worker.crt" # master re-uses the 'worker' bundle layout + key = master_dir / "worker.key" + for p in (ca_cert, cert, key): + if not p.exists(): + raise RuntimeError( + f"master identity missing at {master_dir} — call ensure_master_identity first" + ) + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ctx.load_cert_chain(certfile=str(cert), keyfile=str(key)) + 
ctx.load_verify_locations(cafile=str(ca_cert)) + ctx.verify_mode = ssl.CERT_REQUIRED + return ctx + + +# ---------------------------------------------------------- helpers + + +def peer_cn(ssl_object: Optional[ssl.SSLObject]) -> str: + """Extract the CN from the TLS peer certificate (worker provenance). + + Falls back to ``"unknown"`` on any parse error — we refuse to crash on + malformed cert DNs and instead tag the message for later inspection. + """ + if ssl_object is None: + return "unknown" + der = ssl_object.getpeercert(binary_form=True) + if der is None: + return "unknown" + try: + cert = x509.load_der_x509_certificate(der) + attrs = cert.subject.get_attributes_for_oid(NameOID.COMMON_NAME) + return attrs[0].value if attrs else "unknown" + except Exception: # nosec B110 — provenance is best-effort + return "unknown" + + +def fingerprint_from_ssl(ssl_object: Optional[ssl.SSLObject]) -> Optional[str]: + if ssl_object is None: + return None + der = ssl_object.getpeercert(binary_form=True) + if der is None: + return None + try: + cert = x509.load_der_x509_certificate(der) + return pki.fingerprint(cert.public_bytes(serialization.Encoding.PEM)) + except Exception: + return None + + +# --------------------------------------------------- per-connection handler + + +async def _handle_connection( + reader: asyncio.StreamReader, + writer: asyncio.StreamWriter, + cfg: ListenerConfig, +) -> None: + ssl_obj = writer.get_extra_info("ssl_object") + cn = peer_cn(ssl_obj) + peer = writer.get_extra_info("peername") + log.info("listener accepted worker=%s peer=%s", cn, peer) + + # Lazy import to avoid a circular dep if the collector pulls in logger setup. 
+ from decnet.collector.worker import parse_rfc5424 + + cfg.log_path.parent.mkdir(parents=True, exist_ok=True) + cfg.json_path.parent.mkdir(parents=True, exist_ok=True) + + try: + with open(cfg.log_path, "a", encoding="utf-8") as lf, open( + cfg.json_path, "a", encoding="utf-8" + ) as jf: + while True: + try: + frame = await read_frame(reader) + except asyncio.IncompleteReadError: + break + except ValueError as exc: + log.warning("listener bad frame worker=%s err=%s", cn, exc) + break + if frame is None: + break + if not frame: + continue + line = frame.decode("utf-8", errors="replace") + lf.write(line + "\n") + lf.flush() + parsed = parse_rfc5424(line) + if parsed is not None: + parsed["source_worker"] = cn + jf.write(json.dumps(parsed) + "\n") + jf.flush() + else: + log.debug("listener malformed RFC5424 worker=%s snippet=%r", cn, line[:80]) + except Exception as exc: + log.warning("listener connection error worker=%s err=%s", cn, exc) + finally: + writer.close() + try: + await writer.wait_closed() + except Exception: # nosec B110 — socket cleanup is best-effort + pass + log.info("listener closed worker=%s", cn) + + +# ---------------------------------------------------------------- server + + +async def run_listener( + cfg: ListenerConfig, + *, + stop_event: Optional[asyncio.Event] = None, +) -> None: + ctx = build_listener_ssl_context(cfg.ca_dir) + + async def _client_cb( + reader: asyncio.StreamReader, writer: asyncio.StreamWriter + ) -> None: + await _handle_connection(reader, writer, cfg) + + server = await asyncio.start_server( + _client_cb, host=cfg.bind_host, port=cfg.bind_port, ssl=ctx + ) + sockets = server.sockets or () + log.info( + "listener bound host=%s port=%d sockets=%d", + cfg.bind_host, cfg.bind_port, len(sockets), + ) + async with server: + if stop_event is None: + await server.serve_forever() + else: + serve_task = asyncio.create_task(server.serve_forever()) + await stop_event.wait() + server.close() + serve_task.cancel() + try: + await 
serve_task + except (asyncio.CancelledError, Exception): # nosec B110 + pass diff --git a/decnet/swarm/pki.py b/decnet/swarm/pki.py new file mode 100644 index 0000000..2a870e7 --- /dev/null +++ b/decnet/swarm/pki.py @@ -0,0 +1,323 @@ +"""DECNET SWARM PKI — self-managed X.509 CA for master↔worker mTLS. + +Used by: +* the SWARM controller (master) to issue per-worker server+client certs at + enrollment time, +* the agent (worker) to present its mTLS identity for both the control-plane + HTTPS endpoint and the syslog-over-TLS (RFC 5425) log forwarder, +* the master-side syslog-TLS listener to authenticate inbound workers. + +Storage layout (master): + + ~/.decnet/ca/ + ca.key (PEM, 0600 — the CA private key) + ca.crt (PEM — self-signed root) + workers// + client.crt (issued, signed by CA) + +Worker layout (delivered by /enroll response): + + ~/.decnet/agent/ + ca.crt (master's CA — trust anchor) + worker.key (worker's own private key) + worker.crt (signed by master CA — used for both TLS + server auth *and* syslog client auth) + +The CA is a hard dependency only in swarm mode; unihost installs never +touch this module. 
+""" +from __future__ import annotations + +import datetime as _dt +import hashlib +import ipaddress +import os +import pathlib +from dataclasses import dataclass +from typing import Optional + +from cryptography import x509 +from cryptography.hazmat.primitives import hashes, serialization +from cryptography.hazmat.primitives.asymmetric import rsa +from cryptography.x509.oid import NameOID + +DEFAULT_CA_DIR = pathlib.Path(os.path.expanduser("~/.decnet/ca")) +DEFAULT_AGENT_DIR = pathlib.Path(os.path.expanduser("~/.decnet/agent")) +DEFAULT_SWARMCTL_DIR = pathlib.Path(os.path.expanduser("~/.decnet/swarmctl")) + +CA_KEY_BITS = 4096 +WORKER_KEY_BITS = 2048 +CA_VALIDITY_DAYS = 3650 # 10 years — internal CA +WORKER_VALIDITY_DAYS = 825 # max permitted by modern TLS clients + + +@dataclass(frozen=True) +class CABundle: + """The master's CA identity (key is secret, cert is published).""" + + key_pem: bytes + cert_pem: bytes + + +@dataclass(frozen=True) +class IssuedCert: + """A signed worker certificate + its private key, handed to the worker + exactly once during enrollment. + """ + + key_pem: bytes + cert_pem: bytes + ca_cert_pem: bytes + fingerprint_sha256: str # hex, lowercase + + +# --------------------------------------------------------------------- CA ops + + +def _pem_private(key: rsa.RSAPrivateKey) -> bytes: + return key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), + ) + + +def _pem_cert(cert: x509.Certificate) -> bytes: + return cert.public_bytes(serialization.Encoding.PEM) + + +def generate_ca(common_name: str = "DECNET SWARM Root CA") -> CABundle: + """Generate a fresh self-signed CA. 
Does not touch disk.""" + key = rsa.generate_private_key(public_exponent=65537, key_size=CA_KEY_BITS) + subject = issuer = x509.Name( + [ + x509.NameAttribute(NameOID.COMMON_NAME, common_name), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, "DECNET"), + ] + ) + now = _dt.datetime.now(_dt.timezone.utc) + cert = ( + x509.CertificateBuilder() + .subject_name(subject) + .issuer_name(issuer) + .public_key(key.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(now - _dt.timedelta(minutes=5)) + .not_valid_after(now + _dt.timedelta(days=CA_VALIDITY_DAYS)) + .add_extension(x509.BasicConstraints(ca=True, path_length=0), critical=True) + .add_extension( + x509.KeyUsage( + digital_signature=True, + content_commitment=False, + key_encipherment=False, + data_encipherment=False, + key_agreement=False, + key_cert_sign=True, + crl_sign=True, + encipher_only=False, + decipher_only=False, + ), + critical=True, + ) + .sign(private_key=key, algorithm=hashes.SHA256()) + ) + return CABundle(key_pem=_pem_private(key), cert_pem=_pem_cert(cert)) + + +def save_ca(bundle: CABundle, ca_dir: pathlib.Path = DEFAULT_CA_DIR) -> None: + ca_dir.mkdir(parents=True, exist_ok=True) + # 0700 on the dir, 0600 on the key — defence against casual reads. 
+ os.chmod(ca_dir, 0o700) + key_path = ca_dir / "ca.key" + cert_path = ca_dir / "ca.crt" + key_path.write_bytes(bundle.key_pem) + os.chmod(key_path, 0o600) + cert_path.write_bytes(bundle.cert_pem) + + +def load_ca(ca_dir: pathlib.Path = DEFAULT_CA_DIR) -> CABundle: + key_pem = (ca_dir / "ca.key").read_bytes() + cert_pem = (ca_dir / "ca.crt").read_bytes() + return CABundle(key_pem=key_pem, cert_pem=cert_pem) + + +def ensure_ca(ca_dir: pathlib.Path = DEFAULT_CA_DIR) -> CABundle: + """Load the CA if present, otherwise generate and persist a new one.""" + if (ca_dir / "ca.key").exists() and (ca_dir / "ca.crt").exists(): + return load_ca(ca_dir) + bundle = generate_ca() + save_ca(bundle, ca_dir) + return bundle + + +# --------------------------------------------------------------- cert issuance + + +def _parse_san(value: str) -> x509.GeneralName: + """Parse a SAN entry as IP if possible, otherwise DNS.""" + try: + return x509.IPAddress(ipaddress.ip_address(value)) + except ValueError: + return x509.DNSName(value) + + +def issue_worker_cert( + ca: CABundle, + worker_name: str, + sans: list[str], + validity_days: int = WORKER_VALIDITY_DAYS, +) -> IssuedCert: + """Sign a freshly-generated worker keypair. + + The cert is usable as BOTH a TLS server (agent's HTTPS endpoint) and a + TLS client (syslog-over-TLS upstream to the master) — extended key usage + covers both. ``sans`` should include every address/name the master or + workers will use to reach this worker — typically the worker's IP plus + its hostname. 
+ """ + ca_key = serialization.load_pem_private_key(ca.key_pem, password=None) + ca_cert = x509.load_pem_x509_certificate(ca.cert_pem) + + worker_key = rsa.generate_private_key(public_exponent=65537, key_size=WORKER_KEY_BITS) + subject = x509.Name( + [ + x509.NameAttribute(NameOID.COMMON_NAME, worker_name), + x509.NameAttribute(NameOID.ORGANIZATION_NAME, "DECNET"), + x509.NameAttribute(NameOID.ORGANIZATIONAL_UNIT_NAME, "swarm-worker"), + ] + ) + now = _dt.datetime.now(_dt.timezone.utc) + san_entries: list[x509.GeneralName] = [_parse_san(s) for s in sans] if sans else [] + # Always include the worker-name as a DNS SAN so cert pinning by CN-as-DNS + # works even when the operator forgets to pass an explicit SAN list. + if not any( + isinstance(e, x509.DNSName) and e.value == worker_name for e in san_entries + ): + san_entries.append(x509.DNSName(worker_name)) + + builder = ( + x509.CertificateBuilder() + .subject_name(subject) + .issuer_name(ca_cert.subject) + .public_key(worker_key.public_key()) + .serial_number(x509.random_serial_number()) + .not_valid_before(now - _dt.timedelta(minutes=5)) + .not_valid_after(now + _dt.timedelta(days=validity_days)) + .add_extension(x509.BasicConstraints(ca=False, path_length=None), critical=True) + .add_extension( + x509.KeyUsage( + digital_signature=True, + content_commitment=False, + key_encipherment=True, + data_encipherment=False, + key_agreement=False, + key_cert_sign=False, + crl_sign=False, + encipher_only=False, + decipher_only=False, + ), + critical=True, + ) + .add_extension( + x509.ExtendedKeyUsage( + [ + x509.ObjectIdentifier("1.3.6.1.5.5.7.3.1"), # serverAuth + x509.ObjectIdentifier("1.3.6.1.5.5.7.3.2"), # clientAuth + ] + ), + critical=True, + ) + .add_extension(x509.SubjectAlternativeName(san_entries), critical=False) + ) + cert = builder.sign(private_key=ca_key, algorithm=hashes.SHA256()) + cert_pem = _pem_cert(cert) + fp = hashlib.sha256( + cert.public_bytes(serialization.Encoding.DER) + ).hexdigest() + return 
IssuedCert( + key_pem=_pem_private(worker_key), + cert_pem=cert_pem, + ca_cert_pem=ca.cert_pem, + fingerprint_sha256=fp, + ) + + +def write_worker_bundle( + issued: IssuedCert, + agent_dir: pathlib.Path = DEFAULT_AGENT_DIR, +) -> None: + """Persist an issued bundle into the worker's agent directory.""" + agent_dir.mkdir(parents=True, exist_ok=True) + os.chmod(agent_dir, 0o700) + (agent_dir / "ca.crt").write_bytes(issued.ca_cert_pem) + (agent_dir / "worker.crt").write_bytes(issued.cert_pem) + key_path = agent_dir / "worker.key" + key_path.write_bytes(issued.key_pem) + os.chmod(key_path, 0o600) + + +def load_worker_bundle( + agent_dir: pathlib.Path = DEFAULT_AGENT_DIR, +) -> Optional[IssuedCert]: + """Return the worker's bundle if enrolled; ``None`` otherwise.""" + ca = agent_dir / "ca.crt" + crt = agent_dir / "worker.crt" + key = agent_dir / "worker.key" + if not (ca.exists() and crt.exists() and key.exists()): + return None + cert_pem = crt.read_bytes() + cert = x509.load_pem_x509_certificate(cert_pem) + fp = hashlib.sha256( + cert.public_bytes(serialization.Encoding.DER) + ).hexdigest() + return IssuedCert( + key_pem=key.read_bytes(), + cert_pem=cert_pem, + ca_cert_pem=ca.read_bytes(), + fingerprint_sha256=fp, + ) + + +def ensure_swarmctl_cert( + bind_host: str, + ca_dir: pathlib.Path = DEFAULT_CA_DIR, + swarmctl_dir: pathlib.Path = DEFAULT_SWARMCTL_DIR, + extra_sans: Optional[list[str]] = None, +) -> tuple[pathlib.Path, pathlib.Path, pathlib.Path]: + """Return (cert_path, key_path, ca_path), auto-issuing if missing. + + Uses the existing DECNET CA (ensuring it exists first) so workers + whose bundle already includes ``ca.crt`` can verify the swarmctl + endpoint without additional trust configuration. Self-signed is + intentionally not the default — a cert signed by the same CA the + workers already trust is the friction-free path. + + Callers that want BYOC should skip this and pass their own + cert/key paths directly to uvicorn. 
+ """ + swarmctl_dir.mkdir(parents=True, exist_ok=True) + os.chmod(swarmctl_dir, 0o700) + cert_path = swarmctl_dir / "server.crt" + key_path = swarmctl_dir / "server.key" + ca_cert_path = ca_dir / "ca.crt" + + if cert_path.exists() and key_path.exists() and ca_cert_path.exists(): + return cert_path, key_path, ca_cert_path + + ca = ensure_ca(ca_dir) + sans = list({bind_host, "127.0.0.1", "localhost", *(extra_sans or [])}) + issued = issue_worker_cert(ca, "swarmctl", sans) + cert_path.write_bytes(issued.cert_pem) + key_path.write_bytes(issued.key_pem) + os.chmod(key_path, 0o600) + # ensure_ca already wrote ca.crt under ca_dir, but save_ca is only + # called on generate — re-mirror it here to guarantee the path exists. + if not ca_cert_path.exists(): + ca_cert_path.write_bytes(ca.cert_pem) + return cert_path, key_path, ca_cert_path + + +def fingerprint(cert_pem: bytes) -> str: + """SHA-256 hex fingerprint of a cert (DER-encoded).""" + cert = x509.load_pem_x509_certificate(cert_pem) + return hashlib.sha256(cert.public_bytes(serialization.Encoding.DER)).hexdigest() diff --git a/decnet/swarm/tar_tree.py b/decnet/swarm/tar_tree.py new file mode 100644 index 0000000..ab5b7b9 --- /dev/null +++ b/decnet/swarm/tar_tree.py @@ -0,0 +1,97 @@ +"""Build a gzipped tarball of the master's working tree for pushing to workers. + +Always excludes the obvious large / secret / churn paths: ``.venv/``, +``__pycache__/``, ``.git/``, ``wiki-checkout/``, ``*.db*``, ``*.log``. The +caller can supply additional exclude globs. + +Deliberately does NOT invoke git — the tree is what the operator has on +disk (staged + unstaged + untracked). That's the whole point; the scp +workflow we're replacing also shipped the live tree. 
+""" +from __future__ import annotations + +import fnmatch +import io +import pathlib +import tarfile +from typing import Iterable, Optional + +DEFAULT_EXCLUDES = ( + ".venv", ".venv/*", + "**/.venv/*", + "__pycache__", "**/__pycache__", "**/__pycache__/*", + ".git", ".git/*", + "wiki-checkout", "wiki-checkout/*", + "*.pyc", "*.pyo", + "*.db", "*.db-wal", "*.db-shm", + "*.log", + ".pytest_cache", ".pytest_cache/*", + ".mypy_cache", ".mypy_cache/*", + ".tox", ".tox/*", + "*.egg-info", "*.egg-info/*", + "decnet-state.json", + "master.log", "master.json", + "decnet.db*", +) + + +def _is_excluded(rel: str, patterns: Iterable[str]) -> bool: + parts = pathlib.PurePosixPath(rel).parts + for pat in patterns: + if fnmatch.fnmatch(rel, pat): + return True + # Also match the pattern against every leading subpath — this is + # what catches nested `.venv/...` without forcing callers to spell + # out every `**/` glob. + for i in range(1, len(parts) + 1): + if fnmatch.fnmatch("/".join(parts[:i]), pat): + return True + return False + + +def tar_working_tree( + root: pathlib.Path, + extra_excludes: Optional[Iterable[str]] = None, +) -> bytes: + """Return the gzipped tarball bytes of ``root``. + + Entries are added with paths relative to ``root`` (no leading ``/``, + no ``..``). The updater rejects unsafe paths on the receiving side. + """ + patterns = list(DEFAULT_EXCLUDES) + list(extra_excludes or ()) + buf = io.BytesIO() + + with tarfile.open(fileobj=buf, mode="w:gz") as tar: + for path in sorted(root.rglob("*")): + rel = path.relative_to(root).as_posix() + if _is_excluded(rel, patterns): + continue + if path.is_symlink(): + # Symlinks inside a repo tree are rare and often break + # portability; skip them rather than ship dangling links. + continue + if path.is_dir(): + continue + tar.add(path, arcname=rel, recursive=False) + + return buf.getvalue() + + +def detect_git_sha(root: pathlib.Path) -> str: + """Best-effort ``HEAD`` sha. 
Returns ``""`` if not a git repo.""" + head = root / ".git" / "HEAD" + if not head.is_file(): + return "" + try: + ref = head.read_text().strip() + except OSError: + return "" + if ref.startswith("ref: "): + ref_path = root / ".git" / ref[5:] + if ref_path.is_file(): + try: + return ref_path.read_text().strip() + except OSError: + return "" + return "" + return ref diff --git a/decnet/swarm/updater_client.py b/decnet/swarm/updater_client.py new file mode 100644 index 0000000..753c558 --- /dev/null +++ b/decnet/swarm/updater_client.py @@ -0,0 +1,124 @@ +"""Master-side HTTP client for the worker's self-updater daemon. + +Sibling of ``AgentClient``: same mTLS identity (same DECNET CA, same +master client cert) but targets the updater's port (default 8766) and +speaks the multipart upload protocol the updater's ``/update`` endpoint +expects. + +Kept as its own module — not a subclass of ``AgentClient`` — because the +timeouts and failure semantics are genuinely different: pip install + +agent probe can take a minute on a slow VM, and ``/update-self`` drops +the connection on purpose (the updater re-execs itself mid-response). 
+""" +from __future__ import annotations + +import ssl +from typing import Any, Optional + +import httpx + +from decnet.logging import get_logger +from decnet.swarm.client import MasterIdentity, ensure_master_identity + +log = get_logger("swarm.updater_client") + +_TIMEOUT_UPDATE = httpx.Timeout(connect=10.0, read=180.0, write=120.0, pool=5.0) +_TIMEOUT_CONTROL = httpx.Timeout(connect=5.0, read=30.0, write=10.0, pool=5.0) + + +class UpdaterClient: + """Async client targeting a worker's ``decnet updater`` daemon.""" + + def __init__( + self, + host: dict[str, Any] | None = None, + *, + address: Optional[str] = None, + updater_port: int = 8766, + identity: Optional[MasterIdentity] = None, + ): + if host is not None: + self._address = host["address"] + self._host_name = host.get("name") + else: + if address is None: + raise ValueError("UpdaterClient requires host dict or address") + self._address = address + self._host_name = None + self._port = updater_port + self._identity = identity or ensure_master_identity() + self._client: Optional[httpx.AsyncClient] = None + + def _build_client(self, timeout: httpx.Timeout) -> httpx.AsyncClient: + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ctx.load_cert_chain( + str(self._identity.cert_path), str(self._identity.key_path), + ) + ctx.load_verify_locations(cafile=str(self._identity.ca_cert_path)) + ctx.verify_mode = ssl.CERT_REQUIRED + ctx.check_hostname = False + return httpx.AsyncClient( + base_url=f"https://{self._address}:{self._port}", + verify=ctx, + timeout=timeout, + ) + + async def __aenter__(self) -> "UpdaterClient": + self._client = self._build_client(_TIMEOUT_CONTROL) + return self + + async def __aexit__(self, *exc: Any) -> None: + if self._client: + await self._client.aclose() + self._client = None + + def _require(self) -> httpx.AsyncClient: + if self._client is None: + raise RuntimeError("UpdaterClient used outside `async with` block") + return self._client + + # 
--------------------------------------------------------------- RPCs + + async def health(self) -> dict[str, Any]: + r = await self._require().get("/health") + r.raise_for_status() + return r.json() + + async def releases(self) -> dict[str, Any]: + r = await self._require().get("/releases") + r.raise_for_status() + return r.json() + + async def update(self, tarball: bytes, sha: str = "") -> httpx.Response: + """POST /update. Returns the Response so the caller can distinguish + 200 / 409 / 500 — each means something different. + """ + self._require().timeout = _TIMEOUT_UPDATE + try: + r = await self._require().post( + "/update", + files={"tarball": ("tree.tgz", tarball, "application/gzip")}, + data={"sha": sha}, + ) + finally: + self._require().timeout = _TIMEOUT_CONTROL + return r + + async def update_self(self, tarball: bytes, sha: str = "") -> httpx.Response: + """POST /update-self. The updater re-execs itself, so the connection + usually drops mid-response; that's not an error. Callers should then + poll /health until the new SHA appears. + """ + self._require().timeout = _TIMEOUT_UPDATE + try: + r = await self._require().post( + "/update-self", + files={"tarball": ("tree.tgz", tarball, "application/gzip")}, + data={"sha": sha, "confirm_self": "true"}, + ) + finally: + self._require().timeout = _TIMEOUT_CONTROL + return r + + async def rollback(self) -> httpx.Response: + return await self._require().post("/rollback") diff --git a/decnet/telemetry.py b/decnet/telemetry.py new file mode 100644 index 0000000..042440c --- /dev/null +++ b/decnet/telemetry.py @@ -0,0 +1,308 @@ +""" +DECNET OpenTelemetry tracing integration. + +Controlled entirely by ``DECNET_DEVELOPER_TRACING``. When disabled (the +default), every public export is a zero-cost no-op: no OTEL SDK imports, no +monkey-patching, no middleware, and ``@traced`` returns the original function +object unwrapped. 
+""" + +from __future__ import annotations + +import asyncio +import functools +import inspect +from typing import Any, Callable, TypeVar, overload + +from decnet.env import DECNET_DEVELOPER_TRACING, DECNET_OTEL_ENDPOINT +from decnet.logging import get_logger + +log = get_logger("api") + +F = TypeVar("F", bound=Callable[..., Any]) + +_ENABLED: bool = DECNET_DEVELOPER_TRACING + +# --------------------------------------------------------------------------- +# Lazy OTEL imports — only when tracing is enabled +# --------------------------------------------------------------------------- + +_tracer_provider: Any = None # TracerProvider | None + + +def _init_provider() -> None: + """Initialise the global TracerProvider (called once from setup_tracing).""" + global _tracer_provider + + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + from opentelemetry.sdk.resources import Resource + + resource = Resource.create({ + "service.name": "decnet", + "service.version": "0.2.0", + }) + _tracer_provider = TracerProvider(resource=resource) + exporter = OTLPSpanExporter(endpoint=DECNET_OTEL_ENDPOINT, insecure=True) + _tracer_provider.add_span_processor(BatchSpanProcessor(exporter)) + trace.set_tracer_provider(_tracer_provider) + log.info("OTEL tracing enabled endpoint=%s", DECNET_OTEL_ENDPOINT) + + +def setup_tracing(app: Any) -> None: + """Configure the OTEL TracerProvider and instrument FastAPI. + + Call once from the FastAPI lifespan, after DB init. No-op when + ``DECNET_DEVELOPER_TRACING`` is not ``"true"``. 
+ """ + if not _ENABLED: + return + + try: + _init_provider() + from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor + FastAPIInstrumentor.instrument_app(app) + from decnet.logging import enable_trace_context + enable_trace_context() + log.info("FastAPI auto-instrumentation active, log-trace correlation enabled") + except Exception as exc: + log.warning("OTEL setup failed — continuing without tracing: %s", exc) + + +def shutdown_tracing() -> None: + """Flush and shut down the tracer provider. Safe to call when disabled.""" + if _tracer_provider is not None: + try: + _tracer_provider.shutdown() + except Exception: # nosec B110 — best-effort tracer shutdown + pass + + +# --------------------------------------------------------------------------- +# get_tracer — mirrors get_logger(component) pattern +# --------------------------------------------------------------------------- + +class _NoOpSpan: + """Minimal stand-in so ``with get_tracer(...).start_as_current_span(...)`` + works when tracing is disabled.""" + + def set_attribute(self, key: str, value: Any) -> None: + pass + + def set_status(self, *args: Any, **kwargs: Any) -> None: + pass + + def record_exception(self, exc: BaseException) -> None: + pass + + def __enter__(self) -> "_NoOpSpan": + return self + + def __exit__(self, *args: Any) -> None: + pass + + +class _NoOpTracer: + """Returned by ``get_tracer()`` when tracing is disabled.""" + + def start_as_current_span(self, name: str, **kwargs: Any) -> _NoOpSpan: + return _NoOpSpan() + + def start_span(self, name: str, **kwargs: Any) -> _NoOpSpan: + return _NoOpSpan() + + +_tracers: dict[str, Any] = {} + + +def get_tracer(component: str) -> Any: + """Return an OTEL Tracer (or a no-op stand-in) for *component*.""" + if not _ENABLED: + return _NoOpTracer() + + if component not in _tracers: + from opentelemetry import trace + _tracers[component] = trace.get_tracer(f"decnet.{component}") + return _tracers[component] + + +# 
--------------------------------------------------------------------------- +# @traced decorator — async + sync, zero overhead when disabled +# --------------------------------------------------------------------------- + +@overload +def traced(fn: F) -> F: ... +@overload +def traced(name: str) -> Callable[[F], F]: ... + + +def traced(fn: Any = None, *, name: str | None = None) -> Any: + """Decorator that wraps a function in an OTEL span. + + Usage:: + + @traced # span name = "module.func" + async def my_worker(): ... + + @traced("custom.span.name") # explicit span name + def my_sync_func(): ... + + When ``DECNET_DEVELOPER_TRACING`` is disabled the original function is + returned **unwrapped** — zero overhead on every call. + """ + # Handle @traced("name") vs @traced vs @traced(name="name") + if fn is None and name is not None: + # Called as @traced("name") or @traced(name="name") + def decorator(f: F) -> F: + return _wrap(f, name) + return decorator + if fn is not None and isinstance(fn, str): + # Called as @traced("name") — fn is actually the name string + span_name = fn + def decorator(f: F) -> F: + return _wrap(f, span_name) + return decorator + if fn is not None and callable(fn): + # Called as @traced (no arguments) + return _wrap(fn, None) + # Fallback: @traced() with no args + def decorator(f: F) -> F: + return _wrap(f, name) + return decorator + + +def _wrap(fn: F, span_name: str | None) -> F: + """Wrap *fn* in a span. 
Returns *fn* unchanged when tracing is off.""" + if not _ENABLED: + return fn + + resolved_name = span_name or f"{fn.__module__.rsplit('.', 1)[-1]}.{fn.__qualname__}" + + if inspect.iscoroutinefunction(fn): + @functools.wraps(fn) + async def async_wrapper(*args: Any, **kwargs: Any) -> Any: + tracer = get_tracer(fn.__module__.split(".")[-1]) + with tracer.start_as_current_span(resolved_name) as span: + try: + result = await fn(*args, **kwargs) + return result + except Exception as exc: + span.record_exception(exc) + raise + return async_wrapper # type: ignore[return-value] + else: + @functools.wraps(fn) + def sync_wrapper(*args: Any, **kwargs: Any) -> Any: + tracer = get_tracer(fn.__module__.split(".")[-1]) + with tracer.start_as_current_span(resolved_name) as span: + try: + result = fn(*args, **kwargs) + return result + except Exception as exc: + span.record_exception(exc) + raise + return sync_wrapper # type: ignore[return-value] + + +# --------------------------------------------------------------------------- +# TracedRepository — proxy wrapper for BaseRepository +# --------------------------------------------------------------------------- + +def wrap_repository(repo: Any) -> Any: + """Wrap *repo* in a dynamic tracing proxy. Returns *repo* unchanged when disabled. + + Instead of mirroring every method signature (which drifts when concrete + repos add extra kwargs beyond the ABC), this proxy introspects the inner + repo at construction time and wraps every public async method in a span + via ``__getattr__``. Sync attributes are forwarded directly. 
+ """ + if not _ENABLED: + return repo + + tracer = get_tracer("db") + + class TracedRepository: + """Dynamic proxy — wraps every async method call in a DB span.""" + + def __init__(self, inner: Any) -> None: + self._inner = inner + + def __getattr__(self, name: str) -> Any: + attr = getattr(self._inner, name) + + if asyncio.iscoroutinefunction(attr): + @functools.wraps(attr) + async def _traced_method(*args: Any, **kwargs: Any) -> Any: + with tracer.start_as_current_span(f"db.{name}") as span: + try: + return await attr(*args, **kwargs) + except Exception as exc: + span.record_exception(exc) + raise + return _traced_method + + return attr + + return TracedRepository(repo) + + +# --------------------------------------------------------------------------- +# Cross-stage trace context propagation +# --------------------------------------------------------------------------- +# The DECNET pipeline is decoupled via JSON files: +# collector -> .json file -> ingester -> DB -> profiler +# +# To show the full journey of an event in Jaeger, we embed W3C trace context +# into the JSON records. The collector injects it; the ingester extracts it +# and continues the trace as a child span. + +def inject_context(record: dict[str, Any]) -> None: + """Inject current OTEL trace context into *record* under ``_trace``. + + No-op when tracing is disabled. The ``_trace`` key is stripped by the + ingester after extraction — it never reaches the DB. + """ + if not _ENABLED: + return + try: + from opentelemetry.propagate import inject + carrier: dict[str, str] = {} + inject(carrier) + if carrier: + record["_trace"] = carrier + except Exception: # nosec B110 — trace injection is optional + pass + + +def extract_context(record: dict[str, Any]) -> Any: + """Extract OTEL trace context from *record* and return it. + + Returns ``None`` when tracing is disabled or no context is present. + Removes the ``_trace`` key from the record so it doesn't leak into the DB. 
+ """ + if not _ENABLED: + record.pop("_trace", None) + return None + try: + carrier = record.pop("_trace", None) + if not carrier: + return None + from opentelemetry.propagate import extract + return extract(carrier) + except Exception: + return None + + +def start_span_with_context(tracer: Any, name: str, context: Any = None) -> Any: + """Start a span, optionally as a child of an extracted context. + + Returns a context manager span. When *context* is ``None``, creates a + root span (normal behavior). + """ + if not _ENABLED: + return _NoOpSpan() + if context is not None: + return tracer.start_as_current_span(name, context=context) + return tracer.start_as_current_span(name) diff --git a/templates/conpot/Dockerfile b/decnet/templates/conpot/Dockerfile similarity index 85% rename from templates/conpot/Dockerfile rename to decnet/templates/conpot/Dockerfile index 6bfad6d..1d3bb3e 100644 --- a/templates/conpot/Dockerfile +++ b/decnet/templates/conpot/Dockerfile @@ -11,16 +11,16 @@ RUN find /opt /usr /etc /home -name "*.xml" -exec sed -i 's/port="5020"/port="50 RUN (apt-get update && apt-get install -y --no-install-recommends libcap2-bin 2>/dev/null) || (apk add --no-cache libcap 2>/dev/null) || true RUN find /home/conpot/.local/bin /usr /opt -type f -name 'python*' -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true -# Bridge conpot's own logger into DECNET's RFC 5424 syslog pipeline. +# Bridge conpot's own logger into syslog-relay's RFC 5424 syslog pipeline. # entrypoint.py is self-contained (inlines the formatter) because the # conpot base image runs Python 3.6, which cannot import the shared -# decnet_logging.py (that file uses 3.9+ / 3.10+ type syntax). +# syslog_bridge.py (that file uses 3.9+ / 3.10+ type syntax). COPY entrypoint.py /home/conpot/entrypoint.py RUN chown conpot:conpot /home/conpot/entrypoint.py \ && chmod +x /home/conpot/entrypoint.py # The upstream image already runs as non-root 'conpot'. 
-# We do NOT switch to a 'decnet' user — doing so breaks pkg_resources +# We do NOT switch to a 'logrelay' user — doing so breaks pkg_resources # because conpot's eggs live under /home/conpot/.local and are only on # the Python path for that user. USER conpot diff --git a/templates/conpot/entrypoint.py b/decnet/templates/conpot/entrypoint.py similarity index 97% rename from templates/conpot/entrypoint.py rename to decnet/templates/conpot/entrypoint.py index 534eeb0..59b9b99 100644 --- a/templates/conpot/entrypoint.py +++ b/decnet/templates/conpot/entrypoint.py @@ -3,7 +3,7 @@ Entrypoint wrapper for the Conpot ICS/SCADA honeypot. Launches conpot as a child process and bridges its log output into the -DECNET structured syslog pipeline. Each line from conpot stdout/stderr +syslog-relay structured syslog pipeline. Each line from conpot stdout/stderr is classified and emitted as an RFC 5424 syslog line so the host-side collector can ingest it alongside every other service. @@ -21,7 +21,7 @@ from datetime import datetime, timezone # ── RFC 5424 inline formatter (Python 3.6-compatible) ───────────────────────── _FACILITY_LOCAL0 = 16 -_SD_ID = "decnet@55555" +_SD_ID = "relay@55555" _NILVALUE = "-" SEVERITY_INFO = 6 diff --git a/templates/conpot/decnet_logging.py b/decnet/templates/conpot/syslog_bridge.py similarity index 84% rename from templates/conpot/decnet_logging.py rename to decnet/templates/conpot/syslog_bridge.py index 5a09505..c0a78d0 100644 --- a/templates/conpot/decnet_logging.py +++ b/decnet/templates/conpot/syslog_bridge.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 """ -Shared RFC 5424 syslog helper for DECNET service templates. +Shared RFC 5424 syslog helper used by service containers. Services call syslog_line() to format an RFC 5424 message, then -write_syslog_file() to emit it to stdout — Docker captures it, and the -host-side collector streams it into the log file. 
+write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. RFC 5424 structure: 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG -Facility: local0 (16), PEN for SD element ID: decnet@55555 +Facility: local0 (16). SD element ID uses PEN 55555. """ from datetime import datetime, timezone @@ -18,7 +18,7 @@ from typing import Any # ─── Constants ──────────────────────────────────────────────────────────────── _FACILITY_LOCAL0 = 16 -_SD_ID = "decnet@55555" +_SD_ID = "relay@55555" _NILVALUE = "-" SEVERITY_EMERG = 0 @@ -62,7 +62,7 @@ def syslog_line( Args: service: APP-NAME (e.g. "http", "mysql") - hostname: HOSTNAME (decky node name) + hostname: HOSTNAME (node name) event_type: MSGID (e.g. "request", "login_attempt") severity: Syslog severity integer (default: INFO=6) timestamp: UTC datetime; defaults to now @@ -80,10 +80,10 @@ def syslog_line( def write_syslog_file(line: str) -> None: - """Emit a syslog line to stdout for Docker log capture.""" + """Emit a syslog line to stdout for container log capture.""" print(line, flush=True) def forward_syslog(line: str, log_target: str) -> None: - """No-op stub. TCP forwarding is now handled by rsyslog, not by service containers.""" + """No-op stub. 
TCP forwarding is handled by rsyslog, not by service containers.""" pass diff --git a/templates/cowrie/Dockerfile b/decnet/templates/cowrie/Dockerfile similarity index 91% rename from templates/cowrie/Dockerfile rename to decnet/templates/cowrie/Dockerfile index 9e7ce84..c8f0fba 100644 --- a/templates/cowrie/Dockerfile +++ b/decnet/templates/cowrie/Dockerfile @@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ git authbind \ && rm -rf /var/lib/apt/lists/* -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -18,5 +18,5 @@ RUN chmod +x /entrypoint.sh HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/cowrie/cowrie.cfg.j2 b/decnet/templates/cowrie/cowrie.cfg.j2 similarity index 100% rename from templates/cowrie/cowrie.cfg.j2 rename to decnet/templates/cowrie/cowrie.cfg.j2 diff --git a/templates/cowrie/entrypoint.sh b/decnet/templates/cowrie/entrypoint.sh similarity index 100% rename from templates/cowrie/entrypoint.sh rename to decnet/templates/cowrie/entrypoint.sh diff --git a/templates/cowrie/honeyfs/etc/group b/decnet/templates/cowrie/honeyfs/etc/group similarity index 100% rename from templates/cowrie/honeyfs/etc/group rename to decnet/templates/cowrie/honeyfs/etc/group diff --git a/templates/cowrie/honeyfs/etc/hostname b/decnet/templates/cowrie/honeyfs/etc/hostname similarity index 100% rename from templates/cowrie/honeyfs/etc/hostname rename to decnet/templates/cowrie/honeyfs/etc/hostname diff --git a/templates/cowrie/honeyfs/etc/hosts b/decnet/templates/cowrie/honeyfs/etc/hosts similarity index 100% rename from 
templates/cowrie/honeyfs/etc/hosts rename to decnet/templates/cowrie/honeyfs/etc/hosts diff --git a/templates/cowrie/honeyfs/etc/issue b/decnet/templates/cowrie/honeyfs/etc/issue similarity index 100% rename from templates/cowrie/honeyfs/etc/issue rename to decnet/templates/cowrie/honeyfs/etc/issue diff --git a/templates/cowrie/honeyfs/etc/issue.net b/decnet/templates/cowrie/honeyfs/etc/issue.net similarity index 100% rename from templates/cowrie/honeyfs/etc/issue.net rename to decnet/templates/cowrie/honeyfs/etc/issue.net diff --git a/templates/cowrie/honeyfs/etc/motd b/decnet/templates/cowrie/honeyfs/etc/motd similarity index 100% rename from templates/cowrie/honeyfs/etc/motd rename to decnet/templates/cowrie/honeyfs/etc/motd diff --git a/templates/cowrie/honeyfs/etc/os-release b/decnet/templates/cowrie/honeyfs/etc/os-release similarity index 100% rename from templates/cowrie/honeyfs/etc/os-release rename to decnet/templates/cowrie/honeyfs/etc/os-release diff --git a/templates/cowrie/honeyfs/etc/passwd b/decnet/templates/cowrie/honeyfs/etc/passwd similarity index 100% rename from templates/cowrie/honeyfs/etc/passwd rename to decnet/templates/cowrie/honeyfs/etc/passwd diff --git a/templates/cowrie/honeyfs/etc/resolv.conf b/decnet/templates/cowrie/honeyfs/etc/resolv.conf similarity index 100% rename from templates/cowrie/honeyfs/etc/resolv.conf rename to decnet/templates/cowrie/honeyfs/etc/resolv.conf diff --git a/templates/cowrie/honeyfs/etc/shadow b/decnet/templates/cowrie/honeyfs/etc/shadow similarity index 100% rename from templates/cowrie/honeyfs/etc/shadow rename to decnet/templates/cowrie/honeyfs/etc/shadow diff --git a/templates/cowrie/honeyfs/home/admin/.aws/credentials b/decnet/templates/cowrie/honeyfs/home/admin/.aws/credentials similarity index 100% rename from templates/cowrie/honeyfs/home/admin/.aws/credentials rename to decnet/templates/cowrie/honeyfs/home/admin/.aws/credentials diff --git a/templates/cowrie/honeyfs/home/admin/.bash_history 
b/decnet/templates/cowrie/honeyfs/home/admin/.bash_history similarity index 100% rename from templates/cowrie/honeyfs/home/admin/.bash_history rename to decnet/templates/cowrie/honeyfs/home/admin/.bash_history diff --git a/templates/cowrie/honeyfs/home/admin/.ssh/authorized_keys b/decnet/templates/cowrie/honeyfs/home/admin/.ssh/authorized_keys similarity index 100% rename from templates/cowrie/honeyfs/home/admin/.ssh/authorized_keys rename to decnet/templates/cowrie/honeyfs/home/admin/.ssh/authorized_keys diff --git a/templates/cowrie/honeyfs/root/.bash_history b/decnet/templates/cowrie/honeyfs/root/.bash_history similarity index 100% rename from templates/cowrie/honeyfs/root/.bash_history rename to decnet/templates/cowrie/honeyfs/root/.bash_history diff --git a/templates/cowrie/honeyfs/var/log/auth.log b/decnet/templates/cowrie/honeyfs/var/log/auth.log similarity index 100% rename from templates/cowrie/honeyfs/var/log/auth.log rename to decnet/templates/cowrie/honeyfs/var/log/auth.log diff --git a/templates/docker_api/Dockerfile b/decnet/templates/docker_api/Dockerfile similarity index 87% rename from templates/docker_api/Dockerfile rename to decnet/templates/docker_api/Dockerfile index f67a0c7..61e09d5 100644 --- a/templates/docker_api/Dockerfile +++ b/decnet/templates/docker_api/Dockerfile @@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN pip3 install --no-cache-dir flask -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 2375 2376 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) 
@@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/docker_api/entrypoint.sh b/decnet/templates/docker_api/entrypoint.sh similarity index 100% rename from templates/docker_api/entrypoint.sh rename to decnet/templates/docker_api/entrypoint.sh diff --git a/templates/docker_api/server.py b/decnet/templates/docker_api/server.py similarity index 96% rename from templates/docker_api/server.py rename to decnet/templates/docker_api/server.py index 594a185..03d4961 100644 --- a/templates/docker_api/server.py +++ b/decnet/templates/docker_api/server.py @@ -10,7 +10,7 @@ import json import os from flask import Flask, request -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "docker-host") SERVICE_NAME = "docker_api" @@ -62,7 +62,6 @@ _CONTAINERS = [ def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/templates/docker_api/decnet_logging.py b/decnet/templates/docker_api/syslog_bridge.py similarity index 84% rename from templates/docker_api/decnet_logging.py rename to decnet/templates/docker_api/syslog_bridge.py index 5a09505..c0a78d0 100644 --- a/templates/docker_api/decnet_logging.py +++ b/decnet/templates/docker_api/syslog_bridge.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 """ -Shared RFC 5424 syslog helper for DECNET service templates. +Shared RFC 5424 syslog helper used by service containers. 
Services call syslog_line() to format an RFC 5424 message, then -write_syslog_file() to emit it to stdout — Docker captures it, and the -host-side collector streams it into the log file. +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. RFC 5424 structure: 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG -Facility: local0 (16), PEN for SD element ID: decnet@55555 +Facility: local0 (16). SD element ID uses PEN 55555. """ from datetime import datetime, timezone @@ -18,7 +18,7 @@ from typing import Any # ─── Constants ──────────────────────────────────────────────────────────────── _FACILITY_LOCAL0 = 16 -_SD_ID = "decnet@55555" +_SD_ID = "relay@55555" _NILVALUE = "-" SEVERITY_EMERG = 0 @@ -62,7 +62,7 @@ def syslog_line( Args: service: APP-NAME (e.g. "http", "mysql") - hostname: HOSTNAME (decky node name) + hostname: HOSTNAME (node name) event_type: MSGID (e.g. "request", "login_attempt") severity: Syslog severity integer (default: INFO=6) timestamp: UTC datetime; defaults to now @@ -80,10 +80,10 @@ def syslog_line( def write_syslog_file(line: str) -> None: - """Emit a syslog line to stdout for Docker log capture.""" + """Emit a syslog line to stdout for container log capture.""" print(line, flush=True) def forward_syslog(line: str, log_target: str) -> None: - """No-op stub. TCP forwarding is now handled by rsyslog, not by service containers.""" + """No-op stub. 
TCP forwarding is handled by rsyslog, not by service containers.""" pass diff --git a/templates/elasticsearch/Dockerfile b/decnet/templates/elasticsearch/Dockerfile similarity index 85% rename from templates/elasticsearch/Dockerfile rename to decnet/templates/elasticsearch/Dockerfile index a2d952f..5dca7b8 100644 --- a/templates/elasticsearch/Dockerfile +++ b/decnet/templates/elasticsearch/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 9200 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/elasticsearch/entrypoint.sh b/decnet/templates/elasticsearch/entrypoint.sh similarity index 100% rename from templates/elasticsearch/entrypoint.sh rename to decnet/templates/elasticsearch/entrypoint.sh diff --git a/templates/elasticsearch/server.py b/decnet/templates/elasticsearch/server.py similarity index 97% rename from templates/elasticsearch/server.py rename to decnet/templates/elasticsearch/server.py index 4b0ea84..e65ee4c 100644 --- a/templates/elasticsearch/server.py +++ b/decnet/templates/elasticsearch/server.py @@ -8,7 +8,7 @@ as JSON. Designed to attract automated scanners and credential stuffers. 
import json import os from http.server import BaseHTTPRequestHandler, HTTPServer -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "esserver") SERVICE_NAME = "elasticsearch" @@ -40,7 +40,6 @@ _ROOT_RESPONSE = { def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/templates/decnet_logging.py b/decnet/templates/elasticsearch/syslog_bridge.py similarity index 84% rename from templates/decnet_logging.py rename to decnet/templates/elasticsearch/syslog_bridge.py index 5a09505..c0a78d0 100644 --- a/templates/decnet_logging.py +++ b/decnet/templates/elasticsearch/syslog_bridge.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 """ -Shared RFC 5424 syslog helper for DECNET service templates. +Shared RFC 5424 syslog helper used by service containers. Services call syslog_line() to format an RFC 5424 message, then -write_syslog_file() to emit it to stdout — Docker captures it, and the -host-side collector streams it into the log file. +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. RFC 5424 structure: 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG -Facility: local0 (16), PEN for SD element ID: decnet@55555 +Facility: local0 (16). SD element ID uses PEN 55555. """ from datetime import datetime, timezone @@ -18,7 +18,7 @@ from typing import Any # ─── Constants ──────────────────────────────────────────────────────────────── _FACILITY_LOCAL0 = 16 -_SD_ID = "decnet@55555" +_SD_ID = "relay@55555" _NILVALUE = "-" SEVERITY_EMERG = 0 @@ -62,7 +62,7 @@ def syslog_line( Args: service: APP-NAME (e.g. 
"http", "mysql") - hostname: HOSTNAME (decky node name) + hostname: HOSTNAME (node name) event_type: MSGID (e.g. "request", "login_attempt") severity: Syslog severity integer (default: INFO=6) timestamp: UTC datetime; defaults to now @@ -80,10 +80,10 @@ def syslog_line( def write_syslog_file(line: str) -> None: - """Emit a syslog line to stdout for Docker log capture.""" + """Emit a syslog line to stdout for container log capture.""" print(line, flush=True) def forward_syslog(line: str, log_target: str) -> None: - """No-op stub. TCP forwarding is now handled by rsyslog, not by service containers.""" + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" pass diff --git a/templates/ftp/Dockerfile b/decnet/templates/ftp/Dockerfile similarity index 87% rename from templates/ftp/Dockerfile rename to decnet/templates/ftp/Dockerfile index d2365e6..378b3c8 100644 --- a/templates/ftp/Dockerfile +++ b/decnet/templates/ftp/Dockerfile @@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN pip3 install --no-cache-dir twisted jinja2 -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 21 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/ftp/entrypoint.sh b/decnet/templates/ftp/entrypoint.sh similarity index 100% rename from 
templates/ftp/entrypoint.sh rename to decnet/templates/ftp/entrypoint.sh diff --git a/templates/ftp/server.py b/decnet/templates/ftp/server.py similarity index 96% rename from templates/ftp/server.py rename to decnet/templates/ftp/server.py index 94820a6..be6136f 100644 --- a/templates/ftp/server.py +++ b/decnet/templates/ftp/server.py @@ -12,7 +12,7 @@ from twisted.internet import defer, reactor from twisted.protocols.ftp import FTP, FTPFactory, FTPAnonymousShell from twisted.python.filepath import FilePath from twisted.python import log as twisted_log -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "ftpserver") SERVICE_NAME = "ftp" @@ -22,7 +22,6 @@ BANNER = os.environ.get("FTP_BANNER", "220 (vsFTPd 3.0.3)") def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/templates/cowrie/decnet_logging.py b/decnet/templates/ftp/syslog_bridge.py similarity index 84% rename from templates/cowrie/decnet_logging.py rename to decnet/templates/ftp/syslog_bridge.py index 5a09505..c0a78d0 100644 --- a/templates/cowrie/decnet_logging.py +++ b/decnet/templates/ftp/syslog_bridge.py @@ -1,15 +1,15 @@ #!/usr/bin/env python3 """ -Shared RFC 5424 syslog helper for DECNET service templates. +Shared RFC 5424 syslog helper used by service containers. Services call syslog_line() to format an RFC 5424 message, then -write_syslog_file() to emit it to stdout — Docker captures it, and the -host-side collector streams it into the log file. +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. 
RFC 5424 structure: 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG -Facility: local0 (16), PEN for SD element ID: decnet@55555 +Facility: local0 (16). SD element ID uses PEN 55555. """ from datetime import datetime, timezone @@ -18,7 +18,7 @@ from typing import Any # ─── Constants ──────────────────────────────────────────────────────────────── _FACILITY_LOCAL0 = 16 -_SD_ID = "decnet@55555" +_SD_ID = "relay@55555" _NILVALUE = "-" SEVERITY_EMERG = 0 @@ -62,7 +62,7 @@ def syslog_line( Args: service: APP-NAME (e.g. "http", "mysql") - hostname: HOSTNAME (decky node name) + hostname: HOSTNAME (node name) event_type: MSGID (e.g. "request", "login_attempt") severity: Syslog severity integer (default: INFO=6) timestamp: UTC datetime; defaults to now @@ -80,10 +80,10 @@ def syslog_line( def write_syslog_file(line: str) -> None: - """Emit a syslog line to stdout for Docker log capture.""" + """Emit a syslog line to stdout for container log capture.""" print(line, flush=True) def forward_syslog(line: str, log_target: str) -> None: - """No-op stub. TCP forwarding is now handled by rsyslog, not by service containers.""" + """No-op stub. 
TCP forwarding is handled by rsyslog, not by service containers.""" pass diff --git a/templates/http/Dockerfile b/decnet/templates/http/Dockerfile similarity index 87% rename from templates/http/Dockerfile rename to decnet/templates/http/Dockerfile index 4014032..a8f2876 100644 --- a/templates/http/Dockerfile +++ b/decnet/templates/http/Dockerfile @@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN pip3 install --no-cache-dir flask jinja2 -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 80 443 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/http/entrypoint.sh b/decnet/templates/http/entrypoint.sh similarity index 100% rename from templates/http/entrypoint.sh rename to decnet/templates/http/entrypoint.sh diff --git a/templates/http/server.py b/decnet/templates/http/server.py similarity index 96% rename from templates/http/server.py rename to decnet/templates/http/server.py index c666eeb..b169804 100644 --- a/templates/http/server.py +++ b/decnet/templates/http/server.py @@ -12,7 +12,7 @@ from pathlib import Path from flask import Flask, request, send_from_directory from werkzeug.serving import make_server, WSGIRequestHandler -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, 
write_syslog_file, forward_syslog logging.getLogger("werkzeug").setLevel(logging.ERROR) @@ -68,7 +68,6 @@ def _fix_server_header(response): def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) @@ -80,7 +79,7 @@ def log_request(): method=request.method, path=request.path, remote_addr=request.remote_addr, - headers=dict(request.headers), + headers=json.dumps(dict(request.headers)), body=request.get_data(as_text=True)[:512], ) diff --git a/decnet/templates/http/syslog_bridge.py b/decnet/templates/http/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/http/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/decnet/templates/https/Dockerfile b/decnet/templates/https/Dockerfile new file mode 100644 index 0000000..7dbd915 --- /dev/null +++ b/decnet/templates/https/Dockerfile @@ -0,0 +1,29 @@ +ARG BASE_IMAGE=debian:bookworm-slim +FROM ${BASE_IMAGE} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip openssl \ + && rm -rf /var/lib/apt/lists/* + +ENV PIP_BREAK_SYSTEM_PACKAGES=1 +RUN pip3 install --no-cache-dir flask jinja2 + +COPY syslog_bridge.py /opt/syslog_bridge.py +COPY server.py /opt/server.py +COPY entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +RUN mkdir -p /opt/tls + +EXPOSE 443 +RUN useradd -r -s /bin/false -d /opt logrelay \ + && chown -R logrelay:logrelay /opt/tls \ + && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ + && rm -rf /var/lib/apt/lists/* \ + && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD kill -0 1 || exit 1 + +USER logrelay 
+ENTRYPOINT ["/entrypoint.sh"] diff --git a/decnet/templates/https/entrypoint.sh b/decnet/templates/https/entrypoint.sh new file mode 100644 index 0000000..4301922 --- /dev/null +++ b/decnet/templates/https/entrypoint.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +TLS_DIR="/opt/tls" +CERT="${TLS_CERT:-$TLS_DIR/cert.pem}" +KEY="${TLS_KEY:-$TLS_DIR/key.pem}" + +# Generate a self-signed certificate if none exists +if [ ! -f "$CERT" ] || [ ! -f "$KEY" ]; then + mkdir -p "$TLS_DIR" + CN="${TLS_CN:-${NODE_NAME:-localhost}}" + openssl req -x509 -newkey rsa:2048 -nodes \ + -keyout "$KEY" -out "$CERT" \ + -days 3650 -subj "/CN=$CN" \ + 2>/dev/null +fi + +exec python3 /opt/server.py diff --git a/decnet/templates/https/server.py b/decnet/templates/https/server.py new file mode 100644 index 0000000..40fd785 --- /dev/null +++ b/decnet/templates/https/server.py @@ -0,0 +1,136 @@ +#!/usr/bin/env python3 +""" +HTTPS service emulator using Flask + TLS. +Identical to the HTTP honeypot but wrapped in TLS. Accepts all requests, +logs every detail (method, path, headers, body, TLS info), and responds +with configurable pages. Forwards events as JSON to LOG_TARGET if set. 
+""" + +import json +import logging +import os +import ssl +from pathlib import Path + +from flask import Flask, request, send_from_directory +from werkzeug.serving import make_server, WSGIRequestHandler +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog + +logging.getLogger("werkzeug").setLevel(logging.ERROR) + +NODE_NAME = os.environ.get("NODE_NAME", "webserver") +SERVICE_NAME = "https" +LOG_TARGET = os.environ.get("LOG_TARGET", "") +PORT = int(os.environ.get("PORT", "443")) +SERVER_HEADER = os.environ.get("SERVER_HEADER", "Apache/2.4.54 (Debian)") +RESPONSE_CODE = int(os.environ.get("RESPONSE_CODE", "403")) +FAKE_APP = os.environ.get("FAKE_APP", "") +EXTRA_HEADERS = json.loads(os.environ.get("EXTRA_HEADERS", "{}")) +CUSTOM_BODY = os.environ.get("CUSTOM_BODY", "") +FILES_DIR = os.environ.get("FILES_DIR", "") +TLS_CERT = os.environ.get("TLS_CERT", "/opt/tls/cert.pem") +TLS_KEY = os.environ.get("TLS_KEY", "/opt/tls/key.pem") + +_FAKE_APP_BODIES: dict[str, str] = { + "apache_default": ( + "\n" + "Apache2 Debian Default Page\n" + "

Apache2 Debian Default Page

\n" + "

It works!

" + ), + "nginx_default": ( + "Welcome to nginx!\n" + "

Welcome to nginx!

\n" + "

If you see this page, the nginx web server is successfully installed.

\n" + "" + ), + "wordpress": ( + "WordPress › Error\n" + "
\n" + "

Error establishing a database connection

" + ), + "phpmyadmin": ( + "phpMyAdmin\n" + "
\n" + "\n" + "\n" + "
" + ), + "iis_default": ( + "IIS Windows Server\n" + "

IIS Windows Server

\n" + "

Welcome to Internet Information Services

" + ), +} + +app = Flask(__name__) + +@app.after_request +def _fix_server_header(response): + response.headers["Server"] = SERVER_HEADER + return response + +def _log(event_type: str, severity: int = 6, **kwargs) -> None: + line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) + write_syslog_file(line) + forward_syslog(line, LOG_TARGET) + + +@app.before_request +def log_request(): + _log( + "request", + method=request.method, + path=request.path, + remote_addr=request.remote_addr, + headers=dict(request.headers), + body=request.get_data(as_text=True)[:512], + ) + + +@app.route("/", defaults={"path": ""}) +@app.route("/", methods=["GET", "POST", "PUT", "DELETE", "PATCH", "OPTIONS", "HEAD"]) +def catch_all(path): + # Serve static files directory if configured + if FILES_DIR and path: + files_path = Path(FILES_DIR) / path + if files_path.is_file(): + return send_from_directory(FILES_DIR, path) + + # Select response body: custom > fake_app preset > default 403 + if CUSTOM_BODY: + body = CUSTOM_BODY + elif FAKE_APP and FAKE_APP in _FAKE_APP_BODIES: + body = _FAKE_APP_BODIES[FAKE_APP] + else: + body = ( + "\n" + "\n" + "403 Forbidden\n" + "\n" + "

Forbidden

\n" + "

You don't have permission to access this resource.

\n" + "
\n" + f"
{SERVER_HEADER} Server at {NODE_NAME} Port 443
\n" + "\n" + ) + + headers = {"Content-Type": "text/html", **EXTRA_HEADERS} + return body, RESPONSE_CODE, headers + + +class _SilentHandler(WSGIRequestHandler): + """Suppress Werkzeug's Server header so Flask's after_request is the sole source.""" + def version_string(self) -> str: + return "" + + +if __name__ == "__main__": + _log("startup", msg=f"HTTPS server starting as {NODE_NAME}") + + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER) + ctx.load_cert_chain(TLS_CERT, TLS_KEY) + + srv = make_server("0.0.0.0", PORT, app, request_handler=_SilentHandler) # nosec B104 + srv.socket = ctx.wrap_socket(srv.socket, server_side=True) + srv.serve_forever() diff --git a/decnet/templates/https/syslog_bridge.py b/decnet/templates/https/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/https/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/imap/Dockerfile b/decnet/templates/imap/Dockerfile similarity index 85% rename from templates/imap/Dockerfile rename to decnet/templates/imap/Dockerfile index a0e8fa2..35d1b67 100644 --- a/templates/imap/Dockerfile +++ b/decnet/templates/imap/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 143 993 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet 
+USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/imap/entrypoint.sh b/decnet/templates/imap/entrypoint.sh similarity index 100% rename from templates/imap/entrypoint.sh rename to decnet/templates/imap/entrypoint.sh diff --git a/templates/imap/server.py b/decnet/templates/imap/server.py similarity index 99% rename from templates/imap/server.py rename to decnet/templates/imap/server.py index 71489af..5b01588 100644 --- a/templates/imap/server.py +++ b/decnet/templates/imap/server.py @@ -12,7 +12,7 @@ Banner advertises Dovecot so nmap fingerprints correctly. import asyncio import os -from decnet_logging import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "mailserver") SERVICE_NAME = "imap" @@ -236,7 +236,6 @@ _MAILBOXES = ["INBOX", "Sent", "Drafts", "Archive"] def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/imap/syslog_bridge.py b/decnet/templates/imap/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/imap/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/k8s/Dockerfile b/decnet/templates/k8s/Dockerfile similarity index 87% rename from templates/k8s/Dockerfile rename to decnet/templates/k8s/Dockerfile index 118ed00..1da6296 100644 --- a/templates/k8s/Dockerfile +++ b/decnet/templates/k8s/Dockerfile @@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN pip3 install --no-cache-dir flask -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 6443 8080 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 
|| exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/k8s/entrypoint.sh b/decnet/templates/k8s/entrypoint.sh similarity index 100% rename from templates/k8s/entrypoint.sh rename to decnet/templates/k8s/entrypoint.sh diff --git a/templates/k8s/server.py b/decnet/templates/k8s/server.py similarity index 97% rename from templates/k8s/server.py rename to decnet/templates/k8s/server.py index bf96fb9..8e5ba51 100644 --- a/templates/k8s/server.py +++ b/decnet/templates/k8s/server.py @@ -10,7 +10,7 @@ import json import os from flask import Flask, request -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "k8s-master") SERVICE_NAME = "k8s" @@ -69,7 +69,6 @@ _SECRETS = { def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/k8s/syslog_bridge.py b/decnet/templates/k8s/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/k8s/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/ldap/Dockerfile b/decnet/templates/ldap/Dockerfile similarity index 85% rename from templates/ldap/Dockerfile rename to decnet/templates/ldap/Dockerfile index 2d8aa48..64e1a50 100644 --- a/templates/ldap/Dockerfile +++ b/decnet/templates/ldap/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 389 636 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet 
+USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/ldap/entrypoint.sh b/decnet/templates/ldap/entrypoint.sh similarity index 100% rename from templates/ldap/entrypoint.sh rename to decnet/templates/ldap/entrypoint.sh diff --git a/templates/ldap/server.py b/decnet/templates/ldap/server.py similarity index 97% rename from templates/ldap/server.py rename to decnet/templates/ldap/server.py index bfef78f..c7d4136 100644 --- a/templates/ldap/server.py +++ b/decnet/templates/ldap/server.py @@ -7,7 +7,7 @@ invalidCredentials error. Logs all interactions as JSON. import asyncio import os -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "ldapserver") SERVICE_NAME = "ldap" @@ -18,7 +18,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "") def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/ldap/syslog_bridge.py b/decnet/templates/ldap/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/ldap/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/llmnr/Dockerfile b/decnet/templates/llmnr/Dockerfile similarity index 86% rename from templates/llmnr/Dockerfile rename to decnet/templates/llmnr/Dockerfile index cddfc7d..724f4db 100644 --- a/templates/llmnr/Dockerfile +++ b/decnet/templates/llmnr/Dockerfile @@ -5,14 +5,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 5355/udp EXPOSE 5353/udp -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -20,5 +20,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 
|| exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/llmnr/entrypoint.sh b/decnet/templates/llmnr/entrypoint.sh similarity index 100% rename from templates/llmnr/entrypoint.sh rename to decnet/templates/llmnr/entrypoint.sh diff --git a/templates/llmnr/server.py b/decnet/templates/llmnr/server.py similarity index 97% rename from templates/llmnr/server.py rename to decnet/templates/llmnr/server.py index 7d0fc95..ac94707 100644 --- a/templates/llmnr/server.py +++ b/decnet/templates/llmnr/server.py @@ -9,7 +9,7 @@ Logs every packet with source IP and decoded query name where possible. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "lan-host") SERVICE_NAME = "llmnr" @@ -20,7 +20,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "") def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/llmnr/syslog_bridge.py b/decnet/templates/llmnr/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/llmnr/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/mongodb/Dockerfile b/decnet/templates/mongodb/Dockerfile similarity index 85% rename from templates/mongodb/Dockerfile rename to decnet/templates/mongodb/Dockerfile index d8f7039..d7bc953 100644 --- a/templates/mongodb/Dockerfile +++ b/decnet/templates/mongodb/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 27017 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 
1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/mongodb/entrypoint.sh b/decnet/templates/mongodb/entrypoint.sh similarity index 100% rename from templates/mongodb/entrypoint.sh rename to decnet/templates/mongodb/entrypoint.sh diff --git a/templates/mongodb/server.py b/decnet/templates/mongodb/server.py similarity index 97% rename from templates/mongodb/server.py rename to decnet/templates/mongodb/server.py index cc16af5..ce14f02 100644 --- a/templates/mongodb/server.py +++ b/decnet/templates/mongodb/server.py @@ -9,7 +9,7 @@ received messages as JSON. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "mongodb") SERVICE_NAME = "mongodb" @@ -62,7 +62,6 @@ def _op_msg(request_id: int, doc: bytes) -> bytes: def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/mongodb/syslog_bridge.py b/decnet/templates/mongodb/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/mongodb/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/mqtt/Dockerfile b/decnet/templates/mqtt/Dockerfile similarity index 85% rename from templates/mqtt/Dockerfile rename to decnet/templates/mqtt/Dockerfile index 1ee311d..562ed42 100644 --- a/templates/mqtt/Dockerfile +++ b/decnet/templates/mqtt/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 1883 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet 
+USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/mqtt/entrypoint.sh b/decnet/templates/mqtt/entrypoint.sh similarity index 100% rename from templates/mqtt/entrypoint.sh rename to decnet/templates/mqtt/entrypoint.sh diff --git a/templates/mqtt/server.py b/decnet/templates/mqtt/server.py similarity index 98% rename from templates/mqtt/server.py rename to decnet/templates/mqtt/server.py index d0b43c1..66438bd 100644 --- a/templates/mqtt/server.py +++ b/decnet/templates/mqtt/server.py @@ -12,7 +12,7 @@ import json import os import random import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "mqtt-broker") SERVICE_NAME = "mqtt" @@ -28,7 +28,6 @@ _CONNACK_NOT_AUTH = b"\x20\x02\x00\x05" def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/mqtt/syslog_bridge.py b/decnet/templates/mqtt/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/mqtt/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/mssql/Dockerfile b/decnet/templates/mssql/Dockerfile similarity index 85% rename from templates/mssql/Dockerfile rename to decnet/templates/mssql/Dockerfile index 07607cb..2f34156 100644 --- a/templates/mssql/Dockerfile +++ b/decnet/templates/mssql/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 1433 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER 
decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/mssql/entrypoint.sh b/decnet/templates/mssql/entrypoint.sh similarity index 100% rename from templates/mssql/entrypoint.sh rename to decnet/templates/mssql/entrypoint.sh diff --git a/templates/mssql/server.py b/decnet/templates/mssql/server.py similarity index 97% rename from templates/mssql/server.py rename to decnet/templates/mssql/server.py index 41040d8..61114d5 100644 --- a/templates/mssql/server.py +++ b/decnet/templates/mssql/server.py @@ -8,7 +8,7 @@ a login failed error. Logs auth attempts as JSON. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "dbserver") SERVICE_NAME = "mssql" @@ -45,7 +45,6 @@ _PRELOGIN_RESP = bytes([ def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/mssql/syslog_bridge.py b/decnet/templates/mssql/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/mssql/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/mysql/Dockerfile b/decnet/templates/mysql/Dockerfile similarity index 85% rename from templates/mysql/Dockerfile rename to decnet/templates/mysql/Dockerfile index cbfb532..926e74b 100644 --- a/templates/mysql/Dockerfile +++ b/decnet/templates/mysql/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 3306 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER 
decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/mysql/entrypoint.sh b/decnet/templates/mysql/entrypoint.sh similarity index 100% rename from templates/mysql/entrypoint.sh rename to decnet/templates/mysql/entrypoint.sh diff --git a/templates/mysql/server.py b/decnet/templates/mysql/server.py similarity index 97% rename from templates/mysql/server.py rename to decnet/templates/mysql/server.py index 812a910..a6b1d94 100644 --- a/templates/mysql/server.py +++ b/decnet/templates/mysql/server.py @@ -9,7 +9,7 @@ attempts as JSON. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "dbserver") SERVICE_NAME = "mysql" @@ -44,7 +44,6 @@ def _make_packet(payload: bytes, seq: int = 0) -> bytes: def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/mysql/syslog_bridge.py b/decnet/templates/mysql/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/mysql/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/pop3/Dockerfile b/decnet/templates/pop3/Dockerfile similarity index 85% rename from templates/pop3/Dockerfile rename to decnet/templates/pop3/Dockerfile index ccbfe65..08ac966 100644 --- a/templates/pop3/Dockerfile +++ b/decnet/templates/pop3/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 110 995 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet 
+USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/pop3/entrypoint.sh b/decnet/templates/pop3/entrypoint.sh similarity index 100% rename from templates/pop3/entrypoint.sh rename to decnet/templates/pop3/entrypoint.sh diff --git a/templates/pop3/server.py b/decnet/templates/pop3/server.py similarity index 99% rename from templates/pop3/server.py rename to decnet/templates/pop3/server.py index 33bca78..8599bc8 100644 --- a/templates/pop3/server.py +++ b/decnet/templates/pop3/server.py @@ -11,7 +11,7 @@ Credentials via IMAP_USERS env var (shared with IMAP service). import asyncio import os -from decnet_logging import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "mailserver") SERVICE_NAME = "pop3" @@ -161,7 +161,6 @@ _BAIT_EMAILS: list[str] = [ def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/pop3/syslog_bridge.py b/decnet/templates/pop3/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/pop3/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/postgres/Dockerfile b/decnet/templates/postgres/Dockerfile similarity index 85% rename from templates/postgres/Dockerfile rename to decnet/templates/postgres/Dockerfile index 0a6a6bf..6eab4e1 100644 --- a/templates/postgres/Dockerfile +++ b/decnet/templates/postgres/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 5432 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || 
exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/postgres/entrypoint.sh b/decnet/templates/postgres/entrypoint.sh similarity index 100% rename from templates/postgres/entrypoint.sh rename to decnet/templates/postgres/entrypoint.sh diff --git a/templates/postgres/server.py b/decnet/templates/postgres/server.py similarity index 97% rename from templates/postgres/server.py rename to decnet/templates/postgres/server.py index 45126d7..267154f 100644 --- a/templates/postgres/server.py +++ b/decnet/templates/postgres/server.py @@ -9,7 +9,7 @@ returns an error. Logs all interactions as JSON. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "pgserver") SERVICE_NAME = "postgres" @@ -24,7 +24,6 @@ def _error_response(message: str) -> bytes: def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/postgres/syslog_bridge.py b/decnet/templates/postgres/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/postgres/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/rdp/Dockerfile b/decnet/templates/rdp/Dockerfile similarity index 87% rename from templates/rdp/Dockerfile rename to decnet/templates/rdp/Dockerfile index cf68714..06ed165 100644 --- a/templates/rdp/Dockerfile +++ b/decnet/templates/rdp/Dockerfile @@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN pip3 install --no-cache-dir twisted jinja2 -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 3389 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill 
-0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/rdp/entrypoint.sh b/decnet/templates/rdp/entrypoint.sh similarity index 100% rename from templates/rdp/entrypoint.sh rename to decnet/templates/rdp/entrypoint.sh diff --git a/templates/rdp/server.py b/decnet/templates/rdp/server.py similarity index 94% rename from templates/rdp/server.py rename to decnet/templates/rdp/server.py index 12a0a48..2f61d7b 100644 --- a/templates/rdp/server.py +++ b/decnet/templates/rdp/server.py @@ -10,7 +10,7 @@ import os from twisted.internet import protocol, reactor from twisted.python import log as twisted_log -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "WORKSTATION") SERVICE_NAME = "rdp" @@ -21,7 +21,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "") def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/rdp/syslog_bridge.py b/decnet/templates/rdp/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/rdp/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/redis/Dockerfile b/decnet/templates/redis/Dockerfile similarity index 85% rename from templates/redis/Dockerfile rename to decnet/templates/redis/Dockerfile index bc627ac..b3f85de 100644 --- a/templates/redis/Dockerfile +++ b/decnet/templates/redis/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 6379 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER 
decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/redis/entrypoint.sh b/decnet/templates/redis/entrypoint.sh similarity index 100% rename from templates/redis/entrypoint.sh rename to decnet/templates/redis/entrypoint.sh diff --git a/templates/redis/server.py b/decnet/templates/redis/server.py similarity index 98% rename from templates/redis/server.py rename to decnet/templates/redis/server.py index 4aa5961..4d3242f 100644 --- a/templates/redis/server.py +++ b/decnet/templates/redis/server.py @@ -7,7 +7,7 @@ KEYS, and arbitrary commands. Logs every command and argument as JSON. import asyncio import os -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "cache-server") SERVICE_NAME = "redis" @@ -46,7 +46,6 @@ _FAKE_STORE = { def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/redis/syslog_bridge.py b/decnet/templates/redis/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/redis/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/sip/Dockerfile b/decnet/templates/sip/Dockerfile similarity index 86% rename from templates/sip/Dockerfile rename to decnet/templates/sip/Dockerfile index ab37230..e42a5e2 100644 --- a/templates/sip/Dockerfile +++ b/decnet/templates/sip/Dockerfile @@ -5,14 +5,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 5060/udp EXPOSE 5060/tcp -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -20,5 +20,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 
-USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/sip/entrypoint.sh b/decnet/templates/sip/entrypoint.sh similarity index 100% rename from templates/sip/entrypoint.sh rename to decnet/templates/sip/entrypoint.sh diff --git a/templates/sip/server.py b/decnet/templates/sip/server.py similarity index 97% rename from templates/sip/server.py rename to decnet/templates/sip/server.py index a84c0c7..dd40166 100644 --- a/templates/sip/server.py +++ b/decnet/templates/sip/server.py @@ -8,7 +8,7 @@ Authorization header and call metadata, then responds with 401 Unauthorized. import asyncio import os import re -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "pbx") SERVICE_NAME = "sip" @@ -30,7 +30,6 @@ _401 = ( def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/sip/syslog_bridge.py b/decnet/templates/sip/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/sip/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/smb/Dockerfile b/decnet/templates/smb/Dockerfile similarity index 87% rename from templates/smb/Dockerfile rename to decnet/templates/smb/Dockerfile index cea8028..64120be 100644 --- a/templates/smb/Dockerfile +++ b/decnet/templates/smb/Dockerfile @@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ ENV PIP_BREAK_SYSTEM_PACKAGES=1 RUN pip3 install --no-cache-dir impacket jinja2 -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 445 139 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD 
kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/smb/entrypoint.sh b/decnet/templates/smb/entrypoint.sh similarity index 100% rename from templates/smb/entrypoint.sh rename to decnet/templates/smb/entrypoint.sh diff --git a/templates/smb/server.py b/decnet/templates/smb/server.py similarity index 90% rename from templates/smb/server.py rename to decnet/templates/smb/server.py index aa5d1a9..24356a8 100644 --- a/templates/smb/server.py +++ b/decnet/templates/smb/server.py @@ -7,7 +7,7 @@ Logs all connection attempts, optionally forwarding them as JSON to LOG_TARGET. import os from impacket import smbserver -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "WORKSTATION") SERVICE_NAME = "smb" @@ -18,7 +18,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "") def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/smb/syslog_bridge.py b/decnet/templates/smb/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/smb/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/smtp/Dockerfile b/decnet/templates/smtp/Dockerfile similarity index 85% rename from templates/smtp/Dockerfile rename to decnet/templates/smtp/Dockerfile index 2013f50..c7bf5c8 100644 --- a/templates/smtp/Dockerfile +++ b/decnet/templates/smtp/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 25 587 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet 
+USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/smtp/entrypoint.sh b/decnet/templates/smtp/entrypoint.sh similarity index 100% rename from templates/smtp/entrypoint.sh rename to decnet/templates/smtp/entrypoint.sh diff --git a/templates/smtp/server.py b/decnet/templates/smtp/server.py similarity index 92% rename from templates/smtp/server.py rename to decnet/templates/smtp/server.py index b5b2232..9cd52a2 100644 --- a/templates/smtp/server.py +++ b/decnet/templates/smtp/server.py @@ -23,7 +23,7 @@ import base64 import os import random import string -from decnet_logging import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "mailserver") SERVICE_NAME = "smtp" @@ -37,7 +37,6 @@ _SMTP_MTA = os.environ.get("SMTP_MTA", NODE_NAME) def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) @@ -87,9 +86,10 @@ class SMTPProtocol(asyncio.Protocol): def data_received(self, data): self._buf += data - while b"\r\n" in self._buf: - line, self._buf = self._buf.split(b"\r\n", 1) - self._handle_line(line.decode(errors="replace")) + while b"\n" in self._buf: + line, self._buf = self._buf.split(b"\n", 1) + # Strip trailing \r so both CRLF and bare LF work + self._handle_line(line.rstrip(b"\r").decode(errors="replace")) def connection_lost(self, exc): _log("disconnect", src=self._peer[0] if self._peer else "?") @@ -118,7 +118,12 @@ class SMTPProtocol(asyncio.Protocol): self._data_buf.append(line[1:] if line.startswith(".") else line) return - # ── AUTH multi-step (LOGIN mechanism) ───────────────────────────────── + # ── AUTH multi-step (LOGIN / PLAIN continuation) ───────────────────── + if self._auth_state == "await_plain": + user, password = 
_decode_auth_plain(line) + self._finish_auth(user, password) + self._auth_state = "" + return if self._auth_state == "await_user": self._auth_user = base64.b64decode(line + "==").decode(errors="replace") self._auth_state = "await_pass" @@ -137,6 +142,11 @@ class SMTPProtocol(asyncio.Protocol): args = parts[1] if len(parts) > 1 else "" if cmd in ("EHLO", "HELO"): + if not args: + self._transport.write( + f"501 5.5.4 Syntax: {cmd} hostname\r\n".encode() + ) + return _log("ehlo", src=self._peer[0], domain=args) self._transport.write( f"250-{_SMTP_MTA}\r\n" diff --git a/decnet/templates/smtp/syslog_bridge.py b/decnet/templates/smtp/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/smtp/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/decnet/templates/sniffer/Dockerfile b/decnet/templates/sniffer/Dockerfile new file mode 100644 index 0000000..ff9a6fc --- /dev/null +++ b/decnet/templates/sniffer/Dockerfile @@ -0,0 +1,12 @@ +ARG BASE_IMAGE=debian:bookworm-slim +FROM ${BASE_IMAGE} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + python3 python3-pip libpcap-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN pip3 install --no-cache-dir --break-system-packages "scapy==2.6.1" + +COPY syslog_bridge.py server.py /opt/ + +ENTRYPOINT ["python3", "/opt/server.py"] diff --git a/decnet/templates/sniffer/server.py b/decnet/templates/sniffer/server.py new file mode 100644 index 0000000..9bd7714 --- /dev/null +++ b/decnet/templates/sniffer/server.py @@ -0,0 +1,1050 @@ +#!/usr/bin/env python3 +""" +syslog-relay passive TLS sniffer. + +Captures TLS handshakes on the MACVLAN interface (shared network namespace +with the decky base container). Extracts fingerprints and connection +metadata, then emits structured RFC 5424 log lines to stdout for the +host-side collector to ingest. 
+ +Requires: NET_RAW + NET_ADMIN capabilities (set in compose fragment). + +Supported fingerprints: + JA3 — MD5(SSLVersion,Ciphers,Extensions,EllipticCurves,ECPointFormats) + JA3S — MD5(SSLVersion,Cipher,Extensions) + JA4 — {proto}{ver}{sni}{#cs}{#ext}{alpn}_{sha256_12(sorted_cs)}_{sha256_12(sorted_ext,sigalgs)} + JA4S — {proto}{ver}{#ext}{alpn}_{sha256_12(cipher,sorted_ext)} + JA4L — TCP RTT latency measurement (client_ttl, server_rtt_ms) + TLS session resumption detection (session tickets, PSK, 0-RTT) + Certificate extraction (TLS ≤1.2 only — 1.3 encrypts certs) + +GREASE values (RFC 8701) are excluded from all lists before hashing. +""" + +from __future__ import annotations + +import hashlib +import os +import struct +import time +from typing import Any + +from scapy.layers.inet import IP, TCP +from scapy.sendrecv import sniff + +from syslog_bridge import SEVERITY_INFO, SEVERITY_WARNING, syslog_line, write_syslog_file + +# ─── Configuration ──────────────────────────────────────────────────────────── + +NODE_NAME: str = os.environ.get("NODE_NAME", "decky-sniffer") +SERVICE_NAME: str = "sniffer" + +# Session TTL in seconds — drop half-open sessions after this +_SESSION_TTL: float = 60.0 + +# Dedup TTL — suppress identical fingerprint events from the same source IP +# within this window (seconds). Set to 0 to disable dedup. 
+_DEDUP_TTL: float = float(os.environ.get("DEDUP_TTL", "300")) + +# GREASE values per RFC 8701 — 0x0A0A, 0x1A1A, 0x2A2A, ..., 0xFAFA +_GREASE: frozenset[int] = frozenset(0x0A0A + i * 0x1010 for i in range(16)) + +# TLS record / handshake type constants +_TLS_RECORD_HANDSHAKE: int = 0x16 +_TLS_HT_CLIENT_HELLO: int = 0x01 +_TLS_HT_SERVER_HELLO: int = 0x02 +_TLS_HT_CERTIFICATE: int = 0x0B + +# TLS extension types we extract for metadata +_EXT_SNI: int = 0x0000 +_EXT_SUPPORTED_GROUPS: int = 0x000A +_EXT_EC_POINT_FORMATS: int = 0x000B +_EXT_SIGNATURE_ALGORITHMS: int = 0x000D +_EXT_ALPN: int = 0x0010 +_EXT_SESSION_TICKET: int = 0x0023 +_EXT_SUPPORTED_VERSIONS: int = 0x002B +_EXT_PRE_SHARED_KEY: int = 0x0029 +_EXT_EARLY_DATA: int = 0x002A + +# TCP flags +_TCP_SYN: int = 0x02 +_TCP_ACK: int = 0x10 + +# ─── Session tracking ───────────────────────────────────────────────────────── + +# Key: (src_ip, src_port, dst_ip, dst_port) — forward 4-tuple from ClientHello +# Value: parsed ClientHello metadata dict +_sessions: dict[tuple[str, int, str, int], dict[str, Any]] = {} +_session_ts: dict[tuple[str, int, str, int], float] = {} + +# TCP RTT tracking for JA4L: key = (client_ip, client_port, server_ip, server_port) +# Value: {"syn_time": float, "ttl": int} +_tcp_syn: dict[tuple[str, int, str, int], dict[str, Any]] = {} +# Completed RTT measurements: key = same 4-tuple, value = {"rtt_ms": float, "client_ttl": int} +_tcp_rtt: dict[tuple[str, int, str, int], dict[str, Any]] = {} + + +# ─── GREASE helpers ─────────────────────────────────────────────────────────── + +def _is_grease(value: int) -> bool: + return value in _GREASE + + +def _filter_grease(values: list[int]) -> list[int]: + return [v for v in values if not _is_grease(v)] + + +# ─── Pure-Python TLS record parser ──────────────────────────────────────────── + +def _parse_client_hello(data: bytes) -> dict[str, Any] | None: + """ + Parse a TLS ClientHello from raw bytes (starting at TLS record header). 
+ Returns a dict of parsed fields, or None if not a valid ClientHello. + """ + try: + if len(data) < 6: + return None + # TLS record header: content_type(1) version(2) length(2) + if data[0] != _TLS_RECORD_HANDSHAKE: + return None + record_len = struct.unpack_from("!H", data, 3)[0] + if len(data) < 5 + record_len: + return None + + # Handshake header: type(1) length(3) + hs = data[5:] + if hs[0] != _TLS_HT_CLIENT_HELLO: + return None + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + if len(body) < 34: + return None + + pos = 0 + # ClientHello version (2 bytes) — used for JA3 + tls_version = struct.unpack_from("!H", body, pos)[0] + pos += 2 + + # Random (32 bytes) + pos += 32 + + # Session ID + session_id_len = body[pos] + session_id = body[pos + 1: pos + 1 + session_id_len] + pos += 1 + session_id_len + + # Cipher Suites + cs_len = struct.unpack_from("!H", body, pos)[0] + pos += 2 + cipher_suites = [ + struct.unpack_from("!H", body, pos + i * 2)[0] + for i in range(cs_len // 2) + ] + pos += cs_len + + # Compression Methods + comp_len = body[pos] + pos += 1 + comp_len + + # Extensions + extensions: list[int] = [] + supported_groups: list[int] = [] + ec_point_formats: list[int] = [] + signature_algorithms: list[int] = [] + supported_versions: list[int] = [] + sni: str = "" + alpn: list[str] = [] + has_session_ticket_data: bool = False + has_pre_shared_key: bool = False + has_early_data: bool = False + + if pos + 2 <= len(body): + ext_total = struct.unpack_from("!H", body, pos)[0] + pos += 2 + ext_end = pos + ext_total + + while pos + 4 <= ext_end: + ext_type = struct.unpack_from("!H", body, pos)[0] + ext_len = struct.unpack_from("!H", body, pos + 2)[0] + ext_data = body[pos + 4: pos + 4 + ext_len] + pos += 4 + ext_len + + if not _is_grease(ext_type): + extensions.append(ext_type) + + if ext_type == _EXT_SNI and len(ext_data) > 5: + # server_name_list_length(2) type(1) name_length(2) name + sni = ext_data[5:].decode("ascii", 
errors="replace") + + elif ext_type == _EXT_SUPPORTED_GROUPS and len(ext_data) >= 2: + grp_len = struct.unpack_from("!H", ext_data, 0)[0] + supported_groups = [ + struct.unpack_from("!H", ext_data, 2 + i * 2)[0] + for i in range(grp_len // 2) + ] + + elif ext_type == _EXT_EC_POINT_FORMATS and len(ext_data) >= 1: + pf_len = ext_data[0] + ec_point_formats = list(ext_data[1: 1 + pf_len]) + + elif ext_type == _EXT_ALPN and len(ext_data) >= 2: + proto_list_len = struct.unpack_from("!H", ext_data, 0)[0] + ap = 2 + while ap < 2 + proto_list_len: + plen = ext_data[ap] + alpn.append(ext_data[ap + 1: ap + 1 + plen].decode("ascii", errors="replace")) + ap += 1 + plen + + elif ext_type == _EXT_SIGNATURE_ALGORITHMS and len(ext_data) >= 2: + sa_len = struct.unpack_from("!H", ext_data, 0)[0] + signature_algorithms = [ + struct.unpack_from("!H", ext_data, 2 + i * 2)[0] + for i in range(sa_len // 2) + ] + + elif ext_type == _EXT_SUPPORTED_VERSIONS and len(ext_data) >= 1: + sv_len = ext_data[0] + supported_versions = [ + struct.unpack_from("!H", ext_data, 1 + i * 2)[0] + for i in range(sv_len // 2) + ] + + elif ext_type == _EXT_SESSION_TICKET: + has_session_ticket_data = len(ext_data) > 0 + + elif ext_type == _EXT_PRE_SHARED_KEY: + has_pre_shared_key = True + + elif ext_type == _EXT_EARLY_DATA: + has_early_data = True + + filtered_ciphers = _filter_grease(cipher_suites) + filtered_groups = _filter_grease(supported_groups) + filtered_sig_algs = _filter_grease(signature_algorithms) + filtered_versions = _filter_grease(supported_versions) + + return { + "tls_version": tls_version, + "cipher_suites": filtered_ciphers, + "extensions": extensions, + "supported_groups": filtered_groups, + "ec_point_formats": ec_point_formats, + "signature_algorithms": filtered_sig_algs, + "supported_versions": filtered_versions, + "sni": sni, + "alpn": alpn, + "session_id": session_id, + "has_session_ticket_data": has_session_ticket_data, + "has_pre_shared_key": has_pre_shared_key, + "has_early_data": 
has_early_data, + } + + except Exception: + return None + + +def _parse_server_hello(data: bytes) -> dict[str, Any] | None: + """ + Parse a TLS ServerHello from raw bytes. + Returns dict with tls_version, cipher_suite, extensions, or None. + """ + try: + if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE: + return None + + hs = data[5:] + if hs[0] != _TLS_HT_SERVER_HELLO: + return None + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + if len(body) < 35: + return None + + pos = 0 + tls_version = struct.unpack_from("!H", body, pos)[0] + pos += 2 + + # Random (32 bytes) + pos += 32 + + # Session ID + session_id_len = body[pos] + pos += 1 + session_id_len + + if pos + 2 > len(body): + return None + + cipher_suite = struct.unpack_from("!H", body, pos)[0] + pos += 2 + + # Compression method (1 byte) + pos += 1 + + extensions: list[int] = [] + selected_version: int | None = None + alpn: str = "" + + if pos + 2 <= len(body): + ext_total = struct.unpack_from("!H", body, pos)[0] + pos += 2 + ext_end = pos + ext_total + while pos + 4 <= ext_end: + ext_type = struct.unpack_from("!H", body, pos)[0] + ext_len = struct.unpack_from("!H", body, pos + 2)[0] + ext_data = body[pos + 4: pos + 4 + ext_len] + pos += 4 + ext_len + if not _is_grease(ext_type): + extensions.append(ext_type) + + if ext_type == _EXT_SUPPORTED_VERSIONS and len(ext_data) >= 2: + selected_version = struct.unpack_from("!H", ext_data, 0)[0] + + elif ext_type == _EXT_ALPN and len(ext_data) >= 2: + proto_list_len = struct.unpack_from("!H", ext_data, 0)[0] + if proto_list_len > 0 and len(ext_data) >= 4: + plen = ext_data[2] + alpn = ext_data[3: 3 + plen].decode("ascii", errors="replace") + + return { + "tls_version": tls_version, + "cipher_suite": cipher_suite, + "extensions": extensions, + "selected_version": selected_version, + "alpn": alpn, + } + + except Exception: + return None + + +def _parse_certificate(data: bytes) -> dict[str, Any] | None: + """ + Parse a TLS 
Certificate handshake message from raw bytes. + + Only works for TLS 1.2 and below — TLS 1.3 encrypts the Certificate + message. Extracts basic details from the first (leaf) certificate + using minimal DER/ASN.1 parsing. + """ + try: + if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE: + return None + + hs = data[5:] + if hs[0] != _TLS_HT_CERTIFICATE: + return None + + hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0] + body = hs[4: 4 + hs_len] + if len(body) < 3: + return None + + # Certificate list total length (3 bytes) + certs_len = struct.unpack_from("!I", b"\x00" + body[0:3])[0] + if certs_len == 0: + return None + + pos = 3 + # First certificate length (3 bytes) + if pos + 3 > len(body): + return None + cert_len = struct.unpack_from("!I", b"\x00" + body[pos:pos + 3])[0] + pos += 3 + if pos + cert_len > len(body): + return None + + cert_der = body[pos: pos + cert_len] + return _parse_x509_der(cert_der) + + except Exception: + return None + + +# ─── Minimal DER/ASN.1 X.509 parser ───────────────────────────────────────── + +def _der_read_tag_len(data: bytes, pos: int) -> tuple[int, int, int]: + """Read a DER tag and length. 
Returns (tag, content_start, content_length).""" + tag = data[pos] + pos += 1 + length_byte = data[pos] + pos += 1 + if length_byte & 0x80: + num_bytes = length_byte & 0x7F + length = int.from_bytes(data[pos: pos + num_bytes], "big") + pos += num_bytes + else: + length = length_byte + return tag, pos, length + + +def _der_read_sequence(data: bytes, pos: int) -> tuple[int, int]: + """Read a SEQUENCE tag, return (content_start, content_length).""" + tag, content_start, length = _der_read_tag_len(data, pos) + return content_start, length + + +def _der_read_oid(data: bytes, pos: int, length: int) -> str: + """Decode a DER OID to dotted string.""" + if length < 1: + return "" + first = data[pos] + oid_parts = [str(first // 40), str(first % 40)] + val = 0 + for i in range(1, length): + b = data[pos + i] + val = (val << 7) | (b & 0x7F) + if not (b & 0x80): + oid_parts.append(str(val)) + val = 0 + return ".".join(oid_parts) + + +def _der_extract_cn(data: bytes, start: int, length: int) -> str: + """Walk an X.501 Name (SEQUENCE of SETs of SEQUENCE of OID+value) to find CN.""" + pos = start + end = start + length + while pos < end: + # Each RDN is a SET + set_tag, set_start, set_len = _der_read_tag_len(data, pos) + if set_tag != 0x31: # SET + break + set_end = set_start + set_len + + # Inside the SET, each attribute is a SEQUENCE + attr_pos = set_start + while attr_pos < set_end: + seq_tag, seq_start, seq_len = _der_read_tag_len(data, attr_pos) + if seq_tag != 0x30: # SEQUENCE + break + # OID + oid_tag, oid_start, oid_len = _der_read_tag_len(data, seq_start) + if oid_tag == 0x06: + oid = _der_read_oid(data, oid_start, oid_len) + # CN OID = 2.5.4.3 + if oid == "2.5.4.3": + val_tag, val_start, val_len = _der_read_tag_len(data, oid_start + oid_len) + return data[val_start: val_start + val_len].decode("utf-8", errors="replace") + attr_pos = seq_start + seq_len + + pos = set_end + return "" + + +def _der_extract_name_str(data: bytes, start: int, length: int) -> str: + """Extract 
a human-readable summary of an X.501 Name (all RDN values joined).""" + parts: list[str] = [] + pos = start + end = start + length + oid_names = { + "2.5.4.3": "CN", + "2.5.4.6": "C", + "2.5.4.7": "L", + "2.5.4.8": "ST", + "2.5.4.10": "O", + "2.5.4.11": "OU", + } + while pos < end: + set_tag, set_start, set_len = _der_read_tag_len(data, pos) + if set_tag != 0x31: + break + set_end = set_start + set_len + attr_pos = set_start + while attr_pos < set_end: + seq_tag, seq_start, seq_len = _der_read_tag_len(data, attr_pos) + if seq_tag != 0x30: + break + oid_tag, oid_start, oid_len = _der_read_tag_len(data, seq_start) + if oid_tag == 0x06: + oid = _der_read_oid(data, oid_start, oid_len) + val_tag, val_start, val_len = _der_read_tag_len(data, oid_start + oid_len) + val = data[val_start: val_start + val_len].decode("utf-8", errors="replace") + name = oid_names.get(oid, oid) + parts.append(f"{name}={val}") + attr_pos = seq_start + seq_len + pos = set_end + return ", ".join(parts) + + +def _parse_x509_der(cert_der: bytes) -> dict[str, Any] | None: + """ + Minimal X.509 DER parser. Extracts subject CN, issuer string, + validity period, and self-signed flag. 
+ + Structure: SEQUENCE { tbsCertificate, signatureAlgorithm, signatureValue } + tbsCertificate: SEQUENCE { + version [0] EXPLICIT, serialNumber, signature, + issuer, validity { notBefore, notAfter }, + subject, subjectPublicKeyInfo, ...extensions + } + """ + try: + # Outer SEQUENCE + outer_start, outer_len = _der_read_sequence(cert_der, 0) + # tbsCertificate SEQUENCE + tbs_tag, tbs_start, tbs_len = _der_read_tag_len(cert_der, outer_start) + tbs_end = tbs_start + tbs_len + pos = tbs_start + + # version [0] EXPLICIT — optional, skip if present + if cert_der[pos] == 0xA0: + _, v_start, v_len = _der_read_tag_len(cert_der, pos) + pos = v_start + v_len + + # serialNumber (INTEGER) + _, sn_start, sn_len = _der_read_tag_len(cert_der, pos) + pos = sn_start + sn_len + + # signature algorithm (SEQUENCE) + _, sa_start, sa_len = _der_read_tag_len(cert_der, pos) + pos = sa_start + sa_len + + # issuer (SEQUENCE) + issuer_tag, issuer_start, issuer_len = _der_read_tag_len(cert_der, pos) + issuer_str = _der_extract_name_str(cert_der, issuer_start, issuer_len) + issuer_cn = _der_extract_cn(cert_der, issuer_start, issuer_len) + pos = issuer_start + issuer_len + + # validity (SEQUENCE of two times) + val_tag, val_start, val_len = _der_read_tag_len(cert_der, pos) + # notBefore + nb_tag, nb_start, nb_len = _der_read_tag_len(cert_der, val_start) + not_before = cert_der[nb_start: nb_start + nb_len].decode("ascii", errors="replace") + # notAfter + na_tag, na_start, na_len = _der_read_tag_len(cert_der, nb_start + nb_len) + not_after = cert_der[na_start: na_start + na_len].decode("ascii", errors="replace") + pos = val_start + val_len + + # subject (SEQUENCE) + subj_tag, subj_start, subj_len = _der_read_tag_len(cert_der, pos) + subject_cn = _der_extract_cn(cert_der, subj_start, subj_len) + subject_str = _der_extract_name_str(cert_der, subj_start, subj_len) + + # Self-signed: issuer CN matches subject CN (basic check) + self_signed = (issuer_cn == subject_cn) and subject_cn != "" + + # SANs 
are in extensions — attempt to find them + pos = subj_start + subj_len + sans: list[str] = _extract_sans(cert_der, pos, tbs_end) + + return { + "subject_cn": subject_cn, + "subject": subject_str, + "issuer": issuer_str, + "issuer_cn": issuer_cn, + "not_before": not_before, + "not_after": not_after, + "self_signed": self_signed, + "sans": sans, + } + + except Exception: + return None + + +def _extract_sans(cert_der: bytes, pos: int, end: int) -> list[str]: + """ + Attempt to extract Subject Alternative Names from X.509v3 extensions. + SAN OID = 2.5.29.17 + """ + sans: list[str] = [] + try: + # Skip subjectPublicKeyInfo SEQUENCE + if pos >= end: + return sans + spki_tag, spki_start, spki_len = _der_read_tag_len(cert_der, pos) + pos = spki_start + spki_len + + # Extensions are wrapped in [3] EXPLICIT + while pos < end: + tag = cert_der[pos] + if tag == 0xA3: # [3] EXPLICIT — extensions wrapper + _, ext_wrap_start, ext_wrap_len = _der_read_tag_len(cert_der, pos) + # Inner SEQUENCE of extensions + _, exts_start, exts_len = _der_read_tag_len(cert_der, ext_wrap_start) + epos = exts_start + eend = exts_start + exts_len + while epos < eend: + # Each extension is a SEQUENCE { OID, [critical], value } + ext_tag, ext_start, ext_len = _der_read_tag_len(cert_der, epos) + ext_end = ext_start + ext_len + + oid_tag, oid_start, oid_len = _der_read_tag_len(cert_der, ext_start) + if oid_tag == 0x06: + oid = _der_read_oid(cert_der, oid_start, oid_len) + if oid == "2.5.29.17": # SAN + # Find the OCTET STRING containing the SAN value + vpos = oid_start + oid_len + # Skip optional BOOLEAN (critical) + if vpos < ext_end and cert_der[vpos] == 0x01: + _, bs, bl = _der_read_tag_len(cert_der, vpos) + vpos = bs + bl + # OCTET STRING wrapping the SAN SEQUENCE + if vpos < ext_end: + os_tag, os_start, os_len = _der_read_tag_len(cert_der, vpos) + if os_tag == 0x04: + sans = _parse_san_sequence(cert_der, os_start, os_len) + epos = ext_end + break + else: + _, skip_start, skip_len = 
_der_read_tag_len(cert_der, pos) + pos = skip_start + skip_len + except Exception: + pass + return sans + + +def _parse_san_sequence(data: bytes, start: int, length: int) -> list[str]: + """Parse a GeneralNames SEQUENCE to extract DNS names and IPs.""" + names: list[str] = [] + try: + # The SAN value is itself a SEQUENCE of GeneralName + seq_tag, seq_start, seq_len = _der_read_tag_len(data, start) + pos = seq_start + end = seq_start + seq_len + while pos < end: + tag = data[pos] + _, val_start, val_len = _der_read_tag_len(data, pos) + context_tag = tag & 0x1F + if context_tag == 2: # dNSName + names.append(data[val_start: val_start + val_len].decode("ascii", errors="replace")) + elif context_tag == 7 and val_len == 4: # iPAddress (IPv4) + names.append(".".join(str(b) for b in data[val_start: val_start + val_len])) + pos = val_start + val_len + except Exception: + pass + return names + + +# ─── JA3 / JA3S computation ─────────────────────────────────────────────────── + +def _tls_version_str(version: int) -> str: + return { + 0x0301: "TLS 1.0", + 0x0302: "TLS 1.1", + 0x0303: "TLS 1.2", + 0x0304: "TLS 1.3", + 0x0200: "SSL 2.0", + 0x0300: "SSL 3.0", + }.get(version, f"0x{version:04x}") + + +def _ja3(ch: dict[str, Any]) -> tuple[str, str]: + """Return (ja3_string, ja3_hash) for a parsed ClientHello.""" + parts = [ + str(ch["tls_version"]), + "-".join(str(c) for c in ch["cipher_suites"]), + "-".join(str(e) for e in ch["extensions"]), + "-".join(str(g) for g in ch["supported_groups"]), + "-".join(str(p) for p in ch["ec_point_formats"]), + ] + ja3_str = ",".join(parts) + return ja3_str, hashlib.md5(ja3_str.encode()).hexdigest() + + +def _ja3s(sh: dict[str, Any]) -> tuple[str, str]: + """Return (ja3s_string, ja3s_hash) for a parsed ServerHello.""" + parts = [ + str(sh["tls_version"]), + str(sh["cipher_suite"]), + "-".join(str(e) for e in sh["extensions"]), + ] + ja3s_str = ",".join(parts) + return ja3s_str, hashlib.md5(ja3s_str.encode()).hexdigest() + + +# ─── JA4 / JA4S 
computation ────────────────────────────────────────────────── + +def _ja4_version(ch: dict[str, Any]) -> str: + """ + Determine JA4 TLS version string (2 chars). + Uses supported_versions extension if present (TLS 1.3 advertises 0x0303 in + ClientHello.version but 0x0304 in supported_versions). + """ + versions = ch.get("supported_versions", []) + if versions: + best = max(versions) + else: + best = ch["tls_version"] + return { + 0x0304: "13", + 0x0303: "12", + 0x0302: "11", + 0x0301: "10", + 0x0300: "s3", + 0x0200: "s2", + }.get(best, "00") + + +def _ja4_alpn_tag(alpn_list: list[str] | str) -> str: + """ + JA4 ALPN tag: first and last character of the first ALPN protocol. + No ALPN → "00". + """ + if isinstance(alpn_list, str): + proto = alpn_list + elif alpn_list: + proto = alpn_list[0] + else: + return "00" + + if not proto: + return "00" + if len(proto) == 1: + return proto[0] + proto[0] + return proto[0] + proto[-1] + + +def _sha256_12(text: str) -> str: + """First 12 hex chars of SHA-256.""" + return hashlib.sha256(text.encode()).hexdigest()[:12] + + +def _ja4(ch: dict[str, Any]) -> str: + """ + Compute JA4 fingerprint from a parsed ClientHello. + + Format: a_b_c where + a = {t|q}{version:2}{d|i}{cipher_count:02d}{ext_count:02d}{alpn_tag:2} + b = sha256_12(sorted_cipher_suites, comma-separated) + c = sha256_12(sorted_extensions,sorted_signature_algorithms) + + Protocol is always 't' (TCP) since we capture on a TCP socket. + SNI present → 'd' (domain), absent → 'i' (IP). + """ + proto = "t" + ver = _ja4_version(ch) + sni_flag = "d" if ch.get("sni") else "i" + + # Counts — GREASE already filtered, but also exclude SNI (0x0000) and ALPN (0x0010) + # from extension count per JA4 spec? No — JA4 counts all non-GREASE extensions. 
+ cs_count = min(len(ch["cipher_suites"]), 99) + ext_count = min(len(ch["extensions"]), 99) + alpn_tag = _ja4_alpn_tag(ch.get("alpn", [])) + + section_a = f"{proto}{ver}{sni_flag}{cs_count:02d}{ext_count:02d}{alpn_tag}" + + # Section b: sorted cipher suites as decimal, comma-separated + sorted_cs = sorted(ch["cipher_suites"]) + section_b = _sha256_12(",".join(str(c) for c in sorted_cs)) + + # Section c: sorted extensions + sorted signature algorithms + sorted_ext = sorted(ch["extensions"]) + sorted_sa = sorted(ch.get("signature_algorithms", [])) + ext_str = ",".join(str(e) for e in sorted_ext) + sa_str = ",".join(str(s) for s in sorted_sa) + combined = f"{ext_str}_{sa_str}" if sa_str else ext_str + section_c = _sha256_12(combined) + + return f"{section_a}_{section_b}_{section_c}" + + +def _ja4s(sh: dict[str, Any]) -> str: + """ + Compute JA4S fingerprint from a parsed ServerHello. + + Format: a_b where + a = {t|q}{version:2}{ext_count:02d}{alpn_tag:2} + b = sha256_12({cipher_suite},{sorted_extensions comma-separated}) + """ + proto = "t" + # Use selected_version from supported_versions ext if available + selected = sh.get("selected_version") + if selected: + ver = {0x0304: "13", 0x0303: "12", 0x0302: "11", 0x0301: "10", + 0x0300: "s3", 0x0200: "s2"}.get(selected, "00") + else: + ver = {0x0304: "13", 0x0303: "12", 0x0302: "11", 0x0301: "10", + 0x0300: "s3", 0x0200: "s2"}.get(sh["tls_version"], "00") + + ext_count = min(len(sh["extensions"]), 99) + alpn_tag = _ja4_alpn_tag(sh.get("alpn", "")) + + section_a = f"{proto}{ver}{ext_count:02d}{alpn_tag}" + + sorted_ext = sorted(sh["extensions"]) + inner = f"{sh['cipher_suite']},{','.join(str(e) for e in sorted_ext)}" + section_b = _sha256_12(inner) + + return f"{section_a}_{section_b}" + + +# ─── JA4L (latency) ────────────────────────────────────────────────────────── + +def _ja4l(key: tuple[str, int, str, int]) -> dict[str, Any] | None: + """ + Retrieve JA4L data for a connection. 
+ + JA4L measures the TCP handshake RTT: time from SYN to SYN-ACK. + Returns {"rtt_ms": float, "client_ttl": int} or None. + """ + return _tcp_rtt.get(key) + + +# ─── Session resumption ────────────────────────────────────────────────────── + +def _session_resumption_info(ch: dict[str, Any]) -> dict[str, Any]: + """ + Analyze ClientHello for TLS session resumption behavior. + Returns a dict describing what resumption mechanisms the client uses. + """ + mechanisms: list[str] = [] + + if ch.get("has_session_ticket_data"): + mechanisms.append("session_ticket") + + if ch.get("has_pre_shared_key"): + mechanisms.append("psk") + + if ch.get("has_early_data"): + mechanisms.append("early_data_0rtt") + + if ch.get("session_id") and len(ch["session_id"]) > 0: + mechanisms.append("session_id") + + return { + "resumption_attempted": len(mechanisms) > 0, + "mechanisms": mechanisms, + } + + +# ─── Session cleanup ───────────────────────────────────────────────────────── + +def _cleanup_sessions() -> None: + now = time.monotonic() + stale = [k for k, ts in _session_ts.items() if now - ts > _SESSION_TTL] + for k in stale: + _sessions.pop(k, None) + _session_ts.pop(k, None) + # Also clean up TCP RTT tracking + stale_syn = [k for k, v in _tcp_syn.items() + if now - v.get("time", 0) > _SESSION_TTL] + for k in stale_syn: + _tcp_syn.pop(k, None) + stale_rtt = [k for k, _ in _tcp_rtt.items() + if k not in _sessions and k not in _session_ts] + for k in stale_rtt: + _tcp_rtt.pop(k, None) + + +# ─── Dedup cache ───────────────────────────────────────────────────────────── + +# Key: (src_ip, event_type, fingerprint_key) → timestamp of last emit +_dedup_cache: dict[tuple[str, str, str], float] = {} +_DEDUP_CLEANUP_INTERVAL: float = 60.0 +_dedup_last_cleanup: float = 0.0 + + +def _dedup_key_for(event_type: str, fields: dict[str, Any]) -> str: + """Build a dedup fingerprint from the most significant fields.""" + if event_type == "tls_client_hello": + return fields.get("ja3", "") + "|" + 
fields.get("ja4", "") + if event_type == "tls_session": + return (fields.get("ja3", "") + "|" + fields.get("ja3s", "") + + "|" + fields.get("ja4", "") + "|" + fields.get("ja4s", "")) + if event_type == "tls_certificate": + return fields.get("subject_cn", "") + "|" + fields.get("issuer", "") + # tls_resumption or unknown — dedup on mechanisms + return fields.get("mechanisms", fields.get("resumption", "")) + + +def _is_duplicate(event_type: str, fields: dict[str, Any]) -> bool: + """Return True if this event was already emitted within the dedup window.""" + if _DEDUP_TTL <= 0: + return False + + global _dedup_last_cleanup + now = time.monotonic() + + # Periodic cleanup + if now - _dedup_last_cleanup > _DEDUP_CLEANUP_INTERVAL: + stale = [k for k, ts in _dedup_cache.items() if now - ts > _DEDUP_TTL] + for k in stale: + del _dedup_cache[k] + _dedup_last_cleanup = now + + src_ip = fields.get("src_ip", "") + fp = _dedup_key_for(event_type, fields) + cache_key = (src_ip, event_type, fp) + + last_seen = _dedup_cache.get(cache_key) + if last_seen is not None and now - last_seen < _DEDUP_TTL: + return True + + _dedup_cache[cache_key] = now + return False + + +# ─── Logging helpers ───────────────────────────────────────────────────────── + +def _log(event_type: str, severity: int = SEVERITY_INFO, **fields: Any) -> None: + if _is_duplicate(event_type, fields): + return + line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity=severity, **fields) + write_syslog_file(line) + + +# ─── Packet callback ───────────────────────────────────────────────────────── + +def _on_packet(pkt: Any) -> None: + if not (pkt.haslayer(IP) and pkt.haslayer(TCP)): + return + + ip = pkt[IP] + tcp = pkt[TCP] + + src_ip: str = ip.src + dst_ip: str = ip.dst + src_port: int = tcp.sport + dst_port: int = tcp.dport + flags: int = tcp.flags.value if hasattr(tcp.flags, 'value') else int(tcp.flags) + + # ── TCP SYN tracking for JA4L ── + if flags & _TCP_SYN and not (flags & _TCP_ACK): + # Pure SYN — 
record timestamp and TTL + key = (src_ip, src_port, dst_ip, dst_port) + _tcp_syn[key] = {"time": time.monotonic(), "ttl": ip.ttl} + + elif flags & _TCP_SYN and flags & _TCP_ACK: + # SYN-ACK — calculate RTT for the original SYN sender + rev_key = (dst_ip, dst_port, src_ip, src_port) + syn_data = _tcp_syn.pop(rev_key, None) + if syn_data: + rtt_ms = round((time.monotonic() - syn_data["time"]) * 1000, 2) + _tcp_rtt[rev_key] = { + "rtt_ms": rtt_ms, + "client_ttl": syn_data["ttl"], + } + + payload = bytes(tcp.payload) + if not payload: + return + + # TLS record check + if payload[0] != _TLS_RECORD_HANDSHAKE: + return + + # Attempt ClientHello parse + ch = _parse_client_hello(payload) + if ch is not None: + _cleanup_sessions() + + key = (src_ip, src_port, dst_ip, dst_port) + ja3_str, ja3_hash = _ja3(ch) + ja4_hash = _ja4(ch) + resumption = _session_resumption_info(ch) + rtt_data = _ja4l(key) + + _sessions[key] = { + "ja3": ja3_hash, + "ja3_str": ja3_str, + "ja4": ja4_hash, + "tls_version": ch["tls_version"], + "cipher_suites": ch["cipher_suites"], + "extensions": ch["extensions"], + "signature_algorithms": ch.get("signature_algorithms", []), + "supported_versions": ch.get("supported_versions", []), + "sni": ch["sni"], + "alpn": ch["alpn"], + "resumption": resumption, + } + _session_ts[key] = time.monotonic() + + log_fields: dict[str, Any] = { + "src_ip": src_ip, + "src_port": str(src_port), + "dst_ip": dst_ip, + "dst_port": str(dst_port), + "ja3": ja3_hash, + "ja4": ja4_hash, + "tls_version": _tls_version_str(ch["tls_version"]), + "sni": ch["sni"] or "", + "alpn": ",".join(ch["alpn"]), + "raw_ciphers": "-".join(str(c) for c in ch["cipher_suites"]), + "raw_extensions": "-".join(str(e) for e in ch["extensions"]), + } + + if resumption["resumption_attempted"]: + log_fields["resumption"] = ",".join(resumption["mechanisms"]) + + if rtt_data: + log_fields["ja4l_rtt_ms"] = str(rtt_data["rtt_ms"]) + log_fields["ja4l_client_ttl"] = str(rtt_data["client_ttl"]) + + 
_log("tls_client_hello", **log_fields)
+        return
+
+    # Attempt ServerHello parse
+    sh = _parse_server_hello(payload)
+    if sh is not None:
+        # Reverse 4-tuple to find the matching ClientHello
+        rev_key = (dst_ip, dst_port, src_ip, src_port)
+        # Keep the session entry (do not pop): in TLS ≤1.2 the Certificate
+        # message follows the ServerHello on the same connection, and the
+        # certificate handler below needs this entry to attach the SNI.
+        # TTL-based cleanup reclaims it later.
+        ch_data = _sessions.get(rev_key)
+
+        ja3s_str, ja3s_hash = _ja3s(sh)
+        ja4s_hash = _ja4s(sh)
+
+        fields: dict[str, Any] = {
+            "src_ip": dst_ip,  # original attacker is now the destination
+            "src_port": str(dst_port),
+            "dst_ip": src_ip,
+            "dst_port": str(src_port),
+            "ja3s": ja3s_hash,
+            "ja4s": ja4s_hash,
+            "tls_version": _tls_version_str(sh["tls_version"]),
+        }
+
+        if ch_data:
+            fields["ja3"] = ch_data["ja3"]
+            fields["ja4"] = ch_data.get("ja4", "")
+            fields["sni"] = ch_data["sni"] or ""
+            fields["alpn"] = ",".join(ch_data["alpn"])
+            fields["raw_ciphers"] = "-".join(str(c) for c in ch_data["cipher_suites"])
+            fields["raw_extensions"] = "-".join(str(e) for e in ch_data["extensions"])
+            if ch_data.get("resumption", {}).get("resumption_attempted"):
+                fields["resumption"] = ",".join(ch_data["resumption"]["mechanisms"])
+
+        rtt_data = _tcp_rtt.pop(rev_key, None)
+        if rtt_data:
+            fields["ja4l_rtt_ms"] = str(rtt_data["rtt_ms"])
+            fields["ja4l_client_ttl"] = str(rtt_data["client_ttl"])
+
+        _log("tls_session", severity=SEVERITY_WARNING, **fields)
+        return
+
+    # Attempt Certificate parse (TLS 1.2 only — 1.3 encrypts it)
+    cert = _parse_certificate(payload)
+    if cert is not None:
+        # Match to a session — the cert comes from the server side
+        rev_key = (dst_ip, dst_port, src_ip, src_port)
+        ch_data = _sessions.get(rev_key)
+
+        cert_fields: dict[str, Any] = {
+            "src_ip": dst_ip,
+            "src_port": str(dst_port),
+            "dst_ip": src_ip,
+            "dst_port": str(src_port),
+            "subject_cn": cert["subject_cn"],
+            "issuer": cert["issuer"],
+            "self_signed": str(cert["self_signed"]).lower(),
+            "not_before": cert["not_before"],
+            "not_after": cert["not_after"],
+        }
+        if cert["sans"]:
+            cert_fields["sans"] = 
",".join(cert["sans"]) + if ch_data: + cert_fields["sni"] = ch_data.get("sni", "") + + _log("tls_certificate", **cert_fields) + + +# ─── Entry point ───────────────────────────────────────────────────────────── + +if __name__ == "__main__": + _log("startup", msg=f"sniffer started node={NODE_NAME}") + sniff( + filter="tcp", + prn=_on_packet, + store=False, + ) diff --git a/templates/snmp/Dockerfile b/decnet/templates/snmp/Dockerfile similarity index 85% rename from templates/snmp/Dockerfile rename to decnet/templates/snmp/Dockerfile index 5a452e9..9b79675 100644 --- a/templates/snmp/Dockerfile +++ b/decnet/templates/snmp/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 161/udp -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/snmp/entrypoint.sh b/decnet/templates/snmp/entrypoint.sh similarity index 100% rename from templates/snmp/entrypoint.sh rename to decnet/templates/snmp/entrypoint.sh diff --git a/templates/snmp/server.py b/decnet/templates/snmp/server.py similarity index 98% rename from templates/snmp/server.py rename to decnet/templates/snmp/server.py index 34bb7bd..9410939 100644 --- a/templates/snmp/server.py +++ b/decnet/templates/snmp/server.py @@ -9,7 +9,7 @@ Logs all 
requests as JSON. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "switch") SERVICE_NAME = "snmp" @@ -68,7 +68,6 @@ _OID_VALUES = { def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/snmp/syslog_bridge.py b/decnet/templates/snmp/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/snmp/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/decnet/templates/ssh/Dockerfile b/decnet/templates/ssh/Dockerfile new file mode 100644 index 0000000..5e91886 --- /dev/null +++ b/decnet/templates/ssh/Dockerfile @@ -0,0 +1,116 @@ +ARG BASE_IMAGE=debian:bookworm-slim +FROM ${BASE_IMAGE} + +RUN apt-get update && apt-get install -y --no-install-recommends \ + openssh-server \ + sudo \ + rsyslog \ + curl \ + wget \ + vim \ + nano \ + net-tools \ + procps \ + htop \ + git \ + inotify-tools \ + psmisc \ + iproute2 \ + iputils-ping \ + ca-certificates \ + nmap \ + jq \ + python3 \ + && rm -rf /var/lib/apt/lists/* + +RUN mkdir -p /var/run/sshd /root/.ssh /var/log/journal /var/lib/systemd/coredump \ + && chmod 700 /var/lib/systemd/coredump + +# sshd_config: allow root + password auth; VERBOSE so session lines carry +# client IP + session PID (needed for file-capture attribution). 
+RUN sed -i \ + -e 's|^#\?PermitRootLogin.*|PermitRootLogin yes|' \ + -e 's|^#\?PasswordAuthentication.*|PasswordAuthentication yes|' \ + -e 's|^#\?ChallengeResponseAuthentication.*|ChallengeResponseAuthentication no|' \ + -e 's|^#\?LogLevel.*|LogLevel VERBOSE|' \ + /etc/ssh/sshd_config + +# rsyslog: forward auth.* and user.* to PID 1's stdout in RFC 5424 format. +# /proc/1/fd/1 is the container-stdout fd Docker attached — writing there +# surfaces lines in `docker logs` without needing a named pipe + relay cat +# (which would be readable AND writable by any root-in-container process). +RUN printf '%s\n' \ + '# auth + user events → container stdout as RFC 5424' \ + '$template RFC5424fmt,"<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% %APP-NAME% %PROCID% %MSGID% %STRUCTURED-DATA% %msg%\n"' \ + 'auth,authpriv.* /proc/1/fd/1;RFC5424fmt' \ + 'user.* /proc/1/fd/1;RFC5424fmt' \ + > /etc/rsyslog.d/50-journal-forward.conf + +# Silence default catch-all rules so we own auth/user routing exclusively. +# Also disable rsyslog's privilege drop: PID 1's stdout (/proc/1/fd/1) is +# owned by root, so a syslog-user rsyslogd gets EACCES and silently drops +# every auth/user line (bash CMD events + file_captured emissions). 
+RUN sed -i \ + -e 's|^\(\*\.\*;auth,authpriv\.none\)|#\1|' \ + -e 's|^auth,authpriv\.\*|#auth,authpriv.*|' \ + -e 's|^\$PrivDropToUser|#$PrivDropToUser|' \ + -e 's|^\$PrivDropToGroup|#$PrivDropToGroup|' \ + /etc/rsyslog.conf + +# Sudo: log to syslog (auth facility) AND a local file with full I/O capture +RUN echo 'Defaults logfile="/var/log/sudo.log"' >> /etc/sudoers && \ + echo 'Defaults syslog=auth' >> /etc/sudoers && \ + echo 'Defaults log_input,log_output' >> /etc/sudoers + +# Lived-in environment: motd, shell aliases, fake project files +RUN echo "Ubuntu 22.04.3 LTS" > /etc/issue.net && \ + echo "Welcome to Ubuntu 22.04.3 LTS (GNU/Linux 5.15.0-88-generic x86_64)" > /etc/motd && \ + echo "" >> /etc/motd && \ + echo " * Documentation: https://help.ubuntu.com" >> /etc/motd && \ + echo " * Management: https://landscape.canonical.com" >> /etc/motd && \ + echo " * Support: https://ubuntu.com/advantage" >> /etc/motd + +RUN echo 'alias ll="ls -alF"' >> /root/.bashrc && \ + echo 'alias la="ls -A"' >> /root/.bashrc && \ + echo 'alias l="ls -CF"' >> /root/.bashrc && \ + echo 'export HISTSIZE=1000' >> /root/.bashrc && \ + echo 'export HISTFILESIZE=2000' >> /root/.bashrc && \ + echo 'PROMPT_COMMAND='"'"'logger -p user.info -t bash "CMD uid=$UID user=$USER src=${SSH_CLIENT%% *} pwd=$PWD cmd=$(history 1 | sed "s/^ *[0-9]* *//")";'"'" >> /root/.bashrc + +# Fake project files to look lived-in +RUN mkdir -p /root/projects /root/backups /var/www/html && \ + printf '# TODO: migrate DB to new server\n# check cron jobs\n# update SSL cert\n' > /root/notes.txt && \ + printf 'DB_HOST=10.0.0.5\nDB_USER=admin\nDB_PASS=changeme123\nDB_NAME=prod_db\n' > /root/projects/.env && \ + printf '[Unit]\nDescription=App Server\n[Service]\nExecStart=/usr/bin/python3 /opt/app/server.py\n' > /root/projects/app.service + +# Stage all capture sources in a scratch dir. 
Nothing here survives the layer: +# _build_stealth.py packs syslog_bridge.py + emit_capture.py + capture.sh into +# XOR+gzip+base64 blobs embedded directly in /entrypoint.sh, and the whole +# /tmp/build tree is wiped at the end of the RUN — so the final image has no +# `.py` file under /opt and no `journal-relay` script under /usr/libexec/udev. +COPY entrypoint.sh capture.sh syslog_bridge.py emit_capture.py \ + argv_zap.c _build_stealth.py /tmp/build/ + +# argv_zap is compiled into a shared object disguised as a multiarch +# udev-companion library (sits next to real libudev.so.1). gcc is installed +# only for this build step and purged in the same layer. +RUN set -eu \ + && apt-get update \ + && apt-get install -y --no-install-recommends gcc libc6-dev \ + && mkdir -p /usr/lib/x86_64-linux-gnu /usr/libexec/udev \ + && gcc -O2 -fPIC -shared \ + -o /usr/lib/x86_64-linux-gnu/libudev-shared.so.1 \ + /tmp/build/argv_zap.c -ldl \ + && apt-get purge -y gcc libc6-dev \ + && apt-get autoremove -y \ + && rm -rf /var/lib/apt/lists/* \ + && ln -sf /usr/bin/inotifywait /usr/libexec/udev/kmsg-watch \ + && python3 /tmp/build/_build_stealth.py \ + && rm -rf /tmp/build + +EXPOSE 22 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD kill -0 1 || exit 1 + +ENTRYPOINT ["/entrypoint.sh"] diff --git a/decnet/templates/ssh/_build_stealth.py b/decnet/templates/ssh/_build_stealth.py new file mode 100644 index 0000000..a3a4ceb --- /dev/null +++ b/decnet/templates/ssh/_build_stealth.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Build-time helper: merge capture Python sources, XOR+gzip+base64 pack them +and the capture.sh loop, and render the final /entrypoint.sh from its +templated form. + +Runs inside the Docker build. Reads from /tmp/build/, writes /entrypoint.sh. 
+""" + +from __future__ import annotations + +import base64 +import gzip +import random +import sys +from pathlib import Path + +BUILD = Path("/tmp/build") + + +def _merge_python() -> str: + bridge = (BUILD / "syslog_bridge.py").read_text() + emit = (BUILD / "emit_capture.py").read_text() + + def _clean(src: str) -> tuple[list[str], list[str]]: + """Return (future_imports, other_lines) with noise stripped.""" + futures: list[str] = [] + rest: list[str] = [] + for line in src.splitlines(): + ls = line.lstrip() + if ls.startswith("from __future__"): + futures.append(line) + elif ls.startswith("sys.path.insert") or ls.startswith("from syslog_bridge"): + continue + else: + rest.append(line) + return futures, rest + + b_fut, b_rest = _clean(bridge) + e_fut, e_rest = _clean(emit) + + # Deduplicate future imports and hoist to the very top. + seen: set[str] = set() + futures: list[str] = [] + for line in (*b_fut, *e_fut): + stripped = line.strip() + if stripped not in seen: + seen.add(stripped) + futures.append(line) + + header = "\n".join(futures) + body = "\n".join(b_rest) + "\n\n" + "\n".join(e_rest) + return (header + "\n" if header else "") + body + + +def _pack(text: str, key: int) -> str: + gz = gzip.compress(text.encode("utf-8")) + xored = bytes(b ^ key for b in gz) + return base64.b64encode(xored).decode("ascii") + + +def main() -> int: + key = random.SystemRandom().randint(1, 255) + + merged_py = _merge_python() + capture_sh = (BUILD / "capture.sh").read_text() + + emit_b64 = _pack(merged_py, key) + relay_b64 = _pack(capture_sh, key) + + tpl = (BUILD / "entrypoint.sh").read_text() + rendered = ( + tpl.replace("__STEALTH_KEY__", str(key)) + .replace("__EMIT_CAPTURE_B64__", emit_b64) + .replace("__JOURNAL_RELAY_B64__", relay_b64) + ) + + for marker in ("__STEALTH_KEY__", "__EMIT_CAPTURE_B64__", "__JOURNAL_RELAY_B64__"): + if marker in rendered: + print(f"build: placeholder {marker} still present after render", file=sys.stderr) + return 1 + + 
Path("/entrypoint.sh").write_text(rendered) + Path("/entrypoint.sh").chmod(0o755) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/decnet/templates/ssh/argv_zap.c b/decnet/templates/ssh/argv_zap.c new file mode 100644 index 0000000..4f60996 --- /dev/null +++ b/decnet/templates/ssh/argv_zap.c @@ -0,0 +1,65 @@ +/* + * argv_zap.so — LD_PRELOAD shim that blanks argv[1..] from /proc/PID/cmdline + * after the target binary has parsed its arguments. + * + * Rationale: exec -a can rewrite argv[0], but the remaining args (paths, + * flags) remain visible via `ps aux`. By hooking __libc_start_main we can + * copy argv into heap-backed storage, hand that to the real main, then + * zero the stack-resident argv region so the kernel's cmdline reader + * returns just argv[0]. + * + * Usage: + * gcc -O2 -fPIC -shared -o argv_zap.so argv_zap.c -ldl + * ARGV_ZAP_COMM=kmsg-watch LD_PRELOAD=/path/argv_zap.so \ + * exec -a "kmsg-watch" inotifywait … + */ + +#define _GNU_SOURCE +#include +#include +#include +#include + +typedef int (*main_t)(int, char **, char **); +typedef int (*libc_start_main_t)(main_t, int, char **, + void (*)(void), void (*)(void), + void (*)(void), void *); + +static main_t real_main; + +static int wrapped_main(int argc, char **argv, char **envp) { + /* Heap-copy argv so the target keeps its arguments. */ + char **heap_argv = (char **)calloc(argc + 1, sizeof(char *)); + if (heap_argv) { + for (int i = 0; i < argc; i++) { + heap_argv[i] = strdup(argv[i] ? argv[i] : ""); + } + } + + /* Zero the contiguous argv[1..] region (argv[0] stays for ps). */ + if (argc > 1 && argv[1] && argv[argc - 1]) { + char *start = argv[1]; + char *end = argv[argc - 1] + strlen(argv[argc - 1]); + if (end > start) memset(start, 0, (size_t)(end - start)); + } + + /* Optional comm rename so /proc/self/comm mirrors the argv[0] disguise. 
+ * Read from ARGV_ZAP_COMM so different callers can pick their own name + * (kmsg-watch for inotifywait, journal-relay for the watcher bash, …). + * Unset afterwards so children don't accidentally inherit the override. */ + const char *comm = getenv("ARGV_ZAP_COMM"); + if (comm && *comm) { + prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0); + unsetenv("ARGV_ZAP_COMM"); + } + + return real_main(argc, heap_argv ? heap_argv : argv, envp); +} + +int __libc_start_main(main_t main_fn, int argc, char **argv, + void (*init)(void), void (*fini)(void), + void (*rtld_fini)(void), void *stack_end) { + real_main = main_fn; + libc_start_main_t real = (libc_start_main_t)dlsym(RTLD_NEXT, "__libc_start_main"); + return real(wrapped_main, argc, argv, init, fini, rtld_fini, stack_end); +} diff --git a/decnet/templates/ssh/capture.sh b/decnet/templates/ssh/capture.sh new file mode 100755 index 0000000..21952c5 --- /dev/null +++ b/decnet/templates/ssh/capture.sh @@ -0,0 +1,265 @@ +#!/bin/bash +# systemd-journal relay helper: mirrors newly-written files under a +# monitored set of paths into the coredump staging directory and emits +# a structured journal line per event. +# +# `lastpipe` runs the tail of `inotify | while` in the current shell so +# the process tree stays flat (one bash, not two). Job control must be +# off for lastpipe to apply — non-interactive scripts already have it off. +shopt -s lastpipe +set +m + +set -u + +CAPTURE_DIR="${CAPTURE_DIR:-/var/lib/systemd/coredump}" +CAPTURE_MAX_BYTES="${CAPTURE_MAX_BYTES:-52428800}" # 50 MiB +CAPTURE_WATCH_PATHS="${CAPTURE_WATCH_PATHS:-/root /tmp /var/tmp /home /var/www /opt /dev/shm}" +# Invoke inotifywait through the udev-sided symlink; fall back to the real +# binary if the symlink is missing. +INOTIFY_BIN="${INOTIFY_BIN:-/usr/libexec/udev/kmsg-watch}" +[ -x "$INOTIFY_BIN" ] || INOTIFY_BIN="$(command -v inotifywait)" + +mkdir -p "$CAPTURE_DIR" +chmod 700 "$CAPTURE_DIR" + +# Filenames we never capture (boot noise, self-writes). 
+_is_ignored_path() { + local p="$1" + case "$p" in + "$CAPTURE_DIR"/*) return 0 ;; + /var/lib/systemd/*) return 0 ;; + */.bash_history) return 0 ;; + */.viminfo) return 0 ;; + */ssh_host_*_key*) return 0 ;; + esac + return 1 +} + +# Resolve the writer PID best-effort. Prints the PID or nothing. +_writer_pid() { + local path="$1" + local pid + pid="$(fuser "$path" 2>/dev/null | tr -d ' \t\n')" + if [ -n "$pid" ]; then + printf '%s' "${pid%% *}" + return + fi + # Fallback: scan /proc/*/fd for an open handle on the path. + for fd_link in /proc/[0-9]*/fd/*; do + [ -L "$fd_link" ] || continue + if [ "$(readlink -f "$fd_link" 2>/dev/null)" = "$path" ]; then + printf '%s' "$(echo "$fd_link" | awk -F/ '{print $3}')" + return + fi + done +} + +# Walk PPid chain from $1 until we hit an sshd session leader. +# Prints: (empty on no match). +_walk_to_sshd() { + local pid="$1" + local depth=0 + while [ -n "$pid" ] && [ "$pid" != "0" ] && [ "$pid" != "1" ] && [ $depth -lt 20 ]; do + local cmd + cmd="$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null)" + # sshd session leaders look like: "sshd: root@pts/0" or "sshd: root@notty" + if echo "$cmd" | grep -qE '^sshd: [^ ]+@'; then + local user + user="$(echo "$cmd" | sed -E 's/^sshd: ([^@]+)@.*/\1/')" + printf '%s %s' "$pid" "$user" + return + fi + pid="$(awk '/^PPid:/ {print $2}' "/proc/$pid/status" 2>/dev/null)" + depth=$((depth + 1)) + done +} + +# Emit a JSON array of currently-established SSH peers. +# Each item: {pid, src_ip, src_port}. 
+_ss_sessions_json() { + ss -Htnp state established sport = :22 2>/dev/null \ + | awk ' + { + peer=$4; local_=$3; + # peer looks like 198.51.100.7:55342 (may be IPv6 [::1]:x) + n=split(peer, a, ":"); + port=a[n]; + ip=peer; sub(":" port "$", "", ip); + gsub(/[\[\]]/, "", ip); + # extract pid from users:(("sshd",pid=1234,fd=5)) + pid=""; + if (match($0, /pid=[0-9]+/)) { + pid=substr($0, RSTART+4, RLENGTH-4); + } + printf "{\"pid\":%s,\"src_ip\":\"%s\",\"src_port\":%s}\n", + (pid==""?"null":pid), ip, (port+0); + }' \ + | jq -s '.' +} + +# Emit a JSON array of logged-in users from utmp. +# Each item: {user, src_ip, login_at}. +_who_sessions_json() { + who --ips 2>/dev/null \ + | awk '{ printf "{\"user\":\"%s\",\"tty\":\"%s\",\"login_at\":\"%s %s\",\"src_ip\":\"%s\"}\n", $1, $2, $3, $4, $NF }' \ + | jq -s '.' +} + +_capture_one() { + local src="$1" + [ -f "$src" ] || return 0 + _is_ignored_path "$src" && return 0 + + local size + size="$(stat -c '%s' "$src" 2>/dev/null)" + [ -z "$size" ] && return 0 + if [ "$size" -gt "$CAPTURE_MAX_BYTES" ]; then + logger -p user.info -t systemd-journal "file_skipped size=$size path=$src reason=oversize" + return 0 + fi + + # Attribution first — PID may disappear after the copy races. 
+ local writer_pid writer_comm writer_cmdline writer_uid writer_loginuid + writer_pid="$(_writer_pid "$src")" + if [ -n "$writer_pid" ] && [ -d "/proc/$writer_pid" ]; then + writer_comm="$(cat "/proc/$writer_pid/comm" 2>/dev/null)" + writer_cmdline="$(tr '\0' ' ' < "/proc/$writer_pid/cmdline" 2>/dev/null)" + writer_uid="$(awk '/^Uid:/ {print $2}' "/proc/$writer_pid/status" 2>/dev/null)" + writer_loginuid="$(cat "/proc/$writer_pid/loginuid" 2>/dev/null)" + fi + + local ssh_pid ssh_user + if [ -n "$writer_pid" ]; then + read -r ssh_pid ssh_user < <(_walk_to_sshd "$writer_pid" || true) + fi + + local ss_json who_json + ss_json="$(_ss_sessions_json 2>/dev/null || echo '[]')" + who_json="$(_who_sessions_json 2>/dev/null || echo '[]')" + + # Resolve src_ip via ss by matching ssh_pid. + local src_ip="" src_port="null" attribution="unknown" + if [ -n "${ssh_pid:-}" ]; then + local matched + matched="$(echo "$ss_json" | jq -c --argjson p "$ssh_pid" '.[] | select(.pid==$p)')" + if [ -n "$matched" ]; then + src_ip="$(echo "$matched" | jq -r '.src_ip')" + src_port="$(echo "$matched" | jq -r '.src_port')" + attribution="pid-chain" + fi + fi + # Fallback 1: ss-only. scp/wget/sftp close their fd before close_write + # fires, so fuser/proc-fd walks miss them. If there's exactly one live + # sshd session, attribute to it. With multiple, attribute to the first + # but tag ambiguous so analysts know to cross-check concurrent_sessions. 
+ if [ "$attribution" = "unknown" ]; then + local ss_len + ss_len="$(echo "$ss_json" | jq 'length')" + if [ "$ss_len" -ge 1 ]; then + src_ip="$(echo "$ss_json" | jq -r '.[0].src_ip')" + src_port="$(echo "$ss_json" | jq -r '.[0].src_port')" + ssh_pid="$(echo "$ss_json" | jq -r '.[0].pid // empty')" + if [ -n "${ssh_pid:-}" ] && [ -d "/proc/$ssh_pid" ]; then + local ssh_cmd + ssh_cmd="$(tr '\0' ' ' < "/proc/$ssh_pid/cmdline" 2>/dev/null)" + ssh_user="$(echo "$ssh_cmd" | sed -nE 's/^sshd: ([^@]+)@.*/\1/p')" + fi + if [ "$ss_len" -eq 1 ]; then + attribution="ss-only" + else + attribution="ss-ambiguous" + fi + fi + fi + + # Fallback 2: utmp. Weakest signal; often empty in containers. + if [ "$attribution" = "unknown" ] && [ "$(echo "$who_json" | jq 'length')" -gt 0 ]; then + src_ip="$(echo "$who_json" | jq -r '.[0].src_ip')" + attribution="utmp-only" + fi + + local sha + sha="$(sha256sum "$src" 2>/dev/null | awk '{print $1}')" + [ -z "$sha" ] && return 0 + + local ts base stored_as + ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)" + base="$(basename "$src")" + stored_as="${ts}_${sha:0:12}_${base}" + + cp --preserve=timestamps,ownership "$src" "$CAPTURE_DIR/$stored_as" 2>/dev/null || return 0 + + local mtime + mtime="$(stat -c '%y' "$src" 2>/dev/null)" + + # Prefer NODE_NAME (the deployer-supplied decky identifier) over + # $HOSTNAME, which is a cosmetic fake like "SRV-DEV-36" set by + # entrypoint.sh. The UI and the artifact bind mount both key on the + # decky name, so using $HOSTNAME here makes /artifacts/{decky}/... URLs + # unresolvable. + local decky="${NODE_NAME:-${HOSTNAME:-unknown}}" + + # One syslog line, no sidecar. Flat summary fields ride as top-level SD + # params (searchable pills in the UI); bulky nested structures (writer + # cmdline, concurrent_sessions, ss_snapshot) are base64-packed into a + # single meta_json_b64 SD param by emit_capture.py. 
+ jq -n \ + --arg _hostname "$decky" \ + --arg _service "ssh" \ + --arg _event_type "file_captured" \ + --arg captured_at "$ts" \ + --arg orig_path "$src" \ + --arg stored_as "$stored_as" \ + --arg sha256 "$sha" \ + --argjson size "$size" \ + --arg mtime "$mtime" \ + --arg attribution "$attribution" \ + --arg writer_pid "${writer_pid:-}" \ + --arg writer_comm "${writer_comm:-}" \ + --arg writer_cmdline "${writer_cmdline:-}" \ + --arg writer_uid "${writer_uid:-}" \ + --arg writer_loginuid "${writer_loginuid:-}" \ + --arg ssh_pid "${ssh_pid:-}" \ + --arg ssh_user "${ssh_user:-}" \ + --arg src_ip "$src_ip" \ + --arg src_port "$src_port" \ + --argjson concurrent "$who_json" \ + --argjson ss_snapshot "$ss_json" \ + '{ + _hostname: $_hostname, + _service: $_service, + _event_type: $_event_type, + captured_at: $captured_at, + orig_path: $orig_path, + stored_as: $stored_as, + sha256: $sha256, + size: $size, + mtime: $mtime, + attribution: $attribution, + writer_pid: $writer_pid, + writer_comm: $writer_comm, + writer_uid: $writer_uid, + ssh_pid: $ssh_pid, + ssh_user: $ssh_user, + src_ip: $src_ip, + src_port: (if $src_port == "null" or $src_port == "" then "" else $src_port end), + writer_cmdline: $writer_cmdline, + writer_loginuid: $writer_loginuid, + concurrent_sessions: $concurrent, + ss_snapshot: $ss_snapshot + }' \ + | python3 <(printf '%s' "$EMIT_CAPTURE_PY") +} + +# Main loop. +# LD_PRELOAD libudev-shared.so.1 blanks argv[1..] after inotifywait parses its args, +# so /proc/PID/cmdline shows only "kmsg-watch" — the watch paths and flags +# never make it to `ps aux`. 
+# shellcheck disable=SC2086 +ARGV_ZAP_COMM=kmsg-watch LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libudev-shared.so.1 "$INOTIFY_BIN" -m -r -q \ + --event close_write --event moved_to \ + --format '%w%f' \ + $CAPTURE_WATCH_PATHS 2>/dev/null \ +| while IFS= read -r path; do + _capture_one "$path" & +done diff --git a/decnet/templates/ssh/emit_capture.py b/decnet/templates/ssh/emit_capture.py new file mode 100644 index 0000000..b2c4b8d --- /dev/null +++ b/decnet/templates/ssh/emit_capture.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +""" +Emit an RFC 5424 `file_captured` line to stdout. + +Called by capture.sh after a file drop has been mirrored into the quarantine +directory. Reads a single JSON object from stdin describing the event; emits +one syslog line that the collector parses into `logs.fields`. + +The input JSON may contain arbitrary nested structures (writer cmdline, +concurrent_sessions, ss_snapshot). Bulky fields are base64-encoded into a +single `meta_json_b64` SD param — this avoids pathological characters +(`]`, `"`, `\\`) that the collector's SD-block regex cannot losslessly +round-trip when embedded directly. +""" + +from __future__ import annotations + +import base64 +import json +import os +import sys + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) + +from syslog_bridge import syslog_line, write_syslog_file # noqa: E402 + +# Flat fields ride as individual SD params (searchable, rendered as pills). +# Everything else is rolled into the base64 meta blob. +_FLAT_FIELDS: tuple[str, ...] 
= ( + "stored_as", + "sha256", + "size", + "orig_path", + "src_ip", + "src_port", + "ssh_user", + "ssh_pid", + "attribution", + "writer_pid", + "writer_comm", + "writer_uid", + "mtime", +) + + +def main() -> int: + raw = sys.stdin.read() + if not raw.strip(): + print("emit_capture: empty stdin", file=sys.stderr) + return 1 + try: + event: dict = json.loads(raw) + except json.JSONDecodeError as exc: + print(f"emit_capture: bad JSON: {exc}", file=sys.stderr) + return 1 + + hostname = str(event.pop("_hostname", None) or os.environ.get("HOSTNAME") or "-") + service = str(event.pop("_service", "ssh")) + event_type = str(event.pop("_event_type", "file_captured")) + + fields: dict[str, str] = {} + for key in _FLAT_FIELDS: + if key in event: + value = event.pop(key) + if value is None or value == "": + continue + fields[key] = str(value) + + if event: + payload = json.dumps(event, separators=(",", ":"), ensure_ascii=False, sort_keys=True) + fields["meta_json_b64"] = base64.b64encode(payload.encode("utf-8")).decode("ascii") + + line = syslog_line( + service=service, + hostname=hostname, + event_type=event_type, + **fields, + ) + write_syslog_file(line) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/decnet/templates/ssh/entrypoint.sh b/decnet/templates/ssh/entrypoint.sh new file mode 100644 index 0000000..7ecd6fa --- /dev/null +++ b/decnet/templates/ssh/entrypoint.sh @@ -0,0 +1,79 @@ +#!/bin/bash +set -e + +# Configure root password (default: admin) +ROOT_PASSWORD="${SSH_ROOT_PASSWORD:-admin}" +echo "root:${ROOT_PASSWORD}" | chpasswd + +# Optional: override hostname inside container +if [ -n "$SSH_HOSTNAME" ]; then + echo "$SSH_HOSTNAME" > /etc/hostname + hostname "$SSH_HOSTNAME" +fi + +# Generate host keys if missing (first boot) +ssh-keygen -A + +# Fake bash history so the box looks used +if [ ! 
-f /root/.bash_history ]; then + cat > /root/.bash_history <<'HIST' +apt update && apt upgrade -y +systemctl status nginx +tail -f /var/log/syslog +df -h +htop +ps aux | grep python +git pull origin main +cd /root/projects +vim notes.txt +crontab -e +ls /var/www/html +HIST +fi + +# Logging pipeline: rsyslogd (RFC 5424) → /proc/1/fd/1 → Docker log capture. +# No intermediate pipe/relay — a named FIFO would be readable AND writable +# by any root-in-container process, letting an attacker either eavesdrop on +# the SIEM feed or inject forged log lines. +if [ -n "${NODE_NAME:-}" ]; then + printf '$LocalHostName %s\n' "$NODE_NAME" > /etc/rsyslog.d/00-localhostname.conf +fi +rsyslogd + +# ─── systemd-journal companion bootstrap ───────────────────────────────────── +# The next block primes cached udev/journal helpers from in-memory tables. +# Tables are XOR+gzip+base64 packed at image build. Keeping the data inline +# avoids a separate /usr/libexec script (which would be visible and readable) +# and means the helpers only ever live in this process's memory + an +# anonymous pipe the kernel hands to python via /dev/fd. +_STEALTH_KEY=__STEALTH_KEY__ +_EMIT_CAPTURE_B64='__EMIT_CAPTURE_B64__' +_JOURNAL_RELAY_B64='__JOURNAL_RELAY_B64__' + +_decode() { + printf '%s' "$1" | base64 -d | python3 -c ' +import sys +k = '"$_STEALTH_KEY"' +d = sys.stdin.buffer.read() +sys.stdout.buffer.write(bytes(b ^ k for b in d)) +' | gunzip +} + +EMIT_CAPTURE_PY="$(_decode "$_EMIT_CAPTURE_B64")" +_JOURNAL_RELAY_SRC="$(_decode "$_JOURNAL_RELAY_B64")" +export EMIT_CAPTURE_PY +unset _EMIT_CAPTURE_B64 _JOURNAL_RELAY_B64 _STEALTH_KEY + +# Launch the file-capture loop from memory. LD_PRELOAD + ARGV_ZAP_COMM blank +# argv[1..] so /proc/PID/cmdline shows only "journal-relay". 
+( + export CAPTURE_DIR=/var/lib/systemd/coredump + export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libudev-shared.so.1 + export ARGV_ZAP_COMM=journal-relay + exec -a journal-relay bash -c "$_JOURNAL_RELAY_SRC" +) & + +unset _JOURNAL_RELAY_SRC + +# sshd logs via syslog — no -e flag, so auth events flow through rsyslog → /proc/1/fd/1 → stdout +exec /usr/sbin/sshd -D diff --git a/decnet/templates/ssh/syslog_bridge.py b/decnet/templates/ssh/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/ssh/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/decnet/templates/syslog_bridge.py b/decnet/templates/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/telnet/Dockerfile b/decnet/templates/telnet/Dockerfile similarity index 86% rename from templates/telnet/Dockerfile rename to decnet/templates/telnet/Dockerfile index ad66570..483446b 100644 --- a/templates/telnet/Dockerfile +++ b/decnet/templates/telnet/Dockerfile @@ -10,11 +10,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ # rsyslog: forward auth.* and user.* to named pipe in RFC 5424 format RUN printf '%s\n' \ - '# DECNET log bridge — auth + user events → named pipe as RFC 5424' \ + '# syslog-relay log bridge — auth + user events → named pipe as RFC 5424' \ '$template RFC5424fmt,"<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% %APP-NAME% %PROCID% %MSGID% %STRUCTURED-DATA% %msg%\n"' \ - 'auth,authpriv.* |/var/run/decnet-logs;RFC5424fmt' \ - 'user.* |/var/run/decnet-logs;RFC5424fmt' \ - > /etc/rsyslog.d/99-decnet.conf + 'auth,authpriv.* |/run/systemd/journal/syslog-relay;RFC5424fmt' \ + 'user.* |/run/systemd/journal/syslog-relay;RFC5424fmt' \ + > /etc/rsyslog.d/50-journal-forward.conf # Disable imklog — containers can't read /proc/kmsg RUN 
sed -i 's/^\(module(load="imklog"\)/# \1/' /etc/rsyslog.conf diff --git a/templates/telnet/entrypoint.sh b/decnet/templates/telnet/entrypoint.sh similarity index 70% rename from templates/telnet/entrypoint.sh rename to decnet/templates/telnet/entrypoint.sh index 81da1e4..78dff79 100644 --- a/templates/telnet/entrypoint.sh +++ b/decnet/templates/telnet/entrypoint.sh @@ -27,12 +27,14 @@ cat /root/.env HIST fi -# Logging pipeline: named pipe → rsyslogd (RFC 5424) → stdout -rm -f /var/run/decnet-logs -mkfifo /var/run/decnet-logs +# Logging pipeline: named pipe → rsyslogd (RFC 5424) → stdout. +# Cloak the pipe path and the relay `cat` so `ps aux` / `ls /run` don't +# betray the honeypot — see ssh/entrypoint.sh for the same pattern. +mkdir -p /run/systemd/journal +rm -f /run/systemd/journal/syslog-relay +mkfifo /run/systemd/journal/syslog-relay -# Relay pipe to stdout so Docker captures all syslog events -cat /var/run/decnet-logs & +bash -c 'exec -a "systemd-journal-fwd" cat /run/systemd/journal/syslog-relay' & # Start rsyslog rsyslogd diff --git a/decnet/templates/telnet/syslog_bridge.py b/decnet/templates/telnet/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/telnet/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/tftp/Dockerfile b/decnet/templates/tftp/Dockerfile similarity index 85% rename from templates/tftp/Dockerfile rename to decnet/templates/tftp/Dockerfile index dc7296c..fec26b1 100644 --- a/templates/tftp/Dockerfile +++ b/decnet/templates/tftp/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 69/udp -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet 
+USER logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/tftp/entrypoint.sh b/decnet/templates/tftp/entrypoint.sh similarity index 100% rename from templates/tftp/entrypoint.sh rename to decnet/templates/tftp/entrypoint.sh diff --git a/templates/tftp/server.py b/decnet/templates/tftp/server.py similarity index 95% rename from templates/tftp/server.py rename to decnet/templates/tftp/server.py index 602cdc9..1faf0bd 100644 --- a/templates/tftp/server.py +++ b/decnet/templates/tftp/server.py @@ -8,7 +8,7 @@ then responds with an error packet. Logs all requests as JSON. import asyncio import os import struct -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "tftpserver") SERVICE_NAME = "tftp" @@ -28,7 +28,6 @@ def _error_pkt(code: int, msg: str) -> bytes: def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/tftp/syslog_bridge.py b/decnet/templates/tftp/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/tftp/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/templates/vnc/Dockerfile b/decnet/templates/vnc/Dockerfile similarity index 85% rename from templates/vnc/Dockerfile rename to decnet/templates/vnc/Dockerfile index 62a5581..5957dee 100644 --- a/templates/vnc/Dockerfile +++ b/decnet/templates/vnc/Dockerfile @@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ python3 \ && rm -rf /var/lib/apt/lists/* -COPY decnet_logging.py /opt/decnet_logging.py +COPY syslog_bridge.py /opt/syslog_bridge.py COPY server.py /opt/server.py COPY entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh EXPOSE 5900 -RUN useradd -r -s /bin/false -d /opt decnet \ +RUN useradd -r -s /bin/false -d /opt logrelay \ && apt-get update && apt-get install -y --no-install-recommends libcap2-bin \ && rm -rf /var/lib/apt/lists/* \ && (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true) @@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \ HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ CMD kill -0 1 || exit 1 -USER decnet +USER 
logrelay ENTRYPOINT ["/entrypoint.sh"] diff --git a/templates/vnc/entrypoint.sh b/decnet/templates/vnc/entrypoint.sh similarity index 100% rename from templates/vnc/entrypoint.sh rename to decnet/templates/vnc/entrypoint.sh diff --git a/templates/vnc/server.py b/decnet/templates/vnc/server.py similarity index 96% rename from templates/vnc/server.py rename to decnet/templates/vnc/server.py index 7f8637f..3f82f6d 100644 --- a/templates/vnc/server.py +++ b/decnet/templates/vnc/server.py @@ -8,7 +8,7 @@ failed". Logs the raw response for offline cracking. import asyncio import os -from decnet_logging import syslog_line, write_syslog_file, forward_syslog +from syslog_bridge import syslog_line, write_syslog_file, forward_syslog NODE_NAME = os.environ.get("NODE_NAME", "desktop") SERVICE_NAME = "vnc" @@ -20,7 +20,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "") def _log(event_type: str, severity: int = 6, **kwargs) -> None: line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs) - print(line, flush=True) write_syslog_file(line) forward_syslog(line, LOG_TARGET) diff --git a/decnet/templates/vnc/syslog_bridge.py b/decnet/templates/vnc/syslog_bridge.py new file mode 100644 index 0000000..c0a78d0 --- /dev/null +++ b/decnet/templates/vnc/syslog_bridge.py @@ -0,0 +1,89 @@ +#!/usr/bin/env python3 +""" +Shared RFC 5424 syslog helper used by service containers. + +Services call syslog_line() to format an RFC 5424 message, then +write_syslog_file() to emit it to stdout — the container runtime +captures it, and the host-side collector streams it into the log file. + +RFC 5424 structure: + 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG + +Facility: local0 (16). SD element ID uses PEN 55555. 
+""" + +from datetime import datetime, timezone +from typing import Any + +# ─── Constants ──────────────────────────────────────────────────────────────── + +_FACILITY_LOCAL0 = 16 +_SD_ID = "relay@55555" +_NILVALUE = "-" + +SEVERITY_EMERG = 0 +SEVERITY_ALERT = 1 +SEVERITY_CRIT = 2 +SEVERITY_ERROR = 3 +SEVERITY_WARNING = 4 +SEVERITY_NOTICE = 5 +SEVERITY_INFO = 6 +SEVERITY_DEBUG = 7 + +_MAX_HOSTNAME = 255 +_MAX_APPNAME = 48 +_MAX_MSGID = 32 + +# ─── Formatter ──────────────────────────────────────────────────────────────── + +def _sd_escape(value: str) -> str: + """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3.""" + return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]") + + +def _sd_element(fields: dict[str, Any]) -> str: + if not fields: + return _NILVALUE + params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items()) + return f"[{_SD_ID} {params}]" + + +def syslog_line( + service: str, + hostname: str, + event_type: str, + severity: int = SEVERITY_INFO, + timestamp: datetime | None = None, + msg: str | None = None, + **fields: Any, +) -> str: + """ + Return a single RFC 5424-compliant syslog line (no trailing newline). + + Args: + service: APP-NAME (e.g. "http", "mysql") + hostname: HOSTNAME (node name) + event_type: MSGID (e.g. 
"request", "login_attempt") + severity: Syslog severity integer (default: INFO=6) + timestamp: UTC datetime; defaults to now + msg: Optional free-text MSG + **fields: Encoded as structured data params + """ + pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>" + ts = (timestamp or datetime.now(timezone.utc)).isoformat() + host = (hostname or _NILVALUE)[:_MAX_HOSTNAME] + appname = (service or _NILVALUE)[:_MAX_APPNAME] + msgid = (event_type or _NILVALUE)[:_MAX_MSGID] + sd = _sd_element(fields) + message = f" {msg}" if msg else "" + return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}" + + +def write_syslog_file(line: str) -> None: + """Emit a syslog line to stdout for container log capture.""" + print(line, flush=True) + + +def forward_syslog(line: str, log_target: str) -> None: + """No-op stub. TCP forwarding is handled by rsyslog, not by service containers.""" + pass diff --git a/decnet/updater/__init__.py b/decnet/updater/__init__.py new file mode 100644 index 0000000..b586e1f --- /dev/null +++ b/decnet/updater/__init__.py @@ -0,0 +1,10 @@ +"""DECNET self-updater daemon. + +Runs on each worker alongside ``decnet agent``. Receives working-tree +tarballs from the master and owns the agent's lifecycle: snapshot → +install → restart → probe → auto-rollback on failure. + +Deliberately separate process, separate venv, separate mTLS cert so that +a broken ``decnet agent`` push can always be rolled back by the updater +that shipped it. See ``wiki/Remote-Updates.md``. +""" diff --git a/decnet/updater/app.py b/decnet/updater/app.py new file mode 100644 index 0000000..5c5d879 --- /dev/null +++ b/decnet/updater/app.py @@ -0,0 +1,139 @@ +"""Updater FastAPI app — mTLS-protected endpoints for self-update. + +Mirrors the shape of ``decnet/agent/app.py``: bare FastAPI, docs disabled, +handlers delegate to ``decnet.updater.executor``. 
+ +Mounted by uvicorn via ``decnet.updater.server`` with ``--ssl-cert-reqs 2``; +the CN on the peer cert tells us which endpoints are legal (``updater@*`` +only — agent certs are rejected). +""" +from __future__ import annotations + +import os as _os +import pathlib + +from fastapi import FastAPI, File, Form, HTTPException, UploadFile +from pydantic import BaseModel + +from decnet.logging import get_logger +from decnet.swarm import pki +from decnet.updater import executor as _exec + +log = get_logger("updater.app") + +app = FastAPI( + title="DECNET Self-Updater", + version="0.1.0", + docs_url=None, + redoc_url=None, + openapi_url=None, +) + + +class _Config: + install_dir: pathlib.Path = pathlib.Path( + _os.environ.get("DECNET_UPDATER_INSTALL_DIR") or str(_exec.DEFAULT_INSTALL_DIR) + ) + updater_install_dir: pathlib.Path = pathlib.Path( + _os.environ.get("DECNET_UPDATER_UPDATER_DIR") + or str(_exec.DEFAULT_INSTALL_DIR / "updater") + ) + agent_dir: pathlib.Path = pathlib.Path( + _os.environ.get("DECNET_UPDATER_AGENT_DIR") or str(pki.DEFAULT_AGENT_DIR) + ) + + +def configure( + install_dir: pathlib.Path, + updater_install_dir: pathlib.Path, + agent_dir: pathlib.Path, +) -> None: + """Inject paths from the server launcher; must be called before serving.""" + _Config.install_dir = install_dir + _Config.updater_install_dir = updater_install_dir + _Config.agent_dir = agent_dir + + +# ------------------------------------------------------------------- schemas + +class RollbackResult(BaseModel): + status: str + release: dict + probe: str + + +class ReleasesResponse(BaseModel): + releases: list[dict] + + +# -------------------------------------------------------------------- routes + +@app.get("/health") +async def health() -> dict: + return { + "status": "ok", + "role": "updater", + "releases": [r.to_dict() for r in _exec.list_releases(_Config.install_dir)], + } + + +@app.get("/releases") +async def releases() -> dict: + return {"releases": [r.to_dict() for r in 
_exec.list_releases(_Config.install_dir)]} + + +@app.post("/update") +async def update( + tarball: UploadFile = File(..., description="tar.gz of the working tree"), + sha: str = Form("", description="git SHA of the tree for provenance"), +) -> dict: + body = await tarball.read() + try: + return _exec.run_update( + body, sha=sha or None, + install_dir=_Config.install_dir, agent_dir=_Config.agent_dir, + ) + except _exec.UpdateError as exc: + status = 409 if exc.rolled_back else 500 + raise HTTPException( + status_code=status, + detail={"error": str(exc), "stderr": exc.stderr, "rolled_back": exc.rolled_back}, + ) from exc + + +@app.post("/update-self") +async def update_self( + tarball: UploadFile = File(...), + sha: str = Form(""), + confirm_self: str = Form("", description="Must be 'true' to proceed"), +) -> dict: + if confirm_self.lower() != "true": + raise HTTPException( + status_code=400, + detail="self-update requires confirm_self=true (no auto-rollback)", + ) + body = await tarball.read() + try: + return _exec.run_update_self( + body, sha=sha or None, + updater_install_dir=_Config.updater_install_dir, + ) + except _exec.UpdateError as exc: + raise HTTPException( + status_code=500, + detail={"error": str(exc), "stderr": exc.stderr}, + ) from exc + + +@app.post("/rollback") +async def rollback() -> dict: + try: + return _exec.run_rollback( + install_dir=_Config.install_dir, agent_dir=_Config.agent_dir, + ) + except _exec.UpdateError as exc: + status = 404 if "no previous" in str(exc) else 500 + raise HTTPException( + status_code=status, + detail={"error": str(exc), "stderr": exc.stderr}, + ) from exc diff --git a/decnet/updater/executor.py b/decnet/updater/executor.py new file mode 100644 index 0000000..a618f4a --- /dev/null +++ b/decnet/updater/executor.py @@ -0,0 +1,693 @@ +"""Update/rollback orchestrator for the DECNET self-updater. 
+ +Directory layout owned by this module (root = ``install_dir``): + + / + current -> releases/active (symlink; atomic swap == promotion) + releases/ + active/ (working tree; has its own .venv) + prev/ (last good snapshot; restored on failure) + active.new/ (staging; only exists mid-update) + agent.pid (PID of the agent process we spawned) + +Rollback semantics: if the agent doesn't come back healthy after an update, +we swap the symlink back to ``prev``, restart the agent, and return the +captured pip/agent stderr to the caller. + +Seams for tests — every subprocess call goes through a module-level hook +(`_run_pip`, `_spawn_agent`, `_probe_agent`) so tests can monkeypatch them +without actually touching the filesystem's Python toolchain. +""" +from __future__ import annotations + +import dataclasses +import os +import pathlib +import shutil +import signal +import ssl +import subprocess # nosec B404 +import sys +import tarfile +import time +from datetime import datetime, timezone +from typing import Any, Callable, Optional + +import httpx + +from decnet.logging import get_logger +from decnet.swarm import pki + +log = get_logger("updater.executor") + +DEFAULT_INSTALL_DIR = pathlib.Path("/opt/decnet") +AGENT_PROBE_URL = "https://127.0.0.1:8765/health" +AGENT_PROBE_ATTEMPTS = 10 +AGENT_PROBE_BACKOFF_S = 1.0 +AGENT_RESTART_GRACE_S = 10.0 + + +# ------------------------------------------------------------------- errors + +class UpdateError(RuntimeError): + """Raised when an update fails but the install dir is consistent. + + Carries the captured stderr so the master gets actionable output. 
+ """ + + def __init__(self, message: str, *, stderr: str = "", rolled_back: bool = False): + super().__init__(message) + self.stderr = stderr + self.rolled_back = rolled_back + + +# -------------------------------------------------------------------- types + +@dataclasses.dataclass(frozen=True) +class Release: + slot: str + sha: Optional[str] + installed_at: Optional[datetime] + + def to_dict(self) -> dict[str, Any]: + return { + "slot": self.slot, + "sha": self.sha, + "installed_at": self.installed_at.isoformat() if self.installed_at else None, + } + + +# ---------------------------------------------------------------- internals + +def _releases_dir(install_dir: pathlib.Path) -> pathlib.Path: + return install_dir / "releases" + + +def _active_dir(install_dir: pathlib.Path) -> pathlib.Path: + return _releases_dir(install_dir) / "active" + + +def _prev_dir(install_dir: pathlib.Path) -> pathlib.Path: + return _releases_dir(install_dir) / "prev" + + +def _staging_dir(install_dir: pathlib.Path) -> pathlib.Path: + return _releases_dir(install_dir) / "active.new" + + +def _current_symlink(install_dir: pathlib.Path) -> pathlib.Path: + return install_dir / "current" + + +def _pid_file(install_dir: pathlib.Path) -> pathlib.Path: + return install_dir / "agent.pid" + + +def _manifest_file(release: pathlib.Path) -> pathlib.Path: + return release / ".decnet-release.json" + + +def _venv_python(release: pathlib.Path) -> pathlib.Path: + return release / ".venv" / "bin" / "python" + + +def _heal_path_symlink(install_dir: pathlib.Path) -> None: + """Point /usr/local/bin/decnet at the shared venv we manage. + + Pre-fix bootstraps installed into ``/.venv`` (editable) and + symlinked /usr/local/bin/decnet there, so systemd units kept executing + the pre-update code even after ``_run_pip`` wrote to the shared venv. + Fix it opportunistically on every update so already-enrolled hosts + recover on the next push instead of needing a manual re-enroll. 
+ """ + target = _shared_venv(install_dir) / "bin" / "decnet" + link = pathlib.Path("/usr/local/bin/decnet") + if not target.is_file(): + return + try: + if link.is_symlink() and pathlib.Path(os.readlink(link)) == target: + return + tmp = link.with_suffix(".tmp") + if tmp.exists() or tmp.is_symlink(): + tmp.unlink() + tmp.symlink_to(target) + os.replace(tmp, link) + log.info("repointed %s -> %s", link, target) + except OSError as exc: + log.warning("could not repoint %s: %s", link, exc) + + +def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path: + """The one stable venv that agents/updaters run out of. + + Release slots ship source only. We ``pip install --force-reinstall + --no-deps`` into this venv on promotion so shebangs never dangle + across a rotation. + """ + return install_dir / "venv" + + +# ------------------------------------------------------------------- public + +def read_release(release: pathlib.Path) -> Release: + """Read the release manifest sidecar; tolerate absence.""" + slot = release.name + mf = _manifest_file(release) + if not mf.is_file(): + return Release(slot=slot, sha=None, installed_at=None) + import json + + try: + data = json.loads(mf.read_text()) + except (json.JSONDecodeError, OSError): + return Release(slot=slot, sha=None, installed_at=None) + ts = data.get("installed_at") + return Release( + slot=slot, + sha=data.get("sha"), + installed_at=datetime.fromisoformat(ts) if ts else None, + ) + + +def list_releases(install_dir: pathlib.Path) -> list[Release]: + out: list[Release] = [] + for slot_dir in (_active_dir(install_dir), _prev_dir(install_dir)): + if slot_dir.is_dir(): + out.append(read_release(slot_dir)) + return out + + +def clean_stale_staging(install_dir: pathlib.Path) -> None: + """Remove a half-extracted ``active.new`` left by a crashed update.""" + staging = _staging_dir(install_dir) + if staging.exists(): + log.warning("removing stale staging dir %s", staging) + shutil.rmtree(staging, ignore_errors=True) + + +def 
extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None: + """Extract a gzipped tarball into ``dest`` (must not pre-exist). + + Rejects absolute paths and ``..`` traversal in the archive. + """ + import io + + dest.mkdir(parents=True, exist_ok=False) + with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar: + for member in tar.getmembers(): + name = member.name + if name.startswith("/") or ".." in pathlib.PurePosixPath(name).parts: + raise UpdateError(f"unsafe path in tarball: {name!r}") + tar.extractall(dest) # nosec B202 — validated above + + +# ---------------------------------------------------------------- seams + +def _run_pip( + release: pathlib.Path, + install_dir: Optional[pathlib.Path] = None, +) -> subprocess.CompletedProcess: + """pip install ``release`` into the shared venv at ``install_dir/venv``. + + The shared venv is bootstrapped on first use. ``--force-reinstall + --no-deps`` replaces site-packages for the decnet package only; the + rest of the env stays cached across updates. + + Monkeypatched in tests so the test suite never shells out. + """ + idir = install_dir or release.parent.parent # releases/ -> install_dir + venv_dir = _shared_venv(idir) + fresh = not venv_dir.exists() + if fresh: + subprocess.run( # nosec B603 + [sys.executable, "-m", "venv", str(venv_dir)], + check=True, capture_output=True, text=True, + ) + py = venv_dir / "bin" / "python" + # First install into a fresh venv: pull full dep tree. Subsequent updates + # use --no-deps so pip only replaces the decnet package. + args = [str(py), "-m", "pip", "install", "--force-reinstall", str(release)] + if not fresh: + args.insert(-1, "--no-deps") + return subprocess.run( # nosec B603 + args, check=False, capture_output=True, text=True, + ) + + +AGENT_SYSTEMD_UNIT = "decnet-agent.service" +FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service" +UPDATER_SYSTEMD_UNIT = "decnet-updater.service" +# Per-host microservices that run out of the same /opt/decnet tree. 
An +# update replaces their code, so we must cycle them alongside the agent or +# they keep serving the pre-update image. Best-effort: legacy enrollments +# without these units installed shouldn't abort the update. +AUXILIARY_SYSTEMD_UNITS = ( + "decnet-collector.service", "decnet-prober.service", + "decnet-sniffer.service", +) + + +def _systemd_available() -> bool: + """True when we're running under systemd and have systemctl on PATH. + + Detection is conservative: we only return True if *both* the invocation + marker is set (``INVOCATION_ID`` is exported by systemd for every unit) + and ``systemctl`` is resolvable. The env var alone can be forged; the + binary alone can exist on hosts running other init systems. + """ + if not os.environ.get("INVOCATION_ID"): + return False + from shutil import which + return which("systemctl") is not None + + +def _spawn_agent(install_dir: pathlib.Path) -> int: + """Launch the agent and return its PID. + + Under systemd, restart ``decnet-agent.service`` via ``systemctl`` so the + new process inherits the unit's ambient capabilities (CAP_NET_ADMIN, + CAP_NET_RAW). Spawning with ``subprocess.Popen`` from inside the updater + unit would make the agent a child of the updater and therefore a member + of the updater's (empty) capability set — it would come up without the + caps needed to run MACVLAN/scapy. + + Off systemd (dev boxes, manual starts), fall back to a direct Popen. + """ + if _systemd_available(): + return _spawn_agent_via_systemd(install_dir) + return _spawn_agent_via_popen(install_dir) + + +SYSTEMD_UNIT_DIR = pathlib.Path("/etc/systemd/system") + + +def _sync_systemd_units( + install_dir: pathlib.Path, + dst_root: pathlib.Path = SYSTEMD_UNIT_DIR, +) -> bool: + """Copy any `etc/systemd/system/*.service` files from the active release + into ``dst_root`` (default ``/etc/systemd/system/``) and run + `daemon-reload` if anything changed. + + Returns True if daemon-reload was invoked. 
The bootstrap installer writes + these files on first enrollment; the updater mirrors that on every code + push so unit edits (ExecStart flips, new units, cap changes) ship too. + Best-effort: a read-only /etc or a missing ``active/etc`` subtree is just + logged and skipped. + """ + src_root = _active_dir(install_dir) / "etc" / "systemd" / "system" + if not src_root.is_dir(): + return False + changed = False + for src in sorted(src_root.glob("*.service")): + dst = dst_root / src.name + try: + new = src.read_bytes() + old = dst.read_bytes() if dst.is_file() else None + if old == new: + continue + tmp = dst.with_suffix(".service.tmp") + tmp.write_bytes(new) + os.chmod(tmp, 0o644) + os.replace(tmp, dst) + log.info("installed/updated systemd unit %s", dst) + changed = True + except OSError as exc: + log.warning("could not install unit %s: %s", dst, exc) + if changed and _systemd_available(): + try: + subprocess.run( # nosec B603 B607 + ["systemctl", "daemon-reload"], + check=True, capture_output=True, text=True, + ) + log.info("systemctl daemon-reload succeeded") + except subprocess.CalledProcessError as exc: + log.warning("systemctl daemon-reload failed: %s", exc.stderr.strip()) + return changed + + +def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int: + # Restart agent + forwarder together: both processes run out of the same + # /opt/decnet tree, so a code push that replaces the tree must cycle both + # or the forwarder keeps the pre-update code in memory. Forwarder restart + # is best-effort — a worker without the forwarder unit installed (e.g. a + # legacy enrollment) shouldn't abort the update. 
+ subprocess.run( # nosec B603 B607 + ["systemctl", "restart", AGENT_SYSTEMD_UNIT], + check=True, capture_output=True, text=True, + ) + fwd = subprocess.run( # nosec B603 B607 + ["systemctl", "restart", FORWARDER_SYSTEMD_UNIT], + check=False, capture_output=True, text=True, + ) + if fwd.returncode != 0: + log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip()) + for unit in AUXILIARY_SYSTEMD_UNITS: + aux = subprocess.run( # nosec B603 B607 + ["systemctl", "restart", unit], + check=False, capture_output=True, text=True, + ) + if aux.returncode != 0: + log.warning("%s restart failed (ignored): %s", unit, aux.stderr.strip()) + pid_out = subprocess.run( # nosec B603 B607 + ["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT], + check=True, capture_output=True, text=True, + ) + pid = int(pid_out.stdout.strip() or "0") + if pid: + _pid_file(install_dir).write_text(str(pid)) + return pid + + +def _spawn_agent_via_popen(install_dir: pathlib.Path) -> int: + decnet_bin = _shared_venv(install_dir) / "bin" / "decnet" + log_path = install_dir / "agent.spawn.log" + # cwd=install_dir so a persistent ``/.env.local`` gets + # picked up by decnet.env (which loads from CWD). The release slot + # itself is immutable across updates, so the env file cannot live + # inside it. + proc = subprocess.Popen( # nosec B603 + [str(decnet_bin), "agent", "--daemon"], + start_new_session=True, + cwd=str(install_dir), + stdout=open(log_path, "ab"), # noqa: SIM115 + stderr=subprocess.STDOUT, + ) + _pid_file(install_dir).write_text(str(proc.pid)) + return proc.pid + + +def _discover_agent_pids() -> list[int]: + """Scan /proc for any running ``decnet agent`` process. + + Used as a fallback when agent.pid is missing (e.g., the agent was started + by hand rather than by the updater) so an update still produces a clean + restart instead of leaving the old in-memory code serving requests. 
+ """ + pids: list[int] = [] + self_pid = os.getpid() + for entry in pathlib.Path("/proc").iterdir(): + if not entry.name.isdigit(): + continue + pid = int(entry.name) + if pid == self_pid: + continue + try: + raw = (entry / "cmdline").read_bytes() + except (FileNotFoundError, PermissionError, OSError): + continue + argv = [a for a in raw.split(b"\x00") if a] + if len(argv) < 2: + continue + if not argv[0].endswith(b"python") and b"python" not in pathlib.Path(argv[0].decode(errors="ignore")).name.encode(): + # Allow direct console-script invocation too: argv[0] ends with /decnet + if not argv[0].endswith(b"/decnet"): + continue + if b"decnet" in b" ".join(argv) and b"agent" in argv: + pids.append(pid) + return pids + + +def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None: + """SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s. + + Prefers the PID recorded in ``agent.pid`` (processes we spawned) but + falls back to scanning /proc for any ``decnet agent`` so manually-started + agents are also restarted cleanly during an update. + + Under systemd, stop is a no-op — ``_spawn_agent`` issues a single + ``systemctl restart`` that handles stop and start atomically. Pre-stopping + would only race the restart's own stop phase. 
+ """ + if _systemd_available(): + return + pids: list[int] = [] + pid_file = _pid_file(install_dir) + if pid_file.is_file(): + try: + pids.append(int(pid_file.read_text().strip())) + except (ValueError, OSError): + pass + for pid in _discover_agent_pids(): + if pid not in pids: + pids.append(pid) + if not pids: + return + for pid in pids: + try: + os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + continue + deadline = time.monotonic() + grace + remaining = list(pids) + while remaining and time.monotonic() < deadline: + remaining = [p for p in remaining if _pid_alive(p)] + if remaining: + time.sleep(0.2) + for pid in remaining: + try: + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + pass + + +def _pid_alive(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except ProcessLookupError: + return False + + +def _probe_agent( + agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR, + url: str = AGENT_PROBE_URL, + attempts: int = AGENT_PROBE_ATTEMPTS, + backoff_s: float = AGENT_PROBE_BACKOFF_S, +) -> tuple[bool, str]: + """Local mTLS health probe against the agent. 
Returns (ok, detail).""" + worker_key = agent_dir / "worker.key" + worker_crt = agent_dir / "worker.crt" + ca = agent_dir / "ca.crt" + if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()): + return False, f"no mTLS bundle at {agent_dir}" + ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) + ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key)) + ctx.load_verify_locations(cafile=str(ca)) + ctx.verify_mode = ssl.CERT_REQUIRED + ctx.check_hostname = False + + last = "" + for i in range(attempts): + try: + with httpx.Client(verify=ctx, timeout=3.0) as client: + r = client.get(url) + if r.status_code == 200: + return True, r.text + last = f"status={r.status_code} body={r.text[:200]}" + except Exception as exc: # noqa: BLE001 + last = f"{type(exc).__name__}: {exc}" + if i < attempts - 1: + time.sleep(backoff_s) + return False, last + + +# -------------------------------------------------------------- orchestrator + +def _write_manifest(release: pathlib.Path, sha: Optional[str]) -> None: + import json + + _manifest_file(release).write_text(json.dumps({ + "sha": sha, + "installed_at": datetime.now(timezone.utc).isoformat(), + })) + + +def _rotate(install_dir: pathlib.Path) -> None: + """Rotate directories: prev→(deleted), active→prev, active.new→active. + + Caller must ensure ``active.new`` exists. ``active`` may or may not. 
+ """ + active = _active_dir(install_dir) + prev = _prev_dir(install_dir) + staging = _staging_dir(install_dir) + + if prev.exists(): + shutil.rmtree(prev) + if active.exists(): + active.rename(prev) + staging.rename(active) + + +def _point_current_at(install_dir: pathlib.Path, target: pathlib.Path) -> None: + """Atomic symlink flip via rename.""" + link = _current_symlink(install_dir) + tmp = install_dir / ".current.tmp" + if tmp.exists() or tmp.is_symlink(): + tmp.unlink() + tmp.symlink_to(target) + os.replace(tmp, link) + + +def run_update( + tarball_bytes: bytes, + sha: Optional[str], + install_dir: pathlib.Path = DEFAULT_INSTALL_DIR, + agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR, +) -> dict[str, Any]: + """Apply an update atomically. Rolls back on probe failure.""" + log.info("update received sha=%s bytes=%d install_dir=%s", sha, len(tarball_bytes), install_dir) + clean_stale_staging(install_dir) + staging = _staging_dir(install_dir) + + log.info("extracting tarball -> %s", staging) + extract_tarball(tarball_bytes, staging) + _write_manifest(staging, sha) + + log.info("pip install into shared venv (%s)", _shared_venv(install_dir)) + pip = _run_pip(staging) + if pip.returncode != 0: + log.error("pip install failed rc=%d stderr=%s", pip.returncode, (pip.stderr or pip.stdout).strip()[:400]) + shutil.rmtree(staging, ignore_errors=True) + raise UpdateError( + "pip install failed on new release", stderr=pip.stderr or pip.stdout, + ) + + log.info("rotating releases: active.new -> active, active -> prev") + _rotate(install_dir) + _point_current_at(install_dir, _active_dir(install_dir)) + _heal_path_symlink(install_dir) + _sync_systemd_units(install_dir) + + log.info("restarting agent (and forwarder if present)") + _stop_agent(install_dir) + _spawn_agent(install_dir) + + ok, detail = _probe_agent(agent_dir=agent_dir) + if ok: + log.info("update complete sha=%s probe=ok", sha) + return { + "status": "updated", + "release": 
read_release(_active_dir(install_dir)).to_dict(), + "probe": detail, + } + + # Rollback. + log.warning("agent probe failed after update: %s — rolling back", detail) + _stop_agent(install_dir) + # Swap active <-> prev. + active = _active_dir(install_dir) + prev = _prev_dir(install_dir) + tmp = _releases_dir(install_dir) / ".swap" + if tmp.exists(): + shutil.rmtree(tmp) + active.rename(tmp) + prev.rename(active) + tmp.rename(prev) + _point_current_at(install_dir, active) + _spawn_agent(install_dir) + ok2, detail2 = _probe_agent(agent_dir=agent_dir) + raise UpdateError( + "agent failed health probe after update; rolled back to previous release", + stderr=f"forward-probe: {detail}\nrollback-probe: {detail2}", + rolled_back=ok2, + ) + + +def run_rollback( + install_dir: pathlib.Path = DEFAULT_INSTALL_DIR, + agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR, +) -> dict[str, Any]: + """Manually swap active with prev and restart the agent.""" + active = _active_dir(install_dir) + prev = _prev_dir(install_dir) + if not prev.is_dir(): + raise UpdateError("no previous release to roll back to") + + _stop_agent(install_dir) + tmp = _releases_dir(install_dir) / ".swap" + if tmp.exists(): + shutil.rmtree(tmp) + active.rename(tmp) + prev.rename(active) + tmp.rename(prev) + _point_current_at(install_dir, active) + _spawn_agent(install_dir) + ok, detail = _probe_agent(agent_dir=agent_dir) + if not ok: + raise UpdateError("agent unhealthy after rollback", stderr=detail) + return { + "status": "rolled_back", + "release": read_release(active).to_dict(), + "probe": detail, + } + + +def run_update_self( + tarball_bytes: bytes, + sha: Optional[str], + updater_install_dir: pathlib.Path, + exec_cb: Optional[Callable[[list[str]], None]] = None, +) -> dict[str, Any]: + """Replace the updater's own source tree, then re-exec this process. + + No auto-rollback. Caller must treat "connection dropped + /health + returns new SHA within 30s" as success. 
+ """ + log.info("self-update received sha=%s bytes=%d install_dir=%s", sha, len(tarball_bytes), updater_install_dir) + clean_stale_staging(updater_install_dir) + staging = _staging_dir(updater_install_dir) + log.info("extracting tarball -> %s", staging) + extract_tarball(tarball_bytes, staging) + _write_manifest(staging, sha) + + log.info("pip install updater release into shared venv (%s)", _shared_venv(updater_install_dir)) + pip = _run_pip(staging) + if pip.returncode != 0: + log.error("self-update pip install failed rc=%d stderr=%s", pip.returncode, (pip.stderr or pip.stdout).strip()[:400]) + shutil.rmtree(staging, ignore_errors=True) + raise UpdateError( + "pip install failed on new updater release", + stderr=pip.stderr or pip.stdout, + ) + + log.info("rotating updater releases and flipping current symlink") + _rotate(updater_install_dir) + _point_current_at(updater_install_dir, _active_dir(updater_install_dir)) + _heal_path_symlink(updater_install_dir) + _sync_systemd_units(updater_install_dir) + + # Reconstruct the updater's original launch command from env vars set by + # `decnet.updater.server.run`. We can't reuse sys.argv: inside the app + # process this is the uvicorn subprocess invocation (--ssl-keyfile, etc.), + # not the operator-visible `decnet updater ...` command. 
+ decnet_bin = str(_shared_venv(updater_install_dir) / "bin" / "decnet") + argv = [decnet_bin, "updater", + "--host", os.environ.get("DECNET_UPDATER_HOST", "0.0.0.0"), # nosec B104 + "--port", os.environ.get("DECNET_UPDATER_PORT", "8766"), + "--updater-dir", os.environ.get("DECNET_UPDATER_BUNDLE_DIR", + str(pki.DEFAULT_AGENT_DIR.parent / "updater")), + "--install-dir", os.environ.get("DECNET_UPDATER_INSTALL_DIR", + str(updater_install_dir.parent)), + "--agent-dir", os.environ.get("DECNET_UPDATER_AGENT_DIR", + str(pki.DEFAULT_AGENT_DIR))] + if exec_cb is not None: + exec_cb(argv) # tests stub this — we don't actually re-exec + return {"status": "self_update_queued", "argv": argv} + # Under systemd, hand the restart to the init system so the new process + # keeps its unit context (capabilities, cgroup, logging target) instead + # of inheriting whatever we had here. Spawn a detached sh that waits for + # this response to flush before issuing the restart — `systemctl restart` + # on our own unit would kill us mid-response and the caller would see a + # connection drop with no indication of success. + if _systemd_available(): + log.info("self-update queued: systemctl restart %s (deferred 1s)", UPDATER_SYSTEMD_UNIT) + subprocess.Popen( # nosec B603 B607 + ["sh", "-c", f"sleep 1 && systemctl restart {UPDATER_SYSTEMD_UNIT}"], + start_new_session=True, + stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, + ) + return {"status": "self_update_queued", "via": "systemd"} + # Off-systemd fallback: replace the process image directly. + os.execv(argv[0], argv) # nosec B606 - pragma: no cover + return {"status": "self_update_queued"} # pragma: no cover diff --git a/decnet/updater/server.py b/decnet/updater/server.py new file mode 100644 index 0000000..ed4b93d --- /dev/null +++ b/decnet/updater/server.py @@ -0,0 +1,90 @@ +"""Self-updater uvicorn launcher. 
+ +Parallels ``decnet/agent/server.py`` but uses a distinct bundle directory +(``~/.decnet/updater``) with a cert whose CN is ``updater@``. That +cert is signed by the same DECNET CA as the agent's, so the master's one +CA still gates both channels; the CN is how we tell them apart. +""" +from __future__ import annotations + +import os +import pathlib +import signal +import subprocess # nosec B404 +import sys + +from decnet.logging import get_logger +from decnet.swarm import pki + +log = get_logger("updater.server") + +DEFAULT_UPDATER_DIR = pathlib.Path(os.path.expanduser("~/.decnet/updater")) + + +def _load_bundle(updater_dir: pathlib.Path) -> bool: + return all( + (updater_dir / name).is_file() + for name in ("ca.crt", "updater.crt", "updater.key") + ) + + +def run( + host: str, + port: int, + updater_dir: pathlib.Path = DEFAULT_UPDATER_DIR, + install_dir: pathlib.Path = pathlib.Path("/opt/decnet"), + agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR, +) -> int: + if not _load_bundle(updater_dir): + print( + f"[updater] No cert bundle at {updater_dir}. " + f"Run `decnet swarm enroll --updater` from the master first.", + file=sys.stderr, + ) + return 2 + + # Pass config into the app module via env so uvicorn subprocess picks it up. + os.environ["DECNET_UPDATER_INSTALL_DIR"] = str(install_dir) + os.environ["DECNET_UPDATER_UPDATER_DIR"] = str(install_dir / "updater") + os.environ["DECNET_UPDATER_AGENT_DIR"] = str(agent_dir) + # Needed by run_update_self to rebuild the updater's launch argv. 
+ os.environ["DECNET_UPDATER_BUNDLE_DIR"] = str(updater_dir) + os.environ["DECNET_UPDATER_HOST"] = str(host) + os.environ["DECNET_UPDATER_PORT"] = str(port) + + keyfile = updater_dir / "updater.key" + certfile = updater_dir / "updater.crt" + cafile = updater_dir / "ca.crt" + + cmd = [ + sys.executable, + "-m", + "uvicorn", + "decnet.updater.app:app", + "--host", + host, + "--port", + str(port), + "--ssl-keyfile", + str(keyfile), + "--ssl-certfile", + str(certfile), + "--ssl-ca-certs", + str(cafile), + "--ssl-cert-reqs", + "2", + ] + log.info("updater starting host=%s port=%d bundle=%s", host, port, updater_dir) + proc = subprocess.Popen(cmd, start_new_session=True) # nosec B603 + try: + return proc.wait() + except KeyboardInterrupt: + try: + os.killpg(proc.pid, signal.SIGTERM) + try: + return proc.wait(timeout=10) + except subprocess.TimeoutExpired: + os.killpg(proc.pid, signal.SIGKILL) + return proc.wait() + except ProcessLookupError: + return 0 diff --git a/decnet/web/_uvicorn_tls_scope.py b/decnet/web/_uvicorn_tls_scope.py new file mode 100644 index 0000000..f68ad56 --- /dev/null +++ b/decnet/web/_uvicorn_tls_scope.py @@ -0,0 +1,72 @@ +"""Inject the TLS peer cert into ASGI scope — uvicorn ≤ 0.44 does not. + +Uvicorn's h11/httptools HTTP protocols build the ASGI ``scope`` dict +without any ``extensions.tls`` entry, so per-request cert pinning +handlers (like POST /swarm/heartbeat) can't see the client cert that +CERT_REQUIRED already validated at handshake. + +We patch ``RequestResponseCycle.__init__`` on both protocol modules to +read the peer cert off the asyncio transport (which *does* carry it) +and write the DER bytes into +``scope["extensions"]["tls"]["client_cert_chain"]``. This is the same +key the ASGI TLS extension proposal uses, so the application code will +keep working unchanged if a future uvicorn populates it natively. + +Import this module once at app startup time (before uvicorn starts +accepting connections). 
Idempotent — subsequent imports are no-ops. +""" +from __future__ import annotations + +from typing import Any + + +_PATCHED = False + + +def _wrap_cycle_init(cycle_cls) -> None: + original = cycle_cls.__init__ + + def _patched_init(self, *args: Any, **kwargs: Any) -> None: + original(self, *args, **kwargs) + transport = kwargs.get("transport") or getattr(self, "transport", None) + if transport is None: + return + ssl_obj = transport.get_extra_info("ssl_object") + if ssl_obj is None: + return + try: + der = ssl_obj.getpeercert(binary_form=True) + except Exception: + return + if not der: + return + # scope is a mutable dict uvicorn stores here; Starlette forwards + # it to handlers as request.scope. Use setdefault so we don't clobber + # any future native extension entries from uvicorn itself. + scope = self.scope + extensions = scope.setdefault("extensions", {}) + extensions.setdefault("tls", {"client_cert_chain": [der]}) + + cycle_cls.__init__ = _patched_init + + +def install() -> None: + """Patch uvicorn's HTTP cycle classes. Safe to call multiple times.""" + global _PATCHED + if _PATCHED: + return + try: + from uvicorn.protocols.http import h11_impl + _wrap_cycle_init(h11_impl.RequestResponseCycle) + except Exception: # nosec B110 - optional uvicorn impl may be unavailable + pass + try: + from uvicorn.protocols.http import httptools_impl + _wrap_cycle_init(httptools_impl.RequestResponseCycle) + except Exception: # nosec B110 - optional uvicorn impl may be unavailable + pass + _PATCHED = True + + +# Auto-install on import so simply importing this module patches uvicorn. 
+install() diff --git a/decnet/web/api.py b/decnet/web/api.py index d5e3ca3..f33b9de 100644 --- a/decnet/web/api.py +++ b/decnet/web/api.py @@ -1,33 +1,66 @@ import asyncio -import logging import os from contextlib import asynccontextmanager from typing import Any, AsyncGenerator, Optional from fastapi import FastAPI, Request, status from fastapi.exceptions import RequestValidationError -from fastapi.responses import JSONResponse +from fastapi.responses import ORJSONResponse from pydantic import ValidationError from fastapi.middleware.cors import CORSMiddleware -from decnet.env import DECNET_CORS_ORIGINS, DECNET_DEVELOPER, DECNET_INGEST_LOG_FILE +from decnet.env import ( + DECNET_CORS_ORIGINS, + DECNET_DEVELOPER, + DECNET_EMBED_PROFILER, + DECNET_EMBED_SNIFFER, + DECNET_INGEST_LOG_FILE, + DECNET_PROFILE_DIR, + DECNET_PROFILE_REQUESTS, +) +from decnet.logging import get_logger from decnet.web.dependencies import repo from decnet.collector import log_collector_worker from decnet.web.ingester import log_ingestion_worker +from decnet.profiler import attacker_profile_worker from decnet.web.router import api_router -log = logging.getLogger(__name__) +log = get_logger("api") ingestion_task: Optional[asyncio.Task[Any]] = None collector_task: Optional[asyncio.Task[Any]] = None +attacker_task: Optional[asyncio.Task[Any]] = None +sniffer_task: Optional[asyncio.Task[Any]] = None + + +def get_background_tasks() -> dict[str, Optional[asyncio.Task[Any]]]: + """Expose background task handles for the health endpoint.""" + return { + "ingestion_worker": ingestion_task, + "collector_worker": collector_task, + "attacker_worker": attacker_task, + "sniffer_worker": sniffer_task, + } @asynccontextmanager async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: - global ingestion_task, collector_task + global ingestion_task, collector_task, attacker_task, sniffer_task + import resource + soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE) + if soft < 4096: + log.warning( + "Low 
open-file limit detected (ulimit -n = %d). " + "High-traffic deployments may hit 'Too many open files' errors. " + "Raise it with: ulimit -n 65536 (session) or LimitNOFILE=65536 (systemd)", + soft, + ) + + log.info("API startup initialising database") for attempt in range(1, 6): try: await repo.initialize() + log.debug("API startup DB initialised attempt=%d", attempt) break except Exception as exc: log.warning("DB init attempt %d/5 failed: %s", attempt, exc) @@ -35,25 +68,57 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: log.error("DB failed to initialize after 5 attempts — startup may be degraded") await asyncio.sleep(0.5) + # Conditionally enable OpenTelemetry tracing + from decnet.telemetry import setup_tracing + setup_tracing(app) + # Start background tasks only if not in contract test mode if os.environ.get("DECNET_CONTRACT_TEST") != "true": # Start background ingestion task if ingestion_task is None or ingestion_task.done(): ingestion_task = asyncio.create_task(log_ingestion_worker(repo)) + log.debug("API startup ingest worker started") # Start Docker log collector (writes to log file; ingester reads from it) _log_file = os.environ.get("DECNET_INGEST_LOG_FILE", DECNET_INGEST_LOG_FILE) if _log_file and (collector_task is None or collector_task.done()): collector_task = asyncio.create_task(log_collector_worker(_log_file)) + log.debug("API startup collector worker started log_file=%s", _log_file) elif not _log_file: log.warning("DECNET_INGEST_LOG_FILE not set — Docker log collection disabled.") + + # Start attacker profile rebuild worker only when explicitly requested. + # Default is OFF because `decnet deploy` always starts a standalone + # `decnet profiler --daemon` process. Running both against the same + # DB cursor causes events to be skipped or double-processed. 
+ if DECNET_EMBED_PROFILER: + if attacker_task is None or attacker_task.done(): + attacker_task = asyncio.create_task(attacker_profile_worker(repo)) + log.info("API startup: embedded profiler started (DECNET_EMBED_PROFILER=true)") + else: + log.debug("API startup: profiler not embedded — expecting standalone daemon") + + # Start fleet-wide MACVLAN sniffer only when explicitly requested. + # Default is OFF because `decnet deploy` always starts a standalone + # `decnet sniffer --daemon` process. Running both against the same + # interface produces duplicated events and wastes CPU. + if DECNET_EMBED_SNIFFER: + try: + from decnet.sniffer import sniffer_worker + if sniffer_task is None or sniffer_task.done(): + sniffer_task = asyncio.create_task(sniffer_worker(_log_file)) + log.info("API startup: embedded sniffer started (DECNET_EMBED_SNIFFER=true)") + except Exception as exc: + log.warning("Sniffer worker failed to start — API continues without sniffing: %s", exc) + else: + log.debug("API startup: sniffer not embedded — expecting standalone daemon") else: log.info("Contract Test Mode: skipping background worker startup") yield - # Shutdown background tasks - for task in (ingestion_task, collector_task): + log.info("API shutdown cancelling background tasks") + for task in (ingestion_task, collector_task, attacker_task, sniffer_task): if task and not task.done(): task.cancel() try: @@ -62,12 +127,16 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: pass except Exception as exc: log.warning("Task shutdown error: %s", exc) + from decnet.telemetry import shutdown_tracing + shutdown_tracing() + log.info("API shutdown complete") app: FastAPI = FastAPI( title="DECNET Web Dashboard API", version="1.0.0", lifespan=lifespan, + default_response_class=ORJSONResponse, docs_url="/docs" if DECNET_DEVELOPER else None, redoc_url="/redoc" if DECNET_DEVELOPER else None, openapi_url="/openapi.json" if DECNET_DEVELOPER else None @@ -81,12 +150,37 @@ app.add_middleware( 
allow_headers=["Authorization", "Content-Type", "Last-Event-ID"], ) +if DECNET_PROFILE_REQUESTS: + import time + from pathlib import Path + from pyinstrument import Profiler + from starlette.middleware.base import BaseHTTPMiddleware + + _profile_dir = Path(DECNET_PROFILE_DIR) + _profile_dir.mkdir(parents=True, exist_ok=True) + + class PyinstrumentMiddleware(BaseHTTPMiddleware): + async def dispatch(self, request: Request, call_next): + profiler = Profiler(async_mode="enabled") + profiler.start() + try: + response = await call_next(request) + finally: + profiler.stop() + slug = request.url.path.strip("/").replace("/", "_") or "root" + out = _profile_dir / f"{int(time.time() * 1000)}-{request.method}-{slug}.html" + out.write_text(profiler.output_html()) + return response + + app.add_middleware(PyinstrumentMiddleware) + log.info("Pyinstrument middleware mounted — flamegraphs -> %s", _profile_dir) + # Include the modular API router app.include_router(api_router, prefix="/api/v1") @app.exception_handler(RequestValidationError) -async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse: +async def validation_exception_handler(request: Request, exc: RequestValidationError) -> ORJSONResponse: """ Handle validation errors with targeted status codes to satisfy contract tests. 
Tiered Prioritization: @@ -106,7 +200,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE for err in errors ) if is_structural_violation: - return JSONResponse( + return ORJSONResponse( status_code=status.HTTP_400_BAD_REQUEST, content={"detail": "Bad Request: Schema structural violation (wrong type, extra fields, or invalid length)."}, ) @@ -117,7 +211,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE # Empty INI content (Valid string but semantically empty) is_ini_empty = any("INI content is empty" in err.get("msg", "") for err in errors) if is_ini_empty: - return JSONResponse( + return ORJSONResponse( status_code=status.HTTP_409_CONFLICT, content={"detail": "Configuration conflict: INI content is empty."}, ) @@ -126,7 +220,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE # Mapping to 409 for Positive Data compliance. is_invalid_characters = any("Invalid INI format" in err.get("msg", "") for err in errors) if is_invalid_characters: - return JSONResponse( + return ORJSONResponse( status_code=status.HTTP_409_CONFLICT, content={"detail": "Configuration conflict: INI syntax or characters are invalid."}, ) @@ -134,7 +228,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE # Logical invalidity (Valid string, valid syntax, but missing required DECNET logic like sections) is_ini_invalid_logic = any("at least one section" in err.get("msg", "") for err in errors) if is_ini_invalid_logic: - return JSONResponse( + return ORJSONResponse( status_code=status.HTTP_409_CONFLICT, content={"detail": "Invalid INI config structure: No decky sections found."}, ) @@ -149,19 +243,19 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE if "/deckies/deploy" in request.url.path: message = "Invalid INI config" - return JSONResponse( + return ORJSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 
content={"detail": message}, ) @app.exception_handler(ValidationError) -async def pydantic_validation_exception_handler(request: Request, exc: ValidationError) -> JSONResponse: +async def pydantic_validation_exception_handler(request: Request, exc: ValidationError) -> ORJSONResponse: """ Handle Pydantic errors that occur during manual model instantiation (e.g. state hydration). Prevents 500 errors when the database contains inconsistent or outdated schema data. """ log.error("Internal Pydantic validation error: %s", exc) - return JSONResponse( + return ORJSONResponse( status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, content={ "detail": "Internal data consistency error", diff --git a/decnet/web/auth.py b/decnet/web/auth.py index 6ece1e3..81879c5 100644 --- a/decnet/web/auth.py +++ b/decnet/web/auth.py @@ -1,3 +1,4 @@ +import asyncio from datetime import datetime, timedelta, timezone from typing import Optional, Any import jwt @@ -24,6 +25,15 @@ def get_password_hash(password: str) -> str: return _hashed.decode("utf-8") +async def averify_password(plain_password: str, hashed_password: str) -> bool: + # bcrypt is CPU-bound and ~250ms/call; keep it off the event loop. + return await asyncio.to_thread(verify_password, plain_password, hashed_password) + + +async def ahash_password(password: str) -> str: + return await asyncio.to_thread(get_password_hash, password) + + def create_access_token(data: dict[str, Any], expires_delta: Optional[timedelta] = None) -> str: _to_encode: dict[str, Any] = data.copy() _expire: datetime diff --git a/decnet/web/db/factory.py b/decnet/web/db/factory.py index b98884e..af5ff5c 100644 --- a/decnet/web/db/factory.py +++ b/decnet/web/db/factory.py @@ -1,18 +1,33 @@ +""" +Repository factory — selects a :class:`BaseRepository` implementation based on +``DECNET_DB_TYPE`` (``sqlite`` or ``mysql``). 
+""" +from __future__ import annotations + +import os from typing import Any -from decnet.env import os + from decnet.web.db.repository import BaseRepository + def get_repository(**kwargs: Any) -> BaseRepository: - """Factory function to instantiate the correct repository implementation based on environment.""" + """Instantiate the repository implementation selected by ``DECNET_DB_TYPE``. + + Keyword arguments are forwarded to the concrete implementation: + + * SQLite accepts ``db_path``. + * MySQL accepts ``url`` and engine tuning knobs (``pool_size``, …). + """ db_type = os.environ.get("DECNET_DB_TYPE", "sqlite").lower() if db_type == "sqlite": from decnet.web.db.sqlite.repository import SQLiteRepository - return SQLiteRepository(**kwargs) + repo = SQLiteRepository(**kwargs) elif db_type == "mysql": - # Placeholder for future implementation - # from decnet.web.db.mysql.repository import MySQLRepository - # return MySQLRepository() - raise NotImplementedError("MySQL support is planned but not yet implemented.") + from decnet.web.db.mysql.repository import MySQLRepository + repo = MySQLRepository(**kwargs) else: raise ValueError(f"Unsupported database type: {db_type}") + + from decnet.telemetry import wrap_repository + return wrap_repository(repo) diff --git a/decnet/web/db/models.py b/decnet/web/db/models.py index 681db23..5d75bc7 100644 --- a/decnet/web/db/models.py +++ b/decnet/web/db/models.py @@ -1,8 +1,16 @@ from datetime import datetime, timezone -from typing import Optional, Any, List, Annotated +from typing import Literal, Optional, Any, List, Annotated +from sqlalchemy import Column, Text +from sqlalchemy.dialects.mysql import MEDIUMTEXT from sqlmodel import SQLModel, Field from pydantic import BaseModel, ConfigDict, Field as PydanticField, BeforeValidator -from decnet.models import IniContent +from decnet.models import IniContent, DecnetConfig + +# Use on columns that accumulate over an attacker's lifetime (commands, +# fingerprints, state blobs). 
TEXT on MySQL caps at 64 KiB; MEDIUMTEXT +# stretches to 16 MiB. SQLite has no fixed-width text types so Text() +# stays unchanged there. +_BIG_TEXT = Text().with_variant(MEDIUMTEXT(), "mysql") def _normalize_null(v: Any) -> Any: if isinstance(v, str) and v.lower() in ("null", "undefined", ""): @@ -30,9 +38,16 @@ class Log(SQLModel, table=True): service: str = Field(index=True) event_type: str = Field(index=True) attacker_ip: str = Field(index=True) - raw_line: str - fields: str - msg: Optional[str] = None + # Long-text columns — use TEXT so MySQL DDL doesn't truncate to VARCHAR(255). + # TEXT is equivalent to plain text in SQLite. + raw_line: str = Field(sa_column=Column("raw_line", Text, nullable=False)) + fields: str = Field(sa_column=Column("fields", Text, nullable=False)) + msg: Optional[str] = Field(default=None, sa_column=Column("msg", Text, nullable=True)) + # OTEL trace context — bridges the collector→ingester trace to the SSE + # read path. Nullable so pre-existing rows and non-traced deployments + # are unaffected. + trace_id: Optional[str] = Field(default=None) + span_id: Optional[str] = Field(default=None) class Bounty(SQLModel, table=True): __tablename__ = "bounty" @@ -42,13 +57,140 @@ class Bounty(SQLModel, table=True): service: str = Field(index=True) attacker_ip: str = Field(index=True) bounty_type: str = Field(index=True) - payload: str + payload: str = Field(sa_column=Column("payload", Text, nullable=False)) class State(SQLModel, table=True): __tablename__ = "state" key: str = Field(primary_key=True) - value: str # Stores JSON serialized DecnetConfig or other state blobs + # JSON-serialized DecnetConfig or other state blobs — can be large as + # deckies/services accumulate. MEDIUMTEXT on MySQL (16 MiB ceiling). 
+ value: str = Field(sa_column=Column("value", _BIG_TEXT, nullable=False)) + + +class Attacker(SQLModel, table=True): + __tablename__ = "attackers" + uuid: str = Field(primary_key=True) + ip: str = Field(index=True) + first_seen: datetime = Field(index=True) + last_seen: datetime = Field(index=True) + event_count: int = Field(default=0) + service_count: int = Field(default=0) + decky_count: int = Field(default=0) + # JSON blobs — these grow over the attacker's lifetime. Use MEDIUMTEXT on + # MySQL (16 MiB) for the fields that accumulate (fingerprints, commands, + # and the deckies/services lists that are unbounded in principle). + services: str = Field( + default="[]", sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]") + ) # JSON list[str] + deckies: str = Field( + default="[]", sa_column=Column("deckies", _BIG_TEXT, nullable=False, default="[]") + ) # JSON list[str], first-contact ordered + traversal_path: Optional[str] = Field( + default=None, sa_column=Column("traversal_path", Text, nullable=True) + ) # "decky-01 → decky-03 → decky-05" + is_traversal: bool = Field(default=False) + bounty_count: int = Field(default=0) + credential_count: int = Field(default=0) + fingerprints: str = Field( + default="[]", sa_column=Column("fingerprints", _BIG_TEXT, nullable=False, default="[]") + ) # JSON list[dict] — bounty fingerprints + commands: str = Field( + default="[]", sa_column=Column("commands", _BIG_TEXT, nullable=False, default="[]") + ) # JSON list[dict] — commands per service/decky + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), index=True + ) + + +class SwarmHost(SQLModel, table=True): + """A worker host enrolled into a DECNET swarm. + + Rows exist only on the master. Populated by `decnet swarm enroll` and + read by the swarm controller when sharding deckies onto workers. 
+ """ + __tablename__ = "swarm_hosts" + uuid: str = Field(primary_key=True) + name: str = Field(index=True, unique=True) + address: str # IP or hostname reachable by the master + agent_port: int = Field(default=8765) + status: str = Field(default="enrolled", index=True) + # ISO-8601 string of the last successful agent /health probe + last_heartbeat: Optional[datetime] = Field(default=None) + client_cert_fingerprint: str # SHA-256 hex of worker's issued client cert + # SHA-256 hex of the updater-identity cert, if the host was enrolled + # with ``--updater`` / ``issue_updater_bundle``. ``None`` for hosts + # that only have an agent identity. + updater_cert_fingerprint: Optional[str] = Field(default=None) + # Directory on the master where the per-worker cert bundle lives + cert_bundle_path: str + enrolled_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + notes: Optional[str] = Field(default=None, sa_column=Column("notes", Text, nullable=True)) + # Per-host driver preference. True => deckies on this host run over IPvlan + # (L2) instead of macvlan — required when the host is a VirtualBox guest + # bridged over Wi-Fi, because Wi-Fi APs only allow one MAC per station + # and macvlan's per-container MACs rotate the VM's DHCP lease. + use_ipvlan: bool = Field(default=False) + + +class DeckyShard(SQLModel, table=True): + """Mapping of a single decky to the worker host running it (swarm mode).""" + __tablename__ = "decky_shards" + decky_name: str = Field(primary_key=True) + host_uuid: str = Field(foreign_key="swarm_hosts.uuid", index=True) + # JSON list of service names running on this decky (snapshot of assignment). + services: str = Field(sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]")) + # Full serialised DeckyConfig from the most recent dispatch or heartbeat. 
+ # Lets the dashboard render the same rich card (hostname/distro/archetype/ + # service_config/mutate_interval) that the local-fleet view uses, without + # needing a live round-trip to the worker for every page render. + decky_config: Optional[str] = Field( + default=None, sa_column=Column("decky_config", _BIG_TEXT, nullable=True) + ) + decky_ip: Optional[str] = Field(default=None) + state: str = Field(default="pending", index=True) # pending|running|failed|torn_down|degraded|tearing_down|teardown_failed + last_error: Optional[str] = Field(default=None, sa_column=Column("last_error", Text, nullable=True)) + compose_hash: Optional[str] = Field(default=None) + # Timestamp of the last heartbeat that echoed this shard; lets the UI + # show "stale" decks whose agent has gone silent. + last_seen: Optional[datetime] = Field(default=None) + updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc)) + + +class AttackerBehavior(SQLModel, table=True): + """ + Timing & behavioral profile for an attacker, joined to Attacker by uuid. + + Kept in a separate table so the core Attacker row stays narrow and + behavior data can be updated independently (e.g. as the sniffer observes + more packets) without touching the event-count aggregates. 
+ """ + __tablename__ = "attacker_behavior" + attacker_uuid: str = Field(primary_key=True, foreign_key="attackers.uuid") + # OS / TCP stack fingerprint (rolled up from sniffer events) + os_guess: Optional[str] = None + hop_distance: Optional[int] = None + tcp_fingerprint: str = Field( + default="{}", + sa_column=Column("tcp_fingerprint", Text, nullable=False, default="{}"), + ) # JSON: window, wscale, mss, options_sig + retransmit_count: int = Field(default=0) + # Behavioral (derived by the profiler from log-event timing) + behavior_class: Optional[str] = None # beaconing | interactive | scanning | brute_force | slow_scan | mixed | unknown + beacon_interval_s: Optional[float] = None + beacon_jitter_pct: Optional[float] = None + tool_guesses: Optional[str] = None # JSON list[str] — all matched tools + timing_stats: str = Field( + default="{}", + sa_column=Column("timing_stats", Text, nullable=False, default="{}"), + ) # JSON: mean/median/stdev/min/max IAT + phase_sequence: str = Field( + default="{}", + sa_column=Column("phase_sequence", Text, nullable=False, default="{}"), + ) # JSON: recon_end/exfil_start/latency + updated_at: datetime = Field( + default_factory=lambda: datetime.now(timezone.utc), index=True + ) # --- API Request/Response Models (Pydantic) --- @@ -77,6 +219,12 @@ class BountyResponse(BaseModel): offset: int data: List[dict[str, Any]] +class AttackersResponse(BaseModel): + total: int + limit: int + offset: int + data: List[dict[str, Any]] + class StatsResponse(BaseModel): total_logs: int unique_attackers: int @@ -93,3 +241,251 @@ class DeployIniRequest(BaseModel): # This field now enforces strict INI structure during Pydantic initialization. # The OpenAPI schema correctly shows it as a required string. 
ini_content: IniContent = PydanticField(..., description="A valid INI formatted string") + + +# --- Configuration Models --- + +class CreateUserRequest(BaseModel): + username: str = PydanticField(..., min_length=1, max_length=64) + password: str = PydanticField(..., min_length=8, max_length=72) + role: Literal["admin", "viewer"] = "viewer" + +class UpdateUserRoleRequest(BaseModel): + role: Literal["admin", "viewer"] + +class ResetUserPasswordRequest(BaseModel): + new_password: str = PydanticField(..., min_length=8, max_length=72) + +class DeploymentLimitRequest(BaseModel): + deployment_limit: int = PydanticField(..., ge=1, le=500) + +class GlobalMutationIntervalRequest(BaseModel): + global_mutation_interval: str = PydanticField(..., pattern=r"^[1-9]\d*[mdMyY]$") + +class UserResponse(BaseModel): + uuid: str + username: str + role: str + must_change_password: bool + +class ConfigResponse(BaseModel): + role: str + deployment_limit: int + global_mutation_interval: str + +class AdminConfigResponse(ConfigResponse): + users: List[UserResponse] + + +class ComponentHealth(BaseModel): + status: Literal["ok", "failing"] + detail: Optional[str] = None + + +class HealthResponse(BaseModel): + status: Literal["healthy", "degraded", "unhealthy"] + components: dict[str, ComponentHealth] + + +# --- Swarm API DTOs --- +# Request/response contracts for the master-side swarm controller +# (decnet/web/swarm_api.py). The underlying SQLModel tables — SwarmHost and +# DeckyShard — live above; these are the HTTP-facing shapes. + +class SwarmEnrollRequest(BaseModel): + # x509 CommonName is capped at 64 bytes (RFC 5280 UB-common-name) — the + # cert issuer would reject anything longer with a ValueError. + # Pattern: ASCII hostname-safe characters only. The name is embedded + # both in the CN and as a SAN DNS entry; x509.DNSName only accepts + # A-label ASCII, so non-ASCII would blow up at issuance. 
+ name: str = PydanticField( + ..., min_length=1, max_length=64, + pattern=r"^[A-Za-z0-9][A-Za-z0-9._\-]*$", + ) + address: str = PydanticField( + ..., min_length=1, max_length=253, + pattern=r"^[A-Za-z0-9][A-Za-z0-9._:\-]*$", + description="IP or DNS the master uses to reach the worker", + ) + agent_port: int = PydanticField(default=8765, ge=1, le=65535) + sans: list[ + Annotated[ + str, + PydanticField( + min_length=1, max_length=253, + pattern=r"^[A-Za-z0-9][A-Za-z0-9._:\-]*$", + ), + ] + ] = PydanticField( + default_factory=list, + description="Extra SANs (IPs / hostnames) to embed in the worker cert", + ) + notes: Optional[str] = None + issue_updater_bundle: bool = PydanticField( + default=False, + description="If true, also issue an updater cert (CN=updater@) for the remote self-updater", + ) + + +class SwarmUpdaterBundle(BaseModel): + """Subset of SwarmEnrolledBundle for the updater identity.""" + fingerprint: str + updater_cert_pem: str + updater_key_pem: str + + +class SwarmEnrolledBundle(BaseModel): + """Cert bundle returned to the operator — must be delivered to the worker.""" + host_uuid: str + name: str + address: str + agent_port: int + fingerprint: str + ca_cert_pem: str + worker_cert_pem: str + worker_key_pem: str + updater: Optional[SwarmUpdaterBundle] = None + + +class SwarmHostView(BaseModel): + uuid: str + name: str + address: str + agent_port: int + status: str + last_heartbeat: Optional[datetime] = None + client_cert_fingerprint: str + updater_cert_fingerprint: Optional[str] = None + enrolled_at: datetime + notes: Optional[str] = None + use_ipvlan: bool = False + + +class DeckyShardView(BaseModel): + """One decky → host mapping, enriched with the host's identity for display.""" + decky_name: str + decky_ip: Optional[str] = None # resolved from the stored DecnetConfig at read time + host_uuid: str + host_name: str + host_address: str + host_status: str + services: list[str] + state: str + last_error: Optional[str] = None + compose_hash: 
Optional[str] = None + updated_at: datetime + # Enriched fields lifted from the stored DeckyConfig snapshot so the + # dashboard can render the same card shape as the local-fleet view. + hostname: Optional[str] = None + distro: Optional[str] = None + archetype: Optional[str] = None + service_config: dict[str, dict[str, Any]] = {} + mutate_interval: Optional[int] = None + last_mutated: float = 0.0 + last_seen: Optional[datetime] = None + + +class SwarmDeployRequest(BaseModel): + config: DecnetConfig + dry_run: bool = False + no_cache: bool = False + + +class SwarmTeardownRequest(BaseModel): + host_uuid: Optional[str] = PydanticField( + default=None, + description="If set, tear down only this worker; otherwise tear down all hosts", + ) + decky_id: Optional[str] = None + + +class SwarmHostResult(BaseModel): + host_uuid: str + host_name: str + ok: bool + detail: Any | None = None + + +class SwarmDeployResponse(BaseModel): + results: list[SwarmHostResult] + + +class SwarmHostHealth(BaseModel): + host_uuid: str + name: str + address: str + reachable: bool + detail: Any | None = None + + +class SwarmCheckResponse(BaseModel): + results: list[SwarmHostHealth] + + +# --- Remote Updates (master → worker /updater) DTOs --- +# Powers the dashboard's Remote Updates page. The master dashboard calls +# these (auth-gated) endpoints; internally they fan out to each worker's +# updater daemon over mTLS via UpdaterClient. + +class HostReleaseInfo(BaseModel): + host_uuid: str + host_name: str + address: str + reachable: bool + # These fields mirror the updater's /health payload when reachable; they + # are all Optional so an unreachable host still serializes cleanly. 
+ agent_status: Optional[str] = None + current_sha: Optional[str] = None + previous_sha: Optional[str] = None + releases: list[dict[str, Any]] = PydanticField(default_factory=list) + detail: Optional[str] = None # populated when unreachable + + +class HostReleasesResponse(BaseModel): + hosts: list[HostReleaseInfo] + + +class PushUpdateRequest(BaseModel): + host_uuids: Optional[list[str]] = PydanticField( + default=None, + description="Target specific hosts; mutually exclusive with 'all'.", + ) + all: bool = PydanticField(default=False, description="Target every non-decommissioned host with an updater bundle.") + include_self: bool = PydanticField( + default=False, + description="After a successful /update, also push /update-self to upgrade the updater itself.", + ) + exclude: list[str] = PydanticField( + default_factory=list, + description="Additional tarball exclude globs (on top of the built-in defaults).", + ) + + +class PushUpdateResult(BaseModel): + host_uuid: str + host_name: str + # updated = /update 200. rolled-back = /update 409 (auto-recovered). + # failed = transport error or non-200/409 response. self-updated = /update-self succeeded. 
+ status: Literal["updated", "rolled-back", "failed", "self-updated", "self-failed"] + http_status: Optional[int] = None + sha: Optional[str] = None + detail: Optional[str] = None + stderr: Optional[str] = None + + +class PushUpdateResponse(BaseModel): + sha: str + tarball_bytes: int + results: list[PushUpdateResult] + + +class RollbackRequest(BaseModel): + host_uuid: str = PydanticField(..., description="Host to roll back to its previous release slot.") + + +class RollbackResponse(BaseModel): + host_uuid: str + host_name: str + status: Literal["rolled-back", "failed"] + http_status: Optional[int] = None + detail: Optional[str] = None diff --git a/decnet/web/db/mysql/__init__.py b/decnet/web/db/mysql/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/decnet/web/db/mysql/database.py b/decnet/web/db/mysql/database.py new file mode 100644 index 0000000..2e7b329 --- /dev/null +++ b/decnet/web/db/mysql/database.py @@ -0,0 +1,98 @@ +""" +MySQL async engine factory. + +Builds a SQLAlchemy AsyncEngine against MySQL using the ``asyncmy`` driver. + +Connection info is resolved (in order of precedence): + +1. An explicit ``url`` argument passed to :func:`get_async_engine` +2. ``DECNET_DB_URL`` — full SQLAlchemy URL +3. 
Component env vars: + ``DECNET_DB_HOST`` (default ``localhost``) + ``DECNET_DB_PORT`` (default ``3306``) + ``DECNET_DB_NAME`` (default ``decnet``) + ``DECNET_DB_USER`` (default ``decnet``) + ``DECNET_DB_PASSWORD`` (default empty — raises unless pytest is running) +""" +from __future__ import annotations + +import os +from typing import Optional +from urllib.parse import quote_plus + +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine + + +DEFAULT_POOL_SIZE = int(os.environ.get("DECNET_DB_POOL_SIZE", "20")) +DEFAULT_MAX_OVERFLOW = int(os.environ.get("DECNET_DB_MAX_OVERFLOW", "40")) +DEFAULT_POOL_RECYCLE = int(os.environ.get("DECNET_DB_POOL_RECYCLE", "3600")) +DEFAULT_POOL_PRE_PING = os.environ.get("DECNET_DB_POOL_PRE_PING", "true").lower() == "true" + + +def build_mysql_url( + host: Optional[str] = None, + port: Optional[int] = None, + database: Optional[str] = None, + user: Optional[str] = None, + password: Optional[str] = None, +) -> str: + """Compose an async SQLAlchemy URL for MySQL using the asyncmy driver. + + Component args override env vars. Password is percent-encoded so special + characters (``@``, ``:``, ``/``…) don't break URL parsing. + """ + host = host or os.environ.get("DECNET_DB_HOST", "localhost") + port = port or int(os.environ.get("DECNET_DB_PORT", "3306")) + database = database or os.environ.get("DECNET_DB_NAME", "decnet") + user = user or os.environ.get("DECNET_DB_USER", "decnet") + + if password is None: + password = os.environ.get("DECNET_DB_PASSWORD", "") + + # Allow empty passwords during tests (pytest sets PYTEST_* env vars). + # Outside tests, an empty MySQL password is almost never intentional. + if not password and not any(k.startswith("PYTEST") for k in os.environ): + raise ValueError( + "DECNET_DB_PASSWORD is not set. Either export it, set DECNET_DB_URL, " + "or run under pytest for an empty-password default." 
+ ) + + pw_enc = quote_plus(password) + user_enc = quote_plus(user) + return f"mysql+asyncmy://{user_enc}:{pw_enc}@{host}:{port}/{database}" + + +def resolve_url(url: Optional[str] = None) -> str: + """Pick a connection URL: explicit arg → DECNET_DB_URL env → built from components.""" + if url: + return url + env_url = os.environ.get("DECNET_DB_URL") + if env_url: + return env_url + return build_mysql_url() + + +def get_async_engine( + url: Optional[str] = None, + *, + pool_size: int = DEFAULT_POOL_SIZE, + max_overflow: int = DEFAULT_MAX_OVERFLOW, + pool_recycle: int = DEFAULT_POOL_RECYCLE, + pool_pre_ping: bool = DEFAULT_POOL_PRE_PING, + echo: bool = False, +) -> AsyncEngine: + """Create an AsyncEngine for MySQL. + + Defaults tuned for a dashboard workload: a modest pool, hourly recycle + to sidestep MySQL's idle-connection reaper, and pre-ping to fail fast + if a pooled connection has been killed server-side. + """ + dsn = resolve_url(url) + return create_async_engine( + dsn, + echo=echo, + pool_size=pool_size, + max_overflow=max_overflow, + pool_recycle=pool_recycle, + pool_pre_ping=pool_pre_ping, + ) diff --git a/decnet/web/db/mysql/repository.py b/decnet/web/db/mysql/repository.py new file mode 100644 index 0000000..f83b4bf --- /dev/null +++ b/decnet/web/db/mysql/repository.py @@ -0,0 +1,141 @@ +""" +MySQL implementation of :class:`BaseRepository`. + +Inherits the portable SQLModel query code from :class:`SQLModelRepository` +and only overrides the two places where MySQL's SQL dialect differs from +SQLite's: + +* :meth:`_migrate_attackers_table` — uses ``information_schema`` (MySQL + has no ``PRAGMA``). +* :meth:`get_log_histogram` — uses ``FROM_UNIXTIME`` / + ``UNIX_TIMESTAMP`` + integer division for bucketing. 
+""" +from __future__ import annotations + +from typing import List, Optional + +from sqlalchemy import func, select, text, literal_column +from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker +from sqlmodel.sql.expression import SelectOfScalar + +from decnet.web.db.models import Log +from decnet.web.db.mysql.database import get_async_engine +from decnet.web.db.sqlmodel_repo import SQLModelRepository + + +class MySQLRepository(SQLModelRepository): + """MySQL backend — uses ``asyncmy``.""" + + def __init__(self, url: Optional[str] = None, **engine_kwargs) -> None: + self.engine = get_async_engine(url=url, **engine_kwargs) + self.session_factory = async_sessionmaker( + self.engine, class_=AsyncSession, expire_on_commit=False + ) + + async def _migrate_attackers_table(self) -> None: + """Drop the legacy (pre-UUID) ``attackers`` table if it exists without a ``uuid`` column. + + MySQL exposes column metadata via ``information_schema.COLUMNS``. + ``DATABASE()`` scopes the lookup to the currently connected schema. + """ + async with self.engine.begin() as conn: + rows = (await conn.execute(text( + "SELECT COLUMN_NAME FROM information_schema.COLUMNS " + "WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'attackers'" + ))).fetchall() + if rows and not any(r[0] == "uuid" for r in rows): + await conn.execute(text("DROP TABLE attackers")) + + async def _migrate_column_types(self) -> None: + """Upgrade TEXT → MEDIUMTEXT for columns that accumulate large JSON blobs. + + ``create_all()`` never alters existing columns, so tables created before + ``_BIG_TEXT`` was introduced keep their 64 KiB ``TEXT`` cap. This method + inspects ``information_schema`` and issues ``ALTER TABLE … MODIFY COLUMN`` + for each offending column found. 
+ """ + targets: dict[str, dict[str, str]] = { + "attackers": { + "commands": "MEDIUMTEXT NOT NULL DEFAULT '[]'", + "fingerprints": "MEDIUMTEXT NOT NULL DEFAULT '[]'", + "services": "MEDIUMTEXT NOT NULL DEFAULT '[]'", + "deckies": "MEDIUMTEXT NOT NULL DEFAULT '[]'", + }, + "state": { + "value": "MEDIUMTEXT NOT NULL", + }, + } + async with self.engine.begin() as conn: + rows = (await conn.execute(text( + "SELECT TABLE_NAME, COLUMN_NAME FROM information_schema.COLUMNS " + "WHERE TABLE_SCHEMA = DATABASE() " + " AND TABLE_NAME IN ('attackers', 'state') " + " AND COLUMN_NAME IN ('commands','fingerprints','services','deckies','value') " + " AND DATA_TYPE = 'text'" + ))).fetchall() + for table_name, col_name in rows: + spec = targets.get(table_name, {}).get(col_name) + if spec: + await conn.execute(text( + f"ALTER TABLE `{table_name}` MODIFY COLUMN `{col_name}` {spec}" + )) + + async def initialize(self) -> None: + """Create tables and run all MySQL-specific migrations. + + Uses a MySQL advisory lock to serialize DDL across concurrent + uvicorn workers — prevents the 'Table was skipped since its + definition is being modified by concurrent DDL' race. + """ + from sqlmodel import SQLModel + async with self.engine.connect() as lock_conn: + await lock_conn.execute(text("SELECT GET_LOCK('decnet_schema_init', 30)")) + try: + await self._migrate_attackers_table() + await self._migrate_column_types() + async with self.engine.begin() as conn: + await conn.run_sync(SQLModel.metadata.create_all) + await self._ensure_admin_user() + finally: + await lock_conn.execute(text("SELECT RELEASE_LOCK('decnet_schema_init')")) + await lock_conn.close() + + def _json_field_equals(self, key: str): + # MySQL 5.7+ exposes JSON_EXTRACT; quoted string result returned for + # TEXT-stored JSON, same behavior we rely on in SQLite. 
+ return text(f"JSON_UNQUOTE(JSON_EXTRACT(fields, '$.{key}')) = :val") + + async def get_log_histogram( + self, + search: Optional[str] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + interval_minutes: int = 15, + ) -> List[dict]: + bucket_seconds = max(interval_minutes, 1) * 60 + # Truncate each timestamp to the start of its bucket: + # FROM_UNIXTIME( (UNIX_TIMESTAMP(timestamp) DIV N) * N ) + # DIV is MySQL's integer division operator. + bucket_expr = literal_column( + f"FROM_UNIXTIME((UNIX_TIMESTAMP(timestamp) DIV {bucket_seconds}) * {bucket_seconds})" + ).label("bucket_time") + + statement: SelectOfScalar = select(bucket_expr, func.count().label("count")).select_from(Log) + statement = self._apply_filters(statement, search, start_time, end_time) + statement = statement.group_by(literal_column("bucket_time")).order_by( + literal_column("bucket_time") + ) + + async with self._session() as session: + results = await session.execute(statement) + # Normalize to ISO string for API parity with the SQLite backend + # (SQLite's datetime() returns a string already; FROM_UNIXTIME + # returns a datetime). + out: List[dict] = [] + for r in results.all(): + ts = r[0] + out.append({ + "time": ts.isoformat(sep=" ") if hasattr(ts, "isoformat") else ts, + "count": r[1], + }) + return out diff --git a/decnet/web/db/repository.py b/decnet/web/db/repository.py index 08a6259..d0513d4 100644 --- a/decnet/web/db/repository.py +++ b/decnet/web/db/repository.py @@ -15,6 +15,15 @@ class BaseRepository(ABC): """Add a new log entry to the database.""" pass + async def add_logs(self, log_entries: list[dict[str, Any]]) -> None: + """Bulk-insert log entries in a single transaction. + + Default implementation falls back to per-row add_log; concrete + repositories should override for a real single-commit insert. 
+ """ + for _entry in log_entries: + await self.add_log(_entry) + @abstractmethod async def get_logs( self, @@ -60,6 +69,26 @@ class BaseRepository(ABC): """Update a user's password and change the must_change_password flag.""" pass + @abstractmethod + async def list_users(self) -> list[dict[str, Any]]: + """Retrieve all users (caller must strip password_hash before returning to clients).""" + pass + + @abstractmethod + async def delete_user(self, uuid: str) -> bool: + """Delete a user by UUID. Returns True if user was found and deleted.""" + pass + + @abstractmethod + async def update_user_role(self, uuid: str, role: str) -> None: + """Update a user's role.""" + pass + + @abstractmethod + async def purge_logs_and_bounties(self) -> dict[str, int]: + """Delete all logs, bounties, and attacker profiles. Returns counts of deleted rows.""" + pass + @abstractmethod async def add_bounty(self, bounty_data: dict[str, Any]) -> None: """Add a new harvested artifact (bounty) to the database.""" @@ -90,3 +119,118 @@ class BaseRepository(ABC): async def set_state(self, key: str, value: Any) -> None: """Store a specific state entry by key.""" pass + + @abstractmethod + async def get_max_log_id(self) -> int: + """Return the highest log ID, or 0 if the table is empty.""" + pass + + @abstractmethod + async def get_logs_after_id(self, last_id: int, limit: int = 500) -> list[dict[str, Any]]: + """Return logs with id > last_id, ordered by id ASC, up to limit.""" + pass + + @abstractmethod + async def get_all_bounties_by_ip(self) -> dict[str, list[dict[str, Any]]]: + """Retrieve all bounty rows grouped by attacker_ip.""" + pass + + @abstractmethod + async def get_bounties_for_ips(self, ips: set[str]) -> dict[str, list[dict[str, Any]]]: + """Retrieve bounty rows grouped by attacker_ip, filtered to only the given IPs.""" + pass + + @abstractmethod + async def upsert_attacker(self, data: dict[str, Any]) -> str: + """Insert or replace an attacker profile record. 
Returns the row's UUID.""" + pass + + @abstractmethod + async def upsert_attacker_behavior(self, attacker_uuid: str, data: dict[str, Any]) -> None: + """Insert or replace the behavioral/fingerprint row for an attacker.""" + pass + + @abstractmethod + async def get_attacker_behavior(self, attacker_uuid: str) -> Optional[dict[str, Any]]: + """Retrieve the behavioral/fingerprint row for an attacker UUID.""" + pass + + @abstractmethod + async def get_behaviors_for_ips(self, ips: set[str]) -> dict[str, dict[str, Any]]: + """Bulk-fetch behavior rows keyed by attacker IP (JOIN to attackers).""" + pass + + @abstractmethod + async def get_attacker_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]: + """Retrieve a single attacker profile by UUID.""" + pass + + @abstractmethod + async def get_attackers( + self, + limit: int = 50, + offset: int = 0, + search: Optional[str] = None, + sort_by: str = "recent", + service: Optional[str] = None, + ) -> list[dict[str, Any]]: + """Retrieve paginated attacker profile records.""" + pass + + @abstractmethod + async def get_total_attackers(self, search: Optional[str] = None, service: Optional[str] = None) -> int: + """Retrieve the total count of attacker profile records, optionally filtered.""" + pass + + @abstractmethod + async def get_attacker_commands( + self, + uuid: str, + limit: int = 50, + offset: int = 0, + service: Optional[str] = None, + ) -> dict[str, Any]: + """Retrieve paginated commands for an attacker, optionally filtered by service.""" + pass + + @abstractmethod + async def get_attacker_artifacts(self, uuid: str) -> list[dict[str, Any]]: + """Return `file_captured` log rows for this attacker, newest first.""" + pass + + # ------------------------------------------------------------- swarm + # Swarm methods have default no-op / empty implementations so existing + # subclasses and non-swarm deployments continue to work without change. 
+ + async def add_swarm_host(self, data: dict[str, Any]) -> None: + raise NotImplementedError + + async def get_swarm_host_by_name(self, name: str) -> Optional[dict[str, Any]]: + raise NotImplementedError + + async def get_swarm_host_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]: + raise NotImplementedError + + async def get_swarm_host_by_fingerprint(self, fingerprint: str) -> Optional[dict[str, Any]]: + raise NotImplementedError + + async def list_swarm_hosts(self, status: Optional[str] = None) -> list[dict[str, Any]]: + raise NotImplementedError + + async def update_swarm_host(self, uuid: str, fields: dict[str, Any]) -> None: + raise NotImplementedError + + async def delete_swarm_host(self, uuid: str) -> bool: + raise NotImplementedError + + async def upsert_decky_shard(self, data: dict[str, Any]) -> None: + raise NotImplementedError + + async def list_decky_shards(self, host_uuid: Optional[str] = None) -> list[dict[str, Any]]: + raise NotImplementedError + + async def delete_decky_shards_for_host(self, host_uuid: str) -> int: + raise NotImplementedError + + async def delete_decky_shard(self, decky_name: str) -> bool: + raise NotImplementedError diff --git a/decnet/web/db/sqlite/database.py b/decnet/web/db/sqlite/database.py index 22ca549..e446958 100644 --- a/decnet/web/db/sqlite/database.py +++ b/decnet/web/db/sqlite/database.py @@ -1,5 +1,7 @@ +import os + from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker, create_async_engine -from sqlalchemy import create_engine, Engine +from sqlalchemy import create_engine, Engine, event from sqlmodel import SQLModel from typing import AsyncGenerator @@ -11,7 +13,34 @@ def get_async_engine(db_path: str) -> AsyncEngine: prefix = "sqlite+aiosqlite:///" if db_path.startswith(":memory:"): prefix = "sqlite+aiosqlite://" - return create_async_engine(f"{prefix}{db_path}", echo=False, connect_args={"uri": True}) + + pool_size = int(os.environ.get("DECNET_DB_POOL_SIZE", "20")) + max_overflow = 
int(os.environ.get("DECNET_DB_MAX_OVERFLOW", "40")) + + pool_recycle = int(os.environ.get("DECNET_DB_POOL_RECYCLE", "3600")) + # SQLite is a local file — dead-connection probes are pure overhead. + # Env var stays for network-mounted setups that still want it. + pool_pre_ping = os.environ.get("DECNET_DB_POOL_PRE_PING", "false").lower() == "true" + + engine = create_async_engine( + f"{prefix}{db_path}", + echo=False, + pool_size=pool_size, + max_overflow=max_overflow, + pool_recycle=pool_recycle, + pool_pre_ping=pool_pre_ping, + connect_args={"uri": True, "timeout": 30}, + ) + + @event.listens_for(engine.sync_engine, "connect") + def _set_sqlite_pragmas(dbapi_conn, _conn_record): + cursor = dbapi_conn.cursor() + cursor.execute("PRAGMA journal_mode=WAL") + cursor.execute("PRAGMA synchronous=NORMAL") + cursor.execute("PRAGMA busy_timeout=30000") + cursor.close() + + return engine def get_sync_engine(db_path: str) -> Engine: prefix = "sqlite:///" diff --git a/decnet/web/db/sqlite/repository.py b/decnet/web/db/sqlite/repository.py index 9f28a33..5965d0b 100644 --- a/decnet/web/db/sqlite/repository.py +++ b/decnet/web/db/sqlite/repository.py @@ -1,23 +1,22 @@ -import asyncio -import json -import uuid -from datetime import datetime -from typing import Any, Optional, List +from typing import List, Optional -from sqlalchemy import func, select, desc, asc, text, or_, update, literal_column +from sqlalchemy import func, select, text, literal_column from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker from sqlmodel.sql.expression import SelectOfScalar -from decnet.config import load_state, _ROOT -from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD -from decnet.web.auth import get_password_hash -from decnet.web.db.repository import BaseRepository -from decnet.web.db.models import User, Log, Bounty, State +from decnet.config import _ROOT +from decnet.web.db.models import Log from decnet.web.db.sqlite.database import get_async_engine +from 
decnet.web.db.sqlmodel_repo import SQLModelRepository -class SQLiteRepository(BaseRepository): - """SQLite implementation using SQLModel and SQLAlchemy Async.""" +class SQLiteRepository(SQLModelRepository): + """SQLite backend — uses ``aiosqlite``. + + Overrides the two places where SQLite's SQL dialect differs from + MySQL/PostgreSQL: legacy-schema migration (via ``PRAGMA table_info``) + and the log-histogram bucket expression (via ``strftime`` + ``unixepoch``). + """ def __init__(self, db_path: str = str(_ROOT / "decnet.db")) -> None: self.db_path = db_path @@ -26,173 +25,16 @@ class SQLiteRepository(BaseRepository): self.engine, class_=AsyncSession, expire_on_commit=False ) - async def initialize(self) -> None: - """Async warm-up / verification. Creates tables if they don't exist.""" - from sqlmodel import SQLModel + async def _migrate_attackers_table(self) -> None: + """Drop the old attackers table if it lacks the uuid column (pre-UUID schema).""" async with self.engine.begin() as conn: - await conn.run_sync(SQLModel.metadata.create_all) + rows = (await conn.execute(text("PRAGMA table_info(attackers)"))).fetchall() + if rows and not any(r[1] == "uuid" for r in rows): + await conn.execute(text("DROP TABLE attackers")) - async with self.session_factory() as session: - # Check if admin exists - result = await session.execute( - select(User).where(User.username == DECNET_ADMIN_USER) - ) - if not result.scalar_one_or_none(): - session.add(User( - uuid=str(uuid.uuid4()), - username=DECNET_ADMIN_USER, - password_hash=get_password_hash(DECNET_ADMIN_PASSWORD), - role="admin", - must_change_password=True, - )) - await session.commit() - - async def reinitialize(self) -> None: - """Initialize the database schema asynchronously (useful for tests).""" - from sqlmodel import SQLModel - async with self.engine.begin() as conn: - await conn.run_sync(SQLModel.metadata.create_all) - - async with self.session_factory() as session: - result = await session.execute( - 
select(User).where(User.username == DECNET_ADMIN_USER) - ) - if not result.scalar_one_or_none(): - session.add(User( - uuid=str(uuid.uuid4()), - username=DECNET_ADMIN_USER, - password_hash=get_password_hash(DECNET_ADMIN_PASSWORD), - role="admin", - must_change_password=True, - )) - await session.commit() - - # ------------------------------------------------------------------ logs - - async def add_log(self, log_data: dict[str, Any]) -> None: - data = log_data.copy() - if "fields" in data and isinstance(data["fields"], dict): - data["fields"] = json.dumps(data["fields"]) - if "timestamp" in data and isinstance(data["timestamp"], str): - try: - data["timestamp"] = datetime.fromisoformat( - data["timestamp"].replace("Z", "+00:00") - ) - except ValueError: - pass - - async with self.session_factory() as session: - session.add(Log(**data)) - await session.commit() - - def _apply_filters( - self, - statement: SelectOfScalar, - search: Optional[str], - start_time: Optional[str], - end_time: Optional[str], - ) -> SelectOfScalar: - import re - import shlex - - if start_time: - statement = statement.where(Log.timestamp >= start_time) - if end_time: - statement = statement.where(Log.timestamp <= end_time) - - if search: - try: - tokens = shlex.split(search) - except ValueError: - tokens = search.split() - - core_fields = { - "decky": Log.decky, - "service": Log.service, - "event": Log.event_type, - "attacker": Log.attacker_ip, - "attacker-ip": Log.attacker_ip, - "attacker_ip": Log.attacker_ip, - } - - for token in tokens: - if ":" in token: - key, val = token.split(":", 1) - if key in core_fields: - statement = statement.where(core_fields[key] == val) - else: - key_safe = re.sub(r"[^a-zA-Z0-9_]", "", key) - if key_safe: - statement = statement.where( - text(f"json_extract(fields, '$.{key_safe}') = :val") - ).params(val=val) - else: - lk = f"%{token}%" - statement = statement.where( - or_( - Log.raw_line.like(lk), - Log.decky.like(lk), - Log.service.like(lk), - 
Log.attacker_ip.like(lk), - ) - ) - return statement - - async def get_logs( - self, - limit: int = 50, - offset: int = 0, - search: Optional[str] = None, - start_time: Optional[str] = None, - end_time: Optional[str] = None, - ) -> List[dict]: - statement = ( - select(Log) - .order_by(desc(Log.timestamp)) - .offset(offset) - .limit(limit) - ) - statement = self._apply_filters(statement, search, start_time, end_time) - - async with self.session_factory() as session: - results = await session.execute(statement) - return [log.model_dump(mode='json') for log in results.scalars().all()] - - async def get_max_log_id(self) -> int: - async with self.session_factory() as session: - result = await session.execute(select(func.max(Log.id))) - val = result.scalar() - return val if val is not None else 0 - - async def get_logs_after_id( - self, - last_id: int, - limit: int = 50, - search: Optional[str] = None, - start_time: Optional[str] = None, - end_time: Optional[str] = None, - ) -> List[dict]: - statement = ( - select(Log).where(Log.id > last_id).order_by(asc(Log.id)).limit(limit) - ) - statement = self._apply_filters(statement, search, start_time, end_time) - - async with self.session_factory() as session: - results = await session.execute(statement) - return [log.model_dump(mode='json') for log in results.scalars().all()] - - async def get_total_logs( - self, - search: Optional[str] = None, - start_time: Optional[str] = None, - end_time: Optional[str] = None, - ) -> int: - statement = select(func.count()).select_from(Log) - statement = self._apply_filters(statement, search, start_time, end_time) - - async with self.session_factory() as session: - result = await session.execute(statement) - return result.scalar() or 0 + def _json_field_equals(self, key: str): + # SQLite stores JSON as text; json_extract is the canonical accessor. 
+ return text(f"json_extract(fields, '$.{key}') = :val") async def get_log_histogram( self, @@ -206,173 +48,12 @@ class SQLiteRepository(BaseRepository): f"datetime((strftime('%s', timestamp) / {bucket_seconds}) * {bucket_seconds}, 'unixepoch')" ).label("bucket_time") - statement = select(bucket_expr, func.count().label("count")).select_from(Log) + statement: SelectOfScalar = select(bucket_expr, func.count().label("count")).select_from(Log) statement = self._apply_filters(statement, search, start_time, end_time) statement = statement.group_by(literal_column("bucket_time")).order_by( literal_column("bucket_time") ) - async with self.session_factory() as session: + async with self._session() as session: results = await session.execute(statement) return [{"time": r[0], "count": r[1]} for r in results.all()] - - async def get_stats_summary(self) -> dict[str, Any]: - async with self.session_factory() as session: - total_logs = ( - await session.execute(select(func.count()).select_from(Log)) - ).scalar() or 0 - unique_attackers = ( - await session.execute( - select(func.count(func.distinct(Log.attacker_ip))) - ) - ).scalar() or 0 - active_deckies = ( - await session.execute( - select(func.count(func.distinct(Log.decky))) - ) - ).scalar() or 0 - - _state = await asyncio.to_thread(load_state) - deployed_deckies = len(_state[0].deckies) if _state else 0 - - return { - "total_logs": total_logs, - "unique_attackers": unique_attackers, - "active_deckies": active_deckies, - "deployed_deckies": deployed_deckies, - } - - async def get_deckies(self) -> List[dict]: - _state = await asyncio.to_thread(load_state) - return [_d.model_dump() for _d in _state[0].deckies] if _state else [] - - # ------------------------------------------------------------------ users - - async def get_user_by_username(self, username: str) -> Optional[dict]: - async with self.session_factory() as session: - result = await session.execute( - select(User).where(User.username == username) - ) - user = 
result.scalar_one_or_none() - return user.model_dump() if user else None - - async def get_user_by_uuid(self, uuid: str) -> Optional[dict]: - async with self.session_factory() as session: - result = await session.execute( - select(User).where(User.uuid == uuid) - ) - user = result.scalar_one_or_none() - return user.model_dump() if user else None - - async def create_user(self, user_data: dict[str, Any]) -> None: - async with self.session_factory() as session: - session.add(User(**user_data)) - await session.commit() - - async def update_user_password( - self, uuid: str, password_hash: str, must_change_password: bool = False - ) -> None: - async with self.session_factory() as session: - await session.execute( - update(User) - .where(User.uuid == uuid) - .values( - password_hash=password_hash, - must_change_password=must_change_password, - ) - ) - await session.commit() - - # ---------------------------------------------------------------- bounties - - async def add_bounty(self, bounty_data: dict[str, Any]) -> None: - data = bounty_data.copy() - if "payload" in data and isinstance(data["payload"], dict): - data["payload"] = json.dumps(data["payload"]) - - async with self.session_factory() as session: - session.add(Bounty(**data)) - await session.commit() - - def _apply_bounty_filters( - self, - statement: SelectOfScalar, - bounty_type: Optional[str], - search: Optional[str] - ) -> SelectOfScalar: - if bounty_type: - statement = statement.where(Bounty.bounty_type == bounty_type) - if search: - lk = f"%{search}%" - statement = statement.where( - or_( - Bounty.decky.like(lk), - Bounty.service.like(lk), - Bounty.attacker_ip.like(lk), - Bounty.payload.like(lk), - ) - ) - return statement - - async def get_bounties( - self, - limit: int = 50, - offset: int = 0, - bounty_type: Optional[str] = None, - search: Optional[str] = None, - ) -> List[dict]: - statement = ( - select(Bounty) - .order_by(desc(Bounty.timestamp)) - .offset(offset) - .limit(limit) - ) - statement = 
self._apply_bounty_filters(statement, bounty_type, search) - - async with self.session_factory() as session: - results = await session.execute(statement) - final = [] - for item in results.scalars().all(): - d = item.model_dump(mode='json') - try: - d["payload"] = json.loads(d["payload"]) - except (json.JSONDecodeError, TypeError): - pass - final.append(d) - return final - - async def get_total_bounties( - self, bounty_type: Optional[str] = None, search: Optional[str] = None - ) -> int: - statement = select(func.count()).select_from(Bounty) - statement = self._apply_bounty_filters(statement, bounty_type, search) - - async with self.session_factory() as session: - result = await session.execute(statement) - return result.scalar() or 0 - - async def get_state(self, key: str) -> Optional[dict[str, Any]]: - async with self.session_factory() as session: - statement = select(State).where(State.key == key) - result = await session.execute(statement) - state = result.scalar_one_or_none() - if state: - return json.loads(state.value) - return None - - async def set_state(self, key: str, value: Any) -> None: # noqa: ANN401 - async with self.session_factory() as session: - # Check if exists - statement = select(State).where(State.key == key) - result = await session.execute(statement) - state = result.scalar_one_or_none() - - value_json = json.dumps(value) - if state: - state.value = value_json - session.add(state) - else: - new_state = State(key=key, value=value_json) - session.add(new_state) - - await session.commit() diff --git a/decnet/web/db/sqlmodel_repo.py b/decnet/web/db/sqlmodel_repo.py new file mode 100644 index 0000000..b5f40f4 --- /dev/null +++ b/decnet/web/db/sqlmodel_repo.py @@ -0,0 +1,901 @@ +""" +Shared SQLModel-based repository implementation. + +Contains all dialect-portable query code used by the SQLite and MySQL +backends. 
Dialect-specific behavior lives in subclasses: + +* engine/session construction (``__init__``) +* ``_migrate_attackers_table`` (legacy schema check; DDL introspection + is not portable) +* ``get_log_histogram`` (date-bucket expression differs per dialect) +""" +from __future__ import annotations + +import asyncio +import json + +import orjson +import uuid +from datetime import datetime, timezone +from typing import Any, Optional, List + +from sqlalchemy import func, select, desc, asc, text, or_, update +from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker +from sqlmodel.sql.expression import SelectOfScalar + +from decnet.config import load_state +from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD +from decnet.web.auth import get_password_hash +from decnet.web.db.repository import BaseRepository +from decnet.web.db.models import ( + User, + Log, + Bounty, + State, + Attacker, + AttackerBehavior, + SwarmHost, + DeckyShard, +) + + +from contextlib import asynccontextmanager + +from decnet.logging import get_logger + +_log = get_logger("db.pool") + +# Hold strong refs to in-flight cleanup tasks so they aren't GC'd mid-run. +_cleanup_tasks: set[asyncio.Task] = set() + + +def _detach_close(session: AsyncSession) -> None: + """Hand session cleanup to a fresh task so the caller's cancellation + doesn't interrupt it. + + ``asyncio.shield`` doesn't help on the exception path: shield prevents + *other* tasks from cancelling the inner coroutine, but if the *current* + task is already cancelled, its next ``await`` re-raises + ``CancelledError`` as soon as the inner coroutine yields. That's what + happens when uvicorn cancels a request mid-query — the rollback inside + ``session.close()`` can't complete, and the aiomysql connection is + orphaned (pool logs "non-checked-in connection" on GC). 
+ + A fresh task isn't subject to the caller's pending cancellation, so + ``close()`` (or the ``invalidate()`` fallback for a dead connection) + runs to completion and the pool reclaims the connection promptly. + + Fire-and-forget on purpose: the caller is already unwinding and must + not wait on cleanup. + """ + async def _cleanup() -> None: + try: + await session.close() + except BaseException: + try: + session.sync_session.invalidate() + except BaseException: + _log.debug("detach-close: invalidate failed", exc_info=True) + + try: + loop = asyncio.get_running_loop() + except RuntimeError: + # No running loop (shutdown path) — best-effort sync invalidate. + try: + session.sync_session.invalidate() + except BaseException: + _log.debug("detach-close: no-loop invalidate failed", exc_info=True) + return + task = loop.create_task(_cleanup()) + _cleanup_tasks.add(task) + # Consume any exception to silence "Task exception was never retrieved". + task.add_done_callback(lambda t: (_cleanup_tasks.discard(t), t.exception())) + + +@asynccontextmanager +async def _safe_session(factory: async_sessionmaker[AsyncSession]): + """Session context manager that keeps close() reliable under cancellation. + + Success path: await close() inline so the caller observes cleanup + (commit visibility, connection release) before proceeding. + + Exception path (includes CancelledError from client disconnects): + detach close() to a fresh task. The caller is unwinding and its + own cancellation would abort an inline close mid-rollback, leaving + the aiomysql connection orphaned. + """ + session = factory() + try: + yield session + except BaseException: + _detach_close(session) + raise + else: + await session.close() + + +class SQLModelRepository(BaseRepository): + """Concrete SQLModel/SQLAlchemy-async repository. + + Subclasses provide ``self.engine`` (AsyncEngine) and ``self.session_factory`` + in ``__init__``, and override the few dialect-specific helpers. 
+ """ + + engine: AsyncEngine + session_factory: async_sessionmaker[AsyncSession] + + def _session(self): + """Return a cancellation-safe session context manager.""" + return _safe_session(self.session_factory) + + # ------------------------------------------------------------ lifecycle + + async def initialize(self) -> None: + """Create tables if absent and seed the admin user.""" + from sqlmodel import SQLModel + await self._migrate_attackers_table() + async with self.engine.begin() as conn: + await conn.run_sync(SQLModel.metadata.create_all) + await self._ensure_admin_user() + + async def reinitialize(self) -> None: + """Re-create schema (for tests / reset flows). Does NOT drop existing tables.""" + from sqlmodel import SQLModel + async with self.engine.begin() as conn: + await conn.run_sync(SQLModel.metadata.create_all) + await self._ensure_admin_user() + + async def _ensure_admin_user(self) -> None: + async with self._session() as session: + result = await session.execute( + select(User).where(User.username == DECNET_ADMIN_USER) + ) + existing = result.scalar_one_or_none() + if existing is None: + session.add(User( + uuid=str(uuid.uuid4()), + username=DECNET_ADMIN_USER, + password_hash=get_password_hash(DECNET_ADMIN_PASSWORD), + role="admin", + must_change_password=True, + )) + await session.commit() + return + # Self-heal env drift: if admin never finalized their password, + # re-sync the hash from DECNET_ADMIN_PASSWORD. Otherwise leave + # the user's chosen password alone. + if existing.must_change_password: + existing.password_hash = get_password_hash(DECNET_ADMIN_PASSWORD) + session.add(existing) + await session.commit() + + async def _migrate_attackers_table(self) -> None: + """Legacy-schema cleanup. 
Override per dialect (DDL introspection is non-portable).""" + return None + + # ---------------------------------------------------------------- logs + + @staticmethod + def _normalize_log_row(log_data: dict[str, Any]) -> dict[str, Any]: + data = log_data.copy() + if "fields" in data and isinstance(data["fields"], dict): + data["fields"] = orjson.dumps(data["fields"]).decode() + if "timestamp" in data and isinstance(data["timestamp"], str): + try: + data["timestamp"] = datetime.fromisoformat( + data["timestamp"].replace("Z", "+00:00") + ) + except ValueError: + pass + return data + + async def add_log(self, log_data: dict[str, Any]) -> None: + data = self._normalize_log_row(log_data) + async with self._session() as session: + session.add(Log(**data)) + await session.commit() + + async def add_logs(self, log_entries: list[dict[str, Any]]) -> None: + """Bulk insert — one session, one commit for the whole batch.""" + if not log_entries: + return + _rows = [Log(**self._normalize_log_row(e)) for e in log_entries] + async with self._session() as session: + session.add_all(_rows) + await session.commit() + + def _apply_filters( + self, + statement: SelectOfScalar, + search: Optional[str], + start_time: Optional[str], + end_time: Optional[str], + ) -> SelectOfScalar: + import re + import shlex + + if start_time: + statement = statement.where(Log.timestamp >= start_time) + if end_time: + statement = statement.where(Log.timestamp <= end_time) + + if search: + try: + tokens = shlex.split(search) + except ValueError: + tokens = search.split() + + core_fields = { + "decky": Log.decky, + "service": Log.service, + "event": Log.event_type, + "attacker": Log.attacker_ip, + "attacker-ip": Log.attacker_ip, + "attacker_ip": Log.attacker_ip, + } + + for token in tokens: + if ":" in token: + key, val = token.split(":", 1) + if key in core_fields: + statement = statement.where(core_fields[key] == val) + else: + key_safe = re.sub(r"[^a-zA-Z0-9_]", "", key) + if key_safe: + statement = 
statement.where( + self._json_field_equals(key_safe) + ).params(val=val) + else: + lk = f"%{token}%" + statement = statement.where( + or_( + Log.raw_line.like(lk), + Log.decky.like(lk), + Log.service.like(lk), + Log.attacker_ip.like(lk), + ) + ) + return statement + + def _json_field_equals(self, key: str): + """Return a text() predicate that matches rows where fields->key == :val. + + Both SQLite and MySQL expose a ``JSON_EXTRACT`` function; MySQL also + exposes the same function under ``json_extract`` (case-insensitive). + The ``:val`` parameter is bound separately and must be supplied with + ``.params(val=...)`` by the caller, which keeps us safe from injection. + """ + return text(f"JSON_EXTRACT(fields, '$.{key}') = :val") + + async def get_logs( + self, + limit: int = 50, + offset: int = 0, + search: Optional[str] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + ) -> List[dict]: + statement = ( + select(Log) + .order_by(desc(Log.timestamp)) + .offset(offset) + .limit(limit) + ) + statement = self._apply_filters(statement, search, start_time, end_time) + + async with self._session() as session: + results = await session.execute(statement) + return [log.model_dump(mode="json") for log in results.scalars().all()] + + async def get_max_log_id(self) -> int: + async with self._session() as session: + result = await session.execute(select(func.max(Log.id))) + val = result.scalar() + return val if val is not None else 0 + + async def get_logs_after_id( + self, + last_id: int, + limit: int = 50, + search: Optional[str] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + ) -> List[dict]: + statement = ( + select(Log).where(Log.id > last_id).order_by(asc(Log.id)).limit(limit) + ) + statement = self._apply_filters(statement, search, start_time, end_time) + + async with self._session() as session: + results = await session.execute(statement) + return [log.model_dump(mode="json") for log in results.scalars().all()] 
+ + async def get_total_logs( + self, + search: Optional[str] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + ) -> int: + statement = select(func.count()).select_from(Log) + statement = self._apply_filters(statement, search, start_time, end_time) + + async with self._session() as session: + result = await session.execute(statement) + return result.scalar() or 0 + + async def get_log_histogram( + self, + search: Optional[str] = None, + start_time: Optional[str] = None, + end_time: Optional[str] = None, + interval_minutes: int = 15, + ) -> List[dict]: + """Dialect-specific — override per backend.""" + raise NotImplementedError + + async def get_stats_summary(self) -> dict[str, Any]: + async with self._session() as session: + total_logs = ( + await session.execute(select(func.count()).select_from(Log)) + ).scalar() or 0 + unique_attackers = ( + await session.execute( + select(func.count(func.distinct(Log.attacker_ip))) + ) + ).scalar() or 0 + + _state = await asyncio.to_thread(load_state) + deployed_deckies = len(_state[0].deckies) if _state else 0 + + return { + "total_logs": total_logs, + "unique_attackers": unique_attackers, + "active_deckies": deployed_deckies, + "deployed_deckies": deployed_deckies, + } + + async def get_deckies(self) -> List[dict]: + _state = await asyncio.to_thread(load_state) + return [_d.model_dump() for _d in _state[0].deckies] if _state else [] + + # --------------------------------------------------------------- users + + async def get_user_by_username(self, username: str) -> Optional[dict]: + async with self._session() as session: + result = await session.execute( + select(User).where(User.username == username) + ) + user = result.scalar_one_or_none() + return user.model_dump() if user else None + + async def get_user_by_uuid(self, uuid: str) -> Optional[dict]: + async with self._session() as session: + result = await session.execute( + select(User).where(User.uuid == uuid) + ) + user = 
result.scalar_one_or_none() + return user.model_dump() if user else None + + async def create_user(self, user_data: dict[str, Any]) -> None: + async with self._session() as session: + session.add(User(**user_data)) + await session.commit() + + async def update_user_password( + self, uuid: str, password_hash: str, must_change_password: bool = False + ) -> None: + async with self._session() as session: + await session.execute( + update(User) + .where(User.uuid == uuid) + .values( + password_hash=password_hash, + must_change_password=must_change_password, + ) + ) + await session.commit() + + async def list_users(self) -> list[dict]: + async with self._session() as session: + result = await session.execute(select(User)) + return [u.model_dump() for u in result.scalars().all()] + + async def delete_user(self, uuid: str) -> bool: + async with self._session() as session: + result = await session.execute(select(User).where(User.uuid == uuid)) + user = result.scalar_one_or_none() + if not user: + return False + await session.delete(user) + await session.commit() + return True + + async def update_user_role(self, uuid: str, role: str) -> None: + async with self._session() as session: + await session.execute( + update(User).where(User.uuid == uuid).values(role=role) + ) + await session.commit() + + async def purge_logs_and_bounties(self) -> dict[str, int]: + async with self._session() as session: + logs_deleted = (await session.execute(text("DELETE FROM logs"))).rowcount + bounties_deleted = (await session.execute(text("DELETE FROM bounty"))).rowcount + # attacker_behavior has FK → attackers.uuid; delete children first. 
+ await session.execute(text("DELETE FROM attacker_behavior")) + attackers_deleted = (await session.execute(text("DELETE FROM attackers"))).rowcount + await session.commit() + return { + "logs": logs_deleted, + "bounties": bounties_deleted, + "attackers": attackers_deleted, + } + + # ------------------------------------------------------------ bounties + + async def add_bounty(self, bounty_data: dict[str, Any]) -> None: + data = bounty_data.copy() + if "payload" in data and isinstance(data["payload"], dict): + data["payload"] = orjson.dumps(data["payload"]).decode() + + async with self._session() as session: + dup = await session.execute( + select(Bounty.id).where( + Bounty.bounty_type == data.get("bounty_type"), + Bounty.attacker_ip == data.get("attacker_ip"), + Bounty.payload == data.get("payload"), + ).limit(1) + ) + if dup.first() is not None: + return + session.add(Bounty(**data)) + await session.commit() + + def _apply_bounty_filters( + self, + statement: SelectOfScalar, + bounty_type: Optional[str], + search: Optional[str], + ) -> SelectOfScalar: + if bounty_type: + statement = statement.where(Bounty.bounty_type == bounty_type) + if search: + lk = f"%{search}%" + statement = statement.where( + or_( + Bounty.decky.like(lk), + Bounty.service.like(lk), + Bounty.attacker_ip.like(lk), + Bounty.payload.like(lk), + ) + ) + return statement + + async def get_bounties( + self, + limit: int = 50, + offset: int = 0, + bounty_type: Optional[str] = None, + search: Optional[str] = None, + ) -> List[dict]: + statement = ( + select(Bounty) + .order_by(desc(Bounty.timestamp)) + .offset(offset) + .limit(limit) + ) + statement = self._apply_bounty_filters(statement, bounty_type, search) + + async with self._session() as session: + results = await session.execute(statement) + final = [] + for item in results.scalars().all(): + d = item.model_dump(mode="json") + try: + d["payload"] = json.loads(d["payload"]) + except (json.JSONDecodeError, TypeError): + pass + final.append(d) + 
return final + + async def get_total_bounties( + self, bounty_type: Optional[str] = None, search: Optional[str] = None + ) -> int: + statement = select(func.count()).select_from(Bounty) + statement = self._apply_bounty_filters(statement, bounty_type, search) + + async with self._session() as session: + result = await session.execute(statement) + return result.scalar() or 0 + + async def get_state(self, key: str) -> Optional[dict[str, Any]]: + async with self._session() as session: + statement = select(State).where(State.key == key) + result = await session.execute(statement) + state = result.scalar_one_or_none() + if state: + return json.loads(state.value) + return None + + async def set_state(self, key: str, value: Any) -> None: # noqa: ANN401 + async with self._session() as session: + statement = select(State).where(State.key == key) + result = await session.execute(statement) + state = result.scalar_one_or_none() + + value_json = orjson.dumps(value).decode() + if state: + state.value = value_json + session.add(state) + else: + session.add(State(key=key, value=value_json)) + + await session.commit() + + # ----------------------------------------------------------- attackers + + async def get_all_bounties_by_ip(self) -> dict[str, List[dict[str, Any]]]: + from collections import defaultdict + async with self._session() as session: + result = await session.execute( + select(Bounty).order_by(asc(Bounty.timestamp)) + ) + grouped: dict[str, List[dict[str, Any]]] = defaultdict(list) + for item in result.scalars().all(): + d = item.model_dump(mode="json") + try: + d["payload"] = json.loads(d["payload"]) + except (json.JSONDecodeError, TypeError): + pass + grouped[item.attacker_ip].append(d) + return dict(grouped) + + async def get_bounties_for_ips(self, ips: set[str]) -> dict[str, List[dict[str, Any]]]: + from collections import defaultdict + async with self._session() as session: + result = await session.execute( + 
select(Bounty).where(Bounty.attacker_ip.in_(ips)).order_by(asc(Bounty.timestamp)) + ) + grouped: dict[str, List[dict[str, Any]]] = defaultdict(list) + for item in result.scalars().all(): + d = item.model_dump(mode="json") + try: + d["payload"] = json.loads(d["payload"]) + except (json.JSONDecodeError, TypeError): + pass + grouped[item.attacker_ip].append(d) + return dict(grouped) + + async def upsert_attacker(self, data: dict[str, Any]) -> str: + async with self._session() as session: + result = await session.execute( + select(Attacker).where(Attacker.ip == data["ip"]) + ) + existing = result.scalar_one_or_none() + if existing: + for k, v in data.items(): + setattr(existing, k, v) + session.add(existing) + row_uuid = existing.uuid + else: + row_uuid = str(uuid.uuid4()) + data = {**data, "uuid": row_uuid} + session.add(Attacker(**data)) + await session.commit() + return row_uuid + + async def upsert_attacker_behavior( + self, + attacker_uuid: str, + data: dict[str, Any], + ) -> None: + async with self._session() as session: + result = await session.execute( + select(AttackerBehavior).where( + AttackerBehavior.attacker_uuid == attacker_uuid + ) + ) + existing = result.scalar_one_or_none() + payload = {**data, "updated_at": datetime.now(timezone.utc)} + if existing: + for k, v in payload.items(): + setattr(existing, k, v) + session.add(existing) + else: + session.add(AttackerBehavior(attacker_uuid=attacker_uuid, **payload)) + await session.commit() + + async def get_attacker_behavior( + self, + attacker_uuid: str, + ) -> Optional[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(AttackerBehavior).where( + AttackerBehavior.attacker_uuid == attacker_uuid + ) + ) + row = result.scalar_one_or_none() + if not row: + return None + return self._deserialize_behavior(row.model_dump(mode="json")) + + async def get_behaviors_for_ips( + self, + ips: set[str], + ) -> dict[str, dict[str, Any]]: + if not ips: + return {} + async 
with self._session() as session: + result = await session.execute( + select(Attacker.ip, AttackerBehavior) + .join(AttackerBehavior, Attacker.uuid == AttackerBehavior.attacker_uuid) + .where(Attacker.ip.in_(ips)) + ) + out: dict[str, dict[str, Any]] = {} + for ip, row in result.all(): + out[ip] = self._deserialize_behavior(row.model_dump(mode="json")) + return out + + @staticmethod + def _deserialize_behavior(d: dict[str, Any]) -> dict[str, Any]: + for key in ("tcp_fingerprint", "timing_stats", "phase_sequence"): + if isinstance(d.get(key), str): + try: + d[key] = json.loads(d[key]) + except (json.JSONDecodeError, TypeError): + pass + # Deserialize tool_guesses JSON array; normalise None → []. + raw = d.get("tool_guesses") + if isinstance(raw, str): + try: + parsed = json.loads(raw) + d["tool_guesses"] = parsed if isinstance(parsed, list) else [parsed] + except (json.JSONDecodeError, TypeError): + d["tool_guesses"] = [] + elif raw is None: + d["tool_guesses"] = [] + return d + + @staticmethod + def _deserialize_attacker(d: dict[str, Any]) -> dict[str, Any]: + for key in ("services", "deckies", "fingerprints", "commands"): + if isinstance(d.get(key), str): + try: + d[key] = json.loads(d[key]) + except (json.JSONDecodeError, TypeError): + pass + return d + + async def get_attacker_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(Attacker).where(Attacker.uuid == uuid) + ) + attacker = result.scalar_one_or_none() + if not attacker: + return None + return self._deserialize_attacker(attacker.model_dump(mode="json")) + + async def get_attackers( + self, + limit: int = 50, + offset: int = 0, + search: Optional[str] = None, + sort_by: str = "recent", + service: Optional[str] = None, + ) -> List[dict[str, Any]]: + order = { + "active": desc(Attacker.event_count), + "traversals": desc(Attacker.is_traversal), + }.get(sort_by, desc(Attacker.last_seen)) + + statement = 
select(Attacker).order_by(order).offset(offset).limit(limit) + if search: + statement = statement.where(Attacker.ip.like(f"%{search}%")) + if service: + statement = statement.where(Attacker.services.like(f'%"{service}"%')) + + async with self._session() as session: + result = await session.execute(statement) + return [ + self._deserialize_attacker(a.model_dump(mode="json")) + for a in result.scalars().all() + ] + + async def get_total_attackers( + self, search: Optional[str] = None, service: Optional[str] = None + ) -> int: + statement = select(func.count()).select_from(Attacker) + if search: + statement = statement.where(Attacker.ip.like(f"%{search}%")) + if service: + statement = statement.where(Attacker.services.like(f'%"{service}"%')) + + async with self._session() as session: + result = await session.execute(statement) + return result.scalar() or 0 + + async def get_attacker_commands( + self, + uuid: str, + limit: int = 50, + offset: int = 0, + service: Optional[str] = None, + ) -> dict[str, Any]: + async with self._session() as session: + result = await session.execute( + select(Attacker.commands).where(Attacker.uuid == uuid) + ) + raw = result.scalar_one_or_none() + if raw is None: + return {"total": 0, "data": []} + + commands: list = json.loads(raw) if isinstance(raw, str) else raw + if service: + commands = [c for c in commands if c.get("service") == service] + + total = len(commands) + page = commands[offset: offset + limit] + return {"total": total, "data": page} + + async def get_attacker_artifacts(self, uuid: str) -> list[dict[str, Any]]: + """Return `file_captured` logs for the attacker identified by UUID. + + Resolves the attacker's IP first, then queries the logs table on two + indexed columns (``attacker_ip`` and ``event_type``). No JSON extract + needed — the decky/stored_as are already decoded into ``fields`` by + the ingester and returned to the frontend for drawer rendering. 
+ """ + async with self._session() as session: + ip_res = await session.execute( + select(Attacker.ip).where(Attacker.uuid == uuid) + ) + ip = ip_res.scalar_one_or_none() + if not ip: + return [] + rows = await session.execute( + select(Log) + .where(Log.attacker_ip == ip) + .where(Log.event_type == "file_captured") + .order_by(desc(Log.timestamp)) + .limit(200) + ) + return [r.model_dump(mode="json") for r in rows.scalars().all()] + + # ------------------------------------------------------------- swarm + + async def add_swarm_host(self, data: dict[str, Any]) -> None: + async with self._session() as session: + session.add(SwarmHost(**data)) + await session.commit() + + async def get_swarm_host_by_name(self, name: str) -> Optional[dict[str, Any]]: + async with self._session() as session: + result = await session.execute(select(SwarmHost).where(SwarmHost.name == name)) + row = result.scalar_one_or_none() + return row.model_dump(mode="json") if row else None + + async def get_swarm_host_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]: + async with self._session() as session: + result = await session.execute(select(SwarmHost).where(SwarmHost.uuid == uuid)) + row = result.scalar_one_or_none() + return row.model_dump(mode="json") if row else None + + async def get_swarm_host_by_fingerprint(self, fingerprint: str) -> Optional[dict[str, Any]]: + async with self._session() as session: + result = await session.execute( + select(SwarmHost).where(SwarmHost.client_cert_fingerprint == fingerprint) + ) + row = result.scalar_one_or_none() + return row.model_dump(mode="json") if row else None + + async def list_swarm_hosts(self, status: Optional[str] = None) -> list[dict[str, Any]]: + statement = select(SwarmHost).order_by(asc(SwarmHost.name)) + if status: + statement = statement.where(SwarmHost.status == status) + async with self._session() as session: + result = await session.execute(statement) + return [r.model_dump(mode="json") for r in result.scalars().all()] + + async 
def update_swarm_host(self, uuid: str, fields: dict[str, Any]) -> None: + if not fields: + return + async with self._session() as session: + await session.execute( + update(SwarmHost).where(SwarmHost.uuid == uuid).values(**fields) + ) + await session.commit() + + async def delete_swarm_host(self, uuid: str) -> bool: + async with self._session() as session: + # Clean up child shards first (no ON DELETE CASCADE portable across dialects). + await session.execute( + text("DELETE FROM decky_shards WHERE host_uuid = :u"), {"u": uuid} + ) + result = await session.execute( + select(SwarmHost).where(SwarmHost.uuid == uuid) + ) + host = result.scalar_one_or_none() + if not host: + await session.commit() + return False + await session.delete(host) + await session.commit() + return True + + async def upsert_decky_shard(self, data: dict[str, Any]) -> None: + payload = {**data, "updated_at": datetime.now(timezone.utc)} + if isinstance(payload.get("services"), list): + payload["services"] = orjson.dumps(payload["services"]).decode() + async with self._session() as session: + result = await session.execute( + select(DeckyShard).where(DeckyShard.decky_name == payload["decky_name"]) + ) + existing = result.scalar_one_or_none() + if existing: + for k, v in payload.items(): + setattr(existing, k, v) + session.add(existing) + else: + session.add(DeckyShard(**payload)) + await session.commit() + + async def list_decky_shards( + self, host_uuid: Optional[str] = None + ) -> list[dict[str, Any]]: + statement = select(DeckyShard).order_by(asc(DeckyShard.decky_name)) + if host_uuid: + statement = statement.where(DeckyShard.host_uuid == host_uuid) + async with self._session() as session: + result = await session.execute(statement) + out: list[dict[str, Any]] = [] + for r in result.scalars().all(): + d = r.model_dump(mode="json") + raw = d.get("services") + if isinstance(raw, str): + try: + d["services"] = json.loads(raw) + except (json.JSONDecodeError, TypeError): + d["services"] = [] + # 
Flatten the stored DeckyConfig snapshot into the row so + # routers can hand it to DeckyShardView without re-parsing. + # Rows predating the migration have decky_config=NULL and + # fall through with the default (None/{}) view values. + cfg_raw = d.get("decky_config") + if isinstance(cfg_raw, str): + try: + cfg = json.loads(cfg_raw) + except (json.JSONDecodeError, TypeError): + cfg = {} + if isinstance(cfg, dict): + for k in ("hostname", "distro", "archetype", + "service_config", "mutate_interval", + "last_mutated"): + if k in cfg and d.get(k) is None: + d[k] = cfg[k] + # Keep decky_ip authoritative from the column (newer + # heartbeats overwrite it) but fall back to the + # snapshot if the column is still NULL. + if not d.get("decky_ip") and cfg.get("ip"): + d["decky_ip"] = cfg["ip"] + out.append(d) + return out + + async def delete_decky_shards_for_host(self, host_uuid: str) -> int: + async with self._session() as session: + result = await session.execute( + text("DELETE FROM decky_shards WHERE host_uuid = :u"), + {"u": host_uuid}, + ) + await session.commit() + return result.rowcount or 0 + + async def delete_decky_shard(self, decky_name: str) -> bool: + async with self._session() as session: + result = await session.execute( + text("DELETE FROM decky_shards WHERE decky_name = :n"), + {"n": decky_name}, + ) + await session.commit() + return bool(result.rowcount) diff --git a/decnet/web/dependencies.py b/decnet/web/dependencies.py index 99a6d39..d3f83d2 100644 --- a/decnet/web/dependencies.py +++ b/decnet/web/dependencies.py @@ -1,3 +1,5 @@ +import asyncio +import time from typing import Any, Optional import jwt @@ -23,6 +25,88 @@ repo = get_repo() oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login") +# Per-request user lookup was the hidden tax behind every authed endpoint — +# SELECT users WHERE uuid=? ran once per call, serializing through aiosqlite. +# 10s TTL is well below JWT expiry and we invalidate on all user writes. 
+_USER_TTL = 10.0 +_user_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {} +_user_cache_lock: Optional[asyncio.Lock] = None + +# Username cache for the login hot path. Short TTL — the bcrypt verify +# still runs against the cached hash, so security is unchanged. The +# staleness window is: if a password is changed, the old password is +# usable for up to _USERNAME_TTL seconds until the cache expires (or +# invalidate_user_cache fires). We invalidate on every user write. +# Missing lookups are NOT cached to avoid locking out a just-created user. +_USERNAME_TTL = 5.0 +_username_cache: dict[str, tuple[dict[str, Any], float]] = {} +_username_cache_lock: Optional[asyncio.Lock] = None + + +def _reset_user_cache() -> None: + global _user_cache, _user_cache_lock, _username_cache, _username_cache_lock + _user_cache = {} + _user_cache_lock = None + _username_cache = {} + _username_cache_lock = None + + +def invalidate_user_cache(user_uuid: Optional[str] = None) -> None: + """Drop a single user (or all users) from the auth caches. + + Callers: password change, role change, user create/delete. + The username cache is always cleared wholesale — we don't track + uuid→username and user writes are rare, so the cost is trivial. + """ + if user_uuid is None: + _user_cache.clear() + else: + _user_cache.pop(user_uuid, None) + _username_cache.clear() + + +async def get_user_by_username_cached(username: str) -> Optional[dict[str, Any]]: + """Cached read of get_user_by_username for the login path. + + Positive hits are cached for _USERNAME_TTL seconds. Misses bypass + the cache so a freshly-created user can log in immediately. 
+ """ + global _username_cache_lock + entry = _username_cache.get(username) + now = time.monotonic() + if entry is not None and now - entry[1] < _USERNAME_TTL: + return entry[0] + if _username_cache_lock is None: + _username_cache_lock = asyncio.Lock() + async with _username_cache_lock: + entry = _username_cache.get(username) + now = time.monotonic() + if entry is not None and now - entry[1] < _USERNAME_TTL: + return entry[0] + user = await repo.get_user_by_username(username) + if user is not None: + _username_cache[username] = (user, time.monotonic()) + return user + + +async def _get_user_cached(user_uuid: str) -> Optional[dict[str, Any]]: + global _user_cache_lock + entry = _user_cache.get(user_uuid) + now = time.monotonic() + if entry is not None and now - entry[1] < _USER_TTL: + return entry[0] + if _user_cache_lock is None: + _user_cache_lock = asyncio.Lock() + async with _user_cache_lock: + entry = _user_cache.get(user_uuid) + now = time.monotonic() + if entry is not None and now - entry[1] < _USER_TTL: + return entry[0] + user = await repo.get_user_by_uuid(user_uuid) + _user_cache[user_uuid] = (user, time.monotonic()) + return user + + async def get_stream_user(request: Request, token: Optional[str] = None) -> str: """Auth dependency for SSE endpoints — accepts Bearer header OR ?token= query param. EventSource does not support custom headers, so the query-string fallback is intentional here only. 
@@ -82,7 +166,7 @@ async def _decode_token(request: Request) -> str: async def get_current_user(request: Request) -> str: """Auth dependency — enforces must_change_password.""" _user_uuid = await _decode_token(request) - _user = await repo.get_user_by_uuid(_user_uuid) + _user = await _get_user_cached(_user_uuid) if _user and _user.get("must_change_password"): raise HTTPException( status_code=status.HTTP_403_FORBIDDEN, @@ -96,3 +180,57 @@ async def get_current_user_unchecked(request: Request) -> str: Use only for endpoints that must remain reachable with the flag set (e.g. change-password). """ return await _decode_token(request) + + +# --------------------------------------------------------------------------- +# Role-based access control +# --------------------------------------------------------------------------- + +def require_role(*allowed_roles: str): + """Factory that returns a FastAPI dependency enforcing role membership. + + Inlines JWT decode + user lookup + must_change_password + role check so the + user is only loaded from the DB once per request (not once in + ``get_current_user`` and again here). Returns the full user dict so + endpoints can inspect ``user["uuid"]``, ``user["role"]``, etc. 
+ """ + async def _check(request: Request) -> dict: + user_uuid = await _decode_token(request) + user = await _get_user_cached(user_uuid) + if not user: + raise HTTPException( + status_code=status.HTTP_401_UNAUTHORIZED, + detail="Could not validate credentials", + headers={"WWW-Authenticate": "Bearer"}, + ) + if user.get("must_change_password"): + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Password change required before accessing this resource", + ) + if user["role"] not in allowed_roles: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Insufficient permissions", + ) + return user + return _check + + +def require_stream_role(*allowed_roles: str): + """Like ``require_role`` but for SSE endpoints that accept a query-param token.""" + async def _check(request: Request, token: Optional[str] = None) -> dict: + user_uuid = await get_stream_user(request, token) + user = await _get_user_cached(user_uuid) + if not user or user["role"] not in allowed_roles: + raise HTTPException( + status_code=status.HTTP_403_FORBIDDEN, + detail="Insufficient permissions", + ) + return user + return _check + + +require_admin = require_role("admin") +require_viewer = require_role("viewer", "admin") +require_stream_viewer = require_stream_role("viewer", "admin") diff --git a/decnet/web/ingester.py b/decnet/web/ingester.py index 96a224a..bca1d63 100644 --- a/decnet/web/ingester.py +++ b/decnet/web/ingester.py @@ -1,13 +1,24 @@ import asyncio import os -import logging import json +import time from typing import Any from pathlib import Path +from decnet.env import DECNET_BATCH_SIZE, DECNET_BATCH_MAX_WAIT_MS +from decnet.logging import get_logger +from decnet.telemetry import ( + traced as _traced, + get_tracer as _get_tracer, + extract_context as _extract_ctx, + start_span_with_context as _start_span, +) from decnet.web.db.repository import BaseRepository -logger: logging.Logger = logging.getLogger("decnet.web.ingester") +logger = 
get_logger("api") + +_INGEST_STATE_KEY = "ingest_worker_position" + async def log_ingestion_worker(repo: BaseRepository) -> None: """ @@ -20,9 +31,11 @@ async def log_ingestion_worker(repo: BaseRepository) -> None: return _json_log_path: Path = Path(_base_log_file).with_suffix(".json") - _position: int = 0 - logger.info(f"Starting JSON log ingestion from {_json_log_path}") + _saved = await repo.get_state(_INGEST_STATE_KEY) + _position: int = _saved.get("position", 0) if _saved else 0 + + logger.info("ingest worker started path=%s position=%d", _json_log_path, _position) while True: try: @@ -34,46 +47,103 @@ async def log_ingestion_worker(repo: BaseRepository) -> None: if _stat.st_size < _position: # File rotated or truncated _position = 0 + await repo.set_state(_INGEST_STATE_KEY, {"position": 0}) if _stat.st_size == _position: # No new data await asyncio.sleep(1) continue + # Accumulate parsed rows and the file offset they end at. We + # only advance _position after the batch is successfully + # committed — if we get cancelled mid-flush, the next run + # re-reads the un-committed lines rather than losing them. + _batch: list[tuple[dict[str, Any], int]] = [] + _batch_started: float = time.monotonic() + _max_wait_s: float = DECNET_BATCH_MAX_WAIT_MS / 1000.0 + with open(_json_log_path, "r", encoding="utf-8", errors="replace") as _f: _f.seek(_position) while True: _line: str = _f.readline() - if not _line: - break # EOF reached - - if not _line.endswith('\n'): - # Partial line read, don't process yet, don't advance position + if not _line or not _line.endswith('\n'): + # EOF or partial line — flush what we have and stop break try: _log_data: dict[str, Any] = json.loads(_line.strip()) - await repo.add_log(_log_data) - await _extract_bounty(repo, _log_data) + # Collector injects trace context so the ingester span + # chains off the collector's — full event journey in Jaeger. 
+ _parent_ctx = _extract_ctx(_log_data) + _tracer = _get_tracer("ingester") + with _start_span(_tracer, "ingester.process_record", context=_parent_ctx) as _span: + _span.set_attribute("decky", _log_data.get("decky", "")) + _span.set_attribute("service", _log_data.get("service", "")) + _span.set_attribute("event_type", _log_data.get("event_type", "")) + _span.set_attribute("attacker_ip", _log_data.get("attacker_ip", "")) + _sctx = getattr(_span, "get_span_context", None) + if _sctx: + _ctx = _sctx() + if _ctx and getattr(_ctx, "trace_id", 0): + _log_data["trace_id"] = format(_ctx.trace_id, "032x") + _log_data["span_id"] = format(_ctx.span_id, "016x") + _batch.append((_log_data, _f.tell())) except json.JSONDecodeError: - logger.error(f"Failed to decode JSON log line: {_line}") + logger.error("ingest: failed to decode JSON log line: %s", _line.strip()) + # Skip past bad line so we don't loop forever on it. + _position = _f.tell() continue - # Update position after successful line read - _position = _f.tell() + if len(_batch) >= DECNET_BATCH_SIZE or ( + time.monotonic() - _batch_started >= _max_wait_s + ): + _position = await _flush_batch(repo, _batch, _position) + _batch.clear() + _batch_started = time.monotonic() + + # Flush any remainder collected before EOF / partial-line break. 
+ if _batch: + _position = await _flush_batch(repo, _batch, _position) except Exception as _e: _err_str = str(_e).lower() if "no such table" in _err_str or "no active connection" in _err_str or "connection closed" in _err_str: - logger.error(f"Post-shutdown or fatal DB error in ingester: {_e}") + logger.error("ingest: post-shutdown or fatal DB error: %s", _e) break # Exit worker — DB is gone or uninitialized - logger.error(f"Error in log ingestion worker: {_e}") + logger.error("ingest: error in worker: %s", _e) await asyncio.sleep(5) await asyncio.sleep(1) +async def _flush_batch( + repo: BaseRepository, + batch: list[tuple[dict[str, Any], int]], + current_position: int, +) -> int: + """Commit a batch of log rows and return the new file position. + + If the enclosing task is being cancelled, bail out without touching + the DB — the session factory may already be disposed during lifespan + teardown, and awaiting it would stall the worker. The un-flushed + lines stay uncommitted; the next startup re-reads them from + ``current_position``. + """ + _task = asyncio.current_task() + if _task is not None and _task.cancelling(): + raise asyncio.CancelledError() + + _entries = [_entry for _entry, _ in batch] + _new_position = batch[-1][1] + await repo.add_logs(_entries) + for _entry in _entries: + await _extract_bounty(repo, _entry) + await repo.set_state(_INGEST_STATE_KEY, {"position": _new_position}) + return _new_position + + +@_traced("ingester.extract_bounty") async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> None: """Detect and extract valuable artifacts (bounties) from log entries.""" _fields = log_data.get("fields") @@ -96,4 +166,180 @@ async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> Non } }) - # 2. Add more extractors here later (e.g. file hashes, crypto keys) + # 2. 
HTTP User-Agent fingerprint + _h_raw = _fields.get("headers") + if isinstance(_h_raw, dict): + _headers = _h_raw + elif isinstance(_h_raw, str): + try: + _parsed = json.loads(_h_raw) + _headers = _parsed if isinstance(_parsed, dict) else {} + except (json.JSONDecodeError, ValueError): + _headers = {} + else: + _headers = {} + _ua = _headers.get("User-Agent") or _headers.get("user-agent") + if _ua: + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": log_data.get("service"), + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "http_useragent", + "value": _ua, + "method": _fields.get("method"), + "path": _fields.get("path"), + } + }) + + # 3. VNC client version fingerprint + _vnc_ver = _fields.get("client_version") + if _vnc_ver and log_data.get("event_type") == "version": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": log_data.get("service"), + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "vnc_client_version", + "value": _vnc_ver, + } + }) + + # 4. SSH client banner fingerprint (deferred — requires asyncssh server) + # Fires on: service=ssh, event_type=client_banner, fields.client_banner + + # 5. 
JA3/JA3S TLS fingerprint from sniffer container + _ja3 = _fields.get("ja3") + if _ja3 and log_data.get("service") == "sniffer": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "sniffer", + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "ja3", + "ja3": _ja3, + "ja3s": _fields.get("ja3s"), + "ja4": _fields.get("ja4"), + "ja4s": _fields.get("ja4s"), + "tls_version": _fields.get("tls_version"), + "sni": _fields.get("sni") or None, + "alpn": _fields.get("alpn") or None, + "dst_port": _fields.get("dst_port"), + "raw_ciphers": _fields.get("raw_ciphers"), + "raw_extensions": _fields.get("raw_extensions"), + }, + }) + + # 6. JA4L latency fingerprint from sniffer + _ja4l_rtt = _fields.get("ja4l_rtt_ms") + if _ja4l_rtt and log_data.get("service") == "sniffer": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "sniffer", + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "ja4l", + "rtt_ms": _ja4l_rtt, + "client_ttl": _fields.get("ja4l_client_ttl"), + }, + }) + + # 7. TLS session resumption behavior + _resumption = _fields.get("resumption") + if _resumption and log_data.get("service") == "sniffer": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "sniffer", + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "tls_resumption", + "mechanisms": _resumption, + }, + }) + + # 8. 
TLS certificate details (TLS 1.2 only — passive extraction) + _subject_cn = _fields.get("subject_cn") + if _subject_cn and log_data.get("service") == "sniffer": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "sniffer", + "attacker_ip": log_data.get("attacker_ip"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "tls_certificate", + "subject_cn": _subject_cn, + "issuer": _fields.get("issuer"), + "self_signed": _fields.get("self_signed"), + "not_before": _fields.get("not_before"), + "not_after": _fields.get("not_after"), + "sans": _fields.get("sans"), + "sni": _fields.get("sni") or None, + }, + }) + + # 9. JARM fingerprint from active prober + _jarm = _fields.get("jarm_hash") + if _jarm and log_data.get("service") == "prober": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "prober", + "attacker_ip": _fields.get("target_ip", "Unknown"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "jarm", + "hash": _jarm, + "target_ip": _fields.get("target_ip"), + "target_port": _fields.get("target_port"), + }, + }) + + # 10. HASSHServer fingerprint from active prober + _hassh = _fields.get("hassh_server_hash") + if _hassh and log_data.get("service") == "prober": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "prober", + "attacker_ip": _fields.get("target_ip", "Unknown"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "hassh_server", + "hash": _hassh, + "target_ip": _fields.get("target_ip"), + "target_port": _fields.get("target_port"), + "ssh_banner": _fields.get("ssh_banner"), + "kex_algorithms": _fields.get("kex_algorithms"), + "encryption_s2c": _fields.get("encryption_s2c"), + "mac_s2c": _fields.get("mac_s2c"), + "compression_s2c": _fields.get("compression_s2c"), + }, + }) + + # 11. 
TCP/IP stack fingerprint from active prober + _tcpfp = _fields.get("tcpfp_hash") + if _tcpfp and log_data.get("service") == "prober": + await repo.add_bounty({ + "decky": log_data.get("decky"), + "service": "prober", + "attacker_ip": _fields.get("target_ip", "Unknown"), + "bounty_type": "fingerprint", + "payload": { + "fingerprint_type": "tcpfp", + "hash": _tcpfp, + "raw": _fields.get("tcpfp_raw"), + "target_ip": _fields.get("target_ip"), + "target_port": _fields.get("target_port"), + "ttl": _fields.get("ttl"), + "window_size": _fields.get("window_size"), + "df_bit": _fields.get("df_bit"), + "mss": _fields.get("mss"), + "window_scale": _fields.get("window_scale"), + "sack_ok": _fields.get("sack_ok"), + "timestamp": _fields.get("timestamp"), + "options_order": _fields.get("options_order"), + }, + }) diff --git a/decnet/web/router/__init__.py b/decnet/web/router/__init__.py index b1bd92e..cbbb99c 100644 --- a/decnet/web/router/__init__.py +++ b/decnet/web/router/__init__.py @@ -11,8 +11,32 @@ from .fleet.api_mutate_decky import router as mutate_decky_router from .fleet.api_mutate_interval import router as mutate_interval_router from .fleet.api_deploy_deckies import router as deploy_deckies_router from .stream.api_stream_events import router as stream_router +from .attackers.api_get_attackers import router as attackers_router +from .attackers.api_get_attacker_detail import router as attacker_detail_router +from .attackers.api_get_attacker_commands import router as attacker_commands_router +from .attackers.api_get_attacker_artifacts import router as attacker_artifacts_router +from .config.api_get_config import router as config_get_router +from .config.api_update_config import router as config_update_router +from .config.api_manage_users import router as config_users_router +from .config.api_reinit import router as config_reinit_router +from .health.api_get_health import router as health_router +from .artifacts.api_get_artifact import router as artifacts_router +from 
.swarm_updates import swarm_updates_router +from .swarm_mgmt import swarm_mgmt_router +from .system import system_router -api_router = APIRouter() +api_router = APIRouter( + # Every route under /api/v1 is auth-guarded (either by an explicit + # require_* Depends or by the global auth middleware). Document 401/403 + # here so the OpenAPI schema reflects reality for contract tests. + responses={ + 400: {"description": "Malformed request body"}, + 401: {"description": "Missing or invalid credentials"}, + 403: {"description": "Authenticated but not authorized"}, + 404: {"description": "Referenced resource does not exist"}, + 409: {"description": "Conflict with existing resource"}, + }, +) # Authentication api_router.include_router(login_router) @@ -31,6 +55,31 @@ api_router.include_router(mutate_decky_router) api_router.include_router(mutate_interval_router) api_router.include_router(deploy_deckies_router) +# Attacker Profiles +api_router.include_router(attackers_router) +api_router.include_router(attacker_detail_router) +api_router.include_router(attacker_commands_router) +api_router.include_router(attacker_artifacts_router) + # Observability api_router.include_router(stats_router) api_router.include_router(stream_router) +api_router.include_router(health_router) + +# Configuration +api_router.include_router(config_get_router) +api_router.include_router(config_update_router) +api_router.include_router(config_users_router) +api_router.include_router(config_reinit_router) + +# Artifacts (captured attacker file drops) +api_router.include_router(artifacts_router) + +# Remote Updates (dashboard → worker updater daemons) +api_router.include_router(swarm_updates_router) + +# Swarm Management (dashboard: hosts, deckies, agent enrollment bundles) +api_router.include_router(swarm_mgmt_router) + +# System info (deployment-mode auto-detection, etc.) 
+api_router.include_router(system_router) diff --git a/decnet/web/router/artifacts/__init__.py b/decnet/web/router/artifacts/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/decnet/web/router/artifacts/api_get_artifact.py b/decnet/web/router/artifacts/api_get_artifact.py new file mode 100644 index 0000000..c5f6c92 --- /dev/null +++ b/decnet/web/router/artifacts/api_get_artifact.py @@ -0,0 +1,84 @@ +""" +Artifact download endpoint. + +SSH deckies farm attacker file drops into a host-mounted quarantine: + /var/lib/decnet/artifacts/{decky}/ssh/{stored_as} + +The capture event already flows through the normal log pipeline (one +RFC 5424 line per capture, see templates/ssh/emit_capture.py), so metadata +is served via /logs. This endpoint exists only to retrieve the raw bytes — +admin-gated because the payloads are attacker-controlled content. +""" + +from __future__ import annotations + +import os +import re +from pathlib import Path + +from fastapi import APIRouter, Depends, HTTPException +from fastapi.responses import FileResponse + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_admin + +router = APIRouter() + +# Override via env for tests; the prod path matches the bind mount declared in +# decnet/services/ssh.py. +ARTIFACTS_ROOT = Path(os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts")) + +# decky names come from the deployer — lowercase alnum plus hyphens. +_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$") + +# stored_as is assembled by capture.sh as: +# ${ts}_${sha:0:12}_${base} +# where ts is ISO-8601 UTC (e.g. 2026-04-18T02:22:56Z), sha is 12 hex chars, +# and base is the original filename's basename. Keep the filename charset +# tight but allow common punctuation dropped files actually use. 
+_STORED_AS_RE = re.compile( + r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$" +) + + +def _resolve_artifact_path(decky: str, stored_as: str) -> Path: + """Validate inputs, resolve the on-disk path, and confirm it stays inside + the artifacts root. Raises HTTPException(400) on any violation.""" + if not _DECKY_RE.fullmatch(decky): + raise HTTPException(status_code=400, detail="invalid decky name") + if not _STORED_AS_RE.fullmatch(stored_as): + raise HTTPException(status_code=400, detail="invalid stored_as") + + root = ARTIFACTS_ROOT.resolve() + candidate = (root / decky / "ssh" / stored_as).resolve() + # defence-in-depth: even though the regexes reject `..`, make sure a + # symlink or weird filesystem state can't escape the root. + if root not in candidate.parents and candidate != root: + raise HTTPException(status_code=400, detail="path escapes artifacts root") + return candidate + + +@router.get( + "/artifacts/{decky}/{stored_as}", + tags=["Artifacts"], + responses={ + 400: {"description": "Invalid decky or stored_as parameter"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required"}, + 404: {"description": "Artifact not found"}, + }, +) +@_traced("api.get_artifact") +async def get_artifact( + decky: str, + stored_as: str, + admin: dict = Depends(require_admin), +) -> FileResponse: + path = _resolve_artifact_path(decky, stored_as) + if not path.is_file(): + raise HTTPException(status_code=404, detail="artifact not found") + return FileResponse( + path=str(path), + media_type="application/octet-stream", + filename=stored_as, + ) diff --git a/decnet/web/router/attackers/__init__.py b/decnet/web/router/attackers/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/decnet/web/router/attackers/api_get_attacker_artifacts.py b/decnet/web/router/attackers/api_get_attacker_artifacts.py new file mode 100644 index 0000000..000dc1f --- /dev/null +++ 
b/decnet/web/router/attackers/api_get_attacker_artifacts.py @@ -0,0 +1,34 @@ +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo + +router = APIRouter() + + +@router.get( + "/attackers/{uuid}/artifacts", + tags=["Attacker Profiles"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Attacker not found"}, + }, +) +@_traced("api.get_attacker_artifacts") +async def get_attacker_artifacts( + uuid: str, + user: dict = Depends(require_viewer), +) -> dict[str, Any]: + """List captured file-drop artifacts for an attacker (newest first). + + Each entry is a `file_captured` log row — the frontend renders the + badge/drawer using the same `fields` payload as /logs. + """ + attacker = await repo.get_attacker_by_uuid(uuid) + if not attacker: + raise HTTPException(status_code=404, detail="Attacker not found") + rows = await repo.get_attacker_artifacts(uuid) + return {"total": len(rows), "data": rows} diff --git a/decnet/web/router/attackers/api_get_attacker_commands.py b/decnet/web/router/attackers/api_get_attacker_commands.py new file mode 100644 index 0000000..14d03eb --- /dev/null +++ b/decnet/web/router/attackers/api_get_attacker_commands.py @@ -0,0 +1,42 @@ +from typing import Any, Optional + +from fastapi import APIRouter, Depends, HTTPException, Query + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo + +router = APIRouter() + + +@router.get( + "/attackers/{uuid}/commands", + tags=["Attacker Profiles"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Attacker not found"}, + 422: {"description": "Query parameter validation error (limit/offset out of range or invalid)"}, + }, +) 
+@_traced("api.get_attacker_commands") +async def get_attacker_commands( + uuid: str, + limit: int = Query(50, ge=1, le=200), + offset: int = Query(0, ge=0, le=2147483647), + service: Optional[str] = None, + user: dict = Depends(require_viewer), +) -> dict[str, Any]: + """Retrieve paginated commands for an attacker profile.""" + attacker = await repo.get_attacker_by_uuid(uuid) + if not attacker: + raise HTTPException(status_code=404, detail="Attacker not found") + + def _norm(v: Optional[str]) -> Optional[str]: + if v in (None, "null", "NULL", "undefined", ""): + return None + return v + + result = await repo.get_attacker_commands( + uuid=uuid, limit=limit, offset=offset, service=_norm(service), + ) + return {"total": result["total"], "limit": limit, "offset": offset, "data": result["data"]} diff --git a/decnet/web/router/attackers/api_get_attacker_detail.py b/decnet/web/router/attackers/api_get_attacker_detail.py new file mode 100644 index 0000000..dcc9ebd --- /dev/null +++ b/decnet/web/router/attackers/api_get_attacker_detail.py @@ -0,0 +1,30 @@ +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo + +router = APIRouter() + + +@router.get( + "/attackers/{uuid}", + tags=["Attacker Profiles"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Attacker not found"}, + }, +) +@_traced("api.get_attacker_detail") +async def get_attacker_detail( + uuid: str, + user: dict = Depends(require_viewer), +) -> dict[str, Any]: + """Retrieve a single attacker profile by UUID (with behavior block).""" + attacker = await repo.get_attacker_by_uuid(uuid) + if not attacker: + raise HTTPException(status_code=404, detail="Attacker not found") + attacker["behavior"] = await repo.get_attacker_behavior(uuid) + return attacker diff --git 
a/decnet/web/router/attackers/api_get_attackers.py b/decnet/web/router/attackers/api_get_attackers.py new file mode 100644 index 0000000..f1ff7b4 --- /dev/null +++ b/decnet/web/router/attackers/api_get_attackers.py @@ -0,0 +1,83 @@ +import asyncio +import time +from typing import Any, Optional + +from fastapi import APIRouter, Depends, Query + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo +from decnet.web.db.models import AttackersResponse + +router = APIRouter() + +# Same pattern as /logs — cache the unfiltered total count; filtered +# counts go straight to the DB. +_TOTAL_TTL = 2.0 +_total_cache: tuple[Optional[int], float] = (None, 0.0) +_total_lock: Optional[asyncio.Lock] = None + + +def _reset_total_cache() -> None: + global _total_cache, _total_lock + _total_cache = (None, 0.0) + _total_lock = None + + +async def _get_total_attackers_cached() -> int: + global _total_cache, _total_lock + value, ts = _total_cache + now = time.monotonic() + if value is not None and now - ts < _TOTAL_TTL: + return value + if _total_lock is None: + _total_lock = asyncio.Lock() + async with _total_lock: + value, ts = _total_cache + now = time.monotonic() + if value is not None and now - ts < _TOTAL_TTL: + return value + value = await repo.get_total_attackers() + _total_cache = (value, time.monotonic()) + return value + + +@router.get( + "/attackers", + response_model=AttackersResponse, + tags=["Attacker Profiles"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 422: {"description": "Validation error"}, + }, +) +@_traced("api.get_attackers") +async def get_attackers( + limit: int = Query(50, ge=1, le=1000), + offset: int = Query(0, ge=0, le=2147483647), + search: Optional[str] = None, + sort_by: str = Query("recent", pattern="^(recent|active|traversals)$"), + service: Optional[str] = None, + user: dict = Depends(require_viewer), +) -> dict[str, 
Any]: + """Retrieve paginated attacker profiles.""" + def _norm(v: Optional[str]) -> Optional[str]: + if v in (None, "null", "NULL", "undefined", ""): + return None + return v + + s = _norm(search) + svc = _norm(service) + _data = await repo.get_attackers(limit=limit, offset=offset, search=s, sort_by=sort_by, service=svc) + if s is None and svc is None: + _total = await _get_total_attackers_cached() + else: + _total = await repo.get_total_attackers(search=s, service=svc) + + # Bulk-join behavior rows for the IPs in this page to avoid N+1 queries. + _ips = {row["ip"] for row in _data if row.get("ip")} + _behaviors = await repo.get_behaviors_for_ips(_ips) if _ips else {} + for row in _data: + row["behavior"] = _behaviors.get(row.get("ip")) + + return {"total": _total, "limit": limit, "offset": offset, "data": _data} diff --git a/decnet/web/router/auth/api_change_pass.py b/decnet/web/router/auth/api_change_pass.py index c186973..592b11e 100644 --- a/decnet/web/router/auth/api_change_pass.py +++ b/decnet/web/router/auth/api_change_pass.py @@ -2,8 +2,9 @@ from typing import Any, Optional from fastapi import APIRouter, Depends, HTTPException, status -from decnet.web.auth import get_password_hash, verify_password -from decnet.web.dependencies import get_current_user_unchecked, repo +from decnet.telemetry import traced as _traced +from decnet.web.auth import ahash_password, averify_password +from decnet.web.dependencies import get_current_user_unchecked, invalidate_user_cache, repo from decnet.web.db.models import ChangePasswordRequest router = APIRouter() @@ -18,14 +19,16 @@ router = APIRouter() 422: {"description": "Validation error"} }, ) +@_traced("api.change_password") async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]: _user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user) - if not _user or not verify_password(request.old_password, _user["password_hash"]): + if not 
_user or not await averify_password(request.old_password, _user["password_hash"]): raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect old password", ) - _new_hash: str = get_password_hash(request.new_password) + _new_hash: str = await ahash_password(request.new_password) await repo.update_user_password(current_user, _new_hash, must_change_password=False) + invalidate_user_cache(current_user) return {"message": "Password updated successfully"} diff --git a/decnet/web/router/auth/api_login.py b/decnet/web/router/auth/api_login.py index a9db5b7..a41eaab 100644 --- a/decnet/web/router/auth/api_login.py +++ b/decnet/web/router/auth/api_login.py @@ -3,12 +3,13 @@ from typing import Any, Optional from fastapi import APIRouter, HTTPException, status +from decnet.telemetry import traced as _traced from decnet.web.auth import ( ACCESS_TOKEN_EXPIRE_MINUTES, + averify_password, create_access_token, - verify_password, ) -from decnet.web.dependencies import repo +from decnet.web.dependencies import get_user_by_username_cached from decnet.web.db.models import LoginRequest, Token router = APIRouter() @@ -24,9 +25,10 @@ router = APIRouter() 422: {"description": "Validation error"} }, ) +@_traced("api.login") async def login(request: LoginRequest) -> dict[str, Any]: - _user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username) - if not _user or not verify_password(request.password, _user["password_hash"]): + _user: Optional[dict[str, Any]] = await get_user_by_username_cached(request.username) + if not _user or not await averify_password(request.password, _user["password_hash"]): raise HTTPException( status_code=status.HTTP_401_UNAUTHORIZED, detail="Incorrect username or password", @@ -40,6 +42,6 @@ async def login(request: LoginRequest) -> dict[str, Any]: ) return { "access_token": _access_token, - "token_type": "bearer", # nosec B105 + "token_type": "bearer", # nosec B105 — OAuth2 token type, not a password 
"must_change_password": bool(_user.get("must_change_password", False)) } diff --git a/decnet/web/router/bounty/api_get_bounties.py b/decnet/web/router/bounty/api_get_bounties.py index 5ff7fd2..5560181 100644 --- a/decnet/web/router/bounty/api_get_bounties.py +++ b/decnet/web/router/bounty/api_get_bounties.py @@ -1,21 +1,62 @@ +import asyncio +import time from typing import Any, Optional from fastapi import APIRouter, Depends, Query -from decnet.web.dependencies import get_current_user, repo +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo from decnet.web.db.models import BountyResponse router = APIRouter() +# Cache the unfiltered default page — the UI/locust hit this constantly +# with no params. Filtered requests (bounty_type/search) bypass: rare +# and staleness matters for search. +_BOUNTY_TTL = 5.0 +_DEFAULT_LIMIT = 50 +_DEFAULT_OFFSET = 0 +_bounty_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0) +_bounty_lock: Optional[asyncio.Lock] = None + + +def _reset_bounty_cache() -> None: + global _bounty_cache, _bounty_lock + _bounty_cache = (None, 0.0) + _bounty_lock = None + + +async def _get_bounty_default_cached() -> dict[str, Any]: + global _bounty_cache, _bounty_lock + value, ts = _bounty_cache + now = time.monotonic() + if value is not None and now - ts < _BOUNTY_TTL: + return value + if _bounty_lock is None: + _bounty_lock = asyncio.Lock() + async with _bounty_lock: + value, ts = _bounty_cache + now = time.monotonic() + if value is not None and now - ts < _BOUNTY_TTL: + return value + _data = await repo.get_bounties( + limit=_DEFAULT_LIMIT, offset=_DEFAULT_OFFSET, bounty_type=None, search=None, + ) + _total = await repo.get_total_bounties(bounty_type=None, search=None) + value = {"total": _total, "limit": _DEFAULT_LIMIT, "offset": _DEFAULT_OFFSET, "data": _data} + _bounty_cache = (value, time.monotonic()) + return value + @router.get("/bounty", response_model=BountyResponse, tags=["Bounty 
Vault"], - responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) + responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},) +@_traced("api.get_bounties") async def get_bounties( limit: int = Query(50, ge=1, le=1000), offset: int = Query(0, ge=0, le=2147483647), bounty_type: Optional[str] = None, search: Optional[str] = None, - current_user: str = Depends(get_current_user) + user: dict = Depends(require_viewer) ) -> dict[str, Any]: """Retrieve collected bounties (harvested credentials, payloads, etc.).""" def _norm(v: Optional[str]) -> Optional[str]: @@ -26,6 +67,9 @@ async def get_bounties( bt = _norm(bounty_type) s = _norm(search) + if bt is None and s is None and limit == _DEFAULT_LIMIT and offset == _DEFAULT_OFFSET: + return await _get_bounty_default_cached() + _data = await repo.get_bounties(limit=limit, offset=offset, bounty_type=bt, search=s) _total = await repo.get_total_bounties(bounty_type=bt, search=s) return { diff --git a/decnet/web/router/config/__init__.py b/decnet/web/router/config/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/decnet/web/router/config/api_get_config.py b/decnet/web/router/config/api_get_config.py new file mode 100644 index 0000000..d21f474 --- /dev/null +++ b/decnet/web/router/config/api_get_config.py @@ -0,0 +1,124 @@ +import asyncio +import time +from typing import Any, Optional + +from fastapi import APIRouter, Depends + +from decnet.env import DECNET_DEVELOPER +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo +from decnet.web.db.models import UserResponse + +router = APIRouter() + +_DEFAULT_DEPLOYMENT_LIMIT = 10 +_DEFAULT_MUTATION_INTERVAL = "30m" + +# Cache config_limits / config_globals reads — these change on rare admin +# writes but get polled constantly by the UI and locust. 
+_STATE_TTL = 5.0 +_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {} +_state_locks: dict[str, asyncio.Lock] = {} + +# Admin branch fetched repo.list_users() on every /config call — cache 5s, +# invalidate on user create/update/delete so the admin UI stays consistent. +_USERS_TTL = 5.0 +_users_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0) +_users_lock: Optional[asyncio.Lock] = None + + +def _reset_state_cache() -> None: + """Reset cached config state — used by tests.""" + global _users_cache, _users_lock + _state_cache.clear() + # Drop any locks bound to the previous event loop — reusing one from + # a dead loop deadlocks the next test. + _state_locks.clear() + _users_cache = (None, 0.0) + _users_lock = None + + +def invalidate_list_users_cache() -> None: + global _users_cache + _users_cache = (None, 0.0) + + +async def _get_list_users_cached() -> list[dict[str, Any]]: + global _users_cache, _users_lock + value, ts = _users_cache + now = time.monotonic() + if value is not None and now - ts < _USERS_TTL: + return value + if _users_lock is None: + _users_lock = asyncio.Lock() + async with _users_lock: + value, ts = _users_cache + now = time.monotonic() + if value is not None and now - ts < _USERS_TTL: + return value + value = await repo.list_users() + _users_cache = (value, time.monotonic()) + return value + + +async def _get_state_cached(name: str) -> Optional[dict[str, Any]]: + entry = _state_cache.get(name) + now = time.monotonic() + if entry is not None and now - entry[1] < _STATE_TTL: + return entry[0] + lock = _state_locks.setdefault(name, asyncio.Lock()) + async with lock: + entry = _state_cache.get(name) + now = time.monotonic() + if entry is not None and now - entry[1] < _STATE_TTL: + return entry[0] + value = await repo.get_state(name) + _state_cache[name] = (value, time.monotonic()) + return value + + +@router.get( + "/config", + tags=["Configuration"], + responses={ + 401: {"description": "Could not validate 
credentials"}, + 403: {"description": "Insufficient permissions"}, + }, +) +@_traced("api.get_config") +async def api_get_config(user: dict = Depends(require_viewer)) -> dict: + limits_state = await _get_state_cached("config_limits") + globals_state = await _get_state_cached("config_globals") + + deployment_limit = ( + limits_state.get("deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT) + if limits_state + else _DEFAULT_DEPLOYMENT_LIMIT + ) + global_mutation_interval = ( + globals_state.get("global_mutation_interval", _DEFAULT_MUTATION_INTERVAL) + if globals_state + else _DEFAULT_MUTATION_INTERVAL + ) + + base = { + "role": user["role"], + "deployment_limit": deployment_limit, + "global_mutation_interval": global_mutation_interval, + } + + if user["role"] == "admin": + all_users = await _get_list_users_cached() + base["users"] = [ + UserResponse( + uuid=u["uuid"], + username=u["username"], + role=u["role"], + must_change_password=u["must_change_password"], + ).model_dump() + for u in all_users + ] + if DECNET_DEVELOPER: + base["developer_mode"] = True + + return base diff --git a/decnet/web/router/config/api_manage_users.py b/decnet/web/router/config/api_manage_users.py new file mode 100644 index 0000000..70e0fe9 --- /dev/null +++ b/decnet/web/router/config/api_manage_users.py @@ -0,0 +1,139 @@ +import uuid as _uuid + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.telemetry import traced as _traced +from decnet.web.auth import ahash_password +from decnet.web.dependencies import require_admin, invalidate_user_cache, repo +from decnet.web.router.config.api_get_config import invalidate_list_users_cache +from decnet.web.db.models import ( + CreateUserRequest, + UpdateUserRoleRequest, + ResetUserPasswordRequest, + UserResponse, +) + +router = APIRouter() + + +@router.post( + "/config/users", + tags=["Configuration"], + responses={ + 400: {"description": "Bad Request (e.g. 
malformed JSON)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required"}, + 409: {"description": "Username already exists"}, + 422: {"description": "Validation error"}, + }, +) +@_traced("api.create_user") +async def api_create_user( + req: CreateUserRequest, + admin: dict = Depends(require_admin), +) -> UserResponse: + existing = await repo.get_user_by_username(req.username) + if existing: + raise HTTPException(status_code=409, detail="Username already exists") + + user_uuid = str(_uuid.uuid4()) + await repo.create_user({ + "uuid": user_uuid, + "username": req.username, + "password_hash": await ahash_password(req.password), + "role": req.role, + "must_change_password": True, # nosec B105 — not a password + }) + invalidate_list_users_cache() + return UserResponse( + uuid=user_uuid, + username=req.username, + role=req.role, + must_change_password=True, + ) + + +@router.delete( + "/config/users/{user_uuid}", + tags=["Configuration"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required / cannot delete self"}, + 404: {"description": "User not found"}, + }, +) +@_traced("api.delete_user") +async def api_delete_user( + user_uuid: str, + admin: dict = Depends(require_admin), +) -> dict[str, str]: + if user_uuid == admin["uuid"]: + raise HTTPException(status_code=403, detail="Cannot delete your own account") + + deleted = await repo.delete_user(user_uuid) + if not deleted: + raise HTTPException(status_code=404, detail="User not found") + invalidate_user_cache(user_uuid) + invalidate_list_users_cache() + return {"message": "User deleted"} + + +@router.put( + "/config/users/{user_uuid}/role", + tags=["Configuration"], + responses={ + 400: {"description": "Bad Request (e.g. 
malformed JSON)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required / cannot change own role"}, + 404: {"description": "User not found"}, + 422: {"description": "Validation error"}, + }, +) +@_traced("api.update_user_role") +async def api_update_user_role( + user_uuid: str, + req: UpdateUserRoleRequest, + admin: dict = Depends(require_admin), +) -> dict[str, str]: + if user_uuid == admin["uuid"]: + raise HTTPException(status_code=403, detail="Cannot change your own role") + + target = await repo.get_user_by_uuid(user_uuid) + if not target: + raise HTTPException(status_code=404, detail="User not found") + + await repo.update_user_role(user_uuid, req.role) + invalidate_user_cache(user_uuid) + invalidate_list_users_cache() + return {"message": "User role updated"} + + +@router.put( + "/config/users/{user_uuid}/reset-password", + tags=["Configuration"], + responses={ + 400: {"description": "Bad Request (e.g. malformed JSON)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required"}, + 404: {"description": "User not found"}, + 422: {"description": "Validation error"}, + }, +) +@_traced("api.reset_user_password") +async def api_reset_user_password( + user_uuid: str, + req: ResetUserPasswordRequest, + admin: dict = Depends(require_admin), +) -> dict[str, str]: + target = await repo.get_user_by_uuid(user_uuid) + if not target: + raise HTTPException(status_code=404, detail="User not found") + + await repo.update_user_password( + user_uuid, + await ahash_password(req.new_password), + must_change_password=True, + ) + invalidate_user_cache(user_uuid) + invalidate_list_users_cache() + return {"message": "Password reset successfully"} diff --git a/decnet/web/router/config/api_reinit.py b/decnet/web/router/config/api_reinit.py new file mode 100644 index 0000000..ebdd1c7 --- /dev/null +++ b/decnet/web/router/config/api_reinit.py @@ -0,0 +1,27 @@ +from fastapi import APIRouter, 
Depends, HTTPException + +from decnet.env import DECNET_DEVELOPER +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_admin, repo + +router = APIRouter() + + +@router.delete( + "/config/reinit", + tags=["Configuration"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required or developer mode not enabled"}, + }, +) +@_traced("api.reinit") +async def api_reinit(admin: dict = Depends(require_admin)) -> dict: + if not DECNET_DEVELOPER: + raise HTTPException(status_code=403, detail="Developer mode is not enabled") + + counts = await repo.purge_logs_and_bounties() + return { + "message": "Data purged", + "deleted": counts, + } diff --git a/decnet/web/router/config/api_update_config.py b/decnet/web/router/config/api_update_config.py new file mode 100644 index 0000000..a7feee3 --- /dev/null +++ b/decnet/web/router/config/api_update_config.py @@ -0,0 +1,48 @@ +from fastapi import APIRouter, Depends + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_admin, repo +from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest + +router = APIRouter() + + +@router.put( + "/config/deployment-limit", + tags=["Configuration"], + responses={ + 400: {"description": "Bad Request (e.g. malformed JSON)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required"}, + 422: {"description": "Validation error"}, + }, +) +@_traced("api.update_deployment_limit") +async def api_update_deployment_limit( + req: DeploymentLimitRequest, + admin: dict = Depends(require_admin), +) -> dict[str, str]: + await repo.set_state("config_limits", {"deployment_limit": req.deployment_limit}) + return {"message": "Deployment limit updated"} + + +@router.put( + "/config/global-mutation-interval", + tags=["Configuration"], + responses={ + 400: {"description": "Bad Request (e.g. 
malformed JSON)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Admin access required"}, + 422: {"description": "Validation error"}, + }, +) +@_traced("api.update_global_mutation_interval") +async def api_update_global_mutation_interval( + req: GlobalMutationIntervalRequest, + admin: dict = Depends(require_admin), +) -> dict[str, str]: + await repo.set_state( + "config_globals", + {"global_mutation_interval": req.global_mutation_interval}, + ) + return {"message": "Global mutation interval updated"} diff --git a/decnet/web/router/fleet/api_deploy_deckies.py b/decnet/web/router/fleet/api_deploy_deckies.py index 914a64c..5371ef7 100644 --- a/decnet/web/router/fleet/api_deploy_deckies.py +++ b/decnet/web/router/fleet/api_deploy_deckies.py @@ -1,14 +1,18 @@ -import logging import os from fastapi import APIRouter, Depends, HTTPException -from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT, log +from decnet.logging import get_logger +from decnet.telemetry import traced as _traced +from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT from decnet.engine import deploy as _deploy from decnet.ini_loader import load_ini_from_string from decnet.network import detect_interface, detect_subnet, get_host_ip -from decnet.web.dependencies import get_current_user, repo +from decnet.web.dependencies import require_admin, repo from decnet.web.db.models import DeployIniRequest +from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config + +log = get_logger("api") router = APIRouter() @@ -19,12 +23,15 @@ router = APIRouter() responses={ 400: {"description": "Bad Request (e.g. malformed JSON)"}, 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, 409: {"description": "Configuration conflict (e.g. 
invalid IP allocation or network mismatch)"}, 422: {"description": "Invalid INI config or schema validation error"}, - 500: {"description": "Deployment failed"} + 500: {"description": "Deployment failed"}, + 502: {"description": "Partial swarm deploy failure — one or more worker hosts returned an error"}, } ) -async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]: +@_traced("api.deploy_deckies") +async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]: from decnet.fleet import build_deckies_from_ini try: @@ -38,16 +45,20 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends( state_dict = await repo.get_state("deployment") ingest_log_file = os.environ.get("DECNET_INGEST_LOG_FILE") + config: DecnetConfig | None = None if state_dict: config = DecnetConfig(**state_dict["config"]) subnet_cidr = ini.subnet or config.subnet gateway = ini.gateway or config.gateway - host_ip = get_host_ip(config.interface) + iface = config.interface + host_ip = get_host_ip(iface) # Always sync config log_file with current API ingestion target if ingest_log_file: config.log_file = ingest_log_file else: - # If no state exists, we need to infer network details from the INI or the host. + # No state yet — infer network details from the INI or the host. We + # defer instantiating DecnetConfig until after build_deckies_from_ini + # because DecnetConfig.deckies has min_length=1. try: iface = ini.interface or detect_interface() subnet_cidr, gateway = ini.subnet, ini.gateway @@ -62,16 +73,6 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends( detail=f"Network configuration conflict: {e}. " "Add a [general] section with interface=, net=, and gw= to the INI." 
) - config = DecnetConfig( - mode="unihost", - interface=iface, - subnet=subnet_cidr, - gateway=gateway, - deckies=[], - log_file=ingest_log_file, - ipvlan=False, - mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL - ) try: new_decky_configs = build_deckies_from_ini( @@ -81,26 +82,94 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends( log.debug("deploy: build_deckies_from_ini rejected input: %s", e) raise HTTPException(status_code=409, detail=str(e)) - # Merge deckies - existing_deckies_map = {d.name: d for d in config.deckies} - for new_decky in new_decky_configs: - existing_deckies_map[new_decky.name] = new_decky + if config is None: + config = DecnetConfig( + mode="unihost", + interface=iface, + subnet=subnet_cidr, + gateway=gateway, + deckies=new_decky_configs, + log_file=ingest_log_file, + ipvlan=False, + mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL, + ) - config.deckies = list(existing_deckies_map.values()) + # The INI is the source of truth for *which* deckies exist this deploy. + # The old "merge with prior state" behaviour meant submitting `[decky1]` + # after a 3-decky run silently redeployed decky2/decky3 too — and then + # collided on their stale IPs ("Address already in use"). Full replace + # matches what the operator sees in the submitted config. + config.deckies = list(new_decky_configs) - # We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`. 
+ limits_state = await repo.get_state("config_limits") + deployment_limit = limits_state.get("deployment_limit", 10) if limits_state else 10 + if len(config.deckies) > deployment_limit: + raise HTTPException( + status_code=409, + detail=f"Deployment would result in {len(config.deckies)} deckies, " + f"exceeding the configured limit of {deployment_limit}", + ) + + # Auto-mode: if we're a master with at least one enrolled/active SWARM + # host, shard the deckies across those workers instead of spawning docker + # containers on the master itself. Round-robin assignment over deckies + # that don't already carry a host_uuid (state from a prior swarm deploy + # keeps its original assignment). + swarm_hosts: list[dict] = [] + if os.environ.get("DECNET_MODE", "master").lower() == "master": + swarm_hosts = [ + h for h in await repo.list_swarm_hosts() + if h.get("status") in ("active", "enrolled") and h.get("address") + ] + + if swarm_hosts: + # Carry-over from a prior deployment may reference a host_uuid that's + # since been decommissioned / re-enrolled at a new uuid. Drop any + # assignment that isn't in the currently-reachable set, then round- + # robin-fill the blanks — otherwise dispatch 404s on a dead uuid. + live_uuids = {h["uuid"] for h in swarm_hosts} + for d in config.deckies: + if d.host_uuid and d.host_uuid not in live_uuids: + d.host_uuid = None + unassigned = [d for d in config.deckies if not d.host_uuid] + for i, d in enumerate(unassigned): + d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"] + config = config.model_copy(update={"mode": "swarm"}) + + try: + result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False) + except HTTPException: + raise + except Exception as e: + log.exception("swarm-auto deploy dispatch failed: %s", e) + raise HTTPException(status_code=500, detail="Swarm dispatch failed. 
Check server logs.") + + await repo.set_state("deployment", { + "config": config.model_dump(), + "compose_path": state_dict["compose_path"] if state_dict else "", + }) + + failed = [r for r in result.results if not r.ok] + if failed: + detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed) + raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}") + return { + "message": f"Deckies deployed across {len(result.results)} swarm host(s)", + "mode": "swarm", + } + + # Unihost path — docker-compose on the master itself. try: if os.environ.get("DECNET_CONTRACT_TEST") != "true": _deploy(config) - # Persist new state to DB new_state_payload = { "config": config.model_dump(), "compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"] } await repo.set_state("deployment", new_state_payload) except Exception as e: - logging.getLogger("decnet.web.api").exception("Deployment failed: %s", e) + log.exception("Deployment failed: %s", e) raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.") - return {"message": "Deckies deployed successfully"} + return {"message": "Deckies deployed successfully", "mode": "unihost"} diff --git a/decnet/web/router/fleet/api_get_deckies.py b/decnet/web/router/fleet/api_get_deckies.py index 7353373..593ff4e 100644 --- a/decnet/web/router/fleet/api_get_deckies.py +++ b/decnet/web/router/fleet/api_get_deckies.py @@ -1,13 +1,48 @@ -from typing import Any +import asyncio +import time +from typing import Any, Optional from fastapi import APIRouter, Depends -from decnet.web.dependencies import get_current_user, repo +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo router = APIRouter() +# /deckies is full fleet inventory — polled by the UI and under locust. +# Fleet state changes on deploy/teardown (seconds to minutes); a 5s window +# collapses the read storm into one DB hit. 
+_DECKIES_TTL = 5.0 +_deckies_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0) +_deckies_lock: Optional[asyncio.Lock] = None + + +def _reset_deckies_cache() -> None: + global _deckies_cache, _deckies_lock + _deckies_cache = (None, 0.0) + _deckies_lock = None + + +async def _get_deckies_cached() -> list[dict[str, Any]]: + global _deckies_cache, _deckies_lock + value, ts = _deckies_cache + now = time.monotonic() + if value is not None and now - ts < _DECKIES_TTL: + return value + if _deckies_lock is None: + _deckies_lock = asyncio.Lock() + async with _deckies_lock: + value, ts = _deckies_cache + now = time.monotonic() + if value is not None and now - ts < _DECKIES_TTL: + return value + value = await repo.get_deckies() + _deckies_cache = (value, time.monotonic()) + return value + @router.get("/deckies", tags=["Fleet Management"], - responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) -async def get_deckies(current_user: str = Depends(get_current_user)) -> list[dict[str, Any]]: - return await repo.get_deckies() + responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},) +@_traced("api.get_deckies") +async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]: + return await _get_deckies_cached() diff --git a/decnet/web/router/fleet/api_mutate_decky.py b/decnet/web/router/fleet/api_mutate_decky.py index e3facc6..7f2e095 100644 --- a/decnet/web/router/fleet/api_mutate_decky.py +++ b/decnet/web/router/fleet/api_mutate_decky.py @@ -1,8 +1,9 @@ import os from fastapi import APIRouter, Depends, HTTPException, Path +from decnet.telemetry import traced as _traced from decnet.mutator import mutate_decky -from decnet.web.dependencies import get_current_user, repo +from decnet.web.dependencies import require_admin, repo router = APIRouter() @@ -10,11 +11,17 @@ router = APIRouter() 
@router.post( "/deckies/{decky_name}/mutate", tags=["Fleet Management"], - responses={401: {"description": "Could not validate credentials"}, 404: {"description": "Decky not found"}} + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Decky not found"}, + 422: {"description": "Path parameter validation error (decky_name must match ^[a-z0-9\\-]{1,64}$)"}, + } ) +@_traced("api.mutate_decky") async def api_mutate_decky( decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"), - current_user: str = Depends(get_current_user), + admin: dict = Depends(require_admin), ) -> dict[str, str]: if os.environ.get("DECNET_CONTRACT_TEST") == "true": return {"message": f"Successfully mutated {decky_name} (Contract Test Mock)"} diff --git a/decnet/web/router/fleet/api_mutate_interval.py b/decnet/web/router/fleet/api_mutate_interval.py index f437340..10afba9 100644 --- a/decnet/web/router/fleet/api_mutate_interval.py +++ b/decnet/web/router/fleet/api_mutate_interval.py @@ -1,7 +1,8 @@ from fastapi import APIRouter, Depends, HTTPException +from decnet.telemetry import traced as _traced from decnet.config import DecnetConfig -from decnet.web.dependencies import get_current_user, repo +from decnet.web.dependencies import require_admin, repo from decnet.web.db.models import MutateIntervalRequest router = APIRouter() @@ -19,11 +20,13 @@ def _parse_duration(s: str) -> int: responses={ 400: {"description": "Bad Request (e.g. 
malformed JSON)"}, 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, 404: {"description": "No active deployment or decky not found"}, 422: {"description": "Validation error"} }, ) -async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]: +@_traced("api.update_mutate_interval") +async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]: state_dict = await repo.get_state("deployment") if not state_dict: raise HTTPException(status_code=404, detail="No active deployment") diff --git a/decnet/web/router/health/__init__.py b/decnet/web/router/health/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/decnet/web/router/health/api_get_health.py b/decnet/web/router/health/api_get_health.py new file mode 100644 index 0000000..056519f --- /dev/null +++ b/decnet/web/router/health/api_get_health.py @@ -0,0 +1,151 @@ +import asyncio +import time +from typing import Any, Optional + +from fastapi import APIRouter, Depends +from fastapi.responses import ORJSONResponse + +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo +from decnet.web.db.models import HealthResponse, ComponentHealth + +router = APIRouter() + +_CRITICAL_SERVICES = {"database", "docker", "ingestion_worker"} + +# Cache Docker client and health result to avoid hammering the Docker socket +_docker_client: Optional[Any] = None +_docker_healthy: bool = False +_docker_detail: str = "" +_docker_last_check: float = 0.0 +_DOCKER_CHECK_INTERVAL = 5.0 # seconds between actual Docker pings + +# Cache DB liveness result — under load, every request was hitting +# repo.get_total_logs() and filling the aiosqlite queue. 
+_db_component: Optional[ComponentHealth] = None +_db_last_check: float = 0.0 +# Lazy-init — an asyncio.Lock bound to a dead event loop deadlocks any +# later test running under a fresh loop. Create on first use. +_db_lock: Optional[asyncio.Lock] = None +_DB_CHECK_INTERVAL = 1.0 # seconds + + +def _reset_docker_cache() -> None: + """Reset cached Docker state — used by tests.""" + global _docker_client, _docker_healthy, _docker_detail, _docker_last_check + _docker_client = None + _docker_healthy = False + _docker_detail = "" + _docker_last_check = 0.0 + + +def _reset_db_cache() -> None: + """Reset cached DB liveness — used by tests.""" + global _db_component, _db_last_check, _db_lock + _db_component = None + _db_last_check = 0.0 + _db_lock = None + + +async def _check_database_cached() -> ComponentHealth: + global _db_component, _db_last_check, _db_lock + now = time.monotonic() + if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL: + return _db_component + if _db_lock is None: + _db_lock = asyncio.Lock() + async with _db_lock: + now = time.monotonic() + if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL: + return _db_component + try: + await repo.get_total_logs() + _db_component = ComponentHealth(status="ok") + except Exception as exc: + _db_component = ComponentHealth(status="failing", detail=str(exc)) + _db_last_check = time.monotonic() + return _db_component + + +@router.get( + "/health", + response_model=HealthResponse, + tags=["Observability"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 503: {"model": HealthResponse, "description": "System unhealthy"}, + }, +) +@_traced("api.get_health") +async def get_health(user: dict = Depends(require_viewer)) -> Any: + components: dict[str, ComponentHealth] = {} + + # 1. Database (cached — avoids a DB round-trip per request) + components["database"] = await _check_database_cached() + + # 2. 
Background workers + from decnet.web.api import get_background_tasks + for name, task in get_background_tasks().items(): + if task is None: + components[name] = ComponentHealth(status="failing", detail="not started") + elif task.done(): + if task.cancelled(): + detail = "cancelled" + else: + exc = task.exception() + detail = f"exited: {exc}" if exc else "exited unexpectedly" + components[name] = ComponentHealth(status="failing", detail=detail) + else: + components[name] = ComponentHealth(status="ok") + + # 3. Docker daemon (cached — avoids creating a new client per request) + global _docker_client, _docker_healthy, _docker_detail, _docker_last_check + now = time.monotonic() + if now - _docker_last_check > _DOCKER_CHECK_INTERVAL: + try: + import docker + + if _docker_client is None: + _docker_client = await asyncio.to_thread(docker.from_env) + await asyncio.to_thread(_docker_client.ping) + _docker_healthy = True + _docker_detail = "" + except Exception as exc: + _docker_client = None + _docker_healthy = False + _docker_detail = str(exc) + _docker_last_check = now + + if _docker_healthy: + components["docker"] = ComponentHealth(status="ok") + else: + components["docker"] = ComponentHealth(status="failing", detail=_docker_detail) + + # Overall status tiers: + # healthy — every component ok + # degraded — only non-critical components failing (service usable, + # falls back to cache or skips non-essential work) + # unhealthy — a critical component (db, docker, ingestion) failing; + # survival depends on caches + critical_failing = any( + c.status == "failing" + for name, c in components.items() + if name in _CRITICAL_SERVICES + ) + noncritical_failing = any( + c.status == "failing" + for name, c in components.items() + if name not in _CRITICAL_SERVICES + ) + + if critical_failing: + overall = "unhealthy" + elif noncritical_failing: + overall = "degraded" + else: + overall = "healthy" + + result = HealthResponse(status=overall, components=components) + status_code = 503 
if overall == "unhealthy" else 200 + return ORJSONResponse(content=result.model_dump(), status_code=status_code) diff --git a/decnet/web/router/logs/api_get_histogram.py b/decnet/web/router/logs/api_get_histogram.py index 6e6d877..c334987 100644 --- a/decnet/web/router/logs/api_get_histogram.py +++ b/decnet/web/router/logs/api_get_histogram.py @@ -1,20 +1,58 @@ +import asyncio +import time from typing import Any, Optional from fastapi import APIRouter, Depends, Query -from decnet.web.dependencies import get_current_user, repo +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo router = APIRouter() +# /logs/histogram aggregates over the full logs table — expensive and +# polled constantly by the UI. Cache only the unfiltered default call +# (which is what the UI and locust hit); any filter bypasses. +_HISTOGRAM_TTL = 5.0 +_DEFAULT_INTERVAL = 15 +_histogram_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0) +_histogram_lock: Optional[asyncio.Lock] = None + + +def _reset_histogram_cache() -> None: + global _histogram_cache, _histogram_lock + _histogram_cache = (None, 0.0) + _histogram_lock = None + + +async def _get_histogram_cached() -> list[dict[str, Any]]: + global _histogram_cache, _histogram_lock + value, ts = _histogram_cache + now = time.monotonic() + if value is not None and now - ts < _HISTOGRAM_TTL: + return value + if _histogram_lock is None: + _histogram_lock = asyncio.Lock() + async with _histogram_lock: + value, ts = _histogram_cache + now = time.monotonic() + if value is not None and now - ts < _HISTOGRAM_TTL: + return value + value = await repo.get_log_histogram( + search=None, start_time=None, end_time=None, interval_minutes=_DEFAULT_INTERVAL, + ) + _histogram_cache = (value, time.monotonic()) + return value + @router.get("/logs/histogram", tags=["Logs"], - responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) + responses={401: 
{"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},) +@_traced("api.get_logs_histogram") async def get_logs_histogram( search: Optional[str] = None, start_time: Optional[str] = Query(None), end_time: Optional[str] = Query(None), interval_minutes: int = Query(15, ge=1), - current_user: str = Depends(get_current_user) + user: dict = Depends(require_viewer) ) -> list[dict[str, Any]]: def _norm(v: Optional[str]) -> Optional[str]: if v in (None, "null", "NULL", "undefined", ""): @@ -25,4 +63,6 @@ async def get_logs_histogram( st = _norm(start_time) et = _norm(end_time) + if s is None and st is None and et is None and interval_minutes == _DEFAULT_INTERVAL: + return await _get_histogram_cached() return await repo.get_log_histogram(search=s, start_time=st, end_time=et, interval_minutes=interval_minutes) diff --git a/decnet/web/router/logs/api_get_logs.py b/decnet/web/router/logs/api_get_logs.py index 2324c8c..8bd864b 100644 --- a/decnet/web/router/logs/api_get_logs.py +++ b/decnet/web/router/logs/api_get_logs.py @@ -1,22 +1,57 @@ +import asyncio +import time from typing import Any, Optional from fastapi import APIRouter, Depends, Query -from decnet.web.dependencies import get_current_user, repo +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo from decnet.web.db.models import LogsResponse router = APIRouter() +# Cache the unfiltered total-logs count. Filtered counts bypass the cache +# (rare, freshness matters for search). SELECT count(*) FROM logs is a +# full scan and gets hammered by paginating clients. 
+_TOTAL_TTL = 2.0 +_total_cache: tuple[Optional[int], float] = (None, 0.0) +_total_lock: Optional[asyncio.Lock] = None + + +def _reset_total_cache() -> None: + global _total_cache, _total_lock + _total_cache = (None, 0.0) + _total_lock = None + + +async def _get_total_logs_cached() -> int: + global _total_cache, _total_lock + value, ts = _total_cache + now = time.monotonic() + if value is not None and now - ts < _TOTAL_TTL: + return value + if _total_lock is None: + _total_lock = asyncio.Lock() + async with _total_lock: + value, ts = _total_cache + now = time.monotonic() + if value is not None and now - ts < _TOTAL_TTL: + return value + value = await repo.get_total_logs() + _total_cache = (value, time.monotonic()) + return value + @router.get("/logs", response_model=LogsResponse, tags=["Logs"], - responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}}) + responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}}) +@_traced("api.get_logs") async def get_logs( limit: int = Query(50, ge=1, le=1000), offset: int = Query(0, ge=0, le=2147483647), search: Optional[str] = Query(None, max_length=512), start_time: Optional[str] = Query(None), end_time: Optional[str] = Query(None), - current_user: str = Depends(get_current_user) + user: dict = Depends(require_viewer) ) -> dict[str, Any]: def _norm(v: Optional[str]) -> Optional[str]: if v in (None, "null", "NULL", "undefined", ""): @@ -28,7 +63,10 @@ async def get_logs( et = _norm(end_time) _logs: list[dict[str, Any]] = await repo.get_logs(limit=limit, offset=offset, search=s, start_time=st, end_time=et) - _total: int = await repo.get_total_logs(search=s, start_time=st, end_time=et) + if s is None and st is None and et is None: + _total: int = await _get_total_logs_cached() + else: + _total = await repo.get_total_logs(search=s, start_time=st, end_time=et) return { "total": 
_total, "limit": limit, diff --git a/decnet/web/router/stats/api_get_stats.py b/decnet/web/router/stats/api_get_stats.py index f72d8ad..474331d 100644 --- a/decnet/web/router/stats/api_get_stats.py +++ b/decnet/web/router/stats/api_get_stats.py @@ -1,14 +1,50 @@ -from typing import Any +import asyncio +import time +from typing import Any, Optional from fastapi import APIRouter, Depends -from decnet.web.dependencies import get_current_user, repo +from decnet.telemetry import traced as _traced +from decnet.web.dependencies import require_viewer, repo from decnet.web.db.models import StatsResponse router = APIRouter() +# /stats is aggregate telemetry polled constantly by the UI and locust. +# A 5s window collapses thousands of concurrent calls — each of which +# runs SELECT count(*) FROM logs + SELECT count(DISTINCT attacker_ip) — +# into one DB hit per window. +_STATS_TTL = 5.0 +_stats_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0) +_stats_lock: Optional[asyncio.Lock] = None + + +def _reset_stats_cache() -> None: + global _stats_cache, _stats_lock + _stats_cache = (None, 0.0) + _stats_lock = None + + +async def _get_stats_cached() -> dict[str, Any]: + global _stats_cache, _stats_lock + value, ts = _stats_cache + now = time.monotonic() + if value is not None and now - ts < _STATS_TTL: + return value + if _stats_lock is None: + _stats_lock = asyncio.Lock() + async with _stats_lock: + value, ts = _stats_cache + now = time.monotonic() + if value is not None and now - ts < _STATS_TTL: + return value + value = await repo.get_stats_summary() + _stats_cache = (value, time.monotonic()) + return value + @router.get("/stats", response_model=StatsResponse, tags=["Observability"], - responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},) -async def get_stats(current_user: str = Depends(get_current_user)) -> dict[str, Any]: - return await repo.get_stats_summary() + responses={401: {"description": "Could not 
validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},) +@_traced("api.get_stats") +async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]: + return await _get_stats_cached() diff --git a/decnet/web/router/stream/api_stream_events.py b/decnet/web/router/stream/api_stream_events.py index 0690b6a..f463703 100644 --- a/decnet/web/router/stream/api_stream_events.py +++ b/decnet/web/router/stream/api_stream_events.py @@ -1,19 +1,49 @@ -import json import asyncio -import logging + +import orjson from typing import AsyncGenerator, Optional from fastapi import APIRouter, Depends, Query, Request from fastapi.responses import StreamingResponse from decnet.env import DECNET_DEVELOPER -from decnet.web.dependencies import get_stream_user, repo +from decnet.logging import get_logger +from decnet.telemetry import traced as _traced, get_tracer as _get_tracer +from decnet.web.dependencies import require_stream_viewer, repo -log = logging.getLogger(__name__) +log = get_logger("api") router = APIRouter() +def _build_trace_links(logs: list[dict]) -> list: + """Build OTEL span links from persisted trace_id/span_id in log rows. + + Returns an empty list when tracing is disabled (no OTEL imports). 
+ """ + try: + from opentelemetry.trace import Link, SpanContext, TraceFlags + except ImportError: + return [] + links: list[Link] = [] + for entry in logs: + tid = entry.get("trace_id") + sid = entry.get("span_id") + if not tid or not sid or tid == "0": + continue + try: + ctx = SpanContext( + trace_id=int(tid, 16), + span_id=int(sid, 16), + is_remote=True, + trace_flags=TraceFlags(TraceFlags.SAMPLED), + ) + links.append(Link(ctx)) + except (ValueError, TypeError): + continue + return links + + @router.get("/stream", tags=["Observability"], responses={ 200: { @@ -21,9 +51,11 @@ router = APIRouter() "description": "Real-time Server-Sent Events (SSE) stream" }, 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"} }, ) +@_traced("api.stream_events") async def stream_events( request: Request, last_event_id: int = Query(0, alias="lastEventId"), @@ -31,26 +63,33 @@ async def stream_events( start_time: Optional[str] = None, end_time: Optional[str] = None, max_output: Optional[int] = Query(None, alias="maxOutput"), - current_user: str = Depends(get_stream_user) + user: dict = Depends(require_stream_viewer) ) -> StreamingResponse: + # Prefetch the initial snapshot before entering the streaming generator. + # With asyncmy (pure async TCP I/O), the first DB await inside the generator + # fires immediately after the ASGI layer sends the keepalive chunk — the HTTP + # write and the MySQL read compete for asyncio I/O callbacks and the MySQL + # callback can stall. Running these here (normal async context, no streaming) + # avoids that race entirely. aiosqlite is immune because it runs SQLite in a + # thread, decoupled from the event loop's I/O scheduler. 
+ _start_id = last_event_id if last_event_id != 0 else await repo.get_max_log_id() + _initial_stats = await repo.get_stats_summary() + _initial_histogram = await repo.get_log_histogram( + search=search, start_time=start_time, end_time=end_time, interval_minutes=15, + ) + async def event_generator() -> AsyncGenerator[str, None]: - last_id = last_event_id + last_id = _start_id stats_interval_sec = 10 loops_since_stats = 0 emitted_chunks = 0 try: - if last_id == 0: - last_id = await repo.get_max_log_id() + yield ": keepalive\n\n" # flush headers immediately - # Emit initial snapshot immediately so the client never needs to poll /stats - stats = await repo.get_stats_summary() - yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n" - histogram = await repo.get_log_histogram( - search=search, start_time=start_time, - end_time=end_time, interval_minutes=15, - ) - yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n" + # Emit pre-fetched initial snapshot — no DB calls in generator until the loop + yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': _initial_stats}).decode()}\n\n" + yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': _initial_histogram}).decode()}\n\n" while True: if DECNET_DEVELOPER and max_output is not None: @@ -68,17 +107,25 @@ async def stream_events( ) if new_logs: last_id = max(entry["id"] for entry in new_logs) - yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n" + # Create a span linking back to the ingestion traces + # stored in each log row, closing the pipeline gap. 
+ _links = _build_trace_links(new_logs) + _tracer = _get_tracer("sse") + with _tracer.start_as_current_span( + "sse.emit_logs", links=_links, + attributes={"log_count": len(new_logs)}, + ): + yield f"event: message\ndata: {orjson.dumps({'type': 'logs', 'data': new_logs}).decode()}\n\n" loops_since_stats = stats_interval_sec if loops_since_stats >= stats_interval_sec: stats = await repo.get_stats_summary() - yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n" + yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': stats}).decode()}\n\n" histogram = await repo.get_log_histogram( search=search, start_time=start_time, end_time=end_time, interval_minutes=15, ) - yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n" + yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': histogram}).decode()}\n\n" loops_since_stats = 0 loops_since_stats += 1 @@ -88,6 +135,13 @@ async def stream_events( pass except Exception: log.exception("SSE stream error for user %s", last_event_id) - yield f"event: error\ndata: {json.dumps({'type': 'error', 'message': 'Stream interrupted'})}\n\n" + yield f"event: error\ndata: {orjson.dumps({'type': 'error', 'message': 'Stream interrupted'}).decode()}\n\n" - return StreamingResponse(event_generator(), media_type="text/event-stream") + return StreamingResponse( + event_generator(), + media_type="text/event-stream", + headers={ + "Cache-Control": "no-cache", + "X-Accel-Buffering": "no", + }, + ) diff --git a/decnet/web/router/swarm/__init__.py b/decnet/web/router/swarm/__init__.py new file mode 100644 index 0000000..7d3b4c2 --- /dev/null +++ b/decnet/web/router/swarm/__init__.py @@ -0,0 +1,47 @@ +"""Swarm controller routers. + +One file per endpoint, aggregated under the ``/swarm`` prefix. 
Mounted +onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate +process from the main DECNET API so swarm failures cannot cascade into +log ingestion / dashboard serving. +""" +from fastapi import APIRouter + +from .api_enroll_host import router as enroll_host_router +from .api_list_hosts import router as list_hosts_router +from .api_get_host import router as get_host_router +from .api_decommission_host import router as decommission_host_router +from .api_deploy_swarm import router as deploy_swarm_router +from .api_teardown_swarm import router as teardown_swarm_router +from .api_get_swarm_health import router as get_swarm_health_router +from .api_check_hosts import router as check_hosts_router +from .api_heartbeat import router as heartbeat_router +from .api_list_deckies import router as list_deckies_router + +swarm_router = APIRouter( + prefix="/swarm", + # Error responses that every swarm route can surface. Route-level + # `responses=` entries still override/extend these for route-specific + # codes (e.g. 409 on /enroll). 
+ responses={ + 400: {"description": "Malformed request"}, + 403: {"description": "Peer cert missing or fingerprint mismatch"}, + 404: {"description": "Referenced host does not exist"}, + }, +) + +# Hosts +swarm_router.include_router(enroll_host_router) +swarm_router.include_router(list_hosts_router) +swarm_router.include_router(get_host_router) +swarm_router.include_router(decommission_host_router) + +# Deployments +swarm_router.include_router(deploy_swarm_router) +swarm_router.include_router(teardown_swarm_router) +swarm_router.include_router(list_deckies_router) + +# Health +swarm_router.include_router(get_swarm_health_router) +swarm_router.include_router(check_hosts_router) +swarm_router.include_router(heartbeat_router) diff --git a/decnet/web/router/swarm/api_check_hosts.py b/decnet/web/router/swarm/api_check_hosts.py new file mode 100644 index 0000000..f058567 --- /dev/null +++ b/decnet/web/router/swarm/api_check_hosts.py @@ -0,0 +1,61 @@ +"""POST /swarm/check — active mTLS probe of every enrolled worker. + +Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based +on the outcome of the probe. 
+""" +from __future__ import annotations + +import asyncio +from datetime import datetime, timezone +from typing import Any + +from fastapi import APIRouter, Depends + +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo +from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth + +log = get_logger("swarm.check") + +router = APIRouter() + + +@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"]) +async def api_check_hosts( + repo: BaseRepository = Depends(get_repo), +) -> SwarmCheckResponse: + hosts = await repo.list_swarm_hosts() + + async def _probe(host: dict[str, Any]) -> SwarmHostHealth: + try: + async with AgentClient(host=host) as agent: + body = await agent.health() + await repo.update_swarm_host( + host["uuid"], + { + "status": "active", + "last_heartbeat": datetime.now(timezone.utc), + }, + ) + return SwarmHostHealth( + host_uuid=host["uuid"], + name=host["name"], + address=host["address"], + reachable=True, + detail=body, + ) + except Exception as exc: + log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc) + await repo.update_swarm_host(host["uuid"], {"status": "unreachable"}) + return SwarmHostHealth( + host_uuid=host["uuid"], + name=host["name"], + address=host["address"], + reachable=False, + detail=str(exc), + ) + + results = await asyncio.gather(*(_probe(h) for h in hosts)) + return SwarmCheckResponse(results=list(results)) diff --git a/decnet/web/router/swarm/api_decommission_host.py b/decnet/web/router/swarm/api_decommission_host.py new file mode 100644 index 0000000..7e6c669 --- /dev/null +++ b/decnet/web/router/swarm/api_decommission_host.py @@ -0,0 +1,63 @@ +"""DELETE /swarm/hosts/{uuid} — decommission a worker. 
+ +Removes the DeckyShard rows bound to the host (portable cascade — MySQL +and SQLite both honor it via the repo layer), deletes the SwarmHost row, +and best-effort-cleans the per-worker bundle directory on the master. + +Also asks the worker agent to wipe its own install (keeping logs). A +dead/unreachable worker does not block master-side cleanup. +""" +from __future__ import annotations + +import pathlib + +from fastapi import APIRouter, Depends, HTTPException, status + +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo + +log = get_logger("swarm.decommission") +router = APIRouter() + + +@router.delete( + "/hosts/{uuid}", + status_code=status.HTTP_204_NO_CONTENT, + tags=["Swarm Hosts"], + responses={404: {"description": "No host with this UUID is enrolled"}}, +) +async def api_decommission_host( + uuid: str, + repo: BaseRepository = Depends(get_repo), +) -> None: + row = await repo.get_swarm_host_by_uuid(uuid) + if row is None: + raise HTTPException(status_code=404, detail="host not found") + + try: + async with AgentClient(host=row) as agent: + await agent.self_destruct() + except Exception: + log.exception( + "decommission: self-destruct dispatch failed host=%s — " + "proceeding with master-side cleanup anyway", + row.get("name"), + ) + + await repo.delete_decky_shards_for_host(uuid) + await repo.delete_swarm_host(uuid) + + # Best-effort bundle cleanup; if the dir was moved manually, don't fail. 
+ bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "") + if bundle_dir.is_dir(): + for child in bundle_dir.iterdir(): + try: + child.unlink() + except OSError: + pass + try: + bundle_dir.rmdir() + except OSError: + pass diff --git a/decnet/web/router/swarm/api_deploy_swarm.py b/decnet/web/router/swarm/api_deploy_swarm.py new file mode 100644 index 0000000..1142df8 --- /dev/null +++ b/decnet/web/router/swarm/api_deploy_swarm.py @@ -0,0 +1,155 @@ +"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers. + +Per worker we build a filtered copy containing only the deckies assigned +to that worker (via ``host_uuid``), then POST it to the worker agent. +The caller is expected to have already set ``host_uuid`` on every decky; +if any decky arrives without one, we fail fast. Auto-sharding lives in +the CLI layer, not here. +""" +from __future__ import annotations + +import asyncio +import json +from datetime import datetime, timezone +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.config import DecnetConfig, DeckyConfig +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo +from decnet.web.db.models import ( + SwarmDeployRequest, + SwarmDeployResponse, + SwarmHostResult, +) + +log = get_logger("swarm.deploy") + +router = APIRouter() + + +def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]: + buckets: dict[str, list[DeckyConfig]] = {} + for d in config.deckies: + if not d.host_uuid: + raise HTTPException( + status_code=400, + detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch", + ) + buckets.setdefault(d.host_uuid, []).append(d) + return buckets + + +def _worker_config( + base: DecnetConfig, + shard: list[DeckyConfig], + host: dict[str, Any], +) -> DecnetConfig: + updates: dict[str, Any] = {"deckies": shard} + # Per-host driver 
opt-in (Wi-Fi-bridged VMs can't use macvlan — see + # SwarmHost.use_ipvlan). Never downgrade: if the operator picked ipvlan + # at the deploy level, keep it regardless of the per-host flag. + if host.get("use_ipvlan"): + updates["ipvlan"] = True + return base.model_copy(update=updates) + + +async def dispatch_decnet_config( + config: DecnetConfig, + repo: BaseRepository, + dry_run: bool = False, + no_cache: bool = False, +) -> SwarmDeployResponse: + """Shard ``config`` by ``host_uuid`` and dispatch to each worker in parallel. + + Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm + branch of POST /deckies/deploy. + """ + buckets = _shard_by_host(config) + + hosts: dict[str, dict[str, Any]] = {} + for host_uuid in buckets: + row = await repo.get_swarm_host_by_uuid(host_uuid) + if row is None: + raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}") + hosts[host_uuid] = row + + async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult: + host = hosts[host_uuid] + cfg = _worker_config(config, shard, host) + try: + async with AgentClient(host=host) as agent: + body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache) + for d in shard: + await repo.upsert_decky_shard( + { + "decky_name": d.name, + "host_uuid": host_uuid, + "services": json.dumps(d.services), + "decky_config": d.model_dump_json(), + "decky_ip": d.ip, + "state": "running" if not dry_run else "pending", + "last_error": None, + "updated_at": datetime.now(timezone.utc), + } + ) + await repo.update_swarm_host(host_uuid, {"status": "active"}) + return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body) + except Exception as exc: + log.exception("swarm.deploy dispatch failed host=%s", host["name"]) + # Compose-up is partial-success-friendly: one decky failing to + # build doesn't roll back the ones that already came up. 
Ask the + # agent which containers actually exist before painting the whole + # shard red — otherwise decky1 and decky2 look "failed" even + # though they're live on the worker. + runtime: dict[str, Any] = {} + try: + async with AgentClient(host=host) as probe: + snap = await probe.status() + runtime = snap.get("runtime") or {} + except Exception: + log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"]) + for d in shard: + rstate = runtime.get(d.name) or {} + is_up = bool(rstate.get("running")) + await repo.upsert_decky_shard( + { + "decky_name": d.name, + "host_uuid": host_uuid, + "services": json.dumps(d.services), + "decky_config": d.model_dump_json(), + "decky_ip": d.ip, + "state": "running" if is_up else "failed", + "last_error": None if is_up else str(exc)[:512], + "updated_at": datetime.now(timezone.utc), + } + ) + return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc)) + + results = await asyncio.gather( + *(_dispatch(uuid_, shard) for uuid_, shard in buckets.items()) + ) + return SwarmDeployResponse(results=list(results)) + + +@router.post( + "/deploy", + response_model=SwarmDeployResponse, + tags=["Swarm Deployments"], + responses={ + 400: {"description": "Deployment mode must be 'swarm'"}, + 404: {"description": "A referenced host_uuid is not enrolled"}, + }, +) +async def api_deploy_swarm( + req: SwarmDeployRequest, + repo: BaseRepository = Depends(get_repo), +) -> SwarmDeployResponse: + if req.config.mode != "swarm": + raise HTTPException(status_code=400, detail="mode must be 'swarm'") + return await dispatch_decnet_config( + req.config, repo, dry_run=req.dry_run, no_cache=req.no_cache + ) diff --git a/decnet/web/router/swarm/api_enroll_host.py b/decnet/web/router/swarm/api_enroll_host.py new file mode 100644 index 0000000..351a922 --- /dev/null +++ b/decnet/web/router/swarm/api_enroll_host.py @@ -0,0 +1,100 @@ +"""POST /swarm/enroll — issue a worker cert bundle and register 
the host. + +Enrollment is master-driven: the controller holds the CA private key, +generates a fresh worker keypair + CA-signed cert, and returns the full +bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.) +is outside this process's trust boundary. + +Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth +bootstrap endpoint, so nothing to attack before the worker is enrolled. +""" +from __future__ import annotations + +import uuid as _uuid +from datetime import datetime, timezone +from typing import Optional + +from fastapi import APIRouter, Depends, HTTPException, status + +from decnet.swarm import pki +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo +from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle + +router = APIRouter() + + +@router.post( + "/enroll", + response_model=SwarmEnrolledBundle, + status_code=status.HTTP_201_CREATED, + tags=["Swarm Hosts"], + responses={ + 400: {"description": "Bad Request (malformed JSON body)"}, + 409: {"description": "A worker with this name is already enrolled"}, + 422: {"description": "Request body validation error"}, + }, +) +async def api_enroll_host( + req: SwarmEnrollRequest, + repo: BaseRepository = Depends(get_repo), +) -> SwarmEnrolledBundle: + existing = await repo.get_swarm_host_by_name(req.name) + if existing is not None: + raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled") + + ca = pki.ensure_ca() + sans = list({*req.sans, req.address, req.name}) + issued = pki.issue_worker_cert(ca, req.name, sans) + + # Persist the bundle under ~/.decnet/ca/workers// so the master + # can replay it if the operator loses the original delivery. 
+ bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name + pki.write_worker_bundle(issued, bundle_dir) + + updater_view: Optional[SwarmUpdaterBundle] = None + updater_fp: Optional[str] = None + if req.issue_updater_bundle: + updater_cn = f"updater@{req.name}" + updater_sans = list({*sans, updater_cn, "127.0.0.1"}) + updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans) + # Persist alongside the worker bundle for replay. + updater_dir = bundle_dir / "updater" + updater_dir.mkdir(parents=True, exist_ok=True) + (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem) + (updater_dir / "updater.key").write_bytes(updater_issued.key_pem) + import os as _os + _os.chmod(updater_dir / "updater.key", 0o600) + updater_fp = updater_issued.fingerprint_sha256 + updater_view = SwarmUpdaterBundle( + fingerprint=updater_fp, + updater_cert_pem=updater_issued.cert_pem.decode(), + updater_key_pem=updater_issued.key_pem.decode(), + ) + + host_uuid = str(_uuid.uuid4()) + await repo.add_swarm_host( + { + "uuid": host_uuid, + "name": req.name, + "address": req.address, + "agent_port": req.agent_port, + "status": "enrolled", + "client_cert_fingerprint": issued.fingerprint_sha256, + "updater_cert_fingerprint": updater_fp, + "cert_bundle_path": str(bundle_dir), + "enrolled_at": datetime.now(timezone.utc), + "notes": req.notes, + } + ) + return SwarmEnrolledBundle( + host_uuid=host_uuid, + name=req.name, + address=req.address, + agent_port=req.agent_port, + fingerprint=issued.fingerprint_sha256, + ca_cert_pem=issued.ca_cert_pem.decode(), + worker_cert_pem=issued.cert_pem.decode(), + worker_key_pem=issued.key_pem.decode(), + updater=updater_view, + ) diff --git a/decnet/web/router/swarm/api_get_host.py b/decnet/web/router/swarm/api_get_host.py new file mode 100644 index 0000000..556b6ee --- /dev/null +++ b/decnet/web/router/swarm/api_get_host.py @@ -0,0 +1,26 @@ +"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID.""" +from __future__ import annotations + +from 
fastapi import APIRouter, Depends, HTTPException + +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo +from decnet.web.db.models import SwarmHostView + +router = APIRouter() + + +@router.get( + "/hosts/{uuid}", + response_model=SwarmHostView, + tags=["Swarm Hosts"], + responses={404: {"description": "No host with this UUID is enrolled"}}, +) +async def api_get_host( + uuid: str, + repo: BaseRepository = Depends(get_repo), +) -> SwarmHostView: + row = await repo.get_swarm_host_by_uuid(uuid) + if row is None: + raise HTTPException(status_code=404, detail="host not found") + return SwarmHostView(**row) diff --git a/decnet/web/router/swarm/api_get_swarm_health.py b/decnet/web/router/swarm/api_get_swarm_health.py new file mode 100644 index 0000000..5960136 --- /dev/null +++ b/decnet/web/router/swarm/api_get_swarm_health.py @@ -0,0 +1,11 @@ +"""GET /swarm/health — controller liveness (no I/O).""" +from __future__ import annotations + +from fastapi import APIRouter + +router = APIRouter() + + +@router.get("/health", tags=["Swarm Health"]) +async def api_get_swarm_health() -> dict[str, str]: + return {"status": "ok", "role": "swarm-controller"} diff --git a/decnet/web/router/swarm/api_heartbeat.py b/decnet/web/router/swarm/api_heartbeat.py new file mode 100644 index 0000000..52487ca --- /dev/null +++ b/decnet/web/router/swarm/api_heartbeat.py @@ -0,0 +1,148 @@ +"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh. + +Workers call this every ~30 s with the output of ``executor.status()``. +The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each +``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived +state so the dashboard stays current without a master-pull probe. + +Security: CA-signed mTLS is necessary but not sufficient — a +decommissioned worker's still-valid cert must not resurrect ghost +shards. 
We pin the presented peer cert's SHA-256 to the +``client_cert_fingerprint`` stored for the claimed ``host_uuid``. +Mismatch (or decommissioned host) → 403. +""" +from __future__ import annotations + +import hashlib +import json +from datetime import datetime, timezone +from typing import Any, Optional + +from fastapi import APIRouter, Depends, HTTPException, Request +from pydantic import BaseModel + +from decnet.config import DeckyConfig +from decnet.logging import get_logger +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo + +log = get_logger("swarm.heartbeat") + +router = APIRouter() + + +class HeartbeatRequest(BaseModel): + host_uuid: str + agent_version: Optional[str] = None + status: dict[str, Any] + + +def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]: + """Pull the peer cert's SHA-256 fingerprint from an ASGI scope. + + Tries two extraction paths because uvicorn has historically stashed + the TLS peer cert in different scope keys across versions: + + 1. Primary: ``scope["extensions"]["tls"]["client_cert_chain"][0]`` + (uvicorn ≥ 0.30 ASGI TLS extension). + 2. Fallback: the transport object's ``ssl_object.getpeercert(binary_form=True)`` + (older uvicorn builds + some other servers). + + Returns the lowercase hex SHA-256 of the DER-encoded cert, or None + when neither path yields bytes. The endpoint fails closed on None. 
+ """ + peer_der: Optional[bytes] = None + source = "none" + + try: + chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain") + if chain: + peer_der = chain[0] + source = "primary" + except Exception: + peer_der = None + + if peer_der is None: + transport = scope.get("transport") + try: + ssl_obj = transport.get_extra_info("ssl_object") if transport else None + if ssl_obj is not None: + peer_der = ssl_obj.getpeercert(binary_form=True) + if peer_der: + source = "fallback" + except Exception: + peer_der = None + + if not peer_der: + log.debug("heartbeat: peer cert extraction failed via none") + return None + + log.debug("heartbeat: peer cert extraction succeeded via %s", source) + return hashlib.sha256(peer_der).hexdigest().lower() + + +async def _verify_peer_matches_host( + request: Request, host_uuid: str, repo: BaseRepository +) -> dict[str, Any]: + host = await repo.get_swarm_host_by_uuid(host_uuid) + if host is None: + raise HTTPException(status_code=404, detail="unknown host") + fp = _extract_peer_fingerprint(request.scope) + if fp is None: + raise HTTPException(status_code=403, detail="peer cert unavailable") + expected = (host.get("client_cert_fingerprint") or "").lower() + if not expected or fp != expected: + raise HTTPException(status_code=403, detail="cert fingerprint mismatch") + return host + + +@router.post( + "/heartbeat", + status_code=204, + tags=["Swarm Health"], + responses={ + 400: {"description": "Bad Request (malformed JSON body)"}, + 403: {"description": "Peer cert missing, or its fingerprint does not match the host's pinned cert"}, + 404: {"description": "host_uuid is not enrolled"}, + 422: {"description": "Request body validation error"}, + }, +) +async def heartbeat( + req: HeartbeatRequest, + request: Request, + repo: BaseRepository = Depends(get_repo), +) -> None: + await _verify_peer_matches_host(request, req.host_uuid, repo) + + now = datetime.now(timezone.utc) + await repo.update_swarm_host( + req.host_uuid, + 
{"status": "active", "last_heartbeat": now}, + ) + + status_body = req.status or {} + if not status_body.get("deployed"): + return + + runtime = status_body.get("runtime") or {} + for decky_dict in status_body.get("deckies") or []: + try: + d = DeckyConfig(**decky_dict) + except Exception: + log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid) + continue + rstate = runtime.get(d.name) or {} + is_up = bool(rstate.get("running")) + await repo.upsert_decky_shard( + { + "decky_name": d.name, + "host_uuid": req.host_uuid, + "services": json.dumps(d.services), + "decky_config": d.model_dump_json(), + "decky_ip": d.ip, + "state": "running" if is_up else "degraded", + "last_error": None, + "last_seen": now, + "updated_at": now, + } + ) diff --git a/decnet/web/router/swarm/api_list_deckies.py b/decnet/web/router/swarm/api_list_deckies.py new file mode 100644 index 0000000..43a5d98 --- /dev/null +++ b/decnet/web/router/swarm/api_list_deckies.py @@ -0,0 +1,55 @@ +"""GET /swarm/deckies — list decky shards with their worker host's identity. + +The DeckyShard table maps decky_name → host_uuid; users want to see which +deckies are running and *where*, so we enrich each shard with the owning +host's name/address/status from SwarmHost rather than making callers do +the join themselves. 
+""" +from __future__ import annotations + +from typing import Optional + +from fastapi import APIRouter, Depends + +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo +from decnet.web.db.models import DeckyShardView + +router = APIRouter() + + +@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Deckies"]) +async def api_list_deckies( + host_uuid: Optional[str] = None, + state: Optional[str] = None, + repo: BaseRepository = Depends(get_repo), +) -> list[DeckyShardView]: + shards = await repo.list_decky_shards(host_uuid) + hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()} + + out: list[DeckyShardView] = [] + for s in shards: + if state and s.get("state") != state: + continue + host = hosts.get(s["host_uuid"], {}) + out.append(DeckyShardView( + decky_name=s["decky_name"], + decky_ip=s.get("decky_ip"), + host_uuid=s["host_uuid"], + host_name=host.get("name") or "", + host_address=host.get("address") or "", + host_status=host.get("status") or "unknown", + services=s.get("services") or [], + state=s.get("state") or "pending", + last_error=s.get("last_error"), + compose_hash=s.get("compose_hash"), + updated_at=s["updated_at"], + hostname=s.get("hostname"), + distro=s.get("distro"), + archetype=s.get("archetype"), + service_config=s.get("service_config") or {}, + mutate_interval=s.get("mutate_interval"), + last_mutated=s.get("last_mutated") or 0.0, + last_seen=s.get("last_seen"), + )) + return out diff --git a/decnet/web/router/swarm/api_list_hosts.py b/decnet/web/router/swarm/api_list_hosts.py new file mode 100644 index 0000000..acc7ba9 --- /dev/null +++ b/decnet/web/router/swarm/api_list_hosts.py @@ -0,0 +1,21 @@ +"""GET /swarm/hosts — list enrolled workers, optionally filtered by status.""" +from __future__ import annotations + +from typing import Optional + +from fastapi import APIRouter, Depends + +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import 
get_repo +from decnet.web.db.models import SwarmHostView + +router = APIRouter() + + +@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"]) +async def api_list_hosts( + host_status: Optional[str] = None, + repo: BaseRepository = Depends(get_repo), +) -> list[SwarmHostView]: + rows = await repo.list_swarm_hosts(host_status) + return [SwarmHostView(**r) for r in rows] diff --git a/decnet/web/router/swarm/api_teardown_swarm.py b/decnet/web/router/swarm/api_teardown_swarm.py new file mode 100644 index 0000000..d62f013 --- /dev/null +++ b/decnet/web/router/swarm/api_teardown_swarm.py @@ -0,0 +1,60 @@ +"""POST /swarm/teardown — tear down one or all enrolled workers.""" +from __future__ import annotations + +import asyncio +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo +from decnet.web.db.models import ( + SwarmDeployResponse, + SwarmHostResult, + SwarmTeardownRequest, +) + +log = get_logger("swarm.teardown") + +router = APIRouter() + + +@router.post( + "/teardown", + response_model=SwarmDeployResponse, + tags=["Swarm Deployments"], + responses={ + 400: {"description": "Bad Request (malformed JSON body)"}, + 404: {"description": "A targeted host does not exist"}, + 422: {"description": "Request body validation error"}, + }, +) +async def api_teardown_swarm( + req: SwarmTeardownRequest, + repo: BaseRepository = Depends(get_repo), +) -> SwarmDeployResponse: + if req.host_uuid is not None: + row = await repo.get_swarm_host_by_uuid(req.host_uuid) + if row is None: + raise HTTPException(status_code=404, detail="host not found") + targets = [row] + else: + targets = await repo.list_swarm_hosts() + + async def _call(host: dict[str, Any]) -> SwarmHostResult: + try: + async with AgentClient(host=host) as agent: + body = await 
agent.teardown(req.decky_id) + if req.decky_id is None: + await repo.delete_decky_shards_for_host(host["uuid"]) + return SwarmHostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=body) + except Exception as exc: + log.exception("swarm.teardown failed host=%s", host["name"]) + return SwarmHostResult( + host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc) + ) + + results = await asyncio.gather(*(_call(h) for h in targets)) + return SwarmDeployResponse(results=list(results)) diff --git a/decnet/web/router/swarm_mgmt/__init__.py b/decnet/web/router/swarm_mgmt/__init__.py new file mode 100644 index 0000000..12790f8 --- /dev/null +++ b/decnet/web/router/swarm_mgmt/__init__.py @@ -0,0 +1,26 @@ +"""Swarm management endpoints for the React dashboard. + +These are *not* the unauthenticated /swarm routes mounted on the separate +swarm-controller process (decnet/web/swarm_api.py on port 8770). These +live on the main web API, go through ``require_admin``, and are the +interface the dashboard uses to list hosts, decommission them, list +deckies across the fleet, and generate one-shot agent-enrollment +bundles. + +Mounted under ``/api/v1/swarm`` by the main api router. 
+""" +from fastapi import APIRouter + +from .api_list_hosts import router as list_hosts_router +from .api_decommission_host import router as decommission_host_router +from .api_list_deckies import router as list_deckies_router +from .api_enroll_bundle import router as enroll_bundle_router +from .api_teardown_host import router as teardown_host_router + +swarm_mgmt_router = APIRouter(prefix="/swarm") + +swarm_mgmt_router.include_router(list_hosts_router) +swarm_mgmt_router.include_router(decommission_host_router) +swarm_mgmt_router.include_router(list_deckies_router) +swarm_mgmt_router.include_router(enroll_bundle_router) +swarm_mgmt_router.include_router(teardown_host_router) diff --git a/decnet/web/router/swarm_mgmt/api_decommission_host.py b/decnet/web/router/swarm_mgmt/api_decommission_host.py new file mode 100644 index 0000000..d473b34 --- /dev/null +++ b/decnet/web/router/swarm_mgmt/api_decommission_host.py @@ -0,0 +1,71 @@ +"""DELETE /swarm/hosts/{uuid} — decommission a worker from the dashboard. + +Also instructs the worker agent to stop all DECNET services and delete +its install footprint (keeping logs). Agent self-destruct failure does +not block decommission — the master-side cleanup always runs so a dead +worker can still be removed from the dashboard. 
+""" +from __future__ import annotations + +import pathlib + +from fastapi import APIRouter, Depends, HTTPException, status + +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm.decommission") +router = APIRouter() + + +@router.delete( + "/hosts/{uuid}", + status_code=status.HTTP_204_NO_CONTENT, + tags=["Swarm Management"], + responses={ + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Host not found"}, + 422: {"description": "Path parameter validation error"}, + }, +) +async def decommission_host( + uuid: str, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> None: + row = await repo.get_swarm_host_by_uuid(uuid) + if row is None: + raise HTTPException(status_code=404, detail="host not found") + + # Ask the worker to wipe its own install (keeps logs). The agent + # schedules the reaper as a detached process and returns immediately, + # so this call is fast when the worker is reachable. A dead worker + # shouldn't block the operator from cleaning up the dashboard entry, + # hence best-effort with a log and continue. 
+ try: + async with AgentClient(host=row) as agent: + await agent.self_destruct() + except Exception: + log.exception( + "decommission: self-destruct dispatch failed host=%s — " + "proceeding with master-side cleanup anyway", + row.get("name"), + ) + + await repo.delete_decky_shards_for_host(uuid) + await repo.delete_swarm_host(uuid) + + bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "") + if bundle_dir.is_dir(): + for child in bundle_dir.iterdir(): + try: + child.unlink() + except OSError: + pass + try: + bundle_dir.rmdir() + except OSError: + pass diff --git a/decnet/web/router/swarm_mgmt/api_enroll_bundle.py b/decnet/web/router/swarm_mgmt/api_enroll_bundle.py new file mode 100644 index 0000000..799df44 --- /dev/null +++ b/decnet/web/router/swarm_mgmt/api_enroll_bundle.py @@ -0,0 +1,484 @@ +"""Agent-enrollment bundles — the Wazuh-style one-liner flow. + +Three endpoints: + POST /swarm/enroll-bundle — admin issues certs + builds payload + GET /swarm/enroll-bundle/{t}.sh — bootstrap script (idempotent until .tgz) + GET /swarm/enroll-bundle/{t}.tgz — tarball payload (one-shot; trips served) + +The operator's paste is a single pipe ``curl -fsSL <.sh> | sudo bash``. +Under the hood the bootstrap curls the ``.tgz`` from the same token. +Both files are rendered + persisted on POST; the ``.tgz`` GET atomically +marks the token served, reads the bytes under the lock, and unlinks both +files so a sweeper cannot race it. Unclaimed tokens expire after 5 min. + +We avoid the single-self-extracting-script pattern because ``bash`` run +via pipe has ``$0 == "bash"`` — there is no file on disk to ``tail`` for +the embedded payload. Two URLs, one paste. 
+""" +from __future__ import annotations + +import asyncio +import fnmatch +import io +import os +import pathlib +import secrets +import tarfile +from dataclasses import dataclass +from datetime import datetime, timedelta, timezone +from typing import Optional + +from fastapi import APIRouter, Depends, HTTPException, Request, Response, status +from pydantic import BaseModel, Field + +from decnet.logging import get_logger +from decnet.swarm import pki +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm_mgmt.enroll_bundle") + +router = APIRouter() + +BUNDLE_TTL = timedelta(minutes=5) +BUNDLE_DIR = pathlib.Path(os.environ.get("DECNET_ENROLL_BUNDLE_DIR", "/tmp/decnet-enroll")) # nosec B108 - short-lived 0600 bundle cache, env-overridable +SWEEP_INTERVAL_SECS = 30 + +# Paths excluded from the bundled tarball. Matches the intent of +# decnet.swarm.tar_tree.DEFAULT_EXCLUDES but narrower — we never want +# tests, dev scaffolding, the master's DB, or the frontend source tree +# shipped to an agent. +_EXCLUDES: tuple[str, ...] = ( + ".venv", ".venv/*", "**/.venv/*", + "__pycache__", "**/__pycache__", "**/__pycache__/*", + ".git", ".git/*", + ".pytest_cache", ".pytest_cache/*", + ".mypy_cache", ".mypy_cache/*", + "*.egg-info", "*.egg-info/*", + # setuptools build/ staging dir — created by `pip install` and leaks a + # nested decnet_web/node_modules/ copy into the bundle otherwise. + "build", "build/*", "build/**", + "*.pyc", "*.pyo", + "*.db", "*.db-wal", "*.db-shm", "decnet.db*", + "*.log", + "tests", "tests/*", + "development", "development/*", + "wiki-checkout", "wiki-checkout/*", + # Frontend is master-only; agents never serve UI. + "decnet_web", "decnet_web/*", "decnet_web/**", + # Master FastAPI app and everything under decnet/web/ — no agent-side + # code imports it. 
The agent/updater/forwarder/collector/prober/sniffer + # entrypoints are all under decnet/agent, decnet/updater, decnet/swarm, + # decnet/collector, decnet/prober, decnet/sniffer. + "decnet/web", "decnet/web/*", "decnet/web/**", + # Mutator + Profiler are master-only (mutator schedules respawns across + # the swarm; profiler rebuilds attacker profiles against the master DB). + "decnet/mutator", "decnet/mutator/*", "decnet/mutator/**", + "decnet/profiler", "decnet/profiler/*", "decnet/profiler/**", + "decnet-state.json", + "master.log", "master.json", + "decnet.tar", + # Dev-host env/config leaks — these bake the master's absolute paths into + # the agent and point log handlers at directories that don't exist on the + # worker VM. + ".env", ".env.*", "**/.env", "**/.env.*", + "decnet.ini", "**/decnet.ini", +) + + +# --------------------------------------------------------------------------- +# DTOs +# --------------------------------------------------------------------------- + +class EnrollBundleRequest(BaseModel): + master_host: str = Field(..., min_length=1, max_length=253, + description="IP/host the agent will reach back to") + agent_name: str = Field(..., pattern=r"^[a-z0-9][a-z0-9-]{0,62}$", + description="Worker name (DNS-label safe)") + with_updater: bool = Field( + default=True, + description="Include updater cert bundle and auto-start decnet updater on the agent", + ) + use_ipvlan: bool = Field( + default=False, + description=( + "Run deckies on this agent over IPvlan L2 instead of MACVLAN. " + "Required when the agent is a VirtualBox/VMware guest bridged over Wi-Fi — " + "Wi-Fi APs bind one MAC per station, so MACVLAN's extra container MACs " + "rotate the VM's DHCP lease. Safe no-op on wired/bare-metal hosts." 
+ ), + ) + services_ini: Optional[str] = Field( + default=None, + description="Optional INI text shipped to the agent as /etc/decnet/services.ini", + ) + + +class EnrollBundleResponse(BaseModel): + token: str + command: str + expires_at: datetime + host_uuid: str + + +# --------------------------------------------------------------------------- +# In-memory registry +# --------------------------------------------------------------------------- + +@dataclass +class _Bundle: + sh_path: pathlib.Path + tgz_path: pathlib.Path + expires_at: datetime + host_uuid: str + served: bool = False + + +_BUNDLES: dict[str, _Bundle] = {} +_LOCK = asyncio.Lock() +_SWEEPER_TASK: Optional[asyncio.Task] = None + + +async def _sweep_loop() -> None: + while True: + try: + await asyncio.sleep(SWEEP_INTERVAL_SECS) + now = datetime.now(timezone.utc) + async with _LOCK: + dead = [t for t, b in _BUNDLES.items() if b.served or b.expires_at <= now] + for t in dead: + b = _BUNDLES.pop(t) + for p in (b.sh_path, b.tgz_path): + try: + p.unlink() + except FileNotFoundError: + pass + except OSError as exc: + log.warning("enroll-bundle sweep unlink failed path=%s err=%s", p, exc) + except asyncio.CancelledError: + raise + except Exception: # noqa: BLE001 + log.exception("enroll-bundle sweeper iteration failed") + + +def _ensure_sweeper() -> None: + global _SWEEPER_TASK + if _SWEEPER_TASK is None or _SWEEPER_TASK.done(): + _SWEEPER_TASK = asyncio.create_task(_sweep_loop()) + + +# --------------------------------------------------------------------------- +# Tarball construction +# --------------------------------------------------------------------------- + +def _repo_root() -> pathlib.Path: + # decnet/web/router/swarm_mgmt/api_enroll_bundle.py -> 4 parents = repo root. 
+ return pathlib.Path(__file__).resolve().parents[4] + + +def _is_excluded(rel: str) -> bool: + parts = pathlib.PurePosixPath(rel).parts + for pat in _EXCLUDES: + if fnmatch.fnmatch(rel, pat): + return True + for i in range(1, len(parts) + 1): + if fnmatch.fnmatch("/".join(parts[:i]), pat): + return True + return False + + +def _render_decnet_ini( + master_host: str, + host_uuid: str, + use_ipvlan: bool = False, + swarmctl_port: int = 8770, +) -> bytes: + ipvlan_line = f"ipvlan = {'true' if use_ipvlan else 'false'}\n" + return ( + "; Generated by DECNET agent-enrollment bundle.\n" + "[decnet]\n" + "mode = agent\n" + "disallow-master = true\n" + "log-directory = /var/log/decnet\n" + f"{ipvlan_line}" + "\n" + "[agent]\n" + f"master-host = {master_host}\n" + f"swarmctl-port = {swarmctl_port}\n" + "swarm-syslog-port = 6514\n" + "agent-port = 8765\n" + "agent-dir = /etc/decnet/agent\n" + "updater-dir = /etc/decnet/updater\n" + f"host-uuid = {host_uuid}\n" + ).encode() + + +def _add_bytes(tar: tarfile.TarFile, name: str, data: bytes, mode: int = 0o644) -> None: + info = tarfile.TarInfo(name) + info.size = len(data) + info.mode = mode + info.mtime = int(datetime.now(timezone.utc).timestamp()) + tar.addfile(info, io.BytesIO(data)) + + +def _build_tarball( + master_host: str, + agent_name: str, + host_uuid: str, + issued: pki.IssuedCert, + services_ini: Optional[str], + updater_issued: Optional[pki.IssuedCert] = None, + use_ipvlan: bool = False, +) -> bytes: + """Gzipped tarball with: + - full repo source (minus excludes) + - etc/decnet/decnet.ini (pre-baked for mode=agent) + - home/.decnet/agent/{ca.crt,worker.crt,worker.key} + - home/.decnet/updater/{ca.crt,updater.crt,updater.key} (if updater_issued) + - services.ini at root if provided + """ + root = _repo_root() + buf = io.BytesIO() + with tarfile.open(fileobj=buf, mode="w:gz") as tar: + for path in sorted(root.rglob("*")): + rel = path.relative_to(root).as_posix() + if _is_excluded(rel): + continue + if 
path.is_symlink() or path.is_dir(): + continue + tar.add(path, arcname=rel, recursive=False) + + _add_bytes( + tar, + "etc/decnet/decnet.ini", + _render_decnet_ini(master_host, host_uuid, use_ipvlan), + ) + for unit in _SYSTEMD_UNITS: + _add_bytes( + tar, + f"etc/systemd/system/{unit}.service", + _render_systemd_unit(unit, agent_name, master_host), + ) + _add_bytes(tar, "home/.decnet/agent/ca.crt", issued.ca_cert_pem) + _add_bytes(tar, "home/.decnet/agent/worker.crt", issued.cert_pem) + _add_bytes(tar, "home/.decnet/agent/worker.key", issued.key_pem, mode=0o600) + + if updater_issued is not None: + _add_bytes(tar, "home/.decnet/updater/ca.crt", updater_issued.ca_cert_pem) + _add_bytes(tar, "home/.decnet/updater/updater.crt", updater_issued.cert_pem) + _add_bytes(tar, "home/.decnet/updater/updater.key", updater_issued.key_pem, mode=0o600) + + if services_ini: + _add_bytes(tar, "services.ini", services_ini.encode()) + + return buf.getvalue() + + +_SYSTEMD_UNITS = ( + "decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater", + # Per-host microservices — activated by enroll_bootstrap.sh. The + # profiler intentionally stays master-side: it rebuilds attacker + # profiles against the master DB, which workers don't share. 
+ "decnet-collector", "decnet-prober", "decnet-sniffer", +) + + +def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes: + tpl_path = pathlib.Path(__file__).resolve().parents[1].parent / "templates" / f"{name}.service.j2" + tpl = tpl_path.read_text() + return ( + tpl.replace("{{ agent_name }}", agent_name) + .replace("{{ master_host }}", master_host) + ).encode() + + +def _render_bootstrap( + agent_name: str, + master_host: str, + tarball_url: str, + expires_at: datetime, + with_updater: bool, +) -> bytes: + tpl_path = pathlib.Path(__file__).resolve().parents[1].parent / "templates" / "enroll_bootstrap.sh.j2" + tpl = tpl_path.read_text() + now = datetime.now(timezone.utc).replace(microsecond=0).isoformat() + rendered = ( + tpl.replace("{{ agent_name }}", agent_name) + .replace("{{ master_host }}", master_host) + .replace("{{ tarball_url }}", tarball_url) + .replace("{{ generated_at }}", now) + .replace("{{ expires_at }}", expires_at.replace(microsecond=0).isoformat()) + .replace("{{ with_updater }}", "true" if with_updater else "false") + ) + return rendered.encode() + + +# --------------------------------------------------------------------------- +# Endpoints +# --------------------------------------------------------------------------- + +@router.post( + "/enroll-bundle", + response_model=EnrollBundleResponse, + status_code=status.HTTP_201_CREATED, + tags=["Swarm Management"], + responses={ + 400: {"description": "Bad Request (malformed JSON body)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 409: {"description": "A worker with this name is already enrolled"}, + 422: {"description": "Request body validation error"}, + }, +) +async def create_enroll_bundle( + req: EnrollBundleRequest, + request: Request, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> EnrollBundleResponse: + import uuid as _uuid + + existing = await 
repo.get_swarm_host_by_name(req.agent_name) + if existing is not None: + raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled") + + # 1. Issue certs (reuses the same code as /swarm/enroll). The worker's own + # address is not known yet — the master learns it when the agent fetches + # the tarball (see get_payload), which also backfills the SwarmHost row. + ca = pki.ensure_ca() + sans = list({req.agent_name, req.master_host}) + issued = pki.issue_worker_cert(ca, req.agent_name, sans) + bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name + pki.write_worker_bundle(issued, bundle_dir) + + updater_issued: Optional[pki.IssuedCert] = None + updater_fp: Optional[str] = None + if req.with_updater: + updater_cn = f"updater@{req.agent_name}" + updater_sans = list({*sans, updater_cn, "127.0.0.1"}) + updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans) + updater_dir = bundle_dir / "updater" + updater_dir.mkdir(parents=True, exist_ok=True) + (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem) + (updater_dir / "updater.key").write_bytes(updater_issued.key_pem) + os.chmod(updater_dir / "updater.key", 0o600) + updater_fp = updater_issued.fingerprint_sha256 + + # 2. Register the host row so it shows up in SwarmHosts immediately. + host_uuid = str(_uuid.uuid4()) + await repo.add_swarm_host( + { + "uuid": host_uuid, + "name": req.agent_name, + "address": "", # filled in when the agent fetches the .tgz (its source IP) + "agent_port": 8765, + "status": "enrolled", + "client_cert_fingerprint": issued.fingerprint_sha256, + "updater_cert_fingerprint": updater_fp, + "cert_bundle_path": str(bundle_dir), + "enrolled_at": datetime.now(timezone.utc), + "notes": "enrolled via UI bundle", + "use_ipvlan": req.use_ipvlan, + } + ) + + # 3. Render payload + bootstrap. 
+ tarball = _build_tarball( + req.master_host, req.agent_name, host_uuid, issued, req.services_ini, updater_issued, + use_ipvlan=req.use_ipvlan, + ) + token = secrets.token_urlsafe(24) + expires_at = datetime.now(timezone.utc) + BUNDLE_TTL + + BUNDLE_DIR.mkdir(parents=True, exist_ok=True, mode=0o700) + sh_path = BUNDLE_DIR / f"{token}.sh" + tgz_path = BUNDLE_DIR / f"{token}.tgz" + + # Build URLs against the operator-supplied master_host (reachable from the + # new agent) rather than request.base_url, which reflects how the dashboard + # user reached us — often 127.0.0.1 behind a proxy or loopback-bound API. + scheme = request.url.scheme + port = request.url.port + netloc = req.master_host if port is None else f"{req.master_host}:{port}" + base = f"{scheme}://{netloc}" + tarball_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.tgz" + bootstrap_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.sh" + script = _render_bootstrap(req.agent_name, req.master_host, tarball_url, expires_at, req.with_updater) + + tgz_path.write_bytes(tarball) + sh_path.write_bytes(script) + os.chmod(tgz_path, 0o600) + os.chmod(sh_path, 0o600) + + async with _LOCK: + _BUNDLES[token] = _Bundle( + sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at, host_uuid=host_uuid, + ) + _ensure_sweeper() + + log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8]) + + return EnrollBundleResponse( + token=token, + command=f"curl -fsSL {bootstrap_url} | sudo bash", + expires_at=expires_at, + host_uuid=host_uuid, + ) + + +def _now() -> datetime: + # Indirection so tests can monkeypatch. 
+ return datetime.now(timezone.utc) + + +async def _lookup_live(token: str) -> _Bundle: + b = _BUNDLES.get(token) + if b is None or b.served or b.expires_at <= _now(): + raise HTTPException(status_code=404, detail="bundle not found or expired") + return b + + +@router.get( + "/enroll-bundle/{token}.sh", + tags=["Swarm Management"], + include_in_schema=False, +) +async def get_bootstrap(token: str) -> Response: + async with _LOCK: + b = await _lookup_live(token) + data = b.sh_path.read_bytes() + return Response(content=data, media_type="text/x-shellscript") + + +@router.get( + "/enroll-bundle/{token}.tgz", + tags=["Swarm Management"], + include_in_schema=False, +) +async def get_payload( + token: str, + request: Request, + repo: BaseRepository = Depends(get_repo), +) -> Response: + async with _LOCK: + b = await _lookup_live(token) + b.served = True + data = b.tgz_path.read_bytes() + host_uuid = b.host_uuid + for p in (b.sh_path, b.tgz_path): + try: + p.unlink() + except FileNotFoundError: + pass + + # The agent's first connect-back — its source IP is the reachable address + # the master will later use to probe it. Backfill the SwarmHost row here + # so the operator sees the real address instead of an empty placeholder. 
+ client_host = request.client.host if request.client else "" + if client_host: + try: + await repo.update_swarm_host(host_uuid, {"address": client_host}) + except Exception as e: # noqa: BLE001 + log.warning("enroll-bundle could not backfill address host=%s err=%s", host_uuid, e) + + return Response(content=data, media_type="application/gzip") diff --git a/decnet/web/router/swarm_mgmt/api_list_deckies.py b/decnet/web/router/swarm_mgmt/api_list_deckies.py new file mode 100644 index 0000000..0f8bb84 --- /dev/null +++ b/decnet/web/router/swarm_mgmt/api_list_deckies.py @@ -0,0 +1,58 @@ +"""GET /swarm/deckies — admin-gated list of decky shards across the fleet.""" +from __future__ import annotations + +from typing import Optional + +from fastapi import APIRouter, Depends + +from decnet.web.db.models import DeckyShardView +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +router = APIRouter() + + +@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Management"]) +async def list_deckies( + host_uuid: Optional[str] = None, + state: Optional[str] = None, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> list[DeckyShardView]: + shards = await repo.list_decky_shards(host_uuid) + hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()} + + # Pre-heartbeat fallback — older rows without decky_config can still + # surface their IP from the master's deploy state snapshot. 
+ deploy_state = await repo.get_state("deployment") or {} + cfg_deckies = (deploy_state.get("config") or {}).get("deckies") or [] + ip_by_name: dict[str, str] = { + d.get("name"): d.get("ip") for d in cfg_deckies if d.get("name") + } + + out: list[DeckyShardView] = [] + for s in shards: + if state and s.get("state") != state: + continue + host = hosts.get(s["host_uuid"], {}) + out.append(DeckyShardView( + decky_name=s["decky_name"], + decky_ip=s.get("decky_ip") or ip_by_name.get(s["decky_name"]), + host_uuid=s["host_uuid"], + host_name=host.get("name") or "", + host_address=host.get("address") or "", + host_status=host.get("status") or "unknown", + services=s.get("services") or [], + state=s.get("state") or "pending", + last_error=s.get("last_error"), + compose_hash=s.get("compose_hash"), + updated_at=s["updated_at"], + hostname=s.get("hostname"), + distro=s.get("distro"), + archetype=s.get("archetype"), + service_config=s.get("service_config") or {}, + mutate_interval=s.get("mutate_interval"), + last_mutated=s.get("last_mutated") or 0.0, + last_seen=s.get("last_seen"), + )) + return out diff --git a/decnet/web/router/swarm_mgmt/api_list_hosts.py b/decnet/web/router/swarm_mgmt/api_list_hosts.py new file mode 100644 index 0000000..f835fc5 --- /dev/null +++ b/decnet/web/router/swarm_mgmt/api_list_hosts.py @@ -0,0 +1,60 @@ +"""GET /swarm/hosts — admin-gated list of enrolled workers for the dashboard. + +Fans out an ``AgentClient.health()`` probe to each host on every call and +updates ``status`` / ``last_heartbeat`` as a side effect. This mirrors how +``/swarm-updates/hosts`` probes the updater daemon — the SwarmHosts page +polls this endpoint, so probe-on-read is what drives heartbeat freshness +in the UI. No separate scheduler needed. 
+""" +from __future__ import annotations + +import asyncio +from datetime import datetime, timezone +from typing import Any, Optional + +from fastapi import APIRouter, Depends + +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.models import SwarmHostView +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm_mgmt.list_hosts") + +router = APIRouter() + + +async def _probe_and_update( + host: dict[str, Any], repo: BaseRepository +) -> dict[str, Any]: + """Best-effort mTLS probe. Skips hosts with no address yet (pending first + connect-back) so we don't pollute the DB with 'unreachable' on fresh + enrollments that haven't fetched the tarball.""" + if not host.get("address"): + return host + try: + async with AgentClient(host=host) as agent: + await agent.health() + patch = {"status": "active", "last_heartbeat": datetime.now(timezone.utc)} + except Exception as exc: # noqa: BLE001 + log.debug("swarm/hosts probe unreachable host=%s err=%s", host.get("name"), exc) + patch = {"status": "unreachable"} + try: + await repo.update_swarm_host(host["uuid"], patch) + except Exception as exc: # noqa: BLE001 + log.warning("swarm/hosts could not persist probe result host=%s err=%s", host.get("name"), exc) + return host + host.update(patch) + return host + + +@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Management"]) +async def list_hosts( + host_status: Optional[str] = None, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> list[SwarmHostView]: + rows = await repo.list_swarm_hosts(host_status) + probed = await asyncio.gather(*(_probe_and_update(r, repo) for r in rows)) + return [SwarmHostView(**r) for r in probed] diff --git a/decnet/web/router/swarm_mgmt/api_teardown_host.py b/decnet/web/router/swarm_mgmt/api_teardown_host.py new file mode 100644 index 0000000..cae1b73 --- 
/dev/null +++ b/decnet/web/router/swarm_mgmt/api_teardown_host.py @@ -0,0 +1,150 @@ +"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker. + +Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted +the agent tears down the entire host (all deckies + network); otherwise it +tears down that single decky. + +Async-by-default: the endpoint returns 202 the moment the request is +accepted and runs the actual agent call + DB cleanup in a background task. +That lets the operator queue multiple teardowns in parallel without +blocking on slow docker-compose-down cycles on the worker. +""" +from __future__ import annotations + +import asyncio +from typing import Any, Optional + +from fastapi import APIRouter, Depends, HTTPException, status +from pydantic import BaseModel + +from decnet.logging import get_logger +from decnet.swarm.client import AgentClient +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm.teardown") +router = APIRouter() + +# Track spawned background tasks so (a) they're not GC'd mid-flight and +# (b) tests can drain them deterministically via ``await drain_pending()``. +_PENDING: "set[asyncio.Task]" = set() + + +def _spawn(coro) -> asyncio.Task: + task = asyncio.create_task(coro) + _PENDING.add(task) + task.add_done_callback(_PENDING.discard) + return task + + +async def drain_pending() -> None: + """Await all outstanding teardown tasks. 
Used by tests.""" + while _PENDING: + await asyncio.gather(*list(_PENDING), return_exceptions=True) + + +class TeardownHostRequest(BaseModel): + decky_id: Optional[str] = None + + +class TeardownHostResponse(BaseModel): + host_uuid: str + host_name: str + decky_id: Optional[str] = None + accepted: bool + detail: str + + +async def _mark_tearing_down( + repo: BaseRepository, host_uuid: str, decky_id: Optional[str] +) -> None: + """Flip affected shards to state='tearing_down' so the UI can show + progress immediately while the background task runs.""" + shards = await repo.list_decky_shards(host_uuid) + for s in shards: + if decky_id and s.get("decky_name") != decky_id: + continue + await repo.upsert_decky_shard({ + **s, + "state": "tearing_down", + "last_error": None, + }) + + +async def _run_teardown( + host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str] +) -> None: + """Fire the remote teardown + DB cleanup. Exceptions are logged and + reflected on the shard so the UI surfaces them — never re-raised, + since nothing is awaiting us.""" + try: + async with AgentClient(host=host) as agent: + await agent.teardown(decky_id) + except Exception as exc: + log.exception( + "swarm.teardown background task failed host=%s decky=%s", + host.get("name"), decky_id, + ) + # Reflect the failure on the shard(s) — don't delete on failure, + # the operator needs to see what went wrong and retry. 
+ try: + shards = await repo.list_decky_shards(host["uuid"]) + for s in shards: + if decky_id and s.get("decky_name") != decky_id: + continue + await repo.upsert_decky_shard({ + **s, + "state": "teardown_failed", + "last_error": str(exc)[:512], + }) + except Exception: + log.exception("swarm.teardown failed to record shard failure") + return + + try: + if decky_id: + await repo.delete_decky_shard(decky_id) + else: + await repo.delete_decky_shards_for_host(host["uuid"]) + except Exception: + log.exception("swarm.teardown DB cleanup failed (agent call succeeded)") + + +@router.post( + "/hosts/{uuid}/teardown", + response_model=TeardownHostResponse, + status_code=status.HTTP_202_ACCEPTED, + tags=["Swarm Management"], + responses={ + 400: {"description": "Bad Request (malformed JSON body)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Host not found"}, + 422: {"description": "Request body or path parameter validation error"}, + }, +) +async def teardown_host( + uuid: str, + req: TeardownHostRequest, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> TeardownHostResponse: + host = await repo.get_swarm_host_by_uuid(uuid) + if host is None: + raise HTTPException(status_code=404, detail="host not found") + + await _mark_tearing_down(repo, uuid, req.decky_id) + + # Fire-and-forget: asyncio.create_task (not BackgroundTasks) so the + # task runs independently of this request's lifecycle — the operator + # can queue another teardown the moment this one returns 202 without + # waiting for any per-request cleanup phase. 
+ _spawn(_run_teardown(host, repo, req.decky_id)) + + return TeardownHostResponse( + host_uuid=uuid, + host_name=host.get("name") or "", + decky_id=req.decky_id, + accepted=True, + detail="teardown queued", + ) diff --git a/decnet/web/router/swarm_updates/__init__.py b/decnet/web/router/swarm_updates/__init__.py new file mode 100644 index 0000000..d14e13f --- /dev/null +++ b/decnet/web/router/swarm_updates/__init__.py @@ -0,0 +1,23 @@ +"""Remote Updates — master dashboard's surface for pushing code to workers. + +These are *not* the swarm-controller's /swarm routes (those run on a +separate process, auth-free, internal-only). They live on the main web +API, go through ``require_admin``, and are the interface the React +dashboard calls to fan updates out to worker ``decnet updater`` daemons +via ``UpdaterClient``. + +Mounted under ``/api/v1/swarm-updates`` by the main api router. +""" +from fastapi import APIRouter + +from .api_list_host_releases import router as list_host_releases_router +from .api_push_update import router as push_update_router +from .api_push_update_self import router as push_update_self_router +from .api_rollback_host import router as rollback_host_router + +swarm_updates_router = APIRouter(prefix="/swarm-updates") + +swarm_updates_router.include_router(list_host_releases_router) +swarm_updates_router.include_router(push_update_router) +swarm_updates_router.include_router(push_update_self_router) +swarm_updates_router.include_router(rollback_host_router) diff --git a/decnet/web/router/swarm_updates/api_list_host_releases.py b/decnet/web/router/swarm_updates/api_list_host_releases.py new file mode 100644 index 0000000..ac493eb --- /dev/null +++ b/decnet/web/router/swarm_updates/api_list_host_releases.py @@ -0,0 +1,86 @@ +"""GET /swarm-updates/hosts — per-host updater health + release slots. + +Fans out an ``UpdaterClient.health()`` probe to every enrolled host that +has an updater bundle. 
Each probe is isolated: a single unreachable host +never fails the whole list (that's normal partial-failure behaviour for +a fleet view). +""" +from __future__ import annotations + +import asyncio +from typing import Any + +from fastapi import APIRouter, Depends + +from decnet.logging import get_logger +from decnet.swarm.updater_client import UpdaterClient +from decnet.web.db.models import HostReleaseInfo, HostReleasesResponse +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm_updates.list") + +router = APIRouter() + + +def _extract_shas(releases: list[dict[str, Any]]) -> tuple[str | None, str | None]: + """Pick the (current, previous) SHA from the updater's releases list. + + The updater reports releases as ``[{"slot": "active"|"prev", "sha": ..., + ...}]`` in no guaranteed order, so pull by slot name rather than index. + """ + current = next((r.get("sha") for r in releases if r.get("slot") == "active"), None) + previous = next((r.get("sha") for r in releases if r.get("slot") == "prev"), None) + return current, previous + + +async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo: + try: + async with UpdaterClient(host=host) as u: + body = await u.health() + except Exception as exc: # noqa: BLE001 + return HostReleaseInfo( + host_uuid=host["uuid"], + host_name=host["name"], + address=host["address"], + reachable=False, + detail=f"{type(exc).__name__}: {exc}", + ) + releases = body.get("releases") or [] + current, previous = _extract_shas(releases) + return HostReleaseInfo( + host_uuid=host["uuid"], + host_name=host["name"], + address=host["address"], + reachable=True, + agent_status=body.get("agent_status") or body.get("status"), + current_sha=current, + previous_sha=previous, + releases=releases, + ) + + +@router.get( + "/hosts", + response_model=HostReleasesResponse, + tags=["Swarm Updates"], + responses={ + 401: {"description": "Could not validate credentials"}, + 
403: {"description": "Insufficient permissions"}, + }, +) +async def api_list_host_releases( + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> HostReleasesResponse: + rows = await repo.list_swarm_hosts() + # Only hosts actually capable of receiving updates — decommissioned + # hosts and agent-only enrollments are filtered out. + targets = [ + r for r in rows + if r.get("status") != "decommissioned" and r.get("updater_cert_fingerprint") + ] + if not targets: + return HostReleasesResponse(hosts=[]) + results = await asyncio.gather(*(_probe_host(h) for h in targets)) + return HostReleasesResponse(hosts=list(results)) diff --git a/decnet/web/router/swarm_updates/api_push_update.py b/decnet/web/router/swarm_updates/api_push_update.py new file mode 100644 index 0000000..0aea5ee --- /dev/null +++ b/decnet/web/router/swarm_updates/api_push_update.py @@ -0,0 +1,163 @@ +"""POST /swarm-updates/push — fan a tarball of the master's tree to workers. + +Mirrors the ``decnet swarm update`` CLI flow: build the tarball once, +dispatch concurrently, collect per-host statuses. Returns HTTP 200 even +when individual hosts failed — the operator reads per-host ``status``. +""" +from __future__ import annotations + +import asyncio +import pathlib +from typing import Any + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.logging import get_logger +from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree +from decnet.swarm.updater_client import UpdaterClient +from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm_updates.push") + +router = APIRouter() + + +def _master_tree_root() -> pathlib.Path: + """Resolve the master's install tree to tar. + + Walks up from this file: ``decnet/web/router/swarm_updates/`` → 3 parents + lands on the repo root. 
Matches the layout shipped via ``pip install -e .`` + and the dev checkout at ``~/Tools/DECNET``. + """ + return pathlib.Path(__file__).resolve().parents[4] + + +def _classify_update(status_code: int) -> str: + if status_code == 200: + return "updated" + if status_code == 409: + return "rolled-back" + return "failed" + + +async def _resolve_targets( + repo: BaseRepository, + req: PushUpdateRequest, +) -> list[dict[str, Any]]: + if req.all == bool(req.host_uuids): + raise HTTPException( + status_code=400, + detail="Specify exactly one of host_uuids or all=true.", + ) + rows = await repo.list_swarm_hosts() + rows = [r for r in rows if r.get("updater_cert_fingerprint")] + if req.all: + targets = [r for r in rows if r.get("status") != "decommissioned"] + else: + wanted = set(req.host_uuids or []) + targets = [r for r in rows if r["uuid"] in wanted] + missing = wanted - {r["uuid"] for r in targets} + if missing: + raise HTTPException( + status_code=404, + detail=f"Unknown or updater-less host(s): {sorted(missing)}", + ) + if not targets: + raise HTTPException( + status_code=404, + detail="No targets: no enrolled hosts have an updater bundle.", + ) + return targets + + +async def _push_one( + host: dict[str, Any], + tarball: bytes, + sha: str, + include_self: bool, +) -> PushUpdateResult: + try: + async with UpdaterClient(host=host) as u: + r = await u.update(tarball, sha=sha) + body = r.json() if r.content else {} + status = _classify_update(r.status_code) + stderr = body.get("stderr") if isinstance(body, dict) else None + + if include_self and r.status_code == 200: + # Agent first, updater second — a broken updater push must never + # strand the fleet on an old agent. + try: + rs = await u.update_self(tarball, sha=sha) + self_ok = rs.status_code in (200, 0) # 0 = connection dropped (expected) + except Exception as exc: # noqa: BLE001 + # Connection drop on update-self is expected and not an error. 
+ self_ok = _is_expected_connection_drop(exc) + if not self_ok: + return PushUpdateResult( + host_uuid=host["uuid"], host_name=host["name"], + status="self-failed", http_status=r.status_code, sha=sha, + detail=f"agent updated OK but self-update failed: {exc}", + stderr=stderr, + ) + status = "self-updated" if self_ok else "self-failed" + + return PushUpdateResult( + host_uuid=host["uuid"], host_name=host["name"], + status=status, http_status=r.status_code, sha=sha, + detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None, + stderr=stderr, + ) + except Exception as exc: # noqa: BLE001 + log.exception("swarm_updates.push failed host=%s", host.get("name")) + return PushUpdateResult( + host_uuid=host["uuid"], host_name=host["name"], + status="failed", + detail=f"{type(exc).__name__}: {exc}", + ) + + +def _is_expected_connection_drop(exc: BaseException) -> bool: + """update-self re-execs the updater mid-response; httpx raises on the drop.""" + import httpx + return isinstance(exc, (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError)) + + +@router.post( + "/push", + response_model=PushUpdateResponse, + tags=["Swarm Updates"], + responses={ + 400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "No matching target hosts or no updater-capable hosts enrolled"}, + 422: {"description": "Request body validation error"}, + }, +) +async def api_push_update( + req: PushUpdateRequest, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> PushUpdateResponse: + targets = await _resolve_targets(repo, req) + tree_root = _master_tree_root() + # Both `detect_git_sha` (shells out) and `tar_working_tree` (walks the repo + # + gzips a few MB) are synchronous CPU+I/O. 
Running them directly on the + # event loop blocks every other request until the tarball is built — the + # dashboard freezes on /swarm-updates push. Offload to a worker thread. + sha = await asyncio.to_thread(detect_git_sha, tree_root) + tarball = await asyncio.to_thread(tar_working_tree, tree_root, extra_excludes=req.exclude) + log.info( + "swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s", + sha or "(not a git repo)", len(tarball), len(targets), req.include_self, + ) + results = await asyncio.gather( + *(_push_one(h, tarball, sha, req.include_self) for h in targets) + ) + return PushUpdateResponse( + sha=sha, + tarball_bytes=len(tarball), + results=list(results), + ) diff --git a/decnet/web/router/swarm_updates/api_push_update_self.py b/decnet/web/router/swarm_updates/api_push_update_self.py new file mode 100644 index 0000000..2ffa16f --- /dev/null +++ b/decnet/web/router/swarm_updates/api_push_update_self.py @@ -0,0 +1,101 @@ +"""POST /swarm-updates/push-self — push only to workers' /update-self. + +Use case: the agent is fine but the updater itself needs an upgrade (e.g. +a fix to ``executor.py``). Uploading only ``/update-self`` avoids a +redundant agent restart on healthy workers. + +No auto-rollback: the updater re-execs itself on success, so a broken +push leaves the worker on the old code — verified by polling ``/health`` +after the request returns. 
+""" +from __future__ import annotations + +import asyncio +from typing import Any + +from fastapi import APIRouter, Depends + +from decnet.logging import get_logger +from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree +from decnet.swarm.updater_client import UpdaterClient +from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +from .api_push_update import _is_expected_connection_drop, _master_tree_root, _resolve_targets + +log = get_logger("swarm_updates.push_self") + +router = APIRouter() + + +async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> PushUpdateResult: + try: + async with UpdaterClient(host=host) as u: + try: + r = await u.update_self(tarball, sha=sha) + http_status = r.status_code + body = r.json() if r.content else {} + ok = http_status == 200 + detail = (body.get("error") or body.get("probe")) if isinstance(body, dict) else None + stderr = body.get("stderr") if isinstance(body, dict) else None + except Exception as exc: # noqa: BLE001 + # Connection drops during self-update are expected — the updater + # re-execs itself mid-response. 
+ if _is_expected_connection_drop(exc): + return PushUpdateResult( + host_uuid=host["uuid"], host_name=host["name"], + status="self-updated", sha=sha, + detail="updater re-exec dropped connection (expected)", + ) + raise + return PushUpdateResult( + host_uuid=host["uuid"], host_name=host["name"], + status="self-updated" if ok else "self-failed", + http_status=http_status, sha=sha, + detail=detail, stderr=stderr, + ) + except Exception as exc: # noqa: BLE001 + log.exception("swarm_updates.push_self failed host=%s", host.get("name")) + return PushUpdateResult( + host_uuid=host["uuid"], host_name=host["name"], + status="self-failed", + detail=f"{type(exc).__name__}: {exc}", + ) + + +@router.post( + "/push-self", + response_model=PushUpdateResponse, + tags=["Swarm Updates"], + responses={ + 400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "No matching target hosts or no updater-capable hosts enrolled"}, + 422: {"description": "Request body validation error"}, + }, +) +async def api_push_update_self( + req: PushUpdateRequest, + admin: dict = Depends(require_admin), + repo: BaseRepository = Depends(get_repo), +) -> PushUpdateResponse: + targets = await _resolve_targets(repo, req) + tree_root = _master_tree_root() + # Offload sync I/O (git shell-out + tar+gzip of the repo) so the event + # loop stays responsive while the tarball is being built. 
+ sha = await asyncio.to_thread(detect_git_sha, tree_root) + tarball = await asyncio.to_thread(tar_working_tree, tree_root, extra_excludes=req.exclude) + log.info( + "swarm_updates.push_self sha=%s tarball=%d hosts=%d", + sha or "(not a git repo)", len(tarball), len(targets), + ) + results = await asyncio.gather( + *(_push_self_one(h, tarball, sha) for h in targets) + ) + return PushUpdateResponse( + sha=sha, + tarball_bytes=len(tarball), + results=list(results), + ) diff --git a/decnet/web/router/swarm_updates/api_rollback_host.py b/decnet/web/router/swarm_updates/api_rollback_host.py new file mode 100644 index 0000000..0bfe165 --- /dev/null +++ b/decnet/web/router/swarm_updates/api_rollback_host.py @@ -0,0 +1,77 @@ +"""POST /swarm-updates/rollback — manual rollback on a single host. + +Calls the worker updater's ``/rollback`` which swaps the ``current`` +symlink back to ``releases/prev``. Fails with 404 if the target has no +previous release slot. +""" +from __future__ import annotations + +from fastapi import APIRouter, Depends, HTTPException + +from decnet.logging import get_logger +from decnet.swarm.updater_client import UpdaterClient +from decnet.web.db.models import RollbackRequest, RollbackResponse +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo, require_admin + +log = get_logger("swarm_updates.rollback") + +router = APIRouter() + + +@router.post( + "/rollback", + response_model=RollbackResponse, + tags=["Swarm Updates"], + responses={ + 400: {"description": "Bad Request (malformed JSON body or host has no updater bundle)"}, + 401: {"description": "Could not validate credentials"}, + 403: {"description": "Insufficient permissions"}, + 404: {"description": "Unknown host, or no previous release slot on the worker"}, + 422: {"description": "Request body validation error"}, + }, +) +async def api_rollback_host( + req: RollbackRequest, + admin: dict = Depends(require_admin), + repo: BaseRepository = 
Depends(get_repo), +) -> RollbackResponse: + host = await repo.get_swarm_host_by_uuid(req.host_uuid) + if host is None: + raise HTTPException(status_code=404, detail=f"Unknown host: {req.host_uuid}") + if not host.get("updater_cert_fingerprint"): + raise HTTPException( + status_code=400, + detail=f"Host '{host['name']}' has no updater bundle — nothing to roll back.", + ) + + try: + async with UpdaterClient(host=host) as u: + r = await u.rollback() + except Exception as exc: # noqa: BLE001 + log.exception("swarm_updates.rollback transport failure host=%s", host["name"]) + return RollbackResponse( + host_uuid=host["uuid"], host_name=host["name"], + status="failed", + detail=f"{type(exc).__name__}: {exc}", + ) + + body = r.json() if r.content else {} + if r.status_code == 404: + # No previous release — surface as 404 so the UI can render the + # "nothing to roll back" state distinctly from a transport error. + raise HTTPException( + status_code=404, + detail=body.get("detail") if isinstance(body, dict) else "No previous release on worker.", + ) + if r.status_code != 200: + return RollbackResponse( + host_uuid=host["uuid"], host_name=host["name"], + status="failed", http_status=r.status_code, + detail=(body.get("error") or body.get("detail")) if isinstance(body, dict) else None, + ) + return RollbackResponse( + host_uuid=host["uuid"], host_name=host["name"], + status="rolled-back", http_status=r.status_code, + detail=body.get("status") if isinstance(body, dict) else None, + ) diff --git a/decnet/web/router/system/__init__.py b/decnet/web/router/system/__init__.py new file mode 100644 index 0000000..fdc0c05 --- /dev/null +++ b/decnet/web/router/system/__init__.py @@ -0,0 +1,6 @@ +from fastapi import APIRouter + +from .api_deployment_mode import router as deployment_mode_router + +system_router = APIRouter(prefix="/system", tags=["System"]) +system_router.include_router(deployment_mode_router) diff --git a/decnet/web/router/system/api_deployment_mode.py 
b/decnet/web/router/system/api_deployment_mode.py new file mode 100644 index 0000000..18cb3b0 --- /dev/null +++ b/decnet/web/router/system/api_deployment_mode.py @@ -0,0 +1,41 @@ +"""GET /system/deployment-mode — tells the UI whether a deploy will shard +across SWARM workers or land on the master itself. + +Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role +plus at least one reachable enrolled worker = swarm; otherwise unihost. +""" +from __future__ import annotations + +import os + +from fastapi import APIRouter, Depends +from pydantic import BaseModel + +from decnet.web.db.repository import BaseRepository +from decnet.web.dependencies import get_repo + +router = APIRouter() + + +class DeploymentModeResponse(BaseModel): + mode: str # "swarm" or "unihost" + role: str # "master" or "agent" + swarm_host_count: int + + +@router.get("/deployment-mode", response_model=DeploymentModeResponse) +async def get_deployment_mode( + repo: BaseRepository = Depends(get_repo), +) -> DeploymentModeResponse: + role = os.environ.get("DECNET_MODE", "master").lower() + hosts = 0 + if role == "master": + hosts = sum( + 1 for h in await repo.list_swarm_hosts() + if h.get("status") in ("active", "enrolled") and h.get("address") + ) + return DeploymentModeResponse( + mode="swarm" if hosts > 0 else "unihost", + role=role, + swarm_host_count=hosts, + ) diff --git a/decnet/web/swarm_api.py b/decnet/web/swarm_api.py new file mode 100644 index 0000000..43ffeb6 --- /dev/null +++ b/decnet/web/swarm_api.py @@ -0,0 +1,67 @@ +"""DECNET SWARM Controller — master-side control plane. + +Runs as an independent FastAPI/uvicorn process. Isolated from +``decnet.web.api`` so controller failure cannot cascade to the main API, +ingester, or dashboard (mirrors the existing pattern used by +``decnet api`` with ``start_new_session=True``). 
+ +Responsibilities: +* host enrollment (issues CA-signed worker bundles); +* dispatching DecnetConfig shards to worker agents over mTLS; +* active health probes of enrolled workers. + +The controller *reuses* the same ``get_repo`` dependency as the main API, +so SwarmHost / DeckyShard state is visible to both processes via the +shared DB. +""" +from __future__ import annotations + +from decnet.web import _uvicorn_tls_scope # noqa: F401 # patches uvicorn on import + +from contextlib import asynccontextmanager +from typing import AsyncGenerator + +from fastapi import FastAPI +from fastapi.responses import ORJSONResponse + +from decnet.logging import get_logger +from decnet.swarm import pki +from decnet.swarm.client import ensure_master_identity +from decnet.web.dependencies import repo +from decnet.web.router.swarm import swarm_router + +log = get_logger("swarm_api") + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]: + log.info("swarm-controller starting up") + # Make sure the CA and master client cert exist before we accept any + # request — enrollment needs them and AgentClient needs them. + pki.ensure_ca() + ensure_master_identity() + await repo.initialize() + log.info("swarm-controller ready") + yield + log.info("swarm-controller shutdown") + + +app: FastAPI = FastAPI( + title="DECNET SWARM Controller", + version="0.1.0", + lifespan=lifespan, + default_response_class=ORJSONResponse, + # No interactive docs: the controller is an internal management plane, + # not a public surface. Enable explicitly in dev if needed. 
+ docs_url=None, + redoc_url=None, + openapi_url=None, +) + +app.include_router(swarm_router) + + +@app.get("/health") +async def root_health() -> dict[str, str]: + """Top-level liveness probe (no DB I/O).""" + return {"status": "ok", "role": "swarm-controller"} diff --git a/decnet/web/templates/decnet-agent.service.j2 b/decnet/web/templates/decnet-agent.service.j2 new file mode 100644 index 0000000..9847345 --- /dev/null +++ b/decnet/web/templates/decnet-agent.service.j2 @@ -0,0 +1,18 @@ +[Unit] +Description=DECNET worker agent (mTLS control plane) — {{ agent_name }} +Documentation=https://github.com/anti/DECNET +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/opt/decnet +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.agent.log +ExecStart=/usr/local/bin/decnet agent --no-forwarder +Restart=on-failure +RestartSec=5 +StandardOutput=append:/var/log/decnet/decnet.agent.log +StandardError=append:/var/log/decnet/decnet.agent.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/decnet-collector.service.j2 b/decnet/web/templates/decnet-collector.service.j2 new file mode 100644 index 0000000..3137bfd --- /dev/null +++ b/decnet/web/templates/decnet-collector.service.j2 @@ -0,0 +1,20 @@ +[Unit] +Description=DECNET container log collector — {{ agent_name }} +Documentation=https://github.com/anti/DECNET +After=network-online.target decnet-agent.service +Wants=network-online.target +PartOf=decnet-agent.service + +[Service] +Type=simple +WorkingDirectory=/opt/decnet +Environment=DECNET_MODE=agent +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.collector.log +ExecStart=/usr/local/bin/decnet collect --log-file /var/log/decnet/decnet.log +Restart=on-failure +RestartSec=5 +StandardOutput=append:/var/log/decnet/decnet.collector.log +StandardError=append:/var/log/decnet/decnet.collector.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/decnet-engine.service.j2 
b/decnet/web/templates/decnet-engine.service.j2 new file mode 100644 index 0000000..dadf1b0 --- /dev/null +++ b/decnet/web/templates/decnet-engine.service.j2 @@ -0,0 +1,17 @@ +[Unit] +Description=DECNET deckie orchestrator (decnet deploy) — {{ agent_name }} +Documentation=https://github.com/anti/DECNET +After=network-online.target decnet-agent.service +Wants=network-online.target + +[Service] +Type=oneshot +RemainAfterExit=yes +WorkingDirectory=/opt/decnet +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.log +ExecStart=/usr/local/bin/decnet deploy +StandardOutput=append:/var/log/decnet/decnet.log +StandardError=append:/var/log/decnet/decnet.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/decnet-forwarder.service.j2 b/decnet/web/templates/decnet-forwarder.service.j2 new file mode 100644 index 0000000..e0a2391 --- /dev/null +++ b/decnet/web/templates/decnet-forwarder.service.j2 @@ -0,0 +1,19 @@ +[Unit] +Description=DECNET log forwarder (syslog-over-TLS → master) — {{ agent_name }} +Documentation=https://github.com/anti/DECNET +After=network-online.target +Wants=network-online.target +PartOf=decnet-agent.service + +[Service] +Type=simple +WorkingDirectory=/opt/decnet +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.forwarder.log +ExecStart=/usr/local/bin/decnet forwarder --master-host {{ master_host }} --master-port 6514 --agent-dir /etc/decnet/agent --log-file /var/log/decnet/decnet.log +Restart=on-failure +RestartSec=5 +StandardOutput=append:/var/log/decnet/decnet.forwarder.log +StandardError=append:/var/log/decnet/decnet.forwarder.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/decnet-prober.service.j2 b/decnet/web/templates/decnet-prober.service.j2 new file mode 100644 index 0000000..209851e --- /dev/null +++ b/decnet/web/templates/decnet-prober.service.j2 @@ -0,0 +1,20 @@ +[Unit] +Description=DECNET attacker prober (JARM/HASSH/TCP fingerprint) — {{ agent_name }} 
+Documentation=https://github.com/anti/DECNET +After=network-online.target decnet-agent.service +Wants=network-online.target +PartOf=decnet-agent.service + +[Service] +Type=simple +WorkingDirectory=/opt/decnet +Environment=DECNET_MODE=agent +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.prober.log +ExecStart=/usr/local/bin/decnet probe --log-file /var/log/decnet/decnet.log --interval 300 +Restart=on-failure +RestartSec=5 +StandardOutput=append:/var/log/decnet/decnet.prober.log +StandardError=append:/var/log/decnet/decnet.prober.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/decnet-sniffer.service.j2 b/decnet/web/templates/decnet-sniffer.service.j2 new file mode 100644 index 0000000..360a3ac --- /dev/null +++ b/decnet/web/templates/decnet-sniffer.service.j2 @@ -0,0 +1,24 @@ +[Unit] +Description=DECNET network sniffer — {{ agent_name }} +Documentation=https://github.com/anti/DECNET +After=network-online.target decnet-agent.service +Wants=network-online.target +PartOf=decnet-agent.service + +[Service] +Type=simple +WorkingDirectory=/opt/decnet +Environment=DECNET_MODE=agent +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.sniffer.log +# scapy needs raw sockets; forwarder already runs with these caps, so we +# mirror the same ambient set here. 
+AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW +CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW +ExecStart=/usr/local/bin/decnet sniffer --log-file /var/log/decnet/decnet.log +Restart=on-failure +RestartSec=5 +StandardOutput=append:/var/log/decnet/decnet.sniffer.log +StandardError=append:/var/log/decnet/decnet.sniffer.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/decnet-updater.service.j2 b/decnet/web/templates/decnet-updater.service.j2 new file mode 100644 index 0000000..3ac5406 --- /dev/null +++ b/decnet/web/templates/decnet-updater.service.j2 @@ -0,0 +1,18 @@ +[Unit] +Description=DECNET self-updater (accepts tarball pushes from master) — {{ agent_name }} +Documentation=https://github.com/anti/DECNET +After=network-online.target +Wants=network-online.target + +[Service] +Type=simple +WorkingDirectory=/opt/decnet +Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.updater.log +ExecStart=/usr/local/bin/decnet updater --updater-dir /etc/decnet/updater --install-dir /opt/decnet --agent-dir /etc/decnet/agent +Restart=on-failure +RestartSec=5 +StandardOutput=append:/var/log/decnet/decnet.updater.log +StandardError=append:/var/log/decnet/decnet.updater.log + +[Install] +WantedBy=multi-user.target diff --git a/decnet/web/templates/enroll_bootstrap.sh.j2 b/decnet/web/templates/enroll_bootstrap.sh.j2 new file mode 100644 index 0000000..74ab220 --- /dev/null +++ b/decnet/web/templates/enroll_bootstrap.sh.j2 @@ -0,0 +1,86 @@ +#!/usr/bin/env bash +# DECNET bootstrap installer for agent {{ agent_name }} -> master {{ master_host }}. +# Fetches the code+certs payload, installs, and starts the agent daemon. +# Generated by the master at {{ generated_at }}. Expires {{ expires_at }}. 
+set -euo pipefail + +[[ $EUID -eq 0 ]] || { echo "decnet-install: must run as root (use sudo)"; exit 1; } +for bin in python3 curl tar systemctl; do + command -v "$bin" >/dev/null || { echo "decnet-install: $bin required"; exit 1; } +done + +WORK="$(mktemp -d)" +trap 'rm -rf "$WORK"' EXIT + +echo "[DECNET] fetching payload..." +curl -fsSL "{{ tarball_url }}" | tar -xz -C "$WORK" + +INSTALL_DIR=/opt/decnet +RELEASE_DIR="$INSTALL_DIR/releases/active" +VENV_DIR="$INSTALL_DIR/venv" +# Mirror the updater's layout from day one so `decnet updater` can rotate +# releases/active in-place and the shared venv is the thing on PATH. +mkdir -p "$RELEASE_DIR" +cp -a "$WORK/." "$RELEASE_DIR/" +ln -sfn "$RELEASE_DIR" "$INSTALL_DIR/current" +cd "$RELEASE_DIR" + +echo "[DECNET] building shared venv at $VENV_DIR..." +python3 -m venv "$VENV_DIR" +"$VENV_DIR/bin/pip" install -q --upgrade pip +"$VENV_DIR/bin/pip" install -q "$RELEASE_DIR" + +install -Dm0644 etc/decnet/decnet.ini /etc/decnet/decnet.ini +[[ -f services.ini ]] && install -Dm0644 services.ini /etc/decnet/services.ini + +# Log directory the baked-in INI points at — must exist before `decnet` imports config. +install -d -m0755 /var/log/decnet + +# Certs live under /etc/decnet/ (root-owned, 0600 keys) — this is a root +# daemon's data, not a user's. The baked INI's `agent-dir`/`updater-dir` +# point at these paths. +for f in ca.crt worker.crt worker.key; do + install -Dm0600 -o root -g root \ + "home/.decnet/agent/$f" "/etc/decnet/agent/$f" +done +chmod 0755 /etc/decnet/agent + +WITH_UPDATER="{{ with_updater }}" +if [[ "$WITH_UPDATER" == "true" && -d home/.decnet/updater ]]; then + for f in ca.crt updater.crt updater.key; do + install -Dm0600 -o root -g root \ + "home/.decnet/updater/$f" "/etc/decnet/updater/$f" + done + chmod 0755 /etc/decnet/updater +fi + +# Guarantee the pip-installed entrypoint is executable (some setuptools+editable +# combos drop it with mode 0644) and expose it on PATH. 
+chmod 0755 "$VENV_DIR/bin/decnet" +ln -sf "$VENV_DIR/bin/decnet" /usr/local/bin/decnet + +echo "[DECNET] installing systemd units..." +for unit in \ + decnet-agent decnet-forwarder decnet-engine \ + decnet-collector decnet-prober decnet-sniffer; do + install -Dm0644 "etc/systemd/system/${unit}.service" "/etc/systemd/system/${unit}.service" +done +if [[ "$WITH_UPDATER" == "true" ]]; then + install -Dm0644 etc/systemd/system/decnet-updater.service /etc/systemd/system/decnet-updater.service +fi +systemctl daemon-reload + +# Agent + forwarder are the control plane; collector/prober/profiler/sniffer +# are the per-host microservices that used to require `decnet deploy` to +# auto-spawn. With systemd units they come up at boot and auto-restart. +ACTIVE_UNITS=( + decnet-agent.service decnet-forwarder.service + decnet-collector.service decnet-prober.service + decnet-sniffer.service +) +if [[ "$WITH_UPDATER" == "true" ]]; then + ACTIVE_UNITS+=(decnet-updater.service) +fi +systemctl enable --now "${ACTIVE_UNITS[@]}" + +echo "[DECNET] agent {{ agent_name }} enrolled -> {{ master_host }}. Units: ${ACTIVE_UNITS[*]} active." 
diff --git a/decnet_web/src/App.tsx b/decnet_web/src/App.tsx index 8748ef2..50bdefc 100644 --- a/decnet_web/src/App.tsx +++ b/decnet_web/src/App.tsx @@ -6,18 +6,37 @@ import Dashboard from './components/Dashboard'; import DeckyFleet from './components/DeckyFleet'; import LiveLogs from './components/LiveLogs'; import Attackers from './components/Attackers'; +import AttackerDetail from './components/AttackerDetail'; import Config from './components/Config'; import Bounty from './components/Bounty'; +import RemoteUpdates from './components/RemoteUpdates'; +import SwarmHosts from './components/SwarmHosts'; +import AgentEnrollment from './components/AgentEnrollment'; + +function isTokenValid(token: string): boolean { + try { + const payload = JSON.parse(atob(token.split('.')[1].replace(/-/g, '+').replace(/_/g, '/'))); + return typeof payload.exp === 'number' && payload.exp * 1000 > Date.now(); + } catch { + return false; + } +} + +function getValidToken(): string | null { + const stored = localStorage.getItem('token'); + if (stored && isTokenValid(stored)) return stored; + if (stored) localStorage.removeItem('token'); + return null; +} function App() { - const [token, setToken] = useState(localStorage.getItem('token')); + const [token, setToken] = useState(getValidToken); const [searchQuery, setSearchQuery] = useState(''); useEffect(() => { - const savedToken = localStorage.getItem('token'); - if (savedToken) { - setToken(savedToken); - } + const onAuthLogout = () => setToken(null); + window.addEventListener('auth:logout', onAuthLogout); + return () => window.removeEventListener('auth:logout', onAuthLogout); }, []); const handleLogin = (newToken: string) => { @@ -46,7 +65,11 @@ function App() { } /> } /> } /> + } /> } /> + } /> + } /> + } /> } /> diff --git a/decnet_web/src/components/AgentEnrollment.tsx b/decnet_web/src/components/AgentEnrollment.tsx new file mode 100644 index 0000000..f538416 --- /dev/null +++ b/decnet_web/src/components/AgentEnrollment.tsx @@ -0,0 
+1,188 @@ +import React, { useEffect, useRef, useState } from 'react'; +import api from '../utils/api'; +import './Dashboard.css'; +import './Swarm.css'; +import { UserPlus, Copy, RotateCcw, Check, AlertTriangle } from 'lucide-react'; + +interface BundleResult { + token: string; + host_uuid: string; + command: string; + expires_at: string; +} + +const AgentEnrollment: React.FC = () => { + const [masterHost, setMasterHost] = useState(window.location.hostname); + const [agentName, setAgentName] = useState(''); + const [withUpdater, setWithUpdater] = useState(true); + const [useIpvlan, setUseIpvlan] = useState(false); + const [servicesIni, setServicesIni] = useState(null); + const [servicesIniName, setServicesIniName] = useState(null); + const [submitting, setSubmitting] = useState(false); + const [error, setError] = useState(null); + const [result, setResult] = useState(null); + const [copied, setCopied] = useState(false); + const [now, setNow] = useState(Date.now()); + const fileRef = useRef(null); + + useEffect(() => { + const t = setInterval(() => setNow(Date.now()), 1000); + return () => clearInterval(t); + }, []); + + const handleFile = (e: React.ChangeEvent) => { + const f = e.target.files?.[0]; + if (!f) { + setServicesIni(null); + setServicesIniName(null); + return; + } + const reader = new FileReader(); + reader.onload = () => { + setServicesIni(String(reader.result)); + setServicesIniName(f.name); + }; + reader.readAsText(f); + }; + + const reset = () => { + setResult(null); + setError(null); + setAgentName(''); + setWithUpdater(true); + setUseIpvlan(false); + setServicesIni(null); + setServicesIniName(null); + setCopied(false); + if (fileRef.current) fileRef.current.value = ''; + }; + + const submit = async (e: React.FormEvent) => { + e.preventDefault(); + setSubmitting(true); + setError(null); + try { + const res = await api.post('/swarm/enroll-bundle', { + master_host: masterHost, + agent_name: agentName, + with_updater: withUpdater, + use_ipvlan: 
useIpvlan, + services_ini: servicesIni, + }); + setResult(res.data); + } catch (err: any) { + setError(err?.response?.data?.detail || 'Enrollment bundle creation failed'); + } finally { + setSubmitting(false); + } + }; + + const copyCmd = async () => { + if (!result) return; + await navigator.clipboard.writeText(result.command); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + }; + + const nameOk = /^[a-z0-9][a-z0-9-]{0,62}$/.test(agentName); + + const remainingSecs = result ? Math.max(0, Math.floor((new Date(result.expires_at).getTime() - now) / 1000)) : 0; + const mm = Math.floor(remainingSecs / 60).toString().padStart(2, '0'); + const ss = (remainingSecs % 60).toString().padStart(2, '0'); + + return ( +
+
+

Agent Enrollment

+
+ + {!result ? ( +
+

+ Generates a one-shot bootstrap URL valid for 5 minutes. Paste the command into a + root shell on the target worker VM — no manual cert shuffling required. +

+
+ + + + + + {error &&
{error}
} + +
+
+ ) : ( +
+

Paste this on the new worker (as root):

+
{result.command}
+
+ + +
+

+ Expires in {mm}:{ss} — one-shot, single download. Host UUID:{' '} + {result.host_uuid} +

+ {remainingSecs === 0 && ( +
+ This bundle has expired. Generate another. +
+ )} +
+ )} +
+ ); +}; + +export default AgentEnrollment; diff --git a/decnet_web/src/components/ArtifactDrawer.tsx b/decnet_web/src/components/ArtifactDrawer.tsx new file mode 100644 index 0000000..491ec9c --- /dev/null +++ b/decnet_web/src/components/ArtifactDrawer.tsx @@ -0,0 +1,186 @@ +import React, { useState } from 'react'; +import { X, Download, AlertTriangle } from 'lucide-react'; +import api from '../utils/api'; + +interface ArtifactDrawerProps { + decky: string; + storedAs: string; + fields: Record; + onClose: () => void; +} + +// Bulky nested structures are shipped as one base64-encoded JSON blob in +// `meta_json_b64` (see templates/ssh/emit_capture.py). All summary fields +// arrive as top-level SD params already present in `fields`. +function decodeMeta(fields: Record): Record | null { + const b64 = fields.meta_json_b64; + if (typeof b64 !== 'string' || !b64) return null; + try { + const json = atob(b64); + return JSON.parse(json); + } catch (err) { + console.error('artifact: failed to decode meta_json_b64', err); + return null; + } +} + +const Row: React.FC<{ label: string; value: React.ReactNode }> = ({ label, value }) => ( +
+
{label}
+
{value ?? }
+
+); + +const ArtifactDrawer: React.FC = ({ decky, storedAs, fields, onClose }) => { + const [downloading, setDownloading] = useState(false); + const [error, setError] = useState(null); + const meta = decodeMeta(fields); + + const handleDownload = async () => { + setDownloading(true); + setError(null); + try { + const res = await api.get( + `/artifacts/${encodeURIComponent(decky)}/${encodeURIComponent(storedAs)}`, + { responseType: 'blob' }, + ); + const blobUrl = URL.createObjectURL(res.data); + const a = document.createElement('a'); + a.href = blobUrl; + a.download = storedAs; + document.body.appendChild(a); + a.click(); + document.body.removeChild(a); + URL.revokeObjectURL(blobUrl); + } catch (err: any) { + const status = err?.response?.status; + setError( + status === 403 ? 'Admin role required to download artifacts.' : + status === 404 ? 'Artifact not found on disk (may have been purged).' : + status === 400 ? 'Server rejected the request (invalid parameters).' : + 'Download failed — see console.' + ); + console.error('artifact download failed', err); + } finally { + setDownloading(false); + } + }; + + const concurrent = meta?.concurrent_sessions; + const ssSnapshot = meta?.ss_snapshot; + + return ( +
+
e.stopPropagation()} + style={{ + width: 'min(620px, 100%)', height: '100%', + backgroundColor: 'var(--bg-color, #0d1117)', + borderLeft: '1px solid var(--border-color, #30363d)', + padding: '24px', overflowY: 'auto', + color: 'var(--text-color)', + }} + > +
+
+
+ CAPTURED ARTIFACT · {decky} +
+
+ {storedAs} +
+
+ +
+ +
+ + Attacker-controlled content. Download at your own risk. +
+ + + {error && ( +
{error}
+ )} + +
+

+ ORIGIN +

+ + + + +
+ +
+

+ ATTRIBUTION · {fields.attribution ?? 'unknown'} +

+ + + + + + + + + +
+ + {Array.isArray(concurrent) && concurrent.length > 0 && ( +
+

+ CONCURRENT SESSIONS ({concurrent.length}) +

+
+              {JSON.stringify(concurrent, null, 2)}
+            
+
+ )} + + {Array.isArray(ssSnapshot) && ssSnapshot.length > 0 && ( +
+

+ SS SNAPSHOT ({ssSnapshot.length}) +

+
+              {JSON.stringify(ssSnapshot, null, 2)}
+            
+
+ )} +
+
+ ); +}; + +export default ArtifactDrawer; diff --git a/decnet_web/src/components/AttackerDetail.tsx b/decnet_web/src/components/AttackerDetail.tsx new file mode 100644 index 0000000..1d0d5af --- /dev/null +++ b/decnet_web/src/components/AttackerDetail.tsx @@ -0,0 +1,1177 @@ +import React, { useEffect, useState } from 'react'; +import { useParams, useNavigate } from 'react-router-dom'; +import { Activity, ArrowLeft, ChevronDown, ChevronLeft, ChevronRight, ChevronUp, Crosshair, Fingerprint, Shield, Clock, Wifi, Lock, FileKey, Radio, Timer, Paperclip } from 'lucide-react'; +import api from '../utils/api'; +import ArtifactDrawer from './ArtifactDrawer'; +import './Dashboard.css'; + +interface AttackerBehavior { + os_guess: string | null; + hop_distance: number | null; + tcp_fingerprint: { + window?: number | null; + wscale?: number | null; + mss?: number | null; + options_sig?: string; + has_sack?: boolean; + has_timestamps?: boolean; + } | null; + retransmit_count: number; + behavior_class: string | null; + beacon_interval_s: number | null; + beacon_jitter_pct: number | null; + tool_guesses: string[] | null; + timing_stats: { + event_count?: number; + duration_s?: number; + mean_iat_s?: number | null; + median_iat_s?: number | null; + stdev_iat_s?: number | null; + min_iat_s?: number | null; + max_iat_s?: number | null; + cv?: number | null; + } | null; + phase_sequence: { + recon_end_ts?: string | null; + exfil_start_ts?: string | null; + exfil_latency_s?: number | null; + large_payload_count?: number; + } | null; + updated_at?: string; +} + +interface AttackerData { + uuid: string; + ip: string; + first_seen: string; + last_seen: string; + event_count: number; + service_count: number; + decky_count: number; + services: string[]; + deckies: string[]; + traversal_path: string | null; + is_traversal: boolean; + bounty_count: number; + credential_count: number; + fingerprints: any[]; + commands: { service: string; decky: string; command: string; timestamp: string }[]; 
+ updated_at: string; + behavior: AttackerBehavior | null; +} + +// ─── Fingerprint rendering ─────────────────────────────────────────────────── + +const fpTypeLabel: Record = { + ja3: 'TLS FINGERPRINT', + ja4l: 'LATENCY (JA4L)', + tls_resumption: 'SESSION RESUMPTION', + tls_certificate: 'CERTIFICATE', + http_useragent: 'HTTP USER-AGENT', + vnc_client_version: 'VNC CLIENT', + jarm: 'JARM', + hassh_server: 'HASSH SERVER', + tcpfp: 'TCP/IP STACK', +}; + +const fpTypeIcon: Record = { + ja3: , + ja4l: , + tls_resumption: , + tls_certificate: , + http_useragent: , + vnc_client_version: , + jarm: , + hassh_server: , + tcpfp: , +}; + +function getPayload(bounty: any): any { + if (bounty?.payload && typeof bounty.payload === 'object') return bounty.payload; + if (bounty?.payload && typeof bounty.payload === 'string') { + try { return JSON.parse(bounty.payload); } catch { return bounty; } + } + return bounty; +} + +const HashRow: React.FC<{ label: string; value?: string | null }> = ({ label, value }) => { + if (!value) return null; + return ( +
+ {label} + + {value} + +
+ ); +}; + +const Tag: React.FC<{ children: React.ReactNode; color?: string }> = ({ children, color }) => ( + + {children} + +); + +const FpTlsHashes: React.FC<{ p: any }> = ({ p }) => ( +
+ + + + + {(p.tls_version || p.sni || p.alpn) && ( +
+ {p.tls_version && {p.tls_version}} + {p.sni && SNI: {p.sni}} + {p.alpn && ALPN: {p.alpn}} + {p.dst_port && :{p.dst_port}} +
+ )} +
+); + +const FpLatency: React.FC<{ p: any }> = ({ p }) => ( +
+
+ RTT + + {p.rtt_ms} + + ms +
+ {p.client_ttl && ( +
+ TTL + + {p.client_ttl} + +
+ )} +
+); + +const FpResumption: React.FC<{ p: any }> = ({ p }) => { + const mechanisms = typeof p.mechanisms === 'string' + ? p.mechanisms.split(',') + : Array.isArray(p.mechanisms) ? p.mechanisms : []; + return ( +
+ {mechanisms.map((m: string) => ( + {m.trim().toUpperCase().replace(/_/g, ' ')} + ))} +
+ ); +}; + +const FpCertificate: React.FC<{ p: any }> = ({ p }) => ( +
+
+ + {p.subject_cn} + + {p.self_signed === 'true' && ( + SELF-SIGNED + )} +
+ {p.issuer && ( +
+ ISSUER: + {p.issuer} +
+ )} + {(p.not_before || p.not_after) && ( +
+ VALIDITY: + + {p.not_before || '?'} — {p.not_after || '?'} + +
+ )} + {p.sans && ( +
+ SANs: + {(typeof p.sans === 'string' ? p.sans.split(',') : p.sans).map((san: string) => ( + {san.trim()} + ))} +
+ )} +
+); + +const FpJarm: React.FC<{ p: any }> = ({ p }) => ( +
+ + {(p.target_ip || p.target_port) && ( +
+ {p.target_ip && {p.target_ip}} + {p.target_port && :{p.target_port}} +
+ )} +
+); + +const FpHassh: React.FC<{ p: any }> = ({ p }) => ( +
+ + {p.ssh_banner && ( +
+ BANNER: + {p.ssh_banner} +
+ )} + {p.kex_algorithms && ( +
+ + KEX ALGORITHMS + +
+ {p.kex_algorithms.split(',').map((algo: string) => ( + {algo.trim()} + ))} +
+
+ )} + {p.encryption_s2c && ( +
+ + ENCRYPTION (S→C) + +
+ {p.encryption_s2c.split(',').map((algo: string) => ( + {algo.trim()} + ))} +
+
+ )} + {(p.target_ip || p.target_port) && ( +
+ {p.target_ip && {p.target_ip}} + {p.target_port && :{p.target_port}} +
+ )} +
+); + +const FpTcpStack: React.FC<{ p: any }> = ({ p }) => ( +
+ +
+ {p.ttl && ( +
+ TTL + {p.ttl} +
+ )} + {p.window_size && ( +
+ WIN + {p.window_size} +
+ )} + {p.mss && ( +
+ MSS + {p.mss} +
+ )} +
+
+ {p.df_bit === '1' && DF} + {p.sack_ok === '1' && SACK} + {p.timestamp === '1' && TS} + {p.window_scale && p.window_scale !== '-1' && WSCALE:{p.window_scale}} +
+ {p.options_order && ( +
+ OPTS: + {p.options_order} +
+ )} + {(p.target_ip || p.target_port) && ( +
+ {p.target_ip && {p.target_ip}} + {p.target_port && :{p.target_port}} +
+ )} +
+); + +const FpGeneric: React.FC<{ p: any }> = ({ p }) => ( +
+ {p.value ? ( + + {p.value} + + ) : ( + + {JSON.stringify(p)} + + )} +
+); + +const FingerprintGroup: React.FC<{ fpType: string; items: any[] }> = ({ fpType, items }) => { + const label = fpTypeLabel[fpType] || fpType.toUpperCase().replace(/_/g, ' '); + const icon = fpTypeIcon[fpType] || ; + + return ( +
+
+ {icon} + {label} + {items.length > 1 && ( + ({items.length}) + )} +
+
+ {items.map((fp, i) => { + const p = getPayload(fp); + switch (fpType) { + case 'ja3': return ; + case 'ja4l': return ; + case 'tls_resumption': return ; + case 'tls_certificate': return ; + case 'jarm': return ; + case 'hassh_server': return ; + case 'tcpfp': return ; + default: return ; + } + })} +
+
+ ); +}; + +// ─── Behavioral profile blocks ────────────────────────────────────────────── + +const OS_LABELS: Record = { + linux: 'LINUX', + windows: 'WINDOWS', + macos_ios: 'macOS / iOS', + freebsd: 'FREEBSD', + openbsd: 'OPENBSD', + embedded: 'EMBEDDED', + nmap: 'NMAP (SCANNER)', + unknown: 'UNKNOWN', +}; + +const BEHAVIOR_LABELS: Record = { + beaconing: 'BEACONING', + interactive: 'INTERACTIVE', + scanning: 'SCANNING', + brute_force: 'BRUTE FORCE', + slow_scan: 'SLOW SCAN', + mixed: 'MIXED', + unknown: 'UNKNOWN', +}; + +const BEHAVIOR_COLORS: Record = { + beaconing: '#ff6b6b', + interactive: 'var(--accent-color)', + scanning: '#e5c07b', + brute_force: '#ff9f43', + slow_scan: '#c8a96e', + mixed: 'var(--text-color)', + unknown: 'var(--text-color)', +}; + +const TOOL_LABELS: Record = { + cobalt_strike: 'COBALT STRIKE', + sliver: 'SLIVER', + havoc: 'HAVOC', + mythic: 'MYTHIC', + nmap: 'NMAP', + gophish: 'GOPHISH', + nikto: 'NIKTO', + sqlmap: 'SQLMAP', + nuclei: 'NUCLEI', + masscan: 'MASSCAN', + zgrab: 'ZGRAB', + metasploit: 'METASPLOIT', + gobuster: 'GOBUSTER', + dirbuster: 'DIRBUSTER', + hydra: 'HYDRA', + wfuzz: 'WFUZZ', + curl: 'CURL', + python_requests: 'PYTHON-REQUESTS', +}; + +const fmtOpt = (v: number | null | undefined): string => + v === null || v === undefined ? '—' : String(v); + +const fmtSecs = (v: number | null | undefined): string => { + if (v === null || v === undefined) return '—'; + if (v < 1) return `${(v * 1000).toFixed(0)} ms`; + if (v < 60) return `${v.toFixed(2)} s`; + if (v < 3600) return `${(v / 60).toFixed(2)} m`; + return `${(v / 3600).toFixed(2)} h`; +}; + +const StatBlock: React.FC<{ label: string; value: React.ReactNode; color?: string }> = ({ + label, value, color, +}) => ( +
+
+ {value} +
+
{label}
+
+); + +const KeyValueRow: React.FC<{ label: string; value: React.ReactNode }> = ({ label, value }) => ( +
+ + {label} + + + {value} + +
+); + +// Tools detected via beacon timing (C2 frameworks). +const _C2_TOOLS = new Set(['cobalt_strike', 'sliver', 'havoc', 'mythic']); + +const BehaviorHeadline: React.FC<{ b: AttackerBehavior }> = ({ b }) => { + const osLabel = b.os_guess ? (OS_LABELS[b.os_guess] || b.os_guess.toUpperCase()) : '—'; + const behaviorLabel = b.behavior_class + ? (BEHAVIOR_LABELS[b.behavior_class] || b.behavior_class.toUpperCase()) + : 'UNKNOWN'; + const behaviorColor = b.behavior_class ? BEHAVIOR_COLORS[b.behavior_class] : undefined; + return ( +
+ + + +
+ ); +}; + +const DetectedToolsBlock: React.FC<{ b: AttackerBehavior }> = ({ b }) => { + const tools = b.tool_guesses && b.tool_guesses.length > 0 ? b.tool_guesses : null; + if (!tools) return null; + return ( +
+
+ + + DETECTED TOOLS + +
+
+ {tools.map(t => ( +
+ + {TOOL_LABELS[t] || t.toUpperCase()} + + + {_C2_TOOLS.has(t) ? 'BEACON TIMING' : 'HTTP HEADER'} + +
+ ))} +
+
+ ); +}; + +const BeaconBlock: React.FC<{ b: AttackerBehavior }> = ({ b }) => { + if (b.behavior_class !== 'beaconing' || b.beacon_interval_s === null) return null; + return ( +
+
+ + + BEACON CADENCE + +
+
+
+ INTERVAL + + {fmtSecs(b.beacon_interval_s)} + +
+ {b.beacon_jitter_pct !== null && ( +
+ JITTER + + {b.beacon_jitter_pct.toFixed(1)}% + +
+ )} +
+
+ ); +}; + +const TcpStackBlock: React.FC<{ b: AttackerBehavior }> = ({ b }) => { + const fp = b.tcp_fingerprint; + if (!fp || (!fp.window && !fp.mss && !fp.options_sig)) return null; + return ( +
+
+ + + TCP STACK (PASSIVE) + +
+
+
+ {fp.window !== null && fp.window !== undefined && ( +
+ WIN + + {fp.window} + +
+ )} + {fp.wscale !== null && fp.wscale !== undefined && ( +
+ WSCALE + + {fp.wscale} + +
+ )} + {fp.mss !== null && fp.mss !== undefined && ( +
+ MSS + {fp.mss} +
+ )} +
+ RETRANSMITS + 0 ? '#e5c07b' : undefined, + }} + > + {b.retransmit_count} + +
+
+
+ {fp.has_sack && SACK} + {fp.has_timestamps && TS} +
+ {fp.options_sig && ( +
+ OPTS: + + {fp.options_sig} + +
+ )} +
+
+ ); +}; + +const TimingStatsBlock: React.FC<{ b: AttackerBehavior }> = ({ b }) => { + const s = b.timing_stats; + if (!s || !s.event_count || s.event_count < 2) return null; + return ( +
+
+ + + INTER-EVENT TIMING + +
+
+ + + + + + + +
+
+ ); +}; + +const PhaseSequenceBlock: React.FC<{ b: AttackerBehavior }> = ({ b }) => { + const p = b.phase_sequence; + if (!p || (!p.recon_end_ts && !p.exfil_start_ts && !p.large_payload_count)) return null; + return ( +
+
+ + + PHASE SEQUENCE + +
+
+ + + + +
+
+ ); +}; + +// ─── Collapsible section ──────────────────────────────────────────────────── + +const Section: React.FC<{ + title: React.ReactNode; + right?: React.ReactNode; + open: boolean; + onToggle: () => void; + children: React.ReactNode; +}> = ({ title, right, open, onToggle, children }) => ( +
+
+
+ {open ? : } +

{title}

+
+ {right &&
e.stopPropagation()}>{right}
} +
+ {open && children} +
+); + +// ─── Main component ───────────────────────────────────────────────────────── + +const AttackerDetail: React.FC = () => { + const { id } = useParams<{ id: string }>(); + const navigate = useNavigate(); + const [attacker, setAttacker] = useState(null); + const [loading, setLoading] = useState(true); + const [error, setError] = useState(null); + const [serviceFilter, setServiceFilter] = useState(null); + + // Section collapse state + const [openSections, setOpenSections] = useState>({ + timeline: true, + services: true, + deckies: true, + behavior: true, + commands: true, + fingerprints: true, + artifacts: true, + }); + + // Captured file-drop artifacts (ssh inotify farm) for this attacker. + type ArtifactLog = { + id: number; + timestamp: string; + decky: string; + service: string; + fields: string; // JSON-encoded SD params (parsed lazily below) + }; + const [artifacts, setArtifacts] = useState([]); + const [artifact, setArtifact] = useState<{ decky: string; storedAs: string; fields: Record } | null>(null); + const toggle = (key: string) => setOpenSections((prev) => ({ ...prev, [key]: !prev[key] })); + + // Commands pagination state + const [commands, setCommands] = useState([]); + const [cmdTotal, setCmdTotal] = useState(0); + const [cmdPage, setCmdPage] = useState(1); + const cmdLimit = 50; + + useEffect(() => { + const fetchAttacker = async () => { + setLoading(true); + try { + const res = await api.get(`/attackers/${id}`); + setAttacker(res.data); + } catch (err: any) { + if (err.response?.status === 404) { + setError('ATTACKER NOT FOUND'); + } else { + setError('FAILED TO LOAD ATTACKER PROFILE'); + } + } finally { + setLoading(false); + } + }; + fetchAttacker(); + }, [id]); + + useEffect(() => { + if (!id) return; + const fetchCommands = async () => { + try { + const offset = (cmdPage - 1) * cmdLimit; + let url = `/attackers/${id}/commands?limit=${cmdLimit}&offset=${offset}`; + if (serviceFilter) url += `&service=${encodeURIComponent(serviceFilter)}`; 
+ const res = await api.get(url); + setCommands(res.data.data); + setCmdTotal(res.data.total); + } catch (err: any) { + if (err.response?.status === 422) { + alert("Fuck off."); + } + setCommands([]); + setCmdTotal(0); + } + }; + fetchCommands(); + }, [id, cmdPage, serviceFilter]); + + // Reset command page when service filter changes + useEffect(() => { + setCmdPage(1); + }, [serviceFilter]); + + useEffect(() => { + if (!id) return; + const fetchArtifacts = async () => { + try { + const res = await api.get(`/attackers/${id}/artifacts`); + setArtifacts(res.data.data ?? []); + } catch { + setArtifacts([]); + } + }; + fetchArtifacts(); + }, [id]); + + if (loading) { + return ( +
+
+ LOADING THREAT PROFILE... +
+
+ ); + } + + if (error || !attacker) { + return ( +
+ +
+ {error || 'ATTACKER NOT FOUND'} +
+
+ ); + } + + return ( +
+ {/* Back Button */} + + + {/* Header */} +
+ +

+ {attacker.ip} +

+ {attacker.is_traversal && ( + TRAVERSAL + )} +
+ + {/* Stats Row */} +
+
+
{attacker.event_count}
+
EVENTS
+
+
+
{attacker.bounty_count}
+
BOUNTIES
+
+
+
{attacker.credential_count}
+
CREDENTIALS
+
+
+
{attacker.service_count}
+
SERVICES
+
+
+
{attacker.decky_count}
+
DECKIES
+
+
+ + {/* Timestamps */} +
toggle('timeline')}> +
+
+ FIRST SEEN: + {new Date(attacker.first_seen).toLocaleString()} +
+
+ LAST SEEN: + {new Date(attacker.last_seen).toLocaleString()} +
+
+ UPDATED: + {new Date(attacker.updated_at).toLocaleString()} +
+
+
+ + {/* Services */} +
toggle('services')}> +
+ {attacker.services.length > 0 ? attacker.services.map((svc) => { + const isActive = serviceFilter === svc; + return ( + setServiceFilter(isActive ? null : svc)} + title={isActive ? 'Clear filter' : `Filter by ${svc.toUpperCase()}`} + > + {svc.toUpperCase()} + + ); + }) : ( + No services recorded + )} +
+
+ + {/* Deckies & Traversal */} +
toggle('deckies')}> +
+ {attacker.traversal_path ? ( +
+ TRAVERSAL PATH: + {attacker.traversal_path} +
+ ) : ( +
+ {attacker.deckies.map((d) => ( + + {d} + + ))} + {attacker.deckies.length === 0 && No deckies recorded} +
+ )} +
+
+ + {/* Behavioral Profile */} +
toggle('behavior')} + > + {attacker.behavior ? ( +
+ +
+ + + + + +
+
+ ) : ( +
+ NO BEHAVIORAL DATA YET — PROFILER HAS NOT RUN FOR THIS ATTACKER +
+ )} +
+ + {/* Commands */} + {(() => { + const cmdTotalPages = Math.ceil(cmdTotal / cmdLimit); + return ( +
COMMANDS ({cmdTotal}{serviceFilter ? ` ${serviceFilter.toUpperCase()}` : ''})} + open={openSections.commands} + onToggle={() => toggle('commands')} + right={openSections.commands && cmdTotalPages > 1 ? ( +
+ + Page {cmdPage} of {cmdTotalPages} + +
+ + +
+
+ ) : undefined} + > + {commands.length > 0 ? ( +
+ + + + + + + + + + + {commands.map((cmd, i) => ( + + + + + + + ))} + +
TIMESTAMPSERVICEDECKYCOMMAND
+ {cmd.timestamp ? new Date(cmd.timestamp).toLocaleString() : '-'} + {cmd.service}{cmd.decky}{cmd.command}
+
+ ) : ( +
+ {serviceFilter ? `NO ${serviceFilter.toUpperCase()} COMMANDS CAPTURED` : 'NO COMMANDS CAPTURED'} +
+ )} +
+ ); + })()} + + {/* Fingerprints — grouped by type */} + {(() => { + const filteredFps = serviceFilter + ? attacker.fingerprints.filter((fp) => { + const p = getPayload(fp); + return p.service === serviceFilter; + }) + : attacker.fingerprints; + + // Group fingerprints by type + const groups: Record = {}; + filteredFps.forEach((fp) => { + const p = getPayload(fp); + const fpType: string = p.fingerprint_type || 'unknown'; + if (!groups[fpType]) groups[fpType] = []; + groups[fpType].push(fp); + }); + + // Active probes first, then passive, then unknown + const activeTypes = ['jarm', 'hassh_server', 'tcpfp']; + const passiveTypes = ['ja3', 'ja4l', 'tls_resumption', 'tls_certificate', 'http_useragent', 'vnc_client_version']; + const knownTypes = [...activeTypes, ...passiveTypes]; + const unknownTypes = Object.keys(groups).filter((t) => !knownTypes.includes(t)); + + const hasActive = activeTypes.some((t) => groups[t]); + const hasPassive = [...passiveTypes, ...unknownTypes].some((t) => groups[t]); + + return ( +
FINGERPRINTS ({filteredFps.length}{serviceFilter ? ` / ${attacker.fingerprints.length}` : ''})} + open={openSections.fingerprints} + onToggle={() => toggle('fingerprints')} + > + {filteredFps.length > 0 ? ( +
+ {/* Active probes section */} + {hasActive && ( +
+
+ + ACTIVE PROBES +
+
+ {activeTypes.filter((t) => groups[t]).map((fpType) => ( + + ))} +
+
+ )} + + {/* Passive fingerprints section */} + {hasPassive && ( +
+
+ + PASSIVE FINGERPRINTS +
+
+ {[...passiveTypes, ...unknownTypes].filter((t) => groups[t]).map((fpType) => ( + + ))} +
+
+ )} +
+ ) : ( +
+ {serviceFilter ? `NO ${serviceFilter.toUpperCase()} FINGERPRINTS CAPTURED` : 'NO FINGERPRINTS CAPTURED'} +
+ )} +
+ ); + })()} + + {/* Captured Artifacts */} +
CAPTURED ARTIFACTS ({artifacts.length})} + open={openSections.artifacts} + onToggle={() => toggle('artifacts')} + > + {artifacts.length > 0 ? ( +
+ + + + + + + + + + + + + {artifacts.map((row) => { + let fields: Record = {}; + try { fields = JSON.parse(row.fields || '{}'); } catch {} + const storedAs = fields.stored_as ? String(fields.stored_as) : null; + const sha = fields.sha256 ? String(fields.sha256) : ''; + return ( + + + + + + + + + ); + })} + +
TIMESTAMPDECKYFILENAMESIZESHA-256
+ {new Date(row.timestamp).toLocaleString()} + {row.decky} + {fields.orig_path ?? storedAs ?? '—'} + + {fields.size ? `${fields.size} B` : '—'} + + {sha ? `${sha.slice(0, 12)}…` : '—'} + + {storedAs && ( + + )} +
+
+ ) : ( +
+ NO ARTIFACTS CAPTURED FROM THIS ATTACKER +
+ )} +
+ + {artifact && ( + setArtifact(null)} + /> + )} + + {/* UUID footer */} +
+ UUID: {attacker.uuid} +
+
+ ); +}; + +export default AttackerDetail; diff --git a/decnet_web/src/components/Attackers.tsx b/decnet_web/src/components/Attackers.tsx index 0ed1ce9..24e8577 100644 --- a/decnet_web/src/components/Attackers.tsx +++ b/decnet_web/src/components/Attackers.tsx @@ -1,17 +1,264 @@ -import React from 'react'; -import { Activity } from 'lucide-react'; +import React, { useEffect, useState } from 'react'; +import { useSearchParams, useNavigate } from 'react-router-dom'; +import { Crosshair, Search, ChevronLeft, ChevronRight, Filter } from 'lucide-react'; +import api from '../utils/api'; import './Dashboard.css'; +interface AttackerEntry { + uuid: string; + ip: string; + first_seen: string; + last_seen: string; + event_count: number; + service_count: number; + decky_count: number; + services: string[]; + deckies: string[]; + traversal_path: string | null; + is_traversal: boolean; + bounty_count: number; + credential_count: number; + fingerprints: any[]; + commands: any[]; + updated_at: string; +} + +function timeAgo(dateStr: string): string { + const diff = Date.now() - new Date(dateStr).getTime(); + const mins = Math.floor(diff / 60000); + if (mins < 1) return 'just now'; + if (mins < 60) return `${mins}m ago`; + const hrs = Math.floor(mins / 60); + if (hrs < 24) return `${hrs}h ago`; + const days = Math.floor(hrs / 24); + return `${days}d ago`; +} + const Attackers: React.FC = () => { + const navigate = useNavigate(); + const [searchParams, setSearchParams] = useSearchParams(); + const query = searchParams.get('q') || ''; + const sortBy = searchParams.get('sort_by') || 'recent'; + const serviceFilter = searchParams.get('service') || ''; + const page = parseInt(searchParams.get('page') || '1'); + + const [attackers, setAttackers] = useState([]); + const [total, setTotal] = useState(0); + const [loading, setLoading] = useState(true); + const [searchInput, setSearchInput] = useState(query); + + const limit = 50; + + const fetchAttackers = async () => { + setLoading(true); + 
try { + const offset = (page - 1) * limit; + let url = `/attackers?limit=${limit}&offset=${offset}&sort_by=${sortBy}`; + if (query) url += `&search=${encodeURIComponent(query)}`; + if (serviceFilter) url += `&service=${encodeURIComponent(serviceFilter)}`; + + const res = await api.get(url); + setAttackers(res.data.data); + setTotal(res.data.total); + } catch (err) { + console.error('Failed to fetch attackers', err); + } finally { + setLoading(false); + } + }; + + useEffect(() => { + fetchAttackers(); + }, [query, sortBy, serviceFilter, page]); + + const _params = (overrides: Record = {}) => { + const base: Record = { q: query, sort_by: sortBy, service: serviceFilter, page: '1' }; + return Object.fromEntries(Object.entries({ ...base, ...overrides }).filter(([, v]) => v !== '')); + }; + + const handleSearch = (e: React.FormEvent) => { + e.preventDefault(); + setSearchParams(_params({ q: searchInput })); + }; + + const setPage = (p: number) => { + setSearchParams(_params({ page: p.toString() })); + }; + + const setSort = (s: string) => { + setSearchParams(_params({ sort_by: s })); + }; + + const clearService = () => { + setSearchParams(_params({ service: '' })); + }; + + const totalPages = Math.ceil(total / limit); + return ( -
-
- -

ATTACKER PROFILES

+
+ {/* Page Header */} +
+
+ +

ATTACKER PROFILES

+
+ +
+
+ + +
+ +
+ + setSearchInput(e.target.value)} + style={{ background: 'transparent', border: 'none', padding: '4px', fontSize: '0.8rem', width: '200px' }} + /> + +
-
-

NO ACTIVE THREATS PROFILED YET.

-

(Attackers view placeholder)

+ + {/* Summary & Pagination */} +
+
+
+ {total} THREATS PROFILED + {serviceFilter && ( + + )} +
+ +
+ + Page {page} of {totalPages || 1} + +
+ + +
+
+
+ + {/* Card Grid */} + {loading ? ( +
+ SCANNING THREAT PROFILES... +
+ ) : attackers.length === 0 ? ( +
+ NO ACTIVE THREATS PROFILED YET +
+ ) : ( +
+ {attackers.map((a) => { + const lastCmd = a.commands.length > 0 + ? a.commands[a.commands.length - 1] + : null; + + return ( +
navigate(`/attackers/${a.uuid}`)} + > + {/* Header row */} +
+ {a.ip} + {a.is_traversal && ( + TRAVERSAL + )} +
+ + {/* Timestamps */} +
+ First: {new Date(a.first_seen).toLocaleDateString()} + Last: {timeAgo(a.last_seen)} +
+ + {/* Counts */} +
+ Events: {a.event_count} + Bounties: {a.bounty_count} + Creds: {a.credential_count} +
+ + {/* Services */} +
+ {a.services.map((svc) => ( + { e.stopPropagation(); setSearchParams(_params({ service: svc })); }} + > + {svc.toUpperCase()} + + ))} +
+ + {/* Deckies / Traversal Path */} + {a.traversal_path ? ( +
+ Path: {a.traversal_path} +
+ ) : a.deckies.length > 0 ? ( +
+ Deckies: {a.deckies.join(', ')} +
+ ) : null} + + {/* Commands & Fingerprints */} +
+ Cmds: {a.commands.length} + Fingerprints: {a.fingerprints.length} +
+ + {/* Last command preview */} + {lastCmd && ( +
+ Last cmd: {lastCmd.command} +
+ )} +
+ ); + })} +
+ )}
); diff --git a/decnet_web/src/components/Bounty.tsx b/decnet_web/src/components/Bounty.tsx index 29c11c9..895918c 100644 --- a/decnet_web/src/components/Bounty.tsx +++ b/decnet_web/src/components/Bounty.tsx @@ -14,6 +14,118 @@ interface BountyEntry { payload: any; } +const _FINGERPRINT_LABELS: Record = { + fingerprint_type: 'TYPE', + ja3: 'JA3', + ja3s: 'JA3S', + ja4: 'JA4', + ja4s: 'JA4S', + ja4l: 'JA4L', + sni: 'SNI', + alpn: 'ALPN', + dst_port: 'PORT', + mechanisms: 'MECHANISM', + raw_ciphers: 'CIPHERS', + hash: 'HASH', + target_ip: 'TARGET', + target_port: 'PORT', + ssh_banner: 'BANNER', + kex_algorithms: 'KEX', + encryption_s2c: 'ENC (S→C)', + mac_s2c: 'MAC (S→C)', + compression_s2c: 'COMP (S→C)', + raw: 'RAW', + ttl: 'TTL', + window_size: 'WINDOW', + df_bit: 'DF', + mss: 'MSS', + window_scale: 'WSCALE', + sack_ok: 'SACK', + timestamp: 'TS', + options_order: 'OPTS ORDER', +}; + +const _TAG_STYLE: React.CSSProperties = { + fontSize: '0.65rem', + padding: '1px 6px', + borderRadius: '3px', + border: '1px solid rgba(238, 130, 238, 0.4)', + backgroundColor: 'rgba(238, 130, 238, 0.08)', + color: 'var(--accent-color)', + whiteSpace: 'nowrap', + flexShrink: 0, +}; + +const _HASH_STYLE: React.CSSProperties = { + fontSize: '0.75rem', + fontFamily: 'monospace', + opacity: 0.85, + wordBreak: 'break-all', +}; + +const FingerprintPayload: React.FC<{ payload: any }> = ({ payload }) => { + if (!payload || typeof payload !== 'object') { + return {JSON.stringify(payload)}; + } + + // For simple payloads like tls_resumption with just type + mechanism + const keys = Object.keys(payload); + const isSimple = keys.length <= 3; + + if (isSimple) { + return ( +
+ {keys.map((k) => { + const val = payload[k]; + if (val === null || val === undefined) return null; + const label = _FINGERPRINT_LABELS[k] || k.toUpperCase(); + return ( + + {label} + {String(val)} + + ); + })} +
+ ); + } + + // Full fingerprint — show priority fields as labeled rows + const priorityKeys = ['fingerprint_type', 'ja3', 'ja3s', 'ja4', 'ja4s', 'ja4l', 'sni', 'alpn', 'dst_port', 'mechanisms', 'hash', 'target_ip', 'target_port', 'ssh_banner', 'ttl', 'window_size', 'mss', 'options_order']; + const shown = priorityKeys.filter((k) => payload[k] !== undefined && payload[k] !== null); + const rest = keys.filter((k) => !priorityKeys.includes(k) && payload[k] !== null && payload[k] !== undefined); + + return ( +
+ {shown.map((k) => { + const label = _FINGERPRINT_LABELS[k] || k.toUpperCase(); + const val = String(payload[k]); + return ( +
+ {label} + {val} +
+ ); + })} + {rest.length > 0 && ( +
+ + +{rest.length} MORE FIELDS + +
+ {rest.map((k) => ( +
+ {(_FINGERPRINT_LABELS[k] || k).toUpperCase()} + {String(payload[k])} +
+ ))} +
+
+ )} +
+ ); +}; + const Bounty: React.FC = () => { const [searchParams, setSearchParams] = useSearchParams(); const query = searchParams.get('q') || ''; @@ -83,6 +195,7 @@ const Bounty: React.FC = () => { > +
@@ -167,6 +280,8 @@ const Bounty: React.FC = () => { user:{b.payload.username} pass:{b.payload.password}
+ ) : b.bounty_type === 'fingerprint' ? ( + ) : ( {JSON.stringify(b.payload)} )} diff --git a/decnet_web/src/components/Config.css b/decnet_web/src/components/Config.css new file mode 100644 index 0000000..496548b --- /dev/null +++ b/decnet_web/src/components/Config.css @@ -0,0 +1,282 @@ +.config-page { + display: flex; + flex-direction: column; + gap: 24px; +} + +.config-tabs { + display: flex; + gap: 0; + border-bottom: 1px solid var(--border-color); + background-color: var(--secondary-color); +} + +.config-tab { + padding: 12px 24px; + display: flex; + align-items: center; + gap: 8px; + font-size: 0.75rem; + letter-spacing: 1.5px; + border: none; + border-bottom: 2px solid transparent; + background: transparent; + color: var(--text-color); + opacity: 0.5; + cursor: pointer; + transition: all 0.3s ease; +} + +.config-tab:hover { + opacity: 0.8; + background: rgba(0, 255, 65, 0.03); + box-shadow: none; + color: var(--text-color); +} + +.config-tab.active { + opacity: 1; + border-bottom-color: var(--accent-color); + color: var(--text-color); +} + +.config-panel { + background-color: var(--secondary-color); + border: 1px solid var(--border-color); + padding: 32px; +} + +.config-field { + display: flex; + flex-direction: column; + gap: 10px; + margin-bottom: 24px; +} + +.config-field:last-child { + margin-bottom: 0; +} + +.config-label { + font-size: 0.7rem; + letter-spacing: 1px; + opacity: 0.6; +} + +.config-value { + font-size: 1.1rem; + padding: 8px 0; +} + +.config-input-row { + display: flex; + align-items: center; + gap: 12px; +} + +.config-input-row input { + width: 120px; +} + +.config-input-row input[type="text"] { + width: 160px; +} + +.preset-buttons { + display: flex; + gap: 8px; +} + +.preset-btn { + padding: 6px 14px; + font-size: 0.75rem; + opacity: 0.7; +} + +.preset-btn.active { + opacity: 1; + border-color: var(--accent-color); + color: var(--accent-color); +} + +.save-btn { + padding: 8px 20px; + font-weight: bold; + letter-spacing: 1px; + 
display: flex; + align-items: center; + gap: 6px; +} + +.save-btn:disabled { + opacity: 0.3; + cursor: not-allowed; +} + +/* User Management Table */ +.users-table-container { + overflow-x: auto; + margin-bottom: 24px; +} + +.users-table { + width: 100%; + border-collapse: collapse; + font-size: 0.8rem; + text-align: left; +} + +.users-table th { + padding: 12px 24px; + border-bottom: 1px solid var(--border-color); + opacity: 0.5; + font-weight: normal; + font-size: 0.7rem; + letter-spacing: 1px; +} + +.users-table td { + padding: 12px 24px; + border-bottom: 1px solid rgba(48, 54, 61, 0.5); +} + +.users-table tr:hover { + background-color: rgba(0, 255, 65, 0.03); +} + +.user-actions { + display: flex; + gap: 8px; +} + +.action-btn { + padding: 4px 10px; + font-size: 0.7rem; + display: flex; + align-items: center; + gap: 4px; +} + +.action-btn.danger { + border-color: #ff4141; + color: #ff4141; +} + +.action-btn.danger:hover { + background: #ff4141; + color: var(--background-color); + box-shadow: 0 0 10px rgba(255, 65, 65, 0.5); +} + +/* Add User Form */ +.add-user-section { + border-top: 1px solid var(--border-color); + padding-top: 24px; +} + +.add-user-form { + display: flex; + align-items: flex-end; + gap: 16px; + flex-wrap: wrap; +} + +.add-user-form .form-group { + display: flex; + flex-direction: column; + gap: 6px; +} + +.add-user-form label { + font-size: 0.65rem; + letter-spacing: 1px; + opacity: 0.6; +} + +.add-user-form input { + width: 180px; +} + +.add-user-form select { + background: #0d1117; + border: 1px solid var(--border-color); + color: var(--text-color); + padding: 8px 12px; + font-family: inherit; + cursor: pointer; +} + +.add-user-form select:focus { + outline: none; + border-color: var(--text-color); + box-shadow: var(--matrix-green-glow); +} + +.role-select { + background: #0d1117; + border: 1px solid var(--border-color); + color: var(--text-color); + padding: 4px 8px; + font-family: inherit; + font-size: 0.75rem; + cursor: pointer; +} + 
+.role-badge { + font-size: 0.7rem; + padding: 2px 8px; + border: 1px solid; + display: inline-block; +} + +.role-badge.admin { + border-color: var(--accent-color); + color: var(--accent-color); +} + +.role-badge.viewer { + border-color: var(--border-color); + color: var(--text-color); + opacity: 0.6; +} + +.must-change-badge { + font-size: 0.65rem; + color: #ffaa00; + opacity: 0.8; +} + +.config-success { + color: var(--text-color); + font-size: 0.75rem; + padding: 6px 12px; + border: 1px solid var(--text-color); + background: rgba(0, 255, 65, 0.1); + display: inline-block; +} + +.config-error { + color: #ff4141; + font-size: 0.75rem; + padding: 6px 12px; + border: 1px solid #ff4141; + background: rgba(255, 65, 65, 0.1); + display: inline-block; +} + +.confirm-dialog { + display: flex; + align-items: center; + gap: 8px; + font-size: 0.75rem; +} + +.confirm-dialog span { + color: #ff4141; +} + +.interval-hint { + font-size: 0.65rem; + opacity: 0.4; + letter-spacing: 0.5px; +} diff --git a/decnet_web/src/components/Config.tsx b/decnet_web/src/components/Config.tsx index 5c41911..87a7c0c 100644 --- a/decnet_web/src/components/Config.tsx +++ b/decnet_web/src/components/Config.tsx @@ -1,18 +1,516 @@ -import React from 'react'; -import { Settings } from 'lucide-react'; +import React, { useEffect, useState } from 'react'; +import api from '../utils/api'; +import { Settings, Users, Sliders, Trash2, UserPlus, Key, Save, Shield, AlertTriangle } from 'lucide-react'; import './Dashboard.css'; +import './Config.css'; + +interface UserEntry { + uuid: string; + username: string; + role: string; + must_change_password: boolean; +} + +interface ConfigData { + role: string; + deployment_limit: number; + global_mutation_interval: string; + users?: UserEntry[]; + developer_mode?: boolean; +} const Config: React.FC = () => { + const [config, setConfig] = useState(null); + const [loading, setLoading] = useState(true); + const [activeTab, setActiveTab] = useState<'limits' | 'users' | 
'globals'>('limits'); + + // Deployment limit state + const [limitInput, setLimitInput] = useState(''); + const [limitSaving, setLimitSaving] = useState(false); + const [limitMsg, setLimitMsg] = useState<{ type: 'success' | 'error'; text: string } | null>(null); + + // Global mutation interval state + const [intervalInput, setIntervalInput] = useState(''); + const [intervalSaving, setIntervalSaving] = useState(false); + const [intervalMsg, setIntervalMsg] = useState<{ type: 'success' | 'error'; text: string } | null>(null); + + // Add user form state + const [newUsername, setNewUsername] = useState(''); + const [newPassword, setNewPassword] = useState(''); + const [newRole, setNewRole] = useState<'admin' | 'viewer'>('viewer'); + const [addingUser, setAddingUser] = useState(false); + const [userMsg, setUserMsg] = useState<{ type: 'success' | 'error'; text: string } | null>(null); + + // Confirm delete state + const [confirmDelete, setConfirmDelete] = useState(null); + + // Reset password state + const [resetTarget, setResetTarget] = useState(null); + const [resetPassword, setResetPassword] = useState(''); + + // Reinit state + const [confirmReinit, setConfirmReinit] = useState(false); + const [reiniting, setReiniting] = useState(false); + const [reinitMsg, setReinitMsg] = useState<{ type: 'success' | 'error'; text: string } | null>(null); + + const isAdmin = config?.role === 'admin'; + + const fetchConfig = async () => { + try { + const res = await api.get('/config'); + setConfig(res.data); + setLimitInput(String(res.data.deployment_limit)); + setIntervalInput(res.data.global_mutation_interval); + } catch (err) { + console.error('Failed to fetch config', err); + } finally { + setLoading(false); + } + }; + + useEffect(() => { + fetchConfig(); + }, []); + + // If server didn't send users, force tab away from users + useEffect(() => { + if (config && !config.users && activeTab === 'users') { + setActiveTab('limits'); + } + }, [config, activeTab]); + + const 
handleSaveLimit = async () => { + const val = parseInt(limitInput); + if (isNaN(val) || val < 1 || val > 500) { + setLimitMsg({ type: 'error', text: 'VALUE MUST BE 1-500' }); + return; + } + setLimitSaving(true); + setLimitMsg(null); + try { + await api.put('/config/deployment-limit', { deployment_limit: val }); + setLimitMsg({ type: 'success', text: 'DEPLOYMENT LIMIT UPDATED' }); + fetchConfig(); + } catch (err: any) { + setLimitMsg({ type: 'error', text: err.response?.data?.detail || 'UPDATE FAILED' }); + } finally { + setLimitSaving(false); + } + }; + + const handleSaveInterval = async () => { + if (!/^[1-9]\d*[mdMyY]$/.test(intervalInput)) { + setIntervalMsg({ type: 'error', text: 'INVALID FORMAT (e.g. 30m, 1d, 6M)' }); + return; + } + setIntervalSaving(true); + setIntervalMsg(null); + try { + await api.put('/config/global-mutation-interval', { global_mutation_interval: intervalInput }); + setIntervalMsg({ type: 'success', text: 'MUTATION INTERVAL UPDATED' }); + fetchConfig(); + } catch (err: any) { + setIntervalMsg({ type: 'error', text: err.response?.data?.detail || 'UPDATE FAILED' }); + } finally { + setIntervalSaving(false); + } + }; + + const handleAddUser = async (e: React.FormEvent) => { + e.preventDefault(); + if (!newUsername.trim() || !newPassword.trim()) return; + setAddingUser(true); + setUserMsg(null); + try { + await api.post('/config/users', { + username: newUsername.trim(), + password: newPassword, + role: newRole, + }); + setNewUsername(''); + setNewPassword(''); + setNewRole('viewer'); + setUserMsg({ type: 'success', text: 'USER CREATED' }); + fetchConfig(); + } catch (err: any) { + setUserMsg({ type: 'error', text: err.response?.data?.detail || 'CREATE FAILED' }); + } finally { + setAddingUser(false); + } + }; + + const handleDeleteUser = async (uuid: string) => { + try { + await api.delete(`/config/users/${uuid}`); + setConfirmDelete(null); + fetchConfig(); + } catch (err: any) { + alert(err.response?.data?.detail || 'Delete failed'); + } + 
}; + + const handleRoleChange = async (uuid: string, role: string) => { + try { + await api.put(`/config/users/${uuid}/role`, { role }); + fetchConfig(); + } catch (err: any) { + alert(err.response?.data?.detail || 'Role update failed'); + } + }; + + const handleResetPassword = async (uuid: string) => { + if (!resetPassword.trim() || resetPassword.length < 8) { + alert('Password must be at least 8 characters'); + return; + } + try { + await api.put(`/config/users/${uuid}/reset-password`, { new_password: resetPassword }); + setResetTarget(null); + setResetPassword(''); + fetchConfig(); + } catch (err: any) { + alert(err.response?.data?.detail || 'Password reset failed'); + } + }; + + const handleReinit = async () => { + setReiniting(true); + setReinitMsg(null); + try { + const res = await api.delete('/config/reinit'); + const d = res.data.deleted; + setReinitMsg({ type: 'success', text: `PURGED: ${d.logs} logs, ${d.bounties} bounties, ${d.attackers} attacker profiles` }); + setConfirmReinit(false); + } catch (err: any) { + setReinitMsg({ type: 'error', text: err.response?.data?.detail || 'REINIT FAILED' }); + } finally { + setReiniting(false); + } + }; + + if (loading) { + return ( +
+
LOADING CONFIGURATION...
+
+ ); + } + + if (!config) { + return ( +
+
+

FAILED TO LOAD CONFIGURATION

+
+
+ ); + } + + const tabs: { key: string; label: string; icon: React.ReactNode }[] = [ + { key: 'limits', label: 'DEPLOYMENT LIMITS', icon: }, + ...(config.users + ? [{ key: 'users', label: 'USER MANAGEMENT', icon: }] + : []), + { key: 'globals', label: 'GLOBAL VALUES', icon: }, + ]; + return ( -
-
- -

SYSTEM CONFIGURATION

+
+
+
+ +

SYSTEM CONFIGURATION

+
-
-

CONFIGURATION READ-ONLY MODE ACTIVE.

-

(Config view placeholder)

+ +
+ {tabs.map((tab) => ( + + ))}
+ + {/* DEPLOYMENT LIMITS TAB */} + {activeTab === 'limits' && ( +
+
+ MAXIMUM DECKIES PER DEPLOYMENT + {isAdmin ? ( + <> +
+ setLimitInput(e.target.value)} + /> +
+ {[10, 50, 100, 200].map((v) => ( + + ))} +
+ +
+ {limitMsg && ( + + {limitMsg.text} + + )} + + ) : ( + {config.deployment_limit} + )} +
+
+ )} + + {/* USER MANAGEMENT TAB (only if server sent users) */} + {activeTab === 'users' && config.users && ( +
+
+ + + + + + + + + + + {config.users.map((user) => ( + + + + + + + ))} + +
USERNAMEROLESTATUSACTIONS
{user.username} + {user.role.toUpperCase()} + + {user.must_change_password && ( + MUST CHANGE PASSWORD + )} + +
+ {/* Role change dropdown */} + + + {/* Reset password */} + {resetTarget === user.uuid ? ( +
+ setResetPassword(e.target.value)} + style={{ width: '140px' }} + /> + + +
+ ) : ( + + )} + + {/* Delete */} + {confirmDelete === user.uuid ? ( +
+ CONFIRM? + + +
+ ) : ( + + )} +
+
+
+ +
+
+
+ + setNewUsername(e.target.value)} + required + minLength={1} + maxLength={64} + /> +
+
+ + setNewPassword(e.target.value)} + required + minLength={8} + maxLength={72} + /> +
+
+ + +
+ + {userMsg && ( + + {userMsg.text} + + )} +
+
+
+ )} + + {/* GLOBAL VALUES TAB */} + {activeTab === 'globals' && ( +
+
+ GLOBAL MUTATION INTERVAL + {isAdmin ? ( + <> +
+ setIntervalInput(e.target.value)} + placeholder="30m" + /> + +
+ + FORMAT: <number><unit> — m=minutes, d=days, M=months, y=years (e.g. 30m, 7d, 1M) + + {intervalMsg && ( + + {intervalMsg.text} + + )} + + ) : ( + {config.global_mutation_interval} + )} +
+
+ )} + + {/* DANGER ZONE — developer mode only, server-gated, shown on globals tab */} + {activeTab === 'globals' && config.developer_mode && ( +
+
+ + + DANGER ZONE — DEVELOPER MODE + +

+ Purge all logs, bounty vault entries, and attacker profiles. This action is irreversible. +

+ {!confirmReinit ? ( + + ) : ( +
+ THIS WILL DELETE ALL COLLECTED DATA. ARE YOU SURE? + + +
+ )} + {reinitMsg && ( + + {reinitMsg.text} + + )} +
+
+ )}
); }; diff --git a/decnet_web/src/components/Dashboard.css b/decnet_web/src/components/Dashboard.css index 773fcd9..91889f2 100644 --- a/decnet_web/src/components/Dashboard.css +++ b/decnet_web/src/components/Dashboard.css @@ -127,3 +127,96 @@ from { transform: rotate(0deg); } to { transform: rotate(360deg); } } + +/* Attacker Profiles */ +.attacker-grid { + display: grid; + grid-template-columns: repeat(auto-fill, minmax(340px, 1fr)); + gap: 16px; + padding: 16px; +} + +.attacker-card { + background: var(--secondary-color); + border: 1px solid var(--border-color); + padding: 16px; + cursor: pointer; + transition: transform 0.15s ease, box-shadow 0.15s ease, border-color 0.15s ease; +} + +.attacker-card:hover { + transform: translateY(-2px); + border-color: var(--text-color); + box-shadow: var(--matrix-green-glow); +} + +.traversal-badge { + font-size: 0.65rem; + padding: 2px 8px; + border: 1px solid var(--accent-color); + background: rgba(238, 130, 238, 0.1); + color: var(--accent-color); + letter-spacing: 2px; +} + +.service-badge { + font-size: 0.7rem; + padding: 2px 8px; + border: 1px solid var(--text-color); + background: rgba(0, 255, 65, 0.05); + color: var(--text-color); +} + +.back-button { + display: inline-flex; + align-items: center; + gap: 8px; + padding: 8px 16px; + border: 1px solid var(--border-color); + background: transparent; + color: var(--text-color); + cursor: pointer; + font-size: 0.8rem; + letter-spacing: 2px; + transition: border-color 0.15s ease, box-shadow 0.15s ease; +} + +.back-button:hover { + border-color: var(--text-color); + box-shadow: var(--matrix-green-glow); +} + +/* Fingerprint cards */ +.fp-card { + border: 1px solid var(--border-color); + background: rgba(0, 0, 0, 0.2); + transition: border-color 0.15s ease; +} + +.fp-card:hover { + border-color: var(--accent-color); +} + +.fp-card-header { + display: flex; + align-items: center; + gap: 8px; + padding: 8px 16px; + border-bottom: 1px solid var(--border-color); +} + 
+.fp-card-icon { + color: var(--accent-color); + display: flex; + align-items: center; +} + +.fp-card-label { + font-size: 0.7rem; + letter-spacing: 2px; + opacity: 0.7; +} + +.fp-card-body { + padding: 12px 16px; +} diff --git a/decnet_web/src/components/Dashboard.tsx b/decnet_web/src/components/Dashboard.tsx index c32717d..fd8319b 100644 --- a/decnet_web/src/components/Dashboard.tsx +++ b/decnet_web/src/components/Dashboard.tsx @@ -1,6 +1,8 @@ -import React, { useEffect, useState } from 'react'; +import React, { useEffect, useState, useRef } from 'react'; import './Dashboard.css'; -import { Shield, Users, Activity, Clock } from 'lucide-react'; +import { Shield, Users, Activity, Clock, Paperclip } from 'lucide-react'; +import { parseEventBody } from '../utils/parseEventBody'; +import ArtifactDrawer from './ArtifactDrawer'; interface Stats { total_logs: number; @@ -29,37 +31,53 @@ const Dashboard: React.FC = ({ searchQuery }) => { const [stats, setStats] = useState(null); const [logs, setLogs] = useState([]); const [loading, setLoading] = useState(true); + const [artifact, setArtifact] = useState<{ decky: string; storedAs: string; fields: Record } | null>(null); + const eventSourceRef = useRef(null); + const reconnectTimerRef = useRef | null>(null); useEffect(() => { - const token = localStorage.getItem('token'); - const baseUrl = import.meta.env.VITE_API_URL || 'http://localhost:8000/api/v1'; - let url = `${baseUrl}/stream?token=${token}`; - if (searchQuery) { - url += `&search=${encodeURIComponent(searchQuery)}`; - } - - const eventSource = new EventSource(url); - - eventSource.onmessage = (event) => { - try { - const payload = JSON.parse(event.data); - if (payload.type === 'logs') { - setLogs(prev => [...payload.data, ...prev].slice(0, 100)); - } else if (payload.type === 'stats') { - setStats(payload.data); - setLoading(false); - } - } catch (err) { - console.error('Failed to parse SSE payload', err); + const connect = () => { + if (eventSourceRef.current) { + 
eventSourceRef.current.close(); } + + const token = localStorage.getItem('token'); + const baseUrl = import.meta.env.VITE_API_URL || 'http://localhost:8000/api/v1'; + let url = `${baseUrl}/stream?token=${token}`; + if (searchQuery) { + url += `&search=${encodeURIComponent(searchQuery)}`; + } + + const es = new EventSource(url); + eventSourceRef.current = es; + + es.onmessage = (event) => { + try { + const payload = JSON.parse(event.data); + if (payload.type === 'logs') { + setLogs(prev => [...payload.data, ...prev].slice(0, 100)); + } else if (payload.type === 'stats') { + setStats(payload.data); + setLoading(false); + window.dispatchEvent(new CustomEvent('decnet:stats', { detail: payload.data })); + } + } catch (err) { + console.error('Failed to parse SSE payload', err); + } + }; + + es.onerror = () => { + es.close(); + eventSourceRef.current = null; + reconnectTimerRef.current = setTimeout(connect, 3000); + }; }; - eventSource.onerror = (err) => { - console.error('SSE connection error, attempting to reconnect...', err); - }; + connect(); return () => { - eventSource.close(); + if (reconnectTimerRef.current) clearTimeout(reconnectTimerRef.current); + if (eventSourceRef.current) eventSourceRef.current.close(); }; }, [searchQuery]); @@ -112,6 +130,17 @@ const Dashboard: React.FC = ({ searchQuery }) => { } } + let msgHead: string | null = null; + let msgTail: string | null = null; + if (Object.keys(parsedFields).length === 0) { + const parsed = parseEventBody(log.msg); + parsedFields = parsed.fields; + msgHead = parsed.head; + msgTail = parsed.tail; + } else if (log.msg && log.msg !== '-') { + msgTail = log.msg; + } + return ( {new Date(log.timestamp).toLocaleString()} @@ -121,20 +150,53 @@ const Dashboard: React.FC = ({ searchQuery }) => {
- {log.event_type} {log.msg && log.msg !== '-' && — {log.msg}} + {(() => { + const et = log.event_type && log.event_type !== '-' ? log.event_type : null; + const parts = [et, msgHead].filter(Boolean) as string[]; + return ( + <> + {parts.join(' · ')} + {msgTail && {parts.length ? ' — ' : ''}{msgTail}} + + ); + })()}
- {Object.keys(parsedFields).length > 0 && ( + {(Object.keys(parsedFields).length > 0 || parsedFields.stored_as) && (
- {Object.entries(parsedFields).map(([k, v]) => ( - setArtifact({ + decky: log.decky, + storedAs: String(parsedFields.stored_as), + fields: parsedFields, + })} + title="Inspect captured artifact" + style={{ + display: 'flex', alignItems: 'center', gap: '6px', + fontSize: '0.7rem', + backgroundColor: 'rgba(255, 170, 0, 0.1)', + padding: '2px 8px', + borderRadius: '4px', + border: '1px solid rgba(255, 170, 0, 0.5)', + color: '#ffaa00', + cursor: 'pointer', + }} + > + ARTIFACT + + )} + {Object.entries(parsedFields) + .filter(([k]) => k !== 'meta_json_b64') + .map(([k, v]) => ( + - {k}: {v} + {k}: {typeof v === 'object' ? JSON.stringify(v) : v} ))}
@@ -152,6 +214,14 @@ const Dashboard: React.FC = ({ searchQuery }) => {
+ {artifact && ( + setArtifact(null)} + /> + )}
); }; diff --git a/decnet_web/src/components/DeckyFleet.tsx b/decnet_web/src/components/DeckyFleet.tsx index a6f99a9..41521a0 100644 --- a/decnet_web/src/components/DeckyFleet.tsx +++ b/decnet_web/src/components/DeckyFleet.tsx @@ -1,7 +1,17 @@ import React, { useEffect, useState } from 'react'; import api from '../utils/api'; import './Dashboard.css'; // Re-use common dashboard styles -import { Server, Cpu, Globe, Database, Clock, RefreshCw, Upload } from 'lucide-react'; +import { Server, Cpu, Globe, Database, Clock, RefreshCw, Upload, Network, PowerOff } from 'lucide-react'; + +interface SwarmMeta { + host_uuid: string; + host_name: string; + host_address: string; + host_status: string; + state: string; + last_error: string | null; + last_seen: string | null; +} interface Decky { name: string; @@ -13,8 +23,43 @@ interface Decky { service_config: Record>; mutate_interval: number | null; last_mutated: number; + swarm?: SwarmMeta; } +// Raw shape returned by /swarm/deckies (DeckyShardView on the backend). +// Pre-heartbeat rows have nullable metadata fields; we coerce to the +// shared Decky interface so the card grid renders uniformly either way. 
+interface SwarmDeckyRaw { + decky_name: string; + decky_ip: string | null; + host_uuid: string; + host_name: string; + host_address: string; + host_status: string; + services: string[]; + state: string; + last_error: string | null; + last_seen: string | null; + hostname: string | null; + distro: string | null; + archetype: string | null; + service_config: Record>; + mutate_interval: number | null; + last_mutated: number; +} + +const _stateColor = (state: string): string => { + switch (state) { + case 'running': return 'var(--accent-color)'; + case 'degraded': return '#f39c12'; + case 'tearing_down': return '#f39c12'; + case 'pending': return 'var(--dim-color)'; + case 'failed': + case 'teardown_failed': return '#e74c3c'; + default: return 'var(--dim-color)'; + } +}; + const DeckyFleet: React.FC = () => { const [deckies, setDeckies] = useState([]); const [loading, setLoading] = useState(true); @@ -22,11 +67,49 @@ const DeckyFleet: React.FC = () => { const [showDeploy, setShowDeploy] = useState(false); const [iniContent, setIniContent] = useState(''); const [deploying, setDeploying] = useState(false); + const [isAdmin, setIsAdmin] = useState(false); + const [deployMode, setDeployMode] = useState<{ mode: string; swarm_host_count: number } | null>(null); + // Two-click arm/commit for teardown — lifted from the old SwarmDeckies + // component. browsers silently suppress window.confirm() after the user + // opts out of further dialogs, so we gate destructive actions with a + // 4-second "click again" window instead. + const [armed, setArmed] = useState(null); + const [tearingDown, setTearingDown] = useState>(new Set()); - const fetchDeckies = async () => { + const arm = (key: string) => { + setArmed(key); + setTimeout(() => setArmed((prev) => (prev === key ? 
null : prev)), 4000); + }; + + const fetchDeckies = async (mode?: string) => { try { - const _res = await api.get('/deckies'); - setDeckies(_res.data); + if (mode === 'swarm') { + const res = await api.get('/swarm/deckies'); + const normalized: Decky[] = res.data.map((s) => ({ + name: s.decky_name, + ip: s.decky_ip || '—', + services: s.services || [], + distro: s.distro || 'unknown', + hostname: s.hostname || '—', + archetype: s.archetype, + service_config: s.service_config || {}, + mutate_interval: s.mutate_interval, + last_mutated: s.last_mutated || 0, + swarm: { + host_uuid: s.host_uuid, + host_name: s.host_name, + host_address: s.host_address, + host_status: s.host_status, + state: s.state, + last_error: s.last_error, + last_seen: s.last_seen, + }, + })); + setDeckies(normalized); + } else { + const res = await api.get('/deckies'); + setDeckies(res.data); + } } catch (err) { console.error('Failed to fetch decky fleet', err); } finally { @@ -34,11 +117,20 @@ const DeckyFleet: React.FC = () => { } }; + const fetchRole = async () => { + try { + const res = await api.get('/config'); + setIsAdmin(res.data.role === 'admin'); + } catch { + setIsAdmin(false); + } + }; + const handleMutate = async (name: string) => { setMutating(name); try { await api.post(`/deckies/${name}/mutate`, {}, { timeout: 120000 }); - await fetchDeckies(); + await fetchDeckies(deployMode?.mode); } catch (err: any) { console.error('Failed to mutate', err); if (err.code === 'ECONNABORTED') { @@ -57,13 +149,33 @@ const DeckyFleet: React.FC = () => { const mutate_interval = _val.trim() === '' ? 
null : parseInt(_val); try { await api.put(`/deckies/${name}/mutate-interval`, { mutate_interval }); - fetchDeckies(); + fetchDeckies(deployMode?.mode); } catch (err) { console.error('Failed to update interval', err); alert('Update failed'); } }; + const handleTeardown = async (d: Decky) => { + if (!d.swarm) return; + const key = `td:${d.swarm.host_uuid}:${d.name}`; + if (armed !== key) { arm(key); return; } + setArmed(null); + setTearingDown((prev) => new Set(prev).add(d.name)); + try { + await api.post(`/swarm/hosts/${d.swarm.host_uuid}/teardown`, { decky_id: d.name }); + await fetchDeckies(deployMode?.mode); + } catch (err: any) { + alert(err?.response?.data?.detail || 'Teardown failed'); + } finally { + setTearingDown((prev) => { + const next = new Set(prev); + next.delete(d.name); + return next; + }); + } + }; + const handleDeploy = async () => { if (!iniContent.trim()) return; setDeploying(true); @@ -71,7 +183,7 @@ const DeckyFleet: React.FC = () => { await api.post('/deckies/deploy', { ini_content: iniContent }, { timeout: 120000 }); setIniContent(''); setShowDeploy(false); - fetchDeckies(); + fetchDeckies(deployMode?.mode); } catch (err: any) { console.error('Deploy failed', err); alert(`Deploy failed: ${err.response?.data?.detail || err.message}`); @@ -92,49 +204,90 @@ const DeckyFleet: React.FC = () => { reader.readAsText(file); }; + const fetchDeployMode = async () => { + try { + const res = await api.get('/system/deployment-mode'); + const mode = res.data.mode; + setDeployMode({ mode, swarm_host_count: res.data.swarm_host_count }); + return mode; + } catch { + setDeployMode(null); + return undefined; + } + }; + useEffect(() => { - fetchDeckies(); - const _interval = setInterval(fetchDeckies, 10000); // Fleet state updates less frequently than logs - return () => clearInterval(_interval); + let cancelled = false; + (async () => { + const mode = await fetchDeployMode(); + if (cancelled) return; + await fetchDeckies(mode); + await fetchRole(); + })(); + // 
Re-resolve the deployment mode on every tick so the poll stays mode-aware. + const _interval = setInterval(() => { + // Deployment mode itself can change (first host enrolls → swarm), so + // re-check it alongside the fleet. + fetchDeployMode().then((m) => fetchDeckies(m)); + }, 10000); + return () => { cancelled = true; clearInterval(_interval); }; }, []); if (loading) return
SCANNING NETWORK FOR DECOYS...
; + const isSwarm = deployMode?.mode === 'swarm'; + return (

DECOY FLEET ASSET INVENTORY

+ {deployMode && ( + + [{isSwarm ? `SWARM × ${deployMode.swarm_host_count}` : 'UNIHOST'}] + + )}
- + {isAdmin && ( + + )}
{showDeploy && (
-

Deploy via INI Configuration

+

+ Deploy via INI Configuration + {deployMode && ( + + {deployMode.mode === 'swarm' + ? `→ will shard across ${deployMode.swarm_host_count} SWARM host(s)` + : '→ will deploy locally (UNIHOST)'} + + )} +

- -
-