You don't have permission to access this resource.
\n"
+ "\n"
+ f"{SERVER_HEADER} Server at {NODE_NAME} Port 443\n"
+ "\n"
+ )
+
+ headers = {"Content-Type": "text/html", **EXTRA_HEADERS}
+ return body, RESPONSE_CODE, headers
+
+
+class _SilentHandler(WSGIRequestHandler):
+ """Suppress Werkzeug's Server header so Flask's after_request is the sole source."""
+ def version_string(self) -> str:
+ return ""
+
+
+if __name__ == "__main__":
+ _log("startup", msg=f"HTTPS server starting as {NODE_NAME}")
+
+ ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
+ ctx.load_cert_chain(TLS_CERT, TLS_KEY)
+
+ srv = make_server("0.0.0.0", PORT, app, request_handler=_SilentHandler) # nosec B104
+ srv.socket = ctx.wrap_socket(srv.socket, server_side=True)
+ srv.serve_forever()
diff --git a/decnet/templates/https/syslog_bridge.py b/decnet/templates/https/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/https/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/imap/Dockerfile b/decnet/templates/imap/Dockerfile
similarity index 85%
rename from templates/imap/Dockerfile
rename to decnet/templates/imap/Dockerfile
index a0e8fa2..35d1b67 100644
--- a/templates/imap/Dockerfile
+++ b/decnet/templates/imap/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 143 993
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/imap/entrypoint.sh b/decnet/templates/imap/entrypoint.sh
similarity index 100%
rename from templates/imap/entrypoint.sh
rename to decnet/templates/imap/entrypoint.sh
diff --git a/templates/imap/server.py b/decnet/templates/imap/server.py
similarity index 99%
rename from templates/imap/server.py
rename to decnet/templates/imap/server.py
index 71489af..5b01588 100644
--- a/templates/imap/server.py
+++ b/decnet/templates/imap/server.py
@@ -12,7 +12,7 @@ Banner advertises Dovecot so nmap fingerprints correctly.
import asyncio
import os
-from decnet_logging import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "mailserver")
SERVICE_NAME = "imap"
@@ -236,7 +236,6 @@ _MAILBOXES = ["INBOX", "Sent", "Drafts", "Archive"]
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/imap/syslog_bridge.py b/decnet/templates/imap/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/imap/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/k8s/Dockerfile b/decnet/templates/k8s/Dockerfile
similarity index 87%
rename from templates/k8s/Dockerfile
rename to decnet/templates/k8s/Dockerfile
index 118ed00..1da6296 100644
--- a/templates/k8s/Dockerfile
+++ b/decnet/templates/k8s/Dockerfile
@@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip3 install --no-cache-dir flask
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 6443 8080
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/k8s/entrypoint.sh b/decnet/templates/k8s/entrypoint.sh
similarity index 100%
rename from templates/k8s/entrypoint.sh
rename to decnet/templates/k8s/entrypoint.sh
diff --git a/templates/k8s/server.py b/decnet/templates/k8s/server.py
similarity index 97%
rename from templates/k8s/server.py
rename to decnet/templates/k8s/server.py
index bf96fb9..8e5ba51 100644
--- a/templates/k8s/server.py
+++ b/decnet/templates/k8s/server.py
@@ -10,7 +10,7 @@ import json
import os
from flask import Flask, request
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "k8s-master")
SERVICE_NAME = "k8s"
@@ -69,7 +69,6 @@ _SECRETS = {
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/k8s/syslog_bridge.py b/decnet/templates/k8s/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/k8s/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/ldap/Dockerfile b/decnet/templates/ldap/Dockerfile
similarity index 85%
rename from templates/ldap/Dockerfile
rename to decnet/templates/ldap/Dockerfile
index 2d8aa48..64e1a50 100644
--- a/templates/ldap/Dockerfile
+++ b/decnet/templates/ldap/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 389 636
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/ldap/entrypoint.sh b/decnet/templates/ldap/entrypoint.sh
similarity index 100%
rename from templates/ldap/entrypoint.sh
rename to decnet/templates/ldap/entrypoint.sh
diff --git a/templates/ldap/server.py b/decnet/templates/ldap/server.py
similarity index 97%
rename from templates/ldap/server.py
rename to decnet/templates/ldap/server.py
index bfef78f..c7d4136 100644
--- a/templates/ldap/server.py
+++ b/decnet/templates/ldap/server.py
@@ -7,7 +7,7 @@ invalidCredentials error. Logs all interactions as JSON.
import asyncio
import os
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "ldapserver")
SERVICE_NAME = "ldap"
@@ -18,7 +18,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "")
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/ldap/syslog_bridge.py b/decnet/templates/ldap/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/ldap/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/llmnr/Dockerfile b/decnet/templates/llmnr/Dockerfile
similarity index 86%
rename from templates/llmnr/Dockerfile
rename to decnet/templates/llmnr/Dockerfile
index cddfc7d..724f4db 100644
--- a/templates/llmnr/Dockerfile
+++ b/decnet/templates/llmnr/Dockerfile
@@ -5,14 +5,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 5355/udp
EXPOSE 5353/udp
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -20,5 +20,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/llmnr/entrypoint.sh b/decnet/templates/llmnr/entrypoint.sh
similarity index 100%
rename from templates/llmnr/entrypoint.sh
rename to decnet/templates/llmnr/entrypoint.sh
diff --git a/templates/llmnr/server.py b/decnet/templates/llmnr/server.py
similarity index 97%
rename from templates/llmnr/server.py
rename to decnet/templates/llmnr/server.py
index 7d0fc95..ac94707 100644
--- a/templates/llmnr/server.py
+++ b/decnet/templates/llmnr/server.py
@@ -9,7 +9,7 @@ Logs every packet with source IP and decoded query name where possible.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "lan-host")
SERVICE_NAME = "llmnr"
@@ -20,7 +20,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "")
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/llmnr/syslog_bridge.py b/decnet/templates/llmnr/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/llmnr/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/mongodb/Dockerfile b/decnet/templates/mongodb/Dockerfile
similarity index 85%
rename from templates/mongodb/Dockerfile
rename to decnet/templates/mongodb/Dockerfile
index d8f7039..d7bc953 100644
--- a/templates/mongodb/Dockerfile
+++ b/decnet/templates/mongodb/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 27017
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/mongodb/entrypoint.sh b/decnet/templates/mongodb/entrypoint.sh
similarity index 100%
rename from templates/mongodb/entrypoint.sh
rename to decnet/templates/mongodb/entrypoint.sh
diff --git a/templates/mongodb/server.py b/decnet/templates/mongodb/server.py
similarity index 97%
rename from templates/mongodb/server.py
rename to decnet/templates/mongodb/server.py
index cc16af5..ce14f02 100644
--- a/templates/mongodb/server.py
+++ b/decnet/templates/mongodb/server.py
@@ -9,7 +9,7 @@ received messages as JSON.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "mongodb")
SERVICE_NAME = "mongodb"
@@ -62,7 +62,6 @@ def _op_msg(request_id: int, doc: bytes) -> bytes:
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/mongodb/syslog_bridge.py b/decnet/templates/mongodb/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/mongodb/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/mqtt/Dockerfile b/decnet/templates/mqtt/Dockerfile
similarity index 85%
rename from templates/mqtt/Dockerfile
rename to decnet/templates/mqtt/Dockerfile
index 1ee311d..562ed42 100644
--- a/templates/mqtt/Dockerfile
+++ b/decnet/templates/mqtt/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 1883
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/mqtt/entrypoint.sh b/decnet/templates/mqtt/entrypoint.sh
similarity index 100%
rename from templates/mqtt/entrypoint.sh
rename to decnet/templates/mqtt/entrypoint.sh
diff --git a/templates/mqtt/server.py b/decnet/templates/mqtt/server.py
similarity index 98%
rename from templates/mqtt/server.py
rename to decnet/templates/mqtt/server.py
index d0b43c1..66438bd 100644
--- a/templates/mqtt/server.py
+++ b/decnet/templates/mqtt/server.py
@@ -12,7 +12,7 @@ import json
import os
import random
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "mqtt-broker")
SERVICE_NAME = "mqtt"
@@ -28,7 +28,6 @@ _CONNACK_NOT_AUTH = b"\x20\x02\x00\x05"
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/mqtt/syslog_bridge.py b/decnet/templates/mqtt/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/mqtt/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/mssql/Dockerfile b/decnet/templates/mssql/Dockerfile
similarity index 85%
rename from templates/mssql/Dockerfile
rename to decnet/templates/mssql/Dockerfile
index 07607cb..2f34156 100644
--- a/templates/mssql/Dockerfile
+++ b/decnet/templates/mssql/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 1433
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/mssql/entrypoint.sh b/decnet/templates/mssql/entrypoint.sh
similarity index 100%
rename from templates/mssql/entrypoint.sh
rename to decnet/templates/mssql/entrypoint.sh
diff --git a/templates/mssql/server.py b/decnet/templates/mssql/server.py
similarity index 97%
rename from templates/mssql/server.py
rename to decnet/templates/mssql/server.py
index 41040d8..61114d5 100644
--- a/templates/mssql/server.py
+++ b/decnet/templates/mssql/server.py
@@ -8,7 +8,7 @@ a login failed error. Logs auth attempts as JSON.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "dbserver")
SERVICE_NAME = "mssql"
@@ -45,7 +45,6 @@ _PRELOGIN_RESP = bytes([
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/mssql/syslog_bridge.py b/decnet/templates/mssql/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/mssql/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/mysql/Dockerfile b/decnet/templates/mysql/Dockerfile
similarity index 85%
rename from templates/mysql/Dockerfile
rename to decnet/templates/mysql/Dockerfile
index cbfb532..926e74b 100644
--- a/templates/mysql/Dockerfile
+++ b/decnet/templates/mysql/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 3306
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/mysql/entrypoint.sh b/decnet/templates/mysql/entrypoint.sh
similarity index 100%
rename from templates/mysql/entrypoint.sh
rename to decnet/templates/mysql/entrypoint.sh
diff --git a/templates/mysql/server.py b/decnet/templates/mysql/server.py
similarity index 97%
rename from templates/mysql/server.py
rename to decnet/templates/mysql/server.py
index 812a910..a6b1d94 100644
--- a/templates/mysql/server.py
+++ b/decnet/templates/mysql/server.py
@@ -9,7 +9,7 @@ attempts as JSON.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "dbserver")
SERVICE_NAME = "mysql"
@@ -44,7 +44,6 @@ def _make_packet(payload: bytes, seq: int = 0) -> bytes:
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/mysql/syslog_bridge.py b/decnet/templates/mysql/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/mysql/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/pop3/Dockerfile b/decnet/templates/pop3/Dockerfile
similarity index 85%
rename from templates/pop3/Dockerfile
rename to decnet/templates/pop3/Dockerfile
index ccbfe65..08ac966 100644
--- a/templates/pop3/Dockerfile
+++ b/decnet/templates/pop3/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 110 995
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/pop3/entrypoint.sh b/decnet/templates/pop3/entrypoint.sh
similarity index 100%
rename from templates/pop3/entrypoint.sh
rename to decnet/templates/pop3/entrypoint.sh
diff --git a/templates/pop3/server.py b/decnet/templates/pop3/server.py
similarity index 99%
rename from templates/pop3/server.py
rename to decnet/templates/pop3/server.py
index 33bca78..8599bc8 100644
--- a/templates/pop3/server.py
+++ b/decnet/templates/pop3/server.py
@@ -11,7 +11,7 @@ Credentials via IMAP_USERS env var (shared with IMAP service).
import asyncio
import os
-from decnet_logging import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "mailserver")
SERVICE_NAME = "pop3"
@@ -161,7 +161,6 @@ _BAIT_EMAILS: list[str] = [
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/pop3/syslog_bridge.py b/decnet/templates/pop3/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/pop3/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/postgres/Dockerfile b/decnet/templates/postgres/Dockerfile
similarity index 85%
rename from templates/postgres/Dockerfile
rename to decnet/templates/postgres/Dockerfile
index 0a6a6bf..6eab4e1 100644
--- a/templates/postgres/Dockerfile
+++ b/decnet/templates/postgres/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 5432
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/postgres/entrypoint.sh b/decnet/templates/postgres/entrypoint.sh
similarity index 100%
rename from templates/postgres/entrypoint.sh
rename to decnet/templates/postgres/entrypoint.sh
diff --git a/templates/postgres/server.py b/decnet/templates/postgres/server.py
similarity index 97%
rename from templates/postgres/server.py
rename to decnet/templates/postgres/server.py
index 45126d7..267154f 100644
--- a/templates/postgres/server.py
+++ b/decnet/templates/postgres/server.py
@@ -9,7 +9,7 @@ returns an error. Logs all interactions as JSON.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "pgserver")
SERVICE_NAME = "postgres"
@@ -24,7 +24,6 @@ def _error_response(message: str) -> bytes:
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/postgres/syslog_bridge.py b/decnet/templates/postgres/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/postgres/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/rdp/Dockerfile b/decnet/templates/rdp/Dockerfile
similarity index 87%
rename from templates/rdp/Dockerfile
rename to decnet/templates/rdp/Dockerfile
index cf68714..06ed165 100644
--- a/templates/rdp/Dockerfile
+++ b/decnet/templates/rdp/Dockerfile
@@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip3 install --no-cache-dir twisted jinja2
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 3389
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/rdp/entrypoint.sh b/decnet/templates/rdp/entrypoint.sh
similarity index 100%
rename from templates/rdp/entrypoint.sh
rename to decnet/templates/rdp/entrypoint.sh
diff --git a/templates/rdp/server.py b/decnet/templates/rdp/server.py
similarity index 94%
rename from templates/rdp/server.py
rename to decnet/templates/rdp/server.py
index 12a0a48..2f61d7b 100644
--- a/templates/rdp/server.py
+++ b/decnet/templates/rdp/server.py
@@ -10,7 +10,7 @@ import os
from twisted.internet import protocol, reactor
from twisted.python import log as twisted_log
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "WORKSTATION")
SERVICE_NAME = "rdp"
@@ -21,7 +21,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "")
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/rdp/syslog_bridge.py b/decnet/templates/rdp/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/rdp/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/redis/Dockerfile b/decnet/templates/redis/Dockerfile
similarity index 85%
rename from templates/redis/Dockerfile
rename to decnet/templates/redis/Dockerfile
index bc627ac..b3f85de 100644
--- a/templates/redis/Dockerfile
+++ b/decnet/templates/redis/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 6379
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/redis/entrypoint.sh b/decnet/templates/redis/entrypoint.sh
similarity index 100%
rename from templates/redis/entrypoint.sh
rename to decnet/templates/redis/entrypoint.sh
diff --git a/templates/redis/server.py b/decnet/templates/redis/server.py
similarity index 98%
rename from templates/redis/server.py
rename to decnet/templates/redis/server.py
index 4aa5961..4d3242f 100644
--- a/templates/redis/server.py
+++ b/decnet/templates/redis/server.py
@@ -7,7 +7,7 @@ KEYS, and arbitrary commands. Logs every command and argument as JSON.
import asyncio
import os
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "cache-server")
SERVICE_NAME = "redis"
@@ -46,7 +46,6 @@ _FAKE_STORE = {
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/redis/syslog_bridge.py b/decnet/templates/redis/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/redis/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/sip/Dockerfile b/decnet/templates/sip/Dockerfile
similarity index 86%
rename from templates/sip/Dockerfile
rename to decnet/templates/sip/Dockerfile
index ab37230..e42a5e2 100644
--- a/templates/sip/Dockerfile
+++ b/decnet/templates/sip/Dockerfile
@@ -5,14 +5,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 5060/udp
EXPOSE 5060/tcp
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -20,5 +20,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/sip/entrypoint.sh b/decnet/templates/sip/entrypoint.sh
similarity index 100%
rename from templates/sip/entrypoint.sh
rename to decnet/templates/sip/entrypoint.sh
diff --git a/templates/sip/server.py b/decnet/templates/sip/server.py
similarity index 97%
rename from templates/sip/server.py
rename to decnet/templates/sip/server.py
index a84c0c7..dd40166 100644
--- a/templates/sip/server.py
+++ b/decnet/templates/sip/server.py
@@ -8,7 +8,7 @@ Authorization header and call metadata, then responds with 401 Unauthorized.
import asyncio
import os
import re
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "pbx")
SERVICE_NAME = "sip"
@@ -30,7 +30,6 @@ _401 = (
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/sip/syslog_bridge.py b/decnet/templates/sip/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/sip/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+  <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/smb/Dockerfile b/decnet/templates/smb/Dockerfile
similarity index 87%
rename from templates/smb/Dockerfile
rename to decnet/templates/smb/Dockerfile
index cea8028..64120be 100644
--- a/templates/smb/Dockerfile
+++ b/decnet/templates/smb/Dockerfile
@@ -8,13 +8,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip3 install --no-cache-dir impacket jinja2
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 445 139
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -22,5 +22,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/smb/entrypoint.sh b/decnet/templates/smb/entrypoint.sh
similarity index 100%
rename from templates/smb/entrypoint.sh
rename to decnet/templates/smb/entrypoint.sh
diff --git a/templates/smb/server.py b/decnet/templates/smb/server.py
similarity index 90%
rename from templates/smb/server.py
rename to decnet/templates/smb/server.py
index aa5d1a9..24356a8 100644
--- a/templates/smb/server.py
+++ b/decnet/templates/smb/server.py
@@ -7,7 +7,7 @@ Logs all connection attempts, optionally forwarding them as JSON to LOG_TARGET.
import os
from impacket import smbserver
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "WORKSTATION")
SERVICE_NAME = "smb"
@@ -18,7 +18,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "")
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/smb/syslog_bridge.py b/decnet/templates/smb/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/smb/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+  <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/smtp/Dockerfile b/decnet/templates/smtp/Dockerfile
similarity index 85%
rename from templates/smtp/Dockerfile
rename to decnet/templates/smtp/Dockerfile
index 2013f50..c7bf5c8 100644
--- a/templates/smtp/Dockerfile
+++ b/decnet/templates/smtp/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 25 587
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/smtp/entrypoint.sh b/decnet/templates/smtp/entrypoint.sh
similarity index 100%
rename from templates/smtp/entrypoint.sh
rename to decnet/templates/smtp/entrypoint.sh
diff --git a/templates/smtp/server.py b/decnet/templates/smtp/server.py
similarity index 92%
rename from templates/smtp/server.py
rename to decnet/templates/smtp/server.py
index b5b2232..9cd52a2 100644
--- a/templates/smtp/server.py
+++ b/decnet/templates/smtp/server.py
@@ -23,7 +23,7 @@ import base64
import os
import random
import string
-from decnet_logging import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import SEVERITY_WARNING, syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "mailserver")
SERVICE_NAME = "smtp"
@@ -37,7 +37,6 @@ _SMTP_MTA = os.environ.get("SMTP_MTA", NODE_NAME)
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
@@ -87,9 +86,10 @@ class SMTPProtocol(asyncio.Protocol):
def data_received(self, data):
self._buf += data
- while b"\r\n" in self._buf:
- line, self._buf = self._buf.split(b"\r\n", 1)
- self._handle_line(line.decode(errors="replace"))
+ while b"\n" in self._buf:
+ line, self._buf = self._buf.split(b"\n", 1)
+ # Strip trailing \r so both CRLF and bare LF work
+ self._handle_line(line.rstrip(b"\r").decode(errors="replace"))
def connection_lost(self, exc):
_log("disconnect", src=self._peer[0] if self._peer else "?")
@@ -118,7 +118,12 @@ class SMTPProtocol(asyncio.Protocol):
self._data_buf.append(line[1:] if line.startswith(".") else line)
return
- # ── AUTH multi-step (LOGIN mechanism) ─────────────────────────────────
+ # ── AUTH multi-step (LOGIN / PLAIN continuation) ─────────────────────
+ if self._auth_state == "await_plain":
+ user, password = _decode_auth_plain(line)
+ self._finish_auth(user, password)
+ self._auth_state = ""
+ return
if self._auth_state == "await_user":
self._auth_user = base64.b64decode(line + "==").decode(errors="replace")
self._auth_state = "await_pass"
@@ -137,6 +142,11 @@ class SMTPProtocol(asyncio.Protocol):
args = parts[1] if len(parts) > 1 else ""
if cmd in ("EHLO", "HELO"):
+ if not args:
+ self._transport.write(
+ f"501 5.5.4 Syntax: {cmd} hostname\r\n".encode()
+ )
+ return
_log("ehlo", src=self._peer[0], domain=args)
self._transport.write(
f"250-{_SMTP_MTA}\r\n"
diff --git a/decnet/templates/smtp/syslog_bridge.py b/decnet/templates/smtp/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/smtp/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+  <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/decnet/templates/sniffer/Dockerfile b/decnet/templates/sniffer/Dockerfile
new file mode 100644
index 0000000..ff9a6fc
--- /dev/null
+++ b/decnet/templates/sniffer/Dockerfile
@@ -0,0 +1,12 @@
+ARG BASE_IMAGE=debian:bookworm-slim
+FROM ${BASE_IMAGE}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ python3 python3-pip libpcap-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN pip3 install --no-cache-dir --break-system-packages "scapy==2.6.1"
+
+COPY syslog_bridge.py server.py /opt/
+
+ENTRYPOINT ["python3", "/opt/server.py"]
diff --git a/decnet/templates/sniffer/server.py b/decnet/templates/sniffer/server.py
new file mode 100644
index 0000000..9bd7714
--- /dev/null
+++ b/decnet/templates/sniffer/server.py
@@ -0,0 +1,1050 @@
+#!/usr/bin/env python3
+"""
+syslog-relay passive TLS sniffer.
+
+Captures TLS handshakes on the MACVLAN interface (shared network namespace
+with the decky base container). Extracts fingerprints and connection
+metadata, then emits structured RFC 5424 log lines to stdout for the
+host-side collector to ingest.
+
+Requires: NET_RAW + NET_ADMIN capabilities (set in compose fragment).
+
+Supported fingerprints:
+ JA3 — MD5(SSLVersion,Ciphers,Extensions,EllipticCurves,ECPointFormats)
+ JA3S — MD5(SSLVersion,Cipher,Extensions)
+ JA4 — {proto}{ver}{sni}{#cs}{#ext}{alpn}_{sha256_12(sorted_cs)}_{sha256_12(sorted_ext,sigalgs)}
+ JA4S — {proto}{ver}{#ext}{alpn}_{sha256_12(cipher,sorted_ext)}
+ JA4L — TCP RTT latency measurement (client_ttl, server_rtt_ms)
+ TLS session resumption detection (session tickets, PSK, 0-RTT)
+ Certificate extraction (TLS ≤1.2 only — 1.3 encrypts certs)
+
+GREASE values (RFC 8701) are excluded from all lists before hashing.
+"""
+
+from __future__ import annotations
+
+import hashlib
+import os
+import struct
+import time
+from typing import Any
+
+from scapy.layers.inet import IP, TCP
+from scapy.sendrecv import sniff
+
+from syslog_bridge import SEVERITY_INFO, SEVERITY_WARNING, syslog_line, write_syslog_file
+
+# ─── Configuration ────────────────────────────────────────────────────────────
+
# Node identity used when emitting log lines; overridable via environment.
NODE_NAME: str = os.environ.get("NODE_NAME", "decky-sniffer")
SERVICE_NAME: str = "sniffer"

# Session TTL in seconds — drop half-open sessions after this
_SESSION_TTL: float = 60.0

# Dedup TTL — suppress identical fingerprint events from the same source IP
# within this window (seconds). Set to 0 to disable dedup.
# Overridable via the DEDUP_TTL environment variable.
_DEDUP_TTL: float = float(os.environ.get("DEDUP_TTL", "300"))

# GREASE values per RFC 8701 — 0x0A0A, 0x1A1A, 0x2A2A, ..., 0xFAFA
_GREASE: frozenset[int] = frozenset(0x0A0A + i * 0x1010 for i in range(16))

# TLS record / handshake type constants
_TLS_RECORD_HANDSHAKE: int = 0x16
_TLS_HT_CLIENT_HELLO: int = 0x01
_TLS_HT_SERVER_HELLO: int = 0x02
_TLS_HT_CERTIFICATE: int = 0x0B

# TLS extension types we extract for metadata
_EXT_SNI: int = 0x0000
_EXT_SUPPORTED_GROUPS: int = 0x000A
_EXT_EC_POINT_FORMATS: int = 0x000B
_EXT_SIGNATURE_ALGORITHMS: int = 0x000D
_EXT_ALPN: int = 0x0010
_EXT_SESSION_TICKET: int = 0x0023
_EXT_SUPPORTED_VERSIONS: int = 0x002B
_EXT_PRE_SHARED_KEY: int = 0x0029
_EXT_EARLY_DATA: int = 0x002A

# TCP flags
_TCP_SYN: int = 0x02
_TCP_ACK: int = 0x10

# ─── Session tracking ─────────────────────────────────────────────────────────

# Key: (src_ip, src_port, dst_ip, dst_port) — forward 4-tuple from ClientHello
# Value: parsed ClientHello metadata dict
_sessions: dict[tuple[str, int, str, int], dict[str, Any]] = {}
# Timestamps for expiring _sessions entries after _SESSION_TTL
# (writer not in view — presumably time.time() seconds; confirm downstream).
_session_ts: dict[tuple[str, int, str, int], float] = {}

# TCP RTT tracking for JA4L: key = (client_ip, client_port, server_ip, server_port)
# Value: {"syn_time": float, "ttl": int}
_tcp_syn: dict[tuple[str, int, str, int], dict[str, Any]] = {}
# Completed RTT measurements: key = same 4-tuple, value = {"rtt_ms": float, "client_ttl": int}
_tcp_rtt: dict[tuple[str, int, str, int], dict[str, Any]] = {}
+
+
+# ─── GREASE helpers ───────────────────────────────────────────────────────────
+
def _is_grease(value: int) -> bool:
    """True for TLS GREASE values (RFC 8701): 0x0A0A, 0x1A1A, ..., 0xFAFA.

    Arithmetic test equivalent to membership in the _GREASE set: the 16
    reserved values are exactly 0x0A0A + i*0x1010 for i in 0..15.
    """
    return 0x0A0A <= value <= 0xFAFA and (value - 0x0A0A) % 0x1010 == 0
+
+
def _filter_grease(values: list[int]) -> list[int]:
    """Return *values* with GREASE entries removed, original order kept."""
    return [item for item in values if not _is_grease(item)]
+
+
+# ─── Pure-Python TLS record parser ────────────────────────────────────────────
+
def _parse_client_hello(data: bytes) -> dict[str, Any] | None:
    """
    Parse a TLS ClientHello from raw bytes (starting at TLS record header).

    Returns a dict of parsed fields (cipher/group/sigalg/version lists have
    GREASE values already filtered out), or None if not a valid ClientHello.
    Parsing is best-effort: any structural error yields None.
    """
    try:
        if len(data) < 6:
            return None
        # TLS record header: content_type(1) version(2) length(2)
        if data[0] != _TLS_RECORD_HANDSHAKE:
            return None
        record_len = struct.unpack_from("!H", data, 3)[0]
        if len(data) < 5 + record_len:
            return None

        # Handshake header: type(1) length(3)
        hs = data[5:]
        if hs[0] != _TLS_HT_CLIENT_HELLO:
            return None

        # Handshake length is a 24-bit big-endian integer.
        hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0]
        body = hs[4: 4 + hs_len]
        if len(body) < 34:
            return None

        pos = 0
        # ClientHello version (2 bytes) — used for JA3
        tls_version = struct.unpack_from("!H", body, pos)[0]
        pos += 2

        # Random (32 bytes)
        pos += 32

        # Session ID
        session_id_len = body[pos]
        session_id = body[pos + 1: pos + 1 + session_id_len]
        pos += 1 + session_id_len

        # Cipher Suites
        cs_len = struct.unpack_from("!H", body, pos)[0]
        pos += 2
        cipher_suites = [
            struct.unpack_from("!H", body, pos + i * 2)[0]
            for i in range(cs_len // 2)
        ]
        pos += cs_len

        # Compression Methods
        comp_len = body[pos]
        pos += 1 + comp_len

        # Extensions
        extensions: list[int] = []
        supported_groups: list[int] = []
        ec_point_formats: list[int] = []
        signature_algorithms: list[int] = []
        supported_versions: list[int] = []
        sni: str = ""
        alpn: list[str] = []
        has_session_ticket_data: bool = False
        has_pre_shared_key: bool = False
        has_early_data: bool = False

        if pos + 2 <= len(body):
            ext_total = struct.unpack_from("!H", body, pos)[0]
            pos += 2
            ext_end = pos + ext_total

            while pos + 4 <= ext_end:
                ext_type = struct.unpack_from("!H", body, pos)[0]
                ext_len = struct.unpack_from("!H", body, pos + 2)[0]
                ext_data = body[pos + 4: pos + 4 + ext_len]
                pos += 4 + ext_len

                if not _is_grease(ext_type):
                    extensions.append(ext_type)

                if ext_type == _EXT_SNI and len(ext_data) > 5:
                    # server_name_list_length(2) type(1) name_length(2) name
                    # BUGFIX: honour the encoded name_length (RFC 6066) instead
                    # of taking everything after offset 5 — additional list
                    # entries or trailing bytes previously leaked into the SNI.
                    name_len = struct.unpack_from("!H", ext_data, 3)[0]
                    sni = ext_data[5: 5 + name_len].decode("ascii", errors="replace")

                elif ext_type == _EXT_SUPPORTED_GROUPS and len(ext_data) >= 2:
                    grp_len = struct.unpack_from("!H", ext_data, 0)[0]
                    supported_groups = [
                        struct.unpack_from("!H", ext_data, 2 + i * 2)[0]
                        for i in range(grp_len // 2)
                    ]

                elif ext_type == _EXT_EC_POINT_FORMATS and len(ext_data) >= 1:
                    pf_len = ext_data[0]
                    ec_point_formats = list(ext_data[1: 1 + pf_len])

                elif ext_type == _EXT_ALPN and len(ext_data) >= 2:
                    # list_len(2), then repeated: entry_len(1) entry
                    proto_list_len = struct.unpack_from("!H", ext_data, 0)[0]
                    ap = 2
                    while ap < 2 + proto_list_len:
                        plen = ext_data[ap]
                        alpn.append(ext_data[ap + 1: ap + 1 + plen].decode("ascii", errors="replace"))
                        ap += 1 + plen

                elif ext_type == _EXT_SIGNATURE_ALGORITHMS and len(ext_data) >= 2:
                    sa_len = struct.unpack_from("!H", ext_data, 0)[0]
                    signature_algorithms = [
                        struct.unpack_from("!H", ext_data, 2 + i * 2)[0]
                        for i in range(sa_len // 2)
                    ]

                elif ext_type == _EXT_SUPPORTED_VERSIONS and len(ext_data) >= 1:
                    # In a ClientHello this is a length-prefixed list.
                    sv_len = ext_data[0]
                    supported_versions = [
                        struct.unpack_from("!H", ext_data, 1 + i * 2)[0]
                        for i in range(sv_len // 2)
                    ]

                elif ext_type == _EXT_SESSION_TICKET:
                    # Non-empty ticket data signals an attempted resumption.
                    has_session_ticket_data = len(ext_data) > 0

                elif ext_type == _EXT_PRE_SHARED_KEY:
                    has_pre_shared_key = True

                elif ext_type == _EXT_EARLY_DATA:
                    has_early_data = True

        # GREASE must be excluded from every hashed list (JA3/JA4 requirement).
        filtered_ciphers = _filter_grease(cipher_suites)
        filtered_groups = _filter_grease(supported_groups)
        filtered_sig_algs = _filter_grease(signature_algorithms)
        filtered_versions = _filter_grease(supported_versions)

        return {
            "tls_version": tls_version,
            "cipher_suites": filtered_ciphers,
            "extensions": extensions,
            "supported_groups": filtered_groups,
            "ec_point_formats": ec_point_formats,
            "signature_algorithms": filtered_sig_algs,
            "supported_versions": filtered_versions,
            "sni": sni,
            "alpn": alpn,
            "session_id": session_id,
            "has_session_ticket_data": has_session_ticket_data,
            "has_pre_shared_key": has_pre_shared_key,
            "has_early_data": has_early_data,
        }

    except Exception:
        # Best-effort sniffer: anything malformed is "not a ClientHello".
        return None
+
+
def _parse_server_hello(data: bytes) -> dict[str, Any] | None:
    """
    Parse a TLS ServerHello from raw bytes (starting at the TLS record header).

    Returns a dict with:
        tls_version: legacy_version field of the hello body
        cipher_suite: the single suite selected by the server
        extensions: extension types present (GREASE filtered out)
        selected_version: supported_versions value (TLS 1.3), else None
        alpn: protocol chosen via ALPN, "" when absent
    or None when *data* is not a parseable ServerHello.
    """
    try:
        # Record header: content_type(1) version(2) length(2); must be handshake.
        if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE:
            return None

        hs = data[5:]
        if hs[0] != _TLS_HT_SERVER_HELLO:
            return None

        # Handshake length is a 24-bit big-endian integer.
        hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0]
        body = hs[4: 4 + hs_len]
        if len(body) < 35:
            return None

        pos = 0
        tls_version = struct.unpack_from("!H", body, pos)[0]
        pos += 2

        # Random (32 bytes)
        pos += 32

        # Session ID
        session_id_len = body[pos]
        pos += 1 + session_id_len

        if pos + 2 > len(body):
            return None

        cipher_suite = struct.unpack_from("!H", body, pos)[0]
        pos += 2

        # Compression method (1 byte)
        pos += 1

        extensions: list[int] = []
        selected_version: int | None = None
        alpn: str = ""

        if pos + 2 <= len(body):
            ext_total = struct.unpack_from("!H", body, pos)[0]
            pos += 2
            ext_end = pos + ext_total
            while pos + 4 <= ext_end:
                ext_type = struct.unpack_from("!H", body, pos)[0]
                ext_len = struct.unpack_from("!H", body, pos + 2)[0]
                ext_data = body[pos + 4: pos + 4 + ext_len]
                pos += 4 + ext_len
                if not _is_grease(ext_type):
                    extensions.append(ext_type)

                if ext_type == _EXT_SUPPORTED_VERSIONS and len(ext_data) >= 2:
                    # In a ServerHello this carries the single negotiated version.
                    selected_version = struct.unpack_from("!H", ext_data, 0)[0]

                elif ext_type == _EXT_ALPN and len(ext_data) >= 2:
                    # list_len(2) entry_len(1) entry — server sends one protocol.
                    proto_list_len = struct.unpack_from("!H", ext_data, 0)[0]
                    if proto_list_len > 0 and len(ext_data) >= 4:
                        plen = ext_data[2]
                        alpn = ext_data[3: 3 + plen].decode("ascii", errors="replace")

        return {
            "tls_version": tls_version,
            "cipher_suite": cipher_suite,
            "extensions": extensions,
            "selected_version": selected_version,
            "alpn": alpn,
        }

    except Exception:
        # Best-effort: malformed or partial records are silently skipped.
        return None
+
+
def _parse_certificate(data: bytes) -> dict[str, Any] | None:
    """
    Parse a TLS Certificate handshake message from raw bytes.

    Only works for TLS 1.2 and below — TLS 1.3 encrypts the Certificate
    message. Extracts basic details from the first (leaf) certificate
    using minimal DER/ASN.1 parsing; the rest of the chain is ignored.

    Returns the dict produced by _parse_x509_der, or None on any failure.
    """
    try:
        # Record header: must be a handshake record with room for headers.
        if len(data) < 6 or data[0] != _TLS_RECORD_HANDSHAKE:
            return None

        hs = data[5:]
        if hs[0] != _TLS_HT_CERTIFICATE:
            return None

        # Handshake length is a 24-bit big-endian integer.
        hs_len = struct.unpack_from("!I", b"\x00" + hs[1:4])[0]
        body = hs[4: 4 + hs_len]
        if len(body) < 3:
            return None

        # Certificate list total length (3 bytes)
        certs_len = struct.unpack_from("!I", b"\x00" + body[0:3])[0]
        if certs_len == 0:
            return None

        pos = 3
        # First certificate length (3 bytes)
        if pos + 3 > len(body):
            return None
        cert_len = struct.unpack_from("!I", b"\x00" + body[pos:pos + 3])[0]
        pos += 3
        if pos + cert_len > len(body):
            return None

        # Hand the leaf certificate's DER bytes to the X.509 mini-parser.
        cert_der = body[pos: pos + cert_len]
        return _parse_x509_der(cert_der)

    except Exception:
        # Best-effort: malformed or truncated messages are skipped silently.
        return None
+
+
+# ─── Minimal DER/ASN.1 X.509 parser ─────────────────────────────────────────
+
+def _der_read_tag_len(data: bytes, pos: int) -> tuple[int, int, int]:
+ """Read a DER tag and length. Returns (tag, content_start, content_length)."""
+ tag = data[pos]
+ pos += 1
+ length_byte = data[pos]
+ pos += 1
+ if length_byte & 0x80:
+ num_bytes = length_byte & 0x7F
+ length = int.from_bytes(data[pos: pos + num_bytes], "big")
+ pos += num_bytes
+ else:
+ length = length_byte
+ return tag, pos, length
+
+
+def _der_read_sequence(data: bytes, pos: int) -> tuple[int, int]:
+ """Read a SEQUENCE tag, return (content_start, content_length)."""
+ tag, content_start, length = _der_read_tag_len(data, pos)
+ return content_start, length
+
+
+def _der_read_oid(data: bytes, pos: int, length: int) -> str:
+ """Decode a DER OID to dotted string."""
+ if length < 1:
+ return ""
+ first = data[pos]
+ oid_parts = [str(first // 40), str(first % 40)]
+ val = 0
+ for i in range(1, length):
+ b = data[pos + i]
+ val = (val << 7) | (b & 0x7F)
+ if not (b & 0x80):
+ oid_parts.append(str(val))
+ val = 0
+ return ".".join(oid_parts)
+
+
def _der_extract_cn(data: bytes, start: int, length: int) -> str:
    """
    Walk an X.501 Name (SEQUENCE of SETs of SEQUENCE of OID+value) and
    return the first commonName (OID 2.5.4.3) value found, or "".
    """
    pos = start
    end = start + length
    while pos < end:
        # Each RDN is a SET
        set_tag, set_start, set_len = _der_read_tag_len(data, pos)
        if set_tag != 0x31:  # SET
            break
        set_end = set_start + set_len

        # Inside the SET, each attribute is a SEQUENCE
        attr_pos = set_start
        while attr_pos < set_end:
            seq_tag, seq_start, seq_len = _der_read_tag_len(data, attr_pos)
            if seq_tag != 0x30:  # SEQUENCE
                break
            # OID
            oid_tag, oid_start, oid_len = _der_read_tag_len(data, seq_start)
            if oid_tag == 0x06:
                oid = _der_read_oid(data, oid_start, oid_len)
                # CN OID = 2.5.4.3
                if oid == "2.5.4.3":
                    # Attribute value directly follows the OID in the SEQUENCE.
                    val_tag, val_start, val_len = _der_read_tag_len(data, oid_start + oid_len)
                    return data[val_start: val_start + val_len].decode("utf-8", errors="replace")
            attr_pos = seq_start + seq_len

        pos = set_end
    return ""
+
+
def _der_extract_name_str(data: bytes, start: int, length: int) -> str:
    """
    Extract a human-readable summary of an X.501 Name: each RDN attribute
    rendered as "<abbrev>=<value>" (unknown OIDs keep their dotted form),
    joined with ", " in encounter order.
    """
    parts: list[str] = []
    pos = start
    end = start + length
    # Well-known DN attribute OIDs → conventional abbreviations.
    oid_names = {
        "2.5.4.3": "CN",
        "2.5.4.6": "C",
        "2.5.4.7": "L",
        "2.5.4.8": "ST",
        "2.5.4.10": "O",
        "2.5.4.11": "OU",
    }
    while pos < end:
        # Each RDN is a SET (tag 0x31); stop on anything else.
        set_tag, set_start, set_len = _der_read_tag_len(data, pos)
        if set_tag != 0x31:
            break
        set_end = set_start + set_len
        attr_pos = set_start
        while attr_pos < set_end:
            # Each AttributeTypeAndValue is a SEQUENCE (tag 0x30).
            seq_tag, seq_start, seq_len = _der_read_tag_len(data, attr_pos)
            if seq_tag != 0x30:
                break
            oid_tag, oid_start, oid_len = _der_read_tag_len(data, seq_start)
            if oid_tag == 0x06:  # OBJECT IDENTIFIER
                oid = _der_read_oid(data, oid_start, oid_len)
                # The value immediately follows the OID inside the SEQUENCE.
                val_tag, val_start, val_len = _der_read_tag_len(data, oid_start + oid_len)
                val = data[val_start: val_start + val_len].decode("utf-8", errors="replace")
                name = oid_names.get(oid, oid)
                parts.append(f"{name}={val}")
            attr_pos = seq_start + seq_len
        pos = set_end
    return ", ".join(parts)
+
+
def _parse_x509_der(cert_der: bytes) -> dict[str, Any] | None:
    """
    Minimal X.509 DER parser. Extracts subject CN, issuer string,
    validity period, and self-signed flag.

    Structure: SEQUENCE { tbsCertificate, signatureAlgorithm, signatureValue }
    tbsCertificate: SEQUENCE {
        version [0] EXPLICIT, serialNumber, signature,
        issuer, validity { notBefore, notAfter },
        subject, subjectPublicKeyInfo, ...extensions
    }

    Returns a dict (subject_cn, subject, issuer, issuer_cn, not_before,
    not_after, self_signed, sans) or None on any parse failure. The
    validity strings are the raw ASN.1 time values (UTCTime or
    GeneralizedTime) — they are not normalized to ISO 8601.
    """
    try:
        # Outer SEQUENCE
        outer_start, outer_len = _der_read_sequence(cert_der, 0)
        # tbsCertificate SEQUENCE
        tbs_tag, tbs_start, tbs_len = _der_read_tag_len(cert_der, outer_start)
        tbs_end = tbs_start + tbs_len
        pos = tbs_start

        # version [0] EXPLICIT — optional, skip if present
        if cert_der[pos] == 0xA0:
            _, v_start, v_len = _der_read_tag_len(cert_der, pos)
            pos = v_start + v_len

        # serialNumber (INTEGER)
        _, sn_start, sn_len = _der_read_tag_len(cert_der, pos)
        pos = sn_start + sn_len

        # signature algorithm (SEQUENCE)
        _, sa_start, sa_len = _der_read_tag_len(cert_der, pos)
        pos = sa_start + sa_len

        # issuer (SEQUENCE)
        issuer_tag, issuer_start, issuer_len = _der_read_tag_len(cert_der, pos)
        issuer_str = _der_extract_name_str(cert_der, issuer_start, issuer_len)
        issuer_cn = _der_extract_cn(cert_der, issuer_start, issuer_len)
        pos = issuer_start + issuer_len

        # validity (SEQUENCE of two times)
        val_tag, val_start, val_len = _der_read_tag_len(cert_der, pos)
        # notBefore
        nb_tag, nb_start, nb_len = _der_read_tag_len(cert_der, val_start)
        not_before = cert_der[nb_start: nb_start + nb_len].decode("ascii", errors="replace")
        # notAfter
        na_tag, na_start, na_len = _der_read_tag_len(cert_der, nb_start + nb_len)
        not_after = cert_der[na_start: na_start + na_len].decode("ascii", errors="replace")
        pos = val_start + val_len

        # subject (SEQUENCE)
        subj_tag, subj_start, subj_len = _der_read_tag_len(cert_der, pos)
        subject_cn = _der_extract_cn(cert_der, subj_start, subj_len)
        subject_str = _der_extract_name_str(cert_der, subj_start, subj_len)

        # Self-signed: issuer CN matches subject CN (basic heuristic — a
        # strict check would compare full DNs / verify the signature).
        self_signed = (issuer_cn == subject_cn) and subject_cn != ""

        # SANs are in extensions — attempt to find them
        pos = subj_start + subj_len
        sans: list[str] = _extract_sans(cert_der, pos, tbs_end)

        return {
            "subject_cn": subject_cn,
            "subject": subject_str,
            "issuer": issuer_str,
            "issuer_cn": issuer_cn,
            "not_before": not_before,
            "not_after": not_after,
            "self_signed": self_signed,
            "sans": sans,
        }

    except Exception:
        # Any malformed structure → no certificate metadata.
        return None
+
+
+def _extract_sans(cert_der: bytes, pos: int, end: int) -> list[str]:
+ """
+ Attempt to extract Subject Alternative Names from X.509v3 extensions.
+ SAN OID = 2.5.29.17
+ """
+ sans: list[str] = []
+ try:
+ # Skip subjectPublicKeyInfo SEQUENCE
+ if pos >= end:
+ return sans
+ spki_tag, spki_start, spki_len = _der_read_tag_len(cert_der, pos)
+ pos = spki_start + spki_len
+
+ # Extensions are wrapped in [3] EXPLICIT
+ while pos < end:
+ tag = cert_der[pos]
+ if tag == 0xA3: # [3] EXPLICIT — extensions wrapper
+ _, ext_wrap_start, ext_wrap_len = _der_read_tag_len(cert_der, pos)
+ # Inner SEQUENCE of extensions
+ _, exts_start, exts_len = _der_read_tag_len(cert_der, ext_wrap_start)
+ epos = exts_start
+ eend = exts_start + exts_len
+ while epos < eend:
+ # Each extension is a SEQUENCE { OID, [critical], value }
+ ext_tag, ext_start, ext_len = _der_read_tag_len(cert_der, epos)
+ ext_end = ext_start + ext_len
+
+ oid_tag, oid_start, oid_len = _der_read_tag_len(cert_der, ext_start)
+ if oid_tag == 0x06:
+ oid = _der_read_oid(cert_der, oid_start, oid_len)
+ if oid == "2.5.29.17": # SAN
+ # Find the OCTET STRING containing the SAN value
+ vpos = oid_start + oid_len
+ # Skip optional BOOLEAN (critical)
+ if vpos < ext_end and cert_der[vpos] == 0x01:
+ _, bs, bl = _der_read_tag_len(cert_der, vpos)
+ vpos = bs + bl
+ # OCTET STRING wrapping the SAN SEQUENCE
+ if vpos < ext_end:
+ os_tag, os_start, os_len = _der_read_tag_len(cert_der, vpos)
+ if os_tag == 0x04:
+ sans = _parse_san_sequence(cert_der, os_start, os_len)
+ epos = ext_end
+ break
+ else:
+ _, skip_start, skip_len = _der_read_tag_len(cert_der, pos)
+ pos = skip_start + skip_len
+ except Exception:
+ pass
+ return sans
+
+
+def _parse_san_sequence(data: bytes, start: int, length: int) -> list[str]:
+ """Parse a GeneralNames SEQUENCE to extract DNS names and IPs."""
+ names: list[str] = []
+ try:
+ # The SAN value is itself a SEQUENCE of GeneralName
+ seq_tag, seq_start, seq_len = _der_read_tag_len(data, start)
+ pos = seq_start
+ end = seq_start + seq_len
+ while pos < end:
+ tag = data[pos]
+ _, val_start, val_len = _der_read_tag_len(data, pos)
+ context_tag = tag & 0x1F
+ if context_tag == 2: # dNSName
+ names.append(data[val_start: val_start + val_len].decode("ascii", errors="replace"))
+ elif context_tag == 7 and val_len == 4: # iPAddress (IPv4)
+ names.append(".".join(str(b) for b in data[val_start: val_start + val_len]))
+ pos = val_start + val_len
+ except Exception:
+ pass
+ return names
+
+
+# ─── JA3 / JA3S computation ───────────────────────────────────────────────────
+
+def _tls_version_str(version: int) -> str:
+ return {
+ 0x0301: "TLS 1.0",
+ 0x0302: "TLS 1.1",
+ 0x0303: "TLS 1.2",
+ 0x0304: "TLS 1.3",
+ 0x0200: "SSL 2.0",
+ 0x0300: "SSL 3.0",
+ }.get(version, f"0x{version:04x}")
+
+
+def _ja3(ch: dict[str, Any]) -> tuple[str, str]:
+ """Return (ja3_string, ja3_hash) for a parsed ClientHello."""
+ parts = [
+ str(ch["tls_version"]),
+ "-".join(str(c) for c in ch["cipher_suites"]),
+ "-".join(str(e) for e in ch["extensions"]),
+ "-".join(str(g) for g in ch["supported_groups"]),
+ "-".join(str(p) for p in ch["ec_point_formats"]),
+ ]
+ ja3_str = ",".join(parts)
+ return ja3_str, hashlib.md5(ja3_str.encode()).hexdigest()
+
+
+def _ja3s(sh: dict[str, Any]) -> tuple[str, str]:
+ """Return (ja3s_string, ja3s_hash) for a parsed ServerHello."""
+ parts = [
+ str(sh["tls_version"]),
+ str(sh["cipher_suite"]),
+ "-".join(str(e) for e in sh["extensions"]),
+ ]
+ ja3s_str = ",".join(parts)
+ return ja3s_str, hashlib.md5(ja3s_str.encode()).hexdigest()
+
+
+# ─── JA4 / JA4S computation ──────────────────────────────────────────────────
+
+def _ja4_version(ch: dict[str, Any]) -> str:
+ """
+ Determine JA4 TLS version string (2 chars).
+ Uses supported_versions extension if present (TLS 1.3 advertises 0x0303 in
+ ClientHello.version but 0x0304 in supported_versions).
+ """
+ versions = ch.get("supported_versions", [])
+ if versions:
+ best = max(versions)
+ else:
+ best = ch["tls_version"]
+ return {
+ 0x0304: "13",
+ 0x0303: "12",
+ 0x0302: "11",
+ 0x0301: "10",
+ 0x0300: "s3",
+ 0x0200: "s2",
+ }.get(best, "00")
+
+
+def _ja4_alpn_tag(alpn_list: list[str] | str) -> str:
+ """
+ JA4 ALPN tag: first and last character of the first ALPN protocol.
+ No ALPN → "00".
+ """
+ if isinstance(alpn_list, str):
+ proto = alpn_list
+ elif alpn_list:
+ proto = alpn_list[0]
+ else:
+ return "00"
+
+ if not proto:
+ return "00"
+ if len(proto) == 1:
+ return proto[0] + proto[0]
+ return proto[0] + proto[-1]
+
+
+def _sha256_12(text: str) -> str:
+ """First 12 hex chars of SHA-256."""
+ return hashlib.sha256(text.encode()).hexdigest()[:12]
+
+
+def _ja4(ch: dict[str, Any]) -> str:
+ """
+ Compute JA4 fingerprint from a parsed ClientHello.
+
+ Format: a_b_c where
+ a = {t|q}{version:2}{d|i}{cipher_count:02d}{ext_count:02d}{alpn_tag:2}
+ b = sha256_12(sorted_cipher_suites, comma-separated)
+ c = sha256_12(sorted_extensions,sorted_signature_algorithms)
+
+    Protocol is always 't' (TCP) since we only sniff TCP traffic.
+ SNI present → 'd' (domain), absent → 'i' (IP).
+ """
+ proto = "t"
+ ver = _ja4_version(ch)
+ sni_flag = "d" if ch.get("sni") else "i"
+
+    # Counts — GREASE values were already filtered during parsing. JA4 counts all
+    # remaining extensions here, including SNI (0x0000) and ALPN (0x0010).
+ cs_count = min(len(ch["cipher_suites"]), 99)
+ ext_count = min(len(ch["extensions"]), 99)
+ alpn_tag = _ja4_alpn_tag(ch.get("alpn", []))
+
+ section_a = f"{proto}{ver}{sni_flag}{cs_count:02d}{ext_count:02d}{alpn_tag}"
+
+ # Section b: sorted cipher suites as decimal, comma-separated
+ sorted_cs = sorted(ch["cipher_suites"])
+ section_b = _sha256_12(",".join(str(c) for c in sorted_cs))
+
+ # Section c: sorted extensions + sorted signature algorithms
+ sorted_ext = sorted(ch["extensions"])
+ sorted_sa = sorted(ch.get("signature_algorithms", []))
+ ext_str = ",".join(str(e) for e in sorted_ext)
+ sa_str = ",".join(str(s) for s in sorted_sa)
+ combined = f"{ext_str}_{sa_str}" if sa_str else ext_str
+ section_c = _sha256_12(combined)
+
+ return f"{section_a}_{section_b}_{section_c}"
+
+
+def _ja4s(sh: dict[str, Any]) -> str:
+ """
+ Compute JA4S fingerprint from a parsed ServerHello.
+
+ Format: a_b where
+ a = {t|q}{version:2}{ext_count:02d}{alpn_tag:2}
+ b = sha256_12({cipher_suite},{sorted_extensions comma-separated})
+ """
+ proto = "t"
+ # Use selected_version from supported_versions ext if available
+ selected = sh.get("selected_version")
+ if selected:
+ ver = {0x0304: "13", 0x0303: "12", 0x0302: "11", 0x0301: "10",
+ 0x0300: "s3", 0x0200: "s2"}.get(selected, "00")
+ else:
+ ver = {0x0304: "13", 0x0303: "12", 0x0302: "11", 0x0301: "10",
+ 0x0300: "s3", 0x0200: "s2"}.get(sh["tls_version"], "00")
+
+ ext_count = min(len(sh["extensions"]), 99)
+ alpn_tag = _ja4_alpn_tag(sh.get("alpn", ""))
+
+ section_a = f"{proto}{ver}{ext_count:02d}{alpn_tag}"
+
+ sorted_ext = sorted(sh["extensions"])
+ inner = f"{sh['cipher_suite']},{','.join(str(e) for e in sorted_ext)}"
+ section_b = _sha256_12(inner)
+
+ return f"{section_a}_{section_b}"
+
+
+# ─── JA4L (latency) ──────────────────────────────────────────────────────────
+
+def _ja4l(key: tuple[str, int, str, int]) -> dict[str, Any] | None:
+ """
+ Retrieve JA4L data for a connection.
+
+    JA4L measures TCP handshake RTT: elapsed time from observing the SYN to the SYN-ACK.
+ Returns {"rtt_ms": float, "client_ttl": int} or None.
+ """
+ return _tcp_rtt.get(key)
+
+
+# ─── Session resumption ──────────────────────────────────────────────────────
+
+def _session_resumption_info(ch: dict[str, Any]) -> dict[str, Any]:
+ """
+ Analyze ClientHello for TLS session resumption behavior.
+ Returns a dict describing what resumption mechanisms the client uses.
+ """
+ mechanisms: list[str] = []
+
+ if ch.get("has_session_ticket_data"):
+ mechanisms.append("session_ticket")
+
+ if ch.get("has_pre_shared_key"):
+ mechanisms.append("psk")
+
+ if ch.get("has_early_data"):
+ mechanisms.append("early_data_0rtt")
+
+ if ch.get("session_id") and len(ch["session_id"]) > 0:
+ mechanisms.append("session_id")
+
+ return {
+ "resumption_attempted": len(mechanisms) > 0,
+ "mechanisms": mechanisms,
+ }
+
+
+# ─── Session cleanup ─────────────────────────────────────────────────────────
+
+def _cleanup_sessions() -> None:
+ now = time.monotonic()
+ stale = [k for k, ts in _session_ts.items() if now - ts > _SESSION_TTL]
+ for k in stale:
+ _sessions.pop(k, None)
+ _session_ts.pop(k, None)
+ # Also clean up TCP RTT tracking
+ stale_syn = [k for k, v in _tcp_syn.items()
+ if now - v.get("time", 0) > _SESSION_TTL]
+ for k in stale_syn:
+ _tcp_syn.pop(k, None)
+ stale_rtt = [k for k, _ in _tcp_rtt.items()
+ if k not in _sessions and k not in _session_ts]
+ for k in stale_rtt:
+ _tcp_rtt.pop(k, None)
+
+
+# ─── Dedup cache ─────────────────────────────────────────────────────────────
+
+# Key: (src_ip, event_type, fingerprint_key) → timestamp of last emit
+_dedup_cache: dict[tuple[str, str, str], float] = {}
+_DEDUP_CLEANUP_INTERVAL: float = 60.0
+_dedup_last_cleanup: float = 0.0
+
+
+def _dedup_key_for(event_type: str, fields: dict[str, Any]) -> str:
+ """Build a dedup fingerprint from the most significant fields."""
+ if event_type == "tls_client_hello":
+ return fields.get("ja3", "") + "|" + fields.get("ja4", "")
+ if event_type == "tls_session":
+ return (fields.get("ja3", "") + "|" + fields.get("ja3s", "") +
+ "|" + fields.get("ja4", "") + "|" + fields.get("ja4s", ""))
+ if event_type == "tls_certificate":
+ return fields.get("subject_cn", "") + "|" + fields.get("issuer", "")
+ # tls_resumption or unknown — dedup on mechanisms
+ return fields.get("mechanisms", fields.get("resumption", ""))
+
+
+def _is_duplicate(event_type: str, fields: dict[str, Any]) -> bool:
+ """Return True if this event was already emitted within the dedup window."""
+ if _DEDUP_TTL <= 0:
+ return False
+
+ global _dedup_last_cleanup
+ now = time.monotonic()
+
+ # Periodic cleanup
+ if now - _dedup_last_cleanup > _DEDUP_CLEANUP_INTERVAL:
+ stale = [k for k, ts in _dedup_cache.items() if now - ts > _DEDUP_TTL]
+ for k in stale:
+ del _dedup_cache[k]
+ _dedup_last_cleanup = now
+
+ src_ip = fields.get("src_ip", "")
+ fp = _dedup_key_for(event_type, fields)
+ cache_key = (src_ip, event_type, fp)
+
+ last_seen = _dedup_cache.get(cache_key)
+ if last_seen is not None and now - last_seen < _DEDUP_TTL:
+ return True
+
+ _dedup_cache[cache_key] = now
+ return False
+
+
+# ─── Logging helpers ─────────────────────────────────────────────────────────
+
+def _log(event_type: str, severity: int = SEVERITY_INFO, **fields: Any) -> None:
+ if _is_duplicate(event_type, fields):
+ return
+ line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity=severity, **fields)
+ write_syslog_file(line)
+
+
+# ─── Packet callback ─────────────────────────────────────────────────────────
+
+def _on_packet(pkt: Any) -> None:
+ if not (pkt.haslayer(IP) and pkt.haslayer(TCP)):
+ return
+
+ ip = pkt[IP]
+ tcp = pkt[TCP]
+
+ src_ip: str = ip.src
+ dst_ip: str = ip.dst
+ src_port: int = tcp.sport
+ dst_port: int = tcp.dport
+ flags: int = tcp.flags.value if hasattr(tcp.flags, 'value') else int(tcp.flags)
+
+ # ── TCP SYN tracking for JA4L ──
+ if flags & _TCP_SYN and not (flags & _TCP_ACK):
+ # Pure SYN — record timestamp and TTL
+ key = (src_ip, src_port, dst_ip, dst_port)
+ _tcp_syn[key] = {"time": time.monotonic(), "ttl": ip.ttl}
+
+ elif flags & _TCP_SYN and flags & _TCP_ACK:
+ # SYN-ACK — calculate RTT for the original SYN sender
+ rev_key = (dst_ip, dst_port, src_ip, src_port)
+ syn_data = _tcp_syn.pop(rev_key, None)
+ if syn_data:
+ rtt_ms = round((time.monotonic() - syn_data["time"]) * 1000, 2)
+ _tcp_rtt[rev_key] = {
+ "rtt_ms": rtt_ms,
+ "client_ttl": syn_data["ttl"],
+ }
+
+ payload = bytes(tcp.payload)
+ if not payload:
+ return
+
+ # TLS record check
+ if payload[0] != _TLS_RECORD_HANDSHAKE:
+ return
+
+ # Attempt ClientHello parse
+ ch = _parse_client_hello(payload)
+ if ch is not None:
+ _cleanup_sessions()
+
+ key = (src_ip, src_port, dst_ip, dst_port)
+ ja3_str, ja3_hash = _ja3(ch)
+ ja4_hash = _ja4(ch)
+ resumption = _session_resumption_info(ch)
+ rtt_data = _ja4l(key)
+
+ _sessions[key] = {
+ "ja3": ja3_hash,
+ "ja3_str": ja3_str,
+ "ja4": ja4_hash,
+ "tls_version": ch["tls_version"],
+ "cipher_suites": ch["cipher_suites"],
+ "extensions": ch["extensions"],
+ "signature_algorithms": ch.get("signature_algorithms", []),
+ "supported_versions": ch.get("supported_versions", []),
+ "sni": ch["sni"],
+ "alpn": ch["alpn"],
+ "resumption": resumption,
+ }
+ _session_ts[key] = time.monotonic()
+
+ log_fields: dict[str, Any] = {
+ "src_ip": src_ip,
+ "src_port": str(src_port),
+ "dst_ip": dst_ip,
+ "dst_port": str(dst_port),
+ "ja3": ja3_hash,
+ "ja4": ja4_hash,
+ "tls_version": _tls_version_str(ch["tls_version"]),
+ "sni": ch["sni"] or "",
+ "alpn": ",".join(ch["alpn"]),
+ "raw_ciphers": "-".join(str(c) for c in ch["cipher_suites"]),
+ "raw_extensions": "-".join(str(e) for e in ch["extensions"]),
+ }
+
+ if resumption["resumption_attempted"]:
+ log_fields["resumption"] = ",".join(resumption["mechanisms"])
+
+ if rtt_data:
+ log_fields["ja4l_rtt_ms"] = str(rtt_data["rtt_ms"])
+ log_fields["ja4l_client_ttl"] = str(rtt_data["client_ttl"])
+
+ _log("tls_client_hello", **log_fields)
+ return
+
+ # Attempt ServerHello parse
+ sh = _parse_server_hello(payload)
+ if sh is not None:
+ # Reverse 4-tuple to find the matching ClientHello
+ rev_key = (dst_ip, dst_port, src_ip, src_port)
+ ch_data = _sessions.pop(rev_key, None)
+ _session_ts.pop(rev_key, None)
+
+ ja3s_str, ja3s_hash = _ja3s(sh)
+ ja4s_hash = _ja4s(sh)
+
+ fields: dict[str, Any] = {
+ "src_ip": dst_ip, # original attacker is now the destination
+ "src_port": str(dst_port),
+ "dst_ip": src_ip,
+ "dst_port": str(src_port),
+ "ja3s": ja3s_hash,
+ "ja4s": ja4s_hash,
+ "tls_version": _tls_version_str(sh["tls_version"]),
+ }
+
+ if ch_data:
+ fields["ja3"] = ch_data["ja3"]
+ fields["ja4"] = ch_data.get("ja4", "")
+ fields["sni"] = ch_data["sni"] or ""
+ fields["alpn"] = ",".join(ch_data["alpn"])
+ fields["raw_ciphers"] = "-".join(str(c) for c in ch_data["cipher_suites"])
+ fields["raw_extensions"] = "-".join(str(e) for e in ch_data["extensions"])
+ if ch_data.get("resumption", {}).get("resumption_attempted"):
+ fields["resumption"] = ",".join(ch_data["resumption"]["mechanisms"])
+
+ rtt_data = _tcp_rtt.pop(rev_key, None)
+ if rtt_data:
+ fields["ja4l_rtt_ms"] = str(rtt_data["rtt_ms"])
+ fields["ja4l_client_ttl"] = str(rtt_data["client_ttl"])
+
+ _log("tls_session", severity=SEVERITY_WARNING, **fields)
+ return
+
+ # Attempt Certificate parse (TLS 1.2 only — 1.3 encrypts it)
+ cert = _parse_certificate(payload)
+ if cert is not None:
+ # Match to a session — the cert comes from the server side
+ rev_key = (dst_ip, dst_port, src_ip, src_port)
+ ch_data = _sessions.get(rev_key)
+
+ cert_fields: dict[str, Any] = {
+ "src_ip": dst_ip,
+ "src_port": str(dst_port),
+ "dst_ip": src_ip,
+ "dst_port": str(src_port),
+ "subject_cn": cert["subject_cn"],
+ "issuer": cert["issuer"],
+ "self_signed": str(cert["self_signed"]).lower(),
+ "not_before": cert["not_before"],
+ "not_after": cert["not_after"],
+ }
+ if cert["sans"]:
+ cert_fields["sans"] = ",".join(cert["sans"])
+ if ch_data:
+ cert_fields["sni"] = ch_data.get("sni", "")
+
+ _log("tls_certificate", **cert_fields)
+
+
+# ─── Entry point ─────────────────────────────────────────────────────────────
+
+if __name__ == "__main__":
+ _log("startup", msg=f"sniffer started node={NODE_NAME}")
+ sniff(
+ filter="tcp",
+ prn=_on_packet,
+ store=False,
+ )
diff --git a/templates/snmp/Dockerfile b/decnet/templates/snmp/Dockerfile
similarity index 85%
rename from templates/snmp/Dockerfile
rename to decnet/templates/snmp/Dockerfile
index 5a452e9..9b79675 100644
--- a/templates/snmp/Dockerfile
+++ b/decnet/templates/snmp/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 161/udp
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/snmp/entrypoint.sh b/decnet/templates/snmp/entrypoint.sh
similarity index 100%
rename from templates/snmp/entrypoint.sh
rename to decnet/templates/snmp/entrypoint.sh
diff --git a/templates/snmp/server.py b/decnet/templates/snmp/server.py
similarity index 98%
rename from templates/snmp/server.py
rename to decnet/templates/snmp/server.py
index 34bb7bd..9410939 100644
--- a/templates/snmp/server.py
+++ b/decnet/templates/snmp/server.py
@@ -9,7 +9,7 @@ Logs all requests as JSON.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "switch")
SERVICE_NAME = "snmp"
@@ -68,7 +68,6 @@ _OID_VALUES = {
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/snmp/syslog_bridge.py b/decnet/templates/snmp/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/snmp/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/decnet/templates/ssh/Dockerfile b/decnet/templates/ssh/Dockerfile
new file mode 100644
index 0000000..5e91886
--- /dev/null
+++ b/decnet/templates/ssh/Dockerfile
@@ -0,0 +1,116 @@
+ARG BASE_IMAGE=debian:bookworm-slim
+FROM ${BASE_IMAGE}
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+ openssh-server \
+ sudo \
+ rsyslog \
+ curl \
+ wget \
+ vim \
+ nano \
+ net-tools \
+ procps \
+ htop \
+ git \
+ inotify-tools \
+ psmisc \
+ iproute2 \
+ iputils-ping \
+ ca-certificates \
+ nmap \
+ jq \
+ python3 \
+ && rm -rf /var/lib/apt/lists/*
+
+RUN mkdir -p /var/run/sshd /root/.ssh /var/log/journal /var/lib/systemd/coredump \
+ && chmod 700 /var/lib/systemd/coredump
+
+# sshd_config: allow root + password auth; VERBOSE so session lines carry
+# client IP + session PID (needed for file-capture attribution).
+RUN sed -i \
+ -e 's|^#\?PermitRootLogin.*|PermitRootLogin yes|' \
+ -e 's|^#\?PasswordAuthentication.*|PasswordAuthentication yes|' \
+ -e 's|^#\?ChallengeResponseAuthentication.*|ChallengeResponseAuthentication no|' \
+ -e 's|^#\?LogLevel.*|LogLevel VERBOSE|' \
+ /etc/ssh/sshd_config
+
+# rsyslog: forward auth.* and user.* to PID 1's stdout in RFC 5424 format.
+# /proc/1/fd/1 is the container-stdout fd Docker attached — writing there
+# surfaces lines in `docker logs` without needing a named pipe + relay cat
+# (which would be readable AND writable by any root-in-container process).
+RUN printf '%s\n' \
+ '# auth + user events → container stdout as RFC 5424' \
+ '$template RFC5424fmt,"<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% %APP-NAME% %PROCID% %MSGID% %STRUCTURED-DATA% %msg%\n"' \
+ 'auth,authpriv.* /proc/1/fd/1;RFC5424fmt' \
+ 'user.* /proc/1/fd/1;RFC5424fmt' \
+ > /etc/rsyslog.d/50-journal-forward.conf
+
+# Silence default catch-all rules so we own auth/user routing exclusively.
+# Also disable rsyslog's privilege drop: PID 1's stdout (/proc/1/fd/1) is
+# owned by root, so a syslog-user rsyslogd gets EACCES and silently drops
+# every auth/user line (bash CMD events + file_captured emissions).
+RUN sed -i \
+ -e 's|^\(\*\.\*;auth,authpriv\.none\)|#\1|' \
+ -e 's|^auth,authpriv\.\*|#auth,authpriv.*|' \
+ -e 's|^\$PrivDropToUser|#$PrivDropToUser|' \
+ -e 's|^\$PrivDropToGroup|#$PrivDropToGroup|' \
+ /etc/rsyslog.conf
+
+# Sudo: log to syslog (auth facility) AND a local file with full I/O capture
+RUN echo 'Defaults logfile="/var/log/sudo.log"' >> /etc/sudoers && \
+ echo 'Defaults syslog=auth' >> /etc/sudoers && \
+ echo 'Defaults log_input,log_output' >> /etc/sudoers
+
+# Lived-in environment: motd, shell aliases, fake project files
+RUN echo "Ubuntu 22.04.3 LTS" > /etc/issue.net && \
+ echo "Welcome to Ubuntu 22.04.3 LTS (GNU/Linux 5.15.0-88-generic x86_64)" > /etc/motd && \
+ echo "" >> /etc/motd && \
+ echo " * Documentation: https://help.ubuntu.com" >> /etc/motd && \
+ echo " * Management: https://landscape.canonical.com" >> /etc/motd && \
+ echo " * Support: https://ubuntu.com/advantage" >> /etc/motd
+
+RUN echo 'alias ll="ls -alF"' >> /root/.bashrc && \
+ echo 'alias la="ls -A"' >> /root/.bashrc && \
+ echo 'alias l="ls -CF"' >> /root/.bashrc && \
+ echo 'export HISTSIZE=1000' >> /root/.bashrc && \
+ echo 'export HISTFILESIZE=2000' >> /root/.bashrc && \
+ echo 'PROMPT_COMMAND='"'"'logger -p user.info -t bash "CMD uid=$UID user=$USER src=${SSH_CLIENT%% *} pwd=$PWD cmd=$(history 1 | sed "s/^ *[0-9]* *//")";'"'" >> /root/.bashrc
+
+# Fake project files to look lived-in
+RUN mkdir -p /root/projects /root/backups /var/www/html && \
+ printf '# TODO: migrate DB to new server\n# check cron jobs\n# update SSL cert\n' > /root/notes.txt && \
+ printf 'DB_HOST=10.0.0.5\nDB_USER=admin\nDB_PASS=changeme123\nDB_NAME=prod_db\n' > /root/projects/.env && \
+ printf '[Unit]\nDescription=App Server\n[Service]\nExecStart=/usr/bin/python3 /opt/app/server.py\n' > /root/projects/app.service
+
+# Stage all capture sources in a scratch dir. Nothing here survives the layer:
+# _build_stealth.py packs syslog_bridge.py + emit_capture.py + capture.sh into
+# XOR+gzip+base64 blobs embedded directly in /entrypoint.sh, and the whole
+# /tmp/build tree is wiped at the end of the RUN — so the final image has no
+# `.py` file under /opt and no `journal-relay` script under /usr/libexec/udev.
+COPY entrypoint.sh capture.sh syslog_bridge.py emit_capture.py \
+ argv_zap.c _build_stealth.py /tmp/build/
+
+# argv_zap is compiled into a shared object disguised as a multiarch
+# udev-companion library (sits next to real libudev.so.1). gcc is installed
+# only for this build step and purged in the same layer.
+RUN set -eu \
+ && apt-get update \
+ && apt-get install -y --no-install-recommends gcc libc6-dev \
+ && mkdir -p /usr/lib/x86_64-linux-gnu /usr/libexec/udev \
+ && gcc -O2 -fPIC -shared \
+ -o /usr/lib/x86_64-linux-gnu/libudev-shared.so.1 \
+ /tmp/build/argv_zap.c -ldl \
+ && apt-get purge -y gcc libc6-dev \
+ && apt-get autoremove -y \
+ && rm -rf /var/lib/apt/lists/* \
+ && ln -sf /usr/bin/inotifywait /usr/libexec/udev/kmsg-watch \
+ && python3 /tmp/build/_build_stealth.py \
+ && rm -rf /tmp/build
+
+EXPOSE 22
+
+HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
+ CMD kill -0 1 || exit 1
+
+ENTRYPOINT ["/entrypoint.sh"]
diff --git a/decnet/templates/ssh/_build_stealth.py b/decnet/templates/ssh/_build_stealth.py
new file mode 100644
index 0000000..a3a4ceb
--- /dev/null
+++ b/decnet/templates/ssh/_build_stealth.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Build-time helper: merge capture Python sources, XOR+gzip+base64 pack them
+and the capture.sh loop, and render the final /entrypoint.sh from its
+templated form.
+
+Runs inside the Docker build. Reads from /tmp/build/, writes /entrypoint.sh.
+"""
+
+from __future__ import annotations
+
+import base64
+import gzip
+import random
+import sys
+from pathlib import Path
+
+BUILD = Path("/tmp/build")
+
+
+def _merge_python() -> str:
+ bridge = (BUILD / "syslog_bridge.py").read_text()
+ emit = (BUILD / "emit_capture.py").read_text()
+
+ def _clean(src: str) -> tuple[list[str], list[str]]:
+ """Return (future_imports, other_lines) with noise stripped."""
+ futures: list[str] = []
+ rest: list[str] = []
+ for line in src.splitlines():
+ ls = line.lstrip()
+ if ls.startswith("from __future__"):
+ futures.append(line)
+ elif ls.startswith("sys.path.insert") or ls.startswith("from syslog_bridge"):
+ continue
+ else:
+ rest.append(line)
+ return futures, rest
+
+ b_fut, b_rest = _clean(bridge)
+ e_fut, e_rest = _clean(emit)
+
+ # Deduplicate future imports and hoist to the very top.
+ seen: set[str] = set()
+ futures: list[str] = []
+ for line in (*b_fut, *e_fut):
+ stripped = line.strip()
+ if stripped not in seen:
+ seen.add(stripped)
+ futures.append(line)
+
+ header = "\n".join(futures)
+ body = "\n".join(b_rest) + "\n\n" + "\n".join(e_rest)
+ return (header + "\n" if header else "") + body
+
+
+def _pack(text: str, key: int) -> str:
+ gz = gzip.compress(text.encode("utf-8"))
+ xored = bytes(b ^ key for b in gz)
+ return base64.b64encode(xored).decode("ascii")
+
+
+def main() -> int:
+ key = random.SystemRandom().randint(1, 255)
+
+ merged_py = _merge_python()
+ capture_sh = (BUILD / "capture.sh").read_text()
+
+ emit_b64 = _pack(merged_py, key)
+ relay_b64 = _pack(capture_sh, key)
+
+ tpl = (BUILD / "entrypoint.sh").read_text()
+ rendered = (
+ tpl.replace("__STEALTH_KEY__", str(key))
+ .replace("__EMIT_CAPTURE_B64__", emit_b64)
+ .replace("__JOURNAL_RELAY_B64__", relay_b64)
+ )
+
+ for marker in ("__STEALTH_KEY__", "__EMIT_CAPTURE_B64__", "__JOURNAL_RELAY_B64__"):
+ if marker in rendered:
+ print(f"build: placeholder {marker} still present after render", file=sys.stderr)
+ return 1
+
+ Path("/entrypoint.sh").write_text(rendered)
+ Path("/entrypoint.sh").chmod(0o755)
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/decnet/templates/ssh/argv_zap.c b/decnet/templates/ssh/argv_zap.c
new file mode 100644
index 0000000..4f60996
--- /dev/null
+++ b/decnet/templates/ssh/argv_zap.c
@@ -0,0 +1,65 @@
+/*
+ * argv_zap.so — LD_PRELOAD shim that blanks argv[1..] from /proc/PID/cmdline
+ * after the target binary has parsed its arguments.
+ *
+ * Rationale: exec -a can rewrite argv[0], but the remaining args (paths,
+ * flags) remain visible via `ps aux`. By hooking __libc_start_main we can
+ * copy argv into heap-backed storage, hand that to the real main, then
+ * zero the stack-resident argv region so the kernel's cmdline reader
+ * returns just argv[0].
+ *
+ * Usage:
+ * gcc -O2 -fPIC -shared -o argv_zap.so argv_zap.c -ldl
+ * ARGV_ZAP_COMM=kmsg-watch LD_PRELOAD=/path/argv_zap.so \
+ * exec -a "kmsg-watch" inotifywait …
+ */
+
+#define _GNU_SOURCE
+#include
+#include
+#include
+#include
+
+typedef int (*main_t)(int, char **, char **);
+typedef int (*libc_start_main_t)(main_t, int, char **,
+ void (*)(void), void (*)(void),
+ void (*)(void), void *);
+
+static main_t real_main;
+
+static int wrapped_main(int argc, char **argv, char **envp) {
+ /* Heap-copy argv so the target keeps its arguments. */
+ char **heap_argv = (char **)calloc(argc + 1, sizeof(char *));
+ if (heap_argv) {
+ for (int i = 0; i < argc; i++) {
+ heap_argv[i] = strdup(argv[i] ? argv[i] : "");
+ }
+ }
+
+ /* Zero the contiguous argv[1..] region (argv[0] stays for ps). */
+ if (argc > 1 && argv[1] && argv[argc - 1]) {
+ char *start = argv[1];
+ char *end = argv[argc - 1] + strlen(argv[argc - 1]);
+ if (end > start) memset(start, 0, (size_t)(end - start));
+ }
+
+ /* Optional comm rename so /proc/self/comm mirrors the argv[0] disguise.
+ * Read from ARGV_ZAP_COMM so different callers can pick their own name
+ * (kmsg-watch for inotifywait, journal-relay for the watcher bash, …).
+ * Unset afterwards so children don't accidentally inherit the override. */
+ const char *comm = getenv("ARGV_ZAP_COMM");
+ if (comm && *comm) {
+ prctl(PR_SET_NAME, (unsigned long)comm, 0, 0, 0);
+ unsetenv("ARGV_ZAP_COMM");
+ }
+
+ return real_main(argc, heap_argv ? heap_argv : argv, envp);
+}
+
+int __libc_start_main(main_t main_fn, int argc, char **argv,
+ void (*init)(void), void (*fini)(void),
+ void (*rtld_fini)(void), void *stack_end) {
+ real_main = main_fn;
+ libc_start_main_t real = (libc_start_main_t)dlsym(RTLD_NEXT, "__libc_start_main");
+ return real(wrapped_main, argc, argv, init, fini, rtld_fini, stack_end);
+}
diff --git a/decnet/templates/ssh/capture.sh b/decnet/templates/ssh/capture.sh
new file mode 100755
index 0000000..21952c5
--- /dev/null
+++ b/decnet/templates/ssh/capture.sh
@@ -0,0 +1,265 @@
+#!/bin/bash
+# systemd-journal relay helper: mirrors newly-written files under a
+# monitored set of paths into the coredump staging directory and emits
+# a structured journal line per event.
+#
+# `lastpipe` runs the tail of `inotify | while` in the current shell so
+# the process tree stays flat (one bash, not two). Job control must be
+# off for lastpipe to apply — non-interactive scripts already have it off.
+shopt -s lastpipe
+set +m
+
+set -u
+
+CAPTURE_DIR="${CAPTURE_DIR:-/var/lib/systemd/coredump}"
+CAPTURE_MAX_BYTES="${CAPTURE_MAX_BYTES:-52428800}" # 50 MiB
+CAPTURE_WATCH_PATHS="${CAPTURE_WATCH_PATHS:-/root /tmp /var/tmp /home /var/www /opt /dev/shm}"
+# Invoke inotifywait through the udev-side symlink; fall back to the real
+# binary if the symlink is missing.
+INOTIFY_BIN="${INOTIFY_BIN:-/usr/libexec/udev/kmsg-watch}"
+[ -x "$INOTIFY_BIN" ] || INOTIFY_BIN="$(command -v inotifywait)"
+
+mkdir -p "$CAPTURE_DIR"
+chmod 700 "$CAPTURE_DIR"
+
+# Filenames we never capture (boot noise, self-writes).
+_is_ignored_path() {
+ local p="$1"
+ case "$p" in
+ "$CAPTURE_DIR"/*) return 0 ;;
+ /var/lib/systemd/*) return 0 ;;
+ */.bash_history) return 0 ;;
+ */.viminfo) return 0 ;;
+ */ssh_host_*_key*) return 0 ;;
+ esac
+ return 1
+}
+
+# Resolve the writer PID best-effort. Prints the PID or nothing.
+_writer_pid() {
+ local path="$1"
+ local pid
+ pid="$(fuser "$path" 2>/dev/null | tr -d ' \t\n')"
+ if [ -n "$pid" ]; then
+ printf '%s' "${pid%% *}"
+ return
+ fi
+ # Fallback: scan /proc/*/fd for an open handle on the path.
+ for fd_link in /proc/[0-9]*/fd/*; do
+ [ -L "$fd_link" ] || continue
+ if [ "$(readlink -f "$fd_link" 2>/dev/null)" = "$path" ]; then
+ printf '%s' "$(echo "$fd_link" | awk -F/ '{print $3}')"
+ return
+ fi
+ done
+}
+
+# Walk PPid chain from $1 until we hit an sshd session leader.
+# Prints: (empty on no match).
+_walk_to_sshd() {
+ local pid="$1"
+ local depth=0
+ while [ -n "$pid" ] && [ "$pid" != "0" ] && [ "$pid" != "1" ] && [ $depth -lt 20 ]; do
+ local cmd
+ cmd="$(tr '\0' ' ' < "/proc/$pid/cmdline" 2>/dev/null)"
+ # sshd session leaders look like: "sshd: root@pts/0" or "sshd: root@notty"
+ if echo "$cmd" | grep -qE '^sshd: [^ ]+@'; then
+ local user
+ user="$(echo "$cmd" | sed -E 's/^sshd: ([^@]+)@.*/\1/')"
+ printf '%s %s' "$pid" "$user"
+ return
+ fi
+ pid="$(awk '/^PPid:/ {print $2}' "/proc/$pid/status" 2>/dev/null)"
+ depth=$((depth + 1))
+ done
+}
+
+# Emit a JSON array of currently-established SSH peers.
+# Each item: {pid, src_ip, src_port}.
+_ss_sessions_json() {
+ ss -Htnp state established sport = :22 2>/dev/null \
+ | awk '
+ {
+ peer=$4; local_=$3;
+ # peer looks like 198.51.100.7:55342 (may be IPv6 [::1]:x)
+ n=split(peer, a, ":");
+ port=a[n];
+ ip=peer; sub(":" port "$", "", ip);
+ gsub(/[\[\]]/, "", ip);
+ # extract pid from users:(("sshd",pid=1234,fd=5))
+ pid="";
+ if (match($0, /pid=[0-9]+/)) {
+ pid=substr($0, RSTART+4, RLENGTH-4);
+ }
+ printf "{\"pid\":%s,\"src_ip\":\"%s\",\"src_port\":%s}\n",
+ (pid==""?"null":pid), ip, (port+0);
+ }' \
+ | jq -s '.'
+}
+
+# Emit a JSON array of logged-in users from utmp.
+# Each item: {user, src_ip, login_at}.
+_who_sessions_json() {
+ who --ips 2>/dev/null \
+ | awk '{ printf "{\"user\":\"%s\",\"tty\":\"%s\",\"login_at\":\"%s %s\",\"src_ip\":\"%s\"}\n", $1, $2, $3, $4, $NF }' \
+ | jq -s '.'
+}
+
+_capture_one() {
+ local src="$1"
+ [ -f "$src" ] || return 0
+ _is_ignored_path "$src" && return 0
+
+ local size
+ size="$(stat -c '%s' "$src" 2>/dev/null)"
+ [ -z "$size" ] && return 0
+ if [ "$size" -gt "$CAPTURE_MAX_BYTES" ]; then
+ logger -p user.info -t systemd-journal "file_skipped size=$size path=$src reason=oversize"
+ return 0
+ fi
+
+ # Attribution first — PID may disappear after the copy races.
+ local writer_pid writer_comm writer_cmdline writer_uid writer_loginuid
+ writer_pid="$(_writer_pid "$src")"
+ if [ -n "$writer_pid" ] && [ -d "/proc/$writer_pid" ]; then
+ writer_comm="$(cat "/proc/$writer_pid/comm" 2>/dev/null)"
+ writer_cmdline="$(tr '\0' ' ' < "/proc/$writer_pid/cmdline" 2>/dev/null)"
+ writer_uid="$(awk '/^Uid:/ {print $2}' "/proc/$writer_pid/status" 2>/dev/null)"
+ writer_loginuid="$(cat "/proc/$writer_pid/loginuid" 2>/dev/null)"
+ fi
+
+ local ssh_pid ssh_user
+ if [ -n "$writer_pid" ]; then
+ read -r ssh_pid ssh_user < <(_walk_to_sshd "$writer_pid" || true)
+ fi
+
+ local ss_json who_json
+ ss_json="$(_ss_sessions_json 2>/dev/null || echo '[]')"
+ who_json="$(_who_sessions_json 2>/dev/null || echo '[]')"
+
+ # Resolve src_ip via ss by matching ssh_pid.
+ local src_ip="" src_port="null" attribution="unknown"
+ if [ -n "${ssh_pid:-}" ]; then
+ local matched
+ matched="$(echo "$ss_json" | jq -c --argjson p "$ssh_pid" '.[] | select(.pid==$p)')"
+ if [ -n "$matched" ]; then
+ src_ip="$(echo "$matched" | jq -r '.src_ip')"
+ src_port="$(echo "$matched" | jq -r '.src_port')"
+ attribution="pid-chain"
+ fi
+ fi
+ # Fallback 1: ss-only. scp/wget/sftp close their fd before close_write
+ # fires, so fuser/proc-fd walks miss them. If there's exactly one live
+ # sshd session, attribute to it. With multiple, attribute to the first
+ # but tag ambiguous so analysts know to cross-check concurrent_sessions.
+ if [ "$attribution" = "unknown" ]; then
+ local ss_len
+ ss_len="$(echo "$ss_json" | jq 'length')"
+ if [ "$ss_len" -ge 1 ]; then
+ src_ip="$(echo "$ss_json" | jq -r '.[0].src_ip')"
+ src_port="$(echo "$ss_json" | jq -r '.[0].src_port')"
+ ssh_pid="$(echo "$ss_json" | jq -r '.[0].pid // empty')"
+ if [ -n "${ssh_pid:-}" ] && [ -d "/proc/$ssh_pid" ]; then
+ local ssh_cmd
+ ssh_cmd="$(tr '\0' ' ' < "/proc/$ssh_pid/cmdline" 2>/dev/null)"
+ ssh_user="$(echo "$ssh_cmd" | sed -nE 's/^sshd: ([^@]+)@.*/\1/p')"
+ fi
+ if [ "$ss_len" -eq 1 ]; then
+ attribution="ss-only"
+ else
+ attribution="ss-ambiguous"
+ fi
+ fi
+ fi
+
+ # Fallback 2: utmp. Weakest signal; often empty in containers.
+ if [ "$attribution" = "unknown" ] && [ "$(echo "$who_json" | jq 'length')" -gt 0 ]; then
+ src_ip="$(echo "$who_json" | jq -r '.[0].src_ip')"
+ attribution="utmp-only"
+ fi
+
+ local sha
+ sha="$(sha256sum "$src" 2>/dev/null | awk '{print $1}')"
+ [ -z "$sha" ] && return 0
+
+ local ts base stored_as
+ ts="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
+ base="$(basename "$src")"
+ stored_as="${ts}_${sha:0:12}_${base}"
+
+ cp --preserve=timestamps,ownership "$src" "$CAPTURE_DIR/$stored_as" 2>/dev/null || return 0
+
+ local mtime
+ mtime="$(stat -c '%y' "$src" 2>/dev/null)"
+
+ # Prefer NODE_NAME (the deployer-supplied decky identifier) over
+ # $HOSTNAME, which is a cosmetic fake like "SRV-DEV-36" set by
+ # entrypoint.sh. The UI and the artifact bind mount both key on the
+ # decky name, so using $HOSTNAME here makes /artifacts/{decky}/... URLs
+ # unresolvable.
+ local decky="${NODE_NAME:-${HOSTNAME:-unknown}}"
+
+ # One syslog line, no sidecar. Flat summary fields ride as top-level SD
+ # params (searchable pills in the UI); bulky nested structures (writer
+ # cmdline, concurrent_sessions, ss_snapshot) are base64-packed into a
+ # single meta_json_b64 SD param by emit_capture.py.
+ jq -n \
+ --arg _hostname "$decky" \
+ --arg _service "ssh" \
+ --arg _event_type "file_captured" \
+ --arg captured_at "$ts" \
+ --arg orig_path "$src" \
+ --arg stored_as "$stored_as" \
+ --arg sha256 "$sha" \
+ --argjson size "$size" \
+ --arg mtime "$mtime" \
+ --arg attribution "$attribution" \
+ --arg writer_pid "${writer_pid:-}" \
+ --arg writer_comm "${writer_comm:-}" \
+ --arg writer_cmdline "${writer_cmdline:-}" \
+ --arg writer_uid "${writer_uid:-}" \
+ --arg writer_loginuid "${writer_loginuid:-}" \
+ --arg ssh_pid "${ssh_pid:-}" \
+ --arg ssh_user "${ssh_user:-}" \
+ --arg src_ip "$src_ip" \
+ --arg src_port "$src_port" \
+ --argjson concurrent "$who_json" \
+ --argjson ss_snapshot "$ss_json" \
+ '{
+ _hostname: $_hostname,
+ _service: $_service,
+ _event_type: $_event_type,
+ captured_at: $captured_at,
+ orig_path: $orig_path,
+ stored_as: $stored_as,
+ sha256: $sha256,
+ size: $size,
+ mtime: $mtime,
+ attribution: $attribution,
+ writer_pid: $writer_pid,
+ writer_comm: $writer_comm,
+ writer_uid: $writer_uid,
+ ssh_pid: $ssh_pid,
+ ssh_user: $ssh_user,
+ src_ip: $src_ip,
+ src_port: (if $src_port == "null" or $src_port == "" then "" else $src_port end),
+ writer_cmdline: $writer_cmdline,
+ writer_loginuid: $writer_loginuid,
+ concurrent_sessions: $concurrent,
+ ss_snapshot: $ss_snapshot
+ }' \
+ | python3 <(printf '%s' "$EMIT_CAPTURE_PY")
+}
+
+# Main loop.
+# LD_PRELOAD libudev-shared.so.1 blanks argv[1..] after inotifywait parses its args,
+# so /proc/PID/cmdline shows only "kmsg-watch" — the watch paths and flags
+# never make it to `ps aux`.
+# shellcheck disable=SC2086
+ARGV_ZAP_COMM=kmsg-watch LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libudev-shared.so.1 "$INOTIFY_BIN" -m -r -q \
+ --event close_write --event moved_to \
+ --format '%w%f' \
+ $CAPTURE_WATCH_PATHS 2>/dev/null \
+| while IFS= read -r path; do
+ _capture_one "$path" &
+done
diff --git a/decnet/templates/ssh/emit_capture.py b/decnet/templates/ssh/emit_capture.py
new file mode 100644
index 0000000..b2c4b8d
--- /dev/null
+++ b/decnet/templates/ssh/emit_capture.py
@@ -0,0 +1,84 @@
+#!/usr/bin/env python3
+"""
+Emit an RFC 5424 `file_captured` line to stdout.
+
+Called by capture.sh after a file drop has been mirrored into the quarantine
+directory. Reads a single JSON object from stdin describing the event; emits
+one syslog line that the collector parses into `logs.fields`.
+
+The input JSON may contain arbitrary nested structures (writer cmdline,
+concurrent_sessions, ss_snapshot). Bulky fields are base64-encoded into a
+single `meta_json_b64` SD param — this avoids pathological characters
+(`]`, `"`, `\\`) that the collector's SD-block regex cannot losslessly
+round-trip when embedded directly.
+"""
+
+from __future__ import annotations
+
+import base64
+import json
+import os
+import sys
+
+sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
+
+from syslog_bridge import syslog_line, write_syslog_file # noqa: E402
+
+# Flat fields ride as individual SD params (searchable, rendered as pills).
+# Everything else is rolled into the base64 meta blob.
+_FLAT_FIELDS: tuple[str, ...] = (
+ "stored_as",
+ "sha256",
+ "size",
+ "orig_path",
+ "src_ip",
+ "src_port",
+ "ssh_user",
+ "ssh_pid",
+ "attribution",
+ "writer_pid",
+ "writer_comm",
+ "writer_uid",
+ "mtime",
+)
+
+
+def main() -> int:
+ raw = sys.stdin.read()
+ if not raw.strip():
+ print("emit_capture: empty stdin", file=sys.stderr)
+ return 1
+ try:
+ event: dict = json.loads(raw)
+ except json.JSONDecodeError as exc:
+ print(f"emit_capture: bad JSON: {exc}", file=sys.stderr)
+ return 1
+
+ hostname = str(event.pop("_hostname", None) or os.environ.get("HOSTNAME") or "-")
+ service = str(event.pop("_service", "ssh"))
+ event_type = str(event.pop("_event_type", "file_captured"))
+
+ fields: dict[str, str] = {}
+ for key in _FLAT_FIELDS:
+ if key in event:
+ value = event.pop(key)
+ if value is None or value == "":
+ continue
+ fields[key] = str(value)
+
+ if event:
+ payload = json.dumps(event, separators=(",", ":"), ensure_ascii=False, sort_keys=True)
+ fields["meta_json_b64"] = base64.b64encode(payload.encode("utf-8")).decode("ascii")
+
+ line = syslog_line(
+ service=service,
+ hostname=hostname,
+ event_type=event_type,
+ **fields,
+ )
+ write_syslog_file(line)
+ return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/decnet/templates/ssh/entrypoint.sh b/decnet/templates/ssh/entrypoint.sh
new file mode 100644
index 0000000..7ecd6fa
--- /dev/null
+++ b/decnet/templates/ssh/entrypoint.sh
@@ -0,0 +1,79 @@
+#!/bin/bash
+set -e
+
+# Configure root password (default: admin)
+ROOT_PASSWORD="${SSH_ROOT_PASSWORD:-admin}"
+echo "root:${ROOT_PASSWORD}" | chpasswd
+
+# Optional: override hostname inside container
+if [ -n "$SSH_HOSTNAME" ]; then
+ echo "$SSH_HOSTNAME" > /etc/hostname
+ hostname "$SSH_HOSTNAME"
+fi
+
+# Generate host keys if missing (first boot)
+ssh-keygen -A
+
+# Fake bash history so the box looks used
+if [ ! -f /root/.bash_history ]; then
+ cat > /root/.bash_history <<'HIST'
+apt update && apt upgrade -y
+systemctl status nginx
+tail -f /var/log/syslog
+df -h
+htop
+ps aux | grep python
+git pull origin main
+cd /root/projects
+vim notes.txt
+crontab -e
+ls /var/www/html
+HIST
+fi
+
+# Logging pipeline: rsyslogd (RFC 5424) → /proc/1/fd/1 → Docker log capture.
+# No intermediate pipe/relay — a named FIFO would be readable AND writable
+# by any root-in-container process, letting an attacker either eavesdrop on
+# the SIEM feed or inject forged log lines.
+if [ -n "${NODE_NAME:-}" ]; then
+ printf '$LocalHostName %s\n' "$NODE_NAME" > /etc/rsyslog.d/00-localhostname.conf
+fi
+rsyslogd
+
+# ─── systemd-journal companion bootstrap ─────────────────────────────────────
+# The next block primes cached udev/journal helpers from in-memory tables.
+# Tables are XOR+gzip+base64 packed at image build. Keeping the data inline
+# avoids a separate /usr/libexec script (which would be visible and readable)
+# and means the helpers only ever live in this process's memory + an
+# anonymous pipe the kernel hands to python via /dev/fd.
+_STEALTH_KEY=__STEALTH_KEY__
+_EMIT_CAPTURE_B64='__EMIT_CAPTURE_B64__'
+_JOURNAL_RELAY_B64='__JOURNAL_RELAY_B64__'
+
+_decode() {
+ printf '%s' "$1" | base64 -d | python3 -c '
+import sys
+k = '"$_STEALTH_KEY"'
+d = sys.stdin.buffer.read()
+sys.stdout.buffer.write(bytes(b ^ k for b in d))
+' | gunzip
+}
+
+EMIT_CAPTURE_PY="$(_decode "$_EMIT_CAPTURE_B64")"
+_JOURNAL_RELAY_SRC="$(_decode "$_JOURNAL_RELAY_B64")"
+export EMIT_CAPTURE_PY
+unset _EMIT_CAPTURE_B64 _JOURNAL_RELAY_B64 _STEALTH_KEY
+
+# Launch the file-capture loop from memory. LD_PRELOAD + ARGV_ZAP_COMM blank
+# argv[1..] so /proc/PID/cmdline shows only "journal-relay".
+(
+ export CAPTURE_DIR=/var/lib/systemd/coredump
+ export LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libudev-shared.so.1
+ export ARGV_ZAP_COMM=journal-relay
+ exec -a journal-relay bash -c "$_JOURNAL_RELAY_SRC"
+) &
+
+unset _JOURNAL_RELAY_SRC
+
+# sshd logs via syslog — no -e flag, so auth events flow through rsyslog → /proc/1/fd/1 → stdout
+exec /usr/sbin/sshd -D
diff --git a/decnet/templates/ssh/syslog_bridge.py b/decnet/templates/ssh/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/ssh/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/decnet/templates/syslog_bridge.py b/decnet/templates/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/telnet/Dockerfile b/decnet/templates/telnet/Dockerfile
similarity index 86%
rename from templates/telnet/Dockerfile
rename to decnet/templates/telnet/Dockerfile
index ad66570..483446b 100644
--- a/templates/telnet/Dockerfile
+++ b/decnet/templates/telnet/Dockerfile
@@ -10,11 +10,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
# rsyslog: forward auth.* and user.* to named pipe in RFC 5424 format
RUN printf '%s\n' \
- '# DECNET log bridge — auth + user events → named pipe as RFC 5424' \
+ '# syslog-relay log bridge — auth + user events → named pipe as RFC 5424' \
'$template RFC5424fmt,"<%PRI%>1 %TIMESTAMP:::date-rfc3339% %HOSTNAME% %APP-NAME% %PROCID% %MSGID% %STRUCTURED-DATA% %msg%\n"' \
- 'auth,authpriv.* |/var/run/decnet-logs;RFC5424fmt' \
- 'user.* |/var/run/decnet-logs;RFC5424fmt' \
- > /etc/rsyslog.d/99-decnet.conf
+ 'auth,authpriv.* |/run/systemd/journal/syslog-relay;RFC5424fmt' \
+ 'user.* |/run/systemd/journal/syslog-relay;RFC5424fmt' \
+ > /etc/rsyslog.d/50-journal-forward.conf
# Disable imklog — containers can't read /proc/kmsg
RUN sed -i 's/^\(module(load="imklog"\)/# \1/' /etc/rsyslog.conf
diff --git a/templates/telnet/entrypoint.sh b/decnet/templates/telnet/entrypoint.sh
similarity index 70%
rename from templates/telnet/entrypoint.sh
rename to decnet/templates/telnet/entrypoint.sh
index 81da1e4..78dff79 100644
--- a/templates/telnet/entrypoint.sh
+++ b/decnet/templates/telnet/entrypoint.sh
@@ -27,12 +27,14 @@ cat /root/.env
HIST
fi
-# Logging pipeline: named pipe → rsyslogd (RFC 5424) → stdout
-rm -f /var/run/decnet-logs
-mkfifo /var/run/decnet-logs
+# Logging pipeline: named pipe → rsyslogd (RFC 5424) → stdout.
+# Cloak the pipe path and the relay `cat` so `ps aux` / `ls /run` don't
+# betray the honeypot — see ssh/entrypoint.sh for the same pattern.
+mkdir -p /run/systemd/journal
+rm -f /run/systemd/journal/syslog-relay
+mkfifo /run/systemd/journal/syslog-relay
-# Relay pipe to stdout so Docker captures all syslog events
-cat /var/run/decnet-logs &
+bash -c 'exec -a "systemd-journal-fwd" cat /run/systemd/journal/syslog-relay' &
# Start rsyslog
rsyslogd
diff --git a/decnet/templates/telnet/syslog_bridge.py b/decnet/templates/telnet/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/telnet/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/tftp/Dockerfile b/decnet/templates/tftp/Dockerfile
similarity index 85%
rename from templates/tftp/Dockerfile
rename to decnet/templates/tftp/Dockerfile
index dc7296c..fec26b1 100644
--- a/templates/tftp/Dockerfile
+++ b/decnet/templates/tftp/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 69/udp
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/tftp/entrypoint.sh b/decnet/templates/tftp/entrypoint.sh
similarity index 100%
rename from templates/tftp/entrypoint.sh
rename to decnet/templates/tftp/entrypoint.sh
diff --git a/templates/tftp/server.py b/decnet/templates/tftp/server.py
similarity index 95%
rename from templates/tftp/server.py
rename to decnet/templates/tftp/server.py
index 602cdc9..1faf0bd 100644
--- a/templates/tftp/server.py
+++ b/decnet/templates/tftp/server.py
@@ -8,7 +8,7 @@ then responds with an error packet. Logs all requests as JSON.
import asyncio
import os
import struct
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "tftpserver")
SERVICE_NAME = "tftp"
@@ -28,7 +28,6 @@ def _error_pkt(code: int, msg: str) -> bytes:
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/tftp/syslog_bridge.py b/decnet/templates/tftp/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/tftp/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+ 1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/templates/vnc/Dockerfile b/decnet/templates/vnc/Dockerfile
similarity index 85%
rename from templates/vnc/Dockerfile
rename to decnet/templates/vnc/Dockerfile
index 62a5581..5957dee 100644
--- a/templates/vnc/Dockerfile
+++ b/decnet/templates/vnc/Dockerfile
@@ -5,13 +5,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
&& rm -rf /var/lib/apt/lists/*
-COPY decnet_logging.py /opt/decnet_logging.py
+COPY syslog_bridge.py /opt/syslog_bridge.py
COPY server.py /opt/server.py
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh
EXPOSE 5900
-RUN useradd -r -s /bin/false -d /opt decnet \
+RUN useradd -r -s /bin/false -d /opt logrelay \
&& apt-get update && apt-get install -y --no-install-recommends libcap2-bin \
&& rm -rf /var/lib/apt/lists/* \
&& (find /usr/bin/ -maxdepth 1 -name 'python3*' -type f -exec setcap 'cap_net_bind_service+eip' {} \; 2>/dev/null || true)
@@ -19,5 +19,5 @@ RUN useradd -r -s /bin/false -d /opt decnet \
HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \
CMD kill -0 1 || exit 1
-USER decnet
+USER logrelay
ENTRYPOINT ["/entrypoint.sh"]
diff --git a/templates/vnc/entrypoint.sh b/decnet/templates/vnc/entrypoint.sh
similarity index 100%
rename from templates/vnc/entrypoint.sh
rename to decnet/templates/vnc/entrypoint.sh
diff --git a/templates/vnc/server.py b/decnet/templates/vnc/server.py
similarity index 96%
rename from templates/vnc/server.py
rename to decnet/templates/vnc/server.py
index 7f8637f..3f82f6d 100644
--- a/templates/vnc/server.py
+++ b/decnet/templates/vnc/server.py
@@ -8,7 +8,7 @@ failed". Logs the raw response for offline cracking.
import asyncio
import os
-from decnet_logging import syslog_line, write_syslog_file, forward_syslog
+from syslog_bridge import syslog_line, write_syslog_file, forward_syslog
NODE_NAME = os.environ.get("NODE_NAME", "desktop")
SERVICE_NAME = "vnc"
@@ -20,7 +20,6 @@ LOG_TARGET = os.environ.get("LOG_TARGET", "")
def _log(event_type: str, severity: int = 6, **kwargs) -> None:
line = syslog_line(SERVICE_NAME, NODE_NAME, event_type, severity, **kwargs)
- print(line, flush=True)
write_syslog_file(line)
forward_syslog(line, LOG_TARGET)
diff --git a/decnet/templates/vnc/syslog_bridge.py b/decnet/templates/vnc/syslog_bridge.py
new file mode 100644
index 0000000..c0a78d0
--- /dev/null
+++ b/decnet/templates/vnc/syslog_bridge.py
@@ -0,0 +1,89 @@
+#!/usr/bin/env python3
+"""
+Shared RFC 5424 syslog helper used by service containers.
+
+Services call syslog_line() to format an RFC 5424 message, then
+write_syslog_file() to emit it to stdout — the container runtime
+captures it, and the host-side collector streams it into the log file.
+
+RFC 5424 structure:
+    <PRI>1 TIMESTAMP HOSTNAME APP-NAME PROCID MSGID [SD-ELEMENT] MSG
+
+Facility: local0 (16). SD element ID uses PEN 55555.
+"""
+
+from datetime import datetime, timezone
+from typing import Any
+
+# ─── Constants ────────────────────────────────────────────────────────────────
+
+_FACILITY_LOCAL0 = 16
+_SD_ID = "relay@55555"
+_NILVALUE = "-"
+
+SEVERITY_EMERG = 0
+SEVERITY_ALERT = 1
+SEVERITY_CRIT = 2
+SEVERITY_ERROR = 3
+SEVERITY_WARNING = 4
+SEVERITY_NOTICE = 5
+SEVERITY_INFO = 6
+SEVERITY_DEBUG = 7
+
+_MAX_HOSTNAME = 255
+_MAX_APPNAME = 48
+_MAX_MSGID = 32
+
+# ─── Formatter ────────────────────────────────────────────────────────────────
+
+def _sd_escape(value: str) -> str:
+ """Escape SD-PARAM-VALUE per RFC 5424 §6.3.3."""
+ return value.replace("\\", "\\\\").replace('"', '\\"').replace("]", "\\]")
+
+
+def _sd_element(fields: dict[str, Any]) -> str:
+ if not fields:
+ return _NILVALUE
+ params = " ".join(f'{k}="{_sd_escape(str(v))}"' for k, v in fields.items())
+ return f"[{_SD_ID} {params}]"
+
+
+def syslog_line(
+ service: str,
+ hostname: str,
+ event_type: str,
+ severity: int = SEVERITY_INFO,
+ timestamp: datetime | None = None,
+ msg: str | None = None,
+ **fields: Any,
+) -> str:
+ """
+ Return a single RFC 5424-compliant syslog line (no trailing newline).
+
+ Args:
+ service: APP-NAME (e.g. "http", "mysql")
+ hostname: HOSTNAME (node name)
+ event_type: MSGID (e.g. "request", "login_attempt")
+ severity: Syslog severity integer (default: INFO=6)
+ timestamp: UTC datetime; defaults to now
+ msg: Optional free-text MSG
+ **fields: Encoded as structured data params
+ """
+ pri = f"<{_FACILITY_LOCAL0 * 8 + severity}>"
+ ts = (timestamp or datetime.now(timezone.utc)).isoformat()
+ host = (hostname or _NILVALUE)[:_MAX_HOSTNAME]
+ appname = (service or _NILVALUE)[:_MAX_APPNAME]
+ msgid = (event_type or _NILVALUE)[:_MAX_MSGID]
+ sd = _sd_element(fields)
+ message = f" {msg}" if msg else ""
+ return f"{pri}1 {ts} {host} {appname} {_NILVALUE} {msgid} {sd}{message}"
+
+
+def write_syslog_file(line: str) -> None:
+ """Emit a syslog line to stdout for container log capture."""
+ print(line, flush=True)
+
+
+def forward_syslog(line: str, log_target: str) -> None:
+ """No-op stub. TCP forwarding is handled by rsyslog, not by service containers."""
+ pass
diff --git a/decnet/updater/__init__.py b/decnet/updater/__init__.py
new file mode 100644
index 0000000..b586e1f
--- /dev/null
+++ b/decnet/updater/__init__.py
@@ -0,0 +1,10 @@
+"""DECNET self-updater daemon.
+
+Runs on each worker alongside ``decnet agent``. Receives working-tree
+tarballs from the master and owns the agent's lifecycle: snapshot →
+install → restart → probe → auto-rollback on failure.
+
+Deliberately separate process, separate venv, separate mTLS cert so that
+a broken ``decnet agent`` push can always be rolled back by the updater
+that shipped it. See ``wiki/Remote-Updates.md``.
+"""
diff --git a/decnet/updater/app.py b/decnet/updater/app.py
new file mode 100644
index 0000000..5c5d879
--- /dev/null
+++ b/decnet/updater/app.py
@@ -0,0 +1,139 @@
+"""Updater FastAPI app — mTLS-protected endpoints for self-update.
+
+Mirrors the shape of ``decnet/agent/app.py``: bare FastAPI, docs disabled,
+handlers delegate to ``decnet.updater.executor``.
+
+Mounted by uvicorn via ``decnet.updater.server`` with ``--ssl-cert-reqs 2``;
+the CN on the peer cert tells us which endpoints are legal (``updater@*``
+only — agent certs are rejected).
+"""
+from __future__ import annotations
+
+import os as _os
+import pathlib
+
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from pydantic import BaseModel
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+from decnet.updater import executor as _exec
+
+log = get_logger("updater.app")
+
+app = FastAPI(
+ title="DECNET Self-Updater",
+ version="0.1.0",
+ docs_url=None,
+ redoc_url=None,
+ openapi_url=None,
+)
+
+
+class _Config:
+ install_dir: pathlib.Path = pathlib.Path(
+ _os.environ.get("DECNET_UPDATER_INSTALL_DIR") or str(_exec.DEFAULT_INSTALL_DIR)
+ )
+ updater_install_dir: pathlib.Path = pathlib.Path(
+ _os.environ.get("DECNET_UPDATER_UPDATER_DIR")
+ or str(_exec.DEFAULT_INSTALL_DIR / "updater")
+ )
+ agent_dir: pathlib.Path = pathlib.Path(
+ _os.environ.get("DECNET_UPDATER_AGENT_DIR") or str(pki.DEFAULT_AGENT_DIR)
+ )
+
+
+def configure(
+ install_dir: pathlib.Path,
+ updater_install_dir: pathlib.Path,
+ agent_dir: pathlib.Path,
+) -> None:
+ """Inject paths from the server launcher; must be called before serving."""
+ _Config.install_dir = install_dir
+ _Config.updater_install_dir = updater_install_dir
+ _Config.agent_dir = agent_dir
+
+
+# ------------------------------------------------------------------- schemas
+
+class RollbackResult(BaseModel):
+ status: str
+ release: dict
+ probe: str
+
+
+class ReleasesResponse(BaseModel):
+ releases: list[dict]
+
+
+# -------------------------------------------------------------------- routes
+
+@app.get("/health")
+async def health() -> dict:
+ return {
+ "status": "ok",
+ "role": "updater",
+ "releases": [r.to_dict() for r in _exec.list_releases(_Config.install_dir)],
+ }
+
+
+@app.get("/releases")
+async def releases() -> dict:
+ return {"releases": [r.to_dict() for r in _exec.list_releases(_Config.install_dir)]}
+
+
+@app.post("/update")
+async def update(
+ tarball: UploadFile = File(..., description="tar.gz of the working tree"),
+ sha: str = Form("", description="git SHA of the tree for provenance"),
+) -> dict:
+ body = await tarball.read()
+ try:
+ return _exec.run_update(
+ body, sha=sha or None,
+ install_dir=_Config.install_dir, agent_dir=_Config.agent_dir,
+ )
+ except _exec.UpdateError as exc:
+ status = 409 if exc.rolled_back else 500
+ raise HTTPException(
+ status_code=status,
+ detail={"error": str(exc), "stderr": exc.stderr, "rolled_back": exc.rolled_back},
+ ) from exc
+
+
+@app.post("/update-self")
+async def update_self(
+ tarball: UploadFile = File(...),
+ sha: str = Form(""),
+ confirm_self: str = Form("", description="Must be 'true' to proceed"),
+) -> dict:
+ if confirm_self.lower() != "true":
+ raise HTTPException(
+ status_code=400,
+ detail="self-update requires confirm_self=true (no auto-rollback)",
+ )
+ body = await tarball.read()
+ try:
+ return _exec.run_update_self(
+ body, sha=sha or None,
+ updater_install_dir=_Config.updater_install_dir,
+ )
+ except _exec.UpdateError as exc:
+ raise HTTPException(
+ status_code=500,
+ detail={"error": str(exc), "stderr": exc.stderr},
+ ) from exc
+
+
+@app.post("/rollback")
+async def rollback() -> dict:
+ try:
+ return _exec.run_rollback(
+ install_dir=_Config.install_dir, agent_dir=_Config.agent_dir,
+ )
+ except _exec.UpdateError as exc:
+ status = 404 if "no previous" in str(exc) else 500
+ raise HTTPException(
+ status_code=status,
+ detail={"error": str(exc), "stderr": exc.stderr},
+ ) from exc
diff --git a/decnet/updater/executor.py b/decnet/updater/executor.py
new file mode 100644
index 0000000..a618f4a
--- /dev/null
+++ b/decnet/updater/executor.py
@@ -0,0 +1,693 @@
+"""Update/rollback orchestrator for the DECNET self-updater.
+
+Directory layout owned by this module (root = ``install_dir``):
+
+ /
+ current -> releases/active (symlink; atomic swap == promotion)
+ releases/
+ active/ (working tree; has its own .venv)
+ prev/ (last good snapshot; restored on failure)
+ active.new/ (staging; only exists mid-update)
+ agent.pid (PID of the agent process we spawned)
+
+Rollback semantics: if the agent doesn't come back healthy after an update,
+we swap the symlink back to ``prev``, restart the agent, and return the
+captured pip/agent stderr to the caller.
+
+Seams for tests — every subprocess call goes through a module-level hook
+(`_run_pip`, `_spawn_agent`, `_probe_agent`) so tests can monkeypatch them
+without actually touching the filesystem's Python toolchain.
+"""
+from __future__ import annotations
+
+import dataclasses
+import os
+import pathlib
+import shutil
+import signal
+import ssl
+import subprocess # nosec B404
+import sys
+import tarfile
+import time
+from datetime import datetime, timezone
+from typing import Any, Callable, Optional
+
+import httpx
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+
+log = get_logger("updater.executor")
+
+DEFAULT_INSTALL_DIR = pathlib.Path("/opt/decnet")
+AGENT_PROBE_URL = "https://127.0.0.1:8765/health"
+AGENT_PROBE_ATTEMPTS = 10
+AGENT_PROBE_BACKOFF_S = 1.0
+AGENT_RESTART_GRACE_S = 10.0
+
+
+# ------------------------------------------------------------------- errors
+
+class UpdateError(RuntimeError):
+ """Raised when an update fails but the install dir is consistent.
+
+ Carries the captured stderr so the master gets actionable output.
+ """
+
+ def __init__(self, message: str, *, stderr: str = "", rolled_back: bool = False):
+ super().__init__(message)
+ self.stderr = stderr
+ self.rolled_back = rolled_back
+
+
+# -------------------------------------------------------------------- types
+
+@dataclasses.dataclass(frozen=True)
+class Release:
+ slot: str
+ sha: Optional[str]
+ installed_at: Optional[datetime]
+
+ def to_dict(self) -> dict[str, Any]:
+ return {
+ "slot": self.slot,
+ "sha": self.sha,
+ "installed_at": self.installed_at.isoformat() if self.installed_at else None,
+ }
+
+
+# ---------------------------------------------------------------- internals
+
+def _releases_dir(install_dir: pathlib.Path) -> pathlib.Path:
+ return install_dir / "releases"
+
+
+def _active_dir(install_dir: pathlib.Path) -> pathlib.Path:
+ return _releases_dir(install_dir) / "active"
+
+
+def _prev_dir(install_dir: pathlib.Path) -> pathlib.Path:
+ return _releases_dir(install_dir) / "prev"
+
+
+def _staging_dir(install_dir: pathlib.Path) -> pathlib.Path:
+ return _releases_dir(install_dir) / "active.new"
+
+
+def _current_symlink(install_dir: pathlib.Path) -> pathlib.Path:
+ return install_dir / "current"
+
+
+def _pid_file(install_dir: pathlib.Path) -> pathlib.Path:
+ return install_dir / "agent.pid"
+
+
+def _manifest_file(release: pathlib.Path) -> pathlib.Path:
+ return release / ".decnet-release.json"
+
+
+def _venv_python(release: pathlib.Path) -> pathlib.Path:
+ return release / ".venv" / "bin" / "python"
+
+
+def _heal_path_symlink(install_dir: pathlib.Path) -> None:
+ """Point /usr/local/bin/decnet at the shared venv we manage.
+
+    Pre-fix bootstraps installed into ``<install_dir>/.venv`` (editable) and
+ symlinked /usr/local/bin/decnet there, so systemd units kept executing
+ the pre-update code even after ``_run_pip`` wrote to the shared venv.
+ Fix it opportunistically on every update so already-enrolled hosts
+ recover on the next push instead of needing a manual re-enroll.
+ """
+ target = _shared_venv(install_dir) / "bin" / "decnet"
+ link = pathlib.Path("/usr/local/bin/decnet")
+ if not target.is_file():
+ return
+ try:
+ if link.is_symlink() and pathlib.Path(os.readlink(link)) == target:
+ return
+ tmp = link.with_suffix(".tmp")
+ if tmp.exists() or tmp.is_symlink():
+ tmp.unlink()
+ tmp.symlink_to(target)
+ os.replace(tmp, link)
+ log.info("repointed %s -> %s", link, target)
+ except OSError as exc:
+ log.warning("could not repoint %s: %s", link, exc)
+
+
+def _shared_venv(install_dir: pathlib.Path) -> pathlib.Path:
+ """The one stable venv that agents/updaters run out of.
+
+ Release slots ship source only. We ``pip install --force-reinstall
+ --no-deps`` into this venv on promotion so shebangs never dangle
+ across a rotation.
+ """
+ return install_dir / "venv"
+
+
+# ------------------------------------------------------------------- public
+
+def read_release(release: pathlib.Path) -> Release:
+ """Read the release manifest sidecar; tolerate absence."""
+ slot = release.name
+ mf = _manifest_file(release)
+ if not mf.is_file():
+ return Release(slot=slot, sha=None, installed_at=None)
+ import json
+
+ try:
+ data = json.loads(mf.read_text())
+ except (json.JSONDecodeError, OSError):
+ return Release(slot=slot, sha=None, installed_at=None)
+ ts = data.get("installed_at")
+ return Release(
+ slot=slot,
+ sha=data.get("sha"),
+ installed_at=datetime.fromisoformat(ts) if ts else None,
+ )
+
+
+def list_releases(install_dir: pathlib.Path) -> list[Release]:
+ out: list[Release] = []
+ for slot_dir in (_active_dir(install_dir), _prev_dir(install_dir)):
+ if slot_dir.is_dir():
+ out.append(read_release(slot_dir))
+ return out
+
+
+def clean_stale_staging(install_dir: pathlib.Path) -> None:
+ """Remove a half-extracted ``active.new`` left by a crashed update."""
+ staging = _staging_dir(install_dir)
+ if staging.exists():
+ log.warning("removing stale staging dir %s", staging)
+ shutil.rmtree(staging, ignore_errors=True)
+
+
+def extract_tarball(tarball_bytes: bytes, dest: pathlib.Path) -> None:
+ """Extract a gzipped tarball into ``dest`` (must not pre-exist).
+
+ Rejects absolute paths and ``..`` traversal in the archive.
+ """
+ import io
+
+ dest.mkdir(parents=True, exist_ok=False)
+ with tarfile.open(fileobj=io.BytesIO(tarball_bytes), mode="r:gz") as tar:
+ for member in tar.getmembers():
+ name = member.name
+ if name.startswith("/") or ".." in pathlib.PurePosixPath(name).parts:
+ raise UpdateError(f"unsafe path in tarball: {name!r}")
+ tar.extractall(dest) # nosec B202 — validated above
+
+
+# ---------------------------------------------------------------- seams
+
+def _run_pip(
+ release: pathlib.Path,
+ install_dir: Optional[pathlib.Path] = None,
+) -> subprocess.CompletedProcess:
+ """pip install ``release`` into the shared venv at ``install_dir/venv``.
+
+ The shared venv is bootstrapped on first use. ``--force-reinstall
+ --no-deps`` replaces site-packages for the decnet package only; the
+ rest of the env stays cached across updates.
+
+ Monkeypatched in tests so the test suite never shells out.
+ """
+ idir = install_dir or release.parent.parent # releases/ -> install_dir
+ venv_dir = _shared_venv(idir)
+ fresh = not venv_dir.exists()
+ if fresh:
+ subprocess.run( # nosec B603
+ [sys.executable, "-m", "venv", str(venv_dir)],
+ check=True, capture_output=True, text=True,
+ )
+ py = venv_dir / "bin" / "python"
+ # First install into a fresh venv: pull full dep tree. Subsequent updates
+ # use --no-deps so pip only replaces the decnet package.
+ args = [str(py), "-m", "pip", "install", "--force-reinstall", str(release)]
+ if not fresh:
+ args.insert(-1, "--no-deps")
+ return subprocess.run( # nosec B603
+ args, check=False, capture_output=True, text=True,
+ )
+
+
+AGENT_SYSTEMD_UNIT = "decnet-agent.service"
+FORWARDER_SYSTEMD_UNIT = "decnet-forwarder.service"
+UPDATER_SYSTEMD_UNIT = "decnet-updater.service"
+# Per-host microservices that run out of the same /opt/decnet tree. An
+# update replaces their code, so we must cycle them alongside the agent or
+# they keep serving the pre-update image. Best-effort: legacy enrollments
+# without these units installed shouldn't abort the update.
+AUXILIARY_SYSTEMD_UNITS = (
+ "decnet-collector.service", "decnet-prober.service",
+ "decnet-sniffer.service",
+)
+
+
+def _systemd_available() -> bool:
+ """True when we're running under systemd and have systemctl on PATH.
+
+ Detection is conservative: we only return True if *both* the invocation
+ marker is set (``INVOCATION_ID`` is exported by systemd for every unit)
+ and ``systemctl`` is resolvable. The env var alone can be forged; the
+ binary alone can exist on hosts running other init systems.
+ """
+ if not os.environ.get("INVOCATION_ID"):
+ return False
+ from shutil import which
+ return which("systemctl") is not None
+
+
+def _spawn_agent(install_dir: pathlib.Path) -> int:
+ """Launch the agent and return its PID.
+
+ Under systemd, restart ``decnet-agent.service`` via ``systemctl`` so the
+ new process inherits the unit's ambient capabilities (CAP_NET_ADMIN,
+ CAP_NET_RAW). Spawning with ``subprocess.Popen`` from inside the updater
+ unit would make the agent a child of the updater and therefore a member
+ of the updater's (empty) capability set — it would come up without the
+ caps needed to run MACVLAN/scapy.
+
+ Off systemd (dev boxes, manual starts), fall back to a direct Popen.
+ """
+ if _systemd_available():
+ return _spawn_agent_via_systemd(install_dir)
+ return _spawn_agent_via_popen(install_dir)
+
+
+SYSTEMD_UNIT_DIR = pathlib.Path("/etc/systemd/system")
+
+
+def _sync_systemd_units(
+ install_dir: pathlib.Path,
+ dst_root: pathlib.Path = SYSTEMD_UNIT_DIR,
+) -> bool:
+ """Copy any `etc/systemd/system/*.service` files from the active release
+ into ``dst_root`` (default ``/etc/systemd/system/``) and run
+ `daemon-reload` if anything changed.
+
+ Returns True if daemon-reload was invoked. The bootstrap installer writes
+ these files on first enrollment; the updater mirrors that on every code
+ push so unit edits (ExecStart flips, new units, cap changes) ship too.
+ Best-effort: a read-only /etc or a missing ``active/etc`` subtree is just
+ logged and skipped.
+ """
+ src_root = _active_dir(install_dir) / "etc" / "systemd" / "system"
+ if not src_root.is_dir():
+ return False
+ changed = False
+ for src in sorted(src_root.glob("*.service")):
+ dst = dst_root / src.name
+ try:
+ new = src.read_bytes()
+ old = dst.read_bytes() if dst.is_file() else None
+ if old == new:
+ continue
+ tmp = dst.with_suffix(".service.tmp")
+ tmp.write_bytes(new)
+ os.chmod(tmp, 0o644)
+ os.replace(tmp, dst)
+ log.info("installed/updated systemd unit %s", dst)
+ changed = True
+ except OSError as exc:
+ log.warning("could not install unit %s: %s", dst, exc)
+ if changed and _systemd_available():
+ try:
+ subprocess.run( # nosec B603 B607
+ ["systemctl", "daemon-reload"],
+ check=True, capture_output=True, text=True,
+ )
+ log.info("systemctl daemon-reload succeeded")
+ except subprocess.CalledProcessError as exc:
+ log.warning("systemctl daemon-reload failed: %s", exc.stderr.strip())
+ return changed
+
+
+def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int:
+ # Restart agent + forwarder together: both processes run out of the same
+ # /opt/decnet tree, so a code push that replaces the tree must cycle both
+ # or the forwarder keeps the pre-update code in memory. Forwarder restart
+ # is best-effort — a worker without the forwarder unit installed (e.g. a
+ # legacy enrollment) shouldn't abort the update.
+ subprocess.run( # nosec B603 B607
+ ["systemctl", "restart", AGENT_SYSTEMD_UNIT],
+ check=True, capture_output=True, text=True,
+ )
+ fwd = subprocess.run( # nosec B603 B607
+ ["systemctl", "restart", FORWARDER_SYSTEMD_UNIT],
+ check=False, capture_output=True, text=True,
+ )
+ if fwd.returncode != 0:
+ log.warning("forwarder restart failed (ignored): %s", fwd.stderr.strip())
+ for unit in AUXILIARY_SYSTEMD_UNITS:
+ aux = subprocess.run( # nosec B603 B607
+ ["systemctl", "restart", unit],
+ check=False, capture_output=True, text=True,
+ )
+ if aux.returncode != 0:
+ log.warning("%s restart failed (ignored): %s", unit, aux.stderr.strip())
+ pid_out = subprocess.run( # nosec B603 B607
+ ["systemctl", "show", "--property=MainPID", "--value", AGENT_SYSTEMD_UNIT],
+ check=True, capture_output=True, text=True,
+ )
+ pid = int(pid_out.stdout.strip() or "0")
+ if pid:
+ _pid_file(install_dir).write_text(str(pid))
+ return pid
+
+
+def _spawn_agent_via_popen(install_dir: pathlib.Path) -> int:
+ decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
+ log_path = install_dir / "agent.spawn.log"
+    # cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
+ # picked up by decnet.env (which loads from CWD). The release slot
+ # itself is immutable across updates, so the env file cannot live
+ # inside it.
+ proc = subprocess.Popen( # nosec B603
+ [str(decnet_bin), "agent", "--daemon"],
+ start_new_session=True,
+ cwd=str(install_dir),
+ stdout=open(log_path, "ab"), # noqa: SIM115
+ stderr=subprocess.STDOUT,
+ )
+ _pid_file(install_dir).write_text(str(proc.pid))
+ return proc.pid
+
+
+def _discover_agent_pids() -> list[int]:
+ """Scan /proc for any running ``decnet agent`` process.
+
+ Used as a fallback when agent.pid is missing (e.g., the agent was started
+ by hand rather than by the updater) so an update still produces a clean
+ restart instead of leaving the old in-memory code serving requests.
+ """
+ pids: list[int] = []
+ self_pid = os.getpid()
+ for entry in pathlib.Path("/proc").iterdir():
+ if not entry.name.isdigit():
+ continue
+ pid = int(entry.name)
+ if pid == self_pid:
+ continue
+ try:
+ raw = (entry / "cmdline").read_bytes()
+ except (FileNotFoundError, PermissionError, OSError):
+ continue
+ argv = [a for a in raw.split(b"\x00") if a]
+ if len(argv) < 2:
+ continue
+ if not argv[0].endswith(b"python") and b"python" not in pathlib.Path(argv[0].decode(errors="ignore")).name.encode():
+ # Allow direct console-script invocation too: argv[0] ends with /decnet
+ if not argv[0].endswith(b"/decnet"):
+ continue
+ if b"decnet" in b" ".join(argv) and b"agent" in argv:
+ pids.append(pid)
+ return pids
+
+
+def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S) -> None:
+ """SIGTERM the agent and wait for it to exit; SIGKILL after ``grace`` s.
+
+ Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
+ falls back to scanning /proc for any ``decnet agent`` so manually-started
+ agents are also restarted cleanly during an update.
+
+ Under systemd, stop is a no-op — ``_spawn_agent`` issues a single
+ ``systemctl restart`` that handles stop and start atomically. Pre-stopping
+ would only race the restart's own stop phase.
+ """
+ if _systemd_available():
+ return
+ pids: list[int] = []
+ pid_file = _pid_file(install_dir)
+ if pid_file.is_file():
+ try:
+ pids.append(int(pid_file.read_text().strip()))
+ except (ValueError, OSError):
+ pass
+ for pid in _discover_agent_pids():
+ if pid not in pids:
+ pids.append(pid)
+ if not pids:
+ return
+ for pid in pids:
+ try:
+ os.kill(pid, signal.SIGTERM)
+ except ProcessLookupError:
+ continue
+ deadline = time.monotonic() + grace
+ remaining = list(pids)
+ while remaining and time.monotonic() < deadline:
+ remaining = [p for p in remaining if _pid_alive(p)]
+ if remaining:
+ time.sleep(0.2)
+ for pid in remaining:
+ try:
+ os.kill(pid, signal.SIGKILL)
+ except ProcessLookupError:
+ pass
+
+
+def _pid_alive(pid: int) -> bool:
+ try:
+ os.kill(pid, 0)
+ return True
+ except ProcessLookupError:
+ return False
+
+
+def _probe_agent(
+ agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
+ url: str = AGENT_PROBE_URL,
+ attempts: int = AGENT_PROBE_ATTEMPTS,
+ backoff_s: float = AGENT_PROBE_BACKOFF_S,
+) -> tuple[bool, str]:
+ """Local mTLS health probe against the agent. Returns (ok, detail)."""
+ worker_key = agent_dir / "worker.key"
+ worker_crt = agent_dir / "worker.crt"
+ ca = agent_dir / "ca.crt"
+ if not (worker_key.is_file() and worker_crt.is_file() and ca.is_file()):
+ return False, f"no mTLS bundle at {agent_dir}"
+ ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
+ ctx.load_cert_chain(certfile=str(worker_crt), keyfile=str(worker_key))
+ ctx.load_verify_locations(cafile=str(ca))
+ ctx.verify_mode = ssl.CERT_REQUIRED
+ ctx.check_hostname = False
+
+ last = ""
+ for i in range(attempts):
+ try:
+ with httpx.Client(verify=ctx, timeout=3.0) as client:
+ r = client.get(url)
+ if r.status_code == 200:
+ return True, r.text
+ last = f"status={r.status_code} body={r.text[:200]}"
+ except Exception as exc: # noqa: BLE001
+ last = f"{type(exc).__name__}: {exc}"
+ if i < attempts - 1:
+ time.sleep(backoff_s)
+ return False, last
+
+
+# -------------------------------------------------------------- orchestrator
+
+def _write_manifest(release: pathlib.Path, sha: Optional[str]) -> None:
+ import json
+
+ _manifest_file(release).write_text(json.dumps({
+ "sha": sha,
+ "installed_at": datetime.now(timezone.utc).isoformat(),
+ }))
+
+
+def _rotate(install_dir: pathlib.Path) -> None:
+ """Rotate directories: prev→(deleted), active→prev, active.new→active.
+
+ Caller must ensure ``active.new`` exists. ``active`` may or may not.
+ """
+ active = _active_dir(install_dir)
+ prev = _prev_dir(install_dir)
+ staging = _staging_dir(install_dir)
+
+ if prev.exists():
+ shutil.rmtree(prev)
+ if active.exists():
+ active.rename(prev)
+ staging.rename(active)
+
+
+def _point_current_at(install_dir: pathlib.Path, target: pathlib.Path) -> None:
+ """Atomic symlink flip via rename."""
+ link = _current_symlink(install_dir)
+ tmp = install_dir / ".current.tmp"
+ if tmp.exists() or tmp.is_symlink():
+ tmp.unlink()
+ tmp.symlink_to(target)
+ os.replace(tmp, link)
+
+
+def run_update(
+ tarball_bytes: bytes,
+ sha: Optional[str],
+ install_dir: pathlib.Path = DEFAULT_INSTALL_DIR,
+ agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
+) -> dict[str, Any]:
+ """Apply an update atomically. Rolls back on probe failure."""
+ log.info("update received sha=%s bytes=%d install_dir=%s", sha, len(tarball_bytes), install_dir)
+ clean_stale_staging(install_dir)
+ staging = _staging_dir(install_dir)
+
+ log.info("extracting tarball -> %s", staging)
+ extract_tarball(tarball_bytes, staging)
+ _write_manifest(staging, sha)
+
+ log.info("pip install into shared venv (%s)", _shared_venv(install_dir))
+ pip = _run_pip(staging)
+ if pip.returncode != 0:
+ log.error("pip install failed rc=%d stderr=%s", pip.returncode, (pip.stderr or pip.stdout).strip()[:400])
+ shutil.rmtree(staging, ignore_errors=True)
+ raise UpdateError(
+ "pip install failed on new release", stderr=pip.stderr or pip.stdout,
+ )
+
+ log.info("rotating releases: active.new -> active, active -> prev")
+ _rotate(install_dir)
+ _point_current_at(install_dir, _active_dir(install_dir))
+ _heal_path_symlink(install_dir)
+ _sync_systemd_units(install_dir)
+
+ log.info("restarting agent (and forwarder if present)")
+ _stop_agent(install_dir)
+ _spawn_agent(install_dir)
+
+ ok, detail = _probe_agent(agent_dir=agent_dir)
+ if ok:
+ log.info("update complete sha=%s probe=ok", sha)
+ return {
+ "status": "updated",
+ "release": read_release(_active_dir(install_dir)).to_dict(),
+ "probe": detail,
+ }
+
+ # Rollback.
+ log.warning("agent probe failed after update: %s — rolling back", detail)
+ _stop_agent(install_dir)
+ # Swap active <-> prev.
+ active = _active_dir(install_dir)
+ prev = _prev_dir(install_dir)
+ tmp = _releases_dir(install_dir) / ".swap"
+ if tmp.exists():
+ shutil.rmtree(tmp)
+ active.rename(tmp)
+ prev.rename(active)
+ tmp.rename(prev)
+ _point_current_at(install_dir, active)
+ _spawn_agent(install_dir)
+ ok2, detail2 = _probe_agent(agent_dir=agent_dir)
+ raise UpdateError(
+ "agent failed health probe after update; rolled back to previous release",
+ stderr=f"forward-probe: {detail}\nrollback-probe: {detail2}",
+ rolled_back=ok2,
+ )
+
+
+def run_rollback(
+ install_dir: pathlib.Path = DEFAULT_INSTALL_DIR,
+ agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
+) -> dict[str, Any]:
+ """Manually swap active with prev and restart the agent."""
+ active = _active_dir(install_dir)
+ prev = _prev_dir(install_dir)
+ if not prev.is_dir():
+ raise UpdateError("no previous release to roll back to")
+
+ _stop_agent(install_dir)
+ tmp = _releases_dir(install_dir) / ".swap"
+ if tmp.exists():
+ shutil.rmtree(tmp)
+ active.rename(tmp)
+ prev.rename(active)
+ tmp.rename(prev)
+ _point_current_at(install_dir, active)
+ _spawn_agent(install_dir)
+ ok, detail = _probe_agent(agent_dir=agent_dir)
+ if not ok:
+ raise UpdateError("agent unhealthy after rollback", stderr=detail)
+ return {
+ "status": "rolled_back",
+ "release": read_release(active).to_dict(),
+ "probe": detail,
+ }
+
+
+def run_update_self(
+ tarball_bytes: bytes,
+ sha: Optional[str],
+ updater_install_dir: pathlib.Path,
+ exec_cb: Optional[Callable[[list[str]], None]] = None,
+) -> dict[str, Any]:
+ """Replace the updater's own source tree, then re-exec this process.
+
+ No auto-rollback. Caller must treat "connection dropped + /health
+ returns new SHA within 30s" as success.
+ """
+ log.info("self-update received sha=%s bytes=%d install_dir=%s", sha, len(tarball_bytes), updater_install_dir)
+ clean_stale_staging(updater_install_dir)
+ staging = _staging_dir(updater_install_dir)
+ log.info("extracting tarball -> %s", staging)
+ extract_tarball(tarball_bytes, staging)
+ _write_manifest(staging, sha)
+
+ log.info("pip install updater release into shared venv (%s)", _shared_venv(updater_install_dir))
+ pip = _run_pip(staging)
+ if pip.returncode != 0:
+ log.error("self-update pip install failed rc=%d stderr=%s", pip.returncode, (pip.stderr or pip.stdout).strip()[:400])
+ shutil.rmtree(staging, ignore_errors=True)
+ raise UpdateError(
+ "pip install failed on new updater release",
+ stderr=pip.stderr or pip.stdout,
+ )
+
+ log.info("rotating updater releases and flipping current symlink")
+ _rotate(updater_install_dir)
+ _point_current_at(updater_install_dir, _active_dir(updater_install_dir))
+ _heal_path_symlink(updater_install_dir)
+ _sync_systemd_units(updater_install_dir)
+
+ # Reconstruct the updater's original launch command from env vars set by
+ # `decnet.updater.server.run`. We can't reuse sys.argv: inside the app
+ # process this is the uvicorn subprocess invocation (--ssl-keyfile, etc.),
+ # not the operator-visible `decnet updater ...` command.
+ decnet_bin = str(_shared_venv(updater_install_dir) / "bin" / "decnet")
+ argv = [decnet_bin, "updater",
+ "--host", os.environ.get("DECNET_UPDATER_HOST", "0.0.0.0"), # nosec B104
+ "--port", os.environ.get("DECNET_UPDATER_PORT", "8766"),
+ "--updater-dir", os.environ.get("DECNET_UPDATER_BUNDLE_DIR",
+ str(pki.DEFAULT_AGENT_DIR.parent / "updater")),
+ "--install-dir", os.environ.get("DECNET_UPDATER_INSTALL_DIR",
+ str(updater_install_dir.parent)),
+ "--agent-dir", os.environ.get("DECNET_UPDATER_AGENT_DIR",
+ str(pki.DEFAULT_AGENT_DIR))]
+ if exec_cb is not None:
+ exec_cb(argv) # tests stub this — we don't actually re-exec
+ return {"status": "self_update_queued", "argv": argv}
+ # Under systemd, hand the restart to the init system so the new process
+ # keeps its unit context (capabilities, cgroup, logging target) instead
+ # of inheriting whatever we had here. Spawn a detached sh that waits for
+ # this response to flush before issuing the restart — `systemctl restart`
+ # on our own unit would kill us mid-response and the caller would see a
+ # connection drop with no indication of success.
+ if _systemd_available():
+ log.info("self-update queued: systemctl restart %s (deferred 1s)", UPDATER_SYSTEMD_UNIT)
+ subprocess.Popen( # nosec B603 B607
+ ["sh", "-c", f"sleep 1 && systemctl restart {UPDATER_SYSTEMD_UNIT}"],
+ start_new_session=True,
+ stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
+ )
+ return {"status": "self_update_queued", "via": "systemd"}
+ # Off-systemd fallback: replace the process image directly.
+ os.execv(argv[0], argv) # nosec B606 - pragma: no cover
+ return {"status": "self_update_queued"} # pragma: no cover
diff --git a/decnet/updater/server.py b/decnet/updater/server.py
new file mode 100644
index 0000000..ed4b93d
--- /dev/null
+++ b/decnet/updater/server.py
@@ -0,0 +1,90 @@
+"""Self-updater uvicorn launcher.
+
+Parallels ``decnet/agent/server.py`` but uses a distinct bundle directory
+(``~/.decnet/updater``) with a cert whose CN is ``updater@<name>``. That
+cert is signed by the same DECNET CA as the agent's, so the master's one
+CA still gates both channels; the CN is how we tell them apart.
+"""
+from __future__ import annotations
+
+import os
+import pathlib
+import signal
+import subprocess # nosec B404
+import sys
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+
+log = get_logger("updater.server")
+
+DEFAULT_UPDATER_DIR = pathlib.Path(os.path.expanduser("~/.decnet/updater"))
+
+
+def _load_bundle(updater_dir: pathlib.Path) -> bool:
+ return all(
+ (updater_dir / name).is_file()
+ for name in ("ca.crt", "updater.crt", "updater.key")
+ )
+
+
+def run(
+ host: str,
+ port: int,
+ updater_dir: pathlib.Path = DEFAULT_UPDATER_DIR,
+ install_dir: pathlib.Path = pathlib.Path("/opt/decnet"),
+ agent_dir: pathlib.Path = pki.DEFAULT_AGENT_DIR,
+) -> int:
+ if not _load_bundle(updater_dir):
+ print(
+ f"[updater] No cert bundle at {updater_dir}. "
+ f"Run `decnet swarm enroll --updater` from the master first.",
+ file=sys.stderr,
+ )
+ return 2
+
+ # Pass config into the app module via env so uvicorn subprocess picks it up.
+ os.environ["DECNET_UPDATER_INSTALL_DIR"] = str(install_dir)
+ os.environ["DECNET_UPDATER_UPDATER_DIR"] = str(install_dir / "updater")
+ os.environ["DECNET_UPDATER_AGENT_DIR"] = str(agent_dir)
+ # Needed by run_update_self to rebuild the updater's launch argv.
+ os.environ["DECNET_UPDATER_BUNDLE_DIR"] = str(updater_dir)
+ os.environ["DECNET_UPDATER_HOST"] = str(host)
+ os.environ["DECNET_UPDATER_PORT"] = str(port)
+
+ keyfile = updater_dir / "updater.key"
+ certfile = updater_dir / "updater.crt"
+ cafile = updater_dir / "ca.crt"
+
+ cmd = [
+ sys.executable,
+ "-m",
+ "uvicorn",
+ "decnet.updater.app:app",
+ "--host",
+ host,
+ "--port",
+ str(port),
+ "--ssl-keyfile",
+ str(keyfile),
+ "--ssl-certfile",
+ str(certfile),
+ "--ssl-ca-certs",
+ str(cafile),
+ "--ssl-cert-reqs",
+ "2",
+ ]
+ log.info("updater starting host=%s port=%d bundle=%s", host, port, updater_dir)
+ proc = subprocess.Popen(cmd, start_new_session=True) # nosec B603
+ try:
+ return proc.wait()
+ except KeyboardInterrupt:
+ try:
+ os.killpg(proc.pid, signal.SIGTERM)
+ try:
+ return proc.wait(timeout=10)
+ except subprocess.TimeoutExpired:
+ os.killpg(proc.pid, signal.SIGKILL)
+ return proc.wait()
+ except ProcessLookupError:
+ return 0
diff --git a/decnet/web/_uvicorn_tls_scope.py b/decnet/web/_uvicorn_tls_scope.py
new file mode 100644
index 0000000..f68ad56
--- /dev/null
+++ b/decnet/web/_uvicorn_tls_scope.py
@@ -0,0 +1,72 @@
+"""Inject the TLS peer cert into ASGI scope — uvicorn ≤ 0.44 does not.
+
+Uvicorn's h11/httptools HTTP protocols build the ASGI ``scope`` dict
+without any ``extensions.tls`` entry, so per-request cert pinning
+handlers (like POST /swarm/heartbeat) can't see the client cert that
+CERT_REQUIRED already validated at handshake.
+
+We patch ``RequestResponseCycle.__init__`` on both protocol modules to
+read the peer cert off the asyncio transport (which *does* carry it)
+and write the DER bytes into
+``scope["extensions"]["tls"]["client_cert_chain"]``. Note: the ASGI TLS
+extension proposal defines this key as PEM-encoded *strings*; we store raw
+DER bytes, so handlers must adapt if a future uvicorn populates it natively.
+
+Import this module once at app startup time (before uvicorn starts
+accepting connections). Idempotent — subsequent imports are no-ops.
+"""
+from __future__ import annotations
+
+from typing import Any
+
+
+_PATCHED = False
+
+
+def _wrap_cycle_init(cycle_cls) -> None:
+ original = cycle_cls.__init__
+
+ def _patched_init(self, *args: Any, **kwargs: Any) -> None:
+ original(self, *args, **kwargs)
+ transport = kwargs.get("transport") or getattr(self, "transport", None)
+ if transport is None:
+ return
+ ssl_obj = transport.get_extra_info("ssl_object")
+ if ssl_obj is None:
+ return
+ try:
+ der = ssl_obj.getpeercert(binary_form=True)
+ except Exception:
+ return
+ if not der:
+ return
+ # scope is a mutable dict uvicorn stores here; Starlette forwards
+ # it to handlers as request.scope. Use setdefault so we don't clobber
+ # any future native extension entries from uvicorn itself.
+ scope = self.scope
+ extensions = scope.setdefault("extensions", {})
+ extensions.setdefault("tls", {"client_cert_chain": [der]})
+
+ cycle_cls.__init__ = _patched_init
+
+
+def install() -> None:
+ """Patch uvicorn's HTTP cycle classes. Safe to call multiple times."""
+ global _PATCHED
+ if _PATCHED:
+ return
+ try:
+ from uvicorn.protocols.http import h11_impl
+ _wrap_cycle_init(h11_impl.RequestResponseCycle)
+ except Exception: # nosec B110 - optional uvicorn impl may be unavailable
+ pass
+ try:
+ from uvicorn.protocols.http import httptools_impl
+ _wrap_cycle_init(httptools_impl.RequestResponseCycle)
+ except Exception: # nosec B110 - optional uvicorn impl may be unavailable
+ pass
+ _PATCHED = True
+
+
+# Auto-install on import so simply importing this module patches uvicorn.
+install()
diff --git a/decnet/web/api.py b/decnet/web/api.py
index d5e3ca3..f33b9de 100644
--- a/decnet/web/api.py
+++ b/decnet/web/api.py
@@ -1,33 +1,66 @@
import asyncio
-import logging
import os
from contextlib import asynccontextmanager
from typing import Any, AsyncGenerator, Optional
from fastapi import FastAPI, Request, status
from fastapi.exceptions import RequestValidationError
-from fastapi.responses import JSONResponse
+from fastapi.responses import ORJSONResponse
from pydantic import ValidationError
from fastapi.middleware.cors import CORSMiddleware
-from decnet.env import DECNET_CORS_ORIGINS, DECNET_DEVELOPER, DECNET_INGEST_LOG_FILE
+from decnet.env import (
+ DECNET_CORS_ORIGINS,
+ DECNET_DEVELOPER,
+ DECNET_EMBED_PROFILER,
+ DECNET_EMBED_SNIFFER,
+ DECNET_INGEST_LOG_FILE,
+ DECNET_PROFILE_DIR,
+ DECNET_PROFILE_REQUESTS,
+)
+from decnet.logging import get_logger
from decnet.web.dependencies import repo
from decnet.collector import log_collector_worker
from decnet.web.ingester import log_ingestion_worker
+from decnet.profiler import attacker_profile_worker
from decnet.web.router import api_router
-log = logging.getLogger(__name__)
+log = get_logger("api")
ingestion_task: Optional[asyncio.Task[Any]] = None
collector_task: Optional[asyncio.Task[Any]] = None
+attacker_task: Optional[asyncio.Task[Any]] = None
+sniffer_task: Optional[asyncio.Task[Any]] = None
+
+
+def get_background_tasks() -> dict[str, Optional[asyncio.Task[Any]]]:
+ """Expose background task handles for the health endpoint."""
+ return {
+ "ingestion_worker": ingestion_task,
+ "collector_worker": collector_task,
+ "attacker_worker": attacker_task,
+ "sniffer_worker": sniffer_task,
+ }
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
- global ingestion_task, collector_task
+ global ingestion_task, collector_task, attacker_task, sniffer_task
+ import resource
+ soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
+ if soft < 4096:
+ log.warning(
+ "Low open-file limit detected (ulimit -n = %d). "
+ "High-traffic deployments may hit 'Too many open files' errors. "
+ "Raise it with: ulimit -n 65536 (session) or LimitNOFILE=65536 (systemd)",
+ soft,
+ )
+
+ log.info("API startup initialising database")
for attempt in range(1, 6):
try:
await repo.initialize()
+ log.debug("API startup DB initialised attempt=%d", attempt)
break
except Exception as exc:
log.warning("DB init attempt %d/5 failed: %s", attempt, exc)
@@ -35,25 +68,57 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
log.error("DB failed to initialize after 5 attempts — startup may be degraded")
await asyncio.sleep(0.5)
+ # Conditionally enable OpenTelemetry tracing
+ from decnet.telemetry import setup_tracing
+ setup_tracing(app)
+
# Start background tasks only if not in contract test mode
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
# Start background ingestion task
if ingestion_task is None or ingestion_task.done():
ingestion_task = asyncio.create_task(log_ingestion_worker(repo))
+ log.debug("API startup ingest worker started")
# Start Docker log collector (writes to log file; ingester reads from it)
_log_file = os.environ.get("DECNET_INGEST_LOG_FILE", DECNET_INGEST_LOG_FILE)
if _log_file and (collector_task is None or collector_task.done()):
collector_task = asyncio.create_task(log_collector_worker(_log_file))
+ log.debug("API startup collector worker started log_file=%s", _log_file)
elif not _log_file:
log.warning("DECNET_INGEST_LOG_FILE not set — Docker log collection disabled.")
+
+ # Start attacker profile rebuild worker only when explicitly requested.
+ # Default is OFF because `decnet deploy` always starts a standalone
+ # `decnet profiler --daemon` process. Running both against the same
+ # DB cursor causes events to be skipped or double-processed.
+ if DECNET_EMBED_PROFILER:
+ if attacker_task is None or attacker_task.done():
+ attacker_task = asyncio.create_task(attacker_profile_worker(repo))
+ log.info("API startup: embedded profiler started (DECNET_EMBED_PROFILER=true)")
+ else:
+ log.debug("API startup: profiler not embedded — expecting standalone daemon")
+
+ # Start fleet-wide MACVLAN sniffer only when explicitly requested.
+ # Default is OFF because `decnet deploy` always starts a standalone
+ # `decnet sniffer --daemon` process. Running both against the same
+ # interface produces duplicated events and wastes CPU.
+ if DECNET_EMBED_SNIFFER:
+ try:
+ from decnet.sniffer import sniffer_worker
+ if sniffer_task is None or sniffer_task.done():
+ sniffer_task = asyncio.create_task(sniffer_worker(_log_file))
+ log.info("API startup: embedded sniffer started (DECNET_EMBED_SNIFFER=true)")
+ except Exception as exc:
+ log.warning("Sniffer worker failed to start — API continues without sniffing: %s", exc)
+ else:
+ log.debug("API startup: sniffer not embedded — expecting standalone daemon")
else:
log.info("Contract Test Mode: skipping background worker startup")
yield
- # Shutdown background tasks
- for task in (ingestion_task, collector_task):
+ log.info("API shutdown cancelling background tasks")
+ for task in (ingestion_task, collector_task, attacker_task, sniffer_task):
if task and not task.done():
task.cancel()
try:
@@ -62,12 +127,16 @@ async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
pass
except Exception as exc:
log.warning("Task shutdown error: %s", exc)
+ from decnet.telemetry import shutdown_tracing
+ shutdown_tracing()
+ log.info("API shutdown complete")
app: FastAPI = FastAPI(
title="DECNET Web Dashboard API",
version="1.0.0",
lifespan=lifespan,
+ default_response_class=ORJSONResponse,
docs_url="/docs" if DECNET_DEVELOPER else None,
redoc_url="/redoc" if DECNET_DEVELOPER else None,
openapi_url="/openapi.json" if DECNET_DEVELOPER else None
@@ -81,12 +150,37 @@ app.add_middleware(
allow_headers=["Authorization", "Content-Type", "Last-Event-ID"],
)
+if DECNET_PROFILE_REQUESTS:
+ import time
+ from pathlib import Path
+ from pyinstrument import Profiler
+ from starlette.middleware.base import BaseHTTPMiddleware
+
+ _profile_dir = Path(DECNET_PROFILE_DIR)
+ _profile_dir.mkdir(parents=True, exist_ok=True)
+
+ class PyinstrumentMiddleware(BaseHTTPMiddleware):
+ async def dispatch(self, request: Request, call_next):
+ profiler = Profiler(async_mode="enabled")
+ profiler.start()
+ try:
+ response = await call_next(request)
+ finally:
+ profiler.stop()
+ slug = request.url.path.strip("/").replace("/", "_") or "root"
+ out = _profile_dir / f"{int(time.time() * 1000)}-{request.method}-{slug}.html"
+ out.write_text(profiler.output_html())
+ return response
+
+ app.add_middleware(PyinstrumentMiddleware)
+ log.info("Pyinstrument middleware mounted — flamegraphs -> %s", _profile_dir)
+
# Include the modular API router
app.include_router(api_router, prefix="/api/v1")
@app.exception_handler(RequestValidationError)
-async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
+async def validation_exception_handler(request: Request, exc: RequestValidationError) -> ORJSONResponse:
"""
Handle validation errors with targeted status codes to satisfy contract tests.
Tiered Prioritization:
@@ -106,7 +200,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
for err in errors
)
if is_structural_violation:
- return JSONResponse(
+ return ORJSONResponse(
status_code=status.HTTP_400_BAD_REQUEST,
content={"detail": "Bad Request: Schema structural violation (wrong type, extra fields, or invalid length)."},
)
@@ -117,7 +211,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
# Empty INI content (Valid string but semantically empty)
is_ini_empty = any("INI content is empty" in err.get("msg", "") for err in errors)
if is_ini_empty:
- return JSONResponse(
+ return ORJSONResponse(
status_code=status.HTTP_409_CONFLICT,
content={"detail": "Configuration conflict: INI content is empty."},
)
@@ -126,7 +220,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
# Mapping to 409 for Positive Data compliance.
is_invalid_characters = any("Invalid INI format" in err.get("msg", "") for err in errors)
if is_invalid_characters:
- return JSONResponse(
+ return ORJSONResponse(
status_code=status.HTTP_409_CONFLICT,
content={"detail": "Configuration conflict: INI syntax or characters are invalid."},
)
@@ -134,7 +228,7 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
# Logical invalidity (Valid string, valid syntax, but missing required DECNET logic like sections)
is_ini_invalid_logic = any("at least one section" in err.get("msg", "") for err in errors)
if is_ini_invalid_logic:
- return JSONResponse(
+ return ORJSONResponse(
status_code=status.HTTP_409_CONFLICT,
content={"detail": "Invalid INI config structure: No decky sections found."},
)
@@ -149,19 +243,19 @@ async def validation_exception_handler(request: Request, exc: RequestValidationE
if "/deckies/deploy" in request.url.path:
message = "Invalid INI config"
- return JSONResponse(
+ return ORJSONResponse(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
content={"detail": message},
)
@app.exception_handler(ValidationError)
-async def pydantic_validation_exception_handler(request: Request, exc: ValidationError) -> JSONResponse:
+async def pydantic_validation_exception_handler(request: Request, exc: ValidationError) -> ORJSONResponse:
"""
Handle Pydantic errors that occur during manual model instantiation (e.g. state hydration).
Prevents 500 errors when the database contains inconsistent or outdated schema data.
"""
log.error("Internal Pydantic validation error: %s", exc)
- return JSONResponse(
+ return ORJSONResponse(
status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
content={
"detail": "Internal data consistency error",
diff --git a/decnet/web/auth.py b/decnet/web/auth.py
index 6ece1e3..81879c5 100644
--- a/decnet/web/auth.py
+++ b/decnet/web/auth.py
@@ -1,3 +1,4 @@
+import asyncio
from datetime import datetime, timedelta, timezone
from typing import Optional, Any
import jwt
@@ -24,6 +25,15 @@ def get_password_hash(password: str) -> str:
return _hashed.decode("utf-8")
+async def averify_password(plain_password: str, hashed_password: str) -> bool:
+ # bcrypt is CPU-bound and ~250ms/call; keep it off the event loop.
+ return await asyncio.to_thread(verify_password, plain_password, hashed_password)
+
+
+async def ahash_password(password: str) -> str:
+ return await asyncio.to_thread(get_password_hash, password)
+
+
def create_access_token(data: dict[str, Any], expires_delta: Optional[timedelta] = None) -> str:
_to_encode: dict[str, Any] = data.copy()
_expire: datetime
diff --git a/decnet/web/db/factory.py b/decnet/web/db/factory.py
index b98884e..af5ff5c 100644
--- a/decnet/web/db/factory.py
+++ b/decnet/web/db/factory.py
@@ -1,18 +1,33 @@
+"""
+Repository factory — selects a :class:`BaseRepository` implementation based on
+``DECNET_DB_TYPE`` (``sqlite`` or ``mysql``).
+"""
+from __future__ import annotations
+
+import os
from typing import Any
-from decnet.env import os
+
from decnet.web.db.repository import BaseRepository
+
def get_repository(**kwargs: Any) -> BaseRepository:
- """Factory function to instantiate the correct repository implementation based on environment."""
+ """Instantiate the repository implementation selected by ``DECNET_DB_TYPE``.
+
+ Keyword arguments are forwarded to the concrete implementation:
+
+ * SQLite accepts ``db_path``.
+ * MySQL accepts ``url`` and engine tuning knobs (``pool_size``, …).
+ """
db_type = os.environ.get("DECNET_DB_TYPE", "sqlite").lower()
if db_type == "sqlite":
from decnet.web.db.sqlite.repository import SQLiteRepository
- return SQLiteRepository(**kwargs)
+ repo = SQLiteRepository(**kwargs)
elif db_type == "mysql":
- # Placeholder for future implementation
- # from decnet.web.db.mysql.repository import MySQLRepository
- # return MySQLRepository()
- raise NotImplementedError("MySQL support is planned but not yet implemented.")
+ from decnet.web.db.mysql.repository import MySQLRepository
+ repo = MySQLRepository(**kwargs)
else:
raise ValueError(f"Unsupported database type: {db_type}")
+
+ from decnet.telemetry import wrap_repository
+ return wrap_repository(repo)
diff --git a/decnet/web/db/models.py b/decnet/web/db/models.py
index 681db23..5d75bc7 100644
--- a/decnet/web/db/models.py
+++ b/decnet/web/db/models.py
@@ -1,8 +1,16 @@
from datetime import datetime, timezone
-from typing import Optional, Any, List, Annotated
+from typing import Literal, Optional, Any, List, Annotated
+from sqlalchemy import Column, Text
+from sqlalchemy.dialects.mysql import MEDIUMTEXT
from sqlmodel import SQLModel, Field
from pydantic import BaseModel, ConfigDict, Field as PydanticField, BeforeValidator
-from decnet.models import IniContent
+from decnet.models import IniContent, DecnetConfig
+
+# Use on columns that accumulate over an attacker's lifetime (commands,
+# fingerprints, state blobs). TEXT on MySQL caps at 64 KiB; MEDIUMTEXT
+# stretches to 16 MiB. SQLite has no fixed-width text types so Text()
+# stays unchanged there.
+_BIG_TEXT = Text().with_variant(MEDIUMTEXT(), "mysql")
def _normalize_null(v: Any) -> Any:
if isinstance(v, str) and v.lower() in ("null", "undefined", ""):
@@ -30,9 +38,16 @@ class Log(SQLModel, table=True):
service: str = Field(index=True)
event_type: str = Field(index=True)
attacker_ip: str = Field(index=True)
- raw_line: str
- fields: str
- msg: Optional[str] = None
+ # Long-text columns — use TEXT so MySQL DDL doesn't truncate to VARCHAR(255).
+ # TEXT is equivalent to plain text in SQLite.
+ raw_line: str = Field(sa_column=Column("raw_line", Text, nullable=False))
+ fields: str = Field(sa_column=Column("fields", Text, nullable=False))
+ msg: Optional[str] = Field(default=None, sa_column=Column("msg", Text, nullable=True))
+ # OTEL trace context — bridges the collector→ingester trace to the SSE
+ # read path. Nullable so pre-existing rows and non-traced deployments
+ # are unaffected.
+ trace_id: Optional[str] = Field(default=None)
+ span_id: Optional[str] = Field(default=None)
class Bounty(SQLModel, table=True):
__tablename__ = "bounty"
@@ -42,13 +57,140 @@ class Bounty(SQLModel, table=True):
service: str = Field(index=True)
attacker_ip: str = Field(index=True)
bounty_type: str = Field(index=True)
- payload: str
+ payload: str = Field(sa_column=Column("payload", Text, nullable=False))
class State(SQLModel, table=True):
__tablename__ = "state"
key: str = Field(primary_key=True)
- value: str # Stores JSON serialized DecnetConfig or other state blobs
+ # JSON-serialized DecnetConfig or other state blobs — can be large as
+ # deckies/services accumulate. MEDIUMTEXT on MySQL (16 MiB ceiling).
+ value: str = Field(sa_column=Column("value", _BIG_TEXT, nullable=False))
+
+
+class Attacker(SQLModel, table=True):
+ __tablename__ = "attackers"
+ uuid: str = Field(primary_key=True)
+ ip: str = Field(index=True)
+ first_seen: datetime = Field(index=True)
+ last_seen: datetime = Field(index=True)
+ event_count: int = Field(default=0)
+ service_count: int = Field(default=0)
+ decky_count: int = Field(default=0)
+ # JSON blobs — these grow over the attacker's lifetime. Use MEDIUMTEXT on
+ # MySQL (16 MiB) for the fields that accumulate (fingerprints, commands,
+ # and the deckies/services lists that are unbounded in principle).
+ services: str = Field(
+ default="[]", sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]")
+ ) # JSON list[str]
+ deckies: str = Field(
+ default="[]", sa_column=Column("deckies", _BIG_TEXT, nullable=False, default="[]")
+ ) # JSON list[str], first-contact ordered
+ traversal_path: Optional[str] = Field(
+ default=None, sa_column=Column("traversal_path", Text, nullable=True)
+ ) # "decky-01 → decky-03 → decky-05"
+ is_traversal: bool = Field(default=False)
+ bounty_count: int = Field(default=0)
+ credential_count: int = Field(default=0)
+ fingerprints: str = Field(
+ default="[]", sa_column=Column("fingerprints", _BIG_TEXT, nullable=False, default="[]")
+ ) # JSON list[dict] — bounty fingerprints
+ commands: str = Field(
+ default="[]", sa_column=Column("commands", _BIG_TEXT, nullable=False, default="[]")
+ ) # JSON list[dict] — commands per service/decky
+ updated_at: datetime = Field(
+ default_factory=lambda: datetime.now(timezone.utc), index=True
+ )
+
+
+class SwarmHost(SQLModel, table=True):
+ """A worker host enrolled into a DECNET swarm.
+
+ Rows exist only on the master. Populated by `decnet swarm enroll` and
+ read by the swarm controller when sharding deckies onto workers.
+ """
+ __tablename__ = "swarm_hosts"
+ uuid: str = Field(primary_key=True)
+ name: str = Field(index=True, unique=True)
+ address: str # IP or hostname reachable by the master
+ agent_port: int = Field(default=8765)
+ status: str = Field(default="enrolled", index=True)
+ # ISO-8601 string of the last successful agent /health probe
+ last_heartbeat: Optional[datetime] = Field(default=None)
+ client_cert_fingerprint: str # SHA-256 hex of worker's issued client cert
+ # SHA-256 hex of the updater-identity cert, if the host was enrolled
+ # with ``--updater`` / ``issue_updater_bundle``. ``None`` for hosts
+ # that only have an agent identity.
+ updater_cert_fingerprint: Optional[str] = Field(default=None)
+ # Directory on the master where the per-worker cert bundle lives
+ cert_bundle_path: str
+ enrolled_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+ notes: Optional[str] = Field(default=None, sa_column=Column("notes", Text, nullable=True))
+ # Per-host driver preference. True => deckies on this host run over IPvlan
+ # (L2) instead of macvlan — required when the host is a VirtualBox guest
+ # bridged over Wi-Fi, because Wi-Fi APs only allow one MAC per station
+ # and macvlan's per-container MACs rotate the VM's DHCP lease.
+ use_ipvlan: bool = Field(default=False)
+
+
+class DeckyShard(SQLModel, table=True):
+ """Mapping of a single decky to the worker host running it (swarm mode)."""
+ __tablename__ = "decky_shards"
+ decky_name: str = Field(primary_key=True)
+ host_uuid: str = Field(foreign_key="swarm_hosts.uuid", index=True)
+ # JSON list of service names running on this decky (snapshot of assignment).
+ services: str = Field(sa_column=Column("services", _BIG_TEXT, nullable=False, default="[]"))
+ # Full serialised DeckyConfig from the most recent dispatch or heartbeat.
+ # Lets the dashboard render the same rich card (hostname/distro/archetype/
+ # service_config/mutate_interval) that the local-fleet view uses, without
+ # needing a live round-trip to the worker for every page render.
+ decky_config: Optional[str] = Field(
+ default=None, sa_column=Column("decky_config", _BIG_TEXT, nullable=True)
+ )
+ decky_ip: Optional[str] = Field(default=None)
+ state: str = Field(default="pending", index=True) # pending|running|failed|torn_down|degraded|tearing_down|teardown_failed
+ last_error: Optional[str] = Field(default=None, sa_column=Column("last_error", Text, nullable=True))
+ compose_hash: Optional[str] = Field(default=None)
+ # Timestamp of the last heartbeat that echoed this shard; lets the UI
+ # show "stale" decks whose agent has gone silent.
+ last_seen: Optional[datetime] = Field(default=None)
+ updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
+
+
+class AttackerBehavior(SQLModel, table=True):
+ """
+ Timing & behavioral profile for an attacker, joined to Attacker by uuid.
+
+ Kept in a separate table so the core Attacker row stays narrow and
+ behavior data can be updated independently (e.g. as the sniffer observes
+ more packets) without touching the event-count aggregates.
+ """
+ __tablename__ = "attacker_behavior"
+ attacker_uuid: str = Field(primary_key=True, foreign_key="attackers.uuid")
+ # OS / TCP stack fingerprint (rolled up from sniffer events)
+ os_guess: Optional[str] = None
+ hop_distance: Optional[int] = None
+ tcp_fingerprint: str = Field(
+ default="{}",
+ sa_column=Column("tcp_fingerprint", Text, nullable=False, default="{}"),
+ ) # JSON: window, wscale, mss, options_sig
+ retransmit_count: int = Field(default=0)
+ # Behavioral (derived by the profiler from log-event timing)
+ behavior_class: Optional[str] = None # beaconing | interactive | scanning | brute_force | slow_scan | mixed | unknown
+ beacon_interval_s: Optional[float] = None
+ beacon_jitter_pct: Optional[float] = None
+ tool_guesses: Optional[str] = None # JSON list[str] — all matched tools
+ timing_stats: str = Field(
+ default="{}",
+ sa_column=Column("timing_stats", Text, nullable=False, default="{}"),
+ ) # JSON: mean/median/stdev/min/max IAT
+ phase_sequence: str = Field(
+ default="{}",
+ sa_column=Column("phase_sequence", Text, nullable=False, default="{}"),
+ ) # JSON: recon_end/exfil_start/latency
+ updated_at: datetime = Field(
+ default_factory=lambda: datetime.now(timezone.utc), index=True
+ )
# --- API Request/Response Models (Pydantic) ---
@@ -77,6 +219,12 @@ class BountyResponse(BaseModel):
offset: int
data: List[dict[str, Any]]
+class AttackersResponse(BaseModel):
+ total: int
+ limit: int
+ offset: int
+ data: List[dict[str, Any]]
+
class StatsResponse(BaseModel):
total_logs: int
unique_attackers: int
@@ -93,3 +241,251 @@ class DeployIniRequest(BaseModel):
# This field now enforces strict INI structure during Pydantic initialization.
# The OpenAPI schema correctly shows it as a required string.
ini_content: IniContent = PydanticField(..., description="A valid INI formatted string")
+
+
+# --- Configuration Models ---
+
+class CreateUserRequest(BaseModel):
+ username: str = PydanticField(..., min_length=1, max_length=64)
+ password: str = PydanticField(..., min_length=8, max_length=72)
+ role: Literal["admin", "viewer"] = "viewer"
+
+class UpdateUserRoleRequest(BaseModel):
+ role: Literal["admin", "viewer"]
+
+class ResetUserPasswordRequest(BaseModel):
+ new_password: str = PydanticField(..., min_length=8, max_length=72)
+
+class DeploymentLimitRequest(BaseModel):
+ deployment_limit: int = PydanticField(..., ge=1, le=500)
+
+class GlobalMutationIntervalRequest(BaseModel):
+ global_mutation_interval: str = PydanticField(..., pattern=r"^[1-9]\d*[mdMyY]$")
+
+class UserResponse(BaseModel):
+ uuid: str
+ username: str
+ role: str
+ must_change_password: bool
+
+class ConfigResponse(BaseModel):
+ role: str
+ deployment_limit: int
+ global_mutation_interval: str
+
+class AdminConfigResponse(ConfigResponse):
+ users: List[UserResponse]
+
+
+class ComponentHealth(BaseModel):
+ status: Literal["ok", "failing"]
+ detail: Optional[str] = None
+
+
+class HealthResponse(BaseModel):
+ status: Literal["healthy", "degraded", "unhealthy"]
+ components: dict[str, ComponentHealth]
+
+
+# --- Swarm API DTOs ---
+# Request/response contracts for the master-side swarm controller
+# (decnet/web/swarm_api.py). The underlying SQLModel tables — SwarmHost and
+# DeckyShard — live above; these are the HTTP-facing shapes.
+
+class SwarmEnrollRequest(BaseModel):
+    # x509 CommonName is capped at 64 characters (RFC 5280 ub-common-name) — the
+ # cert issuer would reject anything longer with a ValueError.
+ # Pattern: ASCII hostname-safe characters only. The name is embedded
+ # both in the CN and as a SAN DNS entry; x509.DNSName only accepts
+ # A-label ASCII, so non-ASCII would blow up at issuance.
+ name: str = PydanticField(
+ ..., min_length=1, max_length=64,
+ pattern=r"^[A-Za-z0-9][A-Za-z0-9._\-]*$",
+ )
+ address: str = PydanticField(
+ ..., min_length=1, max_length=253,
+ pattern=r"^[A-Za-z0-9][A-Za-z0-9._:\-]*$",
+ description="IP or DNS the master uses to reach the worker",
+ )
+ agent_port: int = PydanticField(default=8765, ge=1, le=65535)
+ sans: list[
+ Annotated[
+ str,
+ PydanticField(
+ min_length=1, max_length=253,
+ pattern=r"^[A-Za-z0-9][A-Za-z0-9._:\-]*$",
+ ),
+ ]
+ ] = PydanticField(
+ default_factory=list,
+ description="Extra SANs (IPs / hostnames) to embed in the worker cert",
+ )
+ notes: Optional[str] = None
+ issue_updater_bundle: bool = PydanticField(
+ default=False,
+ description="If true, also issue an updater cert (CN=updater@) for the remote self-updater",
+ )
+
+
+class SwarmUpdaterBundle(BaseModel):
+ """Subset of SwarmEnrolledBundle for the updater identity."""
+ fingerprint: str
+ updater_cert_pem: str
+ updater_key_pem: str
+
+
+class SwarmEnrolledBundle(BaseModel):
+ """Cert bundle returned to the operator — must be delivered to the worker."""
+ host_uuid: str
+ name: str
+ address: str
+ agent_port: int
+ fingerprint: str
+ ca_cert_pem: str
+ worker_cert_pem: str
+ worker_key_pem: str
+ updater: Optional[SwarmUpdaterBundle] = None
+
+
+class SwarmHostView(BaseModel):
+ uuid: str
+ name: str
+ address: str
+ agent_port: int
+ status: str
+ last_heartbeat: Optional[datetime] = None
+ client_cert_fingerprint: str
+ updater_cert_fingerprint: Optional[str] = None
+ enrolled_at: datetime
+ notes: Optional[str] = None
+ use_ipvlan: bool = False
+
+
+class DeckyShardView(BaseModel):
+ """One decky → host mapping, enriched with the host's identity for display."""
+ decky_name: str
+ decky_ip: Optional[str] = None # resolved from the stored DecnetConfig at read time
+ host_uuid: str
+ host_name: str
+ host_address: str
+ host_status: str
+ services: list[str]
+ state: str
+ last_error: Optional[str] = None
+ compose_hash: Optional[str] = None
+ updated_at: datetime
+ # Enriched fields lifted from the stored DeckyConfig snapshot so the
+ # dashboard can render the same card shape as the local-fleet view.
+ hostname: Optional[str] = None
+ distro: Optional[str] = None
+ archetype: Optional[str] = None
+ service_config: dict[str, dict[str, Any]] = {}
+ mutate_interval: Optional[int] = None
+ last_mutated: float = 0.0
+ last_seen: Optional[datetime] = None
+
+
+class SwarmDeployRequest(BaseModel):
+ config: DecnetConfig
+ dry_run: bool = False
+ no_cache: bool = False
+
+
+class SwarmTeardownRequest(BaseModel):
+ host_uuid: Optional[str] = PydanticField(
+ default=None,
+ description="If set, tear down only this worker; otherwise tear down all hosts",
+ )
+ decky_id: Optional[str] = None
+
+
+class SwarmHostResult(BaseModel):
+ host_uuid: str
+ host_name: str
+ ok: bool
+ detail: Any | None = None
+
+
+class SwarmDeployResponse(BaseModel):
+ results: list[SwarmHostResult]
+
+
+class SwarmHostHealth(BaseModel):
+ host_uuid: str
+ name: str
+ address: str
+ reachable: bool
+ detail: Any | None = None
+
+
+class SwarmCheckResponse(BaseModel):
+ results: list[SwarmHostHealth]
+
+
+# --- Remote Updates (master → worker /updater) DTOs ---
+# Powers the dashboard's Remote Updates page. The master dashboard calls
+# these (auth-gated) endpoints; internally they fan out to each worker's
+# updater daemon over mTLS via UpdaterClient.
+
+class HostReleaseInfo(BaseModel):
+ host_uuid: str
+ host_name: str
+ address: str
+ reachable: bool
+ # These fields mirror the updater's /health payload when reachable; they
+ # are all Optional so an unreachable host still serializes cleanly.
+ agent_status: Optional[str] = None
+ current_sha: Optional[str] = None
+ previous_sha: Optional[str] = None
+ releases: list[dict[str, Any]] = PydanticField(default_factory=list)
+ detail: Optional[str] = None # populated when unreachable
+
+
+class HostReleasesResponse(BaseModel):
+ hosts: list[HostReleaseInfo]
+
+
+class PushUpdateRequest(BaseModel):
+ host_uuids: Optional[list[str]] = PydanticField(
+ default=None,
+ description="Target specific hosts; mutually exclusive with 'all'.",
+ )
+ all: bool = PydanticField(default=False, description="Target every non-decommissioned host with an updater bundle.")
+ include_self: bool = PydanticField(
+ default=False,
+ description="After a successful /update, also push /update-self to upgrade the updater itself.",
+ )
+ exclude: list[str] = PydanticField(
+ default_factory=list,
+ description="Additional tarball exclude globs (on top of the built-in defaults).",
+ )
+
+
+class PushUpdateResult(BaseModel):
+ host_uuid: str
+ host_name: str
+ # updated = /update 200. rolled-back = /update 409 (auto-recovered).
+ # failed = transport error or non-200/409 response. self-updated = /update-self succeeded.
+ status: Literal["updated", "rolled-back", "failed", "self-updated", "self-failed"]
+ http_status: Optional[int] = None
+ sha: Optional[str] = None
+ detail: Optional[str] = None
+ stderr: Optional[str] = None
+
+
+class PushUpdateResponse(BaseModel):
+ sha: str
+ tarball_bytes: int
+ results: list[PushUpdateResult]
+
+
+class RollbackRequest(BaseModel):
+ host_uuid: str = PydanticField(..., description="Host to roll back to its previous release slot.")
+
+
+class RollbackResponse(BaseModel):
+ host_uuid: str
+ host_name: str
+ status: Literal["rolled-back", "failed"]
+ http_status: Optional[int] = None
+ detail: Optional[str] = None
diff --git a/decnet/web/db/mysql/__init__.py b/decnet/web/db/mysql/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/decnet/web/db/mysql/database.py b/decnet/web/db/mysql/database.py
new file mode 100644
index 0000000..2e7b329
--- /dev/null
+++ b/decnet/web/db/mysql/database.py
@@ -0,0 +1,98 @@
+"""
+MySQL async engine factory.
+
+Builds a SQLAlchemy AsyncEngine against MySQL using the ``asyncmy`` driver.
+
+Connection info is resolved (in order of precedence):
+
+1. An explicit ``url`` argument passed to :func:`get_async_engine`
+2. ``DECNET_DB_URL`` — full SQLAlchemy URL
+3. Component env vars:
+ ``DECNET_DB_HOST`` (default ``localhost``)
+ ``DECNET_DB_PORT`` (default ``3306``)
+ ``DECNET_DB_NAME`` (default ``decnet``)
+ ``DECNET_DB_USER`` (default ``decnet``)
+ ``DECNET_DB_PASSWORD`` (default empty — raises unless pytest is running)
+"""
+from __future__ import annotations
+
+import os
+from typing import Optional
+from urllib.parse import quote_plus
+
+from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine
+
+
+DEFAULT_POOL_SIZE = int(os.environ.get("DECNET_DB_POOL_SIZE", "20"))
+DEFAULT_MAX_OVERFLOW = int(os.environ.get("DECNET_DB_MAX_OVERFLOW", "40"))
+DEFAULT_POOL_RECYCLE = int(os.environ.get("DECNET_DB_POOL_RECYCLE", "3600"))
+DEFAULT_POOL_PRE_PING = os.environ.get("DECNET_DB_POOL_PRE_PING", "true").lower() == "true"
+
+
def build_mysql_url(
    host: Optional[str] = None,
    port: Optional[int] = None,
    database: Optional[str] = None,
    user: Optional[str] = None,
    password: Optional[str] = None,
) -> str:
    """Compose an async SQLAlchemy URL for MySQL using the asyncmy driver.

    Component args override env vars. The user, password, and database
    segments are all percent-encoded so special characters (``@``, ``:``,
    ``/``, ``?``…) don't break URL parsing.

    Raises:
        ValueError: if no password is available and we are not running
            under pytest.
    """
    host = host or os.environ.get("DECNET_DB_HOST", "localhost")
    port = port or int(os.environ.get("DECNET_DB_PORT", "3306"))
    database = database or os.environ.get("DECNET_DB_NAME", "decnet")
    user = user or os.environ.get("DECNET_DB_USER", "decnet")

    if password is None:
        password = os.environ.get("DECNET_DB_PASSWORD", "")

    # Allow empty passwords during tests (pytest sets PYTEST_* env vars).
    # Outside tests, an empty MySQL password is almost never intentional.
    if not password and not any(k.startswith("PYTEST") for k in os.environ):
        raise ValueError(
            "DECNET_DB_PASSWORD is not set. Either export it, set DECNET_DB_URL, "
            "or run under pytest for an empty-password default."
        )

    # Encode every user-supplied URL segment, not just the password: a
    # database name containing '/' or '?' would otherwise corrupt the URL.
    pw_enc = quote_plus(password)
    user_enc = quote_plus(user)
    db_enc = quote_plus(database)
    return f"mysql+asyncmy://{user_enc}:{pw_enc}@{host}:{port}/{db_enc}"
+
+
def resolve_url(url: Optional[str] = None) -> str:
    """Pick a connection URL.

    Precedence: the explicit ``url`` argument, then the ``DECNET_DB_URL``
    environment variable, then a URL assembled from the component
    ``DECNET_DB_*`` variables via :func:`build_mysql_url`.
    """
    if url:
        return url
    return os.environ.get("DECNET_DB_URL") or build_mysql_url()
+
+
def get_async_engine(
    url: Optional[str] = None,
    *,
    pool_size: int = DEFAULT_POOL_SIZE,
    max_overflow: int = DEFAULT_MAX_OVERFLOW,
    pool_recycle: int = DEFAULT_POOL_RECYCLE,
    pool_pre_ping: bool = DEFAULT_POOL_PRE_PING,
    echo: bool = False,
) -> AsyncEngine:
    """Create an AsyncEngine for MySQL.

    Defaults tuned for a dashboard workload: a modest pool, hourly recycle
    to sidestep MySQL's idle-connection reaper, and pre-ping to fail fast
    if a pooled connection has been killed server-side.
    """
    connection_url = resolve_url(url)
    engine_options = {
        "echo": echo,
        "pool_size": pool_size,
        "max_overflow": max_overflow,
        "pool_recycle": pool_recycle,
        "pool_pre_ping": pool_pre_ping,
    }
    return create_async_engine(connection_url, **engine_options)
diff --git a/decnet/web/db/mysql/repository.py b/decnet/web/db/mysql/repository.py
new file mode 100644
index 0000000..f83b4bf
--- /dev/null
+++ b/decnet/web/db/mysql/repository.py
@@ -0,0 +1,141 @@
+"""
+MySQL implementation of :class:`BaseRepository`.
+
+Inherits the portable SQLModel query code from :class:`SQLModelRepository`
+and only overrides the two places where MySQL's SQL dialect differs from
+SQLite's:
+
+* :meth:`_migrate_attackers_table` — uses ``information_schema`` (MySQL
+ has no ``PRAGMA``).
+* :meth:`get_log_histogram` — uses ``FROM_UNIXTIME`` /
+ ``UNIX_TIMESTAMP`` + integer division for bucketing.
+"""
+from __future__ import annotations
+
+from typing import List, Optional
+
+from sqlalchemy import func, select, text, literal_column
+from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
+from sqlmodel.sql.expression import SelectOfScalar
+
+from decnet.web.db.models import Log
+from decnet.web.db.mysql.database import get_async_engine
+from decnet.web.db.sqlmodel_repo import SQLModelRepository
+
+
class MySQLRepository(SQLModelRepository):
    """MySQL backend — uses ``asyncmy``.

    Inherits all portable query code from :class:`SQLModelRepository` and
    overrides only the dialect-specific pieces: engine construction,
    legacy-schema migration (``information_schema`` instead of ``PRAGMA``),
    JSON field access, and the log-histogram bucket expression.
    """

    def __init__(self, url: Optional[str] = None, **engine_kwargs) -> None:
        self.engine = get_async_engine(url=url, **engine_kwargs)
        self.session_factory = async_sessionmaker(
            self.engine, class_=AsyncSession, expire_on_commit=False
        )

    async def _migrate_attackers_table(self) -> None:
        """Drop the legacy (pre-UUID) ``attackers`` table if it exists without a ``uuid`` column.

        MySQL exposes column metadata via ``information_schema.COLUMNS``.
        ``DATABASE()`` scopes the lookup to the currently connected schema.
        """
        async with self.engine.begin() as conn:
            rows = (await conn.execute(text(
                "SELECT COLUMN_NAME FROM information_schema.COLUMNS "
                "WHERE TABLE_SCHEMA = DATABASE() AND TABLE_NAME = 'attackers'"
            ))).fetchall()
            if rows and not any(r[0] == "uuid" for r in rows):
                await conn.execute(text("DROP TABLE attackers"))

    async def _migrate_column_types(self) -> None:
        """Upgrade TEXT → MEDIUMTEXT for columns that accumulate large JSON blobs.

        ``create_all()`` never alters existing columns, so tables created before
        ``_BIG_TEXT`` was introduced keep their 64 KiB ``TEXT`` cap. This method
        inspects ``information_schema`` and issues ``ALTER TABLE … MODIFY COLUMN``
        for each offending column found.
        """
        # MySQL rejects *literal* defaults on TEXT-family columns (error 1101:
        # "BLOB, TEXT, GEOMETRY or JSON column can't have a default value").
        # 8.0.13+ accepts *expression* defaults, which must be parenthesized —
        # hence DEFAULT ('[]') rather than DEFAULT '[]'.
        targets: dict[str, dict[str, str]] = {
            "attackers": {
                "commands": "MEDIUMTEXT NOT NULL DEFAULT ('[]')",
                "fingerprints": "MEDIUMTEXT NOT NULL DEFAULT ('[]')",
                "services": "MEDIUMTEXT NOT NULL DEFAULT ('[]')",
                "deckies": "MEDIUMTEXT NOT NULL DEFAULT ('[]')",
            },
            "state": {
                "value": "MEDIUMTEXT NOT NULL",
            },
        }
        async with self.engine.begin() as conn:
            rows = (await conn.execute(text(
                "SELECT TABLE_NAME, COLUMN_NAME FROM information_schema.COLUMNS "
                "WHERE TABLE_SCHEMA = DATABASE() "
                "  AND TABLE_NAME IN ('attackers', 'state') "
                "  AND COLUMN_NAME IN ('commands','fingerprints','services','deckies','value') "
                "  AND DATA_TYPE = 'text'"
            ))).fetchall()
            for table_name, col_name in rows:
                spec = targets.get(table_name, {}).get(col_name)
                if spec:
                    # Table/column names come from information_schema (not user
                    # input) and the spec from the dict above — safe to inline.
                    await conn.execute(text(
                        f"ALTER TABLE `{table_name}` MODIFY COLUMN `{col_name}` {spec}"
                    ))

    async def initialize(self) -> None:
        """Create tables and run all MySQL-specific migrations.

        Uses a MySQL advisory lock to serialize DDL across concurrent
        uvicorn workers — prevents the 'Table was skipped since its
        definition is being modified by concurrent DDL' race.

        Raises:
            RuntimeError: if the advisory lock cannot be acquired within
                the 30 s timeout.
        """
        from sqlmodel import SQLModel

        async with self.engine.connect() as lock_conn:
            # GET_LOCK returns 1 on success, 0 on timeout, NULL on error.
            # Proceeding without the lock would reintroduce the DDL race
            # the lock exists to prevent, so fail loudly instead.
            acquired = (await lock_conn.execute(
                text("SELECT GET_LOCK('decnet_schema_init', 30)")
            )).scalar()
            if acquired != 1:
                raise RuntimeError(
                    "Could not acquire MySQL advisory lock 'decnet_schema_init' "
                    "within 30s; another worker may be stuck in schema init."
                )
            try:
                await self._migrate_attackers_table()
                await self._migrate_column_types()
                async with self.engine.begin() as conn:
                    await conn.run_sync(SQLModel.metadata.create_all)
                await self._ensure_admin_user()
            finally:
                # Release explicitly; the surrounding context manager then
                # closes the connection (no separate close() needed).
                await lock_conn.execute(
                    text("SELECT RELEASE_LOCK('decnet_schema_init')")
                )

    def _json_field_equals(self, key: str):
        # MySQL 5.7+ exposes JSON_EXTRACT; JSON_UNQUOTE strips the quoting so
        # comparisons match the TEXT-stored-JSON behavior we rely on in SQLite.
        # NOTE(review): `key` is interpolated directly into SQL — callers must
        # sanitize it (the shared filter code strips non-[a-zA-Z0-9_]); confirm
        # no call site passes raw user input.
        return text(f"JSON_UNQUOTE(JSON_EXTRACT(fields, '$.{key}')) = :val")

    async def get_log_histogram(
        self,
        search: Optional[str] = None,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
        interval_minutes: int = 15,
    ) -> List[dict]:
        """Count logs per time bucket of ``interval_minutes`` (clamped to >= 1)."""
        bucket_seconds = max(interval_minutes, 1) * 60
        # Truncate each timestamp to the start of its bucket:
        #   FROM_UNIXTIME( (UNIX_TIMESTAMP(timestamp) DIV N) * N )
        # DIV is MySQL's integer division operator; bucket_seconds is an int
        # computed above, so inlining it is safe.
        bucket_expr = literal_column(
            f"FROM_UNIXTIME((UNIX_TIMESTAMP(timestamp) DIV {bucket_seconds}) * {bucket_seconds})"
        ).label("bucket_time")

        statement: SelectOfScalar = select(bucket_expr, func.count().label("count")).select_from(Log)
        statement = self._apply_filters(statement, search, start_time, end_time)
        statement = statement.group_by(literal_column("bucket_time")).order_by(
            literal_column("bucket_time")
        )

        async with self._session() as session:
            results = await session.execute(statement)
            # Normalize to ISO string for API parity with the SQLite backend
            # (SQLite's datetime() returns a string already; FROM_UNIXTIME
            # returns a datetime).
            out: List[dict] = []
            for r in results.all():
                ts = r[0]
                out.append({
                    "time": ts.isoformat(sep=" ") if hasattr(ts, "isoformat") else ts,
                    "count": r[1],
                })
            return out
diff --git a/decnet/web/db/repository.py b/decnet/web/db/repository.py
index 08a6259..d0513d4 100644
--- a/decnet/web/db/repository.py
+++ b/decnet/web/db/repository.py
@@ -15,6 +15,15 @@ class BaseRepository(ABC):
"""Add a new log entry to the database."""
pass
+ async def add_logs(self, log_entries: list[dict[str, Any]]) -> None:
+ """Bulk-insert log entries in a single transaction.
+
+ Default implementation falls back to per-row add_log; concrete
+ repositories should override for a real single-commit insert.
+ """
+ for _entry in log_entries:
+ await self.add_log(_entry)
+
@abstractmethod
async def get_logs(
self,
@@ -60,6 +69,26 @@ class BaseRepository(ABC):
"""Update a user's password and change the must_change_password flag."""
pass
+ @abstractmethod
+ async def list_users(self) -> list[dict[str, Any]]:
+ """Retrieve all users (caller must strip password_hash before returning to clients)."""
+ pass
+
+ @abstractmethod
+ async def delete_user(self, uuid: str) -> bool:
+ """Delete a user by UUID. Returns True if user was found and deleted."""
+ pass
+
+ @abstractmethod
+ async def update_user_role(self, uuid: str, role: str) -> None:
+ """Update a user's role."""
+ pass
+
+ @abstractmethod
+ async def purge_logs_and_bounties(self) -> dict[str, int]:
+ """Delete all logs, bounties, and attacker profiles. Returns counts of deleted rows."""
+ pass
+
@abstractmethod
async def add_bounty(self, bounty_data: dict[str, Any]) -> None:
"""Add a new harvested artifact (bounty) to the database."""
@@ -90,3 +119,118 @@ class BaseRepository(ABC):
async def set_state(self, key: str, value: Any) -> None:
"""Store a specific state entry by key."""
pass
+
+ @abstractmethod
+ async def get_max_log_id(self) -> int:
+ """Return the highest log ID, or 0 if the table is empty."""
+ pass
+
+ @abstractmethod
+ async def get_logs_after_id(self, last_id: int, limit: int = 500) -> list[dict[str, Any]]:
+ """Return logs with id > last_id, ordered by id ASC, up to limit."""
+ pass
+
+ @abstractmethod
+ async def get_all_bounties_by_ip(self) -> dict[str, list[dict[str, Any]]]:
+ """Retrieve all bounty rows grouped by attacker_ip."""
+ pass
+
+ @abstractmethod
+ async def get_bounties_for_ips(self, ips: set[str]) -> dict[str, list[dict[str, Any]]]:
+ """Retrieve bounty rows grouped by attacker_ip, filtered to only the given IPs."""
+ pass
+
+ @abstractmethod
+ async def upsert_attacker(self, data: dict[str, Any]) -> str:
+ """Insert or replace an attacker profile record. Returns the row's UUID."""
+ pass
+
+ @abstractmethod
+ async def upsert_attacker_behavior(self, attacker_uuid: str, data: dict[str, Any]) -> None:
+ """Insert or replace the behavioral/fingerprint row for an attacker."""
+ pass
+
+ @abstractmethod
+ async def get_attacker_behavior(self, attacker_uuid: str) -> Optional[dict[str, Any]]:
+ """Retrieve the behavioral/fingerprint row for an attacker UUID."""
+ pass
+
+ @abstractmethod
+ async def get_behaviors_for_ips(self, ips: set[str]) -> dict[str, dict[str, Any]]:
+ """Bulk-fetch behavior rows keyed by attacker IP (JOIN to attackers)."""
+ pass
+
+ @abstractmethod
+ async def get_attacker_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]:
+ """Retrieve a single attacker profile by UUID."""
+ pass
+
+ @abstractmethod
+ async def get_attackers(
+ self,
+ limit: int = 50,
+ offset: int = 0,
+ search: Optional[str] = None,
+ sort_by: str = "recent",
+ service: Optional[str] = None,
+ ) -> list[dict[str, Any]]:
+ """Retrieve paginated attacker profile records."""
+ pass
+
+ @abstractmethod
+ async def get_total_attackers(self, search: Optional[str] = None, service: Optional[str] = None) -> int:
+ """Retrieve the total count of attacker profile records, optionally filtered."""
+ pass
+
+ @abstractmethod
+ async def get_attacker_commands(
+ self,
+ uuid: str,
+ limit: int = 50,
+ offset: int = 0,
+ service: Optional[str] = None,
+ ) -> dict[str, Any]:
+ """Retrieve paginated commands for an attacker, optionally filtered by service."""
+ pass
+
+ @abstractmethod
+ async def get_attacker_artifacts(self, uuid: str) -> list[dict[str, Any]]:
+ """Return `file_captured` log rows for this attacker, newest first."""
+ pass
+
+ # ------------------------------------------------------------- swarm
+    # Swarm methods raise NotImplementedError by default: non-swarm deployments
+    # never call them, and swarm-capable backends override them.
+
+ async def add_swarm_host(self, data: dict[str, Any]) -> None:
+ raise NotImplementedError
+
+ async def get_swarm_host_by_name(self, name: str) -> Optional[dict[str, Any]]:
+ raise NotImplementedError
+
+ async def get_swarm_host_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]:
+ raise NotImplementedError
+
+ async def get_swarm_host_by_fingerprint(self, fingerprint: str) -> Optional[dict[str, Any]]:
+ raise NotImplementedError
+
+ async def list_swarm_hosts(self, status: Optional[str] = None) -> list[dict[str, Any]]:
+ raise NotImplementedError
+
+ async def update_swarm_host(self, uuid: str, fields: dict[str, Any]) -> None:
+ raise NotImplementedError
+
+ async def delete_swarm_host(self, uuid: str) -> bool:
+ raise NotImplementedError
+
+ async def upsert_decky_shard(self, data: dict[str, Any]) -> None:
+ raise NotImplementedError
+
+ async def list_decky_shards(self, host_uuid: Optional[str] = None) -> list[dict[str, Any]]:
+ raise NotImplementedError
+
+ async def delete_decky_shards_for_host(self, host_uuid: str) -> int:
+ raise NotImplementedError
+
+ async def delete_decky_shard(self, decky_name: str) -> bool:
+ raise NotImplementedError
diff --git a/decnet/web/db/sqlite/database.py b/decnet/web/db/sqlite/database.py
index 22ca549..e446958 100644
--- a/decnet/web/db/sqlite/database.py
+++ b/decnet/web/db/sqlite/database.py
@@ -1,5 +1,7 @@
+import os
+
from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker, create_async_engine
-from sqlalchemy import create_engine, Engine
+from sqlalchemy import create_engine, Engine, event
from sqlmodel import SQLModel
from typing import AsyncGenerator
@@ -11,7 +13,34 @@ def get_async_engine(db_path: str) -> AsyncEngine:
prefix = "sqlite+aiosqlite:///"
if db_path.startswith(":memory:"):
prefix = "sqlite+aiosqlite://"
- return create_async_engine(f"{prefix}{db_path}", echo=False, connect_args={"uri": True})
+
+ pool_size = int(os.environ.get("DECNET_DB_POOL_SIZE", "20"))
+ max_overflow = int(os.environ.get("DECNET_DB_MAX_OVERFLOW", "40"))
+
+ pool_recycle = int(os.environ.get("DECNET_DB_POOL_RECYCLE", "3600"))
+ # SQLite is a local file — dead-connection probes are pure overhead.
+ # Env var stays for network-mounted setups that still want it.
+ pool_pre_ping = os.environ.get("DECNET_DB_POOL_PRE_PING", "false").lower() == "true"
+
+ engine = create_async_engine(
+ f"{prefix}{db_path}",
+ echo=False,
+ pool_size=pool_size,
+ max_overflow=max_overflow,
+ pool_recycle=pool_recycle,
+ pool_pre_ping=pool_pre_ping,
+ connect_args={"uri": True, "timeout": 30},
+ )
+
+ @event.listens_for(engine.sync_engine, "connect")
+ def _set_sqlite_pragmas(dbapi_conn, _conn_record):
+ cursor = dbapi_conn.cursor()
+ cursor.execute("PRAGMA journal_mode=WAL")
+ cursor.execute("PRAGMA synchronous=NORMAL")
+ cursor.execute("PRAGMA busy_timeout=30000")
+ cursor.close()
+
+ return engine
def get_sync_engine(db_path: str) -> Engine:
prefix = "sqlite:///"
diff --git a/decnet/web/db/sqlite/repository.py b/decnet/web/db/sqlite/repository.py
index 9f28a33..5965d0b 100644
--- a/decnet/web/db/sqlite/repository.py
+++ b/decnet/web/db/sqlite/repository.py
@@ -1,23 +1,22 @@
-import asyncio
-import json
-import uuid
-from datetime import datetime
-from typing import Any, Optional, List
+from typing import List, Optional
-from sqlalchemy import func, select, desc, asc, text, or_, update, literal_column
+from sqlalchemy import func, select, text, literal_column
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker
from sqlmodel.sql.expression import SelectOfScalar
-from decnet.config import load_state, _ROOT
-from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
-from decnet.web.auth import get_password_hash
-from decnet.web.db.repository import BaseRepository
-from decnet.web.db.models import User, Log, Bounty, State
+from decnet.config import _ROOT
+from decnet.web.db.models import Log
from decnet.web.db.sqlite.database import get_async_engine
+from decnet.web.db.sqlmodel_repo import SQLModelRepository
-class SQLiteRepository(BaseRepository):
- """SQLite implementation using SQLModel and SQLAlchemy Async."""
+class SQLiteRepository(SQLModelRepository):
+ """SQLite backend — uses ``aiosqlite``.
+
+ Overrides the two places where SQLite's SQL dialect differs from
+ MySQL/PostgreSQL: legacy-schema migration (via ``PRAGMA table_info``)
+ and the log-histogram bucket expression (via ``strftime`` + ``unixepoch``).
+ """
def __init__(self, db_path: str = str(_ROOT / "decnet.db")) -> None:
self.db_path = db_path
@@ -26,173 +25,16 @@ class SQLiteRepository(BaseRepository):
self.engine, class_=AsyncSession, expire_on_commit=False
)
- async def initialize(self) -> None:
- """Async warm-up / verification. Creates tables if they don't exist."""
- from sqlmodel import SQLModel
+ async def _migrate_attackers_table(self) -> None:
+ """Drop the old attackers table if it lacks the uuid column (pre-UUID schema)."""
async with self.engine.begin() as conn:
- await conn.run_sync(SQLModel.metadata.create_all)
+ rows = (await conn.execute(text("PRAGMA table_info(attackers)"))).fetchall()
+ if rows and not any(r[1] == "uuid" for r in rows):
+ await conn.execute(text("DROP TABLE attackers"))
- async with self.session_factory() as session:
- # Check if admin exists
- result = await session.execute(
- select(User).where(User.username == DECNET_ADMIN_USER)
- )
- if not result.scalar_one_or_none():
- session.add(User(
- uuid=str(uuid.uuid4()),
- username=DECNET_ADMIN_USER,
- password_hash=get_password_hash(DECNET_ADMIN_PASSWORD),
- role="admin",
- must_change_password=True,
- ))
- await session.commit()
-
- async def reinitialize(self) -> None:
- """Initialize the database schema asynchronously (useful for tests)."""
- from sqlmodel import SQLModel
- async with self.engine.begin() as conn:
- await conn.run_sync(SQLModel.metadata.create_all)
-
- async with self.session_factory() as session:
- result = await session.execute(
- select(User).where(User.username == DECNET_ADMIN_USER)
- )
- if not result.scalar_one_or_none():
- session.add(User(
- uuid=str(uuid.uuid4()),
- username=DECNET_ADMIN_USER,
- password_hash=get_password_hash(DECNET_ADMIN_PASSWORD),
- role="admin",
- must_change_password=True,
- ))
- await session.commit()
-
- # ------------------------------------------------------------------ logs
-
- async def add_log(self, log_data: dict[str, Any]) -> None:
- data = log_data.copy()
- if "fields" in data and isinstance(data["fields"], dict):
- data["fields"] = json.dumps(data["fields"])
- if "timestamp" in data and isinstance(data["timestamp"], str):
- try:
- data["timestamp"] = datetime.fromisoformat(
- data["timestamp"].replace("Z", "+00:00")
- )
- except ValueError:
- pass
-
- async with self.session_factory() as session:
- session.add(Log(**data))
- await session.commit()
-
- def _apply_filters(
- self,
- statement: SelectOfScalar,
- search: Optional[str],
- start_time: Optional[str],
- end_time: Optional[str],
- ) -> SelectOfScalar:
- import re
- import shlex
-
- if start_time:
- statement = statement.where(Log.timestamp >= start_time)
- if end_time:
- statement = statement.where(Log.timestamp <= end_time)
-
- if search:
- try:
- tokens = shlex.split(search)
- except ValueError:
- tokens = search.split()
-
- core_fields = {
- "decky": Log.decky,
- "service": Log.service,
- "event": Log.event_type,
- "attacker": Log.attacker_ip,
- "attacker-ip": Log.attacker_ip,
- "attacker_ip": Log.attacker_ip,
- }
-
- for token in tokens:
- if ":" in token:
- key, val = token.split(":", 1)
- if key in core_fields:
- statement = statement.where(core_fields[key] == val)
- else:
- key_safe = re.sub(r"[^a-zA-Z0-9_]", "", key)
- if key_safe:
- statement = statement.where(
- text(f"json_extract(fields, '$.{key_safe}') = :val")
- ).params(val=val)
- else:
- lk = f"%{token}%"
- statement = statement.where(
- or_(
- Log.raw_line.like(lk),
- Log.decky.like(lk),
- Log.service.like(lk),
- Log.attacker_ip.like(lk),
- )
- )
- return statement
-
- async def get_logs(
- self,
- limit: int = 50,
- offset: int = 0,
- search: Optional[str] = None,
- start_time: Optional[str] = None,
- end_time: Optional[str] = None,
- ) -> List[dict]:
- statement = (
- select(Log)
- .order_by(desc(Log.timestamp))
- .offset(offset)
- .limit(limit)
- )
- statement = self._apply_filters(statement, search, start_time, end_time)
-
- async with self.session_factory() as session:
- results = await session.execute(statement)
- return [log.model_dump(mode='json') for log in results.scalars().all()]
-
- async def get_max_log_id(self) -> int:
- async with self.session_factory() as session:
- result = await session.execute(select(func.max(Log.id)))
- val = result.scalar()
- return val if val is not None else 0
-
- async def get_logs_after_id(
- self,
- last_id: int,
- limit: int = 50,
- search: Optional[str] = None,
- start_time: Optional[str] = None,
- end_time: Optional[str] = None,
- ) -> List[dict]:
- statement = (
- select(Log).where(Log.id > last_id).order_by(asc(Log.id)).limit(limit)
- )
- statement = self._apply_filters(statement, search, start_time, end_time)
-
- async with self.session_factory() as session:
- results = await session.execute(statement)
- return [log.model_dump(mode='json') for log in results.scalars().all()]
-
- async def get_total_logs(
- self,
- search: Optional[str] = None,
- start_time: Optional[str] = None,
- end_time: Optional[str] = None,
- ) -> int:
- statement = select(func.count()).select_from(Log)
- statement = self._apply_filters(statement, search, start_time, end_time)
-
- async with self.session_factory() as session:
- result = await session.execute(statement)
- return result.scalar() or 0
    def _json_field_equals(self, key: str):
        # SQLite stores JSON as text; json_extract is the canonical accessor.
        # NOTE(review): `key` is interpolated directly into SQL — callers are
        # expected to sanitize it (the shared filter code strips everything
        # but [a-zA-Z0-9_]); confirm no call site passes raw user input.
        return text(f"json_extract(fields, '$.{key}') = :val")
async def get_log_histogram(
self,
@@ -206,173 +48,12 @@ class SQLiteRepository(BaseRepository):
f"datetime((strftime('%s', timestamp) / {bucket_seconds}) * {bucket_seconds}, 'unixepoch')"
).label("bucket_time")
- statement = select(bucket_expr, func.count().label("count")).select_from(Log)
+ statement: SelectOfScalar = select(bucket_expr, func.count().label("count")).select_from(Log)
statement = self._apply_filters(statement, search, start_time, end_time)
statement = statement.group_by(literal_column("bucket_time")).order_by(
literal_column("bucket_time")
)
- async with self.session_factory() as session:
+ async with self._session() as session:
results = await session.execute(statement)
return [{"time": r[0], "count": r[1]} for r in results.all()]
-
- async def get_stats_summary(self) -> dict[str, Any]:
- async with self.session_factory() as session:
- total_logs = (
- await session.execute(select(func.count()).select_from(Log))
- ).scalar() or 0
- unique_attackers = (
- await session.execute(
- select(func.count(func.distinct(Log.attacker_ip)))
- )
- ).scalar() or 0
- active_deckies = (
- await session.execute(
- select(func.count(func.distinct(Log.decky)))
- )
- ).scalar() or 0
-
- _state = await asyncio.to_thread(load_state)
- deployed_deckies = len(_state[0].deckies) if _state else 0
-
- return {
- "total_logs": total_logs,
- "unique_attackers": unique_attackers,
- "active_deckies": active_deckies,
- "deployed_deckies": deployed_deckies,
- }
-
- async def get_deckies(self) -> List[dict]:
- _state = await asyncio.to_thread(load_state)
- return [_d.model_dump() for _d in _state[0].deckies] if _state else []
-
- # ------------------------------------------------------------------ users
-
- async def get_user_by_username(self, username: str) -> Optional[dict]:
- async with self.session_factory() as session:
- result = await session.execute(
- select(User).where(User.username == username)
- )
- user = result.scalar_one_or_none()
- return user.model_dump() if user else None
-
- async def get_user_by_uuid(self, uuid: str) -> Optional[dict]:
- async with self.session_factory() as session:
- result = await session.execute(
- select(User).where(User.uuid == uuid)
- )
- user = result.scalar_one_or_none()
- return user.model_dump() if user else None
-
- async def create_user(self, user_data: dict[str, Any]) -> None:
- async with self.session_factory() as session:
- session.add(User(**user_data))
- await session.commit()
-
- async def update_user_password(
- self, uuid: str, password_hash: str, must_change_password: bool = False
- ) -> None:
- async with self.session_factory() as session:
- await session.execute(
- update(User)
- .where(User.uuid == uuid)
- .values(
- password_hash=password_hash,
- must_change_password=must_change_password,
- )
- )
- await session.commit()
-
- # ---------------------------------------------------------------- bounties
-
- async def add_bounty(self, bounty_data: dict[str, Any]) -> None:
- data = bounty_data.copy()
- if "payload" in data and isinstance(data["payload"], dict):
- data["payload"] = json.dumps(data["payload"])
-
- async with self.session_factory() as session:
- session.add(Bounty(**data))
- await session.commit()
-
- def _apply_bounty_filters(
- self,
- statement: SelectOfScalar,
- bounty_type: Optional[str],
- search: Optional[str]
- ) -> SelectOfScalar:
- if bounty_type:
- statement = statement.where(Bounty.bounty_type == bounty_type)
- if search:
- lk = f"%{search}%"
- statement = statement.where(
- or_(
- Bounty.decky.like(lk),
- Bounty.service.like(lk),
- Bounty.attacker_ip.like(lk),
- Bounty.payload.like(lk),
- )
- )
- return statement
-
- async def get_bounties(
- self,
- limit: int = 50,
- offset: int = 0,
- bounty_type: Optional[str] = None,
- search: Optional[str] = None,
- ) -> List[dict]:
- statement = (
- select(Bounty)
- .order_by(desc(Bounty.timestamp))
- .offset(offset)
- .limit(limit)
- )
- statement = self._apply_bounty_filters(statement, bounty_type, search)
-
- async with self.session_factory() as session:
- results = await session.execute(statement)
- final = []
- for item in results.scalars().all():
- d = item.model_dump(mode='json')
- try:
- d["payload"] = json.loads(d["payload"])
- except (json.JSONDecodeError, TypeError):
- pass
- final.append(d)
- return final
-
- async def get_total_bounties(
- self, bounty_type: Optional[str] = None, search: Optional[str] = None
- ) -> int:
- statement = select(func.count()).select_from(Bounty)
- statement = self._apply_bounty_filters(statement, bounty_type, search)
-
- async with self.session_factory() as session:
- result = await session.execute(statement)
- return result.scalar() or 0
-
- async def get_state(self, key: str) -> Optional[dict[str, Any]]:
- async with self.session_factory() as session:
- statement = select(State).where(State.key == key)
- result = await session.execute(statement)
- state = result.scalar_one_or_none()
- if state:
- return json.loads(state.value)
- return None
-
- async def set_state(self, key: str, value: Any) -> None: # noqa: ANN401
- async with self.session_factory() as session:
- # Check if exists
- statement = select(State).where(State.key == key)
- result = await session.execute(statement)
- state = result.scalar_one_or_none()
-
- value_json = json.dumps(value)
- if state:
- state.value = value_json
- session.add(state)
- else:
- new_state = State(key=key, value=value_json)
- session.add(new_state)
-
- await session.commit()
diff --git a/decnet/web/db/sqlmodel_repo.py b/decnet/web/db/sqlmodel_repo.py
new file mode 100644
index 0000000..b5f40f4
--- /dev/null
+++ b/decnet/web/db/sqlmodel_repo.py
@@ -0,0 +1,901 @@
+"""
+Shared SQLModel-based repository implementation.
+
+Contains all dialect-portable query code used by the SQLite and MySQL
+backends. Dialect-specific behavior lives in subclasses:
+
+* engine/session construction (``__init__``)
+* ``_migrate_attackers_table`` (legacy schema check; DDL introspection
+ is not portable)
+* ``get_log_histogram`` (date-bucket expression differs per dialect)
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+
+import orjson
+import uuid
+from datetime import datetime, timezone
+from typing import Any, Optional, List
+
+from sqlalchemy import func, select, desc, asc, text, or_, update
+from sqlalchemy.ext.asyncio import AsyncEngine, AsyncSession, async_sessionmaker
+from sqlmodel.sql.expression import SelectOfScalar
+
+from decnet.config import load_state
+from decnet.env import DECNET_ADMIN_USER, DECNET_ADMIN_PASSWORD
+from decnet.web.auth import get_password_hash
+from decnet.web.db.repository import BaseRepository
+from decnet.web.db.models import (
+ User,
+ Log,
+ Bounty,
+ State,
+ Attacker,
+ AttackerBehavior,
+ SwarmHost,
+ DeckyShard,
+)
+
+
+from contextlib import asynccontextmanager
+
+from decnet.logging import get_logger
+
+_log = get_logger("db.pool")
+
+# Hold strong refs to in-flight cleanup tasks so they aren't GC'd mid-run.
+_cleanup_tasks: set[asyncio.Task] = set()
+
+
def _detach_close(session: AsyncSession) -> None:
    """Hand session cleanup to a fresh task so the caller's cancellation
    doesn't interrupt it.

    ``asyncio.shield`` doesn't help on the exception path: shield prevents
    *other* tasks from cancelling the inner coroutine, but if the *current*
    task is already cancelled, its next ``await`` re-raises
    ``CancelledError`` as soon as the inner coroutine yields. That's what
    happens when uvicorn cancels a request mid-query — the rollback inside
    ``session.close()`` can't complete, and the aiomysql connection is
    orphaned (pool logs "non-checked-in connection" on GC).

    A fresh task isn't subject to the caller's pending cancellation, so
    ``close()`` (or the ``invalidate()`` fallback for a dead connection)
    runs to completion and the pool reclaims the connection promptly.

    Fire-and-forget on purpose: the caller is already unwinding and must
    not wait on cleanup.
    """
    async def _cleanup() -> None:
        try:
            await session.close()
        except BaseException:
            # close() failed (dead/mid-rollback connection) — invalidate so
            # the pool discards the connection rather than reusing it.
            try:
                session.sync_session.invalidate()
            except BaseException:
                _log.debug("detach-close: invalidate failed", exc_info=True)

    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No running loop (shutdown path) — best-effort sync invalidate.
        try:
            session.sync_session.invalidate()
        except BaseException:
            _log.debug("detach-close: no-loop invalidate failed", exc_info=True)
        return

    task = loop.create_task(_cleanup())
    _cleanup_tasks.add(task)

    def _reap(t: asyncio.Task) -> None:
        # Drop the strong ref, then consume the outcome so asyncio never
        # logs "Task exception was never retrieved".
        #
        # BUGFIX: Task.exception() *raises* CancelledError when the task
        # was cancelled — calling it unconditionally (as the previous
        # lambda did) turns this done-callback itself into a loop-level
        # "Exception in callback" error. Check cancelled() first.
        _cleanup_tasks.discard(t)
        if not t.cancelled():
            t.exception()

    task.add_done_callback(_reap)
+
+
@asynccontextmanager
async def _safe_session(factory: async_sessionmaker[AsyncSession]):
    """Yield a session whose close() stays reliable under cancellation.

    On the success path, close() is awaited inline so the caller observes
    cleanup (commit visibility, connection release) before continuing.

    On any exception — including the CancelledError a client disconnect
    injects — close() is handed off to a detached task instead: the caller
    is already unwinding, and an inline close there could be aborted
    mid-rollback by the caller's own pending cancellation, orphaning the
    aiomysql connection.
    """
    sess = factory()
    try:
        yield sess
    except BaseException:
        _detach_close(sess)
        raise
    await sess.close()
+
+
class SQLModelRepository(BaseRepository):
    """Concrete SQLModel/SQLAlchemy-async repository.

    Subclasses provide ``self.engine`` (AsyncEngine) and ``self.session_factory``
    in ``__init__``, and override the few dialect-specific helpers.
    """

    # Assigned by the dialect-specific subclass __init__ (SQLite / MySQL).
    engine: AsyncEngine
    session_factory: async_sessionmaker[AsyncSession]

    def _session(self):
        """Return a cancellation-safe session context manager."""
        return _safe_session(self.session_factory)
+
    # ------------------------------------------------------------ lifecycle

    async def initialize(self) -> None:
        """Create tables if absent and seed the admin user."""
        # Deferred import: SQLModel is only needed at schema-creation time.
        from sqlmodel import SQLModel
        # Migration runs *before* create_all so a legacy table is fixed up
        # rather than sitting alongside a freshly created one.
        await self._migrate_attackers_table()
        async with self.engine.begin() as conn:
            await conn.run_sync(SQLModel.metadata.create_all)
        await self._ensure_admin_user()

    async def reinitialize(self) -> None:
        """Re-create schema (for tests / reset flows). Does NOT drop existing tables."""
        from sqlmodel import SQLModel
        # No migration pass here, unlike initialize().
        async with self.engine.begin() as conn:
            await conn.run_sync(SQLModel.metadata.create_all)
        await self._ensure_admin_user()
+
    async def _ensure_admin_user(self) -> None:
        """Seed the admin account from env vars, or re-sync its hash while
        the account is still in the must-change-password state."""
        async with self._session() as session:
            result = await session.execute(
                select(User).where(User.username == DECNET_ADMIN_USER)
            )
            existing = result.scalar_one_or_none()
            if existing is None:
                # First boot: create the admin with a forced password change.
                session.add(User(
                    uuid=str(uuid.uuid4()),
                    username=DECNET_ADMIN_USER,
                    password_hash=get_password_hash(DECNET_ADMIN_PASSWORD),
                    role="admin",
                    must_change_password=True,
                ))
                await session.commit()
                return
            # Self-heal env drift: if admin never finalized their password,
            # re-sync the hash from DECNET_ADMIN_PASSWORD. Otherwise leave
            # the user's chosen password alone.
            if existing.must_change_password:
                existing.password_hash = get_password_hash(DECNET_ADMIN_PASSWORD)
                session.add(existing)
                await session.commit()
+
+ async def _migrate_attackers_table(self) -> None:
+ """Legacy-schema cleanup. Override per dialect (DDL introspection is non-portable)."""
+ return None
+
+ # ---------------------------------------------------------------- logs
+
+ @staticmethod
+ def _normalize_log_row(log_data: dict[str, Any]) -> dict[str, Any]:
+ data = log_data.copy()
+ if "fields" in data and isinstance(data["fields"], dict):
+ data["fields"] = orjson.dumps(data["fields"]).decode()
+ if "timestamp" in data and isinstance(data["timestamp"], str):
+ try:
+ data["timestamp"] = datetime.fromisoformat(
+ data["timestamp"].replace("Z", "+00:00")
+ )
+ except ValueError:
+ pass
+ return data
+
    async def add_log(self, log_data: dict[str, Any]) -> None:
        """Insert one log row (fields/timestamp normalised first)."""
        data = self._normalize_log_row(log_data)
        async with self._session() as session:
            session.add(Log(**data))
            await session.commit()

    async def add_logs(self, log_entries: list[dict[str, Any]]) -> None:
        """Bulk insert — one session, one commit for the whole batch."""
        if not log_entries:
            return
        _rows = [Log(**self._normalize_log_row(e)) for e in log_entries]
        async with self._session() as session:
            session.add_all(_rows)
            await session.commit()
+
+ def _apply_filters(
+ self,
+ statement: SelectOfScalar,
+ search: Optional[str],
+ start_time: Optional[str],
+ end_time: Optional[str],
+ ) -> SelectOfScalar:
+ import re
+ import shlex
+
+ if start_time:
+ statement = statement.where(Log.timestamp >= start_time)
+ if end_time:
+ statement = statement.where(Log.timestamp <= end_time)
+
+ if search:
+ try:
+ tokens = shlex.split(search)
+ except ValueError:
+ tokens = search.split()
+
+ core_fields = {
+ "decky": Log.decky,
+ "service": Log.service,
+ "event": Log.event_type,
+ "attacker": Log.attacker_ip,
+ "attacker-ip": Log.attacker_ip,
+ "attacker_ip": Log.attacker_ip,
+ }
+
+ for token in tokens:
+ if ":" in token:
+ key, val = token.split(":", 1)
+ if key in core_fields:
+ statement = statement.where(core_fields[key] == val)
+ else:
+ key_safe = re.sub(r"[^a-zA-Z0-9_]", "", key)
+ if key_safe:
+ statement = statement.where(
+ self._json_field_equals(key_safe)
+ ).params(val=val)
+ else:
+ lk = f"%{token}%"
+ statement = statement.where(
+ or_(
+ Log.raw_line.like(lk),
+ Log.decky.like(lk),
+ Log.service.like(lk),
+ Log.attacker_ip.like(lk),
+ )
+ )
+ return statement
+
    def _json_field_equals(self, key: str):
        """Return a text() predicate that matches rows where fields->key == :val.

        Both SQLite and MySQL expose a ``JSON_EXTRACT`` function; MySQL also
        exposes the same function under ``json_extract`` (case-insensitive).
        The ``:val`` parameter is bound separately and must be supplied with
        ``.params(val=...)`` by the caller, which keeps us safe from injection.
        """
        # NOTE(review): *key* is interpolated into the SQL string — callers
        # must sanitize it first. Also, the fixed ``:val`` bind name means at
        # most one of these predicates can be safely bound per statement.
        return text(f"JSON_EXTRACT(fields, '$.{key}') = :val")
+
    async def get_logs(
        self,
        limit: int = 50,
        offset: int = 0,
        search: Optional[str] = None,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
    ) -> List[dict]:
        """Return one page of logs, newest first, with the shared search
        filters (see ``_apply_filters``) applied."""
        statement = (
            select(Log)
            .order_by(desc(Log.timestamp))
            .offset(offset)
            .limit(limit)
        )
        statement = self._apply_filters(statement, search, start_time, end_time)

        async with self._session() as session:
            results = await session.execute(statement)
            return [log.model_dump(mode="json") for log in results.scalars().all()]
+
+ async def get_max_log_id(self) -> int:
+ async with self._session() as session:
+ result = await session.execute(select(func.max(Log.id)))
+ val = result.scalar()
+ return val if val is not None else 0
+
    async def get_logs_after_id(
        self,
        last_id: int,
        limit: int = 50,
        search: Optional[str] = None,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
    ) -> List[dict]:
        """Logs with id > *last_id* in ascending id order — the incremental
        tail-follow path (callers poll with the last id they saw)."""
        statement = (
            select(Log).where(Log.id > last_id).order_by(asc(Log.id)).limit(limit)
        )
        statement = self._apply_filters(statement, search, start_time, end_time)

        async with self._session() as session:
            results = await session.execute(statement)
            return [log.model_dump(mode="json") for log in results.scalars().all()]
+
    async def get_total_logs(
        self,
        search: Optional[str] = None,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
    ) -> int:
        """Count of logs matching the shared search filters."""
        statement = select(func.count()).select_from(Log)
        statement = self._apply_filters(statement, search, start_time, end_time)

        async with self._session() as session:
            result = await session.execute(statement)
            return result.scalar() or 0

    async def get_log_histogram(
        self,
        search: Optional[str] = None,
        start_time: Optional[str] = None,
        end_time: Optional[str] = None,
        interval_minutes: int = 15,
    ) -> List[dict]:
        """Dialect-specific — override per backend."""
        raise NotImplementedError
+
    async def get_stats_summary(self) -> dict[str, Any]:
        """Dashboard counters: total logs, distinct attacker IPs, decky count."""
        async with self._session() as session:
            total_logs = (
                await session.execute(select(func.count()).select_from(Log))
            ).scalar() or 0
            unique_attackers = (
                await session.execute(
                    select(func.count(func.distinct(Log.attacker_ip)))
                )
            ).scalar() or 0

        # load_state is synchronous — run in a thread to keep the loop free.
        _state = await asyncio.to_thread(load_state)
        deployed_deckies = len(_state[0].deckies) if _state else 0

        return {
            "total_logs": total_logs,
            "unique_attackers": unique_attackers,
            "active_deckies": deployed_deckies,
            "deployed_deckies": deployed_deckies,
        }

    async def get_deckies(self) -> List[dict]:
        """Deckies come from the on-disk state (via load_state), not the DB."""
        _state = await asyncio.to_thread(load_state)
        return [_d.model_dump() for _d in _state[0].deckies] if _state else []
+
    # --------------------------------------------------------------- users

    async def get_user_by_username(self, username: str) -> Optional[dict]:
        """Look up a user row by username; None when absent."""
        async with self._session() as session:
            result = await session.execute(
                select(User).where(User.username == username)
            )
            user = result.scalar_one_or_none()
            return user.model_dump() if user else None

    async def get_user_by_uuid(self, uuid: str) -> Optional[dict]:
        """Look up a user row by its UUID string; None when absent."""
        async with self._session() as session:
            result = await session.execute(
                select(User).where(User.uuid == uuid)
            )
            user = result.scalar_one_or_none()
            return user.model_dump() if user else None

    async def create_user(self, user_data: dict[str, Any]) -> None:
        """Insert a new user row from a prepared field dict."""
        async with self._session() as session:
            session.add(User(**user_data))
            await session.commit()
+
    async def update_user_password(
        self, uuid: str, password_hash: str, must_change_password: bool = False
    ) -> None:
        """Set a user's password hash and the change-required flag in one UPDATE."""
        async with self._session() as session:
            await session.execute(
                update(User)
                .where(User.uuid == uuid)
                .values(
                    password_hash=password_hash,
                    must_change_password=must_change_password,
                )
            )
            await session.commit()

    async def list_users(self) -> list[dict]:
        """All user rows as plain dicts."""
        async with self._session() as session:
            result = await session.execute(select(User))
            return [u.model_dump() for u in result.scalars().all()]

    async def delete_user(self, uuid: str) -> bool:
        """Delete the user with *uuid*; returns False when no such user."""
        async with self._session() as session:
            result = await session.execute(select(User).where(User.uuid == uuid))
            user = result.scalar_one_or_none()
            if not user:
                return False
            await session.delete(user)
            await session.commit()
            return True

    async def update_user_role(self, uuid: str, role: str) -> None:
        """Change a user's role (no-op when the uuid doesn't match a row)."""
        async with self._session() as session:
            await session.execute(
                update(User).where(User.uuid == uuid).values(role=role)
            )
            await session.commit()
+
    async def purge_logs_and_bounties(self) -> dict[str, int]:
        """Bulk-delete logs, bounties and attackers in one transaction;
        returns deleted-row counts per table."""
        async with self._session() as session:
            logs_deleted = (await session.execute(text("DELETE FROM logs"))).rowcount
            bounties_deleted = (await session.execute(text("DELETE FROM bounty"))).rowcount
            # attacker_behavior has FK → attackers.uuid; delete children first.
            await session.execute(text("DELETE FROM attacker_behavior"))
            attackers_deleted = (await session.execute(text("DELETE FROM attackers"))).rowcount
            await session.commit()
            return {
                "logs": logs_deleted,
                "bounties": bounties_deleted,
                "attackers": attackers_deleted,
            }
+
    # ------------------------------------------------------------ bounties

    async def add_bounty(self, bounty_data: dict[str, Any]) -> None:
        """Insert a bounty, silently skipping exact (type, ip, payload)
        duplicates."""
        data = bounty_data.copy()
        # Dict payloads are stored serialized, so equal dicts dedup too.
        if "payload" in data and isinstance(data["payload"], dict):
            data["payload"] = orjson.dumps(data["payload"]).decode()

        async with self._session() as session:
            dup = await session.execute(
                select(Bounty.id).where(
                    Bounty.bounty_type == data.get("bounty_type"),
                    Bounty.attacker_ip == data.get("attacker_ip"),
                    Bounty.payload == data.get("payload"),
                ).limit(1)
            )
            if dup.first() is not None:
                return
            session.add(Bounty(**data))
            await session.commit()
+
+ def _apply_bounty_filters(
+ self,
+ statement: SelectOfScalar,
+ bounty_type: Optional[str],
+ search: Optional[str],
+ ) -> SelectOfScalar:
+ if bounty_type:
+ statement = statement.where(Bounty.bounty_type == bounty_type)
+ if search:
+ lk = f"%{search}%"
+ statement = statement.where(
+ or_(
+ Bounty.decky.like(lk),
+ Bounty.service.like(lk),
+ Bounty.attacker_ip.like(lk),
+ Bounty.payload.like(lk),
+ )
+ )
+ return statement
+
    async def get_bounties(
        self,
        limit: int = 50,
        offset: int = 0,
        bounty_type: Optional[str] = None,
        search: Optional[str] = None,
    ) -> List[dict]:
        """Page of bounties, newest first; stored payload JSON is decoded
        when possible (non-JSON payloads pass through unchanged)."""
        statement = (
            select(Bounty)
            .order_by(desc(Bounty.timestamp))
            .offset(offset)
            .limit(limit)
        )
        statement = self._apply_bounty_filters(statement, bounty_type, search)

        async with self._session() as session:
            results = await session.execute(statement)
            final = []
            for item in results.scalars().all():
                d = item.model_dump(mode="json")
                try:
                    d["payload"] = json.loads(d["payload"])
                except (json.JSONDecodeError, TypeError):
                    # Not JSON (or None) — leave the raw value as-is.
                    pass
                final.append(d)
            return final

    async def get_total_bounties(
        self, bounty_type: Optional[str] = None, search: Optional[str] = None
    ) -> int:
        """Count of bounties matching the same filters as get_bounties."""
        statement = select(func.count()).select_from(Bounty)
        statement = self._apply_bounty_filters(statement, bounty_type, search)

        async with self._session() as session:
            result = await session.execute(statement)
            return result.scalar() or 0
+
    async def get_state(self, key: str) -> Optional[dict[str, Any]]:
        """Fetch and JSON-decode the state value stored under *key*;
        None when the key is absent."""
        async with self._session() as session:
            statement = select(State).where(State.key == key)
            result = await session.execute(statement)
            state = result.scalar_one_or_none()
            if state:
                return json.loads(state.value)
            return None

    async def set_state(self, key: str, value: Any) -> None:  # noqa: ANN401
        """Upsert *value* (JSON-serialized) under *key*."""
        async with self._session() as session:
            statement = select(State).where(State.key == key)
            result = await session.execute(statement)
            state = result.scalar_one_or_none()

            value_json = orjson.dumps(value).decode()
            if state:
                state.value = value_json
                session.add(state)
            else:
                session.add(State(key=key, value=value_json))

            await session.commit()
+
    # ----------------------------------------------------------- attackers

    async def get_all_bounties_by_ip(self) -> dict[str, List[dict[str, Any]]]:
        """All bounties grouped by attacker IP, oldest first in each group."""
        from collections import defaultdict
        async with self._session() as session:
            result = await session.execute(
                select(Bounty).order_by(asc(Bounty.timestamp))
            )
            grouped: dict[str, List[dict[str, Any]]] = defaultdict(list)
            for item in result.scalars().all():
                d = item.model_dump(mode="json")
                try:
                    d["payload"] = json.loads(d["payload"])
                except (json.JSONDecodeError, TypeError):
                    pass
                grouped[item.attacker_ip].append(d)
            return dict(grouped)

    async def get_bounties_for_ips(self, ips: set[str]) -> dict[str, List[dict[str, Any]]]:
        """Same shape as get_all_bounties_by_ip, restricted to *ips*."""
        from collections import defaultdict
        async with self._session() as session:
            result = await session.execute(
                select(Bounty).where(Bounty.attacker_ip.in_(ips)).order_by(asc(Bounty.timestamp))
            )
            grouped: dict[str, List[dict[str, Any]]] = defaultdict(list)
            for item in result.scalars().all():
                d = item.model_dump(mode="json")
                try:
                    d["payload"] = json.loads(d["payload"])
                except (json.JSONDecodeError, TypeError):
                    pass
                grouped[item.attacker_ip].append(d)
            return dict(grouped)
+
    async def upsert_attacker(self, data: dict[str, Any]) -> str:
        """Insert or update the attacker row keyed by ``data["ip"]``;
        returns the row's uuid (freshly generated on insert)."""
        async with self._session() as session:
            result = await session.execute(
                select(Attacker).where(Attacker.ip == data["ip"])
            )
            existing = result.scalar_one_or_none()
            if existing:
                # Overwrite every supplied field on the existing row.
                for k, v in data.items():
                    setattr(existing, k, v)
                session.add(existing)
                row_uuid = existing.uuid
            else:
                row_uuid = str(uuid.uuid4())
                data = {**data, "uuid": row_uuid}
                session.add(Attacker(**data))
            await session.commit()
            return row_uuid
+
    async def upsert_attacker_behavior(
        self,
        attacker_uuid: str,
        data: dict[str, Any],
    ) -> None:
        """Insert or update the behavior row for *attacker_uuid*, stamping
        ``updated_at`` with the current UTC time on every write."""
        async with self._session() as session:
            result = await session.execute(
                select(AttackerBehavior).where(
                    AttackerBehavior.attacker_uuid == attacker_uuid
                )
            )
            existing = result.scalar_one_or_none()
            payload = {**data, "updated_at": datetime.now(timezone.utc)}
            if existing:
                for k, v in payload.items():
                    setattr(existing, k, v)
                session.add(existing)
            else:
                session.add(AttackerBehavior(attacker_uuid=attacker_uuid, **payload))
            await session.commit()
+
    async def get_attacker_behavior(
        self,
        attacker_uuid: str,
    ) -> Optional[dict[str, Any]]:
        """Behavior row for *attacker_uuid* with JSON columns decoded;
        None when absent."""
        async with self._session() as session:
            result = await session.execute(
                select(AttackerBehavior).where(
                    AttackerBehavior.attacker_uuid == attacker_uuid
                )
            )
            row = result.scalar_one_or_none()
            if not row:
                return None
            return self._deserialize_behavior(row.model_dump(mode="json"))

    async def get_behaviors_for_ips(
        self,
        ips: set[str],
    ) -> dict[str, dict[str, Any]]:
        """Map attacker IP → decoded behavior row, for IPs that have one."""
        if not ips:
            return {}
        async with self._session() as session:
            result = await session.execute(
                select(Attacker.ip, AttackerBehavior)
                .join(AttackerBehavior, Attacker.uuid == AttackerBehavior.attacker_uuid)
                .where(Attacker.ip.in_(ips))
            )
            out: dict[str, dict[str, Any]] = {}
            for ip, row in result.all():
                out[ip] = self._deserialize_behavior(row.model_dump(mode="json"))
            return out
+
+ @staticmethod
+ def _deserialize_behavior(d: dict[str, Any]) -> dict[str, Any]:
+ for key in ("tcp_fingerprint", "timing_stats", "phase_sequence"):
+ if isinstance(d.get(key), str):
+ try:
+ d[key] = json.loads(d[key])
+ except (json.JSONDecodeError, TypeError):
+ pass
+ # Deserialize tool_guesses JSON array; normalise None → [].
+ raw = d.get("tool_guesses")
+ if isinstance(raw, str):
+ try:
+ parsed = json.loads(raw)
+ d["tool_guesses"] = parsed if isinstance(parsed, list) else [parsed]
+ except (json.JSONDecodeError, TypeError):
+ d["tool_guesses"] = []
+ elif raw is None:
+ d["tool_guesses"] = []
+ return d
+
+ @staticmethod
+ def _deserialize_attacker(d: dict[str, Any]) -> dict[str, Any]:
+ for key in ("services", "deckies", "fingerprints", "commands"):
+ if isinstance(d.get(key), str):
+ try:
+ d[key] = json.loads(d[key])
+ except (json.JSONDecodeError, TypeError):
+ pass
+ return d
+
    async def get_attacker_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]:
        """Attacker row with JSON columns decoded; None when absent."""
        async with self._session() as session:
            result = await session.execute(
                select(Attacker).where(Attacker.uuid == uuid)
            )
            attacker = result.scalar_one_or_none()
            if not attacker:
                return None
            return self._deserialize_attacker(attacker.model_dump(mode="json"))
+
    async def get_attackers(
        self,
        limit: int = 50,
        offset: int = 0,
        search: Optional[str] = None,
        sort_by: str = "recent",
        service: Optional[str] = None,
    ) -> List[dict[str, Any]]:
        """Page of attacker rows; ``sort_by`` ∈ {recent (default), active,
        traversals}; ``search`` is a LIKE match on IP."""
        order = {
            "active": desc(Attacker.event_count),
            "traversals": desc(Attacker.is_traversal),
        }.get(sort_by, desc(Attacker.last_seen))

        statement = select(Attacker).order_by(order).offset(offset).limit(limit)
        if search:
            statement = statement.where(Attacker.ip.like(f"%{search}%"))
        if service:
            # services is stored as a JSON string — match the quoted name.
            statement = statement.where(Attacker.services.like(f'%"{service}"%'))

        async with self._session() as session:
            result = await session.execute(statement)
            return [
                self._deserialize_attacker(a.model_dump(mode="json"))
                for a in result.scalars().all()
            ]

    async def get_total_attackers(
        self, search: Optional[str] = None, service: Optional[str] = None
    ) -> int:
        """Count of attackers matching the same filters as get_attackers."""
        statement = select(func.count()).select_from(Attacker)
        if search:
            statement = statement.where(Attacker.ip.like(f"%{search}%"))
        if service:
            statement = statement.where(Attacker.services.like(f'%"{service}"%'))

        async with self._session() as session:
            result = await session.execute(statement)
            return result.scalar() or 0
+
    async def get_attacker_commands(
        self,
        uuid: str,
        limit: int = 50,
        offset: int = 0,
        service: Optional[str] = None,
    ) -> dict[str, Any]:
        """Paginated command history stored on the attacker row.

        Returns ``{"total": n, "data": [...]}`` — total counts the
        (optionally service-filtered) commands *before* pagination.
        """
        async with self._session() as session:
            result = await session.execute(
                select(Attacker.commands).where(Attacker.uuid == uuid)
            )
            raw = result.scalar_one_or_none()
            if raw is None:
                return {"total": 0, "data": []}

            # Column may arrive as a JSON string or an already-decoded list.
            commands: list = json.loads(raw) if isinstance(raw, str) else raw
            if service:
                commands = [c for c in commands if c.get("service") == service]

            total = len(commands)
            page = commands[offset: offset + limit]
            return {"total": total, "data": page}
+
    async def get_attacker_artifacts(self, uuid: str) -> list[dict[str, Any]]:
        """Return `file_captured` logs for the attacker identified by UUID.

        Resolves the attacker's IP first, then queries the logs table on two
        indexed columns (``attacker_ip`` and ``event_type``). No JSON extract
        needed — the decky/stored_as are already decoded into ``fields`` by
        the ingester and returned to the frontend for drawer rendering.
        """
        async with self._session() as session:
            ip_res = await session.execute(
                select(Attacker.ip).where(Attacker.uuid == uuid)
            )
            ip = ip_res.scalar_one_or_none()
            if not ip:
                # Unknown uuid (or no IP recorded) — nothing to return.
                return []
            rows = await session.execute(
                select(Log)
                .where(Log.attacker_ip == ip)
                .where(Log.event_type == "file_captured")
                .order_by(desc(Log.timestamp))
                .limit(200)
            )
            return [r.model_dump(mode="json") for r in rows.scalars().all()]
+
    # ------------------------------------------------------------- swarm

    async def add_swarm_host(self, data: dict[str, Any]) -> None:
        """Insert a swarm host row from a prepared field dict."""
        async with self._session() as session:
            session.add(SwarmHost(**data))
            await session.commit()

    async def get_swarm_host_by_name(self, name: str) -> Optional[dict[str, Any]]:
        """Swarm host row by name; None when absent."""
        async with self._session() as session:
            result = await session.execute(select(SwarmHost).where(SwarmHost.name == name))
            row = result.scalar_one_or_none()
            return row.model_dump(mode="json") if row else None

    async def get_swarm_host_by_uuid(self, uuid: str) -> Optional[dict[str, Any]]:
        """Swarm host row by uuid; None when absent."""
        async with self._session() as session:
            result = await session.execute(select(SwarmHost).where(SwarmHost.uuid == uuid))
            row = result.scalar_one_or_none()
            return row.model_dump(mode="json") if row else None

    async def get_swarm_host_by_fingerprint(self, fingerprint: str) -> Optional[dict[str, Any]]:
        """Swarm host row by client-cert fingerprint; None when absent."""
        async with self._session() as session:
            result = await session.execute(
                select(SwarmHost).where(SwarmHost.client_cert_fingerprint == fingerprint)
            )
            row = result.scalar_one_or_none()
            return row.model_dump(mode="json") if row else None

    async def list_swarm_hosts(self, status: Optional[str] = None) -> list[dict[str, Any]]:
        """All swarm hosts ordered by name, optionally filtered by status."""
        statement = select(SwarmHost).order_by(asc(SwarmHost.name))
        if status:
            statement = statement.where(SwarmHost.status == status)
        async with self._session() as session:
            result = await session.execute(statement)
            return [r.model_dump(mode="json") for r in result.scalars().all()]
+
    async def update_swarm_host(self, uuid: str, fields: dict[str, Any]) -> None:
        """Partial update of a swarm host row; no-op when *fields* is empty."""
        if not fields:
            return
        async with self._session() as session:
            await session.execute(
                update(SwarmHost).where(SwarmHost.uuid == uuid).values(**fields)
            )
            await session.commit()

    async def delete_swarm_host(self, uuid: str) -> bool:
        """Delete a swarm host and its decky shards; False when host absent.

        Note: the shard deletion is committed even when the host doesn't
        exist (stale shards are cleaned up either way).
        """
        async with self._session() as session:
            # Clean up child shards first (no ON DELETE CASCADE portable across dialects).
            await session.execute(
                text("DELETE FROM decky_shards WHERE host_uuid = :u"), {"u": uuid}
            )
            result = await session.execute(
                select(SwarmHost).where(SwarmHost.uuid == uuid)
            )
            host = result.scalar_one_or_none()
            if not host:
                await session.commit()
                return False
            await session.delete(host)
            await session.commit()
            return True
+
    async def upsert_decky_shard(self, data: dict[str, Any]) -> None:
        """Insert or update the shard row keyed by ``decky_name``, stamping
        ``updated_at``; a list ``services`` value is stored as JSON."""
        payload = {**data, "updated_at": datetime.now(timezone.utc)}
        if isinstance(payload.get("services"), list):
            payload["services"] = orjson.dumps(payload["services"]).decode()
        async with self._session() as session:
            result = await session.execute(
                select(DeckyShard).where(DeckyShard.decky_name == payload["decky_name"])
            )
            existing = result.scalar_one_or_none()
            if existing:
                for k, v in payload.items():
                    setattr(existing, k, v)
                session.add(existing)
            else:
                session.add(DeckyShard(**payload))
            await session.commit()
+
    async def list_decky_shards(
        self, host_uuid: Optional[str] = None
    ) -> list[dict[str, Any]]:
        """Shard rows ordered by decky_name (optionally one host's), with
        ``services`` decoded and the ``decky_config`` snapshot flattened
        into the row."""
        statement = select(DeckyShard).order_by(asc(DeckyShard.decky_name))
        if host_uuid:
            statement = statement.where(DeckyShard.host_uuid == host_uuid)
        async with self._session() as session:
            result = await session.execute(statement)
            out: list[dict[str, Any]] = []
            for r in result.scalars().all():
                d = r.model_dump(mode="json")
                raw = d.get("services")
                if isinstance(raw, str):
                    try:
                        d["services"] = json.loads(raw)
                    except (json.JSONDecodeError, TypeError):
                        d["services"] = []
                # Flatten the stored DeckyConfig snapshot into the row so
                # routers can hand it to DeckyShardView without re-parsing.
                # Rows predating the migration have decky_config=NULL and
                # fall through with the default (None/{}) view values.
                cfg_raw = d.get("decky_config")
                if isinstance(cfg_raw, str):
                    try:
                        cfg = json.loads(cfg_raw)
                    except (json.JSONDecodeError, TypeError):
                        cfg = {}
                    if isinstance(cfg, dict):
                        for k in ("hostname", "distro", "archetype",
                                  "service_config", "mutate_interval",
                                  "last_mutated"):
                            if k in cfg and d.get(k) is None:
                                d[k] = cfg[k]
                        # Keep decky_ip authoritative from the column (newer
                        # heartbeats overwrite it) but fall back to the
                        # snapshot if the column is still NULL.
                        if not d.get("decky_ip") and cfg.get("ip"):
                            d["decky_ip"] = cfg["ip"]
                out.append(d)
            return out
+
    async def delete_decky_shards_for_host(self, host_uuid: str) -> int:
        """Delete every shard belonging to *host_uuid*; returns rows removed."""
        async with self._session() as session:
            result = await session.execute(
                text("DELETE FROM decky_shards WHERE host_uuid = :u"),
                {"u": host_uuid},
            )
            await session.commit()
            return result.rowcount or 0

    async def delete_decky_shard(self, decky_name: str) -> bool:
        """Delete the shard named *decky_name*; True when a row was removed."""
        async with self._session() as session:
            result = await session.execute(
                text("DELETE FROM decky_shards WHERE decky_name = :n"),
                {"n": decky_name},
            )
            await session.commit()
            return bool(result.rowcount)
diff --git a/decnet/web/dependencies.py b/decnet/web/dependencies.py
index 99a6d39..d3f83d2 100644
--- a/decnet/web/dependencies.py
+++ b/decnet/web/dependencies.py
@@ -1,3 +1,5 @@
+import asyncio
+import time
from typing import Any, Optional
import jwt
@@ -23,6 +25,88 @@ repo = get_repo()
oauth2_scheme = OAuth2PasswordBearer(tokenUrl="/api/v1/auth/login")
# Per-request user lookup was the hidden tax behind every authed endpoint —
# SELECT users WHERE uuid=? ran once per call, serializing through aiosqlite.
# A 10s TTL sits comfortably below JWT expiry, and every user write calls
# invalidate_user_cache().
_USER_TTL = 10.0
_user_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
_user_cache_lock: Optional[asyncio.Lock] = None

# Username cache for the login hot path. Short TTL — bcrypt verification
# still runs against the cached hash, so security is unchanged. Staleness
# window: after a password change, the old password works for at most
# _USERNAME_TTL seconds unless invalidate_user_cache fires first (it does,
# on every user write). Misses are never cached, so a freshly created user
# can log in immediately.
_USERNAME_TTL = 5.0
_username_cache: dict[str, tuple[dict[str, Any], float]] = {}
_username_cache_lock: Optional[asyncio.Lock] = None


def _reset_user_cache() -> None:
    """Discard both caches and their locks (test-isolation hook)."""
    global _user_cache, _user_cache_lock, _username_cache, _username_cache_lock
    _user_cache, _user_cache_lock = {}, None
    _username_cache, _username_cache_lock = {}, None


def invalidate_user_cache(user_uuid: Optional[str] = None) -> None:
    """Drop a single user (or all users) from the auth caches.

    Callers: password change, role change, user create/delete. The
    username cache is always cleared wholesale — there is no uuid→username
    index and user writes are rare, so the cost is trivial.
    """
    if user_uuid is None:
        _user_cache.clear()
    else:
        _user_cache.pop(user_uuid, None)
    _username_cache.clear()
+
+
async def get_user_by_username_cached(username: str) -> Optional[dict[str, Any]]:
    """Cached read of get_user_by_username for the login path.

    Positive hits are cached for _USERNAME_TTL seconds. Misses bypass
    the cache so a freshly-created user can log in immediately.
    """
    global _username_cache_lock
    # Lock-free fast path: single dict read, no await in between.
    entry = _username_cache.get(username)
    now = time.monotonic()
    if entry is not None and now - entry[1] < _USERNAME_TTL:
        return entry[0]
    # Lazy lock creation — no await between check and assign, so this is
    # race-free on a single-threaded event loop.
    if _username_cache_lock is None:
        _username_cache_lock = asyncio.Lock()
    async with _username_cache_lock:
        # Double-check under the lock: another waiter may have filled it.
        entry = _username_cache.get(username)
        now = time.monotonic()
        if entry is not None and now - entry[1] < _USERNAME_TTL:
            return entry[0]
        user = await repo.get_user_by_username(username)
        if user is not None:
            # Only positive results are cached (see docstring).
            _username_cache[username] = (user, time.monotonic())
        return user
+
+
async def _get_user_cached(user_uuid: str) -> Optional[dict[str, Any]]:
    """TTL-cached repo.get_user_by_uuid for authed-request dependencies.

    Unlike the username cache, misses (None) are cached too, bounding DB
    lookups for tokens that reference a deleted user.
    """
    global _user_cache_lock
    # Lock-free fast path.
    entry = _user_cache.get(user_uuid)
    now = time.monotonic()
    if entry is not None and now - entry[1] < _USER_TTL:
        return entry[0]
    if _user_cache_lock is None:
        _user_cache_lock = asyncio.Lock()
    async with _user_cache_lock:
        # Re-check under the lock so concurrent waiters hit the DB once.
        entry = _user_cache.get(user_uuid)
        now = time.monotonic()
        if entry is not None and now - entry[1] < _USER_TTL:
            return entry[0]
        user = await repo.get_user_by_uuid(user_uuid)
        _user_cache[user_uuid] = (user, time.monotonic())
        return user
+
+
async def get_stream_user(request: Request, token: Optional[str] = None) -> str:
"""Auth dependency for SSE endpoints — accepts Bearer header OR ?token= query param.
EventSource does not support custom headers, so the query-string fallback is intentional here only.
@@ -82,7 +166,7 @@ async def _decode_token(request: Request) -> str:
async def get_current_user(request: Request) -> str:
"""Auth dependency — enforces must_change_password."""
_user_uuid = await _decode_token(request)
- _user = await repo.get_user_by_uuid(_user_uuid)
+ _user = await _get_user_cached(_user_uuid)
if _user and _user.get("must_change_password"):
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
@@ -96,3 +180,57 @@ async def get_current_user_unchecked(request: Request) -> str:
Use only for endpoints that must remain reachable with the flag set (e.g. change-password).
"""
return await _decode_token(request)
+
+
+# ---------------------------------------------------------------------------
+# Role-based access control
+# ---------------------------------------------------------------------------
+
def require_role(*allowed_roles: str):
    """Factory that returns a FastAPI dependency enforcing role membership.

    Inlines JWT decode + user lookup + must_change_password + role check so the
    user is only loaded from the DB once per request (not once in
    ``get_current_user`` and again here). Returns the full user dict so
    endpoints can inspect ``user["uuid"]``, ``user["role"]``, etc.

    The returned dependency raises:
        401 — token invalid or the user no longer exists.
        403 — password change pending, or role not in *allowed_roles*.
    """
    async def _check(request: Request) -> dict:
        user_uuid = await _decode_token(request)
        user = await _get_user_cached(user_uuid)
        if not user:
            raise HTTPException(
                status_code=status.HTTP_401_UNAUTHORIZED,
                detail="Could not validate credentials",
                headers={"WWW-Authenticate": "Bearer"},
            )
        if user.get("must_change_password"):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Password change required before accessing this resource",
            )
        # .get() so a row with a missing "role" field is rejected with a
        # clean 403 instead of surfacing a KeyError as a 500.
        if user.get("role") not in allowed_roles:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions",
            )
        return user
    return _check
+
+
def require_stream_role(*allowed_roles: str):
    """Like ``require_role`` but for SSE endpoints that accept a query-param token.

    Mirrors ``require_role``'s checks (user exists, no pending forced
    password change, role allowed) so the query-token path is not a
    privilege loophole. Returns the full user dict.
    """
    async def _check(request: Request, token: Optional[str] = None) -> dict:
        user_uuid = await get_stream_user(request, token)
        user = await _get_user_cached(user_uuid)
        # Enforce must_change_password here too — require_role does, and a
        # flagged account should not keep streaming events either.
        if (
            not user
            or user.get("must_change_password")
            or user.get("role") not in allowed_roles
        ):
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN,
                detail="Insufficient permissions",
            )
        return user
    return _check
+
+
# Ready-made dependencies for the two-tier role model:
#   admin  — full control;  viewer — read-only (admins implicitly included).
require_admin = require_role("admin")
require_viewer = require_role("viewer", "admin")
require_stream_viewer = require_stream_role("viewer", "admin")
diff --git a/decnet/web/ingester.py b/decnet/web/ingester.py
index 96a224a..bca1d63 100644
--- a/decnet/web/ingester.py
+++ b/decnet/web/ingester.py
@@ -1,13 +1,24 @@
import asyncio
import os
-import logging
import json
+import time
from typing import Any
from pathlib import Path
+from decnet.env import DECNET_BATCH_SIZE, DECNET_BATCH_MAX_WAIT_MS
+from decnet.logging import get_logger
+from decnet.telemetry import (
+ traced as _traced,
+ get_tracer as _get_tracer,
+ extract_context as _extract_ctx,
+ start_span_with_context as _start_span,
+)
from decnet.web.db.repository import BaseRepository
-logger: logging.Logger = logging.getLogger("decnet.web.ingester")
+logger = get_logger("api")
+
+_INGEST_STATE_KEY = "ingest_worker_position"
+
async def log_ingestion_worker(repo: BaseRepository) -> None:
"""
@@ -20,9 +31,11 @@ async def log_ingestion_worker(repo: BaseRepository) -> None:
return
_json_log_path: Path = Path(_base_log_file).with_suffix(".json")
- _position: int = 0
- logger.info(f"Starting JSON log ingestion from {_json_log_path}")
+ _saved = await repo.get_state(_INGEST_STATE_KEY)
+ _position: int = _saved.get("position", 0) if _saved else 0
+
+ logger.info("ingest worker started path=%s position=%d", _json_log_path, _position)
while True:
try:
@@ -34,46 +47,103 @@ async def log_ingestion_worker(repo: BaseRepository) -> None:
if _stat.st_size < _position:
# File rotated or truncated
_position = 0
+ await repo.set_state(_INGEST_STATE_KEY, {"position": 0})
if _stat.st_size == _position:
# No new data
await asyncio.sleep(1)
continue
+ # Accumulate parsed rows and the file offset they end at. We
+ # only advance _position after the batch is successfully
+ # committed — if we get cancelled mid-flush, the next run
+ # re-reads the un-committed lines rather than losing them.
+ _batch: list[tuple[dict[str, Any], int]] = []
+ _batch_started: float = time.monotonic()
+ _max_wait_s: float = DECNET_BATCH_MAX_WAIT_MS / 1000.0
+
with open(_json_log_path, "r", encoding="utf-8", errors="replace") as _f:
_f.seek(_position)
while True:
_line: str = _f.readline()
- if not _line:
- break # EOF reached
-
- if not _line.endswith('\n'):
- # Partial line read, don't process yet, don't advance position
+ if not _line or not _line.endswith('\n'):
+ # EOF or partial line — flush what we have and stop
break
try:
_log_data: dict[str, Any] = json.loads(_line.strip())
- await repo.add_log(_log_data)
- await _extract_bounty(repo, _log_data)
+ # Collector injects trace context so the ingester span
+ # chains off the collector's — full event journey in Jaeger.
+ _parent_ctx = _extract_ctx(_log_data)
+ _tracer = _get_tracer("ingester")
+ with _start_span(_tracer, "ingester.process_record", context=_parent_ctx) as _span:
+ _span.set_attribute("decky", _log_data.get("decky", ""))
+ _span.set_attribute("service", _log_data.get("service", ""))
+ _span.set_attribute("event_type", _log_data.get("event_type", ""))
+ _span.set_attribute("attacker_ip", _log_data.get("attacker_ip", ""))
+ _sctx = getattr(_span, "get_span_context", None)
+ if _sctx:
+ _ctx = _sctx()
+ if _ctx and getattr(_ctx, "trace_id", 0):
+ _log_data["trace_id"] = format(_ctx.trace_id, "032x")
+ _log_data["span_id"] = format(_ctx.span_id, "016x")
+ _batch.append((_log_data, _f.tell()))
except json.JSONDecodeError:
- logger.error(f"Failed to decode JSON log line: {_line}")
+ logger.error("ingest: failed to decode JSON log line: %s", _line.strip())
+ # Skip past bad line so we don't loop forever on it.
+ _position = _f.tell()
continue
- # Update position after successful line read
- _position = _f.tell()
+ if len(_batch) >= DECNET_BATCH_SIZE or (
+ time.monotonic() - _batch_started >= _max_wait_s
+ ):
+ _position = await _flush_batch(repo, _batch, _position)
+ _batch.clear()
+ _batch_started = time.monotonic()
+
+ # Flush any remainder collected before EOF / partial-line break.
+ if _batch:
+ _position = await _flush_batch(repo, _batch, _position)
except Exception as _e:
_err_str = str(_e).lower()
if "no such table" in _err_str or "no active connection" in _err_str or "connection closed" in _err_str:
- logger.error(f"Post-shutdown or fatal DB error in ingester: {_e}")
+ logger.error("ingest: post-shutdown or fatal DB error: %s", _e)
break # Exit worker — DB is gone or uninitialized
- logger.error(f"Error in log ingestion worker: {_e}")
+ logger.error("ingest: error in worker: %s", _e)
await asyncio.sleep(5)
await asyncio.sleep(1)
async def _flush_batch(
    repo: BaseRepository,
    batch: list[tuple[dict[str, Any], int]],
    current_position: int,
) -> int:
    """Commit a batch of log rows and return the new file position.

    Args:
        repo: destination repository (bulk ``add_logs`` + state persistence).
        batch: ``(parsed_row, file_offset_after_row)`` pairs, in file order.
        current_position: offset to report when nothing is flushed.

    If the enclosing task is being cancelled, bail out without touching
    the DB — the session factory may already be disposed during lifespan
    teardown, and awaiting it would stall the worker. The un-flushed
    lines stay uncommitted; the next startup re-reads them from
    ``current_position``.
    """
    # Defensive: an empty batch has no rows and no new offset — without this
    # guard, batch[-1] below would raise IndexError.
    if not batch:
        return current_position

    _task = asyncio.current_task()
    if _task is not None and _task.cancelling():
        raise asyncio.CancelledError()

    _entries = [_entry for _entry, _ in batch]
    # Offsets increase monotonically within a batch, so the last pair
    # carries the end-of-batch file position.
    _new_position = batch[-1][1]
    await repo.add_logs(_entries)
    for _entry in _entries:
        await _extract_bounty(repo, _entry)
    # Checkpoint only after the rows are committed: a crash between the two
    # re-ingests (at worst duplicating) rather than dropping lines.
    await repo.set_state(_INGEST_STATE_KEY, {"position": _new_position})
    return _new_position
+
+
+@_traced("ingester.extract_bounty")
async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> None:
"""Detect and extract valuable artifacts (bounties) from log entries."""
_fields = log_data.get("fields")
@@ -96,4 +166,180 @@ async def _extract_bounty(repo: BaseRepository, log_data: dict[str, Any]) -> Non
}
})
- # 2. Add more extractors here later (e.g. file hashes, crypto keys)
+ # 2. HTTP User-Agent fingerprint
+ _h_raw = _fields.get("headers")
+ if isinstance(_h_raw, dict):
+ _headers = _h_raw
+ elif isinstance(_h_raw, str):
+ try:
+ _parsed = json.loads(_h_raw)
+ _headers = _parsed if isinstance(_parsed, dict) else {}
+ except (json.JSONDecodeError, ValueError):
+ _headers = {}
+ else:
+ _headers = {}
+ _ua = _headers.get("User-Agent") or _headers.get("user-agent")
+ if _ua:
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": log_data.get("service"),
+ "attacker_ip": log_data.get("attacker_ip"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "http_useragent",
+ "value": _ua,
+ "method": _fields.get("method"),
+ "path": _fields.get("path"),
+ }
+ })
+
+ # 3. VNC client version fingerprint
+ _vnc_ver = _fields.get("client_version")
+ if _vnc_ver and log_data.get("event_type") == "version":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": log_data.get("service"),
+ "attacker_ip": log_data.get("attacker_ip"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "vnc_client_version",
+ "value": _vnc_ver,
+ }
+ })
+
+ # 4. SSH client banner fingerprint (deferred — requires asyncssh server)
+ # Fires on: service=ssh, event_type=client_banner, fields.client_banner
+
+ # 5. JA3/JA3S TLS fingerprint from sniffer container
+ _ja3 = _fields.get("ja3")
+ if _ja3 and log_data.get("service") == "sniffer":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "sniffer",
+ "attacker_ip": log_data.get("attacker_ip"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "ja3",
+ "ja3": _ja3,
+ "ja3s": _fields.get("ja3s"),
+ "ja4": _fields.get("ja4"),
+ "ja4s": _fields.get("ja4s"),
+ "tls_version": _fields.get("tls_version"),
+ "sni": _fields.get("sni") or None,
+ "alpn": _fields.get("alpn") or None,
+ "dst_port": _fields.get("dst_port"),
+ "raw_ciphers": _fields.get("raw_ciphers"),
+ "raw_extensions": _fields.get("raw_extensions"),
+ },
+ })
+
+ # 6. JA4L latency fingerprint from sniffer
+ _ja4l_rtt = _fields.get("ja4l_rtt_ms")
+ if _ja4l_rtt and log_data.get("service") == "sniffer":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "sniffer",
+ "attacker_ip": log_data.get("attacker_ip"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "ja4l",
+ "rtt_ms": _ja4l_rtt,
+ "client_ttl": _fields.get("ja4l_client_ttl"),
+ },
+ })
+
+ # 7. TLS session resumption behavior
+ _resumption = _fields.get("resumption")
+ if _resumption and log_data.get("service") == "sniffer":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "sniffer",
+ "attacker_ip": log_data.get("attacker_ip"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "tls_resumption",
+ "mechanisms": _resumption,
+ },
+ })
+
+ # 8. TLS certificate details (TLS 1.2 only — passive extraction)
+ _subject_cn = _fields.get("subject_cn")
+ if _subject_cn and log_data.get("service") == "sniffer":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "sniffer",
+ "attacker_ip": log_data.get("attacker_ip"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "tls_certificate",
+ "subject_cn": _subject_cn,
+ "issuer": _fields.get("issuer"),
+ "self_signed": _fields.get("self_signed"),
+ "not_before": _fields.get("not_before"),
+ "not_after": _fields.get("not_after"),
+ "sans": _fields.get("sans"),
+ "sni": _fields.get("sni") or None,
+ },
+ })
+
+ # 9. JARM fingerprint from active prober
+ _jarm = _fields.get("jarm_hash")
+ if _jarm and log_data.get("service") == "prober":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "prober",
+ "attacker_ip": _fields.get("target_ip", "Unknown"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "jarm",
+ "hash": _jarm,
+ "target_ip": _fields.get("target_ip"),
+ "target_port": _fields.get("target_port"),
+ },
+ })
+
+ # 10. HASSHServer fingerprint from active prober
+ _hassh = _fields.get("hassh_server_hash")
+ if _hassh and log_data.get("service") == "prober":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "prober",
+ "attacker_ip": _fields.get("target_ip", "Unknown"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "hassh_server",
+ "hash": _hassh,
+ "target_ip": _fields.get("target_ip"),
+ "target_port": _fields.get("target_port"),
+ "ssh_banner": _fields.get("ssh_banner"),
+ "kex_algorithms": _fields.get("kex_algorithms"),
+ "encryption_s2c": _fields.get("encryption_s2c"),
+ "mac_s2c": _fields.get("mac_s2c"),
+ "compression_s2c": _fields.get("compression_s2c"),
+ },
+ })
+
+ # 11. TCP/IP stack fingerprint from active prober
+ _tcpfp = _fields.get("tcpfp_hash")
+ if _tcpfp and log_data.get("service") == "prober":
+ await repo.add_bounty({
+ "decky": log_data.get("decky"),
+ "service": "prober",
+ "attacker_ip": _fields.get("target_ip", "Unknown"),
+ "bounty_type": "fingerprint",
+ "payload": {
+ "fingerprint_type": "tcpfp",
+ "hash": _tcpfp,
+ "raw": _fields.get("tcpfp_raw"),
+ "target_ip": _fields.get("target_ip"),
+ "target_port": _fields.get("target_port"),
+ "ttl": _fields.get("ttl"),
+ "window_size": _fields.get("window_size"),
+ "df_bit": _fields.get("df_bit"),
+ "mss": _fields.get("mss"),
+ "window_scale": _fields.get("window_scale"),
+ "sack_ok": _fields.get("sack_ok"),
+ "timestamp": _fields.get("timestamp"),
+ "options_order": _fields.get("options_order"),
+ },
+ })
diff --git a/decnet/web/router/__init__.py b/decnet/web/router/__init__.py
index b1bd92e..cbbb99c 100644
--- a/decnet/web/router/__init__.py
+++ b/decnet/web/router/__init__.py
@@ -11,8 +11,32 @@ from .fleet.api_mutate_decky import router as mutate_decky_router
from .fleet.api_mutate_interval import router as mutate_interval_router
from .fleet.api_deploy_deckies import router as deploy_deckies_router
from .stream.api_stream_events import router as stream_router
+from .attackers.api_get_attackers import router as attackers_router
+from .attackers.api_get_attacker_detail import router as attacker_detail_router
+from .attackers.api_get_attacker_commands import router as attacker_commands_router
+from .attackers.api_get_attacker_artifacts import router as attacker_artifacts_router
+from .config.api_get_config import router as config_get_router
+from .config.api_update_config import router as config_update_router
+from .config.api_manage_users import router as config_users_router
+from .config.api_reinit import router as config_reinit_router
+from .health.api_get_health import router as health_router
+from .artifacts.api_get_artifact import router as artifacts_router
+from .swarm_updates import swarm_updates_router
+from .swarm_mgmt import swarm_mgmt_router
+from .system import system_router
-api_router = APIRouter()
+api_router = APIRouter(
+ # Every route under /api/v1 is auth-guarded (either by an explicit
+ # require_* Depends or by the global auth middleware). Document 401/403
+ # here so the OpenAPI schema reflects reality for contract tests.
+ responses={
+ 400: {"description": "Malformed request body"},
+ 401: {"description": "Missing or invalid credentials"},
+ 403: {"description": "Authenticated but not authorized"},
+ 404: {"description": "Referenced resource does not exist"},
+ 409: {"description": "Conflict with existing resource"},
+ },
+)
# Authentication
api_router.include_router(login_router)
@@ -31,6 +55,31 @@ api_router.include_router(mutate_decky_router)
api_router.include_router(mutate_interval_router)
api_router.include_router(deploy_deckies_router)
+# Attacker Profiles
+api_router.include_router(attackers_router)
+api_router.include_router(attacker_detail_router)
+api_router.include_router(attacker_commands_router)
+api_router.include_router(attacker_artifacts_router)
+
# Observability
api_router.include_router(stats_router)
api_router.include_router(stream_router)
+api_router.include_router(health_router)
+
+# Configuration
+api_router.include_router(config_get_router)
+api_router.include_router(config_update_router)
+api_router.include_router(config_users_router)
+api_router.include_router(config_reinit_router)
+
+# Artifacts (captured attacker file drops)
+api_router.include_router(artifacts_router)
+
+# Remote Updates (dashboard → worker updater daemons)
+api_router.include_router(swarm_updates_router)
+
+# Swarm Management (dashboard: hosts, deckies, agent enrollment bundles)
+api_router.include_router(swarm_mgmt_router)
+
+# System info (deployment-mode auto-detection, etc.)
+api_router.include_router(system_router)
diff --git a/decnet/web/router/artifacts/__init__.py b/decnet/web/router/artifacts/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/decnet/web/router/artifacts/api_get_artifact.py b/decnet/web/router/artifacts/api_get_artifact.py
new file mode 100644
index 0000000..c5f6c92
--- /dev/null
+++ b/decnet/web/router/artifacts/api_get_artifact.py
@@ -0,0 +1,84 @@
+"""
+Artifact download endpoint.
+
+SSH deckies farm attacker file drops into a host-mounted quarantine:
+ /var/lib/decnet/artifacts/{decky}/ssh/{stored_as}
+
+The capture event already flows through the normal log pipeline (one
+RFC 5424 line per capture, see templates/ssh/emit_capture.py), so metadata
+is served via /logs. This endpoint exists only to retrieve the raw bytes —
+admin-gated because the payloads are attacker-controlled content.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+from pathlib import Path
+
+from fastapi import APIRouter, Depends, HTTPException
+from fastapi.responses import FileResponse
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_admin
+
+router = APIRouter()
+
+# Override via env for tests; the prod path matches the bind mount declared in
+# decnet/services/ssh.py.
+ARTIFACTS_ROOT = Path(os.environ.get("DECNET_ARTIFACTS_ROOT", "/var/lib/decnet/artifacts"))
+
+# decky names come from the deployer — lowercase alnum plus hyphens.
+_DECKY_RE = re.compile(r"^[a-z0-9][a-z0-9-]{0,62}$")
+
+# stored_as is assembled by capture.sh as:
+# ${ts}_${sha:0:12}_${base}
+# where ts is ISO-8601 UTC (e.g. 2026-04-18T02:22:56Z), sha is 12 hex chars,
+# and base is the original filename's basename. Keep the filename charset
+# tight but allow common punctuation dropped files actually use.
+_STORED_AS_RE = re.compile(
+ r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z_[a-f0-9]{12}_[A-Za-z0-9._-]{1,255}$"
+)
+
+
def _resolve_artifact_path(decky: str, stored_as: str) -> Path:
    """Validate inputs, resolve the on-disk path, and confirm it stays inside
    the artifacts root. Raises HTTPException(400) on any violation."""
    if _DECKY_RE.fullmatch(decky) is None:
        raise HTTPException(status_code=400, detail="invalid decky name")
    if _STORED_AS_RE.fullmatch(stored_as) is None:
        raise HTTPException(status_code=400, detail="invalid stored_as")

    root = ARTIFACTS_ROOT.resolve()
    candidate = (root / decky / "ssh" / stored_as).resolve()
    # Belt and braces: the regexes already exclude `..` and `/`, but the
    # resolve() + containment test also defeats symlink escapes.
    inside_root = candidate == root or root in candidate.parents
    if not inside_root:
        raise HTTPException(status_code=400, detail="path escapes artifacts root")
    return candidate
+
+
@router.get(
    "/artifacts/{decky}/{stored_as}",
    tags=["Artifacts"],
    responses={
        400: {"description": "Invalid decky or stored_as parameter"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Admin access required"},
        404: {"description": "Artifact not found"},
    },
)
@_traced("api.get_artifact")
async def get_artifact(
    decky: str,
    stored_as: str,
    admin: dict = Depends(require_admin),
) -> FileResponse:
    """Return the raw bytes of one quarantined artifact to an admin."""
    artifact_path = _resolve_artifact_path(decky, stored_as)
    if not artifact_path.is_file():
        raise HTTPException(status_code=404, detail="artifact not found")
    # Always served as octet-stream: the payload is attacker-controlled,
    # never let a browser render it.
    return FileResponse(
        path=str(artifact_path),
        media_type="application/octet-stream",
        filename=stored_as,
    )
diff --git a/decnet/web/router/attackers/__init__.py b/decnet/web/router/attackers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/decnet/web/router/attackers/api_get_attacker_artifacts.py b/decnet/web/router/attackers/api_get_attacker_artifacts.py
new file mode 100644
index 0000000..000dc1f
--- /dev/null
+++ b/decnet/web/router/attackers/api_get_attacker_artifacts.py
@@ -0,0 +1,34 @@
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
+
+router = APIRouter()
+
+
@router.get(
    "/attackers/{uuid}/artifacts",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_artifacts")
async def get_attacker_artifacts(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """List captured file-drop artifacts for an attacker (newest first).

    Each entry is a `file_captured` log row — the frontend renders the
    badge/drawer using the same `fields` payload as /logs.
    """
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")
    artifact_rows = await repo.get_attacker_artifacts(uuid)
    return {"total": len(artifact_rows), "data": artifact_rows}
diff --git a/decnet/web/router/attackers/api_get_attacker_commands.py b/decnet/web/router/attackers/api_get_attacker_commands.py
new file mode 100644
index 0000000..14d03eb
--- /dev/null
+++ b/decnet/web/router/attackers/api_get_attacker_commands.py
@@ -0,0 +1,42 @@
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Query
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
+
+router = APIRouter()
+
+
@router.get(
    "/attackers/{uuid}/commands",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
        422: {"description": "Query parameter validation error (limit/offset out of range or invalid)"},
    },
)
@_traced("api.get_attacker_commands")
async def get_attacker_commands(
    uuid: str,
    limit: int = Query(50, ge=1, le=200),
    offset: int = Query(0, ge=0, le=2147483647),
    service: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve paginated commands for an attacker profile."""
    if not await repo.get_attacker_by_uuid(uuid):
        raise HTTPException(status_code=404, detail="Attacker not found")

    # Frontends sometimes serialize "no filter" as a literal sentinel
    # string; collapse those down to a real None before querying.
    service_filter = None if service in (None, "null", "NULL", "undefined", "") else service

    result = await repo.get_attacker_commands(
        uuid=uuid, limit=limit, offset=offset, service=service_filter,
    )
    return {
        "total": result["total"],
        "limit": limit,
        "offset": offset,
        "data": result["data"],
    }
diff --git a/decnet/web/router/attackers/api_get_attacker_detail.py b/decnet/web/router/attackers/api_get_attacker_detail.py
new file mode 100644
index 0000000..dcc9ebd
--- /dev/null
+++ b/decnet/web/router/attackers/api_get_attacker_detail.py
@@ -0,0 +1,30 @@
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
+
+router = APIRouter()
+
+
@router.get(
    "/attackers/{uuid}",
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Attacker not found"},
    },
)
@_traced("api.get_attacker_detail")
async def get_attacker_detail(
    uuid: str,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve a single attacker profile by UUID (with behavior block)."""
    profile = await repo.get_attacker_by_uuid(uuid)
    if not profile:
        raise HTTPException(status_code=404, detail="Attacker not found")
    # Attach the aggregated behavior summary so the detail view is one call.
    profile["behavior"] = await repo.get_attacker_behavior(uuid)
    return profile
diff --git a/decnet/web/router/attackers/api_get_attackers.py b/decnet/web/router/attackers/api_get_attackers.py
new file mode 100644
index 0000000..f1ff7b4
--- /dev/null
+++ b/decnet/web/router/attackers/api_get_attackers.py
@@ -0,0 +1,83 @@
+import asyncio
+import time
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends, Query
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
+from decnet.web.db.models import AttackersResponse
+
+router = APIRouter()
+
+# Same pattern as /logs — cache the unfiltered total count; filtered
+# counts go straight to the DB.
+_TOTAL_TTL = 2.0
+_total_cache: tuple[Optional[int], float] = (None, 0.0)
+_total_lock: Optional[asyncio.Lock] = None
+
+
def _reset_total_cache() -> None:
    """Forget the cached unfiltered attacker total and its lock
    (presumably a test hook — no production caller visible here)."""
    global _total_cache, _total_lock
    _total_lock = None
    _total_cache = (None, 0.0)
+
+
async def _get_total_attackers_cached() -> int:
    """Return the unfiltered attacker count, cached for _TOTAL_TTL seconds.

    Same double-checked pattern as the auth caches: lock-free fast path,
    then re-validate under an asyncio.Lock before hitting the DB.
    """
    global _total_cache, _total_lock
    # Fast path: unexpired cached value, no lock needed.
    value, ts = _total_cache
    now = time.monotonic()
    if value is not None and now - ts < _TOTAL_TTL:
        return value
    # Lazy lock creation — safe: no await between the check and assignment.
    if _total_lock is None:
        _total_lock = asyncio.Lock()
    async with _total_lock:
        # Re-check: another task may have refilled the cache while we waited.
        value, ts = _total_cache
        now = time.monotonic()
        if value is not None and now - ts < _TOTAL_TTL:
            return value
        value = await repo.get_total_attackers()
        _total_cache = (value, time.monotonic())
    return value
+
+
@router.get(
    "/attackers",
    response_model=AttackersResponse,
    tags=["Attacker Profiles"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        422: {"description": "Validation error"},
    },
)
@_traced("api.get_attackers")
async def get_attackers(
    limit: int = Query(50, ge=1, le=1000),
    offset: int = Query(0, ge=0, le=2147483647),
    search: Optional[str] = None,
    sort_by: str = Query("recent", pattern="^(recent|active|traversals)$"),
    service: Optional[str] = None,
    user: dict = Depends(require_viewer),
) -> dict[str, Any]:
    """Retrieve paginated attacker profiles."""
    # Frontends serialize "no filter" several ways; collapse them to None.
    _sentinels = (None, "null", "NULL", "undefined", "")
    norm_search = None if search in _sentinels else search
    norm_service = None if service in _sentinels else service

    page = await repo.get_attackers(
        limit=limit, offset=offset, search=norm_search, sort_by=sort_by, service=norm_service,
    )
    # The unfiltered total is hot and cacheable; filtered counts hit the DB.
    if norm_search is None and norm_service is None:
        total = await _get_total_attackers_cached()
    else:
        total = await repo.get_total_attackers(search=norm_search, service=norm_service)

    # Bulk-join behavior rows for the IPs in this page to avoid N+1 queries.
    page_ips = {row["ip"] for row in page if row.get("ip")}
    behaviors = await repo.get_behaviors_for_ips(page_ips) if page_ips else {}
    for row in page:
        row["behavior"] = behaviors.get(row.get("ip"))

    return {"total": total, "limit": limit, "offset": offset, "data": page}
diff --git a/decnet/web/router/auth/api_change_pass.py b/decnet/web/router/auth/api_change_pass.py
index c186973..592b11e 100644
--- a/decnet/web/router/auth/api_change_pass.py
+++ b/decnet/web/router/auth/api_change_pass.py
@@ -2,8 +2,9 @@ from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, status
-from decnet.web.auth import get_password_hash, verify_password
-from decnet.web.dependencies import get_current_user_unchecked, repo
+from decnet.telemetry import traced as _traced
+from decnet.web.auth import ahash_password, averify_password
+from decnet.web.dependencies import get_current_user_unchecked, invalidate_user_cache, repo
from decnet.web.db.models import ChangePasswordRequest
router = APIRouter()
@@ -18,14 +19,16 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
+@_traced("api.change_password")
async def change_password(request: ChangePasswordRequest, current_user: str = Depends(get_current_user_unchecked)) -> dict[str, str]:
_user: Optional[dict[str, Any]] = await repo.get_user_by_uuid(current_user)
- if not _user or not verify_password(request.old_password, _user["password_hash"]):
+ if not _user or not await averify_password(request.old_password, _user["password_hash"]):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Incorrect old password",
)
- _new_hash: str = get_password_hash(request.new_password)
+ _new_hash: str = await ahash_password(request.new_password)
await repo.update_user_password(current_user, _new_hash, must_change_password=False)
+ invalidate_user_cache(current_user)
return {"message": "Password updated successfully"}
diff --git a/decnet/web/router/auth/api_login.py b/decnet/web/router/auth/api_login.py
index a9db5b7..a41eaab 100644
--- a/decnet/web/router/auth/api_login.py
+++ b/decnet/web/router/auth/api_login.py
@@ -3,12 +3,13 @@ from typing import Any, Optional
from fastapi import APIRouter, HTTPException, status
+from decnet.telemetry import traced as _traced
from decnet.web.auth import (
ACCESS_TOKEN_EXPIRE_MINUTES,
+ averify_password,
create_access_token,
- verify_password,
)
-from decnet.web.dependencies import repo
+from decnet.web.dependencies import get_user_by_username_cached
from decnet.web.db.models import LoginRequest, Token
router = APIRouter()
@@ -24,9 +25,10 @@ router = APIRouter()
422: {"description": "Validation error"}
},
)
+@_traced("api.login")
async def login(request: LoginRequest) -> dict[str, Any]:
- _user: Optional[dict[str, Any]] = await repo.get_user_by_username(request.username)
- if not _user or not verify_password(request.password, _user["password_hash"]):
+ _user: Optional[dict[str, Any]] = await get_user_by_username_cached(request.username)
+ if not _user or not await averify_password(request.password, _user["password_hash"]):
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Incorrect username or password",
@@ -40,6 +42,6 @@ async def login(request: LoginRequest) -> dict[str, Any]:
)
return {
"access_token": _access_token,
- "token_type": "bearer", # nosec B105
+ "token_type": "bearer", # nosec B105 — OAuth2 token type, not a password
"must_change_password": bool(_user.get("must_change_password", False))
}
diff --git a/decnet/web/router/bounty/api_get_bounties.py b/decnet/web/router/bounty/api_get_bounties.py
index 5ff7fd2..5560181 100644
--- a/decnet/web/router/bounty/api_get_bounties.py
+++ b/decnet/web/router/bounty/api_get_bounties.py
@@ -1,21 +1,62 @@
+import asyncio
+import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
-from decnet.web.dependencies import get_current_user, repo
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import BountyResponse
router = APIRouter()
+# Cache the unfiltered default page — the UI and locust hit this constantly
+# with no query params. Filtered requests (bounty_type/search) bypass the
+# cache: they are rare, and stale results would be visible in search.
+_BOUNTY_TTL = 5.0
+_DEFAULT_LIMIT = 50
+_DEFAULT_OFFSET = 0
+_bounty_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
+_bounty_lock: Optional[asyncio.Lock] = None
+
+
+def _reset_bounty_cache() -> None:
+ global _bounty_cache, _bounty_lock
+ _bounty_cache = (None, 0.0)
+ _bounty_lock = None
+
+
+async def _get_bounty_default_cached() -> dict[str, Any]:
+ global _bounty_cache, _bounty_lock
+ value, ts = _bounty_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _BOUNTY_TTL:
+ return value
+ if _bounty_lock is None:
+ _bounty_lock = asyncio.Lock()
+ async with _bounty_lock:
+ value, ts = _bounty_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _BOUNTY_TTL:
+ return value
+ _data = await repo.get_bounties(
+ limit=_DEFAULT_LIMIT, offset=_DEFAULT_OFFSET, bounty_type=None, search=None,
+ )
+ _total = await repo.get_total_bounties(bounty_type=None, search=None)
+ value = {"total": _total, "limit": _DEFAULT_LIMIT, "offset": _DEFAULT_OFFSET, "data": _data}
+ _bounty_cache = (value, time.monotonic())
+ return value
+
@router.get("/bounty", response_model=BountyResponse, tags=["Bounty Vault"],
- responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
+ responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
+@_traced("api.get_bounties")
async def get_bounties(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),
bounty_type: Optional[str] = None,
search: Optional[str] = None,
- current_user: str = Depends(get_current_user)
+ user: dict = Depends(require_viewer)
) -> dict[str, Any]:
"""Retrieve collected bounties (harvested credentials, payloads, etc.)."""
def _norm(v: Optional[str]) -> Optional[str]:
@@ -26,6 +67,9 @@ async def get_bounties(
bt = _norm(bounty_type)
s = _norm(search)
+ if bt is None and s is None and limit == _DEFAULT_LIMIT and offset == _DEFAULT_OFFSET:
+ return await _get_bounty_default_cached()
+
_data = await repo.get_bounties(limit=limit, offset=offset, bounty_type=bt, search=s)
_total = await repo.get_total_bounties(bounty_type=bt, search=s)
return {
diff --git a/decnet/web/router/config/__init__.py b/decnet/web/router/config/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/decnet/web/router/config/api_get_config.py b/decnet/web/router/config/api_get_config.py
new file mode 100644
index 0000000..d21f474
--- /dev/null
+++ b/decnet/web/router/config/api_get_config.py
@@ -0,0 +1,124 @@
+import asyncio
+import time
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends
+
+from decnet.env import DECNET_DEVELOPER
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
+from decnet.web.db.models import UserResponse
+
+router = APIRouter()
+
+_DEFAULT_DEPLOYMENT_LIMIT = 10
+_DEFAULT_MUTATION_INTERVAL = "30m"
+
+# Cache config_limits / config_globals reads — these change on rare admin
+# writes but get polled constantly by the UI and locust.
+_STATE_TTL = 5.0
+_state_cache: dict[str, tuple[Optional[dict[str, Any]], float]] = {}
+_state_locks: dict[str, asyncio.Lock] = {}
+
+# Admin branch fetched repo.list_users() on every /config call — cache 5s,
+# invalidate on user create/update/delete so the admin UI stays consistent.
+_USERS_TTL = 5.0
+_users_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
+_users_lock: Optional[asyncio.Lock] = None
+
+
+def _reset_state_cache() -> None:
+ """Reset cached config state — used by tests."""
+ global _users_cache, _users_lock
+ _state_cache.clear()
+ # Drop any locks bound to the previous event loop — reusing one from
+ # a dead loop deadlocks the next test.
+ _state_locks.clear()
+ _users_cache = (None, 0.0)
+ _users_lock = None
+
+
+def invalidate_list_users_cache() -> None:
+ global _users_cache
+ _users_cache = (None, 0.0)
+
+
+async def _get_list_users_cached() -> list[dict[str, Any]]:
+ global _users_cache, _users_lock
+ value, ts = _users_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _USERS_TTL:
+ return value
+ if _users_lock is None:
+ _users_lock = asyncio.Lock()
+ async with _users_lock:
+ value, ts = _users_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _USERS_TTL:
+ return value
+ value = await repo.list_users()
+ _users_cache = (value, time.monotonic())
+ return value
+
+
+async def _get_state_cached(name: str) -> Optional[dict[str, Any]]:
+ entry = _state_cache.get(name)
+ now = time.monotonic()
+ if entry is not None and now - entry[1] < _STATE_TTL:
+ return entry[0]
+ lock = _state_locks.setdefault(name, asyncio.Lock())
+ async with lock:
+ entry = _state_cache.get(name)
+ now = time.monotonic()
+ if entry is not None and now - entry[1] < _STATE_TTL:
+ return entry[0]
+ value = await repo.get_state(name)
+ _state_cache[name] = (value, time.monotonic())
+ return value
+
+
+@router.get(
+ "/config",
+ tags=["Configuration"],
+ responses={
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Insufficient permissions"},
+ },
+)
+@_traced("api.get_config")
+async def api_get_config(user: dict = Depends(require_viewer)) -> dict:
+ limits_state = await _get_state_cached("config_limits")
+ globals_state = await _get_state_cached("config_globals")
+
+ deployment_limit = (
+ limits_state.get("deployment_limit", _DEFAULT_DEPLOYMENT_LIMIT)
+ if limits_state
+ else _DEFAULT_DEPLOYMENT_LIMIT
+ )
+ global_mutation_interval = (
+ globals_state.get("global_mutation_interval", _DEFAULT_MUTATION_INTERVAL)
+ if globals_state
+ else _DEFAULT_MUTATION_INTERVAL
+ )
+
+ base = {
+ "role": user["role"],
+ "deployment_limit": deployment_limit,
+ "global_mutation_interval": global_mutation_interval,
+ }
+
+ if user["role"] == "admin":
+ all_users = await _get_list_users_cached()
+ base["users"] = [
+ UserResponse(
+ uuid=u["uuid"],
+ username=u["username"],
+ role=u["role"],
+ must_change_password=u["must_change_password"],
+ ).model_dump()
+ for u in all_users
+ ]
+ if DECNET_DEVELOPER:
+ base["developer_mode"] = True
+
+ return base
diff --git a/decnet/web/router/config/api_manage_users.py b/decnet/web/router/config/api_manage_users.py
new file mode 100644
index 0000000..70e0fe9
--- /dev/null
+++ b/decnet/web/router/config/api_manage_users.py
@@ -0,0 +1,139 @@
+import uuid as _uuid
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.telemetry import traced as _traced
+from decnet.web.auth import ahash_password
+from decnet.web.dependencies import require_admin, invalidate_user_cache, repo
+from decnet.web.router.config.api_get_config import invalidate_list_users_cache
+from decnet.web.db.models import (
+ CreateUserRequest,
+ UpdateUserRoleRequest,
+ ResetUserPasswordRequest,
+ UserResponse,
+)
+
+router = APIRouter()
+
+
+@router.post(
+ "/config/users",
+ tags=["Configuration"],
+ responses={
+ 400: {"description": "Bad Request (e.g. malformed JSON)"},
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required"},
+ 409: {"description": "Username already exists"},
+ 422: {"description": "Validation error"},
+ },
+)
+@_traced("api.create_user")
+async def api_create_user(
+ req: CreateUserRequest,
+ admin: dict = Depends(require_admin),
+) -> UserResponse:
+ existing = await repo.get_user_by_username(req.username)
+ if existing:
+ raise HTTPException(status_code=409, detail="Username already exists")
+
+ user_uuid = str(_uuid.uuid4())
+ await repo.create_user({
+ "uuid": user_uuid,
+ "username": req.username,
+ "password_hash": await ahash_password(req.password),
+ "role": req.role,
+ "must_change_password": True, # nosec B105 — not a password
+ })
+ invalidate_list_users_cache()
+ return UserResponse(
+ uuid=user_uuid,
+ username=req.username,
+ role=req.role,
+ must_change_password=True,
+ )
+
+
+@router.delete(
+ "/config/users/{user_uuid}",
+ tags=["Configuration"],
+ responses={
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required / cannot delete self"},
+ 404: {"description": "User not found"},
+ },
+)
+@_traced("api.delete_user")
+async def api_delete_user(
+ user_uuid: str,
+ admin: dict = Depends(require_admin),
+) -> dict[str, str]:
+ if user_uuid == admin["uuid"]:
+ raise HTTPException(status_code=403, detail="Cannot delete your own account")
+
+ deleted = await repo.delete_user(user_uuid)
+ if not deleted:
+ raise HTTPException(status_code=404, detail="User not found")
+ invalidate_user_cache(user_uuid)
+ invalidate_list_users_cache()
+ return {"message": "User deleted"}
+
+
+@router.put(
+ "/config/users/{user_uuid}/role",
+ tags=["Configuration"],
+ responses={
+ 400: {"description": "Bad Request (e.g. malformed JSON)"},
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required / cannot change own role"},
+ 404: {"description": "User not found"},
+ 422: {"description": "Validation error"},
+ },
+)
+@_traced("api.update_user_role")
+async def api_update_user_role(
+ user_uuid: str,
+ req: UpdateUserRoleRequest,
+ admin: dict = Depends(require_admin),
+) -> dict[str, str]:
+ if user_uuid == admin["uuid"]:
+ raise HTTPException(status_code=403, detail="Cannot change your own role")
+
+ target = await repo.get_user_by_uuid(user_uuid)
+ if not target:
+ raise HTTPException(status_code=404, detail="User not found")
+
+ await repo.update_user_role(user_uuid, req.role)
+ invalidate_user_cache(user_uuid)
+ invalidate_list_users_cache()
+ return {"message": "User role updated"}
+
+
+@router.put(
+ "/config/users/{user_uuid}/reset-password",
+ tags=["Configuration"],
+ responses={
+ 400: {"description": "Bad Request (e.g. malformed JSON)"},
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required"},
+ 404: {"description": "User not found"},
+ 422: {"description": "Validation error"},
+ },
+)
+@_traced("api.reset_user_password")
+async def api_reset_user_password(
+ user_uuid: str,
+ req: ResetUserPasswordRequest,
+ admin: dict = Depends(require_admin),
+) -> dict[str, str]:
+ target = await repo.get_user_by_uuid(user_uuid)
+ if not target:
+ raise HTTPException(status_code=404, detail="User not found")
+
+ await repo.update_user_password(
+ user_uuid,
+ await ahash_password(req.new_password),
+ must_change_password=True,
+ )
+ invalidate_user_cache(user_uuid)
+ invalidate_list_users_cache()
+ return {"message": "Password reset successfully"}
diff --git a/decnet/web/router/config/api_reinit.py b/decnet/web/router/config/api_reinit.py
new file mode 100644
index 0000000..ebdd1c7
--- /dev/null
+++ b/decnet/web/router/config/api_reinit.py
@@ -0,0 +1,27 @@
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.env import DECNET_DEVELOPER
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_admin, repo
+
+router = APIRouter()
+
+
+@router.delete(
+ "/config/reinit",
+ tags=["Configuration"],
+ responses={
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required or developer mode not enabled"},
+ },
+)
+@_traced("api.reinit")
+async def api_reinit(admin: dict = Depends(require_admin)) -> dict:
+ if not DECNET_DEVELOPER:
+ raise HTTPException(status_code=403, detail="Developer mode is not enabled")
+
+ counts = await repo.purge_logs_and_bounties()
+ return {
+ "message": "Data purged",
+ "deleted": counts,
+ }
diff --git a/decnet/web/router/config/api_update_config.py b/decnet/web/router/config/api_update_config.py
new file mode 100644
index 0000000..a7feee3
--- /dev/null
+++ b/decnet/web/router/config/api_update_config.py
@@ -0,0 +1,48 @@
+from fastapi import APIRouter, Depends
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_admin, repo
+from decnet.web.db.models import DeploymentLimitRequest, GlobalMutationIntervalRequest
+
+router = APIRouter()
+
+
+@router.put(
+ "/config/deployment-limit",
+ tags=["Configuration"],
+ responses={
+ 400: {"description": "Bad Request (e.g. malformed JSON)"},
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required"},
+ 422: {"description": "Validation error"},
+ },
+)
+@_traced("api.update_deployment_limit")
+async def api_update_deployment_limit(
+ req: DeploymentLimitRequest,
+ admin: dict = Depends(require_admin),
+) -> dict[str, str]:
+ await repo.set_state("config_limits", {"deployment_limit": req.deployment_limit})
+ return {"message": "Deployment limit updated"}
+
+
+@router.put(
+ "/config/global-mutation-interval",
+ tags=["Configuration"],
+ responses={
+ 400: {"description": "Bad Request (e.g. malformed JSON)"},
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Admin access required"},
+ 422: {"description": "Validation error"},
+ },
+)
+@_traced("api.update_global_mutation_interval")
+async def api_update_global_mutation_interval(
+ req: GlobalMutationIntervalRequest,
+ admin: dict = Depends(require_admin),
+) -> dict[str, str]:
+ await repo.set_state(
+ "config_globals",
+ {"global_mutation_interval": req.global_mutation_interval},
+ )
+ return {"message": "Global mutation interval updated"}
diff --git a/decnet/web/router/fleet/api_deploy_deckies.py b/decnet/web/router/fleet/api_deploy_deckies.py
index 914a64c..5371ef7 100644
--- a/decnet/web/router/fleet/api_deploy_deckies.py
+++ b/decnet/web/router/fleet/api_deploy_deckies.py
@@ -1,14 +1,18 @@
-import logging
import os
from fastapi import APIRouter, Depends, HTTPException
-from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT, log
+from decnet.logging import get_logger
+from decnet.telemetry import traced as _traced
+from decnet.config import DEFAULT_MUTATE_INTERVAL, DecnetConfig, _ROOT
from decnet.engine import deploy as _deploy
from decnet.ini_loader import load_ini_from_string
from decnet.network import detect_interface, detect_subnet, get_host_ip
-from decnet.web.dependencies import get_current_user, repo
+from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import DeployIniRequest
+from decnet.web.router.swarm.api_deploy_swarm import dispatch_decnet_config
+
+log = get_logger("api")
router = APIRouter()
@@ -19,12 +23,15 @@ router = APIRouter()
responses={
400: {"description": "Bad Request (e.g. malformed JSON)"},
401: {"description": "Could not validate credentials"},
+ 403: {"description": "Insufficient permissions"},
409: {"description": "Configuration conflict (e.g. invalid IP allocation or network mismatch)"},
422: {"description": "Invalid INI config or schema validation error"},
- 500: {"description": "Deployment failed"}
+ 500: {"description": "Deployment failed"},
+ 502: {"description": "Partial swarm deploy failure — one or more worker hosts returned an error"},
}
)
-async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]:
+@_traced("api.deploy_deckies")
+async def api_deploy_deckies(req: DeployIniRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
from decnet.fleet import build_deckies_from_ini
try:
@@ -38,16 +45,20 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
state_dict = await repo.get_state("deployment")
ingest_log_file = os.environ.get("DECNET_INGEST_LOG_FILE")
+ config: DecnetConfig | None = None
if state_dict:
config = DecnetConfig(**state_dict["config"])
subnet_cidr = ini.subnet or config.subnet
gateway = ini.gateway or config.gateway
- host_ip = get_host_ip(config.interface)
+ iface = config.interface
+ host_ip = get_host_ip(iface)
# Always sync config log_file with current API ingestion target
if ingest_log_file:
config.log_file = ingest_log_file
else:
- # If no state exists, we need to infer network details from the INI or the host.
+ # No state yet — infer network details from the INI or the host. We
+ # defer instantiating DecnetConfig until after build_deckies_from_ini
+ # because DecnetConfig.deckies has min_length=1.
try:
iface = ini.interface or detect_interface()
subnet_cidr, gateway = ini.subnet, ini.gateway
@@ -62,16 +73,6 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
detail=f"Network configuration conflict: {e}. "
"Add a [general] section with interface=, net=, and gw= to the INI."
)
- config = DecnetConfig(
- mode="unihost",
- interface=iface,
- subnet=subnet_cidr,
- gateway=gateway,
- deckies=[],
- log_file=ingest_log_file,
- ipvlan=False,
- mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL
- )
try:
new_decky_configs = build_deckies_from_ini(
@@ -81,26 +82,94 @@ async def api_deploy_deckies(req: DeployIniRequest, current_user: str = Depends(
log.debug("deploy: build_deckies_from_ini rejected input: %s", e)
raise HTTPException(status_code=409, detail=str(e))
- # Merge deckies
- existing_deckies_map = {d.name: d for d in config.deckies}
- for new_decky in new_decky_configs:
- existing_deckies_map[new_decky.name] = new_decky
+ if config is None:
+ config = DecnetConfig(
+ mode="unihost",
+ interface=iface,
+ subnet=subnet_cidr,
+ gateway=gateway,
+ deckies=new_decky_configs,
+ log_file=ingest_log_file,
+ ipvlan=False,
+ mutate_interval=ini.mutate_interval or DEFAULT_MUTATE_INTERVAL,
+ )
- config.deckies = list(existing_deckies_map.values())
+ # The INI is the source of truth for *which* deckies exist this deploy.
+ # The old "merge with prior state" behaviour meant submitting `[decky1]`
+ # after a 3-decky run silently redeployed decky2/decky3 too — and then
+ # collided on their stale IPs ("Address already in use"). Full replace
+ # matches what the operator sees in the submitted config.
+ config.deckies = list(new_decky_configs)
- # We call deploy(config) which regenerates docker-compose and runs `up -d --remove-orphans`.
+ limits_state = await repo.get_state("config_limits")
+ deployment_limit = limits_state.get("deployment_limit", 10) if limits_state else 10
+ if len(config.deckies) > deployment_limit:
+ raise HTTPException(
+ status_code=409,
+ detail=f"Deployment would result in {len(config.deckies)} deckies, "
+ f"exceeding the configured limit of {deployment_limit}",
+ )
+
+ # Auto-mode: if we're a master with at least one enrolled/active SWARM
+ # host, shard the deckies across those workers instead of spawning docker
+ # containers on the master itself. Round-robin assignment over deckies
+ # that don't already carry a host_uuid (state from a prior swarm deploy
+ # keeps its original assignment).
+ swarm_hosts: list[dict] = []
+ if os.environ.get("DECNET_MODE", "master").lower() == "master":
+ swarm_hosts = [
+ h for h in await repo.list_swarm_hosts()
+ if h.get("status") in ("active", "enrolled") and h.get("address")
+ ]
+
+ if swarm_hosts:
+ # Carry-over from a prior deployment may reference a host_uuid that's
+ # since been decommissioned / re-enrolled at a new uuid. Drop any
+ # assignment that isn't in the currently-reachable set, then round-
+ # robin-fill the blanks — otherwise dispatch 404s on a dead uuid.
+ live_uuids = {h["uuid"] for h in swarm_hosts}
+ for d in config.deckies:
+ if d.host_uuid and d.host_uuid not in live_uuids:
+ d.host_uuid = None
+ unassigned = [d for d in config.deckies if not d.host_uuid]
+ for i, d in enumerate(unassigned):
+ d.host_uuid = swarm_hosts[i % len(swarm_hosts)]["uuid"]
+ config = config.model_copy(update={"mode": "swarm"})
+
+ try:
+ result = await dispatch_decnet_config(config, repo, dry_run=False, no_cache=False)
+ except HTTPException:
+ raise
+ except Exception as e:
+ log.exception("swarm-auto deploy dispatch failed: %s", e)
+ raise HTTPException(status_code=500, detail="Swarm dispatch failed. Check server logs.")
+
+ await repo.set_state("deployment", {
+ "config": config.model_dump(),
+ "compose_path": state_dict["compose_path"] if state_dict else "",
+ })
+
+ failed = [r for r in result.results if not r.ok]
+ if failed:
+ detail = "; ".join(f"{r.host_name}: {r.detail}" for r in failed)
+ raise HTTPException(status_code=502, detail=f"Partial swarm deploy failure — {detail}")
+ return {
+ "message": f"Deckies deployed across {len(result.results)} swarm host(s)",
+ "mode": "swarm",
+ }
+
+ # Unihost path — docker-compose on the master itself.
try:
if os.environ.get("DECNET_CONTRACT_TEST") != "true":
_deploy(config)
- # Persist new state to DB
new_state_payload = {
"config": config.model_dump(),
"compose_path": str(_ROOT / "docker-compose.yml") if not state_dict else state_dict["compose_path"]
}
await repo.set_state("deployment", new_state_payload)
except Exception as e:
- logging.getLogger("decnet.web.api").exception("Deployment failed: %s", e)
+ log.exception("Deployment failed: %s", e)
raise HTTPException(status_code=500, detail="Deployment failed. Check server logs for details.")
- return {"message": "Deckies deployed successfully"}
+ return {"message": "Deckies deployed successfully", "mode": "unihost"}
diff --git a/decnet/web/router/fleet/api_get_deckies.py b/decnet/web/router/fleet/api_get_deckies.py
index 7353373..593ff4e 100644
--- a/decnet/web/router/fleet/api_get_deckies.py
+++ b/decnet/web/router/fleet/api_get_deckies.py
@@ -1,13 +1,48 @@
-from typing import Any
+import asyncio
+import time
+from typing import Any, Optional
from fastapi import APIRouter, Depends
-from decnet.web.dependencies import get_current_user, repo
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
+# /deckies is full fleet inventory — polled by the UI and under locust.
+# Fleet state changes on deploy/teardown (seconds to minutes); a 5s window
+# collapses the read storm into one DB hit.
+_DECKIES_TTL = 5.0
+_deckies_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
+_deckies_lock: Optional[asyncio.Lock] = None
+
+
+def _reset_deckies_cache() -> None:
+ global _deckies_cache, _deckies_lock
+ _deckies_cache = (None, 0.0)
+ _deckies_lock = None
+
+
+async def _get_deckies_cached() -> list[dict[str, Any]]:
+ global _deckies_cache, _deckies_lock
+ value, ts = _deckies_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _DECKIES_TTL:
+ return value
+ if _deckies_lock is None:
+ _deckies_lock = asyncio.Lock()
+ async with _deckies_lock:
+ value, ts = _deckies_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _DECKIES_TTL:
+ return value
+ value = await repo.get_deckies()
+ _deckies_cache = (value, time.monotonic())
+ return value
+
@router.get("/deckies", tags=["Fleet Management"],
- responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
-async def get_deckies(current_user: str = Depends(get_current_user)) -> list[dict[str, Any]]:
- return await repo.get_deckies()
+ responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
+@_traced("api.get_deckies")
+async def get_deckies(user: dict = Depends(require_viewer)) -> list[dict[str, Any]]:
+ return await _get_deckies_cached()
diff --git a/decnet/web/router/fleet/api_mutate_decky.py b/decnet/web/router/fleet/api_mutate_decky.py
index e3facc6..7f2e095 100644
--- a/decnet/web/router/fleet/api_mutate_decky.py
+++ b/decnet/web/router/fleet/api_mutate_decky.py
@@ -1,8 +1,9 @@
import os
from fastapi import APIRouter, Depends, HTTPException, Path
+from decnet.telemetry import traced as _traced
from decnet.mutator import mutate_decky
-from decnet.web.dependencies import get_current_user, repo
+from decnet.web.dependencies import require_admin, repo
router = APIRouter()
@@ -10,11 +11,17 @@ router = APIRouter()
@router.post(
"/deckies/{decky_name}/mutate",
tags=["Fleet Management"],
- responses={401: {"description": "Could not validate credentials"}, 404: {"description": "Decky not found"}}
+ responses={
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Insufficient permissions"},
+ 404: {"description": "Decky not found"},
+ 422: {"description": "Path parameter validation error (decky_name must match ^[a-z0-9\\-]{1,64}$)"},
+ }
)
+@_traced("api.mutate_decky")
async def api_mutate_decky(
decky_name: str = Path(..., pattern=r"^[a-z0-9\-]{1,64}$"),
- current_user: str = Depends(get_current_user),
+ admin: dict = Depends(require_admin),
) -> dict[str, str]:
if os.environ.get("DECNET_CONTRACT_TEST") == "true":
return {"message": f"Successfully mutated {decky_name} (Contract Test Mock)"}
diff --git a/decnet/web/router/fleet/api_mutate_interval.py b/decnet/web/router/fleet/api_mutate_interval.py
index f437340..10afba9 100644
--- a/decnet/web/router/fleet/api_mutate_interval.py
+++ b/decnet/web/router/fleet/api_mutate_interval.py
@@ -1,7 +1,8 @@
from fastapi import APIRouter, Depends, HTTPException
+from decnet.telemetry import traced as _traced
from decnet.config import DecnetConfig
-from decnet.web.dependencies import get_current_user, repo
+from decnet.web.dependencies import require_admin, repo
from decnet.web.db.models import MutateIntervalRequest
router = APIRouter()
@@ -19,11 +20,13 @@ def _parse_duration(s: str) -> int:
responses={
400: {"description": "Bad Request (e.g. malformed JSON)"},
401: {"description": "Could not validate credentials"},
+ 403: {"description": "Insufficient permissions"},
404: {"description": "No active deployment or decky not found"},
422: {"description": "Validation error"}
},
)
-async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, current_user: str = Depends(get_current_user)) -> dict[str, str]:
+@_traced("api.update_mutate_interval")
+async def api_update_mutate_interval(decky_name: str, req: MutateIntervalRequest, admin: dict = Depends(require_admin)) -> dict[str, str]:
state_dict = await repo.get_state("deployment")
if not state_dict:
raise HTTPException(status_code=404, detail="No active deployment")
diff --git a/decnet/web/router/health/__init__.py b/decnet/web/router/health/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/decnet/web/router/health/api_get_health.py b/decnet/web/router/health/api_get_health.py
new file mode 100644
index 0000000..056519f
--- /dev/null
+++ b/decnet/web/router/health/api_get_health.py
@@ -0,0 +1,151 @@
+import asyncio
+import time
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends
+from fastapi.responses import ORJSONResponse
+
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
+from decnet.web.db.models import HealthResponse, ComponentHealth
+
+router = APIRouter()
+
+_CRITICAL_SERVICES = {"database", "docker", "ingestion_worker"}
+
+# Cache Docker client and health result to avoid hammering the Docker socket
+_docker_client: Optional[Any] = None
+_docker_healthy: bool = False
+_docker_detail: str = ""
+_docker_last_check: float = 0.0
+_DOCKER_CHECK_INTERVAL = 5.0 # seconds between actual Docker pings
+
+# Cache DB liveness result — under load, every request was hitting
+# repo.get_total_logs() and filling the aiosqlite queue.
+_db_component: Optional[ComponentHealth] = None
+_db_last_check: float = 0.0
+# Lazy-init — an asyncio.Lock bound to a dead event loop deadlocks any
+# later test running under a fresh loop. Create on first use.
+_db_lock: Optional[asyncio.Lock] = None
+_DB_CHECK_INTERVAL = 1.0 # seconds
+
+
+def _reset_docker_cache() -> None:
+ """Reset cached Docker state — used by tests."""
+ global _docker_client, _docker_healthy, _docker_detail, _docker_last_check
+ _docker_client = None
+ _docker_healthy = False
+ _docker_detail = ""
+ _docker_last_check = 0.0
+
+
+def _reset_db_cache() -> None:
+ """Reset cached DB liveness — used by tests."""
+ global _db_component, _db_last_check, _db_lock
+ _db_component = None
+ _db_last_check = 0.0
+ _db_lock = None
+
+
+async def _check_database_cached() -> ComponentHealth:
+ global _db_component, _db_last_check, _db_lock
+ now = time.monotonic()
+ if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL:
+ return _db_component
+ if _db_lock is None:
+ _db_lock = asyncio.Lock()
+ async with _db_lock:
+ now = time.monotonic()
+ if _db_component is not None and now - _db_last_check < _DB_CHECK_INTERVAL:
+ return _db_component
+ try:
+ await repo.get_total_logs()
+ _db_component = ComponentHealth(status="ok")
+ except Exception as exc:
+ _db_component = ComponentHealth(status="failing", detail=str(exc))
+ _db_last_check = time.monotonic()
+ return _db_component
+
+
+@router.get(
+ "/health",
+ response_model=HealthResponse,
+ tags=["Observability"],
+ responses={
+ 401: {"description": "Could not validate credentials"},
+ 403: {"description": "Insufficient permissions"},
+ 503: {"model": HealthResponse, "description": "System unhealthy"},
+ },
+)
+@_traced("api.get_health")
+async def get_health(user: dict = Depends(require_viewer)) -> Any:
+ components: dict[str, ComponentHealth] = {}
+
+ # 1. Database (cached — avoids a DB round-trip per request)
+ components["database"] = await _check_database_cached()
+
+ # 2. Background workers
+ from decnet.web.api import get_background_tasks
+ for name, task in get_background_tasks().items():
+ if task is None:
+ components[name] = ComponentHealth(status="failing", detail="not started")
+ elif task.done():
+ if task.cancelled():
+ detail = "cancelled"
+ else:
+ exc = task.exception()
+ detail = f"exited: {exc}" if exc else "exited unexpectedly"
+ components[name] = ComponentHealth(status="failing", detail=detail)
+ else:
+ components[name] = ComponentHealth(status="ok")
+
+ # 3. Docker daemon (cached — avoids creating a new client per request)
+ global _docker_client, _docker_healthy, _docker_detail, _docker_last_check
+ now = time.monotonic()
+ if now - _docker_last_check > _DOCKER_CHECK_INTERVAL:
+ try:
+ import docker
+
+ if _docker_client is None:
+ _docker_client = await asyncio.to_thread(docker.from_env)
+ await asyncio.to_thread(_docker_client.ping)
+ _docker_healthy = True
+ _docker_detail = ""
+ except Exception as exc:
+ _docker_client = None
+ _docker_healthy = False
+ _docker_detail = str(exc)
+ _docker_last_check = now
+
+ if _docker_healthy:
+ components["docker"] = ComponentHealth(status="ok")
+ else:
+ components["docker"] = ComponentHealth(status="failing", detail=_docker_detail)
+
+ # Overall status tiers:
+ # healthy — every component ok
+ # degraded — only non-critical components failing (service usable,
+ # falls back to cache or skips non-essential work)
+ # unhealthy — a critical component (db, docker, ingestion) failing;
+ # survival depends on caches
+ critical_failing = any(
+ c.status == "failing"
+ for name, c in components.items()
+ if name in _CRITICAL_SERVICES
+ )
+ noncritical_failing = any(
+ c.status == "failing"
+ for name, c in components.items()
+ if name not in _CRITICAL_SERVICES
+ )
+
+ if critical_failing:
+ overall = "unhealthy"
+ elif noncritical_failing:
+ overall = "degraded"
+ else:
+ overall = "healthy"
+
+ result = HealthResponse(status=overall, components=components)
+ status_code = 503 if overall == "unhealthy" else 200
+ return ORJSONResponse(content=result.model_dump(), status_code=status_code)
diff --git a/decnet/web/router/logs/api_get_histogram.py b/decnet/web/router/logs/api_get_histogram.py
index 6e6d877..c334987 100644
--- a/decnet/web/router/logs/api_get_histogram.py
+++ b/decnet/web/router/logs/api_get_histogram.py
@@ -1,20 +1,58 @@
+import asyncio
+import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
-from decnet.web.dependencies import get_current_user, repo
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
router = APIRouter()
+# /logs/histogram aggregates over the full logs table — expensive and
+# polled constantly by the UI. Cache only the unfiltered default call
+# (which is what the UI and locust hit); any filtered request bypasses the cache.
+_HISTOGRAM_TTL = 5.0
+_DEFAULT_INTERVAL = 15
+_histogram_cache: tuple[Optional[list[dict[str, Any]]], float] = (None, 0.0)
+_histogram_lock: Optional[asyncio.Lock] = None
+
+
+def _reset_histogram_cache() -> None:
+ global _histogram_cache, _histogram_lock
+ _histogram_cache = (None, 0.0)
+ _histogram_lock = None
+
+
+async def _get_histogram_cached() -> list[dict[str, Any]]:
+ global _histogram_cache, _histogram_lock
+ value, ts = _histogram_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _HISTOGRAM_TTL:
+ return value
+ if _histogram_lock is None:
+ _histogram_lock = asyncio.Lock()
+ async with _histogram_lock:
+ value, ts = _histogram_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _HISTOGRAM_TTL:
+ return value
+ value = await repo.get_log_histogram(
+ search=None, start_time=None, end_time=None, interval_minutes=_DEFAULT_INTERVAL,
+ )
+ _histogram_cache = (value, time.monotonic())
+ return value
+
@router.get("/logs/histogram", tags=["Logs"],
- responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
+ responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
+@_traced("api.get_logs_histogram")
async def get_logs_histogram(
search: Optional[str] = None,
start_time: Optional[str] = Query(None),
end_time: Optional[str] = Query(None),
interval_minutes: int = Query(15, ge=1),
- current_user: str = Depends(get_current_user)
+ user: dict = Depends(require_viewer)
) -> list[dict[str, Any]]:
def _norm(v: Optional[str]) -> Optional[str]:
if v in (None, "null", "NULL", "undefined", ""):
@@ -25,4 +63,6 @@ async def get_logs_histogram(
st = _norm(start_time)
et = _norm(end_time)
+ if s is None and st is None and et is None and interval_minutes == _DEFAULT_INTERVAL:
+ return await _get_histogram_cached()
return await repo.get_log_histogram(search=s, start_time=st, end_time=et, interval_minutes=interval_minutes)
diff --git a/decnet/web/router/logs/api_get_logs.py b/decnet/web/router/logs/api_get_logs.py
index 2324c8c..8bd864b 100644
--- a/decnet/web/router/logs/api_get_logs.py
+++ b/decnet/web/router/logs/api_get_logs.py
@@ -1,22 +1,57 @@
+import asyncio
+import time
from typing import Any, Optional
from fastapi import APIRouter, Depends, Query
-from decnet.web.dependencies import get_current_user, repo
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import LogsResponse
router = APIRouter()
+# Cache the unfiltered total-logs count. Filtered counts bypass the cache
+# (rare, freshness matters for search). SELECT count(*) FROM logs is a
+# full scan and gets hammered by paginating clients.
+_TOTAL_TTL = 2.0
+_total_cache: tuple[Optional[int], float] = (None, 0.0)
+_total_lock: Optional[asyncio.Lock] = None
+
+
+def _reset_total_cache() -> None:
+ global _total_cache, _total_lock
+ _total_cache = (None, 0.0)
+ _total_lock = None
+
+
+async def _get_total_logs_cached() -> int:
+ global _total_cache, _total_lock
+ value, ts = _total_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _TOTAL_TTL:
+ return value
+ if _total_lock is None:
+ _total_lock = asyncio.Lock()
+ async with _total_lock:
+ value, ts = _total_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _TOTAL_TTL:
+ return value
+ value = await repo.get_total_logs()
+ _total_cache = (value, time.monotonic())
+ return value
+
@router.get("/logs", response_model=LogsResponse, tags=["Logs"],
- responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}})
+ responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}})
+@_traced("api.get_logs")
async def get_logs(
limit: int = Query(50, ge=1, le=1000),
offset: int = Query(0, ge=0, le=2147483647),
search: Optional[str] = Query(None, max_length=512),
start_time: Optional[str] = Query(None),
end_time: Optional[str] = Query(None),
- current_user: str = Depends(get_current_user)
+ user: dict = Depends(require_viewer)
) -> dict[str, Any]:
def _norm(v: Optional[str]) -> Optional[str]:
if v in (None, "null", "NULL", "undefined", ""):
@@ -28,7 +63,10 @@ async def get_logs(
et = _norm(end_time)
_logs: list[dict[str, Any]] = await repo.get_logs(limit=limit, offset=offset, search=s, start_time=st, end_time=et)
- _total: int = await repo.get_total_logs(search=s, start_time=st, end_time=et)
+ if s is None and st is None and et is None:
+ _total: int = await _get_total_logs_cached()
+ else:
+ _total = await repo.get_total_logs(search=s, start_time=st, end_time=et)
return {
"total": _total,
"limit": limit,
diff --git a/decnet/web/router/stats/api_get_stats.py b/decnet/web/router/stats/api_get_stats.py
index f72d8ad..474331d 100644
--- a/decnet/web/router/stats/api_get_stats.py
+++ b/decnet/web/router/stats/api_get_stats.py
@@ -1,14 +1,50 @@
-from typing import Any
+import asyncio
+import time
+from typing import Any, Optional
from fastapi import APIRouter, Depends
-from decnet.web.dependencies import get_current_user, repo
+from decnet.telemetry import traced as _traced
+from decnet.web.dependencies import require_viewer, repo
from decnet.web.db.models import StatsResponse
router = APIRouter()
+# /stats is aggregate telemetry polled constantly by the UI and locust.
+# A 5s window collapses thousands of concurrent calls — each of which
+# runs SELECT count(*) FROM logs + SELECT count(DISTINCT attacker_ip) —
+# into one DB hit per window.
+_STATS_TTL = 5.0
+_stats_cache: tuple[Optional[dict[str, Any]], float] = (None, 0.0)
+_stats_lock: Optional[asyncio.Lock] = None
+
+
+def _reset_stats_cache() -> None:
+ global _stats_cache, _stats_lock
+ _stats_cache = (None, 0.0)
+ _stats_lock = None
+
+
+async def _get_stats_cached() -> dict[str, Any]:
+ global _stats_cache, _stats_lock
+ value, ts = _stats_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _STATS_TTL:
+ return value
+ if _stats_lock is None:
+ _stats_lock = asyncio.Lock()
+ async with _stats_lock:
+ value, ts = _stats_cache
+ now = time.monotonic()
+ if value is not None and now - ts < _STATS_TTL:
+ return value
+ value = await repo.get_stats_summary()
+ _stats_cache = (value, time.monotonic())
+ return value
+
@router.get("/stats", response_model=StatsResponse, tags=["Observability"],
- responses={401: {"description": "Could not validate credentials"}, 422: {"description": "Validation error"}},)
-async def get_stats(current_user: str = Depends(get_current_user)) -> dict[str, Any]:
- return await repo.get_stats_summary()
+ responses={401: {"description": "Could not validate credentials"}, 403: {"description": "Insufficient permissions"}, 422: {"description": "Validation error"}},)
+@_traced("api.get_stats")
+async def get_stats(user: dict = Depends(require_viewer)) -> dict[str, Any]:
+ return await _get_stats_cached()
diff --git a/decnet/web/router/stream/api_stream_events.py b/decnet/web/router/stream/api_stream_events.py
index 0690b6a..f463703 100644
--- a/decnet/web/router/stream/api_stream_events.py
+++ b/decnet/web/router/stream/api_stream_events.py
@@ -1,19 +1,49 @@
-import json
import asyncio
-import logging
+
+import orjson
from typing import AsyncGenerator, Optional
from fastapi import APIRouter, Depends, Query, Request
from fastapi.responses import StreamingResponse
from decnet.env import DECNET_DEVELOPER
-from decnet.web.dependencies import get_stream_user, repo
+from decnet.logging import get_logger
+from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
+from decnet.web.dependencies import require_stream_viewer, repo
-log = logging.getLogger(__name__)
+log = get_logger("api")
router = APIRouter()
+def _build_trace_links(logs: list[dict]) -> list:
+ """Build OTEL span links from persisted trace_id/span_id in log rows.
+
+ Returns an empty list when tracing is disabled (no OTEL imports).
+ """
+ try:
+ from opentelemetry.trace import Link, SpanContext, TraceFlags
+ except ImportError:
+ return []
+ links: list[Link] = []
+ for entry in logs:
+ tid = entry.get("trace_id")
+ sid = entry.get("span_id")
+ if not tid or not sid or tid == "0":
+ continue
+ try:
+ ctx = SpanContext(
+ trace_id=int(tid, 16),
+ span_id=int(sid, 16),
+ is_remote=True,
+ trace_flags=TraceFlags(TraceFlags.SAMPLED),
+ )
+ links.append(Link(ctx))
+ except (ValueError, TypeError):
+ continue
+ return links
+
+
@router.get("/stream", tags=["Observability"],
responses={
200: {
@@ -21,9 +51,11 @@ router = APIRouter()
"description": "Real-time Server-Sent Events (SSE) stream"
},
401: {"description": "Could not validate credentials"},
+ 403: {"description": "Insufficient permissions"},
422: {"description": "Validation error"}
},
)
+@_traced("api.stream_events")
async def stream_events(
request: Request,
last_event_id: int = Query(0, alias="lastEventId"),
@@ -31,26 +63,33 @@ async def stream_events(
start_time: Optional[str] = None,
end_time: Optional[str] = None,
max_output: Optional[int] = Query(None, alias="maxOutput"),
- current_user: str = Depends(get_stream_user)
+ user: dict = Depends(require_stream_viewer)
) -> StreamingResponse:
+ # Prefetch the initial snapshot before entering the streaming generator.
+ # With asyncmy (pure async TCP I/O), the first DB await inside the generator
+ # fires immediately after the ASGI layer sends the keepalive chunk — the HTTP
+ # write and the MySQL read compete for asyncio I/O callbacks and the MySQL
+ # callback can stall. Running these here (normal async context, no streaming)
+ # avoids that race entirely. aiosqlite is immune because it runs SQLite in a
+ # thread, decoupled from the event loop's I/O scheduler.
+ _start_id = last_event_id if last_event_id != 0 else await repo.get_max_log_id()
+ _initial_stats = await repo.get_stats_summary()
+ _initial_histogram = await repo.get_log_histogram(
+ search=search, start_time=start_time, end_time=end_time, interval_minutes=15,
+ )
+
async def event_generator() -> AsyncGenerator[str, None]:
- last_id = last_event_id
+ last_id = _start_id
stats_interval_sec = 10
loops_since_stats = 0
emitted_chunks = 0
try:
- if last_id == 0:
- last_id = await repo.get_max_log_id()
+ yield ": keepalive\n\n" # flush headers immediately
- # Emit initial snapshot immediately so the client never needs to poll /stats
- stats = await repo.get_stats_summary()
- yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n"
- histogram = await repo.get_log_histogram(
- search=search, start_time=start_time,
- end_time=end_time, interval_minutes=15,
- )
- yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n"
+ # Emit pre-fetched initial snapshot — no DB calls in generator until the loop
+ yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': _initial_stats}).decode()}\n\n"
+ yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': _initial_histogram}).decode()}\n\n"
while True:
if DECNET_DEVELOPER and max_output is not None:
@@ -68,17 +107,25 @@ async def stream_events(
)
if new_logs:
last_id = max(entry["id"] for entry in new_logs)
- yield f"event: message\ndata: {json.dumps({'type': 'logs', 'data': new_logs})}\n\n"
+ # Create a span linking back to the ingestion traces
+ # stored in each log row, closing the pipeline gap.
+ _links = _build_trace_links(new_logs)
+ _tracer = _get_tracer("sse")
+ with _tracer.start_as_current_span(
+ "sse.emit_logs", links=_links,
+ attributes={"log_count": len(new_logs)},
+ ):
+ yield f"event: message\ndata: {orjson.dumps({'type': 'logs', 'data': new_logs}).decode()}\n\n"
loops_since_stats = stats_interval_sec
if loops_since_stats >= stats_interval_sec:
stats = await repo.get_stats_summary()
- yield f"event: message\ndata: {json.dumps({'type': 'stats', 'data': stats})}\n\n"
+ yield f"event: message\ndata: {orjson.dumps({'type': 'stats', 'data': stats}).decode()}\n\n"
histogram = await repo.get_log_histogram(
search=search, start_time=start_time,
end_time=end_time, interval_minutes=15,
)
- yield f"event: message\ndata: {json.dumps({'type': 'histogram', 'data': histogram})}\n\n"
+ yield f"event: message\ndata: {orjson.dumps({'type': 'histogram', 'data': histogram}).decode()}\n\n"
loops_since_stats = 0
loops_since_stats += 1
@@ -88,6 +135,13 @@ async def stream_events(
pass
except Exception:
log.exception("SSE stream error for user %s", last_event_id)
- yield f"event: error\ndata: {json.dumps({'type': 'error', 'message': 'Stream interrupted'})}\n\n"
+ yield f"event: error\ndata: {orjson.dumps({'type': 'error', 'message': 'Stream interrupted'}).decode()}\n\n"
- return StreamingResponse(event_generator(), media_type="text/event-stream")
+ return StreamingResponse(
+ event_generator(),
+ media_type="text/event-stream",
+ headers={
+ "Cache-Control": "no-cache",
+ "X-Accel-Buffering": "no",
+ },
+ )
diff --git a/decnet/web/router/swarm/__init__.py b/decnet/web/router/swarm/__init__.py
new file mode 100644
index 0000000..7d3b4c2
--- /dev/null
+++ b/decnet/web/router/swarm/__init__.py
@@ -0,0 +1,47 @@
+"""Swarm controller routers.
+
+One file per endpoint, aggregated under the ``/swarm`` prefix. Mounted
+onto the swarm-api FastAPI app (``decnet/web/swarm_api.py``), a separate
+process from the main DECNET API so swarm failures cannot cascade into
+log ingestion / dashboard serving.
+"""
+from fastapi import APIRouter
+
+from .api_enroll_host import router as enroll_host_router
+from .api_list_hosts import router as list_hosts_router
+from .api_get_host import router as get_host_router
+from .api_decommission_host import router as decommission_host_router
+from .api_deploy_swarm import router as deploy_swarm_router
+from .api_teardown_swarm import router as teardown_swarm_router
+from .api_get_swarm_health import router as get_swarm_health_router
+from .api_check_hosts import router as check_hosts_router
+from .api_heartbeat import router as heartbeat_router
+from .api_list_deckies import router as list_deckies_router
+
+swarm_router = APIRouter(
+ prefix="/swarm",
+ # Error responses that every swarm route can surface. Route-level
+ # `responses=` entries still override/extend these for route-specific
+ # codes (e.g. 409 on /enroll).
+ responses={
+ 400: {"description": "Malformed request"},
+ 403: {"description": "Peer cert missing or fingerprint mismatch"},
+ 404: {"description": "Referenced host does not exist"},
+ },
+)
+
+# Hosts
+swarm_router.include_router(enroll_host_router)
+swarm_router.include_router(list_hosts_router)
+swarm_router.include_router(get_host_router)
+swarm_router.include_router(decommission_host_router)
+
+# Deployments
+swarm_router.include_router(deploy_swarm_router)
+swarm_router.include_router(teardown_swarm_router)
+swarm_router.include_router(list_deckies_router)
+
+# Health
+swarm_router.include_router(get_swarm_health_router)
+swarm_router.include_router(check_hosts_router)
+swarm_router.include_router(heartbeat_router)
diff --git a/decnet/web/router/swarm/api_check_hosts.py b/decnet/web/router/swarm/api_check_hosts.py
new file mode 100644
index 0000000..f058567
--- /dev/null
+++ b/decnet/web/router/swarm/api_check_hosts.py
@@ -0,0 +1,61 @@
+"""POST /swarm/check — active mTLS probe of every enrolled worker.
+
+Updates ``SwarmHost.status`` and ``last_heartbeat`` for each host based
+on the outcome of the probe.
+"""
+from __future__ import annotations
+
+import asyncio
+from datetime import datetime, timezone
+from typing import Any
+
+from fastapi import APIRouter, Depends
+
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import SwarmCheckResponse, SwarmHostHealth
+
+log = get_logger("swarm.check")
+
+router = APIRouter()
+
+
+@router.post("/check", response_model=SwarmCheckResponse, tags=["Swarm Health"])
+async def api_check_hosts(
+ repo: BaseRepository = Depends(get_repo),
+) -> SwarmCheckResponse:
+ hosts = await repo.list_swarm_hosts()
+
+ async def _probe(host: dict[str, Any]) -> SwarmHostHealth:
+ try:
+ async with AgentClient(host=host) as agent:
+ body = await agent.health()
+ await repo.update_swarm_host(
+ host["uuid"],
+ {
+ "status": "active",
+ "last_heartbeat": datetime.now(timezone.utc),
+ },
+ )
+ return SwarmHostHealth(
+ host_uuid=host["uuid"],
+ name=host["name"],
+ address=host["address"],
+ reachable=True,
+ detail=body,
+ )
+ except Exception as exc:
+ log.warning("swarm.check unreachable host=%s err=%s", host["name"], exc)
+ await repo.update_swarm_host(host["uuid"], {"status": "unreachable"})
+ return SwarmHostHealth(
+ host_uuid=host["uuid"],
+ name=host["name"],
+ address=host["address"],
+ reachable=False,
+ detail=str(exc),
+ )
+
+ results = await asyncio.gather(*(_probe(h) for h in hosts))
+ return SwarmCheckResponse(results=list(results))
diff --git a/decnet/web/router/swarm/api_decommission_host.py b/decnet/web/router/swarm/api_decommission_host.py
new file mode 100644
index 0000000..7e6c669
--- /dev/null
+++ b/decnet/web/router/swarm/api_decommission_host.py
@@ -0,0 +1,63 @@
+"""DELETE /swarm/hosts/{uuid} — decommission a worker.
+
+Removes the DeckyShard rows bound to the host (portable cascade — MySQL
+and SQLite both honor it via the repo layer), deletes the SwarmHost row,
+and best-effort-cleans the per-worker bundle directory on the master.
+
+Also asks the worker agent to wipe its own install (keeping logs). A
+dead/unreachable worker does not block master-side cleanup.
+"""
+from __future__ import annotations
+
+import pathlib
+
+from fastapi import APIRouter, Depends, HTTPException, status
+
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+
+log = get_logger("swarm.decommission")
+router = APIRouter()
+
+
+@router.delete(
+ "/hosts/{uuid}",
+ status_code=status.HTTP_204_NO_CONTENT,
+ tags=["Swarm Hosts"],
+ responses={404: {"description": "No host with this UUID is enrolled"}},
+)
+async def api_decommission_host(
+ uuid: str,
+ repo: BaseRepository = Depends(get_repo),
+) -> None:
+ row = await repo.get_swarm_host_by_uuid(uuid)
+ if row is None:
+ raise HTTPException(status_code=404, detail="host not found")
+
+ try:
+ async with AgentClient(host=row) as agent:
+ await agent.self_destruct()
+ except Exception:
+ log.exception(
+ "decommission: self-destruct dispatch failed host=%s — "
+ "proceeding with master-side cleanup anyway",
+ row.get("name"),
+ )
+
+ await repo.delete_decky_shards_for_host(uuid)
+ await repo.delete_swarm_host(uuid)
+
+ # Best-effort bundle cleanup; if the dir was moved manually, don't fail.
+ bundle_dir = pathlib.Path(row.get("cert_bundle_path") or "")
+ if bundle_dir.is_dir():
+ for child in bundle_dir.iterdir():
+ try:
+ child.unlink()
+ except OSError:
+ pass
+ try:
+ bundle_dir.rmdir()
+ except OSError:
+ pass
diff --git a/decnet/web/router/swarm/api_deploy_swarm.py b/decnet/web/router/swarm/api_deploy_swarm.py
new file mode 100644
index 0000000..1142df8
--- /dev/null
+++ b/decnet/web/router/swarm/api_deploy_swarm.py
@@ -0,0 +1,155 @@
+"""POST /swarm/deploy — shard a DecnetConfig across enrolled workers.
+
+Per worker we build a filtered copy containing only the deckies assigned
+to that worker (via ``host_uuid``), then POST it to the worker agent.
+The caller is expected to have already set ``host_uuid`` on every decky;
+if any decky arrives without one, we fail fast. Auto-sharding lives in
+the CLI layer, not here.
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+from datetime import datetime, timezone
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.config import DecnetConfig, DeckyConfig
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import (
+ SwarmDeployRequest,
+ SwarmDeployResponse,
+ SwarmHostResult,
+)
+
+log = get_logger("swarm.deploy")
+
+router = APIRouter()
+
+
+def _shard_by_host(config: DecnetConfig) -> dict[str, list[DeckyConfig]]:
+ buckets: dict[str, list[DeckyConfig]] = {}
+ for d in config.deckies:
+ if not d.host_uuid:
+ raise HTTPException(
+ status_code=400,
+ detail=f"decky '{d.name}' has no host_uuid — caller must shard before dispatch",
+ )
+ buckets.setdefault(d.host_uuid, []).append(d)
+ return buckets
+
+
+def _worker_config(
+ base: DecnetConfig,
+ shard: list[DeckyConfig],
+ host: dict[str, Any],
+) -> DecnetConfig:
+ updates: dict[str, Any] = {"deckies": shard}
+ # Per-host driver opt-in (Wi-Fi-bridged VMs can't use macvlan — see
+ # SwarmHost.use_ipvlan). Never downgrade: if the operator picked ipvlan
+ # at the deploy level, keep it regardless of the per-host flag.
+ if host.get("use_ipvlan"):
+ updates["ipvlan"] = True
+ return base.model_copy(update=updates)
+
+
+async def dispatch_decnet_config(
+ config: DecnetConfig,
+ repo: BaseRepository,
+ dry_run: bool = False,
+ no_cache: bool = False,
+) -> SwarmDeployResponse:
+ """Shard ``config`` by ``host_uuid`` and dispatch to each worker in parallel.
+
+ Shared between POST /swarm/deploy (explicit swarm call) and the auto-swarm
+ branch of POST /deckies/deploy.
+ """
+ buckets = _shard_by_host(config)
+
+ hosts: dict[str, dict[str, Any]] = {}
+ for host_uuid in buckets:
+ row = await repo.get_swarm_host_by_uuid(host_uuid)
+ if row is None:
+ raise HTTPException(status_code=404, detail=f"unknown host_uuid: {host_uuid}")
+ hosts[host_uuid] = row
+
+ async def _dispatch(host_uuid: str, shard: list[DeckyConfig]) -> SwarmHostResult:
+ host = hosts[host_uuid]
+ cfg = _worker_config(config, shard, host)
+ try:
+ async with AgentClient(host=host) as agent:
+ body = await agent.deploy(cfg, dry_run=dry_run, no_cache=no_cache)
+ for d in shard:
+ await repo.upsert_decky_shard(
+ {
+ "decky_name": d.name,
+ "host_uuid": host_uuid,
+ "services": json.dumps(d.services),
+ "decky_config": d.model_dump_json(),
+ "decky_ip": d.ip,
+ "state": "running" if not dry_run else "pending",
+ "last_error": None,
+ "updated_at": datetime.now(timezone.utc),
+ }
+ )
+ await repo.update_swarm_host(host_uuid, {"status": "active"})
+ return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=True, detail=body)
+ except Exception as exc:
+ log.exception("swarm.deploy dispatch failed host=%s", host["name"])
+ # Compose-up is partial-success-friendly: one decky failing to
+ # build doesn't roll back the ones that already came up. Ask the
+ # agent which containers actually exist before painting the whole
+ # shard red — otherwise decky1 and decky2 look "failed" even
+ # though they're live on the worker.
+ runtime: dict[str, Any] = {}
+ try:
+ async with AgentClient(host=host) as probe:
+ snap = await probe.status()
+ runtime = snap.get("runtime") or {}
+ except Exception:
+ log.warning("swarm.deploy: runtime probe failed host=%s — marking shard failed", host["name"])
+ for d in shard:
+ rstate = runtime.get(d.name) or {}
+ is_up = bool(rstate.get("running"))
+ await repo.upsert_decky_shard(
+ {
+ "decky_name": d.name,
+ "host_uuid": host_uuid,
+ "services": json.dumps(d.services),
+ "decky_config": d.model_dump_json(),
+ "decky_ip": d.ip,
+ "state": "running" if is_up else "failed",
+ "last_error": None if is_up else str(exc)[:512],
+ "updated_at": datetime.now(timezone.utc),
+ }
+ )
+ return SwarmHostResult(host_uuid=host_uuid, host_name=host["name"], ok=False, detail=str(exc))
+
+ results = await asyncio.gather(
+ *(_dispatch(uuid_, shard) for uuid_, shard in buckets.items())
+ )
+ return SwarmDeployResponse(results=list(results))
+
+
+@router.post(
+ "/deploy",
+ response_model=SwarmDeployResponse,
+ tags=["Swarm Deployments"],
+ responses={
+ 400: {"description": "Deployment mode must be 'swarm'"},
+ 404: {"description": "A referenced host_uuid is not enrolled"},
+ },
+)
+async def api_deploy_swarm(
+ req: SwarmDeployRequest,
+ repo: BaseRepository = Depends(get_repo),
+) -> SwarmDeployResponse:
+ if req.config.mode != "swarm":
+ raise HTTPException(status_code=400, detail="mode must be 'swarm'")
+ return await dispatch_decnet_config(
+ req.config, repo, dry_run=req.dry_run, no_cache=req.no_cache
+ )
diff --git a/decnet/web/router/swarm/api_enroll_host.py b/decnet/web/router/swarm/api_enroll_host.py
new file mode 100644
index 0000000..351a922
--- /dev/null
+++ b/decnet/web/router/swarm/api_enroll_host.py
@@ -0,0 +1,100 @@
+"""POST /swarm/enroll — issue a worker cert bundle and register the host.
+
+Enrollment is master-driven: the controller holds the CA private key,
+generates a fresh worker keypair + CA-signed cert, and returns the full
+bundle to the operator. Bundle delivery to the worker (scp/sshpass/etc.)
+is outside this process's trust boundary.
+
+Rationale: the worker agent speaks ONLY mTLS; there is no pre-auth
+bootstrap endpoint, so nothing to attack before the worker is enrolled.
+"""
+from __future__ import annotations
+
+import uuid as _uuid
+from datetime import datetime, timezone
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, status
+
+from decnet.swarm import pki
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import SwarmEnrolledBundle, SwarmEnrollRequest, SwarmUpdaterBundle
+
+router = APIRouter()
+
+
+@router.post(
+ "/enroll",
+ response_model=SwarmEnrolledBundle,
+ status_code=status.HTTP_201_CREATED,
+ tags=["Swarm Hosts"],
+ responses={
+ 400: {"description": "Bad Request (malformed JSON body)"},
+ 409: {"description": "A worker with this name is already enrolled"},
+ 422: {"description": "Request body validation error"},
+ },
+)
+async def api_enroll_host(
+ req: SwarmEnrollRequest,
+ repo: BaseRepository = Depends(get_repo),
+) -> SwarmEnrolledBundle:
+ existing = await repo.get_swarm_host_by_name(req.name)
+ if existing is not None:
+ raise HTTPException(status_code=409, detail=f"Worker '{req.name}' is already enrolled")
+
+ ca = pki.ensure_ca()
+ sans = list({*req.sans, req.address, req.name})
+ issued = pki.issue_worker_cert(ca, req.name, sans)
+
+ # Persist the bundle under ~/.decnet/ca/workers/<name>/ so the master
+ # can replay it if the operator loses the original delivery.
+ bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.name
+ pki.write_worker_bundle(issued, bundle_dir)
+
+ updater_view: Optional[SwarmUpdaterBundle] = None
+ updater_fp: Optional[str] = None
+ if req.issue_updater_bundle:
+ updater_cn = f"updater@{req.name}"
+ updater_sans = list({*sans, updater_cn, "127.0.0.1"})
+ updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
+ # Persist alongside the worker bundle for replay.
+ updater_dir = bundle_dir / "updater"
+ updater_dir.mkdir(parents=True, exist_ok=True)
+ (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
+ (updater_dir / "updater.key").write_bytes(updater_issued.key_pem)
+ import os as _os
+ _os.chmod(updater_dir / "updater.key", 0o600)
+ updater_fp = updater_issued.fingerprint_sha256
+ updater_view = SwarmUpdaterBundle(
+ fingerprint=updater_fp,
+ updater_cert_pem=updater_issued.cert_pem.decode(),
+ updater_key_pem=updater_issued.key_pem.decode(),
+ )
+
+ host_uuid = str(_uuid.uuid4())
+ await repo.add_swarm_host(
+ {
+ "uuid": host_uuid,
+ "name": req.name,
+ "address": req.address,
+ "agent_port": req.agent_port,
+ "status": "enrolled",
+ "client_cert_fingerprint": issued.fingerprint_sha256,
+ "updater_cert_fingerprint": updater_fp,
+ "cert_bundle_path": str(bundle_dir),
+ "enrolled_at": datetime.now(timezone.utc),
+ "notes": req.notes,
+ }
+ )
+ return SwarmEnrolledBundle(
+ host_uuid=host_uuid,
+ name=req.name,
+ address=req.address,
+ agent_port=req.agent_port,
+ fingerprint=issued.fingerprint_sha256,
+ ca_cert_pem=issued.ca_cert_pem.decode(),
+ worker_cert_pem=issued.cert_pem.decode(),
+ worker_key_pem=issued.key_pem.decode(),
+ updater=updater_view,
+ )
diff --git a/decnet/web/router/swarm/api_get_host.py b/decnet/web/router/swarm/api_get_host.py
new file mode 100644
index 0000000..556b6ee
--- /dev/null
+++ b/decnet/web/router/swarm/api_get_host.py
@@ -0,0 +1,26 @@
+"""GET /swarm/hosts/{uuid} — fetch a single worker by UUID."""
+from __future__ import annotations
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import SwarmHostView
+
+router = APIRouter()
+
+
+@router.get(
+ "/hosts/{uuid}",
+ response_model=SwarmHostView,
+ tags=["Swarm Hosts"],
+ responses={404: {"description": "No host with this UUID is enrolled"}},
+)
+async def api_get_host(
+ uuid: str,
+ repo: BaseRepository = Depends(get_repo),
+) -> SwarmHostView:
+ row = await repo.get_swarm_host_by_uuid(uuid)
+ if row is None:
+ raise HTTPException(status_code=404, detail="host not found")
+ return SwarmHostView(**row)
diff --git a/decnet/web/router/swarm/api_get_swarm_health.py b/decnet/web/router/swarm/api_get_swarm_health.py
new file mode 100644
index 0000000..5960136
--- /dev/null
+++ b/decnet/web/router/swarm/api_get_swarm_health.py
@@ -0,0 +1,11 @@
+"""GET /swarm/health — controller liveness (no I/O)."""
+from __future__ import annotations
+
+from fastapi import APIRouter
+
+router = APIRouter()
+
+
+@router.get("/health", tags=["Swarm Health"])
+async def api_get_swarm_health() -> dict[str, str]:
+ return {"status": "ok", "role": "swarm-controller"}
diff --git a/decnet/web/router/swarm/api_heartbeat.py b/decnet/web/router/swarm/api_heartbeat.py
new file mode 100644
index 0000000..52487ca
--- /dev/null
+++ b/decnet/web/router/swarm/api_heartbeat.py
@@ -0,0 +1,148 @@
+"""POST /swarm/heartbeat — agent→master liveness + decky snapshot refresh.
+
+Workers call this every ~30 s with the output of ``executor.status()``.
+The master bumps ``SwarmHost.last_heartbeat`` and re-upserts each
+``DeckyShard`` with the fresh ``DeckyConfig`` snapshot + runtime-derived
+state so the dashboard stays current without a master-pull probe.
+
+Security: CA-signed mTLS is necessary but not sufficient — a
+decommissioned worker's still-valid cert must not resurrect ghost
+shards. We pin the presented peer cert's SHA-256 to the
+``client_cert_fingerprint`` stored for the claimed ``host_uuid``.
+Mismatch (or decommissioned host) → 403.
+"""
+from __future__ import annotations
+
import base64
import hashlib
import json
from datetime import datetime, timezone
from typing import Any, Optional

from fastapi import APIRouter, Depends, HTTPException, Request
from pydantic import BaseModel

from decnet.config import DeckyConfig
from decnet.logging import get_logger
from decnet.web.db.repository import BaseRepository
from decnet.web.dependencies import get_repo
+
+log = get_logger("swarm.heartbeat")
+
+router = APIRouter()
+
+
class HeartbeatRequest(BaseModel):
    """Body of POST /swarm/heartbeat, sent by a worker every ~30 s."""
    # UUID the worker claims; verified against the pinned client cert.
    host_uuid: str
    # Optional agent build/version string (informational).
    agent_version: Optional[str] = None
    # Raw output of the worker's ``executor.status()`` call.
    status: dict[str, Any]
+
+
def _extract_peer_fingerprint(scope: dict[str, Any]) -> Optional[str]:
    """Pull the peer cert's SHA-256 fingerprint from an ASGI scope.

    Tries two extraction paths because uvicorn has historically stashed
    the TLS peer cert in different scope keys across versions:

    1. Primary: ``scope["extensions"]["tls"]["client_cert_chain"][0]``
       (ASGI TLS extension). The ASGI spec defines chain entries as
       PEM-encoded *strings*, while some servers hand back raw DER
       bytes — both are normalized to DER below, since hashing the PEM
       text would raise on str and could never match the DER-based
       fingerprint stored at enrollment.
    2. Fallback: the transport object's ``ssl_object.getpeercert(binary_form=True)``
       (older uvicorn builds + some other servers), which yields DER bytes.

    Returns the lowercase hex SHA-256 of the DER-encoded cert, or None
    when neither path yields a cert. The endpoint fails closed on None.
    """
    peer_der: Optional[bytes] = None
    source = "none"

    try:
        chain = scope.get("extensions", {}).get("tls", {}).get("client_cert_chain")
        if chain:
            first = chain[0]
            if isinstance(first, str):
                # PEM armor → DER: drop the ----- header/footer lines and
                # base64-decode the body.
                b64_body = "".join(
                    line for line in first.splitlines()
                    if line and not line.startswith("-----")
                )
                peer_der = base64.b64decode(b64_body)
            else:
                peer_der = first
            source = "primary"
    except Exception:
        peer_der = None
        source = "none"

    if peer_der is None:
        transport = scope.get("transport")
        try:
            ssl_obj = transport.get_extra_info("ssl_object") if transport else None
            if ssl_obj is not None:
                peer_der = ssl_obj.getpeercert(binary_form=True)
            if peer_der:
                source = "fallback"
        except Exception:
            peer_der = None

    if not peer_der:
        # Was hardcoded "via none"; report which path was last attempted.
        log.debug("heartbeat: peer cert extraction failed via %s", source)
        return None

    log.debug("heartbeat: peer cert extraction succeeded via %s", source)
    return hashlib.sha256(peer_der).hexdigest().lower()
+
+
async def _verify_peer_matches_host(
    request: Request, host_uuid: str, repo: BaseRepository
) -> dict[str, Any]:
    """Resolve *host_uuid* and require that the TLS peer cert's SHA-256
    matches the fingerprint pinned at enrollment.

    Raises 404 for an unknown host; 403 when the cert is missing, the
    host has no pinned fingerprint, or the fingerprints differ.
    """
    record = await repo.get_swarm_host_by_uuid(host_uuid)
    if record is None:
        raise HTTPException(status_code=404, detail="unknown host")

    presented = _extract_peer_fingerprint(request.scope)
    if presented is None:
        # Fail closed: no extractable cert means no identity.
        raise HTTPException(status_code=403, detail="peer cert unavailable")

    pinned = (record.get("client_cert_fingerprint") or "").lower()
    if not pinned or presented != pinned:
        raise HTTPException(status_code=403, detail="cert fingerprint mismatch")
    return record
+
+
@router.post(
    "/heartbeat",
    status_code=204,
    tags=["Swarm Health"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        403: {"description": "Peer cert missing, or its fingerprint does not match the host's pinned cert"},
        404: {"description": "host_uuid is not enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def heartbeat(
    req: HeartbeatRequest,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Record a worker heartbeat and refresh its decky shard snapshots.

    Returns 204 with no body. 403/404 are raised by
    _verify_peer_matches_host when the caller's TLS identity does not
    match the claimed host_uuid.
    """
    # Identity pin check first — a valid CA-signed cert alone is not enough.
    await _verify_peer_matches_host(request, req.host_uuid, repo)

    # One timestamp for the whole batch so host + shard rows agree.
    now = datetime.now(timezone.utc)
    await repo.update_swarm_host(
        req.host_uuid,
        {"status": "active", "last_heartbeat": now},
    )

    status_body = req.status or {}
    # Nothing deployed on the worker yet — the liveness bump is all we record.
    if not status_body.get("deployed"):
        return

    runtime = status_body.get("runtime") or {}
    for decky_dict in status_body.get("deckies") or []:
        try:
            d = DeckyConfig(**decky_dict)
        except Exception:
            # Skip (not fail) a single malformed decky so the rest of the
            # batch still lands.
            log.exception("heartbeat: skipping malformed decky payload host=%s", req.host_uuid)
            continue
        rstate = runtime.get(d.name) or {}
        is_up = bool(rstate.get("running"))
        await repo.upsert_decky_shard(
            {
                "decky_name": d.name,
                "host_uuid": req.host_uuid,
                "services": json.dumps(d.services),
                "decky_config": d.model_dump_json(),
                "decky_ip": d.ip,
                # "degraded", not "stopped": the shard exists but isn't running.
                "state": "running" if is_up else "degraded",
                "last_error": None,
                "last_seen": now,
                "updated_at": now,
            }
        )
diff --git a/decnet/web/router/swarm/api_list_deckies.py b/decnet/web/router/swarm/api_list_deckies.py
new file mode 100644
index 0000000..43a5d98
--- /dev/null
+++ b/decnet/web/router/swarm/api_list_deckies.py
@@ -0,0 +1,55 @@
+"""GET /swarm/deckies — list decky shards with their worker host's identity.
+
+The DeckyShard table maps decky_name → host_uuid; users want to see which
+deckies are running and *where*, so we enrich each shard with the owning
+host's name/address/status from SwarmHost rather than making callers do
+the join themselves.
+"""
+from __future__ import annotations
+
+from typing import Optional
+
+from fastapi import APIRouter, Depends
+
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import DeckyShardView
+
+router = APIRouter()
+
+
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Deckies"])
async def api_list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    """List decky shards enriched with the owning worker's identity.

    *host_uuid* is pushed down to the repo query; *state* is applied here.
    """
    shard_rows = await repo.list_decky_shards(host_uuid)
    host_by_uuid = {h["uuid"]: h for h in await repo.list_swarm_hosts()}

    views: list[DeckyShardView] = []
    for row in shard_rows:
        if state and row.get("state") != state:
            continue
        owner = host_by_uuid.get(row["host_uuid"], {})
        view = DeckyShardView(
            decky_name=row["decky_name"],
            decky_ip=row.get("decky_ip"),
            host_uuid=row["host_uuid"],
            host_name=owner.get("name") or "",
            host_address=owner.get("address") or "",
            host_status=owner.get("status") or "unknown",
            services=row.get("services") or [],
            state=row.get("state") or "pending",
            last_error=row.get("last_error"),
            compose_hash=row.get("compose_hash"),
            updated_at=row["updated_at"],
            hostname=row.get("hostname"),
            distro=row.get("distro"),
            archetype=row.get("archetype"),
            service_config=row.get("service_config") or {},
            mutate_interval=row.get("mutate_interval"),
            last_mutated=row.get("last_mutated") or 0.0,
            last_seen=row.get("last_seen"),
        )
        views.append(view)
    return views
diff --git a/decnet/web/router/swarm/api_list_hosts.py b/decnet/web/router/swarm/api_list_hosts.py
new file mode 100644
index 0000000..acc7ba9
--- /dev/null
+++ b/decnet/web/router/swarm/api_list_hosts.py
@@ -0,0 +1,21 @@
+"""GET /swarm/hosts — list enrolled workers, optionally filtered by status."""
+from __future__ import annotations
+
+from typing import Optional
+
+from fastapi import APIRouter, Depends
+
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import SwarmHostView
+
+router = APIRouter()
+
+
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Hosts"])
async def api_list_hosts(
    host_status: Optional[str] = None,
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """List enrolled workers; *host_status* filters at the repo layer."""
    records = await repo.list_swarm_hosts(host_status)
    views: list[SwarmHostView] = []
    for record in records:
        views.append(SwarmHostView(**record))
    return views
diff --git a/decnet/web/router/swarm/api_teardown_swarm.py b/decnet/web/router/swarm/api_teardown_swarm.py
new file mode 100644
index 0000000..d62f013
--- /dev/null
+++ b/decnet/web/router/swarm/api_teardown_swarm.py
@@ -0,0 +1,60 @@
+"""POST /swarm/teardown — tear down one or all enrolled workers."""
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+from decnet.web.db.models import (
+ SwarmDeployResponse,
+ SwarmHostResult,
+ SwarmTeardownRequest,
+)
+
+log = get_logger("swarm.teardown")
+
+router = APIRouter()
+
+
@router.post(
    "/teardown",
    response_model=SwarmDeployResponse,
    tags=["Swarm Deployments"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        404: {"description": "A targeted host does not exist"},
        422: {"description": "Request body validation error"},
    },
)
async def api_teardown_swarm(
    req: SwarmTeardownRequest,
    repo: BaseRepository = Depends(get_repo),
) -> SwarmDeployResponse:
    """Tear down one worker (when ``req.host_uuid`` is set) or every
    enrolled worker, fanning the agent calls out concurrently.

    Per-host failures are reported in the result list, never raised.
    """
    if req.host_uuid is None:
        targets = await repo.list_swarm_hosts()
    else:
        record = await repo.get_swarm_host_by_uuid(req.host_uuid)
        if record is None:
            raise HTTPException(status_code=404, detail="host not found")
        targets = [record]

    async def _teardown_one(host: dict[str, Any]) -> SwarmHostResult:
        try:
            async with AgentClient(host=host) as agent:
                detail = await agent.teardown(req.decky_id)
                # A full teardown (no decky filter) also drops the master's
                # shard rows for this host.
                if req.decky_id is None:
                    await repo.delete_decky_shards_for_host(host["uuid"])
            return SwarmHostResult(host_uuid=host["uuid"], host_name=host["name"], ok=True, detail=detail)
        except Exception as exc:
            log.exception("swarm.teardown failed host=%s", host["name"])
            return SwarmHostResult(
                host_uuid=host["uuid"], host_name=host["name"], ok=False, detail=str(exc)
            )

    outcomes = await asyncio.gather(*(_teardown_one(h) for h in targets))
    return SwarmDeployResponse(results=list(outcomes))
diff --git a/decnet/web/router/swarm_mgmt/__init__.py b/decnet/web/router/swarm_mgmt/__init__.py
new file mode 100644
index 0000000..12790f8
--- /dev/null
+++ b/decnet/web/router/swarm_mgmt/__init__.py
@@ -0,0 +1,26 @@
+"""Swarm management endpoints for the React dashboard.
+
+These are *not* the unauthenticated /swarm routes mounted on the separate
+swarm-controller process (decnet/web/swarm_api.py on port 8770). These
+live on the main web API, go through ``require_admin``, and are the
+interface the dashboard uses to list hosts, decommission them, list
+deckies across the fleet, and generate one-shot agent-enrollment
+bundles.
+
+Mounted under ``/api/v1/swarm`` by the main api router.
+"""
+from fastapi import APIRouter
+
+from .api_list_hosts import router as list_hosts_router
+from .api_decommission_host import router as decommission_host_router
+from .api_list_deckies import router as list_deckies_router
+from .api_enroll_bundle import router as enroll_bundle_router
+from .api_teardown_host import router as teardown_host_router
+
swarm_mgmt_router = APIRouter(prefix="/swarm")

# Mount every endpoint module's sub-router onto the shared /swarm prefix.
for _sub_router in (
    list_hosts_router,
    decommission_host_router,
    list_deckies_router,
    enroll_bundle_router,
    teardown_host_router,
):
    swarm_mgmt_router.include_router(_sub_router)
diff --git a/decnet/web/router/swarm_mgmt/api_decommission_host.py b/decnet/web/router/swarm_mgmt/api_decommission_host.py
new file mode 100644
index 0000000..d473b34
--- /dev/null
+++ b/decnet/web/router/swarm_mgmt/api_decommission_host.py
@@ -0,0 +1,71 @@
+"""DELETE /swarm/hosts/{uuid} — decommission a worker from the dashboard.
+
+Also instructs the worker agent to stop all DECNET services and delete
+its install footprint (keeping logs). Agent self-destruct failure does
+not block decommission — the master-side cleanup always runs so a dead
+worker can still be removed from the dashboard.
+"""
+from __future__ import annotations
+
+import pathlib
+
+from fastapi import APIRouter, Depends, HTTPException, status
+
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm.decommission")
+router = APIRouter()
+
+
@router.delete(
    "/hosts/{uuid}",
    status_code=status.HTTP_204_NO_CONTENT,
    tags=["Swarm Management"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Path parameter validation error"},
    },
)
async def decommission_host(
    uuid: str,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> None:
    """Decommission a worker: best-effort agent self-destruct, then
    unconditional master-side cleanup (shard rows, host row, cert bundle).

    Raises 404 when the UUID is not enrolled. Returns 204 otherwise.
    """
    row = await repo.get_swarm_host_by_uuid(uuid)
    if row is None:
        raise HTTPException(status_code=404, detail="host not found")

    # Ask the worker to wipe its own install (keeps logs). The agent
    # schedules the reaper as a detached process and returns immediately,
    # so this call is fast when the worker is reachable. A dead worker
    # shouldn't block the operator from cleaning up the dashboard entry,
    # hence best-effort with a log and continue.
    try:
        async with AgentClient(host=row) as agent:
            await agent.self_destruct()
    except Exception:
        log.exception(
            "decommission: self-destruct dispatch failed host=%s — "
            "proceeding with master-side cleanup anyway",
            row.get("name"),
        )

    await repo.delete_decky_shards_for_host(uuid)
    await repo.delete_swarm_host(uuid)

    # BUG FIX: Path("") is Path("."), whose is_dir() is True — the previous
    # code would iterate and unlink files in the process CWD whenever
    # cert_bundle_path was empty/None. Guard explicitly before touching disk.
    bundle_path = row.get("cert_bundle_path")
    if not bundle_path:
        return
    bundle_dir = pathlib.Path(bundle_path)
    if not bundle_dir.is_dir():
        return

    # Depth-first removal so nested directories (e.g. the updater/ sub-bundle
    # written at enrollment) are cleaned up too; the old flat iterdir() left
    # them behind and the final rmdir always failed. Best-effort throughout —
    # cleanup must never block a decommission.
    for child in sorted(bundle_dir.rglob("*"), key=lambda p: len(p.parts), reverse=True):
        try:
            if child.is_dir():
                child.rmdir()
            else:
                child.unlink()
        except OSError:
            pass
    try:
        bundle_dir.rmdir()
    except OSError:
        pass
diff --git a/decnet/web/router/swarm_mgmt/api_enroll_bundle.py b/decnet/web/router/swarm_mgmt/api_enroll_bundle.py
new file mode 100644
index 0000000..799df44
--- /dev/null
+++ b/decnet/web/router/swarm_mgmt/api_enroll_bundle.py
@@ -0,0 +1,484 @@
+"""Agent-enrollment bundles — the Wazuh-style one-liner flow.
+
+Three endpoints:
+ POST /swarm/enroll-bundle — admin issues certs + builds payload
+ GET /swarm/enroll-bundle/{t}.sh — bootstrap script (idempotent until .tgz)
+ GET /swarm/enroll-bundle/{t}.tgz — tarball payload (one-shot; trips served)
+
+The operator's paste is a single pipe ``curl -fsSL <.sh> | sudo bash``.
+Under the hood the bootstrap curls the ``.tgz`` from the same token.
+Both files are rendered + persisted on POST; the ``.tgz`` GET atomically
+marks the token served, reads the bytes under the lock, and unlinks both
+files so a sweeper cannot race it. Unclaimed tokens expire after 5 min.
+
+We avoid the single-self-extracting-script pattern because ``bash`` run
+via pipe has ``$0 == "bash"`` — there is no file on disk to ``tail`` for
+the embedded payload. Two URLs, one paste.
+"""
+from __future__ import annotations
+
+import asyncio
+import fnmatch
+import io
+import os
+import pathlib
+import secrets
+import tarfile
+from dataclasses import dataclass
+from datetime import datetime, timedelta, timezone
+from typing import Optional
+
+from fastapi import APIRouter, Depends, HTTPException, Request, Response, status
+from pydantic import BaseModel, Field
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm_mgmt.enroll_bundle")
+
+router = APIRouter()
+
+BUNDLE_TTL = timedelta(minutes=5)
+BUNDLE_DIR = pathlib.Path(os.environ.get("DECNET_ENROLL_BUNDLE_DIR", "/tmp/decnet-enroll")) # nosec B108 - short-lived 0600 bundle cache, env-overridable
+SWEEP_INTERVAL_SECS = 30
+
+# Paths excluded from the bundled tarball. Matches the intent of
+# decnet.swarm.tar_tree.DEFAULT_EXCLUDES but narrower — we never want
+# tests, dev scaffolding, the master's DB, or the frontend source tree
+# shipped to an agent.
+_EXCLUDES: tuple[str, ...] = (
+ ".venv", ".venv/*", "**/.venv/*",
+ "__pycache__", "**/__pycache__", "**/__pycache__/*",
+ ".git", ".git/*",
+ ".pytest_cache", ".pytest_cache/*",
+ ".mypy_cache", ".mypy_cache/*",
+ "*.egg-info", "*.egg-info/*",
+ # setuptools build/ staging dir — created by `pip install` and leaks a
+ # nested decnet_web/node_modules/ copy into the bundle otherwise.
+ "build", "build/*", "build/**",
+ "*.pyc", "*.pyo",
+ "*.db", "*.db-wal", "*.db-shm", "decnet.db*",
+ "*.log",
+ "tests", "tests/*",
+ "development", "development/*",
+ "wiki-checkout", "wiki-checkout/*",
+ # Frontend is master-only; agents never serve UI.
+ "decnet_web", "decnet_web/*", "decnet_web/**",
+ # Master FastAPI app and everything under decnet/web/ — no agent-side
+ # code imports it. The agent/updater/forwarder/collector/prober/sniffer
+ # entrypoints are all under decnet/agent, decnet/updater, decnet/swarm,
+ # decnet/collector, decnet/prober, decnet/sniffer.
+ "decnet/web", "decnet/web/*", "decnet/web/**",
+ # Mutator + Profiler are master-only (mutator schedules respawns across
+ # the swarm; profiler rebuilds attacker profiles against the master DB).
+ "decnet/mutator", "decnet/mutator/*", "decnet/mutator/**",
+ "decnet/profiler", "decnet/profiler/*", "decnet/profiler/**",
+ "decnet-state.json",
+ "master.log", "master.json",
+ "decnet.tar",
+ # Dev-host env/config leaks — these bake the master's absolute paths into
+ # the agent and point log handlers at directories that don't exist on the
+ # worker VM.
+ ".env", ".env.*", "**/.env", "**/.env.*",
+ "decnet.ini", "**/decnet.ini",
+)
+
+
+# ---------------------------------------------------------------------------
+# DTOs
+# ---------------------------------------------------------------------------
+
class EnrollBundleRequest(BaseModel):
    """Admin request body for POST /swarm/enroll-bundle."""
    # Reachable-from-the-agent address; also used in cert SANs and the
    # bundle URLs handed to the operator.
    master_host: str = Field(..., min_length=1, max_length=253,
                             description="IP/host the agent will reach back to")
    # DNS-label-safe worker name; also passed as the cert identity to
    # pki.issue_worker_cert at bundle creation.
    agent_name: str = Field(..., pattern=r"^[a-z0-9][a-z0-9-]{0,62}$",
                            description="Worker name (DNS-label safe)")
    with_updater: bool = Field(
        default=True,
        description="Include updater cert bundle and auto-start decnet updater on the agent",
    )
    use_ipvlan: bool = Field(
        default=False,
        description=(
            "Run deckies on this agent over IPvlan L2 instead of MACVLAN. "
            "Required when the agent is a VirtualBox/VMware guest bridged over Wi-Fi — "
            "Wi-Fi APs bind one MAC per station, so MACVLAN's extra container MACs "
            "rotate the VM's DHCP lease. Safe no-op on wired/bare-metal hosts."
        ),
    )
    services_ini: Optional[str] = Field(
        default=None,
        description="Optional INI text shipped to the agent as /etc/decnet/services.ini",
    )
+
class EnrollBundleResponse(BaseModel):
    """What the dashboard shows the operator after creating a bundle."""
    # One-time URL token shared by the .sh and .tgz routes.
    token: str
    # The copy-paste `curl ... | sudo bash` one-liner.
    command: str
    # When an unclaimed bundle is swept (creation + BUNDLE_TTL).
    expires_at: datetime
    # UUID of the SwarmHost row pre-registered for this agent.
    host_uuid: str
+
+
+# ---------------------------------------------------------------------------
+# In-memory registry
+# ---------------------------------------------------------------------------
+
@dataclass
class _Bundle:
    """In-memory record of one issued enrollment bundle (token → files)."""
    # On-disk bootstrap script (<token>.sh).
    sh_path: pathlib.Path
    # On-disk payload tarball (<token>.tgz).
    tgz_path: pathlib.Path
    # Hard expiry for unclaimed bundles (creation + BUNDLE_TTL).
    expires_at: datetime
    # SwarmHost row this bundle enrolls; address backfilled on .tgz fetch.
    host_uuid: str
    # Set once the .tgz is downloaded; served bundles 404 and get swept.
    served: bool = False
+
+
+_BUNDLES: dict[str, _Bundle] = {}
+_LOCK = asyncio.Lock()
+_SWEEPER_TASK: Optional[asyncio.Task] = None
+
+
async def _sweep_loop() -> None:
    """Background reaper: every SWEEP_INTERVAL_SECS, drop registry entries
    that were served or have expired, unlinking their on-disk files."""
    while True:
        try:
            await asyncio.sleep(SWEEP_INTERVAL_SECS)
            cutoff = datetime.now(timezone.utc)
            async with _LOCK:
                stale = [
                    token
                    for token, bundle in _BUNDLES.items()
                    if bundle.served or bundle.expires_at <= cutoff
                ]
                for token in stale:
                    bundle = _BUNDLES.pop(token)
                    for path in (bundle.sh_path, bundle.tgz_path):
                        try:
                            path.unlink()
                        except FileNotFoundError:
                            # Already gone (e.g. removed by the one-shot GET).
                            pass
                        except OSError as exc:
                            log.warning("enroll-bundle sweep unlink failed path=%s err=%s", path, exc)
        except asyncio.CancelledError:
            raise
        except Exception:  # noqa: BLE001
            log.exception("enroll-bundle sweeper iteration failed")
+
+
def _ensure_sweeper() -> None:
    """Start the sweeper task lazily; restart it if a previous one finished."""
    global _SWEEPER_TASK
    task = _SWEEPER_TASK
    if task is not None and not task.done():
        return
    _SWEEPER_TASK = asyncio.create_task(_sweep_loop())
+
+
+# ---------------------------------------------------------------------------
+# Tarball construction
+# ---------------------------------------------------------------------------
+
def _repo_root() -> pathlib.Path:
    """Resolve the repository checkout root containing this module."""
    # decnet/web/router/swarm_mgmt/api_enroll_bundle.py -> 4 parents = repo root.
    return pathlib.Path(__file__).resolve().parents[4]
+
+
def _is_excluded(rel: str) -> bool:
    """True when *rel* (a POSIX path relative to the repo root) matches any
    pattern in ``_EXCLUDES``, tested against every ancestor prefix — so
    "decnet/web/x.py" is caught by the bare pattern "decnet/web".

    Simplification vs. the original: the standalone full-path fnmatch was
    redundant — the prefix loop's final iteration tests the identical
    string ("/".join of all parts == rel for these normalized paths).
    """
    parts = pathlib.PurePosixPath(rel).parts
    # "a", "a/b", ..., up to and including the full relative path.
    prefixes = ["/".join(parts[: i + 1]) for i in range(len(parts))]
    return any(
        fnmatch.fnmatch(prefix, pattern)
        for pattern in _EXCLUDES
        for prefix in prefixes
    )
+
+
+def _render_decnet_ini(
+ master_host: str,
+ host_uuid: str,
+ use_ipvlan: bool = False,
+ swarmctl_port: int = 8770,
+) -> bytes:
+ ipvlan_line = f"ipvlan = {'true' if use_ipvlan else 'false'}\n"
+ return (
+ "; Generated by DECNET agent-enrollment bundle.\n"
+ "[decnet]\n"
+ "mode = agent\n"
+ "disallow-master = true\n"
+ "log-directory = /var/log/decnet\n"
+ f"{ipvlan_line}"
+ "\n"
+ "[agent]\n"
+ f"master-host = {master_host}\n"
+ f"swarmctl-port = {swarmctl_port}\n"
+ "swarm-syslog-port = 6514\n"
+ "agent-port = 8765\n"
+ "agent-dir = /etc/decnet/agent\n"
+ "updater-dir = /etc/decnet/updater\n"
+ f"host-uuid = {host_uuid}\n"
+ ).encode()
+
+
+def _add_bytes(tar: tarfile.TarFile, name: str, data: bytes, mode: int = 0o644) -> None:
+ info = tarfile.TarInfo(name)
+ info.size = len(data)
+ info.mode = mode
+ info.mtime = int(datetime.now(timezone.utc).timestamp())
+ tar.addfile(info, io.BytesIO(data))
+
+
def _build_tarball(
    master_host: str,
    agent_name: str,
    host_uuid: str,
    issued: pki.IssuedCert,
    services_ini: Optional[str],
    updater_issued: Optional[pki.IssuedCert] = None,
    use_ipvlan: bool = False,
) -> bytes:
    """Gzipped tarball with:
    - full repo source (minus excludes)
    - etc/decnet/decnet.ini (pre-baked for mode=agent)
    - home/.decnet/agent/{ca.crt,worker.crt,worker.key}
    - home/.decnet/updater/{ca.crt,updater.crt,updater.key} (if updater_issued)
    - services.ini at root if provided

    Built entirely in memory (io.BytesIO); the exclude list above bounds
    the payload size.
    """
    root = _repo_root()
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
        # sorted() gives a deterministic member order run-to-run.
        for path in sorted(root.rglob("*")):
            rel = path.relative_to(root).as_posix()
            if _is_excluded(rel):
                continue
            # Regular files only: directories are implied by member paths,
            # and symlinks are skipped rather than followed.
            if path.is_symlink() or path.is_dir():
                continue
            tar.add(path, arcname=rel, recursive=False)

        _add_bytes(
            tar,
            "etc/decnet/decnet.ini",
            _render_decnet_ini(master_host, host_uuid, use_ipvlan),
        )
        for unit in _SYSTEMD_UNITS:
            _add_bytes(
                tar,
                f"etc/systemd/system/{unit}.service",
                _render_systemd_unit(unit, agent_name, master_host),
            )
        _add_bytes(tar, "home/.decnet/agent/ca.crt", issued.ca_cert_pem)
        _add_bytes(tar, "home/.decnet/agent/worker.crt", issued.cert_pem)
        # Private keys ship 0600; everything else keeps _add_bytes' 0644 default.
        _add_bytes(tar, "home/.decnet/agent/worker.key", issued.key_pem, mode=0o600)

        if updater_issued is not None:
            _add_bytes(tar, "home/.decnet/updater/ca.crt", updater_issued.ca_cert_pem)
            _add_bytes(tar, "home/.decnet/updater/updater.crt", updater_issued.cert_pem)
            _add_bytes(tar, "home/.decnet/updater/updater.key", updater_issued.key_pem, mode=0o600)

        if services_ini:
            _add_bytes(tar, "services.ini", services_ini.encode())

    return buf.getvalue()
+
+
# Systemd unit names rendered into the bundle — one .service file each,
# templated by _render_systemd_unit.
_SYSTEMD_UNITS = (
    "decnet-agent", "decnet-forwarder", "decnet-engine", "decnet-updater",
    # Per-host microservices — activated by enroll_bootstrap.sh. The
    # profiler intentionally stays master-side: it rebuilds attacker
    # profiles against the master DB, which workers don't share.
    "decnet-collector", "decnet-prober", "decnet-sniffer",
)
+
+
def _render_systemd_unit(name: str, agent_name: str, master_host: str) -> bytes:
    """Render ``<name>.service.j2`` from the templates directory as bytes.

    Plain ``str.replace`` substitution — the templates only use the two
    placeholders below, so Jinja isn't pulled in. ``parents[1].parent``
    walks from this file (router/swarm_mgmt/) up to decnet/web/, where
    the templates/ directory presumably lives — confirm against layout.
    """
    tpl_path = pathlib.Path(__file__).resolve().parents[1].parent / "templates" / f"{name}.service.j2"
    tpl = tpl_path.read_text()
    return (
        tpl.replace("{{ agent_name }}", agent_name)
        .replace("{{ master_host }}", master_host)
    ).encode()
+
+
def _render_bootstrap(
    agent_name: str,
    master_host: str,
    tarball_url: str,
    expires_at: datetime,
    with_updater: bool,
) -> bytes:
    """Render enroll_bootstrap.sh from its .j2 template as bytes.

    Same plain-``str.replace`` substitution as _render_systemd_unit.
    Timestamps are truncated to second resolution for readable ISO-8601
    strings in the script header.
    """
    tpl_path = pathlib.Path(__file__).resolve().parents[1].parent / "templates" / "enroll_bootstrap.sh.j2"
    tpl = tpl_path.read_text()
    now = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
    rendered = (
        tpl.replace("{{ agent_name }}", agent_name)
        .replace("{{ master_host }}", master_host)
        .replace("{{ tarball_url }}", tarball_url)
        .replace("{{ generated_at }}", now)
        .replace("{{ expires_at }}", expires_at.replace(microsecond=0).isoformat())
        .replace("{{ with_updater }}", "true" if with_updater else "false")
    )
    return rendered.encode()
+
+
+# ---------------------------------------------------------------------------
+# Endpoints
+# ---------------------------------------------------------------------------
+
@router.post(
    "/enroll-bundle",
    response_model=EnrollBundleResponse,
    status_code=status.HTTP_201_CREATED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        409: {"description": "A worker with this name is already enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def create_enroll_bundle(
    req: EnrollBundleRequest,
    request: Request,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> EnrollBundleResponse:
    """Issue worker (and optionally updater) certs, pre-register the
    SwarmHost row, and stage the one-shot .sh/.tgz bundle pair on disk.

    Returns the token, the operator paste command, and the expiry.
    Raises 409 when a worker with this name is already enrolled.
    """
    import uuid as _uuid

    existing = await repo.get_swarm_host_by_name(req.agent_name)
    if existing is not None:
        raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled")

    # 1. Issue certs (reuses the same code as /swarm/enroll). The worker's own
    #    address is not known yet — the master learns it when the agent fetches
    #    the tarball (see get_payload), which also backfills the SwarmHost row.
    ca = pki.ensure_ca()
    sans = list({req.agent_name, req.master_host})
    issued = pki.issue_worker_cert(ca, req.agent_name, sans)
    bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name
    pki.write_worker_bundle(issued, bundle_dir)

    updater_issued: Optional[pki.IssuedCert] = None
    updater_fp: Optional[str] = None
    if req.with_updater:
        updater_cn = f"updater@{req.agent_name}"
        updater_sans = list({*sans, updater_cn, "127.0.0.1"})
        updater_issued = pki.issue_worker_cert(ca, updater_cn, updater_sans)
        updater_dir = bundle_dir / "updater"
        updater_dir.mkdir(parents=True, exist_ok=True)
        (updater_dir / "updater.crt").write_bytes(updater_issued.cert_pem)
        (updater_dir / "updater.key").write_bytes(updater_issued.key_pem)
        # Private key is operator-sensitive — restrict to owner read/write.
        os.chmod(updater_dir / "updater.key", 0o600)
        updater_fp = updater_issued.fingerprint_sha256

    # 2. Register the host row so it shows up in SwarmHosts immediately.
    host_uuid = str(_uuid.uuid4())
    await repo.add_swarm_host(
        {
            "uuid": host_uuid,
            "name": req.agent_name,
            "address": "",  # filled in when the agent fetches the .tgz (its source IP)
            "agent_port": 8765,
            "status": "enrolled",
            "client_cert_fingerprint": issued.fingerprint_sha256,
            "updater_cert_fingerprint": updater_fp,
            "cert_bundle_path": str(bundle_dir),
            "enrolled_at": datetime.now(timezone.utc),
            "notes": "enrolled via UI bundle",
            "use_ipvlan": req.use_ipvlan,
        }
    )

    # 3. Render payload + bootstrap.
    tarball = _build_tarball(
        req.master_host, req.agent_name, host_uuid, issued, req.services_ini, updater_issued,
        use_ipvlan=req.use_ipvlan,
    )
    token = secrets.token_urlsafe(24)
    expires_at = datetime.now(timezone.utc) + BUNDLE_TTL

    BUNDLE_DIR.mkdir(parents=True, exist_ok=True, mode=0o700)
    sh_path = BUNDLE_DIR / f"{token}.sh"
    tgz_path = BUNDLE_DIR / f"{token}.tgz"

    # Build URLs against the operator-supplied master_host (reachable from the
    # new agent) rather than request.base_url, which reflects how the dashboard
    # user reached us — often 127.0.0.1 behind a proxy or loopback-bound API.
    scheme = request.url.scheme
    port = request.url.port
    netloc = req.master_host if port is None else f"{req.master_host}:{port}"
    base = f"{scheme}://{netloc}"
    tarball_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.tgz"
    bootstrap_url = f"{base}/api/v1/swarm/enroll-bundle/{token}.sh"
    script = _render_bootstrap(req.agent_name, req.master_host, tarball_url, expires_at, req.with_updater)

    # Write the files before registering the token so a GET can never find
    # a registered-but-missing file.
    tgz_path.write_bytes(tarball)
    sh_path.write_bytes(script)
    os.chmod(tgz_path, 0o600)
    os.chmod(sh_path, 0o600)

    async with _LOCK:
        _BUNDLES[token] = _Bundle(
            sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at, host_uuid=host_uuid,
        )
        _ensure_sweeper()

    # Only a token prefix is logged — the full token grants the download.
    log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8])

    return EnrollBundleResponse(
        token=token,
        command=f"curl -fsSL {bootstrap_url} | sudo bash",
        expires_at=expires_at,
        host_uuid=host_uuid,
    )
+
+
+def _now() -> datetime:
+ # Indirection so tests can monkeypatch.
+ return datetime.now(timezone.utc)
+
+
async def _lookup_live(token: str) -> _Bundle:
    """Return the bundle for *token* iff it is unserved and unexpired;
    raise 404 otherwise. Callers must hold ``_LOCK``."""
    bundle = _BUNDLES.get(token)
    if bundle is not None and not bundle.served and bundle.expires_at > _now():
        return bundle
    raise HTTPException(status_code=404, detail="bundle not found or expired")
+
+
@router.get(
    "/enroll-bundle/{token}.sh",
    tags=["Swarm Management"],
    include_in_schema=False,
)
async def get_bootstrap(token: str) -> Response:
    """Serve the bootstrap script. Readable any number of times until the
    .tgz is claimed or the token expires — the payload GET is the one-shot."""
    async with _LOCK:
        live = await _lookup_live(token)
        script = live.sh_path.read_bytes()
    return Response(content=script, media_type="text/x-shellscript")
+
+
@router.get(
    "/enroll-bundle/{token}.tgz",
    tags=["Swarm Management"],
    include_in_schema=False,
)
async def get_payload(
    token: str,
    request: Request,
    repo: BaseRepository = Depends(get_repo),
) -> Response:
    """One-shot tarball download: marks the token served, reads the bytes,
    and unlinks both files under the lock so a concurrent sweep or a
    second fetch cannot race.

    Side effect: backfills the SwarmHost row's address with the caller's
    source IP.
    """
    async with _LOCK:
        b = await _lookup_live(token)
        # Served flag flips before the lock is released — a second GET 404s.
        b.served = True
        data = b.tgz_path.read_bytes()
        host_uuid = b.host_uuid
        for p in (b.sh_path, b.tgz_path):
            try:
                p.unlink()
            except FileNotFoundError:
                pass

    # The agent's first connect-back — its source IP is the reachable address
    # the master will later use to probe it. Backfill the SwarmHost row here
    # so the operator sees the real address instead of an empty placeholder.
    client_host = request.client.host if request.client else ""
    if client_host:
        try:
            await repo.update_swarm_host(host_uuid, {"address": client_host})
        except Exception as e:  # noqa: BLE001
            log.warning("enroll-bundle could not backfill address host=%s err=%s", host_uuid, e)

    return Response(content=data, media_type="application/gzip")
diff --git a/decnet/web/router/swarm_mgmt/api_list_deckies.py b/decnet/web/router/swarm_mgmt/api_list_deckies.py
new file mode 100644
index 0000000..0f8bb84
--- /dev/null
+++ b/decnet/web/router/swarm_mgmt/api_list_deckies.py
@@ -0,0 +1,58 @@
+"""GET /swarm/deckies — admin-gated list of decky shards across the fleet."""
+from __future__ import annotations
+
+from typing import Optional
+
+from fastapi import APIRouter, Depends
+
+from decnet.web.db.models import DeckyShardView
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+router = APIRouter()
+
+
@router.get("/deckies", response_model=list[DeckyShardView], tags=["Swarm Management"])
async def list_deckies(
    host_uuid: Optional[str] = None,
    state: Optional[str] = None,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> list[DeckyShardView]:
    """Admin-gated list of decky shards, each enriched with the owning
    worker's name/address/status.

    *host_uuid* is pushed down to the repo query; *state* is applied here.
    """
    shards = await repo.list_decky_shards(host_uuid)
    hosts = {h["uuid"]: h for h in await repo.list_swarm_hosts()}

    # Pre-heartbeat fallback — older rows without decky_config can still
    # surface their IP from the master's deploy state snapshot.
    deploy_state = await repo.get_state("deployment") or {}
    cfg_deckies = (deploy_state.get("config") or {}).get("deckies") or []
    ip_by_name: dict[str, str] = {
        d.get("name"): d.get("ip") for d in cfg_deckies if d.get("name")
    }

    out: list[DeckyShardView] = []
    for s in shards:
        if state and s.get("state") != state:
            continue
        # Unknown owner (row outlived its host) degrades to empty fields.
        host = hosts.get(s["host_uuid"], {})
        out.append(DeckyShardView(
            decky_name=s["decky_name"],
            decky_ip=s.get("decky_ip") or ip_by_name.get(s["decky_name"]),
            host_uuid=s["host_uuid"],
            host_name=host.get("name") or "",
            host_address=host.get("address") or "",
            host_status=host.get("status") or "unknown",
            services=s.get("services") or [],
            state=s.get("state") or "pending",
            last_error=s.get("last_error"),
            compose_hash=s.get("compose_hash"),
            updated_at=s["updated_at"],
            hostname=s.get("hostname"),
            distro=s.get("distro"),
            archetype=s.get("archetype"),
            service_config=s.get("service_config") or {},
            mutate_interval=s.get("mutate_interval"),
            last_mutated=s.get("last_mutated") or 0.0,
            last_seen=s.get("last_seen"),
        ))
    return out
diff --git a/decnet/web/router/swarm_mgmt/api_list_hosts.py b/decnet/web/router/swarm_mgmt/api_list_hosts.py
new file mode 100644
index 0000000..f835fc5
--- /dev/null
+++ b/decnet/web/router/swarm_mgmt/api_list_hosts.py
@@ -0,0 +1,60 @@
+"""GET /swarm/hosts — admin-gated list of enrolled workers for the dashboard.
+
+Fans out an ``AgentClient.health()`` probe to each host on every call and
+updates ``status`` / ``last_heartbeat`` as a side effect. This mirrors how
+``/swarm-updates/hosts`` probes the updater daemon — the SwarmHosts page
+polls this endpoint, so probe-on-read is what drives heartbeat freshness
+in the UI. No separate scheduler needed.
+"""
+from __future__ import annotations
+
+import asyncio
+from datetime import datetime, timezone
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends
+
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.models import SwarmHostView
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm_mgmt.list_hosts")
+
+router = APIRouter()
+
+
async def _probe_and_update(
    host: dict[str, Any], repo: BaseRepository
) -> dict[str, Any]:
    """Best-effort mTLS health probe for one enrolled host.

    Probes the agent, then reflects the resulting ``status`` /
    ``last_heartbeat`` patch on the returned row and persists it.

    Hosts with no address yet (pending first connect-back) are skipped so
    we don't pollute the DB with 'unreachable' on fresh enrollments that
    haven't fetched the tarball.

    Args:
        host: a swarm-host row as returned by ``repo.list_swarm_hosts()``.
        repo: repository used to persist the probe outcome.

    Returns:
        The same ``host`` dict, updated in place with the probe result.
    """
    if not host.get("address"):
        return host
    try:
        async with AgentClient(host=host) as agent:
            await agent.health()
        patch = {"status": "active", "last_heartbeat": datetime.now(timezone.utc)}
    except Exception as exc:  # noqa: BLE001
        log.debug("swarm/hosts probe unreachable host=%s err=%s", host.get("name"), exc)
        patch = {"status": "unreachable"}
    # FIX: apply the probe result to the in-memory row *before* the DB
    # write. Previously a failed persist returned the stale row, hiding a
    # probe outcome we already knew — the response should always carry the
    # freshly probed status, DB write or not.
    host.update(patch)
    try:
        await repo.update_swarm_host(host["uuid"], patch)
    except Exception as exc:  # noqa: BLE001
        log.warning("swarm/hosts could not persist probe result host=%s err=%s", host.get("name"), exc)
    return host
+
+
@router.get("/hosts", response_model=list[SwarmHostView], tags=["Swarm Management"])
async def list_hosts(
    host_status: Optional[str] = None,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> list[SwarmHostView]:
    """Return enrolled hosts, refreshing each one's liveness on the way out."""
    hosts = await repo.list_swarm_hosts(host_status)
    # Probe every host concurrently; each probe also persists its result.
    refreshed = await asyncio.gather(*[_probe_and_update(h, repo) for h in hosts])
    return [SwarmHostView(**row) for row in refreshed]
diff --git a/decnet/web/router/swarm_mgmt/api_teardown_host.py b/decnet/web/router/swarm_mgmt/api_teardown_host.py
new file mode 100644
index 0000000..cae1b73
--- /dev/null
+++ b/decnet/web/router/swarm_mgmt/api_teardown_host.py
@@ -0,0 +1,150 @@
+"""POST /swarm/hosts/{uuid}/teardown — remote teardown on a swarm worker.
+
+Body: ``{"decky_id": "..."}`` (optional). When ``decky_id`` is null/omitted
+the agent tears down the entire host (all deckies + network); otherwise it
+tears down that single decky.
+
+Async-by-default: the endpoint returns 202 the moment the request is
+accepted and runs the actual agent call + DB cleanup in a background task.
+That lets the operator queue multiple teardowns in parallel without
+blocking on slow docker-compose-down cycles on the worker.
+"""
+from __future__ import annotations
+
+import asyncio
+from typing import Any, Optional
+
+from fastapi import APIRouter, Depends, HTTPException, status
+from pydantic import BaseModel
+
+from decnet.logging import get_logger
+from decnet.swarm.client import AgentClient
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm.teardown")
+router = APIRouter()
+
+# Track spawned background tasks so (a) they're not GC'd mid-flight and
+# (b) tests can drain them deterministically via ``await drain_pending()``.
+_PENDING: "set[asyncio.Task]" = set()
+
+
def _spawn(coro) -> asyncio.Task:
    """Schedule *coro* as a tracked background task.

    The strong reference held in ``_PENDING`` keeps the task from being
    garbage-collected mid-flight; the done-callback drops it again.
    """
    t = asyncio.create_task(coro)
    _PENDING.add(t)
    t.add_done_callback(_PENDING.discard)
    return t
+
+
async def drain_pending() -> None:
    """Await every outstanding teardown task (used by tests).

    Loops because a running task may spawn further work before it
    finishes; we keep gathering until the pending set is empty.
    """
    while _PENDING:
        batch = list(_PENDING)
        await asyncio.gather(*batch, return_exceptions=True)
+
+
class TeardownHostRequest(BaseModel):
    # None / omitted → the agent tears down the entire host (all deckies
    # plus the network); a value targets that single decky.
    decky_id: Optional[str] = None
+
+
class TeardownHostResponse(BaseModel):
    # UUID and display name of the host the teardown was queued against.
    host_uuid: str
    host_name: str
    # Echoes the request's decky_id (None = whole-host teardown).
    decky_id: Optional[str] = None
    # Always True on a 202 — the actual work runs in a background task.
    accepted: bool
    detail: str
+
+
async def _mark_tearing_down(
    repo: BaseRepository, host_uuid: str, decky_id: Optional[str]
) -> None:
    """Flip the affected shard(s) to state='tearing_down'.

    Gives the UI immediate feedback while the background task does the
    real work. ``decky_id=None`` marks every shard on the host.
    """
    for shard in await repo.list_decky_shards(host_uuid):
        if decky_id and shard.get("decky_name") != decky_id:
            continue
        patched = dict(shard)
        patched["state"] = "tearing_down"
        patched["last_error"] = None
        await repo.upsert_decky_shard(patched)
+
+
async def _run_teardown(
    host: dict[str, Any], repo: BaseRepository, decky_id: Optional[str]
) -> None:
    """Execute the remote teardown, then clean up the DB.

    Runs detached from any request, so failures are never re-raised:
    they are logged and written onto the shard(s) for the UI to show.
    """
    try:
        async with AgentClient(host=host) as agent:
            await agent.teardown(decky_id)
    except Exception as exc:
        log.exception(
            "swarm.teardown background task failed host=%s decky=%s",
            host.get("name"), decky_id,
        )
        # Keep the shard rows on failure — the operator needs the error
        # surfaced so they can see what went wrong and retry.
        try:
            for shard in await repo.list_decky_shards(host["uuid"]):
                if decky_id and shard.get("decky_name") != decky_id:
                    continue
                await repo.upsert_decky_shard({
                    **shard,
                    "state": "teardown_failed",
                    "last_error": str(exc)[:512],
                })
        except Exception:
            log.exception("swarm.teardown failed to record shard failure")
        return

    # Agent call succeeded → drop the shard row(s). A failure here is only
    # a bookkeeping problem, so it is logged rather than surfaced.
    try:
        if decky_id:
            await repo.delete_decky_shard(decky_id)
        else:
            await repo.delete_decky_shards_for_host(host["uuid"])
    except Exception:
        log.exception("swarm.teardown DB cleanup failed (agent call succeeded)")
+
+
@router.post(
    "/hosts/{uuid}/teardown",
    response_model=TeardownHostResponse,
    status_code=status.HTTP_202_ACCEPTED,
    tags=["Swarm Management"],
    responses={
        400: {"description": "Bad Request (malformed JSON body)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Host not found"},
        422: {"description": "Request body or path parameter validation error"},
    },
)
async def teardown_host(
    uuid: str,
    req: TeardownHostRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> TeardownHostResponse:
    """Queue a teardown of one decky (or the whole host) and return 202."""
    host = await repo.get_swarm_host_by_uuid(uuid)
    if host is None:
        raise HTTPException(status_code=404, detail="host not found")

    await _mark_tearing_down(repo, uuid, req.decky_id)

    # Fire-and-forget via asyncio.create_task (not BackgroundTasks): the
    # task must outlive this request's lifecycle so the operator can queue
    # another teardown the moment this one returns 202.
    _spawn(_run_teardown(host, repo, req.decky_id))

    return TeardownHostResponse(
        host_uuid=uuid,
        host_name=host.get("name") or "",
        decky_id=req.decky_id,
        accepted=True,
        detail="teardown queued",
    )
diff --git a/decnet/web/router/swarm_updates/__init__.py b/decnet/web/router/swarm_updates/__init__.py
new file mode 100644
index 0000000..d14e13f
--- /dev/null
+++ b/decnet/web/router/swarm_updates/__init__.py
@@ -0,0 +1,23 @@
+"""Remote Updates — master dashboard's surface for pushing code to workers.
+
+These are *not* the swarm-controller's /swarm routes (those run on a
+separate process, auth-free, internal-only). They live on the main web
+API, go through ``require_admin``, and are the interface the React
+dashboard calls to fan updates out to worker ``decnet updater`` daemons
+via ``UpdaterClient``.
+
+Mounted under ``/api/v1/swarm-updates`` by the main api router.
+"""
+from fastapi import APIRouter
+
+from .api_list_host_releases import router as list_host_releases_router
+from .api_push_update import router as push_update_router
+from .api_push_update_self import router as push_update_self_router
+from .api_rollback_host import router as rollback_host_router
+
# Parent router for the remote-update endpoints; the main api router
# mounts this (see module docstring above).
swarm_updates_router = APIRouter(prefix="/swarm-updates")

swarm_updates_router.include_router(list_host_releases_router)
swarm_updates_router.include_router(push_update_router)
swarm_updates_router.include_router(push_update_self_router)
swarm_updates_router.include_router(rollback_host_router)
diff --git a/decnet/web/router/swarm_updates/api_list_host_releases.py b/decnet/web/router/swarm_updates/api_list_host_releases.py
new file mode 100644
index 0000000..ac493eb
--- /dev/null
+++ b/decnet/web/router/swarm_updates/api_list_host_releases.py
@@ -0,0 +1,86 @@
+"""GET /swarm-updates/hosts — per-host updater health + release slots.
+
+Fans out an ``UpdaterClient.health()`` probe to every enrolled host that
+has an updater bundle. Each probe is isolated: a single unreachable host
+never fails the whole list (that's normal partial-failure behaviour for
+a fleet view).
+"""
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+from fastapi import APIRouter, Depends
+
+from decnet.logging import get_logger
+from decnet.swarm.updater_client import UpdaterClient
+from decnet.web.db.models import HostReleaseInfo, HostReleasesResponse
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm_updates.list")
+
+router = APIRouter()
+
+
def _extract_shas(releases: list[dict[str, Any]]) -> tuple[str | None, str | None]:
    """Return the (current, previous) SHA from the updater's release list.

    The updater reports releases as ``[{"slot": "active"|"prev", "sha": ...,
    ...}]`` in no guaranteed order, so select by slot name rather than index.
    The first entry seen for a slot wins.
    """
    sha_by_slot: dict[Any, Any] = {}
    for rel in releases:
        slot = rel.get("slot")
        if slot not in sha_by_slot:
            sha_by_slot[slot] = rel.get("sha")
    return sha_by_slot.get("active"), sha_by_slot.get("prev")
+
+
async def _probe_host(host: dict[str, Any]) -> HostReleaseInfo:
    """Probe one host's updater; unreachable hosts yield reachable=False."""
    try:
        async with UpdaterClient(host=host) as client:
            payload = await client.health()
    except Exception as exc:  # noqa: BLE001
        # One dead host must not fail the fleet view — report it as such.
        return HostReleaseInfo(
            host_uuid=host["uuid"],
            host_name=host["name"],
            address=host["address"],
            reachable=False,
            detail=f"{type(exc).__name__}: {exc}",
        )
    releases = payload.get("releases") or []
    current_sha, previous_sha = _extract_shas(releases)
    return HostReleaseInfo(
        host_uuid=host["uuid"],
        host_name=host["name"],
        address=host["address"],
        reachable=True,
        agent_status=payload.get("agent_status") or payload.get("status"),
        current_sha=current_sha,
        previous_sha=previous_sha,
        releases=releases,
    )
+
+
@router.get(
    "/hosts",
    response_model=HostReleasesResponse,
    tags=["Swarm Updates"],
    responses={
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
    },
)
async def api_list_host_releases(
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> HostReleasesResponse:
    """Fan out updater health probes and report per-host release slots."""
    hosts = await repo.list_swarm_hosts()
    # Only hosts actually capable of receiving updates — decommissioned
    # hosts and agent-only enrollments are filtered out.
    candidates = [
        h for h in hosts
        if h.get("status") != "decommissioned" and h.get("updater_cert_fingerprint")
    ]
    if not candidates:
        return HostReleasesResponse(hosts=[])
    probed = await asyncio.gather(*(_probe_host(h) for h in candidates))
    return HostReleasesResponse(hosts=list(probed))
diff --git a/decnet/web/router/swarm_updates/api_push_update.py b/decnet/web/router/swarm_updates/api_push_update.py
new file mode 100644
index 0000000..0aea5ee
--- /dev/null
+++ b/decnet/web/router/swarm_updates/api_push_update.py
@@ -0,0 +1,163 @@
+"""POST /swarm-updates/push — fan a tarball of the master's tree to workers.
+
+Mirrors the ``decnet swarm update`` CLI flow: build the tarball once,
+dispatch concurrently, collect per-host statuses. Returns HTTP 200 even
+when individual hosts failed — the operator reads per-host ``status``.
+"""
+from __future__ import annotations
+
+import asyncio
+import pathlib
+from typing import Any
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.logging import get_logger
+from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
+from decnet.swarm.updater_client import UpdaterClient
+from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm_updates.push")
+
+router = APIRouter()
+
+
def _master_tree_root() -> pathlib.Path:
    """Resolve the master's install tree to tar.

    Walks up from this file: ``decnet/web/router/swarm_updates/`` →
    ``parents[4]`` (the directory four levels above this package) lands on
    the repo root. Matches the layout shipped via ``pip install -e .``
    and the dev checkout at ``~/Tools/DECNET``.
    """
    return pathlib.Path(__file__).resolve().parents[4]
+
+
def _classify_update(status_code: int) -> str:
    """Map the updater's HTTP status to a per-host result label.

    200 → the push was applied; 409 → the updater auto-rolled back;
    anything else is a hard failure.
    """
    labels = {200: "updated", 409: "rolled-back"}
    return labels.get(status_code, "failed")
+
+
async def _resolve_targets(
    repo: BaseRepository,
    req: PushUpdateRequest,
) -> list[dict[str, Any]]:
    """Turn the request's host selection into concrete host rows.

    Exactly one of ``host_uuids`` / ``all`` must be set; only hosts that
    carry an updater bundle are ever eligible.
    """
    # Both set, or neither set → ambiguous request.
    if req.all == bool(req.host_uuids):
        raise HTTPException(
            status_code=400,
            detail="Specify exactly one of host_uuids or all=true.",
        )
    eligible = [
        row for row in await repo.list_swarm_hosts()
        if row.get("updater_cert_fingerprint")
    ]
    if req.all:
        targets = [row for row in eligible if row.get("status") != "decommissioned"]
    else:
        wanted = set(req.host_uuids or [])
        targets = [row for row in eligible if row["uuid"] in wanted]
        missing = wanted - {row["uuid"] for row in targets}
        if missing:
            raise HTTPException(
                status_code=404,
                detail=f"Unknown or updater-less host(s): {sorted(missing)}",
            )
    if not targets:
        raise HTTPException(
            status_code=404,
            detail="No targets: no enrolled hosts have an updater bundle.",
        )
    return targets
+
+
async def _push_one(
    host: dict[str, Any],
    tarball: bytes,
    sha: str,
    include_self: bool,
) -> PushUpdateResult:
    """Push the tarball to one host's updater; optionally self-update it.

    Args:
        host: swarm-host row carrying the updater connection details.
        tarball: gzipped tree built once by the caller.
        sha: git SHA the tarball was built from (may be empty).
        include_self: also push ``/update-self`` after a successful
            agent update.

    Returns:
        A ``PushUpdateResult``. Never raises — transport errors become
        ``status="failed"`` results.
    """
    try:
        async with UpdaterClient(host=host) as u:
            r = await u.update(tarball, sha=sha)
            body = r.json() if r.content else {}
            status = _classify_update(r.status_code)
            stderr = body.get("stderr") if isinstance(body, dict) else None

            if include_self and r.status_code == 200:
                # Agent first, updater second — a broken updater push must never
                # strand the fleet on an old agent.
                self_ok = True
                failure: str | None = None
                try:
                    rs = await u.update_self(tarball, sha=sha)
                    # 0 = connection dropped mid-response (expected on re-exec).
                    self_ok = rs.status_code in (200, 0)
                    if not self_ok:
                        failure = f"HTTP {rs.status_code}"
                except Exception as exc:  # noqa: BLE001
                    # Connection drop on update-self is expected and not an error.
                    self_ok = _is_expected_connection_drop(exc)
                    if not self_ok:
                        failure = str(exc)
                if not self_ok:
                    # FIX: the old failure branch interpolated `exc` even on the
                    # no-exception path (non-200 status), raising NameError.
                    # `failure` is populated on both paths instead.
                    return PushUpdateResult(
                        host_uuid=host["uuid"], host_name=host["name"],
                        status="self-failed", http_status=r.status_code, sha=sha,
                        detail=f"agent updated OK but self-update failed: {failure}",
                        stderr=stderr,
                    )
                # Reaching here means self_ok is True (failure path returned).
                status = "self-updated"

            return PushUpdateResult(
                host_uuid=host["uuid"], host_name=host["name"],
                status=status, http_status=r.status_code, sha=sha,
                detail=body.get("error") or body.get("probe") if isinstance(body, dict) else None,
                stderr=stderr,
            )
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.push failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
            detail=f"{type(exc).__name__}: {exc}",
        )
+
+
def _is_expected_connection_drop(exc: BaseException) -> bool:
    """True for the transport errors that update-self normally triggers.

    The worker updater re-execs itself mid-response, so httpx reports the
    dropped connection as an error even though the push worked.
    """
    import httpx  # local import, as in the rest of this module's hot path
    expected = (httpx.RemoteProtocolError, httpx.ReadError, httpx.ConnectError)
    return isinstance(exc, expected)
+
+
@router.post(
    "/push",
    response_model=PushUpdateResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No matching target hosts or no updater-capable hosts enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_push_update(
    req: PushUpdateRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
    """Build the tarball once, then fan it out to every target host."""
    targets = await _resolve_targets(repo, req)
    root = _master_tree_root()
    # `detect_git_sha` shells out and `tar_working_tree` walks + gzips the
    # repo — both synchronous CPU+I/O. Running them on the event loop would
    # block every other request (the dashboard freezes), so offload each to
    # a worker thread.
    sha = await asyncio.to_thread(detect_git_sha, root)
    tarball = await asyncio.to_thread(tar_working_tree, root, extra_excludes=req.exclude)
    log.info(
        "swarm_updates.push sha=%s tarball=%d hosts=%d include_self=%s",
        sha or "(not a git repo)", len(tarball), len(targets), req.include_self,
    )
    outcomes = await asyncio.gather(
        *(_push_one(h, tarball, sha, req.include_self) for h in targets)
    )
    return PushUpdateResponse(
        sha=sha,
        tarball_bytes=len(tarball),
        results=list(outcomes),
    )
diff --git a/decnet/web/router/swarm_updates/api_push_update_self.py b/decnet/web/router/swarm_updates/api_push_update_self.py
new file mode 100644
index 0000000..2ffa16f
--- /dev/null
+++ b/decnet/web/router/swarm_updates/api_push_update_self.py
@@ -0,0 +1,101 @@
+"""POST /swarm-updates/push-self — push only to workers' /update-self.
+
+Use case: the agent is fine but the updater itself needs an upgrade (e.g.
+a fix to ``executor.py``). Uploading only ``/update-self`` avoids a
+redundant agent restart on healthy workers.
+
+No auto-rollback: the updater re-execs itself on success, so a broken
+push leaves the worker on the old code — verified by polling ``/health``
+after the request returns.
+"""
+from __future__ import annotations
+
+import asyncio
+from typing import Any
+
+from fastapi import APIRouter, Depends
+
+from decnet.logging import get_logger
+from decnet.swarm.tar_tree import detect_git_sha, tar_working_tree
+from decnet.swarm.updater_client import UpdaterClient
+from decnet.web.db.models import PushUpdateRequest, PushUpdateResponse, PushUpdateResult
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+from .api_push_update import _is_expected_connection_drop, _master_tree_root, _resolve_targets
+
+log = get_logger("swarm_updates.push_self")
+
+router = APIRouter()
+
+
async def _push_self_one(host: dict[str, Any], tarball: bytes, sha: str) -> PushUpdateResult:
    """Push only /update-self to one host; expected drops count as success."""
    try:
        async with UpdaterClient(host=host) as client:
            try:
                resp = await client.update_self(tarball, sha=sha)
            except Exception as exc:  # noqa: BLE001
                # The updater re-execs itself mid-response, so a dropped
                # connection here is the normal success signature.
                if not _is_expected_connection_drop(exc):
                    raise
                return PushUpdateResult(
                    host_uuid=host["uuid"], host_name=host["name"],
                    status="self-updated", sha=sha,
                    detail="updater re-exec dropped connection (expected)",
                )
            payload = resp.json() if resp.content else {}
            is_dict = isinstance(payload, dict)
            return PushUpdateResult(
                host_uuid=host["uuid"], host_name=host["name"],
                status="self-updated" if resp.status_code == 200 else "self-failed",
                http_status=resp.status_code, sha=sha,
                detail=(payload.get("error") or payload.get("probe")) if is_dict else None,
                stderr=payload.get("stderr") if is_dict else None,
            )
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.push_self failed host=%s", host.get("name"))
        return PushUpdateResult(
            host_uuid=host["uuid"], host_name=host["name"],
            status="self-failed",
            detail=f"{type(exc).__name__}: {exc}",
        )
+
+
@router.post(
    "/push-self",
    response_model=PushUpdateResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or conflicting host_uuids/all flags)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "No matching target hosts or no updater-capable hosts enrolled"},
        422: {"description": "Request body validation error"},
    },
)
async def api_push_update_self(
    req: PushUpdateRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> PushUpdateResponse:
    """Tar the master tree and push only /update-self to each target."""
    targets = await _resolve_targets(repo, req)
    root = _master_tree_root()
    # git shell-out + tar/gzip are blocking; keep them off the event loop.
    sha = await asyncio.to_thread(detect_git_sha, root)
    tarball = await asyncio.to_thread(tar_working_tree, root, extra_excludes=req.exclude)
    log.info(
        "swarm_updates.push_self sha=%s tarball=%d hosts=%d",
        sha or "(not a git repo)", len(tarball), len(targets),
    )
    outcomes = await asyncio.gather(*(_push_self_one(h, tarball, sha) for h in targets))
    return PushUpdateResponse(
        sha=sha,
        tarball_bytes=len(tarball),
        results=list(outcomes),
    )
diff --git a/decnet/web/router/swarm_updates/api_rollback_host.py b/decnet/web/router/swarm_updates/api_rollback_host.py
new file mode 100644
index 0000000..0bfe165
--- /dev/null
+++ b/decnet/web/router/swarm_updates/api_rollback_host.py
@@ -0,0 +1,77 @@
+"""POST /swarm-updates/rollback — manual rollback on a single host.
+
+Calls the worker updater's ``/rollback`` which swaps the ``current``
+symlink back to ``releases/prev``. Fails with 404 if the target has no
+previous release slot.
+"""
+from __future__ import annotations
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from decnet.logging import get_logger
+from decnet.swarm.updater_client import UpdaterClient
+from decnet.web.db.models import RollbackRequest, RollbackResponse
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo, require_admin
+
+log = get_logger("swarm_updates.rollback")
+
+router = APIRouter()
+
+
@router.post(
    "/rollback",
    response_model=RollbackResponse,
    tags=["Swarm Updates"],
    responses={
        400: {"description": "Bad Request (malformed JSON body or host has no updater bundle)"},
        401: {"description": "Could not validate credentials"},
        403: {"description": "Insufficient permissions"},
        404: {"description": "Unknown host, or no previous release slot on the worker"},
        422: {"description": "Request body validation error"},
    },
)
async def api_rollback_host(
    req: RollbackRequest,
    admin: dict = Depends(require_admin),
    repo: BaseRepository = Depends(get_repo),
) -> RollbackResponse:
    """Ask one worker's updater to swap back to its previous release."""
    host = await repo.get_swarm_host_by_uuid(req.host_uuid)
    if host is None:
        raise HTTPException(status_code=404, detail=f"Unknown host: {req.host_uuid}")
    if not host.get("updater_cert_fingerprint"):
        raise HTTPException(
            status_code=400,
            detail=f"Host '{host['name']}' has no updater bundle — nothing to roll back.",
        )

    try:
        async with UpdaterClient(host=host) as client:
            resp = await client.rollback()
    except Exception as exc:  # noqa: BLE001
        log.exception("swarm_updates.rollback transport failure host=%s", host["name"])
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed",
            detail=f"{type(exc).__name__}: {exc}",
        )

    payload = resp.json() if resp.content else {}
    is_dict = isinstance(payload, dict)
    if resp.status_code == 404:
        # No previous release — surface as 404 so the UI can render the
        # "nothing to roll back" state distinctly from a transport error.
        raise HTTPException(
            status_code=404,
            detail=payload.get("detail") if is_dict else "No previous release on worker.",
        )
    if resp.status_code != 200:
        return RollbackResponse(
            host_uuid=host["uuid"], host_name=host["name"],
            status="failed", http_status=resp.status_code,
            detail=(payload.get("error") or payload.get("detail")) if is_dict else None,
        )
    return RollbackResponse(
        host_uuid=host["uuid"], host_name=host["name"],
        status="rolled-back", http_status=resp.status_code,
        detail=payload.get("status") if is_dict else None,
    )
diff --git a/decnet/web/router/system/__init__.py b/decnet/web/router/system/__init__.py
new file mode 100644
index 0000000..fdc0c05
--- /dev/null
+++ b/decnet/web/router/system/__init__.py
@@ -0,0 +1,6 @@
from fastapi import APIRouter

from .api_deployment_mode import router as deployment_mode_router

# Parent router for the /system endpoints.
system_router = APIRouter(prefix="/system", tags=["System"])
system_router.include_router(deployment_mode_router)
diff --git a/decnet/web/router/system/api_deployment_mode.py b/decnet/web/router/system/api_deployment_mode.py
new file mode 100644
index 0000000..18cb3b0
--- /dev/null
+++ b/decnet/web/router/system/api_deployment_mode.py
@@ -0,0 +1,41 @@
+"""GET /system/deployment-mode — tells the UI whether a deploy will shard
+across SWARM workers or land on the master itself.
+
+Logic mirrors the auto-mode branch in ``api_deploy_deckies``: master role
+plus at least one reachable enrolled worker = swarm; otherwise unihost.
+"""
+from __future__ import annotations
+
+import os
+
+from fastapi import APIRouter, Depends
+from pydantic import BaseModel
+
+from decnet.web.db.repository import BaseRepository
+from decnet.web.dependencies import get_repo
+
+router = APIRouter()
+
+
class DeploymentModeResponse(BaseModel):
    mode: str  # "swarm" or "unihost"
    role: str  # "master" or "agent"
    # Enrolled workers counted toward the decision: status "active" or
    # "enrolled" with a known address (0 when role is not "master").
    swarm_host_count: int
+
+
@router.get("/deployment-mode", response_model=DeploymentModeResponse)
async def get_deployment_mode(
    repo: BaseRepository = Depends(get_repo),
) -> DeploymentModeResponse:
    """Report whether a deploy would shard to SWARM workers or stay local."""
    role = os.environ.get("DECNET_MODE", "master").lower()
    usable_workers = 0
    # Only a master can fan out; agents are always unihost.
    if role == "master":
        for h in await repo.list_swarm_hosts():
            if h.get("status") in ("active", "enrolled") and h.get("address"):
                usable_workers += 1
    return DeploymentModeResponse(
        mode="swarm" if usable_workers else "unihost",
        role=role,
        swarm_host_count=usable_workers,
    )
diff --git a/decnet/web/swarm_api.py b/decnet/web/swarm_api.py
new file mode 100644
index 0000000..43ffeb6
--- /dev/null
+++ b/decnet/web/swarm_api.py
@@ -0,0 +1,67 @@
+"""DECNET SWARM Controller — master-side control plane.
+
+Runs as an independent FastAPI/uvicorn process. Isolated from
+``decnet.web.api`` so controller failure cannot cascade to the main API,
+ingester, or dashboard (mirrors the existing pattern used by
+``decnet api`` with ``start_new_session=True``).
+
+Responsibilities:
+* host enrollment (issues CA-signed worker bundles);
+* dispatching DecnetConfig shards to worker agents over mTLS;
+* active health probes of enrolled workers.
+
+The controller *reuses* the same ``get_repo`` dependency as the main API,
+so SwarmHost / DeckyShard state is visible to both processes via the
+shared DB.
+"""
+from __future__ import annotations
+
+from decnet.web import _uvicorn_tls_scope # noqa: F401 # patches uvicorn on import
+
+from contextlib import asynccontextmanager
+from typing import AsyncGenerator
+
+from fastapi import FastAPI
+from fastapi.responses import ORJSONResponse
+
+from decnet.logging import get_logger
+from decnet.swarm import pki
+from decnet.swarm.client import ensure_master_identity
+from decnet.web.dependencies import repo
+from decnet.web.router.swarm import swarm_router
+
+log = get_logger("swarm_api")
+
+
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
    """Startup/shutdown hook: provision PKI, then the shared repository."""
    log.info("swarm-controller starting up")
    # The CA and master client cert must exist before any request is
    # served — enrollment and AgentClient both depend on them.
    pki.ensure_ca()
    ensure_master_identity()
    await repo.initialize()
    log.info("swarm-controller ready")
    yield
    log.info("swarm-controller shutdown")
+
+
app: FastAPI = FastAPI(
    title="DECNET SWARM Controller",
    version="0.1.0",
    lifespan=lifespan,
    default_response_class=ORJSONResponse,
    # No interactive docs: the controller is an internal management plane,
    # not a public surface. Enable explicitly in dev if needed.
    docs_url=None,
    redoc_url=None,
    openapi_url=None,
)

# All controller endpoints live on the shared swarm router.
app.include_router(swarm_router)
+
+
@app.get("/health")
async def root_health() -> dict[str, str]:
    """Top-level liveness probe (no DB I/O)."""
    payload = {"status": "ok"}
    payload["role"] = "swarm-controller"
    return payload
diff --git a/decnet/web/templates/decnet-agent.service.j2 b/decnet/web/templates/decnet-agent.service.j2
new file mode 100644
index 0000000..9847345
--- /dev/null
+++ b/decnet/web/templates/decnet-agent.service.j2
@@ -0,0 +1,18 @@
+[Unit]
+Description=DECNET worker agent (mTLS control plane) — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+WorkingDirectory=/opt/decnet
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.agent.log
+ExecStart=/usr/local/bin/decnet agent --no-forwarder
+Restart=on-failure
+RestartSec=5
+StandardOutput=append:/var/log/decnet/decnet.agent.log
+StandardError=append:/var/log/decnet/decnet.agent.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/decnet-collector.service.j2 b/decnet/web/templates/decnet-collector.service.j2
new file mode 100644
index 0000000..3137bfd
--- /dev/null
+++ b/decnet/web/templates/decnet-collector.service.j2
@@ -0,0 +1,20 @@
+[Unit]
+Description=DECNET container log collector — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target decnet-agent.service
+Wants=network-online.target
+PartOf=decnet-agent.service
+
+[Service]
+Type=simple
+WorkingDirectory=/opt/decnet
+Environment=DECNET_MODE=agent
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.collector.log
+ExecStart=/usr/local/bin/decnet collect --log-file /var/log/decnet/decnet.log
+Restart=on-failure
+RestartSec=5
+StandardOutput=append:/var/log/decnet/decnet.collector.log
+StandardError=append:/var/log/decnet/decnet.collector.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/decnet-engine.service.j2 b/decnet/web/templates/decnet-engine.service.j2
new file mode 100644
index 0000000..dadf1b0
--- /dev/null
+++ b/decnet/web/templates/decnet-engine.service.j2
@@ -0,0 +1,17 @@
+[Unit]
+Description=DECNET deckie orchestrator (decnet deploy) — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target decnet-agent.service
+Wants=network-online.target
+
+[Service]
+Type=oneshot
+RemainAfterExit=yes
+WorkingDirectory=/opt/decnet
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.log
+ExecStart=/usr/local/bin/decnet deploy
+StandardOutput=append:/var/log/decnet/decnet.log
+StandardError=append:/var/log/decnet/decnet.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/decnet-forwarder.service.j2 b/decnet/web/templates/decnet-forwarder.service.j2
new file mode 100644
index 0000000..e0a2391
--- /dev/null
+++ b/decnet/web/templates/decnet-forwarder.service.j2
@@ -0,0 +1,19 @@
+[Unit]
+Description=DECNET log forwarder (syslog-over-TLS → master) — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target
+Wants=network-online.target
+PartOf=decnet-agent.service
+
+[Service]
+Type=simple
+WorkingDirectory=/opt/decnet
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.forwarder.log
+ExecStart=/usr/local/bin/decnet forwarder --master-host {{ master_host }} --master-port 6514 --agent-dir /etc/decnet/agent --log-file /var/log/decnet/decnet.log
+Restart=on-failure
+RestartSec=5
+StandardOutput=append:/var/log/decnet/decnet.forwarder.log
+StandardError=append:/var/log/decnet/decnet.forwarder.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/decnet-prober.service.j2 b/decnet/web/templates/decnet-prober.service.j2
new file mode 100644
index 0000000..209851e
--- /dev/null
+++ b/decnet/web/templates/decnet-prober.service.j2
@@ -0,0 +1,20 @@
+[Unit]
+Description=DECNET attacker prober (JARM/HASSH/TCP fingerprint) — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target decnet-agent.service
+Wants=network-online.target
+PartOf=decnet-agent.service
+
+[Service]
+Type=simple
+WorkingDirectory=/opt/decnet
+Environment=DECNET_MODE=agent
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.prober.log
+ExecStart=/usr/local/bin/decnet probe --log-file /var/log/decnet/decnet.log --interval 300
+Restart=on-failure
+RestartSec=5
+StandardOutput=append:/var/log/decnet/decnet.prober.log
+StandardError=append:/var/log/decnet/decnet.prober.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/decnet-sniffer.service.j2 b/decnet/web/templates/decnet-sniffer.service.j2
new file mode 100644
index 0000000..360a3ac
--- /dev/null
+++ b/decnet/web/templates/decnet-sniffer.service.j2
@@ -0,0 +1,24 @@
+[Unit]
+Description=DECNET network sniffer — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target decnet-agent.service
+Wants=network-online.target
+PartOf=decnet-agent.service
+
+[Service]
+Type=simple
+WorkingDirectory=/opt/decnet
+Environment=DECNET_MODE=agent
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.sniffer.log
+# scapy needs raw sockets; grant only the minimal network capability
+# set (NET_ADMIN + NET_RAW) instead of running fully privileged.
+AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW
+CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW
+ExecStart=/usr/local/bin/decnet sniffer --log-file /var/log/decnet/decnet.log
+Restart=on-failure
+RestartSec=5
+StandardOutput=append:/var/log/decnet/decnet.sniffer.log
+StandardError=append:/var/log/decnet/decnet.sniffer.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/decnet-updater.service.j2 b/decnet/web/templates/decnet-updater.service.j2
new file mode 100644
index 0000000..3ac5406
--- /dev/null
+++ b/decnet/web/templates/decnet-updater.service.j2
@@ -0,0 +1,18 @@
+[Unit]
+Description=DECNET self-updater (accepts tarball pushes from master) — {{ agent_name }}
+Documentation=https://github.com/anti/DECNET
+After=network-online.target
+Wants=network-online.target
+
+[Service]
+Type=simple
+WorkingDirectory=/opt/decnet
+Environment=DECNET_SYSTEM_LOGS=/var/log/decnet/decnet.updater.log
+ExecStart=/usr/local/bin/decnet updater --updater-dir /etc/decnet/updater --install-dir /opt/decnet --agent-dir /etc/decnet/agent
+Restart=on-failure
+RestartSec=5
+StandardOutput=append:/var/log/decnet/decnet.updater.log
+StandardError=append:/var/log/decnet/decnet.updater.log
+
+[Install]
+WantedBy=multi-user.target
diff --git a/decnet/web/templates/enroll_bootstrap.sh.j2 b/decnet/web/templates/enroll_bootstrap.sh.j2
new file mode 100644
index 0000000..74ab220
--- /dev/null
+++ b/decnet/web/templates/enroll_bootstrap.sh.j2
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# DECNET bootstrap installer for agent {{ agent_name }} -> master {{ master_host }}.
+# Fetches the code+certs payload, installs, and starts the agent daemon.
+# Generated by the master at {{ generated_at }}. Expires {{ expires_at }}.
+set -euo pipefail
+
+[[ $EUID -eq 0 ]] || { echo "decnet-install: must run as root (use sudo)"; exit 1; }
+for bin in python3 curl tar systemctl; do
+ command -v "$bin" >/dev/null || { echo "decnet-install: $bin required"; exit 1; }
+done
+
+WORK="$(mktemp -d)"
+trap 'rm -rf "$WORK"' EXIT
+
+echo "[DECNET] fetching payload..."
+curl -fsSL "{{ tarball_url }}" | tar -xz -C "$WORK"
+
+INSTALL_DIR=/opt/decnet
+RELEASE_DIR="$INSTALL_DIR/releases/active"
+VENV_DIR="$INSTALL_DIR/venv"
+# Mirror the updater's layout from day one so `decnet updater` can rotate
+# releases/active in-place and the shared venv is the thing on PATH.
+mkdir -p "$RELEASE_DIR"
+cp -a "$WORK/." "$RELEASE_DIR/"
+ln -sfn "$RELEASE_DIR" "$INSTALL_DIR/current"
+cd "$RELEASE_DIR"
+
+echo "[DECNET] building shared venv at $VENV_DIR..."
+python3 -m venv "$VENV_DIR"
+"$VENV_DIR/bin/pip" install -q --upgrade pip
+"$VENV_DIR/bin/pip" install -q "$RELEASE_DIR"
+
+install -Dm0644 etc/decnet/decnet.ini /etc/decnet/decnet.ini
+[[ -f services.ini ]] && install -Dm0644 services.ini /etc/decnet/services.ini
+
+# Log directory the baked-in INI points at — must exist before `decnet` imports config.
+install -d -m0755 /var/log/decnet
+
+# Certs live under /etc/decnet/ (root-owned, 0600 keys) — this is a root
+# daemon's data, not a user's. The baked INI's `agent-dir`/`updater-dir`
+# point at these paths.
+for f in ca.crt worker.crt worker.key; do
+ install -Dm0600 -o root -g root \
+ "home/.decnet/agent/$f" "/etc/decnet/agent/$f"
+done
+chmod 0755 /etc/decnet/agent
+
+WITH_UPDATER="{{ with_updater }}"
+if [[ "$WITH_UPDATER" == "true" && -d home/.decnet/updater ]]; then
+ for f in ca.crt updater.crt updater.key; do
+ install -Dm0600 -o root -g root \
+ "home/.decnet/updater/$f" "/etc/decnet/updater/$f"
+ done
+ chmod 0755 /etc/decnet/updater
+fi
+
+# Guarantee the pip-installed entrypoint is executable (some setuptools+editable
+# combos drop it with mode 0644) and expose it on PATH.
+chmod 0755 "$VENV_DIR/bin/decnet"
+ln -sf "$VENV_DIR/bin/decnet" /usr/local/bin/decnet
+
+echo "[DECNET] installing systemd units..."
+for unit in \
+ decnet-agent decnet-forwarder decnet-engine \
+ decnet-collector decnet-prober decnet-sniffer; do
+ install -Dm0644 "etc/systemd/system/${unit}.service" "/etc/systemd/system/${unit}.service"
+done
+if [[ "$WITH_UPDATER" == "true" ]]; then
+ install -Dm0644 etc/systemd/system/decnet-updater.service /etc/systemd/system/decnet-updater.service
+fi
+systemctl daemon-reload
+
+# Agent + forwarder are the control plane; collector/prober/sniffer
+# are the per-host microservices that used to require `decnet deploy` to
+# auto-spawn. With systemd units they come up at boot and auto-restart.
+ACTIVE_UNITS=(
+ decnet-agent.service decnet-forwarder.service
+ decnet-collector.service decnet-prober.service
+ decnet-sniffer.service
+)
+if [[ "$WITH_UPDATER" == "true" ]]; then
+ ACTIVE_UNITS+=(decnet-updater.service)
+fi
+systemctl enable --now "${ACTIVE_UNITS[@]}"
+
+echo "[DECNET] agent {{ agent_name }} enrolled -> {{ master_host }}. Units: ${ACTIVE_UNITS[*]} active."
diff --git a/decnet_web/src/App.tsx b/decnet_web/src/App.tsx
index 8748ef2..50bdefc 100644
--- a/decnet_web/src/App.tsx
+++ b/decnet_web/src/App.tsx
@@ -6,18 +6,37 @@ import Dashboard from './components/Dashboard';
import DeckyFleet from './components/DeckyFleet';
import LiveLogs from './components/LiveLogs';
import Attackers from './components/Attackers';
+import AttackerDetail from './components/AttackerDetail';
import Config from './components/Config';
import Bounty from './components/Bounty';
+import RemoteUpdates from './components/RemoteUpdates';
+import SwarmHosts from './components/SwarmHosts';
+import AgentEnrollment from './components/AgentEnrollment';
+
+function isTokenValid(token: string): boolean {
+ try {
+ const payload = JSON.parse(atob(token.split('.')[1].replace(/-/g, '+').replace(/_/g, '/')));
+ return typeof payload.exp === 'number' && payload.exp * 1000 > Date.now();
+ } catch {
+ return false;
+ }
+}
+
+function getValidToken(): string | null {
+ const stored = localStorage.getItem('token');
+ if (stored && isTokenValid(stored)) return stored;
+ if (stored) localStorage.removeItem('token');
+ return null;
+}
function App() {
- const [token, setToken] = useState(localStorage.getItem('token'));
+ const [token, setToken] = useState(getValidToken);
const [searchQuery, setSearchQuery] = useState('');
useEffect(() => {
- const savedToken = localStorage.getItem('token');
- if (savedToken) {
- setToken(savedToken);
- }
+ const onAuthLogout = () => setToken(null);
+ window.addEventListener('auth:logout', onAuthLogout);
+ return () => window.removeEventListener('auth:logout', onAuthLogout);
}, []);
const handleLogin = (newToken: string) => {
@@ -46,7 +65,11 @@ function App() {
} />
} />
} />
+ } />
} />
+ } />
+ } />
+ } />
} />
diff --git a/decnet_web/src/components/AgentEnrollment.tsx b/decnet_web/src/components/AgentEnrollment.tsx
new file mode 100644
index 0000000..f538416
--- /dev/null
+++ b/decnet_web/src/components/AgentEnrollment.tsx
@@ -0,0 +1,188 @@
+import React, { useEffect, useRef, useState } from 'react';
+import api from '../utils/api';
+import './Dashboard.css';
+import './Swarm.css';
+import { UserPlus, Copy, RotateCcw, Check, AlertTriangle } from 'lucide-react';
+
+interface BundleResult {
+ token: string;
+ host_uuid: string;
+ command: string;
+ expires_at: string;
+}
+
+const AgentEnrollment: React.FC = () => {
+ const [masterHost, setMasterHost] = useState(window.location.hostname);
+ const [agentName, setAgentName] = useState('');
+ const [withUpdater, setWithUpdater] = useState(true);
+ const [useIpvlan, setUseIpvlan] = useState(false);
+ const [servicesIni, setServicesIni] = useState(null);
+ const [servicesIniName, setServicesIniName] = useState(null);
+ const [submitting, setSubmitting] = useState(false);
+ const [error, setError] = useState(null);
+ const [result, setResult] = useState(null);
+ const [copied, setCopied] = useState(false);
+ const [now, setNow] = useState(Date.now());
+ const fileRef = useRef(null);
+
+ useEffect(() => {
+ const t = setInterval(() => setNow(Date.now()), 1000);
+ return () => clearInterval(t);
+ }, []);
+
+ const handleFile = (e: React.ChangeEvent) => {
+ const f = e.target.files?.[0];
+ if (!f) {
+ setServicesIni(null);
+ setServicesIniName(null);
+ return;
+ }
+ const reader = new FileReader();
+ reader.onload = () => {
+ setServicesIni(String(reader.result));
+ setServicesIniName(f.name);
+ };
+ reader.readAsText(f);
+ };
+
+ const reset = () => {
+ setResult(null);
+ setError(null);
+ setAgentName('');
+ setWithUpdater(true);
+ setUseIpvlan(false);
+ setServicesIni(null);
+ setServicesIniName(null);
+ setCopied(false);
+ if (fileRef.current) fileRef.current.value = '';
+ };
+
+ const submit = async (e: React.FormEvent) => {
+ e.preventDefault();
+ setSubmitting(true);
+ setError(null);
+ try {
+ const res = await api.post('/swarm/enroll-bundle', {
+ master_host: masterHost,
+ agent_name: agentName,
+ with_updater: withUpdater,
+ use_ipvlan: useIpvlan,
+ services_ini: servicesIni,
+ });
+ setResult(res.data);
+ } catch (err: any) {
+ setError(err?.response?.data?.detail || 'Enrollment bundle creation failed');
+ } finally {
+ setSubmitting(false);
+ }
+ };
+
+ const copyCmd = async () => {
+ if (!result) return;
+ await navigator.clipboard.writeText(result.command);
+ setCopied(true);
+ setTimeout(() => setCopied(false), 2000);
+ };
+
+ const nameOk = /^[a-z0-9][a-z0-9-]{0,62}$/.test(agentName);
+
+ const remainingSecs = result ? Math.max(0, Math.floor((new Date(result.expires_at).getTime() - now) / 1000)) : 0;
+ const mm = Math.floor(remainingSecs / 60).toString().padStart(2, '0');
+ const ss = (remainingSecs % 60).toString().padStart(2, '0');
+
+ return (
+
+
+
Agent Enrollment
+
+
+ {!result ? (
+
+
+ Generates a one-shot bootstrap URL valid for 5 minutes. Paste the command into a
+ root shell on the target worker VM — no manual cert shuffling required.
+
+
+
+ ) : (
+
+
Paste this on the new worker (as root):
+
{result.command}
+
+
+
+
+
+ Expires in {mm}:{ss} — one-shot, single download. Host UUID:{' '}
+ {result.host_uuid}
+
+ {remainingSecs === 0 && (
+
+ This bundle has expired. Generate another.
+
+ )}
+
+ )}
+
+ );
+};
+
+export default AgentEnrollment;
diff --git a/decnet_web/src/components/ArtifactDrawer.tsx b/decnet_web/src/components/ArtifactDrawer.tsx
new file mode 100644
index 0000000..491ec9c
--- /dev/null
+++ b/decnet_web/src/components/ArtifactDrawer.tsx
@@ -0,0 +1,186 @@
+import React, { useState } from 'react';
+import { X, Download, AlertTriangle } from 'lucide-react';
+import api from '../utils/api';
+
+interface ArtifactDrawerProps {
+ decky: string;
+ storedAs: string;
+ fields: Record;
+ onClose: () => void;
+}
+
+// Bulky nested structures are shipped as one base64-encoded JSON blob in
+// `meta_json_b64` (see templates/ssh/emit_capture.py). All summary fields
+// arrive as top-level SD params already present in `fields`.
+function decodeMeta(fields: Record): Record | null {
+ const b64 = fields.meta_json_b64;
+ if (typeof b64 !== 'string' || !b64) return null;
+ try {
+ const json = atob(b64);
+ return JSON.parse(json);
+ } catch (err) {
+ console.error('artifact: failed to decode meta_json_b64', err);
+ return null;
+ }
+}
+
+const Row: React.FC<{ label: string; value: React.ReactNode }> = ({ label, value }) => (
+