feat(deploy): systemd units w/ capability-based hardening; updater restarts agent via systemctl

Add deploy/ unit files for every DECNET daemon (agent, updater, api, web,
swarmctl, listener, forwarder). All run as User=decnet with NoNewPrivileges,
ProtectSystem, PrivateTmp, LockPersonality; AmbientCapabilities=CAP_NET_ADMIN
CAP_NET_RAW is granted only to the agent and api units (MACVLAN/scapy); every
other unit runs with an empty capability set. Existing api/web units migrated
to /opt/decnet layout and the same hardening stanza.

Make the updater's _spawn_agent systemd-aware: under systemd (detected via
INVOCATION_ID + systemctl on PATH), `systemctl restart decnet-agent.service`
replaces the Popen path so the new agent inherits the unit's ambient caps
instead of the updater's empty set. _stop_agent becomes a no-op in that mode
to avoid racing systemctl's own stop phase.

Tests cover the dispatcher branch selection, MainPID parsing, and the
systemd no-op stop.
This commit is contained in:
2026-04-19 00:44:06 -04:00
parent 40d3e86e55
commit f5a5fec607
9 changed files with 381 additions and 19 deletions

View File

@@ -208,11 +208,56 @@ def _run_pip(
)
def _spawn_agent(install_dir: pathlib.Path) -> int:
"""Launch ``decnet agent --daemon`` using the current-symlinked venv.
AGENT_SYSTEMD_UNIT = "decnet-agent.service"
Returns the new PID. Monkeypatched in tests.
def _systemd_available() -> bool:
    """Report whether the agent should be managed through systemd.

    Two independent signals must both be present:

    * ``INVOCATION_ID`` is set to a non-empty value in the environment
      (systemd exports it for every unit it starts), and
    * ``systemctl`` can be resolved on ``PATH``.

    Either signal on its own is not trusted: the variable can be forged, and
    the binary may be installed on a host that boots with another init system.

    Returns:
        ``True`` only when both conditions hold, ``False`` otherwise.
    """
    # Resolved at call time (not module import time) so the lookup always goes
    # through whatever ``shutil.which`` currently is.
    import shutil

    invocation_marker = os.environ.get("INVOCATION_ID")
    # ``and`` short-circuits: PATH is only searched when the marker is present.
    return bool(invocation_marker) and shutil.which("systemctl") is not None
def _spawn_agent(install_dir: pathlib.Path) -> int:
    """Start (or restart) the agent and return the PID reported by the launcher.

    The launch strategy depends on how the updater itself is running:

    * Under systemd, ``decnet-agent.service`` is restarted through
      ``systemctl`` so the new process receives the ambient capabilities
      declared on that unit (CAP_NET_ADMIN, CAP_NET_RAW). A
      ``subprocess.Popen`` from inside the updater unit would instead make the
      agent a child of the updater, inheriting the updater's (empty)
      capability set, and it would come up without the caps needed to run
      MACVLAN/scapy.
    * Off systemd (dev boxes, manual starts), the agent is launched directly
      with Popen.

    Args:
        install_dir: Installation root handed through to the chosen launcher.

    Returns:
        Whatever PID the selected launcher returns.
    """
    # Module-level names are looked up on every call, so replacing either
    # launcher (e.g. in tests) takes effect immediately.
    launcher = (
        _spawn_agent_via_systemd
        if _systemd_available()
        else _spawn_agent_via_popen
    )
    return launcher(install_dir)
def _spawn_agent_via_systemd(install_dir: pathlib.Path) -> int:
    """Restart the agent unit with ``systemctl`` and return its ``MainPID``.

    Issues ``systemctl restart`` for ``AGENT_SYSTEMD_UNIT`` and then queries
    ``systemctl show --property=MainPID --value`` for the same unit. A
    non-zero PID is written to the file returned by ``_pid_file(install_dir)``;
    empty output is treated as ``0`` and nothing is written in that case.

    Args:
        install_dir: Installation root used to locate the PID file.

    Returns:
        The unit's MainPID as an int, or ``0`` when systemctl printed nothing
        (or printed ``0``).

    Raises:
        subprocess.CalledProcessError: If either ``systemctl`` invocation
            exits non-zero (both run with ``check=True``).
        ValueError: If the MainPID output is not an integer.
    """
    run_opts = {"check": True, "capture_output": True, "text": True}
    restart_cmd = ["systemctl", "restart", AGENT_SYSTEMD_UNIT]
    show_cmd = [
        "systemctl",
        "show",
        "--property=MainPID",
        "--value",
        AGENT_SYSTEMD_UNIT,
    ]

    subprocess.run(restart_cmd, **run_opts)  # nosec B603 B607
    shown = subprocess.run(show_cmd, **run_opts)  # nosec B603 B607

    main_pid = int(shown.stdout.strip() or "0")
    if main_pid != 0:
        _pid_file(install_dir).write_text(str(main_pid))
    return main_pid
def _spawn_agent_via_popen(install_dir: pathlib.Path) -> int:
decnet_bin = _shared_venv(install_dir) / "bin" / "decnet"
log_path = install_dir / "agent.spawn.log"
# cwd=install_dir so a persistent ``<install_dir>/.env.local`` gets
@@ -267,7 +312,13 @@ def _stop_agent(install_dir: pathlib.Path, grace: float = AGENT_RESTART_GRACE_S)
Prefers the PID recorded in ``agent.pid`` (processes we spawned) but
falls back to scanning /proc for any ``decnet agent`` so manually-started
agents are also restarted cleanly during an update.
Under systemd, stop is a no-op — ``_spawn_agent`` issues a single
``systemctl restart`` that handles stop and start atomically. Pre-stopping
would only race the restart's own stop phase.
"""
if _systemd_available():
return
pids: list[int] = []
pid_file = _pid_file(install_dir)
if pid_file.is_file():

View File

@@ -0,0 +1,41 @@
[Unit]
Description=DECNET Worker Agent (mTLS)
Documentation=https://github.com/4nt11/DECNET/wiki/SWARM-Mode
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
User=decnet
Group=decnet
# docker.sock is group-readable by 'docker'; the agent needs it for compose.
SupplementaryGroups=docker
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
ExecStart=/opt/decnet/venv/bin/decnet agent --host 0.0.0.0 --port 8765 --agent-dir /etc/decnet/agent
# MACVLAN/IPVLAN management + scapy raw sockets. Granted via ambient caps so
# the process starts unprivileged and keeps only these two bits.
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW
# Security Hardening
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
# /opt/decnet holds release slots + state; the agent reads them and writes its PID.
ReadWritePaths=/opt/decnet /var/log/decnet
Restart=on-failure
RestartSec=5
TimeoutStopSec=15
[Install]
WantedBy=multi-user.target

View File

@@ -1,19 +1,21 @@
[Unit]
Description=DECNET API Service
After=network.target docker.service
Documentation=https://github.com/4nt11/DECNET/wiki/REST-API-Reference
After=network-online.target docker.service
Wants=network-online.target
Requires=docker.service
[Service]
Type=simple
User=decnet
Group=decnet
WorkingDirectory=/path/to/DECNET
# Ensure environment is loaded from the .env file
EnvironmentFile=/path/to/DECNET/.env
# Use the virtualenv python to run the decnet api command
ExecStart=/path/to/DECNET/.venv/bin/decnet api
# docker.sock is group-readable by 'docker'; the API ingester tails container logs.
SupplementaryGroups=docker
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
ExecStart=/opt/decnet/venv/bin/decnet api
# Capabilities required to manage MACVLAN interfaces and network links without root
# MACVLAN/IPVLAN setup runs from the API lifespan when the embedded sniffer is on.
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW
AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW
@@ -21,9 +23,17 @@ AmbientCapabilities=CAP_NET_ADMIN CAP_NET_RAW
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
ReadWritePaths=/opt/decnet /var/log/decnet
Restart=on-failure
RestartSec=5
TimeoutStopSec=15
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,46 @@
[Unit]
Description=DECNET Syslog-over-TLS Forwarder (worker, RFC 5425)
Documentation=https://github.com/4nt11/DECNET/wiki/Logging-and-Syslog
After=network-online.target
Wants=network-online.target
# The forwarder can run independently of the agent — it only needs the local
# log file to exist and the master to be reachable.
[Service]
Type=simple
User=decnet
Group=decnet
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
# Set DECNET_SWARM_MASTER_HOST (e.g. in /opt/decnet/.env.local) to the master's
# LAN address or hostname; it is expanded into --master-host below. The agent
# cert bundle at /etc/decnet/agent is reused — the forwarder presents the same
# worker identity when it connects to the master's listener.
ExecStart=/opt/decnet/venv/bin/decnet forwarder \
--log-file /var/log/decnet/decnet.log \
--master-host ${DECNET_SWARM_MASTER_HOST} \
--master-port 6514 \
--agent-dir /etc/decnet/agent
# TLS client connection; no special capabilities.
CapabilityBoundingSet=
AmbientCapabilities=
# Security Hardening
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
# Reads the tailed log; writes a small byte-offset state file alongside it.
ReadWritePaths=/var/log/decnet
ReadOnlyPaths=/etc/decnet
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,43 @@
[Unit]
Description=DECNET Syslog-over-TLS Listener (master, RFC 5425)
Documentation=https://github.com/4nt11/DECNET/wiki/Logging-and-Syslog
After=network-online.target
Wants=network-online.target
[Service]
Type=simple
User=decnet
Group=decnet
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
# Binds 0.0.0.0:6514 so workers across the LAN can connect. 6514 is not a
# privileged port (≥1024), so no CAP_NET_BIND_SERVICE is required.
ExecStart=/opt/decnet/venv/bin/decnet listener \
--host 0.0.0.0 --port 6514 \
--ca-dir /etc/decnet/ca \
--log-path /var/log/decnet/master.log \
--json-path /var/log/decnet/master.json
# Pure TLS server; no privileged network operations.
CapabilityBoundingSet=
AmbientCapabilities=
# Security Hardening
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
# Writes forensic .log + parsed .json sinks; CA bundle is read-only.
ReadWritePaths=/var/log/decnet
ReadOnlyPaths=/etc/decnet
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,40 @@
[Unit]
Description=DECNET Swarm Controller (master)
Documentation=https://github.com/4nt11/DECNET/wiki/SWARM-Mode
After=network-online.target decnet-api.service
Wants=network-online.target
[Service]
Type=simple
User=decnet
Group=decnet
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
# Default bind is loopback — the controller is a master-local orchestrator
# reached by the CLI and the web dashboard, not by workers.
ExecStart=/opt/decnet/venv/bin/decnet swarmctl --host 127.0.0.1 --port 8770
# No special capabilities — the controller issues mTLS certs and talks to
# workers over TCP on unprivileged ports.
CapabilityBoundingSet=
AmbientCapabilities=
# Security Hardening
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
# Reads/writes the CA bundle and the master DB.
ReadWritePaths=/opt/decnet /var/log/decnet
ReadOnlyPaths=/etc/decnet
Restart=on-failure
RestartSec=5
[Install]
WantedBy=multi-user.target

View File

@@ -0,0 +1,49 @@
[Unit]
Description=DECNET Self-Updater (mTLS)
Documentation=https://github.com/4nt11/DECNET/wiki/Remote-Updates
After=network-online.target
Wants=network-online.target
# Deliberately NOT After=decnet-agent.service — the updater must come up even
# when the agent is broken, since that is exactly when it is most useful.
[Service]
Type=simple
User=decnet
Group=decnet
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
ExecStart=/opt/decnet/venv/bin/decnet updater \
--host 0.0.0.0 --port 8766 \
--updater-dir /etc/decnet/updater \
--install-dir /opt/decnet \
--agent-dir /etc/decnet/agent
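# Under systemd the updater does not signal or spawn the agent itself: it runs
# `systemctl restart decnet-agent.service`, so the new agent process picks up
# NET_ADMIN/NET_RAW from that unit's AmbientCapabilities and the updater needs
# no capabilities of its own. Same User=decnet means any direct signalling
# would also be allowed without CAP_KILL. An agent spawned as a direct child
# of the updater would instead inherit this unit's empty capability set.
# NOTE(review): restarting a system unit as User=decnet presumably requires a
# polkit rule authorising it — confirm one is shipped with these units.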
CapabilityBoundingSet=
AmbientCapabilities=
# Security Hardening
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
# Writes release slots, pip installs into venv, manages agent.pid.
ReadWritePaths=/opt/decnet /var/log/decnet
Restart=on-failure
RestartSec=5
# Self-update replaces the process image via os.execv; the new binary answers
# /health within 30 s. Give it headroom before systemd's own termination.
TimeoutStopSec=30
[Install]
WantedBy=multi-user.target

View File

@@ -1,27 +1,35 @@
[Unit]
Description=DECNET Web Dashboard Service
After=network.target decnet-api.service
Documentation=https://github.com/4nt11/DECNET/wiki/Web-Dashboard
After=network-online.target decnet-api.service
Wants=network-online.target
[Service]
Type=simple
User=decnet
Group=decnet
WorkingDirectory=/path/to/DECNET
# Ensure environment is loaded from the .env file
EnvironmentFile=/path/to/DECNET/.env
# Use the virtualenv python to run the decnet web command
ExecStart=/path/to/DECNET/.venv/bin/decnet web
WorkingDirectory=/opt/decnet
EnvironmentFile=-/opt/decnet/.env.local
ExecStart=/opt/decnet/venv/bin/decnet web
# The Web Dashboard service does not require network administration privileges.
# Enable the following lines if you wish to bind the Dashboard to a privileged port (e.g., 80 or 443)
# while still running as a non-root user.
# Uncomment if you bind the dashboard to a privileged port (80/443):
# CapabilityBoundingSet=CAP_NET_BIND_SERVICE
# AmbientCapabilities=CAP_NET_BIND_SERVICE
CapabilityBoundingSet=
AmbientCapabilities=
# Security Hardening
NoNewPrivileges=yes
ProtectSystem=full
ProtectHome=read-only
PrivateTmp=yes
ProtectKernelTunables=yes
ProtectKernelModules=yes
ProtectControlGroups=yes
RestrictSUIDSGID=yes
LockPersonality=yes
ReadWritePaths=/opt/decnet /var/log/decnet
ReadOnlyPaths=/etc/decnet
Restart=on-failure
RestartSec=5

View File

@@ -306,6 +306,7 @@ def test_stop_agent_falls_back_to_proc_scan_when_no_pidfile(
killed.append((pid, sig))
raise ProcessLookupError # pretend it already died after SIGTERM
monkeypatch.setattr(ex, "_systemd_available", lambda: False)
monkeypatch.setattr(ex, "_discover_agent_pids", lambda: [4242, 4243])
monkeypatch.setattr(ex.os, "kill", fake_kill)
@@ -315,3 +316,76 @@ def test_stop_agent_falls_back_to_proc_scan_when_no_pidfile(
import signal as _signal
assert (4242, _signal.SIGTERM) in killed
assert (4243, _signal.SIGTERM) in killed
def test_systemd_available_requires_invocation_id_and_systemctl(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Detection needs the INVOCATION_ID marker *and* a resolvable systemctl."""
    # No marker in the environment: must be rejected outright.
    monkeypatch.delenv("INVOCATION_ID", raising=False)
    assert ex._systemd_available() is False

    # Marker present: the outcome now follows whether systemctl resolves.
    monkeypatch.setenv("INVOCATION_ID", "abc")
    which_cases = (
        (None, False),
        ("/usr/bin/systemctl", True),
    )
    for resolved, expected in which_cases:
        monkeypatch.setattr("shutil.which", lambda _name, _found=resolved: _found)
        assert ex._systemd_available() is expected
def test_spawn_agent_dispatches_to_systemd_when_available(
    monkeypatch: pytest.MonkeyPatch,
    install_dir: pathlib.Path,
) -> None:
    """With systemd detected, only the systemctl launcher runs and its PID is returned."""
    seen_dirs: list[pathlib.Path] = []

    def fake_systemd_launcher(d: pathlib.Path) -> int:
        seen_dirs.append(d)
        return 999

    def forbidden_popen_launcher(d: pathlib.Path) -> int:
        pytest.fail("popen path taken")

    monkeypatch.setattr(ex, "_systemd_available", lambda: True)
    monkeypatch.setattr(ex, "_spawn_agent_via_systemd", fake_systemd_launcher)
    monkeypatch.setattr(ex, "_spawn_agent_via_popen", forbidden_popen_launcher)

    returned_pid = ex._spawn_agent(install_dir)

    assert returned_pid == 999
    assert seen_dirs == [install_dir]
def test_spawn_agent_dispatches_to_popen_when_not_systemd(
    monkeypatch: pytest.MonkeyPatch,
    install_dir: pathlib.Path,
) -> None:
    """Without systemd, the direct Popen launcher is used and its PID is returned."""

    def forbidden_systemd_launcher(d: pathlib.Path) -> int:
        pytest.fail("systemd path taken")

    def fake_popen_launcher(d: pathlib.Path) -> int:
        return 777

    monkeypatch.setattr(ex, "_systemd_available", lambda: False)
    monkeypatch.setattr(ex, "_spawn_agent_via_systemd", forbidden_systemd_launcher)
    monkeypatch.setattr(ex, "_spawn_agent_via_popen", fake_popen_launcher)

    returned_pid = ex._spawn_agent(install_dir)

    assert returned_pid == 777
def test_stop_agent_is_noop_under_systemd(
    monkeypatch: pytest.MonkeyPatch,
    install_dir: pathlib.Path,
) -> None:
    """Under systemd, stop is skipped — systemctl restart handles it atomically."""
    # A recorded PID exists, so a non-systemd stop would have something to kill.
    (install_dir / "agent.pid").write_text("12345")

    def forbidden_proc_scan():  # type: ignore[no-untyped-def]
        pytest.fail("scanned /proc")

    def forbidden_kill(*args, **kwargs):  # type: ignore[no-untyped-def]
        pytest.fail("sent signal")

    monkeypatch.setattr(ex, "_systemd_available", lambda: True)
    monkeypatch.setattr(ex, "_discover_agent_pids", forbidden_proc_scan)
    monkeypatch.setattr(ex.os, "kill", forbidden_kill)

    # Must return without raising and without touching either patched hook.
    ex._stop_agent(install_dir, grace=0.0)
def test_spawn_agent_via_systemd_records_main_pid(
    monkeypatch: pytest.MonkeyPatch,
    install_dir: pathlib.Path,
) -> None:
    """The systemd launcher restarts, then queries MainPID, and persists it to agent.pid."""
    issued: list[list[str]] = []

    class _Completed:
        """Minimal stand-in exposing only the ``stdout`` attribute that is read."""

        def __init__(self, stdout: str = "") -> None:
            self.stdout = stdout

    def fake_run(cmd, **kwargs):  # type: ignore[no-untyped-def]
        issued.append(cmd)
        # Only the ``systemctl show`` query produces output.
        return _Completed("4711\n" if "show" in cmd else "")

    monkeypatch.setattr(ex.subprocess, "run", fake_run)

    assert ex._spawn_agent_via_systemd(install_dir) == 4711
    assert (install_dir / "agent.pid").read_text() == "4711"

    # Order matters: restart first, MainPID lookup second.
    first_cmd = issued[0]
    second_cmd = issued[1]
    assert first_cmd[:2] == ["systemctl", "restart"]
    assert second_cmd[:2] == ["systemctl", "show"]