From 51012eaa671afff1ae6d006aa4d83448a12aa405 Mon Sep 17 00:00:00 2001 From: anti Date: Fri, 24 Apr 2026 00:29:49 -0400 Subject: [PATCH] feat(init): decouple venv from install_dir; fail loud if no venv exists MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The systemd unit templates hardcoded {{ install_dir }}/venv/bin/decnet. On production hosts enroll_bootstrap.sh creates exactly that path, so it worked. On dev boxes where the operator runs `sudo decnet init` against a source checkout with a differently-named venv (.venv, .311, .312), every decnet-*.service looped forever in auto-restart with: Failed at step EXEC spawning .../venv/bin/decnet: No such file or directory Templates now use {{ venv_dir }} as an independent Jinja2 var. `decnet init` adds --venv-dir (explicit override); otherwise it autodetects, in order: 1. $VIRTUAL_ENV (only when inside --install-dir, so a user-home venv never gets baked into a root-owned unit), 2. {install_dir}/venv (production default; what enroll_bootstrap.sh creates), 3. {install_dir}/{.venv,.311,.312,.313} (common dev conventions). Init aborts before any file writes if nothing resolves — an operator-friendly error beats journalctl spam on every unit restart. python3-venv doesn't set a persistent system variable — $VIRTUAL_ENV lives in the activated shell only — so this has to be decided and baked in at init time; there's no way for systemd to "inherit the current venv" at unit start. Test mode (--prefix) skips venv validation so the existing test suite doesn't need to stub up a venv tree per case. 
--- decnet/cli/init.py | 94 ++++++++++++++++++++++++++++-- deploy/decnet-agent.service.j2 | 2 +- deploy/decnet-api.service.j2 | 2 +- deploy/decnet-bus.service.j2 | 2 +- deploy/decnet-collector.service.j2 | 2 +- deploy/decnet-forwarder.service.j2 | 2 +- deploy/decnet-listener.service.j2 | 2 +- deploy/decnet-mutator.service.j2 | 2 +- deploy/decnet-prober.service.j2 | 2 +- deploy/decnet-profiler.service.j2 | 2 +- deploy/decnet-sniffer.service.j2 | 2 +- deploy/decnet-swarmctl.service.j2 | 2 +- deploy/decnet-updater.service.j2 | 2 +- deploy/decnet-web.service.j2 | 2 +- 14 files changed, 103 insertions(+), 17 deletions(-) diff --git a/decnet/cli/init.py b/decnet/cli/init.py index 323b3eca..ac96ef73 100644 --- a/decnet/cli/init.py +++ b/decnet/cli/init.py @@ -21,7 +21,7 @@ import shutil import subprocess # nosec B404 import sys from pathlib import Path -from typing import Callable, List +from typing import Callable, List, Optional import typer from jinja2 import Environment, FileSystemLoader, StrictUndefined @@ -275,13 +275,75 @@ def _write_rendered_if_changed( return "ok" +def _resolve_venv_dir(install_dir: str, explicit: str | None) -> str: + """Pick the virtualenv systemd units should ExecStart out of. + + Priority: + 1. ``--venv-dir`` flag (explicit; absolute path required). + 2. ``VIRTUAL_ENV`` env var, but only when it lives under + ``install_dir`` (refuse to bake /home/user/.venv into a system + service — that directory is user-owned and may vanish). + 3. ``{install_dir}/venv`` — what ``enroll_bootstrap.sh`` creates + on fresh agents; the production default. + 4. First hit from a short list of dev-box conventions under + ``install_dir``: ``.venv``, ``.311``, ``.312``, ``.313``. + + Raises RuntimeError with an operator-friendly message if none of + those resolve to a directory containing ``bin/decnet``. 
Failing loud + at init time beats systemd spamming journalctl with + 'Failed at step EXEC spawning .../venv/bin/decnet: No such file or + directory' on every auto-restart. + """ + install_path = Path(install_dir) + + candidates: list[Path] = [] + if explicit: + if not explicit.startswith("/"): + raise RuntimeError( + f"--venv-dir must be an absolute path, got {explicit!r}" + ) + candidates.append(Path(explicit)) + else: + virtual_env = os.environ.get("VIRTUAL_ENV") + if virtual_env: + ve_path = Path(virtual_env) + try: + ve_path.relative_to(install_path) + candidates.append(ve_path) + except ValueError: + # VIRTUAL_ENV lives outside install_dir — don't bake a + # user-home venv into a root-owned systemd unit. + pass + candidates.append(install_path / "venv") + for name in (".venv", ".311", ".312", ".313"): + candidates.append(install_path / name) + + for cand in candidates: + if (cand / "bin" / "decnet").is_file(): + return str(cand) + + searched = ", ".join(str(c) for c in candidates) + raise RuntimeError( + "Could not find a DECNET venv. Create one first (e.g. " + f"`python -m venv {install_path}/venv && " + f"{install_path}/venv/bin/pip install -e {install_path}[dev]`) " + "or pass --venv-dir. 
Searched: " + searched + ) + + def _install_units( - deploy: Path, systemd_dir: Path, *, install_dir: str, force: bool, dry_run: bool + deploy: Path, + systemd_dir: Path, + *, + install_dir: str, + venv_dir: str, + force: bool, + dry_run: bool, ) -> str: """Render decnet-*.service.j2 → systemd_dir/decnet-*.service, and copy the static decnet.target (no templating needed — it has no install path references).""" - context = {"install_dir": install_dir} + context = {"install_dir": install_dir, "venv_dir": venv_dir} templates = sorted(deploy.glob("decnet-*.service.j2")) static = [deploy / "decnet.target"] @@ -457,6 +519,14 @@ def register(app: typer.Typer) -> None: "into every systemd unit via Jinja2 and used as the " "decnet user's home directory.", ), + venv_dir: Optional[str] = typer.Option( + None, "--venv-dir", + help="Absolute path to the Python venv systemd should " + "ExecStart from. If omitted, auto-detected in order: " + "$VIRTUAL_ENV (if under --install-dir), " + "{install-dir}/venv, then {install-dir}/{.venv,.311," + ".312,.313}. Init aborts if none exists.", + ), prefix: str = typer.Option( "", "--prefix", hidden=True, help="Filesystem prefix for tests (e.g. tmp_path). Empty = real root.", @@ -604,6 +674,21 @@ def register(app: typer.Typer) -> None: console.print(f"[red]decnet init: {exc}[/]") raise typer.Exit(1) from exc + # Resolve venv BEFORE any file writes — fails loud if the + # operator hasn't created one yet, instead of shipping broken + # systemd units that journalctl spams forever. Skipped under + # --prefix (test mode) because the test harness doesn't build a + # real venv and the rendered string is asserted on directly. 
+ if prefix: + resolved_venv = venv_dir or f"{install_dir}/venv" + else: + try: + resolved_venv = _resolve_venv_dir(install_dir, venv_dir) + except RuntimeError as exc: + console.print(f"[red]decnet init: {exc}[/]") + raise typer.Exit(1) from exc + console.print(f"[dim]using venv: {resolved_venv}[/]") + dirs = [ (pfx / _install_rel, 0o755, user, group), (pfx / "var/lib/decnet", 0o750, user, group), @@ -640,7 +725,8 @@ def register(app: typer.Typer) -> None: "install systemd units", lambda: _install_units( deploy, systemd_dir, - install_dir=install_dir, force=force, dry_run=dry_run, + install_dir=install_dir, venv_dir=resolved_venv, + force=force, dry_run=dry_run, ), ) _step( diff --git a/deploy/decnet-agent.service.j2 b/deploy/decnet-agent.service.j2 index 8335d38e..b09a674c 100644 --- a/deploy/decnet-agent.service.j2 +++ b/deploy/decnet-agent.service.j2 @@ -13,7 +13,7 @@ Group=decnet SupplementaryGroups=docker WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet agent --host 0.0.0.0 --port 8765 --agent-dir /etc/decnet/agent +ExecStart={{ venv_dir }}/bin/decnet agent --host 0.0.0.0 --port 8765 --agent-dir /etc/decnet/agent # MACVLAN/IPVLAN management + scapy raw sockets. Granted via ambient caps so # the process starts unprivileged and keeps only these two bits. diff --git a/deploy/decnet-api.service.j2 b/deploy/decnet-api.service.j2 index 4b324ed6..2f75dcd0 100644 --- a/deploy/decnet-api.service.j2 +++ b/deploy/decnet-api.service.j2 @@ -13,7 +13,7 @@ Group=decnet SupplementaryGroups=docker WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet api +ExecStart={{ venv_dir }}/bin/decnet api # MACVLAN/IPVLAN setup runs from the API lifespan when the embedded sniffer is on. 
CapabilityBoundingSet=CAP_NET_ADMIN CAP_NET_RAW diff --git a/deploy/decnet-bus.service.j2 b/deploy/decnet-bus.service.j2 index 9222eda1..e3301b33 100644 --- a/deploy/decnet-bus.service.j2 +++ b/deploy/decnet-bus.service.j2 @@ -16,7 +16,7 @@ EnvironmentFile=-{{ install_dir }}/.env.local # connect. RuntimeDirectory=decnet RuntimeDirectoryMode=0755 -ExecStart={{ install_dir }}/venv/bin/decnet bus \ +ExecStart={{ venv_dir }}/bin/decnet bus \ --socket /run/decnet/bus.sock \ --group decnet diff --git a/deploy/decnet-collector.service.j2 b/deploy/decnet-collector.service.j2 index ef59bea2..35d36452 100644 --- a/deploy/decnet-collector.service.j2 +++ b/deploy/decnet-collector.service.j2 @@ -13,7 +13,7 @@ Group=decnet SupplementaryGroups=docker WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet collect +ExecStart={{ venv_dir }}/bin/decnet collect # No privileged network operations. CapabilityBoundingSet= diff --git a/deploy/decnet-forwarder.service.j2 b/deploy/decnet-forwarder.service.j2 index 2d4fd29a..d0485a20 100644 --- a/deploy/decnet-forwarder.service.j2 +++ b/deploy/decnet-forwarder.service.j2 @@ -15,7 +15,7 @@ EnvironmentFile=-{{ install_dir }}/.env.local # Replace with the master's LAN address or hostname. The agent # cert bundle at /etc/decnet/agent is reused — the forwarder presents the same # worker identity when it connects to the master's listener. 
-ExecStart={{ install_dir }}/venv/bin/decnet forwarder \ +ExecStart={{ venv_dir }}/bin/decnet forwarder \ --log-file /var/log/decnet/decnet.log \ --master-host ${DECNET_SWARM_MASTER_HOST} \ --master-port 6514 \ diff --git a/deploy/decnet-listener.service.j2 b/deploy/decnet-listener.service.j2 index ff615390..031e1fa2 100644 --- a/deploy/decnet-listener.service.j2 +++ b/deploy/decnet-listener.service.j2 @@ -12,7 +12,7 @@ WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local # Binds 0.0.0.0:6514 so workers across the LAN can connect. 6514 is not a # privileged port (≥1024), so no CAP_NET_BIND_SERVICE is required. -ExecStart={{ install_dir }}/venv/bin/decnet listener \ +ExecStart={{ venv_dir }}/bin/decnet listener \ --host 0.0.0.0 --port 6514 \ --ca-dir /etc/decnet/ca \ --log-path /var/log/decnet/master.log \ diff --git a/deploy/decnet-mutator.service.j2 b/deploy/decnet-mutator.service.j2 index b4227ddb..5ef30e05 100644 --- a/deploy/decnet-mutator.service.j2 +++ b/deploy/decnet-mutator.service.j2 @@ -13,7 +13,7 @@ Group=decnet SupplementaryGroups=docker WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet mutate --watch +ExecStart={{ venv_dir }}/bin/decnet mutate --watch CapabilityBoundingSet= AmbientCapabilities= diff --git a/deploy/decnet-prober.service.j2 b/deploy/decnet-prober.service.j2 index cbeab44a..e16c9b24 100644 --- a/deploy/decnet-prober.service.j2 +++ b/deploy/decnet-prober.service.j2 @@ -10,7 +10,7 @@ User=decnet Group=decnet WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet probe +ExecStart={{ venv_dir }}/bin/decnet probe # TCP connect probes only — no raw sockets required. 
CapabilityBoundingSet= diff --git a/deploy/decnet-profiler.service.j2 b/deploy/decnet-profiler.service.j2 index d4a7ddba..d5abd1e2 100644 --- a/deploy/decnet-profiler.service.j2 +++ b/deploy/decnet-profiler.service.j2 @@ -10,7 +10,7 @@ User=decnet Group=decnet WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet profiler +ExecStart={{ venv_dir }}/bin/decnet profiler CapabilityBoundingSet= AmbientCapabilities= diff --git a/deploy/decnet-sniffer.service.j2 b/deploy/decnet-sniffer.service.j2 index 796863fe..971ae147 100644 --- a/deploy/decnet-sniffer.service.j2 +++ b/deploy/decnet-sniffer.service.j2 @@ -10,7 +10,7 @@ User=decnet Group=decnet WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet sniffer +ExecStart={{ venv_dir }}/bin/decnet sniffer # scapy needs raw packet access on the MACVLAN host interface. CapabilityBoundingSet=CAP_NET_RAW diff --git a/deploy/decnet-swarmctl.service.j2 b/deploy/decnet-swarmctl.service.j2 index dcfdd259..2c0efb4c 100644 --- a/deploy/decnet-swarmctl.service.j2 +++ b/deploy/decnet-swarmctl.service.j2 @@ -12,7 +12,7 @@ WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local # Default bind is loopback — the controller is a master-local orchestrator # reached by the CLI and the web dashboard, not by workers. -ExecStart={{ install_dir }}/venv/bin/decnet swarmctl --host 127.0.0.1 --port 8770 +ExecStart={{ venv_dir }}/bin/decnet swarmctl --host 127.0.0.1 --port 8770 # No special capabilities — the controller issues mTLS certs and talks to # workers over TCP on unprivileged ports. 
diff --git a/deploy/decnet-updater.service.j2 b/deploy/decnet-updater.service.j2 index 1221bef6..7e06eb0a 100644 --- a/deploy/decnet-updater.service.j2 +++ b/deploy/decnet-updater.service.j2 @@ -12,7 +12,7 @@ User=decnet Group=decnet WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet updater \ +ExecStart={{ venv_dir }}/bin/decnet updater \ --host 0.0.0.0 --port 8766 \ --updater-dir /etc/decnet/updater \ --install-dir {{ install_dir }} \ diff --git a/deploy/decnet-web.service.j2 b/deploy/decnet-web.service.j2 index 313d9e25..8e852498 100644 --- a/deploy/decnet-web.service.j2 +++ b/deploy/decnet-web.service.j2 @@ -10,7 +10,7 @@ User=decnet Group=decnet WorkingDirectory={{ install_dir }} EnvironmentFile=-{{ install_dir }}/.env.local -ExecStart={{ install_dir }}/venv/bin/decnet web +ExecStart={{ venv_dir }}/bin/decnet web # Uncomment if you bind the dashboard to a privileged port (80/443): # CapabilityBoundingSet=CAP_NET_BIND_SERVICE