From 0b1a17b4ebdb8c6e384c95853403619b5f9ad290 Mon Sep 17 00:00:00 2001 From: anti Date: Mon, 27 Apr 2026 22:55:48 -0400 Subject: [PATCH] fix(agent): pass --always-recreate-deps so service netns shares stay fresh MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Decky service containers join their base via `network_mode: container:` and Docker binds that share at service start time. If `docker compose up` recreates a base (e.g. ports: changes after a forwards_l3 toggle) but decides services are unchanged, services keep a stale FD into the destroyed namespace and end up with only `lo` — so external traffic hits a closed port on the live base and gets RST. Hit live on the first VPS deploy: external SSH to the dmz-gateway was refused while sshd was listening, because base and service netns inodes had drifted apart. `--always-recreate-deps` makes compose rebuild every dependent whenever its base is recreated, removing the race entirely. --- decnet/agent/topology_ops.py | 15 +++- tests/agent/__init__.py | 0 .../test_topology_apply_recreate_deps.py | 79 +++++++++++++++++++ 3 files changed, 93 insertions(+), 1 deletion(-) create mode 100644 tests/agent/__init__.py create mode 100644 tests/agent/test_topology_apply_recreate_deps.py diff --git a/decnet/agent/topology_ops.py b/decnet/agent/topology_ops.py index 6db170ec..f8f156f2 100644 --- a/decnet/agent/topology_ops.py +++ b/decnet/agent/topology_ops.py @@ -122,7 +122,20 @@ async def apply( client, net_name, lan["subnet"], internal=internal ) write_topology_compose(hydrated, compose_path) - _compose_with_retry("up", "--build", "-d", compose_file=compose_path) + # ``--always-recreate-deps`` keeps service containers' netns shares + # fresh: every decky service joins its base's netns via + # ``network_mode: container:``, and that share is bound at + # service start time. If a base is recreated (e.g. when ``ports:`` + # changes after toggling ``forwards_l3``) but compose decides the + # services are unchanged, the services keep a stale netns FD + # pointing at the destroyed base — they end up in an empty + # namespace with only ``lo``, and external traffic hits a closed + # port on the live base. Forcing dependents to recreate alongside + # the base is the cheapest way to make this race impossible. + _compose_with_retry( + "up", "--build", "-d", "--always-recreate-deps", + compose_file=compose_path, + ) await asyncio.to_thread(_materialise) diff --git a/tests/agent/__init__.py b/tests/agent/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/agent/test_topology_apply_recreate_deps.py b/tests/agent/test_topology_apply_recreate_deps.py new file mode 100644 index 00000000..acc6cef3 --- /dev/null +++ b/tests/agent/test_topology_apply_recreate_deps.py @@ -0,0 +1,79 @@ +"""apply() must pass --always-recreate-deps to docker compose up. + +Regression guard for the stale-netns-share bug: deckie service containers +join the base via ``network_mode: container:`` and Docker binds the +share at service start. When compose recreates the base (e.g. ``ports:`` +changed after toggling ``forwards_l3``) but decides services are +unchanged, the services keep a stale FD into the destroyed netns and +end up with only ``lo``. Forcing dependent recreation removes the race. + +Found on first VPS deploy 2026-04-28: external SSH to the dmz-gateway +RST'd because the service's netns inode (37090) didn't match the base's +(41477). After ``compose down`` + ``up`` the inodes matched and traffic +flowed; this test guarantees agent re-applies do the same in one shot. +""" +from __future__ import annotations + +import asyncio +import pathlib +from typing import Any + +import pytest + +from decnet.agent import topology_ops as _ops + + +class _FakeStore: + def current(self) -> None: + return None + + def put(self, *a: Any, **kw: Any) -> None: + pass + + def clear(self, *a: Any, **kw: Any) -> None: + pass + + +def test_apply_passes_always_recreate_deps_to_compose( + monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path, +) -> None: + captured: list[tuple[str, ...]] = [] + + def _fake_compose(*args: str, compose_file: pathlib.Path, **kw: Any) -> None: + captured.append(args) + + monkeypatch.setattr(_ops, "_compose_with_retry", _fake_compose) + monkeypatch.setattr(_ops, "create_bridge_network", lambda *a, **k: None) + monkeypatch.setattr(_ops, "write_topology_compose", lambda *a, **k: None) + monkeypatch.setattr(_ops, "_validate_topology", lambda *_: []) + monkeypatch.setattr(_ops, "_validation_errors", lambda _: []) + monkeypatch.setattr(_ops, "canonical_hash", lambda *_: "deadbeef") + + class _StubDockerClient: + @staticmethod + def from_env() -> "_StubDockerClient": + return _StubDockerClient() + + monkeypatch.setattr(_ops, "docker", _StubDockerClient()) + + hydrated = { + "topology": {"id": "11111111-2222-3333-4444-555555555555"}, + "lans": [{"name": "dmz", "subnet": "10.0.0.0/24", "is_dmz": True}], + "deckies": [], + "edges": [], + } + + asyncio.get_event_loop_policy().new_event_loop().run_until_complete( + _ops.apply(hydrated, "deadbeef", _FakeStore()) + ) + + assert captured, "compose was never invoked" + args = captured[-1] + assert "up" in args, f"expected `up` in compose args, got {args}" + assert "--always-recreate-deps" in args, ( + "agent must pass --always-recreate-deps so service containers' " + "netns shares stay fresh when their base is recreated. Without " + "this flag, services end up with stale FDs into destroyed " + "namespaces and external traffic hits closed ports on the live " + f"base. Got: {args}" + )