Files
DECNET/tests/agent/test_topology_apply_recreate_deps.py
anti 0b1a17b4eb fix(agent): pass --always-recreate-deps so service netns shares stay fresh
Decky service containers join their base via `network_mode:
container:<base>` and Docker binds that share at service start time. If
`docker compose up` recreates a base (e.g. ports: changes after a
forwards_l3 toggle) but decides services are unchanged, services keep
a stale FD into the destroyed namespace and end up with only `lo` — so
external traffic hits a closed port on the live base and gets RST.

Hit live on the first VPS deploy: external SSH to the dmz-gateway was
refused while sshd was listening, because base and service netns
inodes had drifted apart. `--always-recreate-deps` makes compose
rebuild every dependent whenever its base is recreated, removing the
race entirely.
2026-04-27 22:55:48 -04:00

80 lines
2.8 KiB
Python

"""apply() must pass --always-recreate-deps to docker compose up.
Regression guard for the stale-netns-share bug: deckie service containers
join the base via ``network_mode: container:<base>`` and Docker binds the
share at service start. When compose recreates the base (e.g. ``ports:``
changed after toggling ``forwards_l3``) but decides services are
unchanged, the services keep a stale FD into the destroyed netns and
end up with only ``lo``. Forcing dependent recreation removes the race.
Found on first VPS deploy 2026-04-28: external SSH to the dmz-gateway
RST'd because the service's netns inode (37090) didn't match the base's
(41477). After ``compose down`` + ``up`` the inodes matched and traffic
flowed; this test guarantees agent re-applies do the same in one shot.
"""
from __future__ import annotations
import asyncio
import pathlib
from typing import Any
import pytest
from decnet.agent import topology_ops as _ops
class _FakeStore:
def current(self) -> None:
return None
def put(self, *a: Any, **kw: Any) -> None:
pass
def clear(self, *a: Any, **kw: Any) -> None:
pass
def test_apply_passes_always_recreate_deps_to_compose(
monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path,
) -> None:
captured: list[tuple[str, ...]] = []
def _fake_compose(*args: str, compose_file: pathlib.Path, **kw: Any) -> None:
captured.append(args)
monkeypatch.setattr(_ops, "_compose_with_retry", _fake_compose)
monkeypatch.setattr(_ops, "create_bridge_network", lambda *a, **k: None)
monkeypatch.setattr(_ops, "write_topology_compose", lambda *a, **k: None)
monkeypatch.setattr(_ops, "_validate_topology", lambda *_: [])
monkeypatch.setattr(_ops, "_validation_errors", lambda _: [])
monkeypatch.setattr(_ops, "canonical_hash", lambda *_: "deadbeef")
class _StubDockerClient:
@staticmethod
def from_env() -> "_StubDockerClient":
return _StubDockerClient()
monkeypatch.setattr(_ops, "docker", _StubDockerClient())
hydrated = {
"topology": {"id": "11111111-2222-3333-4444-555555555555"},
"lans": [{"name": "dmz", "subnet": "10.0.0.0/24", "is_dmz": True}],
"deckies": [],
"edges": [],
}
asyncio.get_event_loop_policy().new_event_loop().run_until_complete(
_ops.apply(hydrated, "deadbeef", _FakeStore())
)
assert captured, "compose was never invoked"
args = captured[-1]
assert "up" in args, f"expected `up` in compose args, got {args}"
assert "--always-recreate-deps" in args, (
"agent must pass --always-recreate-deps so service containers' "
"netns shares stay fresh when their base is recreated. Without "
"this flag, services end up with stale FDs into destroyed "
"namespaces and external traffic hits closed ports on the live "
f"base. Got: {args}"
)