fix(agent): pass --always-recreate-deps so service netns shares stay fresh
Decky service containers join their base via `network_mode: container:<base>` and Docker binds that share at service start time. If `docker compose up` recreates a base (e.g. ports: changes after a forwards_l3 toggle) but decides services are unchanged, services keep a stale FD into the destroyed namespace and end up with only `lo` — so external traffic hits a closed port on the live base and gets RST. Hit live on the first VPS deploy: external SSH to the dmz-gateway was refused while sshd was listening, because base and service netns inodes had drifted apart. `--always-recreate-deps` makes compose rebuild every dependent whenever its base is recreated, removing the race entirely.
This commit is contained in:
@@ -122,7 +122,20 @@ async def apply(
|
||||
client, net_name, lan["subnet"], internal=internal
|
||||
)
|
||||
write_topology_compose(hydrated, compose_path)
|
||||
_compose_with_retry("up", "--build", "-d", compose_file=compose_path)
|
||||
# ``--always-recreate-deps`` keeps service containers' netns shares
|
||||
# fresh: every decky service joins its base's netns via
|
||||
# ``network_mode: container:<base>``, and that share is bound at
|
||||
# service start time. If a base is recreated (e.g. when ``ports:``
|
||||
# changes after toggling ``forwards_l3``) but compose decides the
|
||||
# services are unchanged, the services keep a stale netns FD
|
||||
# pointing at the destroyed base — they end up in an empty
|
||||
# namespace with only ``lo``, and external traffic hits a closed
|
||||
# port on the live base. Forcing dependents to recreate alongside
|
||||
# the base is the cheapest way to make this race impossible.
|
||||
_compose_with_retry(
|
||||
"up", "--build", "-d", "--always-recreate-deps",
|
||||
compose_file=compose_path,
|
||||
)
|
||||
|
||||
await asyncio.to_thread(_materialise)
|
||||
|
||||
|
||||
0
tests/agent/__init__.py
Normal file
0
tests/agent/__init__.py
Normal file
79
tests/agent/test_topology_apply_recreate_deps.py
Normal file
79
tests/agent/test_topology_apply_recreate_deps.py
Normal file
@@ -0,0 +1,79 @@
|
||||
"""apply() must pass --always-recreate-deps to docker compose up.
|
||||
|
||||
Regression guard for the stale-netns-share bug: deckie service containers
|
||||
join the base via ``network_mode: container:<base>`` and Docker binds the
|
||||
share at service start. When compose recreates the base (e.g. ``ports:``
|
||||
changed after toggling ``forwards_l3``) but decides services are
|
||||
unchanged, the services keep a stale FD into the destroyed netns and
|
||||
end up with only ``lo``. Forcing dependent recreation removes the race.
|
||||
|
||||
Found on first VPS deploy 2026-04-28: external SSH to the dmz-gateway
|
||||
RST'd because the service's netns inode (37090) didn't match the base's
|
||||
(41477). After ``compose down`` + ``up`` the inodes matched and traffic
|
||||
flowed; this test guarantees agent re-applies do the same in one shot.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
import pathlib
|
||||
from typing import Any
|
||||
|
||||
import pytest
|
||||
|
||||
from decnet.agent import topology_ops as _ops
|
||||
|
||||
|
||||
class _FakeStore:
|
||||
def current(self) -> None:
|
||||
return None
|
||||
|
||||
def put(self, *a: Any, **kw: Any) -> None:
|
||||
pass
|
||||
|
||||
def clear(self, *a: Any, **kw: Any) -> None:
|
||||
pass
|
||||
|
||||
|
||||
def test_apply_passes_always_recreate_deps_to_compose(
|
||||
monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path,
|
||||
) -> None:
|
||||
captured: list[tuple[str, ...]] = []
|
||||
|
||||
def _fake_compose(*args: str, compose_file: pathlib.Path, **kw: Any) -> None:
|
||||
captured.append(args)
|
||||
|
||||
monkeypatch.setattr(_ops, "_compose_with_retry", _fake_compose)
|
||||
monkeypatch.setattr(_ops, "create_bridge_network", lambda *a, **k: None)
|
||||
monkeypatch.setattr(_ops, "write_topology_compose", lambda *a, **k: None)
|
||||
monkeypatch.setattr(_ops, "_validate_topology", lambda *_: [])
|
||||
monkeypatch.setattr(_ops, "_validation_errors", lambda _: [])
|
||||
monkeypatch.setattr(_ops, "canonical_hash", lambda *_: "deadbeef")
|
||||
|
||||
class _StubDockerClient:
|
||||
@staticmethod
|
||||
def from_env() -> "_StubDockerClient":
|
||||
return _StubDockerClient()
|
||||
|
||||
monkeypatch.setattr(_ops, "docker", _StubDockerClient())
|
||||
|
||||
hydrated = {
|
||||
"topology": {"id": "11111111-2222-3333-4444-555555555555"},
|
||||
"lans": [{"name": "dmz", "subnet": "10.0.0.0/24", "is_dmz": True}],
|
||||
"deckies": [],
|
||||
"edges": [],
|
||||
}
|
||||
|
||||
asyncio.get_event_loop_policy().new_event_loop().run_until_complete(
|
||||
_ops.apply(hydrated, "deadbeef", _FakeStore())
|
||||
)
|
||||
|
||||
assert captured, "compose was never invoked"
|
||||
args = captured[-1]
|
||||
assert "up" in args, f"expected `up` in compose args, got {args}"
|
||||
assert "--always-recreate-deps" in args, (
|
||||
"agent must pass --always-recreate-deps so service containers' "
|
||||
"netns shares stay fresh when their base is recreated. Without "
|
||||
"this flag, services end up with stale FDs into destroyed "
|
||||
"namespaces and external traffic hits closed ports on the live "
|
||||
f"base. Got: {args}"
|
||||
)
|
||||
Reference in New Issue
Block a user