fix(agent): pass --always-recreate-deps so service netns shares stay fresh
Decky service containers join their base via `network_mode: container:<base>` and Docker binds that share at service start time. If `docker compose up` recreates a base (e.g. ports: changes after a forwards_l3 toggle) but decides services are unchanged, services keep a stale FD into the destroyed namespace and end up with only `lo` — so external traffic hits a closed port on the live base and gets RST. Hit live on the first VPS deploy: external SSH to the dmz-gateway was refused while sshd was listening, because base and service netns inodes had drifted apart. `--always-recreate-deps` makes compose rebuild every dependent whenever its base is recreated, removing the race entirely.
This commit is contained in:
@@ -122,7 +122,20 @@ async def apply(
|
|||||||
client, net_name, lan["subnet"], internal=internal
|
client, net_name, lan["subnet"], internal=internal
|
||||||
)
|
)
|
||||||
write_topology_compose(hydrated, compose_path)
|
write_topology_compose(hydrated, compose_path)
|
||||||
_compose_with_retry("up", "--build", "-d", compose_file=compose_path)
|
# ``--always-recreate-deps`` keeps service containers' netns shares
|
||||||
|
# fresh: every decky service joins its base's netns via
|
||||||
|
# ``network_mode: container:<base>``, and that share is bound at
|
||||||
|
# service start time. If a base is recreated (e.g. when ``ports:``
|
||||||
|
# changes after toggling ``forwards_l3``) but compose decides the
|
||||||
|
# services are unchanged, the services keep a stale netns FD
|
||||||
|
# pointing at the destroyed base — they end up in an empty
|
||||||
|
# namespace with only ``lo``, and external traffic hits a closed
|
||||||
|
# port on the live base. Forcing dependents to recreate alongside
|
||||||
|
# the base is the cheapest way to make this race impossible.
|
||||||
|
_compose_with_retry(
|
||||||
|
"up", "--build", "-d", "--always-recreate-deps",
|
||||||
|
compose_file=compose_path,
|
||||||
|
)
|
||||||
|
|
||||||
await asyncio.to_thread(_materialise)
|
await asyncio.to_thread(_materialise)
|
||||||
|
|
||||||
|
|||||||
0
tests/agent/__init__.py
Normal file
0
tests/agent/__init__.py
Normal file
79
tests/agent/test_topology_apply_recreate_deps.py
Normal file
79
tests/agent/test_topology_apply_recreate_deps.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
"""apply() must pass --always-recreate-deps to docker compose up.
|
||||||
|
|
||||||
|
Regression guard for the stale-netns-share bug: deckie service containers
|
||||||
|
join the base via ``network_mode: container:<base>`` and Docker binds the
|
||||||
|
share at service start. When compose recreates the base (e.g. ``ports:``
|
||||||
|
changed after toggling ``forwards_l3``) but decides services are
|
||||||
|
unchanged, the services keep a stale FD into the destroyed netns and
|
||||||
|
end up with only ``lo``. Forcing dependent recreation removes the race.
|
||||||
|
|
||||||
|
Found on first VPS deploy 2026-04-28: external SSH to the dmz-gateway
|
||||||
|
RST'd because the service's netns inode (37090) didn't match the base's
|
||||||
|
(41477). After ``compose down`` + ``up`` the inodes matched and traffic
|
||||||
|
flowed; this test guarantees agent re-applies do the same in one shot.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import pathlib
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from decnet.agent import topology_ops as _ops
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeStore:
|
||||||
|
def current(self) -> None:
|
||||||
|
return None
|
||||||
|
|
||||||
|
def put(self, *a: Any, **kw: Any) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
def clear(self, *a: Any, **kw: Any) -> None:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def test_apply_passes_always_recreate_deps_to_compose(
|
||||||
|
monkeypatch: pytest.MonkeyPatch, tmp_path: pathlib.Path,
|
||||||
|
) -> None:
|
||||||
|
captured: list[tuple[str, ...]] = []
|
||||||
|
|
||||||
|
def _fake_compose(*args: str, compose_file: pathlib.Path, **kw: Any) -> None:
|
||||||
|
captured.append(args)
|
||||||
|
|
||||||
|
monkeypatch.setattr(_ops, "_compose_with_retry", _fake_compose)
|
||||||
|
monkeypatch.setattr(_ops, "create_bridge_network", lambda *a, **k: None)
|
||||||
|
monkeypatch.setattr(_ops, "write_topology_compose", lambda *a, **k: None)
|
||||||
|
monkeypatch.setattr(_ops, "_validate_topology", lambda *_: [])
|
||||||
|
monkeypatch.setattr(_ops, "_validation_errors", lambda _: [])
|
||||||
|
monkeypatch.setattr(_ops, "canonical_hash", lambda *_: "deadbeef")
|
||||||
|
|
||||||
|
class _StubDockerClient:
|
||||||
|
@staticmethod
|
||||||
|
def from_env() -> "_StubDockerClient":
|
||||||
|
return _StubDockerClient()
|
||||||
|
|
||||||
|
monkeypatch.setattr(_ops, "docker", _StubDockerClient())
|
||||||
|
|
||||||
|
hydrated = {
|
||||||
|
"topology": {"id": "11111111-2222-3333-4444-555555555555"},
|
||||||
|
"lans": [{"name": "dmz", "subnet": "10.0.0.0/24", "is_dmz": True}],
|
||||||
|
"deckies": [],
|
||||||
|
"edges": [],
|
||||||
|
}
|
||||||
|
|
||||||
|
asyncio.get_event_loop_policy().new_event_loop().run_until_complete(
|
||||||
|
_ops.apply(hydrated, "deadbeef", _FakeStore())
|
||||||
|
)
|
||||||
|
|
||||||
|
assert captured, "compose was never invoked"
|
||||||
|
args = captured[-1]
|
||||||
|
assert "up" in args, f"expected `up` in compose args, got {args}"
|
||||||
|
assert "--always-recreate-deps" in args, (
|
||||||
|
"agent must pass --always-recreate-deps so service containers' "
|
||||||
|
"netns shares stay fresh when their base is recreated. Without "
|
||||||
|
"this flag, services end up with stale FDs into destroyed "
|
||||||
|
"namespaces and external traffic hits closed ports on the live "
|
||||||
|
f"base. Got: {args}"
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user