fix(mutator): materialise live LAN add/remove on docker, not just the DB
apply_add_lan and apply_remove_lan were DB-only — they wrote/deleted the topology_lans row but never created or destroyed the docker bridge network. Adding a LAN to a deployed topology silently did nothing on the substrate side; any decky later attached to it had nowhere to bind. Both ops now call a shared _materialise_lan_change helper after the DB write. When the topology is active/degraded and not pinned to a swarm agent, the helper: * creates / removes the docker bridge network (internal=True for non-DMZ LANs, mirroring engine/deployer.deploy_topology), * re-renders the per-topology compose file so future redeploys reflect the change. Failures are logged, not re-raised — the DB row stays as source of truth so an operator can retry without leaking inconsistent state. Agent-pinned topologies are skipped; the next agent push reconciles. apply_add_decky / apply_attach_decky have the same gap and are not fixed here — multi-homing a running container needs careful recreate-vs-network-connect handling and is its own commit. Without those, dropping a decky into a freshly-added LAN still won't spawn a container; only the LAN itself is now live.
This commit is contained in:
@@ -98,6 +98,94 @@ def _decky_by_name(hydrated: dict[str, Any], name: str) -> Optional[dict]:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _materialise_lan_change(
|
||||||
|
repo: Any,
|
||||||
|
topology_id: str,
|
||||||
|
*,
|
||||||
|
created: Optional[tuple[str, str, bool]] = None,
|
||||||
|
removed: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
"""Create or remove the docker bridge for a live LAN op + re-render compose.
|
||||||
|
|
||||||
|
Called from ``apply_add_lan`` / ``apply_remove_lan`` after the DB
|
||||||
|
write lands. Skips when:
|
||||||
|
|
||||||
|
* the topology is not active/degraded (a pending topology gets its
|
||||||
|
networks created at deploy time),
|
||||||
|
* the topology is pinned to a swarm agent (cross-host materialisation
|
||||||
|
isn't implemented; the agent's apply_topology RPC re-renders the
|
||||||
|
whole compose at next push),
|
||||||
|
* the docker SDK / networking primitive raises (logged, not
|
||||||
|
re-raised — the DB row is the source of truth).
|
||||||
|
"""
|
||||||
|
topology = await repo.get_topology(topology_id)
|
||||||
|
if topology is None:
|
||||||
|
return
|
||||||
|
status = topology.get("status")
|
||||||
|
if status not in ("active", "degraded"):
|
||||||
|
return
|
||||||
|
if topology.get("target_host_uuid"):
|
||||||
|
_log.info(
|
||||||
|
"live LAN op skipped (agent-pinned topology=%s); next agent push will reconcile",
|
||||||
|
topology_id,
|
||||||
|
)
|
||||||
|
return
|
||||||
|
|
||||||
|
# Lazy imports — these pull in docker.py / network.py which both
|
||||||
|
# require the docker SDK; keeping them out of module-import keeps
|
||||||
|
# the mutator usable in test environments that stub docker.
|
||||||
|
import docker
|
||||||
|
from decnet.engine.deployer import _topology_compose_path
|
||||||
|
from decnet.network import create_bridge_network, remove_bridge_network
|
||||||
|
from decnet.topology.compose import _network_name, write_topology_compose
|
||||||
|
|
||||||
|
client = docker.from_env()
|
||||||
|
try:
|
||||||
|
if created is not None:
|
||||||
|
name, subnet, is_dmz = created
|
||||||
|
net_name = _network_name(topology_id, name)
|
||||||
|
try:
|
||||||
|
create_bridge_network(
|
||||||
|
client, net_name, subnet, internal=not is_dmz,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
_log.error(
|
||||||
|
"live add_lan: bridge create failed topology=%s lan=%s subnet=%s: %s",
|
||||||
|
topology_id, name, subnet, exc,
|
||||||
|
)
|
||||||
|
# Don't re-raise — the DB row is the source of truth.
|
||||||
|
# Operator can retry by removing + re-adding the LAN.
|
||||||
|
if removed is not None:
|
||||||
|
net_name = _network_name(topology_id, removed)
|
||||||
|
try:
|
||||||
|
remove_bridge_network(client, net_name)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
_log.warning(
|
||||||
|
"live remove_lan: bridge remove failed topology=%s lan=%s: %s",
|
||||||
|
topology_id, removed, exc,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Re-render compose so the file on disk matches the DB. Even
|
||||||
|
# when the bridge create above failed, a future redeploy will
|
||||||
|
# try to bring the network back from the compose definition.
|
||||||
|
hydrated = await hydrate(repo, topology_id)
|
||||||
|
if hydrated is not None:
|
||||||
|
try:
|
||||||
|
write_topology_compose(
|
||||||
|
hydrated, _topology_compose_path(topology_id),
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
_log.warning(
|
||||||
|
"live LAN op: compose re-render failed topology=%s: %s",
|
||||||
|
topology_id, exc,
|
||||||
|
)
|
||||||
|
except Exception as exc: # noqa: BLE001 — outer net for any docker SDK failure
|
||||||
|
_log.error(
|
||||||
|
"live LAN materialisation crashed topology=%s: %s",
|
||||||
|
topology_id, exc,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------- ops
|
# ------------------------------------------------------------------- ops
|
||||||
|
|
||||||
|
|
||||||
@@ -131,6 +219,16 @@ async def apply_add_lan(
|
|||||||
"y": payload.get("y"),
|
"y": payload.get("y"),
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Live materialisation: when the topology is active/degraded, create
|
||||||
|
# the docker bridge network now and re-render the per-topology
|
||||||
|
# compose file so subsequent ``apply_add_decky`` writes a coherent
|
||||||
|
# services map. Pending topologies skip this — the next deploy
|
||||||
|
# creates everything from scratch. Agent-pinned topologies also
|
||||||
|
# skip; live editing on agents is its own routing problem.
|
||||||
|
await _materialise_lan_change(
|
||||||
|
repo, topology_id, created=(name, subnet, is_dmz),
|
||||||
|
)
|
||||||
await _assert_valid_after(repo, topology_id)
|
await _assert_valid_after(repo, topology_id)
|
||||||
|
|
||||||
|
|
||||||
@@ -150,7 +248,13 @@ async def apply_remove_lan(
|
|||||||
f"LAN {lan['name']!r} is the home LAN of decky "
|
f"LAN {lan['name']!r} is the home LAN of decky "
|
||||||
f"{d['decky_config']['name']!r}; remove the decky first"
|
f"{d['decky_config']['name']!r}; remove the decky first"
|
||||||
)
|
)
|
||||||
|
lan_name = lan["name"]
|
||||||
await repo.delete_lan(lan["id"])
|
await repo.delete_lan(lan["id"])
|
||||||
|
|
||||||
|
# Live materialisation symmetric to apply_add_lan: tear down the
|
||||||
|
# docker bridge and re-render compose so a future redeploy doesn't
|
||||||
|
# try to wire deckies into a network that no longer exists.
|
||||||
|
await _materialise_lan_change(repo, topology_id, removed=lan_name)
|
||||||
await _assert_valid_after(repo, topology_id)
|
await _assert_valid_after(repo, topology_id)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user