fix(engine): per-scope docker compose project names
Every compose invocation used -p decnet, so the fleet and every topology lived in a single docker compose project. --remove-orphans, which runs during fleet pre-up cleanup and on every topology teardown / rollback, therefore swept every container in the project that was not listed in the current compose file — wiping sibling topologies and the flat fleet along with the intended target. Parameterize the project name on _compose / _compose_with_retry / _compose_ps (default FLEET_COMPOSE_PROJECT="decnet"). Add _topology_compose_project, which returns decnet-topo-<id8>, and pass it through every topology compose call site (master deploy_topology + rollback + post-deploy ps, master teardown_topology, agent apply, agent teardown, all four live service mutations on topology deckies). Fleet calls keep the default and are unaffected. Migration: live containers created before this fix remain in the shared "decnet" project and are not reachable by the new topology code paths until a one-time manual cleanup is done.
This commit is contained in:
@@ -50,6 +50,7 @@ from decnet.topology.validate import (
|
||||
log = get_logger("engine")
|
||||
console = Console()
|
||||
COMPOSE_FILE = Path("decnet-compose.yml")
|
||||
FLEET_COMPOSE_PROJECT = "decnet"
|
||||
_CANONICAL_LOGGING = Path(__file__).parent.parent / "templates" / "syslog_bridge.py"
|
||||
_CANONICAL_INSTANCE_SEED = Path(__file__).parent.parent / "templates" / "instance_seed.py"
|
||||
_CANONICAL_SESSREC_DIR = Path(__file__).parent.parent / "templates" / "_shared" / "sessrec"
|
||||
@@ -222,7 +223,9 @@ def _sync_caddy_modules(config: DecnetConfig) -> None:
|
||||
_chown_tree(dest_child, src_dir)
|
||||
|
||||
|
||||
def _compose_ps(compose_file: Path) -> list[dict[str, object]]:
|
||||
def _compose_ps(
|
||||
compose_file: Path, project: str = FLEET_COMPOSE_PROJECT,
|
||||
) -> list[dict[str, object]]:
|
||||
"""Return ``docker compose ps`` rows for *compose_file* as parsed JSON.
|
||||
|
||||
Used for post-deploy verification: ``compose up -d`` returns 0 the
|
||||
@@ -232,7 +235,7 @@ def _compose_ps(compose_file: Path) -> list[dict[str, object]]:
|
||||
parse failure — caller treats that as 'unverifiable, don't gate').
|
||||
"""
|
||||
cmd = [
|
||||
"docker", "compose", "-p", "decnet", "-f", str(compose_file),
|
||||
"docker", "compose", "-p", project, "-f", str(compose_file),
|
||||
"ps", "--all", "--format", "json",
|
||||
]
|
||||
try:
|
||||
@@ -264,13 +267,21 @@ def _compose_ps(compose_file: Path) -> list[dict[str, object]]:
|
||||
return rows
|
||||
|
||||
|
||||
def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None:
|
||||
def _compose(
|
||||
*args: str,
|
||||
compose_file: Path = COMPOSE_FILE,
|
||||
env: dict | None = None,
|
||||
project: str = FLEET_COMPOSE_PROJECT,
|
||||
) -> None:
|
||||
import os
|
||||
# -p decnet pins the compose project name. Without it, docker compose
|
||||
# -p pins the compose project name. Without it, docker compose
|
||||
# derives the project from basename($PWD); when a daemon (systemd) runs
|
||||
# with WorkingDirectory=/ that basename is empty and compose aborts with
|
||||
# "project name must not be empty".
|
||||
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
|
||||
# "project name must not be empty". Each scope (fleet, individual
|
||||
# topology) gets its OWN project so `--remove-orphans` only sweeps
|
||||
# containers in that scope — without this, a fleet redeploy or a
|
||||
# topology teardown blasts every other scope's containers as orphans.
|
||||
cmd = ["docker", "compose", "-p", project, "-f", str(compose_file), *args]
|
||||
merged = {**os.environ, **(env or {})}
|
||||
result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603
|
||||
if result.stdout:
|
||||
@@ -424,15 +435,13 @@ def _compose_with_retry(
|
||||
retries: int = 3,
|
||||
delay: float = 5.0,
|
||||
env: dict | None = None,
|
||||
project: str = FLEET_COMPOSE_PROJECT,
|
||||
) -> None:
|
||||
"""Run a docker compose command, retrying on transient failures."""
|
||||
import os
|
||||
last_exc: subprocess.CalledProcessError | None = None
|
||||
# -p decnet pins the compose project name. Without it, docker compose
|
||||
# derives the project from basename($PWD); when a daemon (systemd) runs
|
||||
# with WorkingDirectory=/ that basename is empty and compose aborts with
|
||||
# "project name must not be empty".
|
||||
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
|
||||
# See ``_compose`` for the project-name rationale.
|
||||
cmd = ["docker", "compose", "-p", project, "-f", str(compose_file), *args]
|
||||
merged = {**os.environ, **(env or {})}
|
||||
|
||||
# Preflight: if buildx already looks wedged before the first attempt,
|
||||
@@ -825,6 +834,18 @@ def _topology_compose_path(topology_id: str) -> Path:
|
||||
return Path(f"decnet-topology-{topology_id[:8]}-compose.yml")
|
||||
|
||||
|
||||
def _topology_compose_project(topology_id: str) -> str:
|
||||
"""Per-topology docker compose project name.
|
||||
|
||||
Each topology is its OWN compose project so ``--remove-orphans``
|
||||
during teardown or rollback sweeps only that topology's containers,
|
||||
not the flat fleet or sibling topologies. Sharing a project (the
|
||||
historical default ``decnet``) meant any teardown blasted every
|
||||
other scope's containers.
|
||||
"""
|
||||
return f"decnet-topo-{topology_id[:8]}"
|
||||
|
||||
|
||||
async def _resolve_swarm_host(repo, host_uuid: str) -> dict:
|
||||
host = await repo.get_swarm_host_by_uuid(host_uuid)
|
||||
if host is None:
|
||||
@@ -967,6 +988,7 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
|
||||
lans = hydrated["lans"]
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
compose_project = _topology_compose_project(topology_id)
|
||||
|
||||
if dry_run:
|
||||
# Plan-only: don't touch repo status or Docker — write the compose
|
||||
@@ -1017,6 +1039,7 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose_with_retry(
|
||||
"up", "--build", "-d", compose_file=compose_path,
|
||||
project=compose_project,
|
||||
),
|
||||
)
|
||||
compose_started = True
|
||||
@@ -1029,7 +1052,8 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
if compose_started or compose_path.exists():
|
||||
try:
|
||||
_compose(
|
||||
"down", "--remove-orphans", compose_file=compose_path
|
||||
"down", "--remove-orphans", compose_file=compose_path,
|
||||
project=compose_project,
|
||||
)
|
||||
except Exception as rb_exc: # pragma: no cover
|
||||
log.warning(
|
||||
@@ -1063,7 +1087,7 @@ async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> N
|
||||
# container isn't running — operators see real state instead of an
|
||||
# optimistic flag.
|
||||
ps_rows = await anyio.to_thread.run_sync(
|
||||
lambda: _compose_ps(compose_path),
|
||||
lambda: _compose_ps(compose_path, project=compose_project),
|
||||
)
|
||||
bad: list[str] = []
|
||||
# Build the per-decky state map. The base container's compose
|
||||
@@ -1155,12 +1179,14 @@ async def teardown_topology(repo, topology_id: str) -> None:
|
||||
|
||||
client = docker.from_env()
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
compose_project = _topology_compose_project(topology_id)
|
||||
|
||||
if compose_path.exists():
|
||||
try:
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose(
|
||||
"down", "--remove-orphans", compose_file=compose_path,
|
||||
project=compose_project,
|
||||
),
|
||||
)
|
||||
except subprocess.CalledProcessError as exc:
|
||||
|
||||
@@ -40,7 +40,12 @@ from decnet.web.db.repository import BaseRepository
|
||||
# pattern in decnet.canary.planter for the same reason.
|
||||
|
||||
|
||||
def _compose(*args: str, compose_file: Optional[Path] = None, env=None) -> None:
|
||||
def _compose(
|
||||
*args: str,
|
||||
compose_file: Optional[Path] = None,
|
||||
env=None,
|
||||
project: Optional[str] = None,
|
||||
) -> None:
|
||||
"""Indirection so tests can ``monkeypatch.setattr(services_live, '_compose', ...)``.
|
||||
|
||||
Real implementation lives in :mod:`decnet.engine.deployer`; we
|
||||
@@ -48,10 +53,12 @@ def _compose(*args: str, compose_file: Optional[Path] = None, env=None) -> None:
|
||||
clean (see module docstring above).
|
||||
"""
|
||||
from decnet.engine.deployer import _compose as _real_compose
|
||||
if compose_file is None:
|
||||
_real_compose(*args, env=env)
|
||||
else:
|
||||
_real_compose(*args, compose_file=compose_file, env=env)
|
||||
kwargs: dict[str, Any] = {"env": env}
|
||||
if compose_file is not None:
|
||||
kwargs["compose_file"] = compose_file
|
||||
if project is not None:
|
||||
kwargs["project"] = project
|
||||
_real_compose(*args, **kwargs)
|
||||
|
||||
|
||||
def _topology_compose_path(topology_id: str) -> Path:
|
||||
@@ -59,6 +66,11 @@ def _topology_compose_path(topology_id: str) -> Path:
|
||||
return _real_path(topology_id)
|
||||
|
||||
|
||||
def _topology_compose_project(topology_id: str) -> str:
    """Return the compose project name for *topology_id*.

    Thin pass-through to
    :func:`decnet.engine.deployer._topology_compose_project`. The
    deployer is imported at call time rather than at module import.
    """
    from decnet.engine.deployer import (
        _topology_compose_project as _deployer_project_name,
    )

    project_name = _deployer_project_name(topology_id)
    return project_name
|
||||
|
||||
|
||||
def _write_topology_compose(hydrated, path: Path) -> Path:
    """Write the compose file for *hydrated* to *path*.

    Thin pass-through to
    :func:`decnet.topology.compose.write_topology_compose`, imported at
    call time. Returns whatever that function returns.
    """
    from decnet.topology.compose import (
        write_topology_compose as _render_compose,
    )

    written = _render_compose(hydrated, path)
    return written
|
||||
@@ -262,12 +274,13 @@ async def _add_topology_service(
|
||||
await _resync_agent_topology(repo, topology_id)
|
||||
else:
|
||||
target = f"{decky_name}-{service_name}"
|
||||
project = _topology_compose_project(topology_id)
|
||||
# Run compose in a worker thread so the API event loop stays
|
||||
# responsive — same pattern as engine/deployer.deploy_topology.
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose(
|
||||
"up", "-d", "--no-deps", "--build", target,
|
||||
compose_file=compose_path,
|
||||
compose_file=compose_path, project=project,
|
||||
),
|
||||
)
|
||||
return services
|
||||
@@ -295,16 +308,21 @@ async def _remove_topology_service(
|
||||
services = [s for s in services if s != service_name]
|
||||
target = f"{decky_name}-{service_name}"
|
||||
compose_path = _topology_compose_path(topology_id)
|
||||
project = _topology_compose_project(topology_id)
|
||||
agent_pinned = await _topology_is_agent_pinned(repo, topology_id)
|
||||
if not agent_pinned:
|
||||
# Stop + rm before persisting + re-rendering so a half-completed
|
||||
# mutation leaves the operator a clear state to retry from
|
||||
# (container still running; DB still says service is on).
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose("stop", target, compose_file=compose_path),
|
||||
lambda: _compose(
|
||||
"stop", target, compose_file=compose_path, project=project,
|
||||
),
|
||||
)
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose("rm", "-f", target, compose_file=compose_path),
|
||||
lambda: _compose(
|
||||
"rm", "-f", target, compose_file=compose_path, project=project,
|
||||
),
|
||||
)
|
||||
await repo.update_topology_decky(decky["uuid"], {"services": services})
|
||||
await _rerender_topology_compose(repo, topology_id)
|
||||
@@ -568,10 +586,11 @@ async def _update_topology_service_config(
|
||||
await _resync_agent_topology(repo, topology_id)
|
||||
else:
|
||||
target = f"{decky_name}-{service_name}"
|
||||
project = _topology_compose_project(topology_id)
|
||||
await anyio.to_thread.run_sync(
|
||||
lambda: _compose(
|
||||
"up", "-d", "--no-deps", "--force-recreate", "--build", target,
|
||||
compose_file=compose_path,
|
||||
compose_file=compose_path, project=project,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user