From 28327a9b4e329a9b60679edde733fe06e65a8c35 Mon Sep 17 00:00:00 2001 From: anti Date: Sat, 30 May 2026 17:26:23 -0400 Subject: [PATCH] fix(swarm): ship update tarball from an explicit include-list, never secrets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tar_working_tree walked the whole working tree minus a blocklist that omitted .env.local, *.key, *.pem, *.crt — so the JWT secret, Fernet key, admin password, DB creds and TLS private keys fanned out to every worker on each update push. Invert to an allowlist (DEFAULT_INCLUDES = pyproject.toml + LICENSE + README.md + decnet/), the exact surface 'pip install .' needs; decnet/ carries its own package-data. A defensive _HYGIENE_PATTERNS layer drops secret-/churn-shaped files even if nested under decnet/. extra_excludes can still narrow but can no longer widen past the allowlist. Verified against the live repo: the bundle carries the package + metadata and zero secret/db/log/pyc files, and pip-installs clean from the extracted tree. --- decnet/swarm/tar_tree.py | 103 +++++++++++++++++++------------ tests/swarm/test_tar_tree.py | 113 ++++++++++++++++++++++++----------- 2 files changed, 144 insertions(+), 72 deletions(-) diff --git a/decnet/swarm/tar_tree.py b/decnet/swarm/tar_tree.py index 0323a5ed..193f4fc4 100644 --- a/decnet/swarm/tar_tree.py +++ b/decnet/swarm/tar_tree.py @@ -1,13 +1,20 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""Build a gzipped tarball of the master's working tree for pushing to workers. +"""Build a gzipped tarball of the installable DECNET package for workers. -Always excludes the obvious large / secret / churn paths: ``.venv/``, -``__pycache__/``, ``.git/``, ``wiki-checkout/``, ``*.db*``, ``*.log``. The -caller can supply additional exclude globs. +The tarball is extracted and ``pip install``-ed on each worker, so it ships +*only* what that build needs — enumerated by an INCLUDE allowlist, never a +blocklist. This is the trust-boundary rule: a bundle crossing to another host +enumerates what it carries, so a stray ``.env.local``, TLS private key, SQLite +DB, or the operator's whole working tree can never be swept in by an exclude +list that simply forgot a pattern. -Deliberately does NOT invoke git — the tree is what the operator has on -disk (staged + unstaged + untracked). That's the whole point; the scp -workflow we're replacing also shipped the live tree. +``DEFAULT_INCLUDES`` is the package surface (``decnet/`` + packaging metadata); +``_HYGIENE_PATTERNS`` is a defensive second layer that drops secret-/churn- +shaped files even if one somehow lives under an included directory. Callers may +pass ``extra_excludes`` to narrow further, but cannot add anything outside the +allowlist. + +Deliberately does NOT invoke git — the included dirs are taken from disk as-is. """ from __future__ import annotations @@ -17,22 +24,26 @@ import pathlib import tarfile from typing import Iterable, Optional -DEFAULT_EXCLUDES = ( - ".venv", ".venv/*", - "**/.venv/*", - "__pycache__", "**/__pycache__", "**/__pycache__/*", - ".git", ".git/*", - "wiki-checkout", "wiki-checkout/*", +# The ONLY top-level paths shipped to a worker: the importable package plus the +# metadata `pip install .` needs (setuptools build-meta + license-files=LICENSE). +# decnet/ carries its own package-data (templates/, canary/*). Everything else +# in the working tree — secrets, DBs, logs, the dashboard source, tests, build +# artifacts — is excluded by construction. +DEFAULT_INCLUDES = ( + "pyproject.toml", + "LICENSE", + "README.md", + "decnet", +) + +# Defensive hygiene applied WITHIN an included path: never ship build churn or +# anything secret-shaped, matched on the basename so it catches any nesting. +_HYGIENE_PATTERNS = ( "*.pyc", "*.pyo", - "*.db", "*.db-wal", "*.db-shm", + "*.db", "*.db-wal", "*.db-shm", "*.db-journal", "*.log", - ".pytest_cache", ".pytest_cache/*", - ".mypy_cache", ".mypy_cache/*", - ".tox", ".tox/*", - "*.egg-info", "*.egg-info/*", - "decnet-state.json", - "master.log", "master.json", - "decnet.db*", + ".env", ".env.*", "*.env", + "*.key", "*.pem", "*.crt", "*.p12", "*.pfx", ) @@ -41,39 +52,57 @@ def _is_excluded(rel: str, patterns: Iterable[str]) -> bool: for pat in patterns: if fnmatch.fnmatch(rel, pat): return True - # Also match the pattern against every leading subpath — this is - # what catches nested `.venv/...` without forcing callers to spell - # out every `**/` glob. + # Also match the pattern against every leading subpath so a caller can + # exclude a whole subtree without spelling out every `**/` glob. for i in range(1, len(parts) + 1): if fnmatch.fnmatch("/".join(parts[:i]), pat): return True return False +def _hygiene_skip(rel: str) -> bool: + """True for build-churn / secret-shaped files anywhere in the tree.""" + p = pathlib.PurePosixPath(rel) + if "__pycache__" in p.parts: + return True + return any(fnmatch.fnmatch(p.name, pat) for pat in _HYGIENE_PATTERNS) + + def tar_working_tree( root: pathlib.Path, extra_excludes: Optional[Iterable[str]] = None, + includes: Optional[Iterable[str]] = None, ) -> bytes: - """Return the gzipped tarball bytes of ``root``. + """Return the gzipped tarball of the installable package under ``root``. - Entries are added with paths relative to ``root`` (no leading ``/``, - no ``..``). The updater rejects unsafe paths on the receiving side. + Only paths in ``includes`` (default :data:`DEFAULT_INCLUDES`) are walked; + ``extra_excludes`` narrows further but can never widen the set. Entries are + added with paths relative to ``root`` (no leading ``/``, no ``..``). The + updater rejects unsafe paths on the receiving side. """ - patterns = list(DEFAULT_EXCLUDES) + list(extra_excludes or ()) + include_roots = list(includes) if includes is not None else list(DEFAULT_INCLUDES) + extra = list(extra_excludes or ()) buf = io.BytesIO() + def _admit(path: pathlib.Path) -> None: + rel = path.relative_to(root).as_posix() + if _hygiene_skip(rel) or _is_excluded(rel, extra): + return + tar.add(path, arcname=rel, recursive=False) + with tarfile.open(fileobj=buf, mode="w:gz") as tar: - for path in sorted(root.rglob("*")): - rel = path.relative_to(root).as_posix() - if _is_excluded(rel, patterns): + for entry in include_roots: + base = root / entry + if not base.exists() or base.is_symlink(): continue - if path.is_symlink(): - # Symlinks inside a repo tree are rare and often break - # portability; skip them rather than ship dangling links. + if base.is_file(): + _admit(base) continue - if path.is_dir(): - continue - tar.add(path, arcname=rel, recursive=False) + for path in sorted(base.rglob("*")): + # Skip symlinks (dangling/portability) and dirs (added implicitly). + if path.is_symlink() or path.is_dir(): + continue + _admit(path) return buf.getvalue() diff --git a/tests/swarm/test_tar_tree.py b/tests/swarm/test_tar_tree.py index 2db8e588..7a1a1011 100644 --- a/tests/swarm/test_tar_tree.py +++ b/tests/swarm/test_tar_tree.py @@ -1,5 +1,5 @@ # SPDX-License-Identifier: AGPL-3.0-or-later -"""tar_working_tree: exclude filter, tarball validity, git SHA detection.""" +"""tar_working_tree: include allowlist, secret exclusion, tarball validity, git SHA.""" from __future__ import annotations import io @@ -14,49 +14,92 @@ def _tree_names(data: bytes) -> set[str]: return {m.name for m in tar.getmembers()} -def test_tar_excludes_default_patterns(tmp_path: pathlib.Path) -> None: - (tmp_path / "decnet").mkdir() - (tmp_path / "decnet" / "keep.py").write_text("x = 1") - (tmp_path / ".venv").mkdir() - (tmp_path / ".venv" / "pyvenv.cfg").write_text("junk") - (tmp_path / ".git").mkdir() - (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/main\n") - (tmp_path / "decnet" / "__pycache__").mkdir() - (tmp_path / "decnet" / "__pycache__" / "keep.cpython-311.pyc").write_text("bytecode") - (tmp_path / "wiki-checkout").mkdir() - (tmp_path / "wiki-checkout" / "Home.md").write_text("# wiki") - (tmp_path / "run.db").write_text("sqlite") - (tmp_path / "master.log").write_text("log") - - data = tar_working_tree(tmp_path) - names = _tree_names(data) - assert "decnet/keep.py" in names - assert all(".venv" not in n for n in names) - assert all(".git" not in n for n in names) - assert all("__pycache__" not in n for n in names) - assert all("wiki-checkout" not in n for n in names) - assert "run.db" not in names - assert "master.log" not in names +def _seed_tree(root: pathlib.Path) -> None: + """A realistic master working tree: package + metadata + a pile of junk + and secrets that must NOT ship.""" + (root / "decnet").mkdir() + (root / "decnet" / "__init__.py").write_text("") + (root / "decnet" / "agent.py").write_text("x = 1") + (root / "decnet" / "templates").mkdir() + (root / "decnet" / "templates" / "base.j2").write_text("data") + (root / "decnet" / "__pycache__").mkdir() + (root / "decnet" / "__pycache__" / "agent.cpython-311.pyc").write_text("bytecode") + (root / "pyproject.toml").write_text("[project]\nname='decnet'\n") + (root / "LICENSE").write_text("AGPL") + (root / "README.md").write_text("# decnet") + # ---- secrets / junk that the OLD exclude-list would have leaked ---- + (root / ".env.local").write_text("DECNET_JWT_SECRET=topsecret") + (root / ".env").write_text("X=Y") + (root / "tls.key").write_text("-----BEGIN PRIVATE KEY-----") + (root / "ca.pem").write_text("-----BEGIN CERTIFICATE-----") + (root / "decnet.db").write_text("sqlite") + (root / "master.log").write_text("log") + (root / "decnet_web").mkdir() # dashboard source — not a package + (root / "decnet_web" / "app.tsx").write_text("ui") + (root / "tests").mkdir() + (root / "tests" / "test_x.py").write_text("assert True") -def test_tar_accepts_extra_excludes(tmp_path: pathlib.Path) -> None: - (tmp_path / "a.py").write_text("x") - (tmp_path / "secret.env").write_text("TOKEN=abc") - data = tar_working_tree(tmp_path, extra_excludes=["secret.env"]) - names = _tree_names(data) - assert "a.py" in names - assert "secret.env" not in names +def test_tar_ships_only_the_package_and_metadata(tmp_path: pathlib.Path) -> None: + _seed_tree(tmp_path) + names = _tree_names(tar_working_tree(tmp_path)) + assert "decnet/agent.py" in names + assert "decnet/__init__.py" in names + assert "decnet/templates/base.j2" in names # package-data ships + assert "pyproject.toml" in names + assert "LICENSE" in names + assert "README.md" in names + # Nothing outside the allowlist: + assert not any(n.startswith("decnet_web") for n in names) + assert not any(n.startswith("tests") for n in names) + + +def test_tar_never_ships_secrets_or_db_or_churn(tmp_path: pathlib.Path) -> None: + # The whole point of the include-list: these existed at the root and the + # bundle must not carry a single one of them. + _seed_tree(tmp_path) + names = _tree_names(tar_working_tree(tmp_path)) + for forbidden in (".env.local", ".env", "tls.key", "ca.pem", "decnet.db", "master.log"): + assert forbidden not in names, f"leaked {forbidden}" + assert not any("__pycache__" in n or n.endswith(".pyc") for n in names) + + +def test_secret_nested_under_package_is_still_dropped(tmp_path: pathlib.Path) -> None: + # Defensive hygiene: even a secret-shaped file *inside* decnet/ is excluded. + _seed_tree(tmp_path) + (tmp_path / "decnet" / "worker.key").write_text("oops") + (tmp_path / "decnet" / ".env.prod").write_text("SECRET=1") + names = _tree_names(tar_working_tree(tmp_path)) + assert "decnet/worker.key" not in names + assert "decnet/.env.prod" not in names + assert "decnet/agent.py" in names # real source still present + + +def test_extra_excludes_narrows_within_allowlist(tmp_path: pathlib.Path) -> None: + _seed_tree(tmp_path) + names = _tree_names(tar_working_tree(tmp_path, extra_excludes=["decnet/agent.py"])) + assert "decnet/agent.py" not in names + assert "decnet/__init__.py" in names + + +def test_extra_excludes_cannot_widen_beyond_allowlist(tmp_path: pathlib.Path) -> None: + # Passing a non-allowlisted include via extra_excludes is meaningless — + # excludes can only remove, never add. decnet_web stays out. + _seed_tree(tmp_path) + names = _tree_names(tar_working_tree(tmp_path, extra_excludes=[])) + assert not any(n.startswith("decnet_web") for n in names) def test_tar_skips_symlinks(tmp_path: pathlib.Path) -> None: - (tmp_path / "real.txt").write_text("hi") + (tmp_path / "decnet").mkdir() + (tmp_path / "decnet" / "real.py").write_text("hi") try: - (tmp_path / "link.txt").symlink_to(tmp_path / "real.txt") + (tmp_path / "decnet" / "link.py").symlink_to(tmp_path / "decnet" / "real.py") except (OSError, NotImplementedError): return # platform doesn't support symlinks — skip names = _tree_names(tar_working_tree(tmp_path)) - assert "real.txt" in names - assert "link.txt" not in names + assert "decnet/real.py" in names + assert "decnet/link.py" not in names def test_detect_git_sha_from_ref(tmp_path: pathlib.Path) -> None: