DECNET/tools/add_spdx_headers.py

#!/usr/bin/env python3
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Prepend SPDX-License-Identifier headers to every source file in DECNET.

One-shot tool. Safe to re-run (idempotent: skips files that already have SPDX).

Rules:
  - .py / .sh         -> '# SPDX-License-Identifier: AGPL-3.0-or-later'
  - .ts / .tsx / .js / .jsx
                      -> '// SPDX-License-Identifier: AGPL-3.0-or-later'
  - .css              -> '/* SPDX-License-Identifier: AGPL-3.0-or-later */'

Shebang preservation:
  - If line 1 starts with '#!', the SPDX header is inserted on line 2.
  - For .py, if a coding declaration (PEP 263) follows the shebang or sits on
    line 1/2, the SPDX header is inserted AFTER it.

Skips: .venv, .venv_test, .311, .git, node_modules, __pycache__, .mypy_cache,
       .pytest_cache, .benchmarks, site-packages, dist, build, .next, artifacts,
       bait, decnet.egg-info, wiki-checkout, and any file matching --exclude.

Idempotency: a file containing 'SPDX-License-Identifier' anywhere in its first
12 lines is left untouched.
"""
from __future__ import annotations

import argparse
import re
import sys
from pathlib import Path

SPDX_ID = "SPDX-License-Identifier: AGPL-3.0-or-later"

# Complete header line for each supported file suffix, grouped by the comment
# syntax used: '#' line comments, '//' line comments, and CSS block comments.
COMMENT_BY_EXT: dict[str, str] = {
    **{suffix: f"# {SPDX_ID}" for suffix in (".py", ".sh")},
    **{suffix: f"// {SPDX_ID}" for suffix in (".ts", ".tsx", ".js", ".jsx")},
    ".css": f"/* {SPDX_ID} */",
}

# Path component names that exclude a file from processing: a file is skipped
# when any component of its root-relative path is in this set (see is_skipped).
SKIP_DIRS = {
    ".311", ".benchmarks", ".git", ".mypy_cache", ".next", ".pytest_cache",
    ".venv", ".venv_test",
    "__pycache__", "artifacts", "bait", "build", "decnet.egg-info", "dist",
    "node_modules", "site-packages", "wiki-checkout",
}

# Encoding-declaration pattern (PEP 263), applied to raw byte lines. It is not
# anchored, so it matches "coding:" / "coding=" followed by a name anywhere in
# a line, not only inside a '#' comment.
CODING_RE = re.compile(rb"coding[=:]\s*([-\w.]+)")


def is_skipped(path: Path, root: Path) -> bool:
    """Return True when *path* must not be processed.

    A path is skipped when it does not live under *root*, or when any
    component of its root-relative path is listed in ``SKIP_DIRS``.
    """
    try:
        relative = path.relative_to(root)
    except ValueError:
        # Not located under root at all.
        return True
    for component in relative.parts:
        if component in SKIP_DIRS:
            return True
    return False


def has_spdx(head_lines: list[bytes]) -> bool:
    """Return True if any of the first 12 lines contains an SPDX identifier tag."""
    marker = b"SPDX-License-Identifier"
    return any(marker in candidate for candidate in head_lines[:12])


# PEP 263 pattern in its anchored form: the declaration only counts when it
# sits inside a '#' comment, so code such as ``f(encoding=enc)`` never matches.
_PEP263_RE = re.compile(rb"^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)")


def compute_insert_index(lines: list[bytes], ext: str) -> int:
    """Return the index in *lines* where the SPDX header should be inserted.

    Args:
        lines: The file content split into byte lines (a trailing ``\\r`` on
            each line is tolerated).
        ext: The file suffix, including the leading dot.

    Returns:
        0 for a plain file, 1 when line 1 is a ``#!`` shebang. For ``.py``
        files, the index just past a PEP 263 encoding declaration when one is
        found on line 1 or line 2, so the declaration stays within the first
        two lines after insertion.
    """
    idx = 1 if lines and lines[0].startswith(b"#!") else 0
    if ext != ".py":
        return idx

    # PEP 263: the declaration must be on physical line 1 or 2, and line 2 is
    # only consulted when line 1 is blank or a comment (a shebang is a comment).
    for lineno, line in enumerate(lines[:2]):
        if _PEP263_RE.match(line):
            return lineno + 1
        stripped = line.strip(b" \t\f\r")
        if stripped and not stripped.startswith(b"#"):
            # Line 1 holds real code, so line 2 cannot be a declaration.
            break
    return idx


def process_file(path: Path, *, dry_run: bool) -> str:
    """Insert the SPDX header into *path* unless it is already present.

    Args:
        path: File to process; its suffix selects the comment syntax.
        dry_run: When True, compute the result but do not write the file.

    Returns:
        A status string: ``"skip-ext"`` (unsupported suffix), ``"already"``
        (an SPDX tag is in the first 12 lines), ``"would-add"`` (dry run),
        ``"added"`` (file rewritten), or ``"error:<message>"`` when reading
        or writing the file fails.
    """
    ext = path.suffix
    header = COMMENT_BY_EXT.get(ext)
    if header is None:
        return "skip-ext"

    try:
        raw = path.read_bytes()
    except OSError as e:
        return f"error:{e}"

    # A UTF-8 BOM is only meaningful as the very first bytes of the file.
    # Detach it so the header is never inserted in front of it, and so the
    # shebang / encoding checks see the real start of line 1.
    bom = b"\xef\xbb\xbf" if raw.startswith(b"\xef\xbb\xbf") else b""
    body = raw[len(bom):]

    had_trailing_nl = body.endswith(b"\n")
    lines = body.split(b"\n")
    # split() leaves a trailing empty element for a newline-terminated file
    # (and a lone one for an empty file); it is not a real line.
    if lines[-1] == b"":
        lines.pop()

    if has_spdx(lines):
        return "already"

    header_line = header.encode("utf-8")
    # Follow the file's existing convention: if the first line break is CRLF,
    # the inserted line must carry a '\r' too, otherwise endings get mixed.
    if b"\n" in body and lines[0].endswith(b"\r"):
        header_line += b"\r"
        if not had_trailing_nl and not lines[-1].endswith(b"\r"):
            # The unterminated last line gets a terminator below; make it CRLF.
            lines[-1] += b"\r"

    insert_at = compute_insert_index(lines, ext)
    new_lines = lines[:insert_at] + [header_line] + lines[insert_at:]

    # The output always ends with a newline: it is restored when the file had
    # one, and added when the file's last line was unterminated.
    out = bom + b"\n".join(new_lines) + b"\n"

    if dry_run:
        return "would-add"

    try:
        path.write_bytes(out)
    except OSError as e:
        # Report like a read failure instead of aborting the whole run.
        return f"error:{e}"
    return "added"


def iter_targets(root: Path) -> list[Path]:
    """Collect every regular file under *root* that should receive a header.

    A file qualifies when its suffix is a key of ``COMMENT_BY_EXT`` and
    ``is_skipped`` does not reject it. Results keep ``rglob`` order.
    """
    return [
        candidate
        for candidate in root.rglob("*")
        if candidate.is_file()
        and candidate.suffix in COMMENT_BY_EXT
        and not is_skipped(candidate, root)
    ]


def main() -> int:
    """Command-line entry point: add SPDX headers below a root directory.

    Options:
        --root     Directory to scan (default: current working directory).
        --dry-run  Report what would change without writing any file.
        --exclude  Glob matched against each file's root-relative POSIX path;
                   matching files are skipped. May be given several times.
                   ``*`` also matches ``/``, so ``vendor/*`` excludes the
                   whole ``vendor`` tree.

    Prints a per-status summary with up to three example paths each.

    Returns:
        Process exit code (always 0).
    """
    # Local import: only this entry point needs glob matching.
    from fnmatch import fnmatchcase

    ap = argparse.ArgumentParser()
    ap.add_argument("--root", default=".", help="Repo root (default: cwd)")
    ap.add_argument("--dry-run", action="store_true")
    ap.add_argument(
        "--exclude",
        action="append",
        default=[],
        metavar="GLOB",
        help="Skip files whose root-relative path matches GLOB (repeatable)",
    )
    args = ap.parse_args()

    root = Path(args.root).resolve()
    targets = [
        p
        for p in iter_targets(root)
        if not any(
            fnmatchcase(p.relative_to(root).as_posix(), pattern)
            for pattern in args.exclude
        )
    ]

    counts: dict[str, int] = {}
    samples: dict[str, list[Path]] = {}
    for p in targets:
        status = process_file(p, dry_run=args.dry_run)
        counts[status] = counts.get(status, 0) + 1
        samples.setdefault(status, []).append(p)

    print(f"root: {root}")
    print(f"total candidates: {len(targets)}")
    for status, n in sorted(counts.items()):
        print(f"  {status}: {n}")
        for s in samples[status][:3]:
            print(f"      e.g. {s.relative_to(root)}")
    return 0


if __name__ == "__main__":
    # Run the CLI and hand its return value to the interpreter as exit status.
    exit_code = main()
    sys.exit(exit_code)