merge: testing → main (reconcile 2-week divergence)

This commit is contained in:
2026-04-28 18:36:00 -04:00
parent 499836c9e4
commit 862e4dbb31
1235 changed files with 160255 additions and 7996 deletions

View File

@@ -0,0 +1,9 @@
"""RIR delegated-stats provider.
Free, offline, no license: each Regional Internet Registry publishes a
daily plaintext file mapping IPv4 allocations to countries. Together the
five RIR files cover the entire assigned IPv4 space.
Direct imports of :class:`RirProvider` are discouraged — go through
:func:`decnet.geoip.factory.get_provider`.
"""

62
decnet/geoip/rir/fetch.py Normal file
View File

@@ -0,0 +1,62 @@
"""RIR delegated-stats download.
Five public files, ~5 MB total. Pulled over HTTPS with a generic
User-Agent (stealth: never identify as DECNET — a RIR log scraper could
otherwise correlate our egress to a honeypot operator).
"""
from __future__ import annotations
import logging
import shutil
import urllib.request
from pathlib import Path
from typing import Tuple
logger = logging.getLogger("decnet.geoip.rir.fetch")
# (registry_name, url) pairs, one per RIR. ``registry_name`` doubles as
# the on-disk file stem: ``fetch_all`` writes each download to
# ``{registry_name}.txt`` inside its destination directory.
# Extended delegated-stats include the opaque registration ID we don't
# use, but they are what the RIRs recommend consumers pull.
RIR_SOURCES: Tuple[Tuple[str, str], ...] = (
    ("arin", "https://ftp.arin.net/pub/stats/arin/delegated-arin-extended-latest"),
    ("ripe", "https://ftp.ripe.net/pub/stats/ripencc/delegated-ripencc-extended-latest"),
    ("apnic", "https://ftp.apnic.net/stats/apnic/delegated-apnic-extended-latest"),
    ("lacnic", "https://ftp.lacnic.net/pub/stats/lacnic/delegated-lacnic-extended-latest"),
    ("afrinic", "https://ftp.afrinic.net/pub/stats/afrinic/delegated-afrinic-extended-latest"),
)
# Generic UA — no DECNET/honeypot token, sent on every RIR request.
# NOTE(review): this is a hand-picked bland browser-style string; it is
# not what stock urllib sends when no UA is set (that would be
# ``Python-urllib/x.y``) — confirm the bland value is the intent.
_USER_AGENT = "Mozilla/5.0 (compatible; fetch/1.0)"
# Timeout in seconds handed to ``urllib.request.urlopen`` for each file.
_TIMEOUT_S = 60
def fetch_all(dest: Path) -> list[Path]:
    """Download every RIR file into *dest* and return the paths written.

    Each entry of ``RIR_SOURCES`` is fetched to ``{name}.txt.tmp`` and
    only renamed over ``{name}.txt`` once the download finished and the
    temp file is non-empty, so the swap is atomic per file and a failed
    (or empty) download leaves the previous generation intact.

    Failures are best-effort: they are logged and the loop moves on to
    the next registry. Nothing is raised for a per-registry failure.

    Args:
        dest: Directory to write into; created (with parents) if missing.

    Returns:
        The final ``{name}.txt`` paths that were replaced in this run, in
        ``RIR_SOURCES`` order. Registries whose fetch failed are absent.
    """
    dest.mkdir(parents=True, exist_ok=True)
    written: list[Path] = []
    for name, url in RIR_SOURCES:
        target = dest / f"{name}.txt"
        tmp = target.with_suffix(".txt.tmp")
        try:
            _download(url, tmp)
            size = tmp.stat().st_size
            if size == 0:
                # A successful response with an empty body must not
                # clobber a good file — better outdated than empty.
                raise ValueError("empty response body")
            tmp.replace(target)
        except Exception as exc:
            logger.error("geoip.rir: fetch failed for %s (%s): %s", name, url, exc)
            # Drop the partial temp file but keep any stale previous
            # ``target``. A cleanup error must not abort the other
            # registries, so it is logged rather than propagated.
            try:
                tmp.unlink(missing_ok=True)
            except OSError as cleanup_exc:
                logger.warning(
                    "geoip.rir: could not remove temp file %s: %s", tmp, cleanup_exc
                )
        else:
            # Success bookkeeping lives outside the ``try`` so a logging
            # hiccup can never be misreported as a fetch failure.
            written.append(target)
            logger.info("geoip.rir: fetched %s (%d bytes)", name, size)
    return written
def _download(url: str, dest: Path) -> None:
    """Stream the body of *url* into the file at *dest*.

    The request carries the module's generic ``_USER_AGENT`` header and
    uses ``_TIMEOUT_S`` as the ``urlopen`` timeout. *dest* is opened in
    binary write mode, so an existing file at that path is truncated.
    Errors from ``urlopen`` or from file I/O propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    # Nested ``with`` blocks: the response is opened first, then the
    # output file; both are closed on every exit path.
    with urllib.request.urlopen(request, timeout=_TIMEOUT_S) as response:  # nosec B310 — fixed https RIR URLs
        with dest.open("wb") as sink:
            shutil.copyfileobj(response, sink)

70
decnet/geoip/rir/parse.py Normal file
View File

@@ -0,0 +1,70 @@
"""Parser for RIR ``delegated-*-extended`` files.
Line shape (the bits we care about)::
ripencc|DE|ipv4|85.214.0.0|65536|20060814|allocated|<opaque-id>
Fields: registry, country, type (ipv4/ipv6/asn), start, count, date,
status, id. We emit one ``(start_int, end_int_inclusive, country)``
tuple per ``ipv4`` row whose status is ``allocated`` or ``assigned``.
Rows skipped:
* ``ipv6`` and ``asn`` types — IPv6 is out of MVP scope, ASN is a
different table.
* ``summary`` / ``version`` header lines (registry|*|*|*|*|summary).
* Rows with status ``reserved`` / ``available`` — no country assigned.
* Rows with country ``*`` or ``ZZ`` — sentinel for unassigned space.
* Rows whose start address or count does not parse, or whose count is
not a positive integer (the RIR files are usually tidy, but defensive).
"""
from __future__ import annotations
import ipaddress
import logging
from pathlib import Path
from typing import Iterator, Tuple
# One parsed allocation: (start_int, end_int_inclusive, country_code).
Range = Tuple[int, int, str]
logger = logging.getLogger("decnet.geoip.rir.parse")
# ``parse_file`` only emits rows whose status field is one of these.
_VALID_STATUSES = frozenset({"allocated", "assigned"})
# Country-field values treated as "no country"; such rows are skipped.
_SENTINEL_CCS = frozenset({"*", "ZZ", ""})
def parse_file(path: Path) -> Iterator[Range]:
    """Yield ``(start_int, end_int_inclusive, cc)`` for every usable ipv4 row.

    The file is read as UTF-8 with undecodable bytes replaced. A row is
    emitted only when all of the following hold:

    * it is not blank, not a ``#`` comment, and has at least seven
      ``|``-separated fields;
    * its type field is ``ipv4``;
    * its status is in ``_VALID_STATUSES``;
    * its country code, upper-cased, is not in ``_SENTINEL_CCS``;
    * its start parses as an IPv4 address and its count as a positive
      integer;
    * the resulting range ends at or below 255.255.255.255.

    Rows with an unparsable start/count or an overflowing range are
    logged at debug level; every other rejected row is skipped silently.

    Args:
        path: Delegated-stats file to read.

    Yields:
        ``(start_int, end_int_inclusive, cc)`` with ``cc`` upper-cased.
    """
    max_ipv4 = 0xFFFFFFFF  # highest representable IPv4 address as an int
    with path.open("r", encoding="utf-8", errors="replace") as fh:
        for lineno, raw in enumerate(fh, 1):
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split("|")
            if len(parts) < 7:
                continue
            _registry, cc, rtype, start, count, _date, status = parts[:7]
            if rtype != "ipv4":
                continue
            if status not in _VALID_STATUSES:
                continue
            # Normalise case *before* the sentinel check so a lower-case
            # "zz" is filtered too instead of being emitted as "ZZ".
            cc = cc.upper()
            if cc in _SENTINEL_CCS:
                continue
            # The summary header carries type=ipv4 with start == '*'; it
            # is already dropped by the status filter, but keep the guard
            # for defensiveness.
            if start in ("*", ""):
                continue
            try:
                start_int = int(ipaddress.IPv4Address(start))
                n = int(count)
            except ValueError:
                # ipaddress.AddressValueError subclasses ValueError, so
                # this covers both a bad address and a bad count.
                logger.debug("geoip.rir: skipping malformed line %d in %s", lineno, path.name)
                continue
            if n <= 0:
                continue
            end_int = start_int + n - 1
            if end_int > max_ipv4:
                # A count that runs past the end of the IPv4 space is
                # corrupt data; emitting it would poison the index.
                logger.debug("geoip.rir: skipping out-of-range line %d in %s", lineno, path.name)
                continue
            yield (start_int, end_int, cc)

View File

@@ -0,0 +1,74 @@
"""RIR provider — orchestrates fetch + parse into a :class:`Lookup`."""
from __future__ import annotations
import logging
from pathlib import Path
from typing import Sequence
from decnet.geoip.base import Provider
from decnet.geoip.lookup import Lookup
from decnet.geoip.paths import ensure_root
from decnet.geoip.rir.fetch import RIR_SOURCES, fetch_all
from decnet.geoip.rir.parse import parse_file
logger = logging.getLogger("decnet.geoip.rir.provider")
# Pickled lookup cache — skips re-parsing ~5 MB of RIR text on every
# profiler restart. Rebuilt whenever any raw file is newer than the
# cache, see ``_cache_fresh``.
_CACHE_NAME = ".rir_index.pkl"
class RirProvider(Provider):
    """GeoIP provider backed by the RIR delegated-stats files.

    Raw ``{registry}.txt`` files and the cached index both live in the
    directory returned by ``ensure_root()``.
    """

    name = "rir"

    def __init__(self) -> None:
        self._root = ensure_root()

    # ---------- Provider interface ----------

    def refresh(self) -> None:
        """Re-download the raw files and discard the cached index."""
        logger.info("geoip.rir: refreshing delegated-stats files into %s", self._root)
        fetch_all(self._root)
        # The index on disk no longer reflects the raw files; removing
        # it makes the next build_lookup() regenerate it.
        stale_index = self._root / _CACHE_NAME
        if stale_index.exists():
            stale_index.unlink(missing_ok=True)

    def build_lookup(self) -> Lookup:
        """Return a :class:`Lookup`, from cache when fresh, else rebuilt.

        A cache that fails to load is logged and ignored. A rebuilt
        index is saved back to the cache on a best-effort basis: a save
        failure is logged and the in-memory index is still returned.
        """
        index_path = self._root / _CACHE_NAME
        if self._cache_fresh(index_path):
            try:
                cached = Lookup.load(index_path)
                logger.debug("geoip.rir: loaded cached index (%d ranges)", len(cached))
                return cached
            except Exception as exc:
                logger.warning("geoip.rir: cache load failed, rebuilding: %s", exc)

        # Slow path: parse every raw file that is present on disk.
        collected = []
        for raw_file in self.data_paths():
            if raw_file.exists():
                collected.extend(parse_file(raw_file))
        rebuilt = Lookup.from_ranges(collected)

        try:
            rebuilt.save(index_path)
        except Exception as exc:
            logger.warning("geoip.rir: cache save failed: %s", exc)
        logger.info("geoip.rir: built index with %d ranges", len(rebuilt))
        return rebuilt

    def data_paths(self) -> Sequence[Path]:
        """Return the expected raw-file path for every ``RIR_SOURCES`` entry."""
        return [self._root / f"{registry}.txt" for registry, _ in RIR_SOURCES]

    # ---------- internals ----------

    def _cache_fresh(self, cache: Path) -> bool:
        """Return True if *cache* exists and no existing raw file is newer."""
        if not cache.exists():
            return False
        built_at = cache.stat().st_mtime
        return not any(
            raw_file.exists() and raw_file.stat().st_mtime > built_at
            for raw_file in self.data_paths()
        )