merge: testing → main (reconcile 2-week divergence)
This commit is contained in:
9
decnet/asn/iptoasn/__init__.py
Normal file
9
decnet/asn/iptoasn/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
||||
"""iptoasn.com IP→ASN provider.
|
||||
|
||||
Daily-refreshed gzipped TSV dump of the global BGP table, derived from
|
||||
RIPE RIS. Released into the public domain by upstream — no attribution
|
||||
required, no UA mandate, no terms to violate.
|
||||
|
||||
Direct imports of :class:`IptoasnProvider` are discouraged — go through
|
||||
:func:`decnet.asn.factory.get_provider`.
|
||||
"""
|
||||
63
decnet/asn/iptoasn/fetch.py
Normal file
63
decnet/asn/iptoasn/fetch.py
Normal file
@@ -0,0 +1,63 @@
|
||||
"""iptoasn.com bulk dump download.
|
||||
|
||||
One file: ``ip2asn-v4.tsv.gz``, ~5 MB compressed, refreshed daily.
|
||||
Pulled over HTTPS with the same generic UA the geoip RIR fetcher uses
|
||||
(stealth: never identify as DECNET — the threat model is public-data
scraping being correlated with honeypot-operator egress).
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
logger = logging.getLogger("decnet.asn.iptoasn.fetch")

# Source table: one (name, url) pair per file to download. The pair layout
# is the same one geoip.rir.fetch uses, so a test harness can substitute
# one table for the other.
IPTOASN_SOURCES: Tuple[Tuple[str, str], ...] = (
    ("ip2asn-v4", "https://iptoasn.com/data/ip2asn-v4.tsv.gz"),
)

# Deliberately generic User-Agent, identical to the geoip.rir.fetch one.
# iptoasn.com places its data in the public domain and does not ask for an
# identifying UA, so DECNET stays anonymous rather than announcing itself.
_USER_AGENT = "Mozilla/5.0 (compatible; fetch/1.0)"
# Timeout, in seconds, handed to urlopen for each download.
_TIMEOUT_S = 60
|
||||
|
||||
|
||||
def fetch_all(dest: Path) -> list[Path]:
    """Fetch every configured iptoasn dump into *dest*.

    *dest* is created if it does not exist. Each source is first written
    to a ``{name}.tsv.gz.tmp`` sibling and only renamed over
    ``{name}.tsv.gz`` once the download call returns, so a failed fetch
    never clobbers the copy from an earlier run.

    Failures are logged and skipped rather than raised; the returned list
    holds only the paths that were replaced during this call.
    """
    dest.mkdir(parents=True, exist_ok=True)
    fetched: list[Path] = []

    for source_name, source_url in IPTOASN_SOURCES:
        final_path = dest / f"{source_name}.tsv.gz"
        # with_suffix() swaps only the trailing ".gz", which yields
        # "<name>.tsv.gz.tmp" right next to the final file.
        staging_path = final_path.with_suffix(".gz.tmp")
        try:
            _download(source_url, staging_path)
            staging_path.replace(final_path)
            fetched.append(final_path)
            size = final_path.stat().st_size
            logger.info("asn.iptoasn: fetched %s (%d bytes)", source_name, size)
        except Exception as exc:
            logger.error(
                "asn.iptoasn: fetch failed for %s (%s): %s",
                source_name, source_url, exc,
            )
            # Drop the half-written staging file; the previously fetched
            # dump (if any) is left alone — outdated beats empty.
            if staging_path.exists():
                staging_path.unlink(missing_ok=True)

    return fetched
|
||||
|
||||
|
||||
def _download(url: str, dest: Path) -> None:
    """Stream *url* into *dest* and verify the body arrived complete.

    The request carries the generic ``_USER_AGENT`` and uses
    ``_TIMEOUT_S`` as the connection timeout. *dest* is opened for binary
    writing and overwritten.

    ``shutil.copyfileobj`` pulls the body with sized reads, and
    ``http.client`` may report a connection that closes early as a plain
    end-of-stream on such reads instead of raising. To keep a truncated
    dump from being mistaken for a good one, the number of bytes written
    is compared with the ``Content-Length`` header whenever the server
    sent a usable one.

    Raises:
        OSError: the byte count written differs from ``Content-Length``.
        Errors from ``urlopen`` and from file I/O propagate unchanged.
    """
    req = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(req, timeout=_TIMEOUT_S) as resp, dest.open("wb") as fh:  # nosec B310 — fixed https iptoasn URL
        shutil.copyfileobj(resp, fh)
        received = fh.tell()
        advertised = resp.headers.get("Content-Length")

    # No header (e.g. chunked transfer) or a malformed one: nothing to
    # compare against, so accept the body as delivered.
    if advertised is None or not advertised.strip().isdigit():
        return
    expected = int(advertised)
    if received != expected:
        raise OSError(
            f"incomplete download: got {received} of {expected} bytes"
        )
|
||||
78
decnet/asn/iptoasn/parse.py
Normal file
78
decnet/asn/iptoasn/parse.py
Normal file
@@ -0,0 +1,78 @@
|
||||
"""Parser for the iptoasn.com ``ip2asn-v4.tsv`` dump.
|
||||
|
||||
Line shape (gzipped, one row per BGP-announced prefix)::
|
||||
|
||||
1.0.0.0\\t1.0.0.255\\t13335\\tUS\\tCLOUDFLARENET
|
||||
|
||||
Fields: ``range_start``, ``range_end``, ``as_number``, ``country_code``,
|
||||
``as_description``. Both range columns are dotted IPv4 strings (the dump
|
||||
is IPv4-only — there's a separate ``ip2asn-v6.tsv.gz`` we don't pull).
|
||||
|
||||
Rows skipped:
|
||||
|
||||
* ``as_number == 0`` — iptoasn's sentinel for "unannounced" / private
|
||||
/ reserved space. Country may still be present (``"None"`` / two-letter
|
||||
CC) but we don't care: the geoip module owns country, ASN owns BGP.
|
||||
* Rows where either range column won't parse as IPv4.
|
||||
* Rows with fewer than 3 tab-separated columns.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import gzip
|
||||
import ipaddress
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Iterator
|
||||
|
||||
from decnet.asn.lookup import AsnInfo, Range
|
||||
|
||||
# Module-level logger; the parser reports skipped malformed rows through it
# at debug level.
logger = logging.getLogger("decnet.asn.iptoasn.parse")
|
||||
|
||||
|
||||
def parse_file(path: Path) -> Iterator[Range]:
    """Stream the usable rows of an iptoasn dump as range tuples.

    Each yielded item is ``(start_int, end_int_inclusive, AsnInfo)``.
    Files whose suffix is ``.gz`` are read through :mod:`gzip`; any other
    path is read as plain text, which lets tests use small hand-written
    fixtures. Undecodable bytes are replaced rather than raising.

    A row is dropped when it is blank, has fewer than three tab-separated
    columns, has a non-integer or zero AS number, has a range bound that
    is not a valid IPv4 address, or ends before it starts.
    """
    open_fn = gzip.open if path.suffix == ".gz" else open
    with open_fn(path, "rt", encoding="utf-8", errors="replace") as handle:
        for row_no, raw_line in enumerate(handle, start=1):
            record = raw_line.rstrip("\n")
            if not record:
                continue

            cols = record.split("\t")
            if len(cols) < 3:
                continue
            first_ip, last_ip, asn_text = cols[:3]
            # Column 5 carries the AS description. Nothing is quoted
            # upstream, but padding whitespace can occur; fall back to
            # "" when the column is absent.
            description = cols[4].strip() if len(cols) > 4 else ""

            try:
                as_number = int(asn_text)
            except ValueError:
                logger.debug(
                    "asn.iptoasn: skipping malformed asn line %d in %s",
                    row_no, path.name,
                )
                continue
            # AS 0 marks unannounced / reserved space in the dump —
            # nothing worth attaching, so drop it.
            if as_number == 0:
                continue

            try:
                low = int(ipaddress.IPv4Address(first_ip))
                high = int(ipaddress.IPv4Address(last_ip))
            except (ValueError, ipaddress.AddressValueError):
                logger.debug(
                    "asn.iptoasn: skipping malformed addr line %d in %s",
                    row_no, path.name,
                )
                continue

            # Inverted ranges are silently ignored.
            if high >= low:
                yield (low, high, AsnInfo(asn=as_number, name=description))
|
||||
83
decnet/asn/iptoasn/provider.py
Normal file
83
decnet/asn/iptoasn/provider.py
Normal file
@@ -0,0 +1,83 @@
|
||||
"""iptoasn provider — orchestrates fetch + parse into an :class:`AsnLookup`.
|
||||
|
||||
Mirrors :class:`decnet.geoip.rir.provider.RirProvider` exactly: fetch,
|
||||
build a pickled cache, invalidate when raw files are newer than the
|
||||
cache.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Sequence
|
||||
|
||||
from decnet.asn.base import Provider
|
||||
from decnet.asn.iptoasn.fetch import IPTOASN_SOURCES, fetch_all
|
||||
from decnet.asn.iptoasn.parse import parse_file
|
||||
from decnet.asn.lookup import AsnLookup
|
||||
from decnet.asn.paths import ensure_root
|
||||
|
||||
logger = logging.getLogger("decnet.asn.iptoasn.provider")

# File name of the pickled lookup index kept next to the raw dump. It
# exists so a profiler restart does not have to re-parse the ~580k-row
# gzipped dump; ``_cache_fresh`` decides when it must be rebuilt (any raw
# file newer than the pickle).
_CACHE_NAME = ".iptoasn_index.pkl"
|
||||
|
||||
|
||||
class IptoasnProvider(Provider):
    """ASN provider backed by the iptoasn.com dump.

    ``refresh`` downloads the raw files and drops the pickled index;
    ``build_lookup`` returns an :class:`AsnLookup`, reusing the pickled
    index when it is not older than any raw file and re-parsing the raw
    files otherwise.
    """

    name = "iptoasn"

    def __init__(self) -> None:
        # Directory holding both the raw dumps and the pickled index.
        # NOTE(review): presumably ensure_root() creates it on demand —
        # confirm against decnet.asn.paths.
        self._root = ensure_root()

    # ---------- Provider interface ----------

    def refresh(self) -> None:
        """Download the dump again and discard any pickled index."""
        logger.info("asn.iptoasn: refreshing dump into %s", self._root)
        fetch_all(self._root)
        index_file = self._root / _CACHE_NAME
        if index_file.exists():
            index_file.unlink(missing_ok=True)

    def build_lookup(self) -> AsnLookup:
        """Return the lookup, from the pickled index when it is usable.

        A fresh index is loaded directly. If loading fails for any
        reason, or the index is stale or missing, every raw file that
        exists is parsed, a new lookup is built, and an attempt is made
        to persist it. A failed save is logged but does not prevent the
        lookup from being returned.
        """
        index_file = self._root / _CACHE_NAME

        if self._cache_fresh(index_file):
            try:
                cached = AsnLookup.load(index_file)
                logger.debug(
                    "asn.iptoasn: loaded cached index (%d ranges)",
                    len(cached),
                )
                return cached
            except Exception as exc:
                # Fall through to a full rebuild below.
                logger.warning(
                    "asn.iptoasn: cache load failed, rebuilding: %s", exc
                )

        rows = []
        for raw_file in self.data_paths():
            if raw_file.exists():
                rows.extend(parse_file(raw_file))
        built = AsnLookup.from_ranges(rows)

        try:
            built.save(index_file)
        except Exception as exc:
            logger.warning("asn.iptoasn: cache save failed: %s", exc)

        logger.info("asn.iptoasn: built index with %d ranges", len(built))
        return built

    def data_paths(self) -> Sequence[Path]:
        """Return the expected on-disk path of each configured raw dump."""
        return [self._root / f"{stem}.tsv.gz" for stem, _ in IPTOASN_SOURCES]

    # ---------- internals ----------

    def _cache_fresh(self, cache: Path) -> bool:
        """Report whether *cache* exists and no raw file is newer than it."""
        if not cache.exists():
            return False
        built_at = cache.stat().st_mtime
        # Raw files that are absent do not count against freshness.
        return not any(
            raw_file.exists() and raw_file.stat().st_mtime > built_at
            for raw_file in self.data_paths()
        )
|
||||
Reference in New Issue
Block a user