Initial commit: ULPgrammer
- Core Telegram monitoring pipeline (scraper, processor, notifier, downloaders) - Textual TUI frontend with thread-safe event bus - SQLite persistence, severity scoring, dedup cache - Fixed ULP parser: handles https:// truncation, port+path URLs, semicolon separator - Test suite: 88 tests across scorer, cache, database, processor
This commit is contained in:
1
utils/__init__.py
Normal file
1
utils/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
"""utils — pure logic modules with no Telegram dependencies."""
|
||||
32
utils/cache.md
Normal file
32
utils/cache.md
Normal file
@@ -0,0 +1,32 @@
|
||||
# utils/cache.py
|
||||
|
||||
Tracks already-processed Telegram document IDs to avoid redownloading.
|
||||
Persists to `data/cache.json` as a JSON array of integers.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from utils.cache import is_seen, mark_seen
|
||||
```
|
||||
|
||||
### `is_seen(file_id: int) -> bool`
|
||||
Returns `True` if this document ID has been processed before.
|
||||
Loads from disk on every call (safe for multi-process, slightly slow for hot loops — not an issue given download cadence).
|
||||
|
||||
### `mark_seen(file_id: int) -> None`
|
||||
Adds `file_id` to the cache and persists to disk.
|
||||
|
||||
---
|
||||
|
||||
## Storage
|
||||
|
||||
- **File:** `data/cache.json`
|
||||
- **Format:** JSON array of integers — `[123456789, 987654321, ...]`
|
||||
- **No expiry** — grows indefinitely. Safe to delete to re-process all files.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- `is_seen` + `mark_seen` are called in `core/scraper.py` after a successful download+process cycle, not before — so a file that fails mid-process will be retried on next run.
|
||||
- Not thread-safe (load/modify/save is not atomic). Acceptable because downloads are sequential within the bot loop.
|
||||
38
utils/cache.py
Normal file
38
utils/cache.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
cache.py — Tracks already-processed file IDs to avoid redownloading.
|
||||
Persists to a simple JSON file on disk.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Location of the JSON cache file. The path is relative, so it resolves
# against the current working directory of the running process.
CACHE_FILE = Path("./data/cache.json")
|
||||
|
||||
|
||||
def _load() -> set:
    """
    Read the set of already-seen file IDs from CACHE_FILE.

    Returns:
        The IDs stored in the cache file, or an empty set when the file is
        missing, unreadable, or does not hold a JSON array of hashable values.

    Loading stays best-effort (it never raises), but any failure other than
    a missing file is logged: an empty result means the next save overwrites
    whatever was in the damaged file, so that should not happen silently.
    """
    try:
        with open(CACHE_FILE, "r", encoding="utf-8") as f:
            return set(json.load(f))
    except FileNotFoundError:
        # First run, or the cache was deleted on purpose — nothing seen yet.
        return set()
    except Exception as exc:
        log.warning(f"Ignoring unreadable cache file {CACHE_FILE}: {exc}")
        return set()
|
||||
|
||||
|
||||
def _save(seen: set) -> None:
    """
    Persist the set of seen file IDs to CACHE_FILE as a JSON array.

    Args:
        seen: IDs to store; written as a JSON list.

    The parent directory is created when missing. The data is first written
    to a sibling ``*.tmp`` file and then moved over CACHE_FILE, so an
    interrupted write cannot leave a truncated cache behind.
    """
    target = Path(CACHE_FILE)
    target.parent.mkdir(parents=True, exist_ok=True)

    tmp_file = target.with_name(target.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(list(seen), f)

    # Path.replace overwrites the destination in a single rename step.
    tmp_file.replace(target)
|
||||
|
||||
|
||||
def is_seen(file_id: int) -> bool:
    """Report whether *file_id* is already recorded in the on-disk cache."""
    seen_ids = _load()
    return file_id in seen_ids
|
||||
|
||||
|
||||
def mark_seen(file_id: int) -> None:
    """Record *file_id* in the cache and write the updated cache to disk."""
    updated_ids = _load() | {file_id}
    _save(updated_ids)
    log.debug(f" Cached file ID {file_id}")
|
||||
89
utils/database.md
Normal file
89
utils/database.md
Normal file
@@ -0,0 +1,89 @@
|
||||
# utils/database.py
|
||||
|
||||
SQLite persistence layer for credential hits.
|
||||
DB file: `data/hits.db`
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from utils.database import init_db, insert_hits, search, recent, by_severity, stats
|
||||
```
|
||||
|
||||
### Setup
|
||||
|
||||
#### `init_db() -> None`
|
||||
Creates `hits` table and indexes if they don't exist. Call once on startup.
|
||||
Safe to call multiple times (idempotent).
|
||||
|
||||
---
|
||||
|
||||
### Writing
|
||||
|
||||
#### `insert_hits(scored_hits, source, filename, seen_before=False) -> int`
|
||||
Inserts a list of `ScoredHit` objects. Returns row count inserted.
|
||||
|
||||
```python
|
||||
insert_hits(new_hits, source="channelname", filename="combo.zip")
|
||||
insert_hits(dupe_hits, source="channelname", filename="combo.zip", seen_before=True)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### Querying
|
||||
|
||||
#### `search(keyword: str) -> list[sqlite3.Row]`
|
||||
Substring search (SQL `LIKE '%keyword%'`, not a full-text index) across `url`, `username`, and `raw`. Returns rows sorted by score DESC, timestamp DESC.
|
||||
|
||||
#### `recent(limit: int = 50) -> list[sqlite3.Row]`
|
||||
Most recent hits, newest first.
|
||||
|
||||
#### `by_severity(severity: str) -> list[sqlite3.Row]`
|
||||
All unique (non-duplicate) hits at a given severity, newest first.
|
||||
`severity` must be one of: `"CRITICAL"`, `"HIGH"`, `"MEDIUM"`, `"LOW"`
|
||||
|
||||
#### `stats() -> dict`
|
||||
Returns summary counters:
|
||||
```python
|
||||
{
|
||||
"total": int, # all rows
|
||||
"unique": int, # seen_before=0
|
||||
"duplicates": int, # seen_before=1
|
||||
"critical": int, # unique CRITICAL
|
||||
"high": int,
|
||||
"medium": int,
|
||||
"low": int,
|
||||
"sources": int, # distinct source channels
|
||||
"top_source": {"source": str, "cnt": int} | None,
|
||||
}
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Schema
|
||||
|
||||
```sql
|
||||
hits (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
url TEXT,
|
||||
username TEXT,
|
||||
password TEXT,
|
||||
raw TEXT NOT NULL, -- full original credential line
|
||||
source TEXT, -- channel username or ID
|
||||
filename TEXT, -- downloaded file name
|
||||
timestamp TEXT NOT NULL, -- "YYYY-MM-DD HH:MM:SS UTC"
|
||||
severity TEXT NOT NULL, -- CRITICAL/HIGH/MEDIUM/LOW
|
||||
score INTEGER NOT NULL, -- 40/30/20/10
|
||||
reasons TEXT, -- pipe-separated reason strings
|
||||
seen_before INTEGER NOT NULL -- 0=new, 1=duplicate
|
||||
)
|
||||
```
|
||||
|
||||
Indexes: `url`, `username`, `source`, `timestamp`, `severity`.
|
||||
|
||||
---
|
||||
|
||||
## Notes
|
||||
|
||||
- Each query opens and closes its own connection via the `_connect()` context manager.
|
||||
- `conn.row_factory = sqlite3.Row` — rows support both index and column-name access.
|
||||
- Transactions: commit on success, rollback on exception.
|
||||
171
utils/database.py
Normal file
171
utils/database.py
Normal file
@@ -0,0 +1,171 @@
|
||||
"""
|
||||
database.py — SQLite storage for credential hits.
|
||||
|
||||
Schema:
|
||||
hits table:
|
||||
- id auto-increment primary key
|
||||
- url the target URL from the credential line
|
||||
- username extracted username/email
|
||||
- password extracted password
|
||||
- raw the full original line
|
||||
- source channel/bot it came from
|
||||
- filename the file it was found in
|
||||
- timestamp UTC time of discovery
|
||||
- severity CRITICAL / HIGH / MEDIUM / LOW
|
||||
- score numeric score (higher = worse)
|
||||
- reasons pipe-separated list of scoring reasons
|
||||
- seen_before whether this was a duplicate (for stats)
|
||||
"""
|
||||
|
||||
import sqlite3
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from contextlib import contextmanager
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Location of the SQLite database file. The path is relative, so it resolves
# against the current working directory of the running process.
DB_FILE = Path("./data/hits.db")
|
||||
|
||||
|
||||
# ─── Setup ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@contextmanager
def _connect():
    """
    Yield a SQLite connection to DB_FILE wrapped in a transaction.

    Rows come back as ``sqlite3.Row``. The transaction is committed when the
    ``with`` body finishes normally and rolled back (then re-raised) when it
    raises an Exception. The connection is always closed afterwards.
    """
    db = sqlite3.connect(DB_FILE)
    db.row_factory = sqlite3.Row
    try:
        try:
            yield db
            db.commit()
        except Exception:
            db.rollback()
            raise
    finally:
        db.close()
|
||||
|
||||
|
||||
def init_db() -> None:
    """
    Create the ``hits`` table and its indexes if they don't exist yet.

    The directory that holds DB_FILE is created first, because SQLite cannot
    open a database file inside a directory that does not exist. Every
    statement uses ``IF NOT EXISTS``, so calling this repeatedly is safe.
    """
    Path(DB_FILE).parent.mkdir(parents=True, exist_ok=True)

    with _connect() as conn:
        conn.execute("""
            CREATE TABLE IF NOT EXISTS hits (
                id          INTEGER PRIMARY KEY AUTOINCREMENT,
                url         TEXT,
                username    TEXT,
                password    TEXT,
                raw         TEXT NOT NULL,
                source      TEXT,
                filename    TEXT,
                timestamp   TEXT NOT NULL,
                severity    TEXT NOT NULL DEFAULT 'LOW',
                score       INTEGER NOT NULL DEFAULT 10,
                reasons     TEXT,
                seen_before INTEGER NOT NULL DEFAULT 0
            )
        """)
        # One single-column index per field used for filtering or sorting.
        conn.execute("CREATE INDEX IF NOT EXISTS idx_url ON hits(url)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_username ON hits(username)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_source ON hits(source)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_timestamp ON hits(timestamp)")
        conn.execute("CREATE INDEX IF NOT EXISTS idx_severity ON hits(severity)")
    log.info(f"Database ready: {DB_FILE}")
|
||||
|
||||
|
||||
# ─── Writing ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def insert_hits(
    scored_hits: list,
    source: str,
    filename: str,
    seen_before: bool = False,
) -> int:
    """
    Store a batch of ScoredHit objects in the ``hits`` table.

    Every row of the batch gets the same UTC timestamp (taken once, when the
    call starts), the given *source* and *filename*, and ``seen_before``
    stored as 1 or 0. Each hit's reasons are joined with " | ".

    Returns the number of rows handed to the database.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    duplicate_flag = 1 if seen_before else 0

    rows = [
        (
            hit.url,
            hit.username,
            hit.password,
            hit.raw,
            source,
            filename,
            stamp,
            hit.severity,
            hit.score,
            " | ".join(hit.reasons),
            duplicate_flag,
        )
        for hit in scored_hits
    ]

    with _connect() as conn:
        conn.executemany("""
            INSERT INTO hits
                (url, username, password, raw, source, filename, timestamp,
                 severity, score, reasons, seen_before)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)

    inserted = len(rows)
    log.info(f" DB: inserted {inserted} row(s) from {filename}")
    return inserted
|
||||
|
||||
|
||||
# ─── Querying ─────────────────────────────────────────────────────────────────
|
||||
|
||||
def search(keyword: str) -> list[sqlite3.Row]:
    """
    Find hits whose url, username, or raw line contains *keyword*.

    The keyword is matched literally as a substring: the LIKE wildcards
    ``%`` and ``_`` are escaped, so searching for ``john_doe`` does not also
    match ``johnXdoe``.

    Returns:
        Matching rows ordered by score (highest first), then timestamp
        (newest first).
    """
    # Escape the escape character first, then the two LIKE wildcards.
    escaped = (
        keyword.replace("\\", "\\\\")
        .replace("%", "\\%")
        .replace("_", "\\_")
    )
    like_arg = f"%{escaped}%"

    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            WHERE url LIKE ? ESCAPE '\\'
               OR username LIKE ? ESCAPE '\\'
               OR raw LIKE ? ESCAPE '\\'
            ORDER BY score DESC, timestamp DESC
        """, (like_arg,) * 3).fetchall()
|
||||
|
||||
|
||||
def recent(limit: int = 50) -> list[sqlite3.Row]:
    """
    Return the most recent hits, newest first.

    Args:
        limit: Maximum number of rows to return.

    Rows inserted in one batch share the same timestamp string, so the
    auto-increment ``id`` is used as a tie-breaker. This keeps the order,
    and the set of rows cut off by *limit*, stable between calls.
    """
    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            ORDER BY timestamp DESC, id DESC
            LIMIT ?
        """, (limit,)).fetchall()
|
||||
|
||||
|
||||
def by_severity(severity: str) -> list[sqlite3.Row]:
    """
    Return all non-duplicate hits of a given severity level, newest first.

    Args:
        severity: Severity label to filter on, compared for exact equality
            with the stored ``severity`` column.

    Only rows with ``seen_before = 0`` are returned. Rows inserted in one
    batch share the same timestamp string, so ``id`` breaks ties to keep
    the order stable between calls.
    """
    with _connect() as conn:
        return conn.execute("""
            SELECT * FROM hits
            WHERE severity = ? AND seen_before = 0
            ORDER BY timestamp DESC, id DESC
        """, (severity,)).fetchall()
|
||||
|
||||
|
||||
def stats() -> dict:
    """
    Collect summary counters over the ``hits`` table.

    Returns a dict with the total row count, the unique (seen_before=0) and
    duplicate counts, per-severity counts of unique rows, the number of
    distinct sources, and the source with the most rows (``None`` when the
    top-source query returns no row).
    """
    with _connect() as conn:

        def scalar(sql: str):
            # First column of the first row of a single-value query.
            return conn.execute(sql).fetchone()[0]

        total = scalar("SELECT COUNT(*) FROM hits")
        unique = scalar("SELECT COUNT(*) FROM hits WHERE seen_before=0")

        summary = {
            "total": total,
            "unique": unique,
            "duplicates": total - unique,
            "critical": scalar("SELECT COUNT(*) FROM hits WHERE severity='CRITICAL' AND seen_before=0"),
            "high": scalar("SELECT COUNT(*) FROM hits WHERE severity='HIGH' AND seen_before=0"),
            "medium": scalar("SELECT COUNT(*) FROM hits WHERE severity='MEDIUM' AND seen_before=0"),
            "low": scalar("SELECT COUNT(*) FROM hits WHERE severity='LOW' AND seen_before=0"),
            "sources": scalar("SELECT COUNT(DISTINCT source) FROM hits"),
        }

        busiest = conn.execute("""
            SELECT source, COUNT(*) as cnt FROM hits
            GROUP BY source ORDER BY cnt DESC LIMIT 1
        """).fetchone()
        summary["top_source"] = dict(busiest) if busiest else None

        return summary
|
||||
87
utils/scorer.md
Normal file
87
utils/scorer.md
Normal file
@@ -0,0 +1,87 @@
|
||||
# utils/scorer.py
|
||||
|
||||
Severity scoring for credential hits. No Telegram deps. Pure logic.
|
||||
|
||||
## Public API
|
||||
|
||||
```python
|
||||
from utils.scorer import score_hit, score_hits, summarize, ScoredHit
|
||||
from utils.scorer import CRITICAL, HIGH, MEDIUM, LOW, SEVERITY_EMOJI, SEVERITY_SCORES
|
||||
```
|
||||
|
||||
### `score_hit(line: str) -> ScoredHit`
|
||||
Score a single raw credential line. Parses ULP format (`url:user:pass`), runs all checks, returns a `ScoredHit`.
|
||||
|
||||
### `score_hits(lines: list[str]) -> list[ScoredHit]`
|
||||
Score a list of lines. Returns sorted descending by score.
|
||||
|
||||
### `summarize(scored: list[ScoredHit]) -> dict`
|
||||
Returns `{CRITICAL: n, HIGH: n, MEDIUM: n, LOW: n}`.
|
||||
|
||||
---
|
||||
|
||||
## ScoredHit dataclass
|
||||
|
||||
| Field | Type | Description |
|
||||
|-------|------|-------------|
|
||||
| `raw` | str | Original credential line |
|
||||
| `severity` | str | CRITICAL / HIGH / MEDIUM / LOW |
|
||||
| `score` | int | 40 / 30 / 20 / 10 |
|
||||
| `reasons` | list[str] | Human-readable match reasons |
|
||||
| `url` | str\|None | Parsed URL field |
|
||||
| `username` | str\|None | Parsed username/email field |
|
||||
| `password` | str\|None | Parsed password field |
|
||||
| `.emoji` | property | 🔴🟠🟡🟢 |
|
||||
|
||||
---
|
||||
|
||||
## Scoring rules (highest match wins)
|
||||
|
||||
| Severity | Triggers |
|
||||
|----------|----------|
|
||||
| CRITICAL | Employee email domain after `@` in username/line · Privileged service URL (admin, vpn, ssh, rdp, gitlab, jira…) |
|
||||
| HIGH | Internal service URL (intranet, erp, crm, sso, owa, sharepoint…) |
|
||||
| MEDIUM | Client-facing URL (app, patient, booking, helpdesk…) |
|
||||
| LOW | Org domain appears anywhere in line (baseline) |
|
||||
|
||||
Check 6 (no severity change): flags weak passwords ≤6 chars or common strings.
|
||||
|
||||
---
|
||||
|
||||
## Employee domain matching
|
||||
|
||||
Keywords in `config.TARGET_KEYWORDS` containing `@` become employee patterns.
|
||||
Pattern: `@<domain>(?:[^a-zA-Z0-9.\-]|$)` — requires literal `@` before the domain.
|
||||
**`user@gmail.com` on a URL containing `myorg.cl` does NOT trigger CRITICAL.**
|
||||
|
||||
Keywords without `@` go only to `ORG_DOMAINS` (LOW baseline).
|
||||
|
||||
---
|
||||
|
||||
## ULP line parser (`ULP_PATTERN`)
|
||||
|
||||
Separators: `:` `;` `,` `|` `\t` (any of these between the three fields).
|
||||
|
||||
The URL field handles two common stealer-log complications:
|
||||
|
||||
1. **`://` not treated as separator** — the optional scheme prefix `(?:https?|ftp)://` is consumed before the character-class match, so `https://` never gets split at the colon.
|
||||
|
||||
2. **Port + path consumed into the URL** — the optional group `(?::\d+/[^\s:;,|\t]*)` absorbs `:port/path` when the port is pure digits immediately followed by `/`. This correctly handles `http://host:8085/path/:user:pass` but intentionally skips patterns like `:24145487-8` (RUT number — hyphen after digits, no `/`).
|
||||
|
||||
**Known limitation:** A bare port with no path (e.g. `https://host:8080:user:pass`) will mis-parse `8080` as the username. This is not observed in practice — stealer logs always include at least a trailing `/`.
|
||||
|
||||
---
|
||||
|
||||
## Module-level globals (rebuilt on import + via KeywordsScreen)
|
||||
|
||||
| Name | Type | Description |
|
||||
|------|------|-------------|
|
||||
| `EMPLOYEE_DOMAINS` | `list[tuple[str, Pattern]]` | `(domain_str, anchored_pattern)` for `@`-keywords |
|
||||
| `ORG_DOMAINS` | `list[Pattern]` | Plain domain patterns for all keywords |
|
||||
|
||||
To rebuild after editing `config.TARGET_KEYWORDS` at runtime:
|
||||
```python
|
||||
import utils.scorer as scorer
|
||||
scorer.EMPLOYEE_DOMAINS = scorer._build_employee_domains()
|
||||
scorer.ORG_DOMAINS = scorer._build_org_domains()
|
||||
```
|
||||
273
utils/scorer.py
Normal file
273
utils/scorer.py
Normal file
@@ -0,0 +1,273 @@
|
||||
"""
|
||||
scorer.py — Severity scoring for credential hits.
|
||||
|
||||
Scoring logic (highest match wins):
|
||||
|
||||
CRITICAL — Employee credentials (internal email domain)
|
||||
e.g. jdoe@yourclinic.cl:password
|
||||
— Admin/privileged service URLs
|
||||
e.g. admin., vpn., ssh., rdp., gitlab., jira.
|
||||
|
||||
HIGH — Internal-facing services
|
||||
e.g. intranet., erp., crm., portal., citrix.
|
||||
— Password manager or SSO hits
|
||||
— Any credential where username looks like an employee email
|
||||
|
||||
MEDIUM — Client-facing portals
|
||||
e.g. app., patient., client., booking.
|
||||
— Domain match on a non-privileged service
|
||||
|
||||
LOW — Generic domain keyword match
|
||||
— No URL parsed, just a raw domain mention
|
||||
|
||||
Each scored hit is returned as a ScoredHit dataclass with:
|
||||
- severity: CRITICAL / HIGH / MEDIUM / LOW
|
||||
- score: int (higher = worse)
|
||||
- reasons: list of human-readable reasons
|
||||
- raw: original line
|
||||
"""
|
||||
|
||||
import re
|
||||
import logging
|
||||
from dataclasses import dataclass, field
|
||||
from config import TARGET_KEYWORDS
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)


# ─── Severity levels ─────────────────────────────────────────────────────────

# Severity labels. The same strings are used as dictionary keys below and are
# stored on each ScoredHit.
CRITICAL = "CRITICAL"
HIGH = "HIGH"
MEDIUM = "MEDIUM"
LOW = "LOW"

# Numeric score for each severity label (higher = worse).
SEVERITY_SCORES = {
    CRITICAL: 40,
    HIGH: 30,
    MEDIUM: 20,
    LOW: 10,
}

# Emoji shown for each severity label.
SEVERITY_EMOJI = {
    CRITICAL: "🔴",
    HIGH: "🟠",
    MEDIUM: "🟡",
    LOW: "🟢",
}
|
||||
|
||||
|
||||
# ─── Pattern banks ───────────────────────────────────────────────────────────
|
||||
|
||||
# Subdomains/services that indicate privileged access.
# All three service banks share the same shape: the service name must begin
# at the start of the string, right after "http://" / "https://", or right
# after a dot. There is no trailing boundary, so a name also matches when it
# is only the prefix of a longer label; just "ad\." and "mail\." require a
# following dot.
CRITICAL_SERVICES = re.compile(
    r"(?:^|https?://|\.)"
    r"(admin|vpn|ssh|rdp|ftp|sftp|gitlab|github|bitbucket|jenkins|"
    r"jira|confluence|grafana|kibana|sentry|vault|bastion|jump|"
    r"firewall|router|switch|proxy|ldap|ad\.|activedirectory|"
    r"exchange|mail\.)",
    re.IGNORECASE
)

# Internal-facing services (intranet, SSO, HR/finance tooling, ...).
HIGH_SERVICES = re.compile(
    r"(?:^|https?://|\.)"
    r"(intranet|erp|crm|portal|citrix|workspace|webmail|owa|"
    r"sharepoint|teams|slack|zoom|meet|sso|login|auth|oauth|"
    r"accounts?|dashboard|internal|corp|staff|hr|payroll|"
    r"finance|accounting)",
    re.IGNORECASE
)

# Client-facing services (booking, patient/customer portals, support, ...).
MEDIUM_SERVICES = re.compile(
    r"(?:^|https?://|\.)"
    r"(app|patient|client|customer|booking|appointment|"
    r"reserva|cita|paciente|user|member|registro|signup|"
    r"support|helpdesk|ticket)",
    re.IGNORECASE
)

# Looks like a corporate email (user@domain); group 1 captures the domain.
# NOTE(review): not referenced anywhere in the part of this file visible
# here — confirm whether it is still used.
EMAIL_PATTERN = re.compile(r"[a-zA-Z0-9._%+\-]+@([a-zA-Z0-9.\-]+\.[a-zA-Z]{2,})")

# ULP line parser
# Separator set: colon, semicolon, comma, pipe, tab.
# URL field: optional scheme (http/https/ftp) consumed first so '://' is never
# mistaken for a separator; then an optional port group ':\d+/' absorbs port+path
# (port is digits immediately followed by '/') so 'http://host:88/path:user:pass'
# yields url='http://host:88/path', not url='http'.
# The username may not contain whitespace or a separator; the password is
# everything after the second separator up to the end of the line.
ULP_PATTERN = re.compile(
    r"^(?P<url>"
    r"(?:(?:https?|ftp)://)?[^\s:;,|\t]+"   # optional scheme + host/path
    r"(?::\d+/[^\s:;,|\t]*)?"               # optional :port/path (port = digits then /)
    r")"
    r"(?:[:;,|\t])"
    r"(?P<username>[^\s:;,|\t]+)"
    r"(?:[:;,|\t])"
    r"(?P<password>.+)$"
)
|
||||
|
||||
|
||||
# ─── Derived from config ──────────────────────────────────────────────────────
|
||||
|
||||
def _kw_to_domain(kw: str) -> str:
|
||||
"""Strip regex syntax from a keyword to get a plain domain string."""
|
||||
return kw.replace(r"@", "").replace(r"\.", ".").strip("^$").lstrip(".")
|
||||
|
||||
|
||||
def _build_employee_domains() -> list[tuple[str, re.Pattern]]:
    """
    Build the employee-email patterns from TARGET_KEYWORDS.

    Only keywords containing '@' are used. Each one is reduced to a plain
    domain and compiled into a case-insensitive pattern that requires a
    literal '@' directly before the domain and a non-domain character (or
    end of string) directly after it. A URL that merely contains the org
    domain therefore cannot produce an employee match for an unrelated
    address such as @gmail.com.

    Returns a list of (domain_str, compiled_pattern) tuples; keywords that
    reduce to an empty domain are skipped.
    """
    employee_keywords = (kw for kw in TARGET_KEYWORDS if "@" in kw)
    domains = (_kw_to_domain(kw) for kw in employee_keywords)
    return [
        (
            domain,
            re.compile(
                r"@" + re.escape(domain) + r"(?:[^a-zA-Z0-9.\-]|$)",
                re.IGNORECASE,
            ),
        )
        for domain in domains
        if domain
    ]


EMPLOYEE_DOMAINS = _build_employee_domains()
|
||||
|
||||
|
||||
def _build_org_domains() -> list[re.Pattern]:
    """
    Build the baseline (LOW) patterns from TARGET_KEYWORDS.

    Every keyword is reduced to a plain domain and compiled as an escaped,
    case-insensitive pattern that can match anywhere in a line. Keywords
    that reduce to an empty domain are skipped.
    """
    domains = map(_kw_to_domain, TARGET_KEYWORDS)
    return [
        re.compile(re.escape(domain), re.IGNORECASE)
        for domain in domains
        if domain
    ]


ORG_DOMAINS = _build_org_domains()
|
||||
|
||||
|
||||
|
||||
# ─── Scoring logic ────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class ScoredHit:
|
||||
raw: str
|
||||
severity: str
|
||||
score: int
|
||||
reasons: list[str] = field(default_factory=list)
|
||||
url: str | None = None
|
||||
username: str | None = None
|
||||
password: str | None = None
|
||||
|
||||
@property
|
||||
def emoji(self) -> str:
|
||||
return SEVERITY_EMOJI.get(self.severity, "⚪")
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f"{self.emoji} [{self.severity}] {self.raw}"
|
||||
|
||||
|
||||
def score_hit(line: str) -> ScoredHit:
    """
    Score a single credential line.

    The line is stripped, parsed with ULP_PATTERN when possible, and run
    through the checks below. The highest severity raised by any check
    wins; LOW is the default when no check raises one.

    Args:
        line: Raw credential line, typically ``url:user:pass``.

    Returns:
        A ScoredHit carrying the stripped line, the final severity and its
        numeric score, the human-readable reasons, and the parsed
        url / username / password (each None when the line did not parse).
    """
    line = line.strip()
    reasons = []
    scores = []

    # Parse ULP fields if possible
    url = username = password = None
    m = ULP_PATTERN.match(line)
    if m:
        url, username, password = m.group("url", "username", "password")

    # ── Check 1: Employee email domain in username or line ───────────────
    # Each pattern requires a literal '@' right before the domain, so a URL
    # containing the org domain never triggers CRITICAL for an unrelated
    # email (@gmail etc). The parsed username is tried first, then the
    # whole line. Only the first matching domain is reported.
    for domain_str, pat in EMPLOYEE_DOMAINS:
        if pat.search(username or "") or pat.search(line):
            scores.append(CRITICAL)
            reasons.append(f"Employee email domain: {domain_str}")
            break

    # ── Checks 2-4: classify the URL by service type ──────────────────────
    # Every bank is tested independently; a URL may match more than one and
    # each match contributes its own reason.
    if url:
        url_checks = (
            (CRITICAL_SERVICES, CRITICAL, "Critical service URL"),
            (HIGH_SERVICES, HIGH, "High-value internal service"),
            (MEDIUM_SERVICES, MEDIUM, "Client-facing service"),
        )
        for bank, severity, label in url_checks:
            if bank.search(url):
                scores.append(severity)
                reasons.append(f"{label}: {url}")

    # ── Check 5: Generic org domain match (baseline) ─────────────────────
    # Only recorded when no earlier check produced a severity.
    if not scores and any(p.search(line) for p in ORG_DOMAINS):
        scores.append(LOW)
        reasons.append("Org domain match in line")

    # ── Check 6: Weak password flag (adds reasons, never changes severity) ─
    if password:
        if len(password) <= 6:
            reasons.append(f"⚠ Weak password ({len(password)} chars)")
        if password.lower() in {"123456", "password", "qwerty", "111111", "admin", "letmein"}:
            reasons.append(f"⚠ Common password: {password}")

    # ── Resolve final severity: the highest level that was raised ─────────
    final_severity = next(
        (level for level in (CRITICAL, HIGH, MEDIUM, LOW) if level in scores),
        LOW,
    )

    if not reasons:
        reasons.append("Pattern match")

    return ScoredHit(
        raw=line,
        severity=final_severity,
        score=SEVERITY_SCORES[final_severity],
        reasons=reasons,
        url=url,
        username=username,
        password=password,
    )
|
||||
|
||||
|
||||
def score_hits(lines: list[str]) -> list[ScoredHit]:
    """
    Score every line and return the hits ordered by score, highest first.

    Hits with equal scores keep the relative order of their input lines.
    """
    return sorted(
        map(score_hit, lines),
        key=lambda hit: hit.score,
        reverse=True,
    )
|
||||
|
||||
|
||||
def summarize(scored: list[ScoredHit]) -> dict:
    """
    Count hits per severity level.

    The result always contains all four levels (CRITICAL, HIGH, MEDIUM,
    LOW), with 0 for levels that have no hits.
    """
    counts = dict.fromkeys((CRITICAL, HIGH, MEDIUM, LOW), 0)
    for hit in scored:
        counts[hit.severity] += 1
    return counts
|
||||
Reference in New Issue
Block a user