fix(ingester): retry get_state on startup DB errors; bump deps + rename behave packages

ingester: wrap the bootstrap get_state() call in a retry-forever loop. When
MySQL came up after the API process, the first get_state() raised and killed
the ingestion task permanently, before it ever entered _run_loop. Regression
test added.

deps: idna 3.13→3.15 (CVE-2026-45409), twisted 26.4.0rc2→26.4.0
(PYSEC-2026-160), pip 26.1→26.1.1 (CVE-2026-3219 resolved upstream),
behave-core/behave-shell renamed from decnet-behave-* and bumped to 0.1.1.
pre-commit hook updated to reflect current ignore list.
This commit is contained in:
2026-05-20 22:10:15 -04:00
parent 916b21b652
commit 7bac3a29c6
3 changed files with 71 additions and 4 deletions

View File

@@ -770,3 +770,55 @@ class TestLogIngestionWorker:
if c[0][0] == _INGEST_STATE_KEY and c[0][1] == {"position": 0}
]
assert reset_calls, "set_state not called with position=0 after truncation"
@pytest.mark.asyncio
async def test_get_state_db_error_retries_then_recovers(self, tmp_path):
    """A failing bootstrap get_state() is retried instead of killing the worker.

    Regression test for the 2026-05-20 incident, in which MySQL became
    available only after the API process had started. The very first
    get_state() raised before _run_loop was entered, the task died on the
    unhandled exception, and the ingester stayed silently dead for the
    rest of the API lifetime.
    """
    from decnet.web.ingester import log_ingestion_worker
    from sqlalchemy.exc import OperationalError

    log_file = str(tmp_path / "test.log")

    # A single event line in the JSON file that sits next to the log file.
    event = {
        "decky": "d1", "service": "ssh", "event_type": "auth",
        "attacker_ip": "1.2.3.4", "fields": {}, "raw_line": "x", "msg": "",
    }
    (tmp_path / "test.json").write_text(json.dumps(event) + "\n")

    # Shared call counters, mutated by the two fakes below.
    calls = {"get_state": 0, "sleep": 0}

    async def flaky_get_state(key):
        # Only the very first call fails; every later call reports position 0.
        calls["get_state"] += 1
        if calls["get_state"] > 1:
            return {"position": 0}
        raise OperationalError("connection refused", None, None)

    async def counting_sleep(secs):
        calls["sleep"] += 1
        # Sleep #1 is the retry backoff that follows the OperationalError.
        # Sleep #2 means _run_loop was entered and the batch was processed,
        # so the worker is stopped there by cancelling it.
        if calls["sleep"] < 2:
            return
        raise asyncio.CancelledError()

    repo = MagicMock()
    repo.get_state = flaky_get_state
    repo.set_state = AsyncMock()
    repo.add_bounty = AsyncMock()
    repo.add_logs = AsyncMock()

    env_patch = patch.dict(os.environ, {"DECNET_INGEST_LOG_FILE": log_file})
    sleep_patch = patch("decnet.web.ingester._sleep", side_effect=counting_sleep)
    with env_patch, sleep_patch, pytest.raises(asyncio.CancelledError):
        await log_ingestion_worker(repo)

    assert calls["get_state"] == 2, "should have retried get_state once after the error"
    repo.add_logs.assert_awaited_once()