Compare commits
452 Commits
201d246c07
...
testing
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b12d46ff9d | ||
| 8033137be6 | |||
|
|
2ce076cd37 | ||
| 17097cc3dc | |||
| 89268f19fb | |||
| 150e5eb5be | |||
| f2e01d8ea6 | |||
| 15b2e7ba5c | |||
| 3d00de8fd3 | |||
| 5e7d68fde3 | |||
| 20e89eb0a6 | |||
| 7483d01311 | |||
| 912171d053 | |||
| 7ba8bafcaa | |||
| 5b1af331b9 | |||
| 03b3c8855c | |||
| 555cd13f09 | |||
| 9b845269c9 | |||
| a0aeba5abc | |||
| d989cd0461 | |||
| 167f140b0e | |||
| c6804d79b6 | |||
| eebf9e4c97 | |||
| 99adbebe75 | |||
| 85c914e754 | |||
| e16f47ad24 | |||
| 4167345d51 | |||
| 6d8c90777d | |||
| b994250ef6 | |||
| b4adc7246f | |||
| 674ac7dd13 | |||
| cc6abf7256 | |||
| 681931d9bb | |||
| 72cc928ebf | |||
| 9ab43b4ea4 | |||
| 5f8149daee | |||
| 4749c972e5 | |||
| e986e81421 | |||
| ccc8619387 | |||
| 9b1168ce0b | |||
| 5d883466a2 | |||
| 6b407e8c9c | |||
| 8344b539c8 | |||
| 9350ce195a | |||
| 3c571cce5a | |||
| e03a6d10a0 | |||
| c5db1d7ba2 | |||
| 0b1a17b4eb | |||
| 0a525ebd37 | |||
| 673bc5b819 | |||
| 5415e98458 | |||
| 1a7da33375 | |||
| 28e2a93355 | |||
| 1de4136ed9 | |||
| 2950fc216e | |||
| 56a88d7bd4 | |||
| 2cc60bd677 | |||
| da3c35c6a4 | |||
| 397a1a111e | |||
| 55e86f606c | |||
| 87cb61c8b2 | |||
| 2eeec15f9c | |||
| 147f52467f | |||
| 49da15823f | |||
| 7e9bc6d49a | |||
| b86129e35e | |||
| a07fb3fe08 | |||
| 4e436da569 | |||
| b321e29002 | |||
| 32eeb0c813 | |||
| cb1872c52f | |||
| 636c057cc5 | |||
| 0b9873982d | |||
| f57c621117 | |||
| 6376523923 | |||
| 5ac8e0f91a | |||
| c17b9e01c8 | |||
| af15e68a3d | |||
| fcdb32908d | |||
| 11b0a99914 | |||
| e2c8b77546 | |||
| 53d08e01e5 | |||
| 34c85346a6 | |||
| 6c4ea706f8 | |||
| f9513bb7dd | |||
| fae3e0caa3 | |||
| 8fb9bc5545 | |||
| 19ceff4417 | |||
| c7658ea65e | |||
| 8f19adecfe | |||
| 6a0d140e91 | |||
| 813f14bf2a | |||
| 914c911984 | |||
| 94a0b46fb9 | |||
| 828165783e | |||
| 22d082ac9a | |||
| f046634d6e | |||
| 818aebadfc | |||
| f97ec4c2c1 | |||
| 73692b52f0 | |||
| 6d520eaa6f | |||
| 4badc75fb2 | |||
| 2979997442 | |||
| 3ee55ec341 | |||
| 674028d476 | |||
| 9650366d34 | |||
| c3518e3159 | |||
| 430262e01a | |||
| a8441481b5 | |||
| f775223a83 | |||
| 8814902999 | |||
| 646aeeca40 | |||
| 095500ae9a | |||
| 10fa8a84d1 | |||
| c595d039bd | |||
| 0e40cc8ae1 | |||
| b0b08754d0 | |||
| 453ab177b4 | |||
| 8d1c449173 | |||
| c5ad04620b | |||
| 3de19eb102 | |||
| 5b5ff54fa2 | |||
| 900c0c3ef5 | |||
| 4c37ece39e | |||
| cc2deb73f7 | |||
| 7fafdd66de | |||
| d531cea536 | |||
| 75af00c9c8 | |||
| 6936a1426c | |||
| 0946bab424 | |||
| 0a1cf65ddb | |||
| 059d1dba75 | |||
| 97aa57faed | |||
| e364ef8859 | |||
| 87412da1ca | |||
| 7923006203 | |||
| 6a4592a8f5 | |||
| ed323581fe | |||
| f7da33726c | |||
| de2f4c3a62 | |||
| a9775c4000 | |||
| fb522af107 | |||
| e545f7d8d3 | |||
| 6b6a808a4a | |||
| 7021fda0e6 | |||
| 27f7de9886 | |||
| 304592abfe | |||
| 0def6f7e37 | |||
| 943bb3a39d | |||
| f6b83755eb | |||
| 4f1077be72 | |||
| 448212ebcd | |||
| dc3d08dd41 | |||
| 84c1ca9c9b | |||
| 7904ef1308 | |||
| e80f3eec54 | |||
| 00254629f8 | |||
| 3eb67c9400 | |||
| a009549326 | |||
| 8a6d632ab0 | |||
| 4ec0dd75c8 | |||
| d3d9bd5aa7 | |||
| cd70136d09 | |||
| f49a7db07d | |||
| 58ca9075db | |||
| 023bc1993d | |||
| 0dd3811436 | |||
| 9816cdbd53 | |||
| 50870f2e7a | |||
| a455248dd9 | |||
| 5fb7ebe433 | |||
| 0d2283e10c | |||
| bf87f8794a | |||
| 181c792753 | |||
| 590c2b0fac | |||
| 00ecea924a | |||
| ce4be68501 | |||
| 817ce32e6d | |||
| 4ea4b0be53 | |||
| 4566146d50 | |||
| b3d1301925 | |||
| a8b9c82c97 | |||
| 6905c88083 | |||
| afe02af5c2 | |||
| 9777aa7677 | |||
| e4bf8fa012 | |||
| 0c1316f74c | |||
| 3404e3b3a6 | |||
| 6b16c844b6 | |||
| e696c2beb3 | |||
| abb4dd9fc0 | |||
| aebb9f81c6 | |||
| 2f47f67eef | |||
| 50c12d9e16 | |||
| f1026b4427 | |||
| f5a9e10bdc | |||
| d064125f61 | |||
| c69fdbb4ac | |||
| 883eaba25b | |||
| bcf460d2a5 | |||
| 010568e558 | |||
| 77a19ffe9f | |||
| a1bc8a878b | |||
| ee176a6f79 | |||
| e169b891d7 | |||
| 448fcd1227 | |||
| 0d92170a57 | |||
| 36031fa10a | |||
| efdaa87ee2 | |||
| bfb5d8c33c | |||
| 37050a4bcd | |||
| 99bc9a8b6d | |||
| f8ef0a5cf1 | |||
| 257624e6a7 | |||
| 40a31d8bc7 | |||
| 05d225ae38 | |||
| 86b9decf80 | |||
| a8356407c5 | |||
| c214cdd7bb | |||
| 9bed930497 | |||
| f3408d5e62 | |||
| 207f791684 | |||
| c973ded2fc | |||
| 52cbb01555 | |||
| 7389ddb62c | |||
| aaac300cc4 | |||
| c78ab032bd | |||
| ca39552692 | |||
| 6d1d69443a | |||
| 2c876b4d86 | |||
| dccb410bb3 | |||
| 2a0c5ca410 | |||
| 5a34371009 | |||
| 351a8939c3 | |||
| ce6b4a4174 | |||
| efc98285aa | |||
| f0ee6ff97e | |||
| ba155b70e1 | |||
| 2bcef50ac5 | |||
| ee682eef65 | |||
| 731063b96e | |||
| 4d10eba7a7 | |||
| 59c405d9e5 | |||
| c2ff8d1a4f | |||
| 638236113d | |||
| f84bf82f6c | |||
| e6127a81a1 | |||
| b70845a85d | |||
| 162f7c1194 | |||
| df84981954 | |||
| a935bf2663 | |||
| e53b580767 | |||
| 99ccd41bb5 | |||
| ec1079e78b | |||
| 8a430bf725 | |||
| 41ff6b4b03 | |||
| 620e1f5b1d | |||
| 011445b77a | |||
| 1e7703d64d | |||
| 9232031ec7 | |||
| 3787f7e5ec | |||
| df67cb8a46 | |||
| 6d031ae18c | |||
| 442413870d | |||
| b5c6b8a073 | |||
| 4a8b13b392 | |||
| f032ece678 | |||
| e684feb1fe | |||
| ec2360a5da | |||
| 323077b383 | |||
| 215251a122 | |||
| e4ccf30133 | |||
| 08436433ef | |||
| 311da4098e | |||
| d4b714dc39 | |||
| c282f74bd4 | |||
| bfff212a05 | |||
| edc8297af3 | |||
| 38832d87d5 | |||
| 51012eaa67 | |||
| cb692d570a | |||
| d61e143b71 | |||
| ae92948e22 | |||
| 26d04d5eb8 | |||
| f0b0967b16 | |||
| a6356abe27 | |||
| 6cbf8de6a8 | |||
| 2ff392511b | |||
| 0eb0b32c7a | |||
| ea95a009df | |||
| 21e6820714 | |||
| 1854f9de28 | |||
| ffc275f051 | |||
| 07bf3dc8cb | |||
| 1753eca198 | |||
| 4418608a54 | |||
| eb2308d9e1 | |||
| ef4179ea1f | |||
| 2f4f81e5de | |||
| 8cbb7834ef | |||
| d43303251d | |||
| c50448995b | |||
| d47a84c90b | |||
| 119b4e8724 | |||
| d3321324eb | |||
| 8181f39ae2 | |||
| 25838eb9f3 | |||
| b51095cec5 | |||
| 4bf671b316 | |||
| 9d64d8a046 | |||
| c804d3111a | |||
| 602a0e1efc | |||
| 9c38a3f11a | |||
| 1674316788 | |||
| e0231bf990 | |||
| e35358afd1 | |||
| ef34df4a7d | |||
| 31d02a9726 | |||
| 8985c28fab | |||
| f3e366a2a3 | |||
| 53647d66b7 | |||
| bff350400f | |||
| fcfc4eba3b | |||
| f94887393c | |||
| 5704e8fcce | |||
| 3f460bab84 | |||
| b802d59c70 | |||
| 1472f1da0a | |||
| 070ad9397c | |||
| fe8dd08ba6 | |||
| 4d1e6c0838 | |||
| ecb813ad38 | |||
| de63a0ab5c | |||
| e14527b382 | |||
| 1518475946 | |||
| ccbe949238 | |||
| dca6eddd5f | |||
| 6f537f52c2 | |||
| 8632cee40a | |||
| d0463c2c16 | |||
| 73ccf12678 | |||
| ef60b086ba | |||
| 1f429cd00e | |||
| 6fbac5d057 | |||
| 91111ea7ee | |||
| 3dae44c652 | |||
| 6d769edce0 | |||
| 49a6a674e6 | |||
| 13ea916943 | |||
| 0fbb07c2ec | |||
| fcaac648a4 | |||
| a41ef52249 | |||
| f21453afdc | |||
| 90d0c3b206 | |||
| a63708a3d1 | |||
| 3fb84ac5d0 | |||
| 51e9e263ca | |||
| 6bbb2376f7 | |||
| 6725197d58 | |||
| 246a82774b | |||
| 6e522c5a55 | |||
| a58d42e492 | |||
| 4596c1d69a | |||
| 3d047f2100 | |||
| 8f25ff677f | |||
| 85bb0e2f65 | |||
| 9ea0abc321 | |||
| c266d1b6e3 | |||
| 8fd166470f | |||
| a93cbe76f9 | |||
| aa848d5260 | |||
| cf5ba5cf2a | |||
| d4d8a2ad0d | |||
| bf5ed7abbb | |||
| fa0cdb3ab5 | |||
| f875350d75 | |||
| e23c6c4ee4 | |||
| f76fc09caf | |||
| 5c0631e12c | |||
| cbb394a160 | |||
| a448dbe283 | |||
| 67c2e30f89 | |||
| e51b65d7c3 | |||
| 34d9e37ab0 | |||
| 7f497ac552 | |||
| f3eaab5d37 | |||
| e083bbe17c | |||
| d97a32e2d0 | |||
| 1968f6e741 | |||
| 8ecb9e6c2d | |||
| f611e7363b | |||
| f0349632c3 | |||
| fbf289ff63 | |||
| 4481a947d4 | |||
| 1b64453aa7 | |||
| 4727ea0af2 | |||
| 59d618d25f | |||
| d9f3824086 | |||
| 071312fc0c | |||
| 542637c0dc | |||
| 1b29a7692c | |||
| e75198cca9 | |||
| 0cdcfe2653 | |||
| 050607e00d | |||
| 12e18b75db | |||
| 0a14dbc9f4 | |||
| e8f9c955b3 | |||
| 05d1ebbaaa | |||
| 5f8a746d6e | |||
| 13cb0ff38e | |||
| aea3e7e05b | |||
| 98465af226 | |||
| 5a0cf5d7c8 | |||
| 167582b887 | |||
| c4be1c721d | |||
| b261e8e5fa | |||
| c37d1f09c6 | |||
| d701df24c8 | |||
| 4d2e38f616 | |||
| d22922fc72 | |||
| d770eaa9cd | |||
| 2c35d60d45 | |||
| be4e1b1891 | |||
| 3618c59d08 | |||
| cc9765e54e | |||
| 897ce4035f | |||
| d06b04221f | |||
| ff0b2efbb0 | |||
| 999113e3c3 | |||
| 6db5842a28 | |||
| 0401cccd1d | |||
| b928f5d932 | |||
| 65290e13c7 | |||
| 4b881cb3ff | |||
| 53db53792e | |||
| 38db76dd14 | |||
| f182c98ffa | |||
| 2379b2aeda | |||
| a76b9ecdf9 | |||
| 91df57d36b | |||
| 9afaac7612 | |||
| e475c0957e | |||
| 2544d0294a | |||
| d4f4c58277 | |||
| 1bd1846e40 | |||
| 80e3c28234 | |||
| 14d96778e3 | |||
| 2a030bf3a9 | |||
| 33f139ecfa | |||
|
|
e8d97281f7 | ||
|
|
8ad3350d51 | ||
|
|
ac4e5e1570 |
@@ -2,7 +2,7 @@ name: CI
|
|||||||
|
|
||||||
on:
|
on:
|
||||||
push:
|
push:
|
||||||
branches: [dev, testing, "temp/merge-*"]
|
branches: [dev, testing]
|
||||||
paths-ignore:
|
paths-ignore:
|
||||||
- "**/*.md"
|
- "**/*.md"
|
||||||
- "docs/**"
|
- "docs/**"
|
||||||
@@ -11,17 +11,19 @@ jobs:
|
|||||||
lint:
|
lint:
|
||||||
name: Lint (ruff)
|
name: Lint (ruff)
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
if: github.ref == 'refs/heads/dev'
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-python@v5
|
- uses: actions/setup-python@v5
|
||||||
with:
|
with:
|
||||||
python-version: "3.11"
|
python-version: "3.11"
|
||||||
- run: pip install ruff
|
- run: pip install ruff
|
||||||
- run: ruff check .
|
- run: ruff check decnet/
|
||||||
|
|
||||||
bandit:
|
bandit:
|
||||||
name: SAST (bandit)
|
name: SAST (bandit)
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
if: github.ref == 'refs/heads/dev'
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-python@v5
|
- uses: actions/setup-python@v5
|
||||||
@@ -33,6 +35,7 @@ jobs:
|
|||||||
pip-audit:
|
pip-audit:
|
||||||
name: Dependency audit (pip-audit)
|
name: Dependency audit (pip-audit)
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
if: github.ref == 'refs/heads/dev'
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
- uses: actions/setup-python@v5
|
- uses: actions/setup-python@v5
|
||||||
@@ -40,12 +43,33 @@ jobs:
|
|||||||
python-version: "3.11"
|
python-version: "3.11"
|
||||||
- run: pip install pip-audit
|
- run: pip install pip-audit
|
||||||
- run: pip install -e .[dev]
|
- run: pip install -e .[dev]
|
||||||
- run: pip-audit --skip-editable --ignore-vuln CVE-2025-65896
|
- run: pip-audit --skip-editable --ignore-vuln CVE-2025-65896 --ignore-vuln CVE-2026-3219
|
||||||
|
|
||||||
|
merge-to-testing:
|
||||||
|
name: Merge dev → testing
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: [lint, bandit, pip-audit]
|
||||||
|
if: github.ref == 'refs/heads/dev'
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
with:
|
||||||
|
fetch-depth: 0
|
||||||
|
token: ${{ secrets.DECNET_PR_TOKEN }}
|
||||||
|
- name: Configure git
|
||||||
|
run: |
|
||||||
|
git config user.name "DECNET CI"
|
||||||
|
git config user.email "ci@decnet.local"
|
||||||
|
- name: Merge dev into testing
|
||||||
|
run: |
|
||||||
|
git fetch origin testing
|
||||||
|
git checkout testing
|
||||||
|
git merge origin/dev --no-ff -m "ci: auto-merge dev → testing"
|
||||||
|
git push origin testing
|
||||||
|
|
||||||
test-standard:
|
test-standard:
|
||||||
name: Test (Standard)
|
name: Test (Standard)
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [lint, bandit, pip-audit]
|
if: github.ref == 'refs/heads/testing'
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
python-version: ["3.11"]
|
python-version: ["3.11"]
|
||||||
@@ -60,6 +84,7 @@ jobs:
|
|||||||
test-live:
|
test-live:
|
||||||
name: Test (Live)
|
name: Test (Live)
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
|
if: github.ref == 'refs/heads/testing'
|
||||||
needs: [test-standard]
|
needs: [test-standard]
|
||||||
services:
|
services:
|
||||||
mysql:
|
mysql:
|
||||||
@@ -91,48 +116,10 @@ jobs:
|
|||||||
DECNET_MYSQL_PASSWORD: root
|
DECNET_MYSQL_PASSWORD: root
|
||||||
DECNET_MYSQL_DATABASE: decnet_test
|
DECNET_MYSQL_DATABASE: decnet_test
|
||||||
|
|
||||||
test-fuzz:
|
merge-to-main:
|
||||||
name: Test (Fuzz)
|
name: Merge testing → main
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
needs: [test-live]
|
needs: [test-standard, test-live]
|
||||||
strategy:
|
|
||||||
matrix:
|
|
||||||
python-version: ["3.11"]
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
- uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: ${{ matrix.python-version }}
|
|
||||||
- run: pip install -e .[dev]
|
|
||||||
- run: pytest -m fuzz
|
|
||||||
env:
|
|
||||||
SCHEMATHESIS_CONFIG: schemathesis.ci.toml
|
|
||||||
|
|
||||||
merge-to-testing:
|
|
||||||
name: Merge dev → testing
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
needs: [test-standard, test-live, test-fuzz]
|
|
||||||
if: github.ref == 'refs/heads/dev'
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
token: ${{ secrets.DECNET_PR_TOKEN }}
|
|
||||||
- name: Configure git
|
|
||||||
run: |
|
|
||||||
git config user.name "DECNET CI"
|
|
||||||
git config user.email "ci@decnet.local"
|
|
||||||
- name: Merge dev into testing
|
|
||||||
run: |
|
|
||||||
git fetch origin testing
|
|
||||||
git checkout testing
|
|
||||||
git merge origin/dev --no-ff -m "ci: auto-merge dev → testing [skip ci]"
|
|
||||||
git push origin testing
|
|
||||||
|
|
||||||
prepare-merge-to-main:
|
|
||||||
name: Prepare Merge to Main
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
needs: [test-standard, test-live, test-fuzz]
|
|
||||||
if: github.ref == 'refs/heads/testing'
|
if: github.ref == 'refs/heads/testing'
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v4
|
- uses: actions/checkout@v4
|
||||||
@@ -143,33 +130,12 @@ jobs:
|
|||||||
run: |
|
run: |
|
||||||
git config user.name "DECNET CI"
|
git config user.name "DECNET CI"
|
||||||
git config user.email "ci@decnet.local"
|
git config user.email "ci@decnet.local"
|
||||||
- name: Create temp branch and sync with main
|
- name: Merge testing into main
|
||||||
run: |
|
|
||||||
git fetch origin main
|
|
||||||
git checkout -b temp/merge-testing-to-main
|
|
||||||
echo "--- Switched to temp branch, merging main into it ---"
|
|
||||||
git merge origin/main --no-edit || { echo "CONFLICT: Manual resolution required"; exit 1; }
|
|
||||||
git push origin temp/merge-testing-to-main --force
|
|
||||||
|
|
||||||
finalize-merge-to-main:
|
|
||||||
name: Finalize Merge to Main
|
|
||||||
runs-on: ubuntu-latest
|
|
||||||
needs: [test-standard, test-live, test-fuzz]
|
|
||||||
if: startsWith(github.ref, 'refs/heads/temp/merge-')
|
|
||||||
steps:
|
|
||||||
- uses: actions/checkout@v4
|
|
||||||
with:
|
|
||||||
fetch-depth: 0
|
|
||||||
token: ${{ secrets.DECNET_PR_TOKEN }}
|
|
||||||
- name: Configure git
|
|
||||||
run: |
|
|
||||||
git config user.name "DECNET CI"
|
|
||||||
git config user.email "ci@decnet.local"
|
|
||||||
- name: Merge RC into main
|
|
||||||
run: |
|
run: |
|
||||||
git fetch origin main
|
git fetch origin main
|
||||||
git checkout main
|
git checkout main
|
||||||
git merge ${{ github.ref }} --no-ff -m "ci: auto-merge testing → main"
|
git merge origin/testing --no-ff -m "ci: auto-merge testing → main" || {
|
||||||
|
echo "CONFLICT: testing and main have diverged — manual resolution required"
|
||||||
|
exit 1
|
||||||
|
}
|
||||||
git push origin main
|
git push origin main
|
||||||
echo "--- Cleaning up temp branch ---"
|
|
||||||
git push origin --delete ${{ github.ref_name }}
|
|
||||||
|
|||||||
23
.gitignore
vendored
23
.gitignore
vendored
@@ -1,4 +1,7 @@
|
|||||||
.venv/
|
.venv/
|
||||||
|
.venv*/
|
||||||
|
.311/
|
||||||
|
.3[0-9][0-9]/
|
||||||
logs/
|
logs/
|
||||||
.claude/*
|
.claude/*
|
||||||
CLAUDE.md
|
CLAUDE.md
|
||||||
@@ -9,6 +12,10 @@ __pycache__/
|
|||||||
dist/
|
dist/
|
||||||
build/
|
build/
|
||||||
decnet-compose.yml
|
decnet-compose.yml
|
||||||
|
# Per-topology compose fragments emitted by `decnet topology deploy`.
|
||||||
|
decnet-topology-*-compose.yml
|
||||||
|
# Docker build context cache.
|
||||||
|
.docker/
|
||||||
decnet-state.json
|
decnet-state.json
|
||||||
*.ini
|
*.ini
|
||||||
decnet.log*
|
decnet.log*
|
||||||
@@ -21,6 +28,9 @@ windows1
|
|||||||
*.db-shm
|
*.db-shm
|
||||||
*.db-wal
|
*.db-wal
|
||||||
decnet.*.log
|
decnet.*.log
|
||||||
|
# Rotated copies (logrotate appends .1, .2, .gz...) — the existing
|
||||||
|
# decnet.*.log glob doesn't catch the suffix.
|
||||||
|
decnet.*.log.*
|
||||||
decnet.json
|
decnet.json
|
||||||
.env*
|
.env*
|
||||||
.env.local
|
.env.local
|
||||||
@@ -28,3 +38,16 @@ decnet.json
|
|||||||
.hypothesis/
|
.hypothesis/
|
||||||
profiles/*
|
profiles/*
|
||||||
tests/test_decnet.db*
|
tests/test_decnet.db*
|
||||||
|
|
||||||
|
# Nested git clone of the wiki — not a submodule, just a local
|
||||||
|
# working copy so we can edit docs without a full round-trip.
|
||||||
|
wiki-checkout/
|
||||||
|
|
||||||
|
# Scratch test/debug outputs that leak from saved `pytest > hang.log`
|
||||||
|
# or `pytest > schem` redirections.
|
||||||
|
hang.log
|
||||||
|
schem
|
||||||
|
*.pytest.log
|
||||||
|
|
||||||
|
# pydeps-style dependency graph dumps from local analysis runs.
|
||||||
|
deps.txt
|
||||||
|
|||||||
674
LICENSE
Normal file
674
LICENSE
Normal file
@@ -0,0 +1,674 @@
|
|||||||
|
GNU GENERAL PUBLIC LICENSE
|
||||||
|
Version 3, 29 June 2007
|
||||||
|
|
||||||
|
Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||||
|
Everyone is permitted to copy and distribute verbatim copies
|
||||||
|
of this license document, but changing it is not allowed.
|
||||||
|
|
||||||
|
Preamble
|
||||||
|
|
||||||
|
The GNU General Public License is a free, copyleft license for
|
||||||
|
software and other kinds of works.
|
||||||
|
|
||||||
|
The licenses for most software and other practical works are designed
|
||||||
|
to take away your freedom to share and change the works. By contrast,
|
||||||
|
the GNU General Public License is intended to guarantee your freedom to
|
||||||
|
share and change all versions of a program--to make sure it remains free
|
||||||
|
software for all its users. We, the Free Software Foundation, use the
|
||||||
|
GNU General Public License for most of our software; it applies also to
|
||||||
|
any other work released this way by its authors. You can apply it to
|
||||||
|
your programs, too.
|
||||||
|
|
||||||
|
When we speak of free software, we are referring to freedom, not
|
||||||
|
price. Our General Public Licenses are designed to make sure that you
|
||||||
|
have the freedom to distribute copies of free software (and charge for
|
||||||
|
them if you wish), that you receive source code or can get it if you
|
||||||
|
want it, that you can change the software or use pieces of it in new
|
||||||
|
free programs, and that you know you can do these things.
|
||||||
|
|
||||||
|
To protect your rights, we need to prevent others from denying you
|
||||||
|
these rights or asking you to surrender the rights. Therefore, you have
|
||||||
|
certain responsibilities if you distribute copies of the software, or if
|
||||||
|
you modify it: responsibilities to respect the freedom of others.
|
||||||
|
|
||||||
|
For example, if you distribute copies of such a program, whether
|
||||||
|
gratis or for a fee, you must pass on to the recipients the same
|
||||||
|
freedoms that you received. You must make sure that they, too, receive
|
||||||
|
or can get the source code. And you must show them these terms so they
|
||||||
|
know their rights.
|
||||||
|
|
||||||
|
Developers that use the GNU GPL protect your rights with two steps:
|
||||||
|
(1) assert copyright on the software, and (2) offer you this License
|
||||||
|
giving you legal permission to copy, distribute and/or modify it.
|
||||||
|
|
||||||
|
For the developers' and authors' protection, the GPL clearly explains
|
||||||
|
that there is no warranty for this free software. For both users' and
|
||||||
|
authors' sake, the GPL requires that modified versions be marked as
|
||||||
|
changed, so that their problems will not be attributed erroneously to
|
||||||
|
authors of previous versions.
|
||||||
|
|
||||||
|
Some devices are designed to deny users access to install or run
|
||||||
|
modified versions of the software inside them, although the manufacturer
|
||||||
|
can do so. This is fundamentally incompatible with the aim of
|
||||||
|
protecting users' freedom to change the software. The systematic
|
||||||
|
pattern of such abuse occurs in the area of products for individuals to
|
||||||
|
use, which is precisely where it is most unacceptable. Therefore, we
|
||||||
|
have designed this version of the GPL to prohibit the practice for those
|
||||||
|
products. If such problems arise substantially in other domains, we
|
||||||
|
stand ready to extend this provision to those domains in future versions
|
||||||
|
of the GPL, as needed to protect the freedom of users.
|
||||||
|
|
||||||
|
Finally, every program is threatened constantly by software patents.
|
||||||
|
States should not allow patents to restrict development and use of
|
||||||
|
software on general-purpose computers, but in those that do, we wish to
|
||||||
|
avoid the special danger that patents applied to a free program could
|
||||||
|
make it effectively proprietary. To prevent this, the GPL assures that
|
||||||
|
patents cannot be used to render the program non-free.
|
||||||
|
|
||||||
|
The precise terms and conditions for copying, distribution and
|
||||||
|
modification follow.
|
||||||
|
|
||||||
|
TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
0. Definitions.
|
||||||
|
|
||||||
|
"This License" refers to version 3 of the GNU General Public License.
|
||||||
|
|
||||||
|
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||||
|
works, such as semiconductor masks.
|
||||||
|
|
||||||
|
"The Program" refers to any copyrightable work licensed under this
|
||||||
|
License. Each licensee is addressed as "you". "Licensees" and
|
||||||
|
"recipients" may be individuals or organizations.
|
||||||
|
|
||||||
|
To "modify" a work means to copy from or adapt all or part of the work
|
||||||
|
in a fashion requiring copyright permission, other than the making of an
|
||||||
|
exact copy. The resulting work is called a "modified version" of the
|
||||||
|
earlier work or a work "based on" the earlier work.
|
||||||
|
|
||||||
|
A "covered work" means either the unmodified Program or a work based
|
||||||
|
on the Program.
|
||||||
|
|
||||||
|
To "propagate" a work means to do anything with it that, without
|
||||||
|
permission, would make you directly or secondarily liable for
|
||||||
|
infringement under applicable copyright law, except executing it on a
|
||||||
|
computer or modifying a private copy. Propagation includes copying,
|
||||||
|
distribution (with or without modification), making available to the
|
||||||
|
public, and in some countries other activities as well.
|
||||||
|
|
||||||
|
To "convey" a work means any kind of propagation that enables other
|
||||||
|
parties to make or receive copies. Mere interaction with a user through
|
||||||
|
a computer network, with no transfer of a copy, is not conveying.
|
||||||
|
|
||||||
|
An interactive user interface displays "Appropriate Legal Notices"
|
||||||
|
to the extent that it includes a convenient and prominently visible
|
||||||
|
feature that (1) displays an appropriate copyright notice, and (2)
|
||||||
|
tells the user that there is no warranty for the work (except to the
|
||||||
|
extent that warranties are provided), that licensees may convey the
|
||||||
|
work under this License, and how to view a copy of this License. If
|
||||||
|
the interface presents a list of user commands or options, such as a
|
||||||
|
menu, a prominent item in the list meets this criterion.
|
||||||
|
|
||||||
|
1. Source Code.
|
||||||
|
|
||||||
|
The "source code" for a work means the preferred form of the work
|
||||||
|
for making modifications to it. "Object code" means any non-source
|
||||||
|
form of a work.
|
||||||
|
|
||||||
|
A "Standard Interface" means an interface that either is an official
|
||||||
|
standard defined by a recognized standards body, or, in the case of
|
||||||
|
interfaces specified for a particular programming language, one that
|
||||||
|
is widely used among developers working in that language.
|
||||||
|
|
||||||
|
The "System Libraries" of an executable work include anything, other
|
||||||
|
than the work as a whole, that (a) is included in the normal form of
|
||||||
|
packaging a Major Component, but which is not part of that Major
|
||||||
|
Component, and (b) serves only to enable use of the work with that
|
||||||
|
Major Component, or to implement a Standard Interface for which an
|
||||||
|
implementation is available to the public in source code form. A
|
||||||
|
"Major Component", in this context, means a major essential component
|
||||||
|
(kernel, window system, and so on) of the specific operating system
|
||||||
|
(if any) on which the executable work runs, or a compiler used to
|
||||||
|
produce the work, or an object code interpreter used to run it.
|
||||||
|
|
||||||
|
The "Corresponding Source" for a work in object code form means all
|
||||||
|
the source code needed to generate, install, and (for an executable
|
||||||
|
work) run the object code and to modify the work, including scripts to
|
||||||
|
control those activities. However, it does not include the work's
|
||||||
|
System Libraries, or general-purpose tools or generally available free
|
||||||
|
programs which are used unmodified in performing those activities but
|
||||||
|
which are not part of the work. For example, Corresponding Source
|
||||||
|
includes interface definition files associated with source files for
|
||||||
|
the work, and the source code for shared libraries and dynamically
|
||||||
|
linked subprograms that the work is specifically designed to require,
|
||||||
|
such as by intimate data communication or control flow between those
|
||||||
|
subprograms and other parts of the work.
|
||||||
|
|
||||||
|
The Corresponding Source need not include anything that users
|
||||||
|
can regenerate automatically from other parts of the Corresponding
|
||||||
|
Source.
|
||||||
|
|
||||||
|
The Corresponding Source for a work in source code form is that
|
||||||
|
same work.
|
||||||
|
|
||||||
|
2. Basic Permissions.
|
||||||
|
|
||||||
|
All rights granted under this License are granted for the term of
|
||||||
|
copyright on the Program, and are irrevocable provided the stated
|
||||||
|
conditions are met. This License explicitly affirms your unlimited
|
||||||
|
permission to run the unmodified Program. The output from running a
|
||||||
|
covered work is covered by this License only if the output, given its
|
||||||
|
content, constitutes a covered work. This License acknowledges your
|
||||||
|
rights of fair use or other equivalent, as provided by copyright law.
|
||||||
|
|
||||||
|
You may make, run and propagate covered works that you do not
|
||||||
|
convey, without conditions so long as your license otherwise remains
|
||||||
|
in force. You may convey covered works to others for the sole purpose
|
||||||
|
of having them make modifications exclusively for you, or provide you
|
||||||
|
with facilities for running those works, provided that you comply with
|
||||||
|
the terms of this License in conveying all material for which you do
|
||||||
|
not control copyright. Those thus making or running the covered works
|
||||||
|
for you must do so exclusively on your behalf, under your direction
|
||||||
|
and control, on terms that prohibit them from making any copies of
|
||||||
|
your copyrighted material outside their relationship with you.
|
||||||
|
|
||||||
|
Conveying under any other circumstances is permitted solely under
|
||||||
|
the conditions stated below. Sublicensing is not allowed; section 10
|
||||||
|
makes it unnecessary.
|
||||||
|
|
||||||
|
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||||
|
|
||||||
|
No covered work shall be deemed part of an effective technological
|
||||||
|
measure under any applicable law fulfilling obligations under article
|
||||||
|
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||||
|
similar laws prohibiting or restricting circumvention of such
|
||||||
|
measures.
|
||||||
|
|
||||||
|
When you convey a covered work, you waive any legal power to forbid
|
||||||
|
circumvention of technological measures to the extent such circumvention
|
||||||
|
is effected by exercising rights under this License with respect to
|
||||||
|
the covered work, and you disclaim any intention to limit operation or
|
||||||
|
modification of the work as a means of enforcing, against the work's
|
||||||
|
users, your or third parties' legal rights to forbid circumvention of
|
||||||
|
technological measures.
|
||||||
|
|
||||||
|
4. Conveying Verbatim Copies.
|
||||||
|
|
||||||
|
You may convey verbatim copies of the Program's source code as you
|
||||||
|
receive it, in any medium, provided that you conspicuously and
|
||||||
|
appropriately publish on each copy an appropriate copyright notice;
|
||||||
|
keep intact all notices stating that this License and any
|
||||||
|
non-permissive terms added in accord with section 7 apply to the code;
|
||||||
|
keep intact all notices of the absence of any warranty; and give all
|
||||||
|
recipients a copy of this License along with the Program.
|
||||||
|
|
||||||
|
You may charge any price or no price for each copy that you convey,
|
||||||
|
and you may offer support or warranty protection for a fee.
|
||||||
|
|
||||||
|
5. Conveying Modified Source Versions.
|
||||||
|
|
||||||
|
You may convey a work based on the Program, or the modifications to
|
||||||
|
produce it from the Program, in the form of source code under the
|
||||||
|
terms of section 4, provided that you also meet all of these conditions:
|
||||||
|
|
||||||
|
a) The work must carry prominent notices stating that you modified
|
||||||
|
it, and giving a relevant date.
|
||||||
|
|
||||||
|
b) The work must carry prominent notices stating that it is
|
||||||
|
released under this License and any conditions added under section
|
||||||
|
7. This requirement modifies the requirement in section 4 to
|
||||||
|
"keep intact all notices".
|
||||||
|
|
||||||
|
c) You must license the entire work, as a whole, under this
|
||||||
|
License to anyone who comes into possession of a copy. This
|
||||||
|
License will therefore apply, along with any applicable section 7
|
||||||
|
additional terms, to the whole of the work, and all its parts,
|
||||||
|
regardless of how they are packaged. This License gives no
|
||||||
|
permission to license the work in any other way, but it does not
|
||||||
|
invalidate such permission if you have separately received it.
|
||||||
|
|
||||||
|
d) If the work has interactive user interfaces, each must display
|
||||||
|
Appropriate Legal Notices; however, if the Program has interactive
|
||||||
|
interfaces that do not display Appropriate Legal Notices, your
|
||||||
|
work need not make them do so.
|
||||||
|
|
||||||
|
A compilation of a covered work with other separate and independent
|
||||||
|
works, which are not by their nature extensions of the covered work,
|
||||||
|
and which are not combined with it such as to form a larger program,
|
||||||
|
in or on a volume of a storage or distribution medium, is called an
|
||||||
|
"aggregate" if the compilation and its resulting copyright are not
|
||||||
|
used to limit the access or legal rights of the compilation's users
|
||||||
|
beyond what the individual works permit. Inclusion of a covered work
|
||||||
|
in an aggregate does not cause this License to apply to the other
|
||||||
|
parts of the aggregate.
|
||||||
|
|
||||||
|
6. Conveying Non-Source Forms.
|
||||||
|
|
||||||
|
You may convey a covered work in object code form under the terms
|
||||||
|
of sections 4 and 5, provided that you also convey the
|
||||||
|
machine-readable Corresponding Source under the terms of this License,
|
||||||
|
in one of these ways:
|
||||||
|
|
||||||
|
a) Convey the object code in, or embodied in, a physical product
|
||||||
|
(including a physical distribution medium), accompanied by the
|
||||||
|
Corresponding Source fixed on a durable physical medium
|
||||||
|
customarily used for software interchange.
|
||||||
|
|
||||||
|
b) Convey the object code in, or embodied in, a physical product
|
||||||
|
(including a physical distribution medium), accompanied by a
|
||||||
|
written offer, valid for at least three years and valid for as
|
||||||
|
long as you offer spare parts or customer support for that product
|
||||||
|
model, to give anyone who possesses the object code either (1) a
|
||||||
|
copy of the Corresponding Source for all the software in the
|
||||||
|
product that is covered by this License, on a durable physical
|
||||||
|
medium customarily used for software interchange, for a price no
|
||||||
|
more than your reasonable cost of physically performing this
|
||||||
|
conveying of source, or (2) access to copy the
|
||||||
|
Corresponding Source from a network server at no charge.
|
||||||
|
|
||||||
|
c) Convey individual copies of the object code with a copy of the
|
||||||
|
written offer to provide the Corresponding Source. This
|
||||||
|
alternative is allowed only occasionally and noncommercially, and
|
||||||
|
only if you received the object code with such an offer, in accord
|
||||||
|
with subsection 6b.
|
||||||
|
|
||||||
|
d) Convey the object code by offering access from a designated
|
||||||
|
place (gratis or for a charge), and offer equivalent access to the
|
||||||
|
Corresponding Source in the same way through the same place at no
|
||||||
|
further charge. You need not require recipients to copy the
|
||||||
|
Corresponding Source along with the object code. If the place to
|
||||||
|
copy the object code is a network server, the Corresponding Source
|
||||||
|
may be on a different server (operated by you or a third party)
|
||||||
|
that supports equivalent copying facilities, provided you maintain
|
||||||
|
clear directions next to the object code saying where to find the
|
||||||
|
Corresponding Source. Regardless of what server hosts the
|
||||||
|
Corresponding Source, you remain obligated to ensure that it is
|
||||||
|
available for as long as needed to satisfy these requirements.
|
||||||
|
|
||||||
|
e) Convey the object code using peer-to-peer transmission, provided
|
||||||
|
you inform other peers where the object code and Corresponding
|
||||||
|
Source of the work are being offered to the general public at no
|
||||||
|
charge under subsection 6d.
|
||||||
|
|
||||||
|
A separable portion of the object code, whose source code is excluded
|
||||||
|
from the Corresponding Source as a System Library, need not be
|
||||||
|
included in conveying the object code work.
|
||||||
|
|
||||||
|
A "User Product" is either (1) a "consumer product", which means any
|
||||||
|
tangible personal property which is normally used for personal, family,
|
||||||
|
or household purposes, or (2) anything designed or sold for incorporation
|
||||||
|
into a dwelling. In determining whether a product is a consumer product,
|
||||||
|
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||||
|
product received by a particular user, "normally used" refers to a
|
||||||
|
typical or common use of that class of product, regardless of the status
|
||||||
|
of the particular user or of the way in which the particular user
|
||||||
|
actually uses, or expects or is expected to use, the product. A product
|
||||||
|
is a consumer product regardless of whether the product has substantial
|
||||||
|
commercial, industrial or non-consumer uses, unless such uses represent
|
||||||
|
the only significant mode of use of the product.
|
||||||
|
|
||||||
|
"Installation Information" for a User Product means any methods,
|
||||||
|
procedures, authorization keys, or other information required to install
|
||||||
|
and execute modified versions of a covered work in that User Product from
|
||||||
|
a modified version of its Corresponding Source. The information must
|
||||||
|
suffice to ensure that the continued functioning of the modified object
|
||||||
|
code is in no case prevented or interfered with solely because
|
||||||
|
modification has been made.
|
||||||
|
|
||||||
|
If you convey an object code work under this section in, or with, or
|
||||||
|
specifically for use in, a User Product, and the conveying occurs as
|
||||||
|
part of a transaction in which the right of possession and use of the
|
||||||
|
User Product is transferred to the recipient in perpetuity or for a
|
||||||
|
fixed term (regardless of how the transaction is characterized), the
|
||||||
|
Corresponding Source conveyed under this section must be accompanied
|
||||||
|
by the Installation Information. But this requirement does not apply
|
||||||
|
if neither you nor any third party retains the ability to install
|
||||||
|
modified object code on the User Product (for example, the work has
|
||||||
|
been installed in ROM).
|
||||||
|
|
||||||
|
The requirement to provide Installation Information does not include a
|
||||||
|
requirement to continue to provide support service, warranty, or updates
|
||||||
|
for a work that has been modified or installed by the recipient, or for
|
||||||
|
the User Product in which it has been modified or installed. Access to a
|
||||||
|
network may be denied when the modification itself materially and
|
||||||
|
adversely affects the operation of the network or violates the rules and
|
||||||
|
protocols for communication across the network.
|
||||||
|
|
||||||
|
Corresponding Source conveyed, and Installation Information provided,
|
||||||
|
in accord with this section must be in a format that is publicly
|
||||||
|
documented (and with an implementation available to the public in
|
||||||
|
source code form), and must require no special password or key for
|
||||||
|
unpacking, reading or copying.
|
||||||
|
|
||||||
|
7. Additional Terms.
|
||||||
|
|
||||||
|
"Additional permissions" are terms that supplement the terms of this
|
||||||
|
License by making exceptions from one or more of its conditions.
|
||||||
|
Additional permissions that are applicable to the entire Program shall
|
||||||
|
be treated as though they were included in this License, to the extent
|
||||||
|
that they are valid under applicable law. If additional permissions
|
||||||
|
apply only to part of the Program, that part may be used separately
|
||||||
|
under those permissions, but the entire Program remains governed by
|
||||||
|
this License without regard to the additional permissions.
|
||||||
|
|
||||||
|
When you convey a copy of a covered work, you may at your option
|
||||||
|
remove any additional permissions from that copy, or from any part of
|
||||||
|
it. (Additional permissions may be written to require their own
|
||||||
|
removal in certain cases when you modify the work.) You may place
|
||||||
|
additional permissions on material, added by you to a covered work,
|
||||||
|
for which you have or can give appropriate copyright permission.
|
||||||
|
|
||||||
|
Notwithstanding any other provision of this License, for material you
|
||||||
|
add to a covered work, you may (if authorized by the copyright holders of
|
||||||
|
that material) supplement the terms of this License with terms:
|
||||||
|
|
||||||
|
a) Disclaiming warranty or limiting liability differently from the
|
||||||
|
terms of sections 15 and 16 of this License; or
|
||||||
|
|
||||||
|
b) Requiring preservation of specified reasonable legal notices or
|
||||||
|
author attributions in that material or in the Appropriate Legal
|
||||||
|
Notices displayed by works containing it; or
|
||||||
|
|
||||||
|
c) Prohibiting misrepresentation of the origin of that material, or
|
||||||
|
requiring that modified versions of such material be marked in
|
||||||
|
reasonable ways as different from the original version; or
|
||||||
|
|
||||||
|
d) Limiting the use for publicity purposes of names of licensors or
|
||||||
|
authors of the material; or
|
||||||
|
|
||||||
|
e) Declining to grant rights under trademark law for use of some
|
||||||
|
trade names, trademarks, or service marks; or
|
||||||
|
|
||||||
|
f) Requiring indemnification of licensors and authors of that
|
||||||
|
material by anyone who conveys the material (or modified versions of
|
||||||
|
it) with contractual assumptions of liability to the recipient, for
|
||||||
|
any liability that these contractual assumptions directly impose on
|
||||||
|
those licensors and authors.
|
||||||
|
|
||||||
|
All other non-permissive additional terms are considered "further
|
||||||
|
restrictions" within the meaning of section 10. If the Program as you
|
||||||
|
received it, or any part of it, contains a notice stating that it is
|
||||||
|
governed by this License along with a term that is a further
|
||||||
|
restriction, you may remove that term. If a license document contains
|
||||||
|
a further restriction but permits relicensing or conveying under this
|
||||||
|
License, you may add to a covered work material governed by the terms
|
||||||
|
of that license document, provided that the further restriction does
|
||||||
|
not survive such relicensing or conveying.
|
||||||
|
|
||||||
|
If you add terms to a covered work in accord with this section, you
|
||||||
|
must place, in the relevant source files, a statement of the
|
||||||
|
additional terms that apply to those files, or a notice indicating
|
||||||
|
where to find the applicable terms.
|
||||||
|
|
||||||
|
Additional terms, permissive or non-permissive, may be stated in the
|
||||||
|
form of a separately written license, or stated as exceptions;
|
||||||
|
the above requirements apply either way.
|
||||||
|
|
||||||
|
8. Termination.
|
||||||
|
|
||||||
|
You may not propagate or modify a covered work except as expressly
|
||||||
|
provided under this License. Any attempt otherwise to propagate or
|
||||||
|
modify it is void, and will automatically terminate your rights under
|
||||||
|
this License (including any patent licenses granted under the third
|
||||||
|
paragraph of section 11).
|
||||||
|
|
||||||
|
However, if you cease all violation of this License, then your
|
||||||
|
license from a particular copyright holder is reinstated (a)
|
||||||
|
provisionally, unless and until the copyright holder explicitly and
|
||||||
|
finally terminates your license, and (b) permanently, if the copyright
|
||||||
|
holder fails to notify you of the violation by some reasonable means
|
||||||
|
prior to 60 days after the cessation.
|
||||||
|
|
||||||
|
Moreover, your license from a particular copyright holder is
|
||||||
|
reinstated permanently if the copyright holder notifies you of the
|
||||||
|
violation by some reasonable means, this is the first time you have
|
||||||
|
received notice of violation of this License (for any work) from that
|
||||||
|
copyright holder, and you cure the violation prior to 30 days after
|
||||||
|
your receipt of the notice.
|
||||||
|
|
||||||
|
Termination of your rights under this section does not terminate the
|
||||||
|
licenses of parties who have received copies or rights from you under
|
||||||
|
this License. If your rights have been terminated and not permanently
|
||||||
|
reinstated, you do not qualify to receive new licenses for the same
|
||||||
|
material under section 10.
|
||||||
|
|
||||||
|
9. Acceptance Not Required for Having Copies.
|
||||||
|
|
||||||
|
You are not required to accept this License in order to receive or
|
||||||
|
run a copy of the Program. Ancillary propagation of a covered work
|
||||||
|
occurring solely as a consequence of using peer-to-peer transmission
|
||||||
|
to receive a copy likewise does not require acceptance. However,
|
||||||
|
nothing other than this License grants you permission to propagate or
|
||||||
|
modify any covered work. These actions infringe copyright if you do
|
||||||
|
not accept this License. Therefore, by modifying or propagating a
|
||||||
|
covered work, you indicate your acceptance of this License to do so.
|
||||||
|
|
||||||
|
10. Automatic Licensing of Downstream Recipients.
|
||||||
|
|
||||||
|
Each time you convey a covered work, the recipient automatically
|
||||||
|
receives a license from the original licensors, to run, modify and
|
||||||
|
propagate that work, subject to this License. You are not responsible
|
||||||
|
for enforcing compliance by third parties with this License.
|
||||||
|
|
||||||
|
An "entity transaction" is a transaction transferring control of an
|
||||||
|
organization, or substantially all assets of one, or subdividing an
|
||||||
|
organization, or merging organizations. If propagation of a covered
|
||||||
|
work results from an entity transaction, each party to that
|
||||||
|
transaction who receives a copy of the work also receives whatever
|
||||||
|
licenses to the work the party's predecessor in interest had or could
|
||||||
|
give under the previous paragraph, plus a right to possession of the
|
||||||
|
Corresponding Source of the work from the predecessor in interest, if
|
||||||
|
the predecessor has it or can get it with reasonable efforts.
|
||||||
|
|
||||||
|
You may not impose any further restrictions on the exercise of the
|
||||||
|
rights granted or affirmed under this License. For example, you may
|
||||||
|
not impose a license fee, royalty, or other charge for exercise of
|
||||||
|
rights granted under this License, and you may not initiate litigation
|
||||||
|
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||||
|
any patent claim is infringed by making, using, selling, offering for
|
||||||
|
sale, or importing the Program or any portion of it.
|
||||||
|
|
||||||
|
11. Patents.
|
||||||
|
|
||||||
|
A "contributor" is a copyright holder who authorizes use under this
|
||||||
|
License of the Program or a work on which the Program is based. The
|
||||||
|
work thus licensed is called the contributor's "contributor version".
|
||||||
|
|
||||||
|
A contributor's "essential patent claims" are all patent claims
|
||||||
|
owned or controlled by the contributor, whether already acquired or
|
||||||
|
hereafter acquired, that would be infringed by some manner, permitted
|
||||||
|
by this License, of making, using, or selling its contributor version,
|
||||||
|
but do not include claims that would be infringed only as a
|
||||||
|
consequence of further modification of the contributor version. For
|
||||||
|
purposes of this definition, "control" includes the right to grant
|
||||||
|
patent sublicenses in a manner consistent with the requirements of
|
||||||
|
this License.
|
||||||
|
|
||||||
|
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||||
|
patent license under the contributor's essential patent claims, to
|
||||||
|
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||||
|
propagate the contents of its contributor version.
|
||||||
|
|
||||||
|
In the following three paragraphs, a "patent license" is any express
|
||||||
|
agreement or commitment, however denominated, not to enforce a patent
|
||||||
|
(such as an express permission to practice a patent or covenant not to
|
||||||
|
sue for patent infringement). To "grant" such a patent license to a
|
||||||
|
party means to make such an agreement or commitment not to enforce a
|
||||||
|
patent against the party.
|
||||||
|
|
||||||
|
If you convey a covered work, knowingly relying on a patent license,
|
||||||
|
and the Corresponding Source of the work is not available for anyone
|
||||||
|
to copy, free of charge and under the terms of this License, through a
|
||||||
|
publicly available network server or other readily accessible means,
|
||||||
|
then you must either (1) cause the Corresponding Source to be so
|
||||||
|
available, or (2) arrange to deprive yourself of the benefit of the
|
||||||
|
patent license for this particular work, or (3) arrange, in a manner
|
||||||
|
consistent with the requirements of this License, to extend the patent
|
||||||
|
license to downstream recipients. "Knowingly relying" means you have
|
||||||
|
actual knowledge that, but for the patent license, your conveying the
|
||||||
|
covered work in a country, or your recipient's use of the covered work
|
||||||
|
in a country, would infringe one or more identifiable patents in that
|
||||||
|
country that you have reason to believe are valid.
|
||||||
|
|
||||||
|
If, pursuant to or in connection with a single transaction or
|
||||||
|
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||||
|
covered work, and grant a patent license to some of the parties
|
||||||
|
receiving the covered work authorizing them to use, propagate, modify
|
||||||
|
or convey a specific copy of the covered work, then the patent license
|
||||||
|
you grant is automatically extended to all recipients of the covered
|
||||||
|
work and works based on it.
|
||||||
|
|
||||||
|
A patent license is "discriminatory" if it does not include within
|
||||||
|
the scope of its coverage, prohibits the exercise of, or is
|
||||||
|
conditioned on the non-exercise of one or more of the rights that are
|
||||||
|
specifically granted under this License. You may not convey a covered
|
||||||
|
work if you are a party to an arrangement with a third party that is
|
||||||
|
in the business of distributing software, under which you make payment
|
||||||
|
to the third party based on the extent of your activity of conveying
|
||||||
|
the work, and under which the third party grants, to any of the
|
||||||
|
parties who would receive the covered work from you, a discriminatory
|
||||||
|
patent license (a) in connection with copies of the covered work
|
||||||
|
conveyed by you (or copies made from those copies), or (b) primarily
|
||||||
|
for and in connection with specific products or compilations that
|
||||||
|
contain the covered work, unless you entered into that arrangement,
|
||||||
|
or that patent license was granted, prior to 28 March 2007.
|
||||||
|
|
||||||
|
Nothing in this License shall be construed as excluding or limiting
|
||||||
|
any implied license or other defenses to infringement that may
|
||||||
|
otherwise be available to you under applicable patent law.
|
||||||
|
|
||||||
|
12. No Surrender of Others' Freedom.
|
||||||
|
|
||||||
|
If conditions are imposed on you (whether by court order, agreement or
|
||||||
|
otherwise) that contradict the conditions of this License, they do not
|
||||||
|
excuse you from the conditions of this License. If you cannot convey a
|
||||||
|
covered work so as to satisfy simultaneously your obligations under this
|
||||||
|
License and any other pertinent obligations, then as a consequence you may
|
||||||
|
not convey it at all. For example, if you agree to terms that obligate you
|
||||||
|
to collect a royalty for further conveying from those to whom you convey
|
||||||
|
the Program, the only way you could satisfy both those terms and this
|
||||||
|
License would be to refrain entirely from conveying the Program.
|
||||||
|
|
||||||
|
13. Use with the GNU Affero General Public License.
|
||||||
|
|
||||||
|
Notwithstanding any other provision of this License, you have
|
||||||
|
permission to link or combine any covered work with a work licensed
|
||||||
|
under version 3 of the GNU Affero General Public License into a single
|
||||||
|
combined work, and to convey the resulting work. The terms of this
|
||||||
|
License will continue to apply to the part which is the covered work,
|
||||||
|
but the special requirements of the GNU Affero General Public License,
|
||||||
|
section 13, concerning interaction through a network will apply to the
|
||||||
|
combination as such.
|
||||||
|
|
||||||
|
14. Revised Versions of this License.
|
||||||
|
|
||||||
|
The Free Software Foundation may publish revised and/or new versions of
|
||||||
|
the GNU General Public License from time to time. Such new versions will
|
||||||
|
be similar in spirit to the present version, but may differ in detail to
|
||||||
|
address new problems or concerns.
|
||||||
|
|
||||||
|
Each version is given a distinguishing version number. If the
|
||||||
|
Program specifies that a certain numbered version of the GNU General
|
||||||
|
Public License "or any later version" applies to it, you have the
|
||||||
|
option of following the terms and conditions either of that numbered
|
||||||
|
version or of any later version published by the Free Software
|
||||||
|
Foundation. If the Program does not specify a version number of the
|
||||||
|
GNU General Public License, you may choose any version ever published
|
||||||
|
by the Free Software Foundation.
|
||||||
|
|
||||||
|
If the Program specifies that a proxy can decide which future
|
||||||
|
versions of the GNU General Public License can be used, that proxy's
|
||||||
|
public statement of acceptance of a version permanently authorizes you
|
||||||
|
to choose that version for the Program.
|
||||||
|
|
||||||
|
Later license versions may give you additional or different
|
||||||
|
permissions. However, no additional obligations are imposed on any
|
||||||
|
author or copyright holder as a result of your choosing to follow a
|
||||||
|
later version.
|
||||||
|
|
||||||
|
15. Disclaimer of Warranty.
|
||||||
|
|
||||||
|
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||||
|
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||||
|
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||||
|
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||||
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||||
|
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||||
|
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||||
|
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||||
|
|
||||||
|
16. Limitation of Liability.
|
||||||
|
|
||||||
|
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||||
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||||
|
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||||
|
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||||
|
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||||
|
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||||
|
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||||
|
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||||
|
SUCH DAMAGES.
|
||||||
|
|
||||||
|
17. Interpretation of Sections 15 and 16.
|
||||||
|
|
||||||
|
If the disclaimer of warranty and limitation of liability provided
|
||||||
|
above cannot be given local legal effect according to their terms,
|
||||||
|
reviewing courts shall apply local law that most closely approximates
|
||||||
|
an absolute waiver of all civil liability in connection with the
|
||||||
|
Program, unless a warranty or assumption of liability accompanies a
|
||||||
|
copy of the Program in return for a fee.
|
||||||
|
|
||||||
|
END OF TERMS AND CONDITIONS
|
||||||
|
|
||||||
|
How to Apply These Terms to Your New Programs
|
||||||
|
|
||||||
|
If you develop a new program, and you want it to be of the greatest
|
||||||
|
possible use to the public, the best way to achieve this is to make it
|
||||||
|
free software which everyone can redistribute and change under these terms.
|
||||||
|
|
||||||
|
To do so, attach the following notices to the program. It is safest
|
||||||
|
to attach them to the start of each source file to most effectively
|
||||||
|
state the exclusion of warranty; and each file should have at least
|
||||||
|
the "copyright" line and a pointer to where the full notice is found.
|
||||||
|
|
||||||
|
<one line to give the program's name and a brief idea of what it does.>
|
||||||
|
Copyright (C) <year> <name of author>
|
||||||
|
|
||||||
|
This program is free software: you can redistribute it and/or modify
|
||||||
|
it under the terms of the GNU General Public License as published by
|
||||||
|
the Free Software Foundation, either version 3 of the License, or
|
||||||
|
(at your option) any later version.
|
||||||
|
|
||||||
|
This program is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
GNU General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU General Public License
|
||||||
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
Also add information on how to contact you by electronic and paper mail.
|
||||||
|
|
||||||
|
If the program does terminal interaction, make it output a short
|
||||||
|
notice like this when it starts in an interactive mode:
|
||||||
|
|
||||||
|
<program> Copyright (C) <year> <name of author>
|
||||||
|
This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
|
||||||
|
This is free software, and you are welcome to redistribute it
|
||||||
|
under certain conditions; type `show c' for details.
|
||||||
|
|
||||||
|
The hypothetical commands `show w' and `show c' should show the appropriate
|
||||||
|
parts of the General Public License. Of course, your program's commands
|
||||||
|
might be different; for a GUI interface, you would use an "about box".
|
||||||
|
|
||||||
|
You should also get your employer (if you work as a programmer) or school,
|
||||||
|
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||||
|
For more information on this, and how to apply and follow the GNU GPL, see
|
||||||
|
<https://www.gnu.org/licenses/>.
|
||||||
|
|
||||||
|
The GNU General Public License does not permit incorporating your program
|
||||||
|
into proprietary programs. If your program is a subroutine library, you
|
||||||
|
may consider it more useful to permit linking proprietary applications with
|
||||||
|
the library. If this is what you want to do, use the GNU Lesser General
|
||||||
|
Public License instead of this License. But first, please read
|
||||||
|
<https://www.gnu.org/licenses/why-not-lgpl.html>.
|
||||||
@@ -4,6 +4,8 @@ A honeypot deception network framework. Spin up a fleet of fake machines — cal
|
|||||||
|
|
||||||
Attackers probe the network, DECNET traps every interaction, and you watch from a safe, isolated logging stack.
|
Attackers probe the network, DECNET traps every interaction, and you watch from a safe, isolated logging stack.
|
||||||
|
|
||||||
|
[Support DECNET on Ko-fi](https://ko-fi.com/C0C31YDLB5)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Table of Contents
|
## Table of Contents
|
||||||
|
|||||||
@@ -18,29 +18,138 @@ Endpoints mirror the existing unihost CLI verbs:
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from typing import Optional
|
from typing import Any, Optional
|
||||||
|
|
||||||
from fastapi import FastAPI, HTTPException
|
from fastapi import FastAPI, HTTPException
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
import contextlib
|
||||||
|
|
||||||
from decnet.agent import executor as _exec
|
from decnet.agent import executor as _exec
|
||||||
from decnet.agent import heartbeat as _heartbeat
|
from decnet.agent import heartbeat as _heartbeat
|
||||||
|
from decnet.agent import topology_ops as _topology_ops
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.bus.publish import run_health_heartbeat
|
||||||
|
from decnet.swarm.pki import DEFAULT_AGENT_DIR
|
||||||
|
from decnet.agent.topology_store import AlreadyApplied, TopologyStore
|
||||||
from decnet.config import DecnetConfig
|
from decnet.config import DecnetConfig
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
|
from decnet.topology.validate import ValidationError
|
||||||
|
|
||||||
log = get_logger("agent.app")
|
log = get_logger("agent.app")
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_agent_dir() -> pathlib.Path:
    """Pick the directory that holds this agent's on-disk state.

    Resolution order:

    1. ``DECNET_AGENT_DIR`` from the environment, when set and non-empty.
    2. ``/etc/decnet/agent``, when that path exists.
    3. ``DEFAULT_AGENT_DIR`` otherwise.
    """
    override = os.environ.get("DECNET_AGENT_DIR")
    if override:
        return pathlib.Path(override)

    system_wide = pathlib.Path("/etc/decnet/agent")
    return system_wide if system_wide.exists() else DEFAULT_AGENT_DIR
|
||||||
|
|
||||||
|
|
||||||
|
# Process-wide TopologyStore handle. It is built on first access rather
# than at import time so tests can monkeypatch DECNET_AGENT_DIR before
# the store binds to a path.
_topology_store: Optional[TopologyStore] = None


def _store() -> TopologyStore:
    """Return the shared ``TopologyStore``, creating it on first call.

    The store is opened on ``topology.db`` inside the directory chosen by
    ``_resolve_agent_dir()`` and cached in the module-level
    ``_topology_store`` for later calls.
    """
    global _topology_store
    if _topology_store is not None:
        return _topology_store

    db_path = _resolve_agent_dir() / "topology.db"
    _topology_store = TopologyStore(db_path)
    return _topology_store
|
||||||
|
|
||||||
|
|
||||||
|
_collector_task: Optional[asyncio.Task] = None


def _ensure_collector_started() -> None:
    """Start the log collector task unless one is already running.

    Invoked from /topology/apply after a successful materialise. This must
    NOT be moved into the lifespan hook: the agent's boot invariant is
    "never touch docker until master tells us to" (see
    tests/swarm/test_agent_no_auto_restore.py).

    The collector watches ``decnet.topology.service=true`` labels via
    docker events and writes RFC 5424 lines to ``DECNET_AGENT_LOG_FILE``,
    which the forwarder ships to the master over syslog-TLS. Calling this
    again while the task is still running is a no-op.
    """
    global _collector_task

    still_running = _collector_task is not None and not _collector_task.done()
    if still_running:
        return

    from decnet.env import DECNET_AGENT_LOG_FILE

    try:
        from decnet.collector.worker import log_collector_worker
    except Exception:  # noqa: BLE001 — docker may be unavailable on dev
        log.warning(
            "agent log collector not starting — collector worker import failed",
            exc_info=True,
        )
        return

    # Must run inside a running event loop; the caller is an async handler.
    worker = log_collector_worker(DECNET_AGENT_LOG_FILE)
    _collector_task = asyncio.create_task(worker, name="agent-log-collector")
    log.info("agent log collector started log_file=%s", DECNET_AGENT_LOG_FILE)
|
||||||
|
|
||||||
|
|
||||||
|
_bus_heartbeat_task: Optional[asyncio.Task] = None


@asynccontextmanager
async def _lifespan(app: FastAPI):
    """Application lifespan: start background plumbing, undo it on exit.

    Startup starts the master-facing heartbeat, tries to connect the
    host-local bus and schedules the bus health heartbeat task. Shutdown
    stops the heartbeat, cancels the bus heartbeat task, closes the bus if
    one was connected, cancels a still-running log collector task and
    closes the topology store if it was ever opened.
    """
    global _bus_heartbeat_task, _collector_task, _topology_store

    # Best-effort: if identity/bundle plumbing isn't configured (e.g. dev
    # runs or non-enrolled hosts), heartbeat.start() is a silent no-op.
    _heartbeat.start()

    # Host-local bus heartbeat (system.agent.health). Separate channel
    # from the mTLS master-facing heartbeat above; this one lets peers on
    # the same host (dashboard, updater) see the agent is alive without
    # hitting its HTTPS endpoint. Bus-disabled path is a no-op loop.
    try:
        connected = get_bus(client_name="agent")
        await connected.connect()
    except Exception as exc:  # noqa: BLE001
        log.warning("agent: bus unavailable, skipping health heartbeat: %s", exc)
        bus = None
    else:
        bus = connected

    # The heartbeat task is scheduled even when ``bus`` is None.
    _bus_heartbeat_task = asyncio.create_task(
        run_health_heartbeat(bus, "agent"),
        name="agent-bus-heartbeat",
    )

    try:
        yield
    finally:
        await _heartbeat.stop()

        if _bus_heartbeat_task is not None:
            _bus_heartbeat_task.cancel()
            # Awaiting a cancelled task raises CancelledError; swallow it
            # (and anything the task itself raised) so shutdown continues.
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await _bus_heartbeat_task
        _bus_heartbeat_task = None

        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()

        collector_active = (
            _collector_task is not None and not _collector_task.done()
        )
        if collector_active:
            _collector_task.cancel()
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await _collector_task
        _collector_task = None

        if _topology_store is not None:
            _topology_store.close()
            _topology_store = None
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
@@ -129,6 +238,73 @@ async def self_destruct() -> dict:
|
|||||||
return {"status": "self_destruct_scheduled"}
|
return {"status": "self_destruct_scheduled"}
|
||||||
|
|
||||||
|
|
||||||
|
# ------------------------------------------------------- topology endpoints
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /topology/apply. Documented with comments rather
# than a class docstring so the generated schema description is untouched.
class ApplyTopologyRequest(BaseModel):
    # Topology payload; the endpoint passes it to _topology_ops.apply().
    hydrated: dict[str, Any] = Field(
        ...,
        description="Hydrated topology dict from master.persistence.hydrate()",
    )
    # Hash supplied by the master alongside the payload; echoed back in the
    # endpoint's success response.
    version_hash: str = Field(
        ...,
        description="Master's canonical_hash(hydrated); must match ours",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Request body for POST /topology/teardown. Documented with comments rather
# than a class docstring so the generated schema description is untouched.
class TeardownTopologyRequest(BaseModel):
    # Identifier of the topology the endpoint hands to _topology_ops.teardown().
    topology_id: str = Field(
        ...,
        description="Topology UUID to dismantle",
    )
|
||||||
|
|
||||||
|
|
||||||
|
# POST /topology/apply — apply a hydrated topology sent by the master.
#
# Error mapping:
#   * HashMismatch / ValidationError -> 400
#   * AlreadyApplied                 -> 409
#   * anything else                  -> 500, after a best-effort attempt to
#     record the error in the topology store
# On success the log collector is started (if not already running).
# Kept as comments, not a docstring, so the OpenAPI description is unchanged.
@app.post(
    "/topology/apply",
    responses={
        400: {"description": "Malformed hydrated topology or hash mismatch"},
        409: {"description": "A different topology is already applied"},
        500: {"description": "Docker or compose raised while applying"},
    },
)
async def topology_apply(req: ApplyTopologyRequest) -> dict:
    store = _store()
    try:
        await _topology_ops.apply(req.hydrated, req.version_hash, store)
    except (_topology_ops.HashMismatch, ValidationError) as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except AlreadyApplied as exc:
        raise HTTPException(status_code=409, detail=str(exc)) from exc
    except Exception as exc:
        log.exception("agent.topology_apply failed")
        topology_section = req.hydrated.get("topology") or {}
        topology_id = topology_section.get("id")
        if topology_id:
            # Persist a truncated error message next to the payload that
            # caused it; a failure here must not hide the original error.
            try:
                store.record_error(
                    str(topology_id),
                    str(exc)[:500],
                    hydrated=req.hydrated,
                )
            except Exception:  # noqa: BLE001 — don't mask original failure
                log.exception("failed to record apply error")
        raise HTTPException(status_code=500, detail=str(exc)) from exc

    _ensure_collector_started()
    return {"status": "applied", "version_hash": req.version_hash}
|
||||||
|
|
||||||
|
|
||||||
|
@app.post(
    "/topology/teardown",
    responses={
        500: {"description": "Docker or compose raised while tearing down"},
    },
)
async def topology_teardown(req: TeardownTopologyRequest) -> dict:
    # Dismantle the requested topology; any failure is logged with its
    # traceback and surfaced to the caller as a 500.
    try:
        await _topology_ops.teardown(req.topology_id, _store())
    except Exception as exc:
        log.exception("agent.topology_teardown failed")
        raise HTTPException(status_code=500, detail=str(exc)) from exc
    else:
        return {"status": "torn_down", "topology_id": req.topology_id}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/topology/state")
async def topology_state() -> dict:
    # Report the stored topology record together with the live docker
    # observation produced by the topology ops module.
    snapshot = _topology_ops.state(_store())
    return snapshot
|
||||||
|
|
||||||
|
|
||||||
@app.post(
|
@app.post(
|
||||||
"/mutate",
|
"/mutate",
|
||||||
responses={501: {"description": "Worker-side mutate not yet implemented"}},
|
responses={501: {"description": "Worker-side mutate not yet implemented"}},
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ if command -v docker >/dev/null 2>&1; then
|
|||||||
fi
|
fi
|
||||||
|
|
||||||
# Stop+disable every systemd unit the installer may have dropped.
|
# Stop+disable every systemd unit the installer may have dropped.
|
||||||
for unit in decnet-agent decnet-engine decnet-collector decnet-forwarder decnet-prober decnet-sniffer decnet-updater; do
|
for unit in decnet-agent decnet-engine decnet-collector decnet-forwarder decnet-prober decnet-reconciler decnet-sniffer decnet-updater; do
|
||||||
systemctl stop "$unit" 2>/dev/null
|
systemctl stop "$unit" 2>/dev/null
|
||||||
systemctl disable "$unit" 2>/dev/null
|
systemctl disable "$unit" 2>/dev/null
|
||||||
done
|
done
|
||||||
|
|||||||
@@ -52,14 +52,26 @@ def _resolve_agent_dir() -> pathlib.Path:
|
|||||||
|
|
||||||
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
|
async def _tick(client: httpx.AsyncClient, url: str, host_uuid: str, agent_version: str) -> None:
|
||||||
snap = await _exec.status()
|
snap = await _exec.status()
|
||||||
resp = await client.post(
|
body: dict = {
|
||||||
url,
|
"host_uuid": host_uuid,
|
||||||
json={
|
"agent_version": agent_version,
|
||||||
"host_uuid": host_uuid,
|
"status": snap,
|
||||||
"agent_version": agent_version,
|
}
|
||||||
"status": snap,
|
# Best-effort: fold in applied-topology snapshot. Failures must never
|
||||||
},
|
# wedge the heartbeat loop — master will fall back to "no topology
|
||||||
)
|
# reported" which triggers a resync if it expected one.
|
||||||
|
try:
|
||||||
|
from decnet.agent import topology_ops as _topo_ops
|
||||||
|
from decnet.agent.topology_store import TopologyStore
|
||||||
|
store = TopologyStore(_resolve_agent_dir() / "topology.db")
|
||||||
|
try:
|
||||||
|
body["topology"] = _topo_ops.state(store)
|
||||||
|
finally:
|
||||||
|
store.close()
|
||||||
|
except Exception:
|
||||||
|
log.debug("heartbeat: topology state unavailable", exc_info=True)
|
||||||
|
|
||||||
|
resp = await client.post(url, json=body)
|
||||||
# 403 / 404 are terminal-ish — we still keep looping because an
|
# 403 / 404 are terminal-ish — we still keep looping because an
|
||||||
# operator may re-enrol the host mid-session, but we log loudly so
|
# operator may re-enrol the host mid-session, but we log loudly so
|
||||||
# prod ops can spot cert-pinning drift.
|
# prod ops can spot cert-pinning drift.
|
||||||
|
|||||||
208
decnet/agent/topology_ops.py
Normal file
208
decnet/agent/topology_ops.py
Normal file
@@ -0,0 +1,208 @@
|
|||||||
|
"""Agent-side topology apply/teardown/state primitives.
|
||||||
|
|
||||||
|
Wraps the compose + bridge machinery from :mod:`decnet.engine.deployer`
|
||||||
|
so the agent can drive a topology without ever touching the master's
|
||||||
|
sqlmodel repo. The master-side ``deploy_topology`` always calls
|
||||||
|
``transition_status(repo, …)`` which is useless (and unreachable) on
|
||||||
|
an agent — here we operate purely on a hydrated dict + the local
|
||||||
|
:class:`TopologyStore`.
|
||||||
|
|
||||||
|
v1 constraint: one topology per agent. A second apply for a different
|
||||||
|
``topology_id`` triggers an on-the-spot teardown of the predecessor
|
||||||
|
before the new apply proceeds — master is authoritative.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import subprocess # nosec B404
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import docker
|
||||||
|
|
||||||
|
from decnet.agent.topology_store import (
|
||||||
|
TopologyStore,
|
||||||
|
observed,
|
||||||
|
)
|
||||||
|
from decnet.engine.deployer import (
|
||||||
|
_compose,
|
||||||
|
_compose_with_retry,
|
||||||
|
_teardown_order,
|
||||||
|
_topology_compose_path,
|
||||||
|
)
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.network import create_bridge_network, remove_bridge_network
|
||||||
|
from decnet.topology.compose import (
|
||||||
|
_network_name as _topology_network_name,
|
||||||
|
write_topology_compose,
|
||||||
|
)
|
||||||
|
from decnet.topology.hashing import canonical_hash
|
||||||
|
from decnet.topology.validate import (
|
||||||
|
ValidationError,
|
||||||
|
errors as _validation_errors,
|
||||||
|
validate as _validate_topology,
|
||||||
|
)
|
||||||
|
|
||||||
|
log = get_logger("agent.topology_ops")
|
||||||
|
|
||||||
|
|
||||||
|
class HashMismatch(RuntimeError):
    """The master's ``version_hash`` differs from the locally computed hash.

    A mismatch suggests serialisation drift between master and agent, so
    the apply is refused loudly instead of silently papering over a schema
    mismatch.
    """
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_id(hydrated: dict[str, Any]) -> str:
|
||||||
|
topo = hydrated.get("topology") or {}
|
||||||
|
tid = topo.get("id")
|
||||||
|
if not tid:
|
||||||
|
raise ValueError("hydrated topology missing topology.id")
|
||||||
|
return str(tid)
|
||||||
|
|
||||||
|
|
||||||
|
async def apply(
    hydrated: dict[str, Any],
    version_hash: str,
    store: TopologyStore,
) -> None:
    """Materialise *hydrated* on this agent and record it in *store*.

    Steps, in order: verify the hash, validate the topology, tear down a
    different topology if one is recorded, create one bridge network per
    LAN, write the compose file, ``compose up``, then record the apply.

    Raises:
        HashMismatch: master and agent disagree on the canonical hash —
            don't touch docker, fail the apply.
        ValidationError: topology fails structural validation.
        ValueError: *hydrated* carries no ``topology.id``.
        Any docker / compose error propagates up; the endpoint maps it
        to 500 and records the message on the store row.
    """
    local_hash = canonical_hash(hydrated)
    if local_hash != version_hash:
        raise HashMismatch(
            f"master hash {version_hash!r} does not match agent hash "
            f"{local_hash!r} — refusing to apply"
        )

    issues = _validate_topology(hydrated)
    if _validation_errors(issues):
        raise ValidationError(issues)

    topology_id = _topology_id(hydrated)
    # Master is authoritative. If a different topology is pinned here
    # — whether it fully applied, only partially applied (failure
    # marker row + orphan containers), or drifted — teardown first,
    # then accept the new one. Refusing with 409 would leave the
    # agent stuck in a state only a human could resolve.
    existing = store.current()
    if existing is not None and existing.topology_id != topology_id:
        log.info(
            "superseding topology %s with %s on master authority",
            existing.topology_id, topology_id,
        )
        try:
            await teardown(existing.topology_id, store)
        except Exception as exc:  # noqa: BLE001 — we still want to try applying
            log.warning(
                "best-effort teardown of superseded topology %s failed: %s",
                existing.topology_id, exc,
            )
            # Hard-clear the store row so the new apply isn't blocked
            # by a half-torn-down predecessor. Leftover docker objects
            # will surface via the next heartbeat's observed block.
            store.clear(existing.topology_id)

    lans = hydrated["lans"]
    compose_path = _topology_compose_path(topology_id)
    client = docker.from_env()

    # Bridges + compose are sync/blocking; hop to a thread so we don't
    # stall the event loop on a slow docker daemon.
    def _materialise() -> None:
        try:
            for lan in lans:
                net_name = _topology_network_name(topology_id, lan["name"])
                internal = not lan["is_dmz"]
                create_bridge_network(
                    client, net_name, lan["subnet"], internal=internal
                )
            write_topology_compose(hydrated, compose_path)
            # ``--always-recreate-deps`` keeps service containers' netns shares
            # fresh: every decky service joins its base's netns via
            # ``network_mode: container:<base>``, and that share is bound at
            # service start time. If a base is recreated (e.g. when ``ports:``
            # changes after toggling ``forwards_l3``) but compose decides the
            # services are unchanged, the services keep a stale netns FD
            # pointing at the destroyed base — they end up in an empty
            # namespace with only ``lo``, and external traffic hits a closed
            # port on the live base. Forcing dependents to recreate alongside
            # the base is the cheapest way to make this race impossible.
            _compose_with_retry(
                "up", "--build", "-d", "--always-recreate-deps",
                compose_file=compose_path,
            )
        finally:
            # Release the client's connections in the thread that used it;
            # otherwise every apply leaks a docker connection pool.
            try:
                client.close()
            except Exception:  # noqa: BLE001 — cleanup must not mask the real error
                log.debug("docker client close failed", exc_info=True)

    await asyncio.to_thread(_materialise)

    store.put(topology_id, version_hash, hydrated)
    log.info(
        "topology %s applied on agent (%d LANs)", topology_id, len(lans)
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def teardown(
    topology_id: str,
    store: TopologyStore,
) -> None:
    """Tear down *topology_id* on this agent.

    Runs ``compose down`` when the compose file exists, removes the bridge
    networks listed in the stored hydrated blob (when the store row belongs
    to *topology_id*), deletes the compose file and clears the store row.
    With no record and no compose file this does no docker work beyond
    creating and closing a client.

    A failing ``compose down`` (``CalledProcessError``) is logged and
    skipped; any other error propagates to the caller.
    """
    row = store.current()
    # Prefer the stored hydrated blob — it's what we applied with. If
    # it's gone (db wiped) but the compose file lingers, we still
    # compose-down and delete the file; bridges can only be removed when
    # the blob is available, since it is the only source of LAN names here.
    hydrated = row.hydrated if row and row.topology_id == topology_id else None
    compose_path = _topology_compose_path(topology_id)
    client = docker.from_env()

    def _dismantle() -> None:
        try:
            if compose_path.exists():
                try:
                    _compose("down", "--remove-orphans", compose_file=compose_path)
                except subprocess.CalledProcessError as exc:
                    log.warning(
                        "topology %s compose down failed (continuing): %s",
                        topology_id, exc,
                    )
            if hydrated is not None:
                for lan_name in _teardown_order(hydrated["lans"]):
                    net_name = _topology_network_name(topology_id, lan_name)
                    remove_bridge_network(client, net_name)
            if compose_path.exists():
                compose_path.unlink()
        finally:
            # Release the client's connections in the thread that used it;
            # otherwise every teardown leaks a docker connection pool.
            try:
                client.close()
            except Exception:  # noqa: BLE001 — cleanup must not mask the real error
                log.debug("docker client close failed", exc_info=True)

    await asyncio.to_thread(_dismantle)
    store.clear(topology_id)
    log.info("topology %s torn down on agent", topology_id)
|
||||||
|
|
||||||
|
|
||||||
|
def state(store: TopologyStore) -> dict[str, Any]:
    """Snapshot-plus-live-observation — the shape the heartbeat embeds.

    Returns a dict with ``topology_id``, ``applied_version_hash``,
    ``applied_at`` and ``last_error`` taken from the store row (all
    ``None`` when nothing is recorded) plus ``observed``, the live docker
    observation or an ``{"error": ...}`` marker when docker is unusable.
    """
    row = store.current()

    client = None
    try:
        client = docker.from_env()
        obs = observed(client)
    except Exception as exc:  # noqa: BLE001 — docker socket may be gone
        obs = {"error": str(exc)[:200]}
    finally:
        # This runs on every heartbeat: without an explicit close each call
        # would leave a docker client (and its connections) behind.
        if client is not None:
            try:
                client.close()
            except Exception:  # noqa: BLE001 — best-effort cleanup
                log.debug("docker client close failed", exc_info=True)

    if row is None:
        return {
            "topology_id": None,
            "applied_version_hash": None,
            "applied_at": None,
            "last_error": None,
            "observed": obs,
        }
    return {
        "topology_id": row.topology_id,
        "applied_version_hash": row.applied_version_hash,
        "applied_at": row.applied_at,
        "last_error": row.last_error,
        "observed": obs,
    }
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["apply", "teardown", "state", "HashMismatch"]
|
||||||
213
decnet/agent/topology_store.py
Normal file
213
decnet/agent/topology_store.py
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
"""Agent-side sqlite cache of the currently-applied topology.
|
||||||
|
|
||||||
|
**This is a cache, not a source of truth.** The master is the only
|
||||||
|
authority for what the agent should be running. This store exists so
|
||||||
|
the agent can answer two questions quickly and offline:
|
||||||
|
|
||||||
|
1. What topology did I last apply, and with what version hash?
|
||||||
|
2. Is what docker is currently doing consistent with that?
|
||||||
|
|
||||||
|
The hash goes out on every heartbeat; the master compares it to what
|
||||||
|
it thinks this host should be running and schedules a re-push on
|
||||||
|
mismatch.
|
||||||
|
|
||||||
|
Why sqlite when the blob is JSON? Consistent with
|
||||||
|
:mod:`decnet.swarm.log_forwarder._OffsetStore` — single-row sqlite is
|
||||||
|
the project-wide pattern for agent-local persistent state. Keeps
|
||||||
|
operational mental model small: "one state.db per thing".
|
||||||
|
|
||||||
|
Design choices worth calling out:
|
||||||
|
|
||||||
|
- **One row, one topology.** v1 only supports a single topology per
  agent. Attempting to :meth:`put` a different ``topology_id`` while
  a row already exists raises :class:`AlreadyApplied`, which the apply
  endpoint maps to 409. In practice ``topology_ops.apply`` tears down
  and clears a superseded topology before it calls :meth:`put`, so
  this check is a last-line guard rather than the normal path.
|
||||||
|
- **No auto-restore on boot.** The agent does NOT read this db at
|
||||||
|
startup and try to re-apply. Whatever docker has after a restart
|
||||||
|
is what it has; the next heartbeat reports the truth and the
|
||||||
|
master decides whether to re-push. Same reason we don't sync
|
||||||
|
mutations from agent → master anywhere else: split-brain is worse
|
||||||
|
than temporary drift.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import pathlib
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
|
||||||
|
class AlreadyApplied(RuntimeError):
    """A topology with a different id is already recorded on this agent."""
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class AppliedRow:
    """Immutable view of the single ``applied_topology`` row."""

    # Primary key of the row.
    topology_id: str
    # Hash recorded at apply time; '' on a failure-marker row.
    applied_version_hash: str
    # Decoded ``hydrated_blob_json`` column.
    hydrated: dict[str, Any]
    # Unix timestamp (seconds) of the apply; 0 on a failure-marker row.
    applied_at: int
    # Most recent error message, or None when the last apply succeeded.
    last_error: Optional[str]
|
||||||
|
|
||||||
|
|
||||||
|
class TopologyStore:
    """Single-row sqlite cache. Stdlib only, sync (called from endpoints).

    Can be used as a context manager so short-lived callers get the
    connection closed even when the body raises::

        with TopologyStore(path) as store:
            store.current()
    """

    def __init__(self, db_path: pathlib.Path) -> None:
        """Open (creating if needed) the sqlite file at *db_path*.

        Missing parent directories are created and the
        ``applied_topology`` table is created when absent.
        """
        db_path.parent.mkdir(parents=True, exist_ok=True)
        # check_same_thread=False: Starlette/FastAPI runs sync endpoint
        # bodies on a worker thread distinct from where `app` is imported.
        # The agent is single-process, so there's no real contention —
        # sqlite's own connection lock is enough.
        self._conn = sqlite3.connect(str(db_path), check_same_thread=False)
        self._conn.execute(
            "CREATE TABLE IF NOT EXISTS applied_topology ("
            " topology_id TEXT PRIMARY KEY,"
            " applied_version_hash TEXT NOT NULL,"
            " hydrated_blob_json TEXT NOT NULL,"
            " applied_at INTEGER NOT NULL,"
            " last_error TEXT)"
        )
        self._conn.commit()

    # ------------------------------------------------------ context manager

    def __enter__(self) -> "TopologyStore":
        return self

    def __exit__(self, exc_type: object, exc: object, tb: object) -> None:
        # Always release the connection; exceptions from the body propagate.
        self.close()

    # ----------------------------------------------------------------- reads

    def current(self) -> Optional[AppliedRow]:
        """Return the single applied topology, or ``None`` if idle."""
        row = self._conn.execute(
            "SELECT topology_id, applied_version_hash, hydrated_blob_json,"
            " applied_at, last_error FROM applied_topology LIMIT 1"
        ).fetchone()
        if row is None:
            return None
        return AppliedRow(
            topology_id=row[0],
            applied_version_hash=row[1],
            hydrated=json.loads(row[2]),
            applied_at=int(row[3]),
            last_error=row[4],
        )

    # ---------------------------------------------------------------- writes

    def put(
        self,
        topology_id: str,
        applied_version_hash: str,
        hydrated: dict[str, Any],
    ) -> None:
        """Record an applied topology.

        If a *different* topology is already recorded, raises
        :class:`AlreadyApplied`. Re-applying the same ``topology_id``
        just updates the hash + blob (idempotent re-push) and clears
        any previously recorded ``last_error``.
        """
        existing = self.current()
        if existing is not None and existing.topology_id != topology_id:
            raise AlreadyApplied(
                f"agent already has topology {existing.topology_id!r}; "
                f"cannot apply {topology_id!r}"
            )
        self._conn.execute(
            "INSERT INTO applied_topology"
            " (topology_id, applied_version_hash, hydrated_blob_json,"
            " applied_at, last_error)"
            " VALUES (?, ?, ?, ?, NULL)"
            " ON CONFLICT(topology_id) DO UPDATE SET"
            " applied_version_hash=excluded.applied_version_hash,"
            " hydrated_blob_json=excluded.hydrated_blob_json,"
            " applied_at=excluded.applied_at,"
            " last_error=NULL",
            (
                topology_id,
                applied_version_hash,
                json.dumps(hydrated, sort_keys=True),
                int(time.time()),
            ),
        )
        self._conn.commit()

    def record_error(
        self,
        topology_id: str,
        message: str,
        hydrated: Optional[dict[str, Any]] = None,
    ) -> None:
        """Attach a last-error message for *topology_id*.

        Upserts a marker row when no apply has yet succeeded for this
        topology — that way a failure *during* the first materialise
        (put() hasn't been reached) still surfaces via GET
        /topology/state and the next heartbeat. The marker row uses an
        empty ``applied_version_hash`` so master's heartbeat check sees
        the hash mismatch and schedules a resync.

        If *hydrated* is provided it is stored so a later teardown can
        still walk the LAN list — otherwise a partial deploy strands
        containers + bridges with no breadcrumb back to them. An
        existing non-empty blob is never overwritten here; only the
        ``'{}'`` placeholder is replaced.
        """
        blob = json.dumps(hydrated, sort_keys=True) if hydrated else "{}"
        self._conn.execute(
            "INSERT INTO applied_topology"
            " (topology_id, applied_version_hash, hydrated_blob_json,"
            " applied_at, last_error)"
            " VALUES (?, '', ?, 0, ?)"
            " ON CONFLICT(topology_id) DO UPDATE SET"
            " last_error=excluded.last_error,"
            " hydrated_blob_json=CASE"
            " WHEN applied_topology.hydrated_blob_json='{}'"
            " THEN excluded.hydrated_blob_json"
            " ELSE applied_topology.hydrated_blob_json END",
            (topology_id, blob, message),
        )
        self._conn.commit()

    def clear(self, topology_id: str) -> None:
        """Remove the row for *topology_id* (post-teardown).

        No-op if the row doesn't exist — makes teardown idempotent.
        """
        self._conn.execute(
            "DELETE FROM applied_topology WHERE topology_id=?",
            (topology_id,),
        )
        self._conn.commit()

    def close(self) -> None:
        """Close the underlying sqlite connection."""
        self._conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
# --------------------------------------------------- live docker observation
|
||||||
|
|
||||||
|
|
||||||
|
def observed(docker_client: Any) -> dict[str, Any]:
    """Report which DECNET bridges and containers docker currently lists.

    ``bridges`` holds the sorted names of networks whose driver is
    ``bridge`` and whose name starts with ``decnet-topology-``;
    ``containers`` holds the sorted names of listed (``all=False``)
    containers whose name starts with ``decnet-``.

    Never raises: any failure while talking to docker is returned as
    ``{"error": <message truncated to 200 chars>}`` so the heartbeat can
    still go out.
    """
    try:
        bridge_names = []
        for network in docker_client.networks.list():
            if network.attrs.get("Driver") != "bridge":
                continue
            if network.name.startswith("decnet-topology-"):
                bridge_names.append(network.name)

        container_names = []
        for container in docker_client.containers.list(all=False):
            if container.name.startswith("decnet-"):
                container_names.append(container.name)

        bridge_names.sort()
        container_names.sort()
        return {"bridges": bridge_names, "containers": container_names}
    except Exception as exc:  # noqa: BLE001 — best-effort observation
        return {"error": str(exc)[:200]}
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["TopologyStore", "AppliedRow", "AlreadyApplied", "observed"]
|
||||||
92
decnet/asn/__init__.py
Normal file
92
decnet/asn/__init__.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""
|
||||||
|
IP-to-ASN enrichment — maps attacker IPs to BGP-announced AS numbers and
|
||||||
|
org names for attacker intelligence.
|
||||||
|
|
||||||
|
Public surface mirrors :mod:`decnet.geoip` so callers can compose them:
|
||||||
|
|
||||||
|
* :func:`get_lookup` — returns the singleton :class:`AsnLookup`.
|
||||||
|
* :func:`enrich_ip` — takes an IP string, returns
|
||||||
|
``(asn_int, asn_name, provider_name)`` or ``(None, None, None)``.
|
||||||
|
|
||||||
|
Provider selection goes through :func:`~decnet.asn.factory.get_provider`
|
||||||
|
(env ``DECNET_ASN_PROVIDER``, default ``iptoasn``). Direct imports of
|
||||||
|
concrete providers are forbidden — mirrors the ``get_bus`` /
|
||||||
|
``get_repository`` rule.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
from typing import Optional, Tuple
|
||||||
|
|
||||||
|
from decnet.asn.factory import get_provider
|
||||||
|
from decnet.asn.lookup import AsnLookup
|
||||||
|
from decnet.asn.paths import ASN_ROOT
|
||||||
|
|
||||||
|
# 24 h — iptoasn refreshes daily.
|
||||||
|
REFRESH_INTERVAL_S = 86_400
|
||||||
|
|
||||||
|
_lookup: Optional[AsnLookup] = None
|
||||||
|
_provider_name: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_lookup(*, force_refresh: bool = False) -> AsnLookup:
    """Return the cached :class:`AsnLookup`, building it on first use.

    If the provider's data files are missing or older than
    ``REFRESH_INTERVAL_S`` seconds, refresh before building. Pass
    ``force_refresh=True`` to bypass the age check (used by a future
    ``decnet asn refresh`` CLI command).

    The cache is also dropped when the configured provider differs from
    the one the cached lookup was built with, so the lookup always
    matches the provider name reported by :func:`enrich_ip`.
    """
    global _lookup, _provider_name
    provider = get_provider()

    # A lookup built from another provider's data must not be served under
    # the new provider's name. ``_provider_name`` is still None before the
    # first call, in which case an already-present lookup is left alone.
    if _provider_name is not None and _provider_name != provider.name:
        _lookup = None
    _provider_name = provider.name

    if force_refresh or _files_stale(provider):
        provider.refresh()
        _lookup = None  # data changed on disk — rebuilt just below

    if _lookup is None:
        _lookup = provider.build_lookup()
    return _lookup
|
||||||
|
|
||||||
|
|
||||||
|
def enrich_ip(ip: str) -> Tuple[Optional[int], Optional[str], Optional[str]]:
    """Resolve *ip* to ``(asn, as_name, provider_name)``.

    Returns ``(None, None, None)`` when enrichment is switched off with
    ``DECNET_ASN_ENABLED=false`` (case-insensitive), when the lookup has
    no entry for *ip*, or when anything at all goes wrong — this function
    never raises, so the caller (profiler) can upsert the attacker row
    regardless.

    An empty AS name is reported as ``None``; a missing provider name is
    reported as ``"unknown"``.
    """
    nothing = (None, None, None)

    enabled_flag = os.environ.get("DECNET_ASN_ENABLED", "true")
    if enabled_flag.lower() == "false":
        return nothing

    try:
        info = get_lookup().asn(ip)
        if info is not None:
            return (info.asn, info.name or None, _provider_name or "unknown")
        return nothing
    except Exception:
        return nothing
|
||||||
|
|
||||||
|
|
||||||
|
def _files_stale(provider) -> bool:
|
||||||
|
"""True when the provider has no fresh data on disk.
|
||||||
|
|
||||||
|
Same semantics as :func:`decnet.geoip._files_stale`: a partial
|
||||||
|
cache still produces correct answers for the ranges it covers.
|
||||||
|
"""
|
||||||
|
paths = provider.data_paths()
|
||||||
|
if not paths:
|
||||||
|
return True
|
||||||
|
now = time.time()
|
||||||
|
for p in paths:
|
||||||
|
if p.exists() and now - p.stat().st_mtime <= REFRESH_INTERVAL_S:
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["get_lookup", "enrich_ip", "ASN_ROOT", "REFRESH_INTERVAL_S"]
|
||||||
33
decnet/asn/base.py
Normal file
33
decnet/asn/base.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
"""ASN provider protocol — mirror of :mod:`decnet.geoip.base`.
|
||||||
|
|
||||||
|
Concrete providers (e.g. :mod:`decnet.asn.iptoasn`) implement this.
|
||||||
|
Callers must go through :func:`decnet.asn.factory.get_provider`; never
|
||||||
|
import a concrete provider class directly.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
from decnet.asn.lookup import AsnLookup
|
||||||
|
|
||||||
|
|
||||||
|
class Provider(ABC):
    """Interface every IP→ASN data source has to implement."""

    #: Short tag written to ``Attacker.asn_source`` (e.g. ``'iptoasn'``).
    name: str

    @abstractmethod
    def refresh(self) -> None:
        """Fetch or regenerate the raw data files this provider relies on."""

    @abstractmethod
    def build_lookup(self) -> AsnLookup:
        """Load the on-disk data files into a ready-to-query lookup."""

    @abstractmethod
    def data_paths(self) -> Sequence[Path]:
        """List the files this provider manages.

        Used for staleness detection; the order of the entries carries
        no meaning.
        """
|
||||||
39
decnet/asn/factory.py
Normal file
39
decnet/asn/factory.py
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
"""ASN provider factory — mirror of :mod:`decnet.geoip.factory`.
|
||||||
|
|
||||||
|
Dispatch key: ``DECNET_ASN_PROVIDER`` (default ``iptoasn``). Lazy
|
||||||
|
singleton.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from decnet.asn.base import Provider
|
||||||
|
|
||||||
|
_cached: Optional[Provider] = None
|
||||||
|
_cached_key: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_provider() -> Provider:
    """Return the :class:`Provider` selected by ``DECNET_ASN_PROVIDER``.

    The value is lower-cased and defaults to ``iptoasn``. The instance is
    cached and reused for as long as the selected key stays the same.

    Raises:
        ValueError: the selected key names no supported provider.
    """
    global _cached, _cached_key

    requested = os.environ.get("DECNET_ASN_PROVIDER", "iptoasn").lower()
    if _cached is not None and _cached_key == requested:
        return _cached

    if requested != "iptoasn":
        raise ValueError(f"Unsupported ASN provider: {requested!r}")

    # Imported lazily, only once this provider is actually selected.
    from decnet.asn.iptoasn.provider import IptoasnProvider

    instance: Provider = IptoasnProvider()
    _cached, _cached_key = instance, requested
    return instance
|
||||||
|
|
||||||
|
|
||||||
|
def reset_cache() -> None:
    """Drop the cached provider and its key.

    Tests swap providers through the env var and call this in between.
    """
    global _cached, _cached_key
    _cached = _cached_key = None
|
||||||
9
decnet/asn/iptoasn/__init__.py
Normal file
9
decnet/asn/iptoasn/__init__.py
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
"""iptoasn.com IP→ASN provider.
|
||||||
|
|
||||||
|
Daily-refreshed gzipped TSV dump of the global BGP table, derived from
|
||||||
|
RIPE RIS. Released into the public domain by upstream — no attribution
|
||||||
|
required, no UA mandate, no terms to violate.
|
||||||
|
|
||||||
|
Direct imports of :class:`IptoasnProvider` are discouraged — go through
|
||||||
|
:func:`decnet.asn.factory.get_provider`.
|
||||||
|
"""
|
||||||
63
decnet/asn/iptoasn/fetch.py
Normal file
63
decnet/asn/iptoasn/fetch.py
Normal file
@@ -0,0 +1,63 @@
|
|||||||
|
"""iptoasn.com bulk dump download.
|
||||||
|
|
||||||
|
One file: ``ip2asn-v4.tsv.gz``, ~5 MB compressed, refreshed daily.
|
||||||
|
Pulled over HTTPS with the same generic UA the geoip RIR fetcher uses
|
||||||
|
(stealth: never identify as DECNET — public-data scrapers correlated to
|
||||||
|
honeypot operator egress is the threat model).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import shutil
|
||||||
|
import urllib.request
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger("decnet.asn.iptoasn.fetch")
|
||||||
|
|
||||||
|
# Mirror the (name, url) tuple shape of geoip.rir.fetch so test
|
||||||
|
# harnesses can swap one for the other.
|
||||||
|
IPTOASN_SOURCES: Tuple[Tuple[str, str], ...] = (
|
||||||
|
("ip2asn-v4", "https://iptoasn.com/data/ip2asn-v4.tsv.gz"),
|
||||||
|
)
|
||||||
|
|
||||||
|
# Generic UA — matches geoip.rir.fetch. iptoasn.com explicitly releases
|
||||||
|
# the data into the public domain and does NOT require an identifying UA,
|
||||||
|
# so we keep DECNET stealth instead of advertising.
|
||||||
|
_USER_AGENT = "Mozilla/5.0 (compatible; fetch/1.0)"
|
||||||
|
_TIMEOUT_S = 60
|
||||||
|
|
||||||
|
|
||||||
|
def fetch_all(dest: Path) -> list[Path]:
    """Fetch every entry of ``IPTOASN_SOURCES`` into the directory *dest*.

    Each source is streamed into a ``{name}.tsv.gz.tmp`` staging file and
    only renamed over ``{name}.tsv.gz`` once the download has finished, so
    a failed download never clobbers the file from an earlier run.

    Failures are logged per source and do not abort the loop.

    Returns:
        The final paths of the sources that were fetched successfully, in
        ``IPTOASN_SOURCES`` order.
    """
    dest.mkdir(parents=True, exist_ok=True)
    fetched: list[Path] = []
    for name, url in IPTOASN_SOURCES:
        final_path = dest / f"{name}.tsv.gz"
        # with_suffix swaps the trailing ".gz" -> "<name>.tsv.gz.tmp".
        staging = final_path.with_suffix(".gz.tmp")
        try:
            _download(url, staging)
            staging.replace(final_path)
            fetched.append(final_path)
            size = final_path.stat().st_size
            logger.info("asn.iptoasn: fetched %s (%d bytes)", name, size)
        except Exception as exc:
            logger.error(
                "asn.iptoasn: fetch failed for %s (%s): %s", name, url, exc
            )
            # Drop the partial staging file; the previously fetched file
            # (if any) is deliberately left alone - outdated beats empty.
            if staging.exists():
                staging.unlink(missing_ok=True)
    return fetched
|
||||||
|
|
||||||
|
|
||||||
|
def _download(url: str, dest: Path) -> None:
    """Stream the body of *url* into the file *dest* (opened ``"wb"``).

    The request carries ``_USER_AGENT`` as its User-Agent header and uses
    ``_TIMEOUT_S`` as the ``urlopen`` timeout. Errors propagate to the caller.
    """
    request = urllib.request.Request(url, headers={"User-Agent": _USER_AGENT})
    with urllib.request.urlopen(request, timeout=_TIMEOUT_S) as response:  # nosec B310 — fixed https iptoasn URL
        with dest.open("wb") as sink:
            shutil.copyfileobj(response, sink)
|
||||||
78
decnet/asn/iptoasn/parse.py
Normal file
78
decnet/asn/iptoasn/parse.py
Normal file
@@ -0,0 +1,78 @@
|
|||||||
|
"""Parser for the iptoasn.com ``ip2asn-v4.tsv`` dump.
|
||||||
|
|
||||||
|
Line shape (gzipped, one row per BGP-announced prefix)::
|
||||||
|
|
||||||
|
1.0.0.0\\t1.0.0.255\\t13335\\tUS\\tCLOUDFLARENET
|
||||||
|
|
||||||
|
Fields: ``range_start``, ``range_end``, ``as_number``, ``country_code``,
|
||||||
|
``as_description``. Both range columns are dotted IPv4 strings (the dump
|
||||||
|
is IPv4-only — there's a separate ``ip2asn-v6.tsv.gz`` we don't pull).
|
||||||
|
|
||||||
|
Rows skipped:
|
||||||
|
|
||||||
|
* ``as_number == 0`` — iptoasn's sentinel for "unannounced" / private
|
||||||
|
/ reserved space. Country may still be present (``"None"`` / two-letter
|
||||||
|
CC) but we don't care: the geoip module owns country, ASN owns BGP.
|
||||||
|
* Rows where either range column won't parse as IPv4.
|
||||||
|
* Rows with fewer than 3 tab-separated columns.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
import ipaddress
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterator
|
||||||
|
|
||||||
|
from decnet.asn.lookup import AsnInfo, Range
|
||||||
|
|
||||||
|
logger = logging.getLogger("decnet.asn.iptoasn.parse")
|
||||||
|
|
||||||
|
|
||||||
|
def parse_file(path: Path) -> Iterator[Range]:
    """Lazily parse an iptoasn TSV dump into lookup ranges.

    Yields one ``(start_int, end_int_inclusive, AsnInfo)`` triple per usable
    row. A path whose suffix is ``.gz`` is read through :mod:`gzip`; any
    other path is opened as plain text (handy for hand-written fixtures).

    Rows are silently dropped when they:

    * are blank or have fewer than three tab-separated columns,
    * carry ASN ``0``,
    * have an end address below the start address.

    Rows whose ASN is not an integer, or whose range columns are not valid
    IPv4 addresses, are dropped with a debug log entry.
    """
    open_text = gzip.open if path.suffix == ".gz" else open
    with open_text(path, "rt", encoding="utf-8", errors="replace") as handle:
        for lineno, raw in enumerate(handle, 1):
            # A blank line splits into a single empty field, so the column
            # count check below rejects it as well.
            fields = raw.rstrip("\n").split("\t")
            if len(fields) < 3:
                continue
            start_text, end_text, asn_text = fields[:3]

            try:
                asn = int(asn_text)
            except ValueError:
                logger.debug(
                    "asn.iptoasn: skipping malformed asn line %d in %s",
                    lineno, path.name,
                )
                continue
            # ASN 0 marks rows with no origin AS; nothing to enrich with.
            if asn == 0:
                continue

            try:
                first = int(ipaddress.IPv4Address(start_text))
                last = int(ipaddress.IPv4Address(end_text))
            except (ValueError, ipaddress.AddressValueError):
                logger.debug(
                    "asn.iptoasn: skipping malformed addr line %d in %s",
                    lineno, path.name,
                )
                continue
            if last < first:
                continue

            # The description is the 5th column; "" when the row is shorter.
            description = fields[4].strip() if len(fields) > 4 else ""
            yield (first, last, AsnInfo(asn=asn, name=description))
|
||||||
83
decnet/asn/iptoasn/provider.py
Normal file
83
decnet/asn/iptoasn/provider.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
"""iptoasn provider — orchestrates fetch + parse into an :class:`AsnLookup`.
|
||||||
|
|
||||||
|
Mirrors :class:`decnet.geoip.rir.provider.RirProvider` exactly: fetch,
|
||||||
|
build a pickled cache, invalidate when raw files are newer than the
|
||||||
|
cache.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Sequence
|
||||||
|
|
||||||
|
from decnet.asn.base import Provider
|
||||||
|
from decnet.asn.iptoasn.fetch import IPTOASN_SOURCES, fetch_all
|
||||||
|
from decnet.asn.iptoasn.parse import parse_file
|
||||||
|
from decnet.asn.lookup import AsnLookup
|
||||||
|
from decnet.asn.paths import ensure_root
|
||||||
|
|
||||||
|
logger = logging.getLogger("decnet.asn.iptoasn.provider")
|
||||||
|
|
||||||
|
# Pickled lookup cache — skips re-parsing the ~580k-row gz dump on every
|
||||||
|
# profiler restart. Rebuilt whenever any raw file is newer than the
|
||||||
|
# cache, see ``_cache_fresh``.
|
||||||
|
_CACHE_NAME = ".iptoasn_index.pkl"
|
||||||
|
|
||||||
|
|
||||||
|
class IptoasnProvider(Provider):
    """ASN provider backed by the iptoasn dump files under the ASN root.

    ``refresh`` downloads the raw files; ``build_lookup`` turns them into an
    :class:`AsnLookup`, going through a pickled index (``_CACHE_NAME``) stored
    next to the raw files so the dump is not re-parsed on every call.
    """

    name = "iptoasn"

    def __init__(self) -> None:
        # ensure_root() creates the data directory if needed and returns it.
        self._root = ensure_root()

    # ---------- Provider interface ----------

    def refresh(self) -> None:
        """Download the raw files and drop the pickled index, if present."""
        logger.info("asn.iptoasn: refreshing dump into %s", self._root)
        fetch_all(self._root)
        stale_index = self._root / _CACHE_NAME
        if stale_index.exists():
            stale_index.unlink(missing_ok=True)

    def build_lookup(self) -> AsnLookup:
        """Return a lookup, loaded from the pickled index when it is fresh.

        When the index is missing, stale, or fails to load, every existing
        raw file is parsed and a new index is written. A failure to write
        the index is logged and otherwise ignored.
        """
        index_path = self._root / _CACHE_NAME
        if self._cache_fresh(index_path):
            try:
                cached = AsnLookup.load(index_path)
                logger.debug(
                    "asn.iptoasn: loaded cached index (%d ranges)",
                    len(cached),
                )
                return cached
            except Exception as exc:
                logger.warning(
                    "asn.iptoasn: cache load failed, rebuilding: %s", exc
                )

        rows = [
            row
            for raw_file in self.data_paths()
            if raw_file.exists()
            for row in parse_file(raw_file)
        ]
        built = AsnLookup.from_ranges(rows)
        try:
            built.save(index_path)
        except Exception as exc:
            logger.warning("asn.iptoasn: cache save failed: %s", exc)
        logger.info("asn.iptoasn: built index with %d ranges", len(built))
        return built

    def data_paths(self) -> Sequence[Path]:
        """Return the expected on-disk path of every raw source file."""
        return [self._root / f"{name}.tsv.gz" for name, _url in IPTOASN_SOURCES]

    # ---------- internals ----------

    def _cache_fresh(self, cache: Path) -> bool:
        """True when *cache* exists and no existing raw file is newer than it."""
        if not cache.exists():
            return False
        built_at = cache.stat().st_mtime
        return not any(
            raw_file.exists() and raw_file.stat().st_mtime > built_at
            for raw_file in self.data_paths()
        )
|
||||||
126
decnet/asn/lookup.py
Normal file
126
decnet/asn/lookup.py
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
"""Provider-agnostic IP→ASN lookup.
|
||||||
|
|
||||||
|
A :class:`AsnLookup` is a frozen, sorted array of ``(start_ip,
|
||||||
|
end_ip_inclusive, AsnInfo)`` ranges queried via :mod:`bisect`.
|
||||||
|
O(log n) on ~600k ranges (a current iptoasn dump is ~580k rows).
|
||||||
|
|
||||||
|
Private/loopback/invalid IPv4 and all IPv6 addresses resolve to
|
||||||
|
``None`` — the same policy :mod:`decnet.geoip.lookup` uses.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import bisect
|
||||||
|
import ipaddress
|
||||||
|
import pickle # nosec B403 — self-produced cache under /var/lib/decnet, never deserialized from untrusted input
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class AsnInfo:
    """Origin metadata attached to one IPv4 range."""

    asn: int
    name: str  # AS description / org name; "" if absent in the source data


# One lookup row: (start_ip_as_int, end_ip_as_int_inclusive, AsnInfo).
Range = Tuple[int, int, AsnInfo]


@dataclass
class AsnLookup:
    """Indexed AS lookup over IPv4 ranges.

    The index is three parallel lists ordered by range start; ``asn()``
    bisects ``_starts`` and then checks the candidate's inclusive end.
    The bisect only ever inspects the range with the greatest start at or
    below the queried address, so ranges are expected not to overlap.
    """

    # Parallel arrays for bisect: _starts[i] is the start-IP of the i-th
    # range, _ends[i] its inclusive end, _infos[i] its AsnInfo.
    _starts: List[int]
    _ends: List[int]
    _infos: List[AsnInfo]

    @classmethod
    def from_ranges(cls, ranges: Iterable[Range]) -> "AsnLookup":
        """Build a lookup from ``(start, end_inclusive, AsnInfo)`` triples.

        Input order does not matter: rows are sorted by ``(start, end)``.
        When several rows share a start only one is kept - the one that
        sorts last, i.e. the row with the largest end, or the last-supplied
        row among exact ``(start, end)`` duplicates (the sort is stable).
        """
        ordered = sorted(ranges, key=lambda r: (r[0], r[1]))
        starts: List[int] = []
        ends: List[int] = []
        infos: List[AsnInfo] = []
        for start, end, info in ordered:
            if starts and starts[-1] == start:
                # Same start as the previous row: overwrite it in place.
                ends[-1] = end
                infos[-1] = info
                continue
            starts.append(start)
            ends.append(end)
            infos.append(info)
        return cls(starts, ends, infos)

    def asn(self, ip: str) -> Optional[AsnInfo]:
        """Return the :class:`AsnInfo` for *ip*, or ``None``.

        ``None`` is returned for malformed strings, any IPv6 address,
        private / loopback / link-local / multicast / reserved /
        unspecified IPv4 addresses, and addresses not covered by a range.
        """
        try:
            addr = ipaddress.ip_address(ip)
        except ValueError:
            return None
        if isinstance(addr, ipaddress.IPv6Address):
            return None
        if (
            addr.is_private
            or addr.is_loopback
            or addr.is_link_local
            or addr.is_multicast
            or addr.is_reserved
            or addr.is_unspecified
        ):
            return None

        n = int(addr)
        # Rightmost range whose start is <= n; -1 when n precedes them all.
        idx = bisect.bisect_right(self._starts, n) - 1
        if idx < 0:
            return None
        if n <= self._ends[idx]:
            return self._infos[idx]
        return None

    def __len__(self) -> int:
        """Number of ranges in the index."""
        return len(self._starts)

    # ---------- persistence ----------

    def save(self, path: Path) -> None:
        """Pickle the lookup to *path* via a sibling ``.tmp`` file + rename.

        If writing or renaming fails the temporary file is removed before
        the error is re-raised, so failed saves do not leave debris behind.
        """
        tmp = path.with_suffix(path.suffix + ".tmp")
        tmp.parent.mkdir(parents=True, exist_ok=True)
        try:
            with tmp.open("wb") as fh:
                pickle.dump(
                    {
                        "version": 1,
                        "starts": self._starts,
                        "ends": self._ends,
                        # Plain tuples keep the payload independent of the
                        # AsnInfo class layout.
                        "infos": [(i.asn, i.name) for i in self._infos],
                    },
                    fh,
                    protocol=pickle.HIGHEST_PROTOCOL,
                )
            tmp.replace(path)
        except BaseException:
            # Best-effort cleanup; the original error is what matters.
            try:
                tmp.unlink(missing_ok=True)
            except OSError:
                pass
            raise

    @classmethod
    def load(cls, path: Path) -> "AsnLookup":
        """Load a lookup previously written by :meth:`save`.

        Raises:
            ValueError: the payload is not a dict, has an unsupported
                ``version``, or its parallel arrays differ in length.
            KeyError: a required key is missing from the payload.

        NOTE: this unpickles *path*; only point it at files this process
        family produced itself, never at untrusted input.
        """
        with path.open("rb") as fh:
            data = pickle.load(fh)  # nosec B301 — self-produced file under /var/lib/decnet
        if not isinstance(data, dict):
            raise ValueError(
                f"malformed asn-lookup index: expected dict, got {type(data).__name__}"
            )
        if data.get("version") != 1:
            raise ValueError(
                f"unsupported asn-lookup index version: {data.get('version')!r}"
            )
        starts, ends = data["starts"], data["ends"]
        infos = [AsnInfo(asn=a, name=n) for a, n in data["infos"]]
        # A truncated or hand-edited index would otherwise blow up (or lie)
        # much later, inside asn().
        if not (len(starts) == len(ends) == len(infos)):
            raise ValueError(
                "malformed asn-lookup index: parallel arrays differ in length"
            )
        return cls(starts, ends, infos)
|
||||||
18
decnet/asn/paths.py
Normal file
18
decnet/asn/paths.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
"""Filesystem layout for ASN data — mirror of :mod:`decnet.geoip.paths`.
|
||||||
|
|
||||||
|
``ASN_ROOT`` is where providers drop their raw files and cache indexes.
|
||||||
|
Default ``/var/lib/decnet/asn``. Override with ``DECNET_ASN_ROOT`` for
|
||||||
|
test harnesses.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
ASN_ROOT = Path(os.environ.get("DECNET_ASN_ROOT", "/var/lib/decnet/asn"))
|
||||||
|
|
||||||
|
|
||||||
|
def ensure_root() -> Path:
    """Return ``ASN_ROOT``, creating the directory (and parents) if missing."""
    root = ASN_ROOT
    root.mkdir(parents=True, exist_ok=True)
    return root
|
||||||
18
decnet/bus/__init__.py
Normal file
18
decnet/bus/__init__.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
"""DECNET ServiceBus — pub/sub notification substrate.
|
||||||
|
|
||||||
|
The bus is the notification layer for DECNET's worker constellation. The DB
|
||||||
|
remains the source of truth for anything durable; the bus carries "something
|
||||||
|
happened, go look" events. Delivery is at-most-once, fire-and-forget.
|
||||||
|
|
||||||
|
Consumers call :func:`get_bus` from :mod:`decnet.bus.factory`; never import
|
||||||
|
transport implementations directly. The factory selects the backend via
|
||||||
|
``DECNET_BUS_TYPE`` (``unix`` or ``fake``) and honors ``DECNET_BUS_ENABLED``.
|
||||||
|
|
||||||
|
Topic hierarchy is defined in :mod:`decnet.bus.topics` and locked early so
|
||||||
|
consumers can subscribe with stable wildcard patterns.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.bus.base import BaseBus, Event, Subscription
|
||||||
|
|
||||||
|
__all__ = ["BaseBus", "Event", "Subscription"]
|
||||||
92
decnet/bus/app.py
Normal file
92
decnet/bus/app.py
Normal file
@@ -0,0 +1,92 @@
|
|||||||
|
"""Process-wide bus singleton for request-serving workers (API, SSE routes).
|
||||||
|
|
||||||
|
A single connected :class:`~decnet.bus.base.BaseBus` shared across request
|
||||||
|
handlers — opening a UNIX socket per request would be wasteful and add
|
||||||
|
latency to the hot path. The API lifespan is responsible for calling
|
||||||
|
:func:`close_app_bus` on shutdown; connect is lazy so tests and
|
||||||
|
contract-test mode that never hit a publish/subscribe code path don't
|
||||||
|
pay for a bus connection they'll never use.
|
||||||
|
|
||||||
|
Failures during :meth:`BaseBus.connect` are swallowed and logged — a
|
||||||
|
dead bus must never break request serving. Publishers should treat a
|
||||||
|
``None`` return from :func:`get_app_bus` as "skip this notification",
|
||||||
|
same as ``DECNET_BUS_ENABLED=false``.
|
||||||
|
|
||||||
|
Connect is **retried with a short backoff** (not one-shot): a startup
|
||||||
|
race where the API lifespan hits :func:`get_app_bus` before ``decnet
|
||||||
|
bus`` is ready would otherwise poison the singleton for the entire
|
||||||
|
process lifetime. Instead we remember the last failure timestamp and
|
||||||
|
let callers retry once ``_RETRY_BACKOFF`` seconds have passed.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
|
log = get_logger("bus.app")
|
||||||
|
|
||||||
|
# Publishers in the hot path shouldn't pay connect-retry latency on every
|
||||||
|
# call; the dashboard's own 5 s poll interval recovers within one tick
|
||||||
|
# once the bus comes up. A persistently-dead bus only gets a connect
|
||||||
|
# attempt every 2 s, not once per request.
|
||||||
|
_RETRY_BACKOFF: float = 2.0
|
||||||
|
|
||||||
|
_lock = asyncio.Lock()
|
||||||
|
_shared: BaseBus | None = None
|
||||||
|
_last_failure_ts: float = 0.0
|
||||||
|
|
||||||
|
|
||||||
|
async def get_app_bus() -> BaseBus | None:
    """Return the process-wide connected bus, or ``None`` if unavailable.

    The first successful call builds a client with
    ``get_bus(client_name="api")``, awaits ``connect()`` and caches it;
    later calls return the cached instance without taking the lock.

    If building or connecting raises, the error is logged, the failure
    time is recorded, and ``None`` is returned. Calls made within
    ``_RETRY_BACKOFF`` seconds of a recorded failure return ``None``
    without attempting a connect; the first call after that window retries.
    """
    global _shared, _last_failure_ts
    if _shared is not None:
        return _shared
    # ``_last_failure_ts == 0.0`` means "no failure recorded". It must be
    # tested explicitly: time.monotonic() has an undefined reference point,
    # so "now - 0.0" may legitimately be smaller than the backoff and would
    # otherwise be mistaken for a fresh failure.
    if _last_failure_ts and (time.monotonic() - _last_failure_ts) < _RETRY_BACKOFF:
        return None
    async with _lock:
        # Re-check both conditions: another task may have connected (or
        # failed) while this one was waiting for the lock.
        if _shared is not None:
            return _shared
        if _last_failure_ts and (time.monotonic() - _last_failure_ts) < _RETRY_BACKOFF:
            return None
        try:
            candidate = get_bus(client_name="api")
            await candidate.connect()
        except Exception as exc:  # noqa: BLE001
            log.warning("app bus unavailable: %s", exc)
            _last_failure_ts = time.monotonic()
            return None
        _shared = candidate
        _last_failure_ts = 0.0
        return _shared
|
||||||
|
|
||||||
|
|
||||||
|
async def close_app_bus() -> None:
    """Detach and close the shared bus, and forget any recorded failure.

    The module-level reference is cleared *before* ``close()`` is awaited,
    and the failure timestamp is reset so a later ``get_app_bus()`` is not
    held back by an old backoff window. Calling this when no bus is open
    is a no-op. An exception raised by ``close()`` is logged, not
    propagated.
    """
    global _shared, _last_failure_ts
    closing = _shared
    _shared = None
    _last_failure_ts = 0.0
    if closing is None:
        return
    try:
        await closing.close()
    except Exception as exc:  # noqa: BLE001
        log.warning("app bus close raised: %s", exc)
|
||||||
205
decnet/bus/base.py
Normal file
205
decnet/bus/base.py
Normal file
@@ -0,0 +1,205 @@
|
|||||||
|
"""Bus abstractions: the :class:`Event` envelope and the :class:`BaseBus` ABC.
|
||||||
|
|
||||||
|
Every transport (NATS, in-process fake, null) speaks this contract. The
|
||||||
|
envelope is versioned (``v``) so future evolution never breaks deployed
|
||||||
|
consumers that happen to see a newer event shape.
|
||||||
|
|
||||||
|
Subscription model: :meth:`BaseBus.subscribe` returns a :class:`Subscription`
|
||||||
|
that is an async context manager AND an async iterator. The expected usage is:
|
||||||
|
|
||||||
|
async with bus.subscribe("topology.*.mutation.*") as sub:
|
||||||
|
async for event in sub:
|
||||||
|
handle(event)
|
||||||
|
|
||||||
|
Leaving the ``async with`` releases the underlying subscription handle; the
|
||||||
|
transport is free to drop any buffered events after that point.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import abc
|
||||||
|
import asyncio
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, AsyncIterator
|
||||||
|
|
||||||
|
EVENT_SCHEMA_VERSION = 1


@dataclass(frozen=True)
class Event:
    """The bus envelope.

    Fields:
        topic: subject the event was published on / received from.
        payload: event body.
        type: short discriminator string; ``""`` when not supplied.
        v: envelope schema version; defaults to ``EVENT_SCHEMA_VERSION``.
        ts: epoch seconds as a float; defaults to ``time.time()`` at creation.
        id: random UUID4 hex string; lets consumers de-duplicate.
    """

    topic: str
    payload: dict[str, Any]
    type: str = ""
    v: int = EVENT_SCHEMA_VERSION
    ts: float = field(default_factory=time.time)
    id: str = field(default_factory=lambda: uuid.uuid4().hex)

    def to_dict(self) -> dict[str, Any]:
        """Return the wire-format dict for this event (payload not copied)."""
        return {
            "v": self.v,
            "id": self.id,
            "topic": self.topic,
            "type": self.type,
            "ts": self.ts,
            "payload": self.payload,
        }

    @classmethod
    def from_dict(cls, topic: str, data: dict[str, Any]) -> "Event":
        """Reconstruct an Event from a wire-format dict.

        *topic* is supplied by the caller and any ``"topic"`` key in *data*
        is ignored, so a publisher cannot claim a subject other than the
        one the message actually arrived on.

        Every envelope field is optional: a key that is missing or ``None``
        falls back to the same default a freshly constructed event would
        get (empty ``payload``/``type``/``id`` values do too).

        Raises:
            ValueError / TypeError: ``v`` or ``ts`` is present but cannot
                be converted to ``int`` / ``float``.
        """
        raw_v = data.get("v")
        raw_ts = data.get("ts")
        return cls(
            topic=topic,
            payload=data.get("payload") or {},
            type=data.get("type") or "",
            # ``is None`` rather than truthiness: 0 is a legitimate value.
            v=EVENT_SCHEMA_VERSION if raw_v is None else int(raw_v),
            ts=time.time() if raw_ts is None else float(raw_ts),
            id=data.get("id") or uuid.uuid4().hex,
        )
|
||||||
|
|
||||||
|
|
||||||
|
class Subscription(abc.ABC):
    """Base class for an open subscription.

    Instances are both an async context manager (leaving the ``async with``
    block calls :meth:`aclose`) and their own async iterator. Subclasses
    provide ``__anext__`` and ``_aclose``; ``aclose`` guarantees ``_aclose``
    runs at most once per instance.
    """

    def __init__(self, pattern: str) -> None:
        self.pattern = pattern
        # Flipped by aclose(); subclasses may consult it in __anext__.
        self._closed = False

    async def __aenter__(self) -> "Subscription":
        return self

    async def __aexit__(self, *exc: Any) -> None:
        await self.aclose()

    def __aiter__(self) -> AsyncIterator[Event]:
        return self

    async def aclose(self) -> None:
        """Close the subscription; repeated calls are no-ops."""
        if not self._closed:
            # Mark closed first so a re-entrant call cannot run _aclose twice.
            self._closed = True
            await self._aclose()

    @abc.abstractmethod
    async def __anext__(self) -> Event:  # pragma: no cover - abstract
        raise NotImplementedError

    @abc.abstractmethod
    async def _aclose(self) -> None:  # pragma: no cover - abstract
        raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
|
class BaseBus(abc.ABC):
    """Abstract pub/sub transport.

    Contract for implementations: ``connect()`` and ``close()`` may each be
    awaited repeatedly without harm, and both ``publish`` and ``subscribe``
    raise :class:`RuntimeError` once the bus has been closed.

    The bus is also an async context manager: entering awaits ``connect()``
    and leaving awaits ``close()``.
    """

    @abc.abstractmethod
    async def connect(self) -> None:
        """Acquire whatever transport resources the bus needs. Idempotent."""

    @abc.abstractmethod
    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Send *payload* on *topic* without waiting for any consumer.

        Delivery is at-most-once. Implementations are expected to log
        transport errors and return rather than raise, so that a lost
        notification never takes a worker down (the DB stays the source
        of truth).
        """

    @abc.abstractmethod
    def subscribe(self, pattern: str) -> Subscription:
        """Open a :class:`Subscription` for events whose topic matches *pattern*.

        Patterns use NATS-style wildcards: ``*`` stands for exactly one
        topic token and ``>`` for one or more trailing tokens, e.g.

        * ``topology.*.mutation.applied`` — ``applied`` events of every
          topology.
        * ``topology.abc123.mutation.*`` — every mutation state of one
          topology.
        * ``topology.>`` — everything below the ``topology`` root.
        """

    @abc.abstractmethod
    async def close(self) -> None:
        """Release transport resources. Idempotent."""

    async def __aenter__(self) -> "BaseBus":
        await self.connect()
        return self

    async def __aexit__(self, *exc: Any) -> None:
        await self.close()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Wildcard matching shared across in-process transports ───────────────────
|
||||||
|
|
||||||
|
def matches(pattern: str, topic: str) -> bool:
    """Tell whether *topic* satisfies the dot-separated wildcard *pattern*.

    Tokens are compared position by position:

    * ``*`` accepts exactly one token, which must be non-empty;
    * ``>`` ends the comparison and succeeds if the topic still has at
      least one token at that position (``topology.>`` matches
      ``topology.abc.x`` but not bare ``topology``);
    * any other token must equal the topic token literally.

    Without a ``>``, pattern and topic must have the same number of tokens.
    """
    wanted = pattern.split(".")
    actual = topic.split(".")
    available = len(actual)
    for position, token in enumerate(wanted):
        if token == ">":
            # Needs something left in the topic to swallow.
            return position < available
        if position >= available:
            return False
        seen = actual[position]
        token_ok = bool(seen) if token == "*" else token == seen
        if not token_ok:
            return False
    return len(wanted) == available
|
||||||
|
|
||||||
|
|
||||||
|
# Sentinel used by the in-process transports to signal "no more events"
|
||||||
|
# through the asyncio.Queue fan-out without inventing a separate control
|
||||||
|
# channel. Not part of the wire protocol.
|
||||||
|
_CLOSE_SENTINEL: Any = object()
|
||||||
|
|
||||||
|
|
||||||
|
async def _next_or_stop(queue: "asyncio.Queue[Any]") -> Event:
    """Await the next queue item; a close sentinel ends the iteration.

    Raises:
        StopAsyncIteration: the item taken from *queue* is ``_CLOSE_SENTINEL``.
    """
    popped = await queue.get()
    if popped is not _CLOSE_SENTINEL:
        return popped
    raise StopAsyncIteration
|
||||||
85
decnet/bus/factory.py
Normal file
85
decnet/bus/factory.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""Bus factory — selects a :class:`~decnet.bus.base.BaseBus` implementation.
|
||||||
|
|
||||||
|
Dispatch key: the ``DECNET_BUS_TYPE`` environment variable.
|
||||||
|
|
||||||
|
* ``unix`` (default) → :class:`~decnet.bus.unix_client.UnixSocketBus`
|
||||||
|
* ``fake`` → :class:`~decnet.bus.fake.FakeBus` (in-process)
|
||||||
|
|
||||||
|
If ``DECNET_BUS_ENABLED`` is ``"false"`` the factory short-circuits to
|
||||||
|
:class:`~decnet.bus.fake.NullBus` regardless of ``DECNET_BUS_TYPE`` — a
|
||||||
|
cheap way for dev environments to run workers without a bus daemon.
|
||||||
|
|
||||||
|
Mirrors :mod:`decnet.web.db.factory` (lazy imports inside each branch,
|
||||||
|
env-driven dispatch, optional telemetry wrapping). Callers MUST use
|
||||||
|
:func:`get_bus` rather than instantiating transports directly.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
|
||||||
|
|
||||||
|
def get_bus(**kwargs: Any) -> BaseBus:
    """Build the bus implementation chosen by the environment.

    * ``DECNET_BUS_ENABLED`` equal to ``"false"`` (any case) → a ``NullBus``,
      returned directly; *kwargs* are ignored.
    * ``DECNET_BUS_TYPE`` = ``unix`` (the default) → ``UnixSocketBus``. The
      ``socket_path`` keyword, when given and truthy, wins over
      ``_default_socket_path()``; remaining keywords are forwarded.
    * ``DECNET_BUS_TYPE`` = ``fake`` → ``FakeBus(**kwargs)``.

    The unix and fake buses are passed through ``_maybe_wrap_telemetry``
    before being returned. Transport modules are imported lazily, only in
    the branch that needs them.

    Raises:
        ValueError: ``DECNET_BUS_TYPE`` names an unknown transport.
    """
    enabled_flag = os.environ.get("DECNET_BUS_ENABLED", "true")
    if enabled_flag.lower() == "false":
        from decnet.bus.fake import NullBus
        return NullBus()

    selected = os.environ.get("DECNET_BUS_TYPE", "unix").lower()

    if selected == "fake":
        from decnet.bus.fake import FakeBus
        return _maybe_wrap_telemetry(FakeBus(**kwargs))

    if selected != "unix":
        raise ValueError(f"Unsupported bus type: {selected}")

    from decnet.bus.unix_client import UnixSocketBus
    socket_path = kwargs.pop("socket_path", None) or _default_socket_path()
    unix_bus: BaseBus = UnixSocketBus(socket_path=socket_path, **kwargs)
    return _maybe_wrap_telemetry(unix_bus)
|
||||||
|
|
||||||
|
|
||||||
|
def _default_socket_path() -> str:
|
||||||
|
"""Return the bus socket path honoring ``DECNET_BUS_SOCKET`` and falling
|
||||||
|
back to ``/run/decnet/bus.sock`` → ``~/.decnet/bus.sock``.
|
||||||
|
|
||||||
|
The runtime path (``/run/decnet``) is preferred because systemd
|
||||||
|
``RuntimeDirectory=decnet`` sets it up with the right perms; the home
|
||||||
|
fallback keeps dev boxes usable without systemd.
|
||||||
|
"""
|
||||||
|
explicit = os.environ.get("DECNET_BUS_SOCKET")
|
||||||
|
if explicit:
|
||||||
|
return explicit
|
||||||
|
|
||||||
|
runtime_dir = "/run/decnet"
|
||||||
|
if os.path.isdir(runtime_dir) and os.access(runtime_dir, os.W_OK):
|
||||||
|
return f"{runtime_dir}/bus.sock"
|
||||||
|
return os.path.expanduser("~/.decnet/bus.sock")
|
||||||
|
|
||||||
|
|
||||||
|
def _maybe_wrap_telemetry(bus: BaseBus) -> BaseBus:
    """Return ``decnet.telemetry.wrap_repository(bus)``, or *bus* on any failure.

    *bus* is handed back unchanged when ``wrap_repository`` cannot be
    imported or when calling it raises.

    NOTE(review): whether wrapping is a no-op with telemetry disabled is
    decided inside ``wrap_repository`` — not visible from here.
    """
    try:
        from decnet.telemetry import wrap_repository  # type: ignore[attr-defined]
    except ImportError:
        return bus
    try:
        wrapped = wrap_repository(bus)
    except Exception:  # pragma: no cover - defensive
        return bus
    return wrapped
|
||||||
183
decnet/bus/fake.py
Normal file
183
decnet/bus/fake.py
Normal file
@@ -0,0 +1,183 @@
|
|||||||
|
"""In-process bus transports.
|
||||||
|
|
||||||
|
* :class:`FakeBus` — real pub/sub semantics without touching a socket. Used
|
||||||
|
by unit tests and anywhere ``DECNET_BUS_TYPE=fake`` is set. Lets code
|
||||||
|
that depends on the bus be exercised entirely inside a single event loop,
|
||||||
|
matching the DECNET testing convention of not opening real network
|
||||||
|
sockets from unit tests.
|
||||||
|
* :class:`NullBus` — no-op. Returned by :func:`~decnet.bus.factory.get_bus`
|
||||||
|
when ``DECNET_BUS_ENABLED=false`` so workers can start cleanly in dev
|
||||||
|
environments where no bus daemon is running. Publishes are dropped;
|
||||||
|
subscriptions yield nothing and close cleanly.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from decnet.bus.base import (
|
||||||
|
BaseBus,
|
||||||
|
Event,
|
||||||
|
Subscription,
|
||||||
|
_CLOSE_SENTINEL,
|
||||||
|
matches,
|
||||||
|
)
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
|
log = get_logger("bus.fake")
|
||||||
|
|
||||||
|
# Per-subscriber bounded queue: backpressure policy is drop-oldest so a slow
|
||||||
|
# consumer cannot stall publishers (the invariant — DB is the source of
|
||||||
|
# truth — makes dropped events acceptable).
|
||||||
|
_DEFAULT_QUEUE_SIZE = 1024
|
||||||
|
|
||||||
|
|
||||||
|
# ─── FakeBus ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class _FakeSubscription(Subscription):
    """Queue-backed subscription handed out by :meth:`FakeBus.subscribe`.

    Events are pushed onto the queue by :meth:`FakeBus.publish`; iteration
    pops them one at a time.  Closing removes the subscription from its
    owning bus.
    """

    def __init__(self, bus: "FakeBus", pattern: str, queue: "asyncio.Queue[Any]") -> None:
        super().__init__(pattern)
        self._queue = queue
        self._bus = bus

    async def __anext__(self) -> Event:
        # Stop immediately once closed; otherwise wait for the next item and
        # treat the close sentinel as end-of-stream.
        if not self._closed:
            nxt = await self._queue.get()
            if nxt is not _CLOSE_SENTINEL:
                return nxt
        raise StopAsyncIteration

    async def _aclose(self) -> None:
        # Detach first so no further publishes are routed to this queue.
        self._bus._unregister(self)
        # Wake a consumer that may be parked in ``__anext__``.  A parked
        # consumer implies an empty queue, so a full queue can be ignored.
        try:
            self._queue.put_nowait(_CLOSE_SENTINEL)
        except asyncio.QueueFull:
            pass
|
||||||
|
|
||||||
|
|
||||||
|
class FakeBus(BaseBus):
    """In-process pub/sub.

    Publishes iterate every active subscription and enqueue the event on
    the ones whose pattern matches the topic.  If a subscriber's queue is
    full, the oldest event is discarded to make room — same at-most-once
    semantics as the real UNIX-socket transport.

    Closing the bus delivers a close sentinel to every live subscription.
    The sentinel is *always* delivered: if a subscriber's queue is full the
    oldest queued item is evicted for it, otherwise a consumer would drain
    its backlog and then block forever on an empty queue.
    """

    def __init__(self, queue_size: int = _DEFAULT_QUEUE_SIZE) -> None:
        self._queue_size = queue_size
        self._subs: list[_FakeSubscription] = []
        self._connected = False
        self._closed = False
        self._lock = asyncio.Lock()

    async def connect(self) -> None:
        """Mark the bus as connected; there is no real transport to open."""
        self._connected = True

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Fan *payload* out to every subscription whose pattern matches *topic*.

        Raises:
            RuntimeError: if the bus has already been closed.
        """
        if self._closed:
            raise RuntimeError("publish on closed bus")
        event = Event(topic=topic, payload=payload, type=event_type)
        # Snapshot the matching subscribers under the lock, deliver outside it.
        async with self._lock:
            targets = [s for s in self._subs if matches(s.pattern, topic)]
        for sub in targets:
            _enqueue_drop_oldest(sub._queue, event)

    def subscribe(self, pattern: str) -> Subscription:
        """Register and return a new subscription for *pattern*.

        Raises:
            RuntimeError: if the bus has already been closed.
        """
        if self._closed:
            raise RuntimeError("subscribe on closed bus")
        queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=self._queue_size)
        sub = _FakeSubscription(self, pattern, queue)
        self._subs.append(sub)
        return sub

    def _unregister(self, sub: _FakeSubscription) -> None:
        """Forget *sub*; a subscription that is already gone is not an error."""
        try:
            self._subs.remove(sub)
        except ValueError:
            pass

    @staticmethod
    def _deliver_close(queue: "asyncio.Queue[Any]") -> None:
        """Put the close sentinel on *queue*, evicting the oldest item if full."""
        try:
            queue.put_nowait(_CLOSE_SENTINEL)
            return
        except asyncio.QueueFull:
            pass
        # Full queue: make room by dropping the oldest entry, then retry.
        try:
            queue.get_nowait()
        except asyncio.QueueEmpty:
            pass
        try:
            queue.put_nowait(_CLOSE_SENTINEL)
        except asyncio.QueueFull:
            # Not expected (no await between eviction and retry); stay
            # best-effort rather than raising out of close().
            pass

    async def close(self) -> None:
        """Close the bus and end every live subscription.  Idempotent."""
        if self._closed:
            return
        self._closed = True
        # Wake every still-open subscription so iterators unblock cleanly,
        # including ones whose queue is currently full.
        for sub in list(self._subs):
            self._deliver_close(sub._queue)
        self._subs.clear()
|
||||||
|
|
||||||
|
|
||||||
|
def _enqueue_drop_oldest(queue: "asyncio.Queue[Any]", event: Event) -> None:
|
||||||
|
"""Put *event* on *queue*, dropping the oldest item if the queue is full.
|
||||||
|
|
||||||
|
Factored out so both FakeBus and the real UNIX server share the exact
|
||||||
|
same backpressure policy.
|
||||||
|
"""
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
queue.put_nowait(event)
|
||||||
|
return
|
||||||
|
except asyncio.QueueFull:
|
||||||
|
try:
|
||||||
|
dropped = queue.get_nowait()
|
||||||
|
log.warning(
|
||||||
|
"bus.fake: subscriber queue full, dropped %s", getattr(dropped, "topic", "?")
|
||||||
|
)
|
||||||
|
except asyncio.QueueEmpty:
|
||||||
|
return
|
||||||
|
|
||||||
|
|
||||||
|
# ─── NullBus ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
class _NullSubscription(Subscription):
    """Empty subscription: iteration ends on the very first step."""

    async def __anext__(self) -> Event:
        # There is never anything to deliver.
        raise StopAsyncIteration

    async def _aclose(self) -> None:
        # Nothing was registered anywhere, so there is nothing to undo.
        return None
|
||||||
|
|
||||||
|
|
||||||
|
class NullBus(BaseBus):
    """Bus that does nothing, used when ``DECNET_BUS_ENABLED=false``.

    Every publish is discarded and every subscription is empty, so code
    written against the bus API runs unchanged in environments where no
    bus daemon exists, and nothing can ever block waiting on it.
    """

    async def connect(self) -> None:
        """No transport to open."""
        return None

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Discard the event."""
        return None

    def subscribe(self, pattern: str) -> Subscription:
        """Return a subscription that never yields anything."""
        return _NullSubscription(pattern)

    async def close(self) -> None:
        """Nothing to release."""
        return None
|
||||||
144
decnet/bus/protocol.py
Normal file
144
decnet/bus/protocol.py
Normal file
@@ -0,0 +1,144 @@
|
|||||||
|
"""Wire protocol for the DECNET bus UNIX-socket transport.
|
||||||
|
|
||||||
|
Frame layout:
|
||||||
|
|
||||||
|
<VERB> [<args ...>]\\n # ASCII header, single line, no trailing space
|
||||||
|
<4-byte big-endian body length>
|
||||||
|
<body> # orjson-serialized dict, or empty (length 0)
|
||||||
|
|
||||||
|
Verbs:
|
||||||
|
|
||||||
|
* ``HELLO <client-name>`` — optional greeting, logged by server. Body empty.
|
||||||
|
* ``PUB <topic>`` — publisher → server. Body = payload dict.
|
||||||
|
* ``SUB <pattern>`` — subscriber → server. Body empty.
|
||||||
|
* ``UNSUB <pattern>`` — subscriber → server. Body empty.
|
||||||
|
* ``EVT <topic>`` — server → subscriber. Body = payload dict (wrapped
|
||||||
|
in an :class:`~decnet.bus.base.Event` envelope).
|
||||||
|
* ``BYE`` — either direction. Body empty. Graceful shutdown.
|
||||||
|
|
||||||
|
Parsing rules:
|
||||||
|
|
||||||
|
* The header is a single line terminated by ``\\n`` (LF). ``\\r`` is tolerated
|
||||||
|
but not required.
|
||||||
|
* Header tokens are whitespace-separated. The first token is the verb;
|
||||||
|
everything after is verb-specific. We split on the first space only so
|
||||||
|
topics / patterns with quoted content are not supported (they are not
|
||||||
|
needed — topic segments forbid whitespace per :mod:`decnet.bus.topics`).
|
||||||
|
* Maximum header length is 4096 bytes; maximum body length is 1 MiB. Beyond
|
||||||
|
those, the connection is dropped with a logged error. This is a honeypot
|
||||||
|
framework, not a general-purpose message broker; a malformed frame is
|
||||||
|
treated as hostile.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import struct
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import orjson
|
||||||
|
|
||||||
|
MAX_HEADER_BYTES = 4096
|
||||||
|
MAX_BODY_BYTES = 1 * 1024 * 1024 # 1 MiB
|
||||||
|
|
||||||
|
# Verb constants (callers should reference these, not bare strings).
|
||||||
|
HELLO = "HELLO"
|
||||||
|
PUB = "PUB"
|
||||||
|
SUB = "SUB"
|
||||||
|
UNSUB = "UNSUB"
|
||||||
|
EVT = "EVT"
|
||||||
|
BYE = "BYE"
|
||||||
|
|
||||||
|
_VALID_VERBS = frozenset({HELLO, PUB, SUB, UNSUB, EVT, BYE})
|
||||||
|
|
||||||
|
|
||||||
|
class ProtocolError(Exception):
    """Raised for a malformed or oversized frame.

    Callers are expected to close the connection when they see this.
    """
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Frame:
    """One parsed wire frame (immutable).

    The body is kept as raw bytes: the protocol layer does not know whether
    a given verb carries a JSON dict or nothing at all, so decoding is left
    to the caller.
    """

    # Verb token from the start of the header line.
    verb: str
    # Remainder of the header line after the verb, trimmed.
    args: str
    # Undecoded body bytes; empty when the frame has no body.
    body: bytes
|
||||||
|
|
||||||
|
|
||||||
|
def encode(verb: str, args: str = "", body: dict[str, Any] | None = None) -> bytes:
    """Build the wire bytes for one frame.

    The result is the ASCII header line (``<verb> <args>`` with trailing
    whitespace removed, then ``\\n``), a 4-byte big-endian body length, and
    the body itself.  *body* is orjson-encoded when given; ``None`` produces
    a zero-length body.

    Raises:
        ProtocolError: for an unknown verb, *args* containing ``\\n`` or
            ``\\r``, a body larger than ``MAX_BODY_BYTES``, or a header
            larger than ``MAX_HEADER_BYTES``.
    """
    if verb not in _VALID_VERBS:
        raise ProtocolError(f"unknown verb {verb!r}")
    if any(newline in args for newline in ("\n", "\r")):
        raise ProtocolError("args must not contain newline characters")

    payload = orjson.dumps(body) if body is not None else b""
    payload_len = len(payload)
    if payload_len > MAX_BODY_BYTES:
        raise ProtocolError(
            f"body {payload_len} bytes exceeds max {MAX_BODY_BYTES}"
        )

    # rstrip() removes the separator space when args is empty.
    head = (f"{verb} {args}".rstrip() + "\n").encode("ascii")
    head_len = len(head)
    if head_len > MAX_HEADER_BYTES:
        raise ProtocolError(
            f"header {head_len} bytes exceeds max {MAX_HEADER_BYTES}"
        )
    return b"".join((head, struct.pack(">I", payload_len), payload))
|
||||||
|
|
||||||
|
|
||||||
|
async def read_frame(reader: asyncio.StreamReader) -> Frame | None:
    """Read one frame from *reader*.

    Returns ``None`` on clean EOF before a new frame starts (no bytes of a
    header were received).  Every other way the input can be wrong is
    reported as :class:`ProtocolError` so the caller has exactly one
    exception to handle before closing the connection.

    Raises:
        ProtocolError: the stream ended mid-header, mid-length or mid-body;
            the header overran the reader's buffer limit or
            ``MAX_HEADER_BYTES``; the header is empty or not ASCII; the
            verb is unknown; or the declared body length exceeds
            ``MAX_BODY_BYTES``.
    """
    try:
        header = await reader.readuntil(b"\n")
    except asyncio.IncompleteReadError as exc:
        if not exc.partial:
            return None
        raise ProtocolError("connection closed mid-header") from exc
    except asyncio.LimitOverrunError as exc:
        raise ProtocolError("header exceeded buffer limit") from exc

    if len(header) > MAX_HEADER_BYTES:
        raise ProtocolError(f"header {len(header)} bytes exceeds max")

    try:
        line = header.rstrip(b"\r\n").decode("ascii", errors="strict")
    except UnicodeDecodeError as exc:
        # Non-ASCII header bytes are malformed input, not a decoding crash.
        raise ProtocolError("header is not valid ASCII") from exc
    if not line:
        raise ProtocolError("empty header line")

    # Split on the first space only: the verb, then verb-specific args.
    verb, _, args = line.partition(" ")
    if verb not in _VALID_VERBS:
        raise ProtocolError(f"unknown verb {verb!r}")

    try:
        length_bytes = await reader.readexactly(4)
    except asyncio.IncompleteReadError as exc:
        raise ProtocolError("connection closed mid-frame (body length)") from exc
    (body_len,) = struct.unpack(">I", length_bytes)
    if body_len > MAX_BODY_BYTES:
        raise ProtocolError(f"body length {body_len} exceeds max")

    try:
        body = await reader.readexactly(body_len) if body_len else b""
    except asyncio.IncompleteReadError as exc:
        raise ProtocolError("connection closed mid-frame (body)") from exc
    return Frame(verb=verb, args=args.strip(), body=body)
|
||||||
|
|
||||||
|
|
||||||
|
def decode_body(body: bytes) -> dict[str, Any]:
    """Parse a frame body into a dict.

    An empty body yields ``{}``.  Anything else must be JSON whose top-level
    value is an object.

    Raises:
        ProtocolError: if the bytes are not valid JSON, or decode to
            something other than a dict.
    """
    if body:
        try:
            parsed = orjson.loads(body)
        except orjson.JSONDecodeError as exc:
            raise ProtocolError(f"body is not valid JSON: {exc}") from exc
        if isinstance(parsed, dict):
            return parsed
        raise ProtocolError(f"body must be a JSON object, got {type(parsed).__name__}")
    return {}
|
||||||
211
decnet/bus/publish.py
Normal file
211
decnet/bus/publish.py
Normal file
@@ -0,0 +1,211 @@
|
|||||||
|
"""Fire-and-forget publish helpers shared across every worker.
|
||||||
|
|
||||||
|
Lifted out of ``decnet/mutator/engine.py`` once a second caller showed up
|
||||||
|
(DEBT-031). Keeping one implementation means the "never break the worker
|
||||||
|
loop" guarantee is audited in exactly one place.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
import os
|
||||||
|
import signal
|
||||||
|
import time
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
from decnet.bus import topics as _topics
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
|
log = get_logger("bus.publish")
|
||||||
|
|
||||||
|
|
||||||
|
async def publish_safely(
    bus: BaseBus | None,
    topic: str,
    payload: dict[str, Any],
    event_type: str = "",
) -> None:
    """Publish on *bus*, turning any failure into a logged warning.

    Callers use this after their real side-effect (e.g. a DB write) is
    already done: the bus only carries the notification, so losing one is
    tolerable while an exception escaping into the worker loop is not.

    A ``None`` bus makes this a no-op.
    """
    if bus is not None:
        try:
            await bus.publish(topic, payload, event_type=event_type)
        except Exception as exc:  # noqa: BLE001
            log.warning("bus publish failed topic=%s: %s", topic, exc)
|
||||||
|
|
||||||
|
|
||||||
|
def make_thread_safe_publisher(
    bus: BaseBus | None,
    loop: asyncio.AbstractEventLoop,
) -> Callable[[str, dict[str, Any], str], None]:
    """Build a sync callable that marshals publishes back to *loop*.

    Workers that run their hot paths in a worker thread (scapy sniff loop,
    ``asyncio.to_thread`` probes, blocking socket reads) cannot ``await``
    the bus directly.  The returned function schedules
    :func:`publish_safely` on *loop* via ``run_coroutine_threadsafe`` and
    returns immediately — the calling thread never waits on the publish.

    A ``None`` bus yields a no-op callable.

    The returned callable never raises: if scheduling fails (for example
    because *loop* is already closed) the failure is logged at debug level
    and the unscheduled coroutine is closed so it does not trigger a
    "coroutine was never awaited" warning.
    """
    if bus is None:
        return lambda _topic, _payload, _event_type="": None

    def _publish(topic: str, payload: dict[str, Any], event_type: str = "") -> None:
        # Stream threads may keep draining after the bus owner closed it
        # (shutdown race).  Short-circuit here so we don't marshal a
        # coroutine onto a dead loop just to have publish_safely swallow
        # it.  If _closed flips between this check and the coroutine
        # actually running, publish_safely still absorbs the failure.
        if getattr(bus, "_closed", False):
            return
        coro = None
        try:
            coro = publish_safely(bus, topic, payload, event_type=event_type)
            asyncio.run_coroutine_threadsafe(coro, loop)
        except Exception as exc:  # noqa: BLE001
            if coro is not None:
                # Scheduling failed, so nothing will ever await this
                # coroutine; close it explicitly.
                coro.close()
            log.debug("cross-thread bus publish failed topic=%s: %s", topic, exc)

    return _publish
|
||||||
|
|
||||||
|
|
||||||
|
async def run_health_heartbeat(
    bus: BaseBus | None,
    worker: str,
    *,
    interval: float = 30.0,
    extra: Callable[[], dict[str, Any]] | None = None,
) -> None:
    """Emit a health event for *worker* every *interval* seconds, forever.

    The topic comes from ``_topics.system_health(worker)``.  Each tick
    publishes ``{"worker": <name>, "ts": <time.time()>}`` merged with
    whatever *extra* returns; a failing *extra* is logged at debug level
    and that tick goes out without the extra fields.

    Publishing goes through :func:`publish_safely`, so a ``None`` bus
    leaves only the sleep cycle running.

    Cancellation-safe: ``CancelledError`` is absorbed, so awaiting the
    cancelled task returns normally.
    """
    topic = _topics.system_health(worker)
    try:
        while True:
            beat: dict[str, Any] = {"worker": worker, "ts": time.time()}
            if extra is not None:
                try:
                    beat.update(extra())
                except Exception as exc:  # noqa: BLE001
                    log.debug("heartbeat extra() failed worker=%s: %s", worker, exc)
            await publish_safely(bus, topic, beat, event_type=_topics.SYSTEM_HEALTH)
            await asyncio.sleep(interval)
    except asyncio.CancelledError:
        return
|
||||||
|
|
||||||
|
|
||||||
|
async def run_control_listener(
    bus: BaseBus | None,
    worker: str,
    shutdown: asyncio.Event,
) -> None:
    """Subscribe to ``system.<worker>.control`` and honour stop intents.

    On a well-formed ``{"action": "stop", ...}`` message the function sets
    *shutdown* and returns — the worker's main loop is expected to check
    the event and unwind cleanly, matching the SIGTERM path.

    Malformed messages are logged at debug level and skipped without
    ending the listener: a payload that is not a dict, or one whose
    ``action`` is missing or unknown.  An exception from the transport
    itself ends the listener with a warning.

    A ``None`` bus yields a coroutine that simply awaits *shutdown* —
    callers can ``create_task`` this unconditionally regardless of bus
    state.

    Cancellation-safe.
    """
    if bus is None:
        with contextlib.suppress(asyncio.CancelledError):
            await shutdown.wait()
        return

    topic = _topics.system_control(worker)
    with contextlib.suppress(asyncio.CancelledError):
        try:
            async with bus.subscribe(topic) as sub:
                async for event in sub:
                    payload = event.payload or {}
                    if not isinstance(payload, dict):
                        # Without this guard ``payload.get`` would raise and
                        # the outer handler would kill the whole listener.
                        log.debug(
                            "control: ignoring non-dict payload worker=%s type=%s",
                            worker, type(payload).__name__,
                        )
                        continue
                    action = payload.get("action")
                    requested_by = payload.get("requested_by", "<unknown>")
                    if action == _topics.WORKER_CONTROL_STOP:
                        log.info(
                            "control: stop requested worker=%s by=%s",
                            worker, requested_by,
                        )
                        shutdown.set()
                        return
                    log.debug(
                        "control: ignoring unknown action worker=%s action=%r",
                        worker, action,
                    )
        except Exception as exc:  # noqa: BLE001
            log.warning(
                "control listener failed worker=%s: %s — shutdown via bus disabled",
                worker, exc,
            )
|
||||||
|
|
||||||
|
|
||||||
|
async def run_control_listener_signal(
    bus: BaseBus | None,
    worker: str,
) -> None:
    """Like :func:`run_control_listener` but signals the process on stop.

    Preferred for workers whose main loop is a blocking thread
    (container-log tail, PTY read, scapy sniff) — wiring an
    ``asyncio.Event`` through the thread boundary is error-prone, so a
    stop intent is turned into ``SIGTERM`` sent to this process, reusing
    the existing SIGTERM cleanup path.

    Messages whose payload is not a dict, or whose ``action`` is missing
    or unknown, are logged at debug level and skipped.  A ``None`` bus
    returns immediately.

    Cancellation-safe.  Never raises: a failed self-signal is logged
    and the loop simply exits (admin can fall back to ``systemctl``).
    """
    if bus is None:
        return

    topic = _topics.system_control(worker)
    with contextlib.suppress(asyncio.CancelledError):
        try:
            async with bus.subscribe(topic) as sub:
                async for event in sub:
                    payload = event.payload or {}
                    if not isinstance(payload, dict):
                        # A non-dict payload would make ``payload.get`` raise
                        # and tear the listener down; skip it instead.
                        log.debug(
                            "control: ignoring non-dict payload worker=%s type=%s",
                            worker, type(payload).__name__,
                        )
                        continue
                    action = payload.get("action")
                    requested_by = payload.get("requested_by", "<unknown>")
                    if action == _topics.WORKER_CONTROL_STOP:
                        log.info(
                            "control: stop requested worker=%s by=%s → SIGTERM self",
                            worker, requested_by,
                        )
                        try:
                            os.kill(os.getpid(), signal.SIGTERM)
                        except Exception as exc:  # noqa: BLE001
                            log.warning(
                                "control: self-signal failed worker=%s: %s",
                                worker, exc,
                            )
                        return
                    log.debug(
                        "control: ignoring unknown action worker=%s action=%r",
                        worker, action,
                    )
        except Exception as exc:  # noqa: BLE001
            log.warning(
                "control signal listener failed worker=%s: %s",
                worker, exc,
            )
|
||||||
398
decnet/bus/topics.py
Normal file
398
decnet/bus/topics.py
Normal file
@@ -0,0 +1,398 @@
|
|||||||
|
"""Canonical topic hierarchy for the DECNET ServiceBus.
|
||||||
|
|
||||||
|
Locked early so consumers can subscribe with stable wildcard patterns.
|
||||||
|
Adding new topic families is fine; **renaming** existing ones is a breaking
|
||||||
|
change for every subscriber and requires a coordinated rollout.
|
||||||
|
|
||||||
|
Token structure (NATS-style, dot-separated):
|
||||||
|
|
||||||
|
topology.{topology_id}.mutation.{state}
|
||||||
|
topology.{topology_id}.status
|
||||||
|
decky.{decky_id}.state
|
||||||
|
decky.{decky_id}.traffic
|
||||||
|
orchestrator.traffic.{decky_id}
|
||||||
|
orchestrator.file.{decky_id}
|
||||||
|
orchestrator.email.{decky_id}
|
||||||
|
attacker.observed
|
||||||
|
attacker.scored
|
||||||
|
attacker.session.started
|
||||||
|
attacker.session.ended
|
||||||
|
identity.formed
|
||||||
|
identity.observation.linked
|
||||||
|
identity.merged
|
||||||
|
identity.unmerged
|
||||||
|
identity.campaign.assigned
|
||||||
|
campaign.formed
|
||||||
|
campaign.identity.assigned
|
||||||
|
campaign.merged
|
||||||
|
campaign.unmerged
|
||||||
|
credential.captured
|
||||||
|
credential.reuse.detected
|
||||||
|
canary.{token_id}.triggered
|
||||||
|
canary.{token_id}.placed
|
||||||
|
canary.{token_id}.revoked
|
||||||
|
system.log
|
||||||
|
system.bus.health
|
||||||
|
system.{worker}.health
|
||||||
|
|
||||||
|
Wildcards (per :func:`decnet.bus.base.matches`):
|
||||||
|
|
||||||
|
* ``*`` matches exactly one token.
|
||||||
|
* ``>`` matches one-or-more trailing tokens (so ``topology.>`` matches
|
||||||
|
``topology.abc.status`` but not the bare root ``topology``).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
# ─── Root prefixes ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
TOPOLOGY = "topology"
|
||||||
|
DECKY = "decky"
|
||||||
|
ATTACKER = "attacker"
|
||||||
|
IDENTITY = "identity"
|
||||||
|
CAMPAIGN = "campaign"
|
||||||
|
SYSTEM = "system"
|
||||||
|
CREDENTIAL = "credential"
|
||||||
|
ORCHESTRATOR = "orchestrator"
|
||||||
|
CANARY = "canary"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Leaf event-type constants (the last segment of each topic) ──────────────
|
||||||
|
|
||||||
|
# Topology mutation lifecycle states — keep in sync with TopologyMutation.state
|
||||||
|
# in decnet/web/db/models.py; the bus topic mirrors the DB state machine.
|
||||||
|
MUTATION_ENQUEUED = "enqueued"
|
||||||
|
MUTATION_APPLYING = "applying"
|
||||||
|
MUTATION_APPLIED = "applied"
|
||||||
|
MUTATION_FAILED = "failed"
|
||||||
|
|
||||||
|
# Topology-level status transitions (topology.{id}.status): fires when the
|
||||||
|
# topology row's status column changes (pending/deploying/active/degraded/failed).
|
||||||
|
TOPOLOGY_STATUS = "status"
|
||||||
|
|
||||||
|
# Decky-level event types (third token, after the decky id: ``decky.<id>.<event_type>``).
|
||||||
|
DECKY_STATE = "state"
|
||||||
|
DECKY_TRAFFIC = "traffic"
|
||||||
|
# On-demand mutation request — published by the API/CLI/UI, consumed by
|
||||||
|
# the mutator's watch loop to force an immediate mutation of one decky
|
||||||
|
# without waiting for its scheduled interval. Underscored (not dotted)
|
||||||
|
# to stay a single NATS token so the builder's validator accepts it.
|
||||||
|
DECKY_MUTATE_REQUEST = "mutate_request"
|
||||||
|
# Mutation transition event — distinct from DECKY_STATE ("current
|
||||||
|
# shape") because a mutation is a *transition* that carries old/new
|
||||||
|
# services + trigger + timing. Correlator consumes these (via the
|
||||||
|
# syslog sidechannel too) to interleave substrate-change markers into
|
||||||
|
# attacker traversals.
|
||||||
|
DECKY_MUTATION = "mutation"
|
||||||
|
|
||||||
|
# Attacker event types (second token under the ``attacker`` root). First
|
||||||
|
# sighting, session boundary transitions, and score-threshold crossings
|
||||||
|
# published by correlator + profiler. Consumers typically subscribe to
|
||||||
|
# the wildcard ``attacker.>``.
|
||||||
|
ATTACKER_OBSERVED = "observed"
|
||||||
|
ATTACKER_SCORED = "scored"
|
||||||
|
# Published once per successful active probe result (JARM/HASSH/TCPfp).
|
||||||
|
# Distinct from ``observed`` which is the correlator's first-sight signal —
|
||||||
|
# a fingerprint is additional evidence about an already-observed attacker.
|
||||||
|
ATTACKER_FINGERPRINTED = "fingerprinted"
|
||||||
|
ATTACKER_SESSION_STARTED = "session.started"
|
||||||
|
ATTACKER_SESSION_ENDED = "session.ended"
|
||||||
|
# Published by the ``decnet enrich`` worker after an enrichment pass
|
||||||
|
# succeeds for an attacker IP (one or more 3rd-party intel providers
|
||||||
|
# returned a verdict). Payload carries the aggregate verdict + per-
|
||||||
|
# provider summary so SIEM-bound webhooks don't need to re-query the DB.
|
||||||
|
ATTACKER_INTEL_ENRICHED = "intel.enriched"
|
||||||
|
|
||||||
|
# Identity-resolution event types (second/third tokens under ``identity``).
|
||||||
|
# Published by the (future) clusterer worker — see
|
||||||
|
# development/IDENTITY_RESOLUTION.md. Constants ship in this commit;
|
||||||
|
# no publishers exist yet, but consumers (webhook worker, dashboard
|
||||||
|
# SSE relay) can subscribe to ``identity.>`` from day one and receive
|
||||||
|
# events the instant the clusterer comes online.
|
||||||
|
#
|
||||||
|
# identity.formed — clusterer creates a new identity from
|
||||||
|
# one or more observations
|
||||||
|
# identity.observation.linked — observation attached to an existing
|
||||||
|
# identity (or reattached from another)
|
||||||
|
# identity.merged — two identities collapsed; loser gets
|
||||||
|
# ``merged_into_uuid`` set, subscribers
|
||||||
|
# re-key cached references to the winner
|
||||||
|
# identity.unmerged — revocable-merge undo: contradicting
|
||||||
|
# evidence cleared ``merged_into_uuid``
|
||||||
|
# and re-split observations. The
|
||||||
|
# resurrected side's UUID is the same
|
||||||
|
# as the prior loser, so subscribers
|
||||||
|
# that cached references to the loser
|
||||||
|
# during the merged interval can
|
||||||
|
# re-attach without a new lookup.
|
||||||
|
#
|
||||||
|
# ``identity.campaign.assigned`` was originally deferred until the campaign
# clusterer shipped; its leaf constant (``IDENTITY_CAMPAIGN_ASSIGNED``) is now
# defined below with the other identity leaves.
|
||||||
|
IDENTITY_FORMED = "formed"
|
||||||
|
IDENTITY_OBSERVATION_LINKED = "observation.linked"
|
||||||
|
IDENTITY_MERGED = "merged"
|
||||||
|
IDENTITY_UNMERGED = "unmerged"
|
||||||
|
# Campaign-clusterer cross-family event — fires under ``identity.>`` so
|
||||||
|
# identity-stream subscribers (e.g. the IdentityDetail SSE client) get
|
||||||
|
# notified the moment an identity's ``campaign_id`` changes without
|
||||||
|
# having to subscribe to the campaign topic family. The same event
|
||||||
|
# fires under ``campaign.identity.assigned`` for campaign-side
|
||||||
|
# subscribers.
|
||||||
|
IDENTITY_CAMPAIGN_ASSIGNED = "campaign.assigned"
|
||||||
|
|
||||||
|
# Campaign-clusterer event types (second/third tokens under
|
||||||
|
# ``campaign``). Mirror of the identity family at the layer above:
|
||||||
|
# campaigns group identities into operations, and the clusterer
|
||||||
|
# publishes the same form / link / merge / unmerge lifecycle.
|
||||||
|
#
|
||||||
|
# campaign.formed — clusterer creates a new campaign from
|
||||||
|
# one or more identities
|
||||||
|
# campaign.identity.assigned — identity attached to an existing
|
||||||
|
# campaign (or reassigned from another)
|
||||||
|
# campaign.merged — two campaigns collapsed; loser gets
|
||||||
|
# ``merged_into_uuid`` set, subscribers
|
||||||
|
# re-key cached references to the winner
|
||||||
|
# campaign.unmerged — revocable-merge undo: contradicting
|
||||||
|
# evidence cleared ``merged_into_uuid``
|
||||||
|
# and re-split identities
|
||||||
|
CAMPAIGN_FORMED = "formed"
|
||||||
|
CAMPAIGN_IDENTITY_ASSIGNED = "identity.assigned"
|
||||||
|
CAMPAIGN_MERGED = "merged"
|
||||||
|
CAMPAIGN_UNMERGED = "unmerged"
|
||||||
|
|
||||||
|
# Credential event types (second/third tokens under ``credential``).
|
||||||
|
# ``credential.captured`` fires once per upserted Credential row — the
|
||||||
|
# correlator listens for it and runs the cred-reuse query in response,
|
||||||
|
# so reuse detection latency is sub-second after a fresh capture.
|
||||||
|
# ``credential.reuse.detected`` fires when the correlator inserts a new
|
||||||
|
# CredentialReuse row or grows an existing one (added decky/service/IP).
|
||||||
|
CREDENTIAL_CAPTURED = "captured"
|
||||||
|
CREDENTIAL_REUSE_DETECTED = "reuse.detected"
|
||||||
|
|
||||||
|
# Canary-token event types (third token under ``canary``).
|
||||||
|
#
|
||||||
|
# canary.{token_id}.placed — orchestrator/API successfully planted a
|
||||||
|
# canary artifact inside a decky's
|
||||||
|
# filesystem (or persisted a passive token
|
||||||
|
# that has no callback wiring). Lets
|
||||||
|
# dashboards reflect baseline coverage in
|
||||||
|
# real time without a DB poll.
|
||||||
|
# canary.{token_id}.triggered — ``decnet canary`` worker observed a
|
||||||
|
# callback hit (HTTP slug or DNS subdomain
|
||||||
|
# lookup) for the token. Payload carries
|
||||||
|
# ``src_ip``, ``user_agent``, ``request_path``
|
||||||
|
# and any DNS qname so downstream
|
||||||
|
# consumers (correlator, webhook fanout)
|
||||||
|
# can attribute and forward without a
|
||||||
|
# follow-up DB read.
|
||||||
|
# canary.{token_id}.revoked — operator removed a token; planter unlinked
|
||||||
|
# the file (best-effort) and the row was
|
||||||
|
# marked ``revoked``. Subscribers may
|
||||||
|
# evict cached lookups by token id.
|
||||||
|
CANARY_PLACED = "placed"
|
||||||
|
CANARY_TRIGGERED = "triggered"
|
||||||
|
CANARY_REVOKED = "revoked"
|
||||||
|
|
||||||
|
# Orchestrator event types (second token under ``orchestrator``). The
|
||||||
|
# orchestrator worker publishes one of these per synthetic action it
|
||||||
|
# drives against a decky — cheap inter-decky traffic and filesystem
|
||||||
|
# mutations whose role is to keep the honeypot from looking suspiciously
|
||||||
|
# static. Always nested with the destination decky uuid as the third
|
||||||
|
# token, so consumers can subscribe to a single decky's life-injection
|
||||||
|
# stream via ``orchestrator.*.<decky_uuid>``.
|
||||||
|
ORCHESTRATOR_TRAFFIC = "traffic"
|
||||||
|
ORCHESTRATOR_FILE = "file"
|
||||||
|
# Emailgen — published by the ``decnet emailgen`` worker once per generated
|
||||||
|
# fake email delivered into a mail decky's maildir. Third token is the
|
||||||
|
# destination mail-decky uuid (the IMAP/POP3 host serving the mailbox),
|
||||||
|
# matching the ``orchestrator.*.<decky_uuid>`` subscription pattern.
|
||||||
|
ORCHESTRATOR_EMAIL = "email"
|
||||||
|
|
||||||
|
# System event types.
|
||||||
|
SYSTEM_LOG = "log"
|
||||||
|
SYSTEM_BUS_HEALTH = "bus.health"
|
||||||
|
# Worker-health leaf — built per-worker as ``system.<worker>.health`` via
|
||||||
|
# :func:`system_health`. The leaf constant stays the same across workers;
|
||||||
|
# the worker name goes in the middle token.
|
||||||
|
SYSTEM_HEALTH = "health"
|
||||||
|
# Worker-control leaf — built per-worker as ``system.<worker>.control`` via
|
||||||
|
# :func:`system_control`. Admin-originated stop intents travel on this
|
||||||
|
# topic; each worker subscribes to its own.
|
||||||
|
SYSTEM_CONTROL = "control"
|
||||||
|
|
||||||
|
# Control payload ``action`` values — the wire vocabulary. Only ``stop`` is
|
||||||
|
# handled in v1; ``start`` is reserved because a stopped worker has no
|
||||||
|
# subscriber, so starting requires external supervision (systemd).
|
||||||
|
WORKER_CONTROL_STOP = "stop"
|
||||||
|
WORKER_CONTROL_START = "start"
|
||||||
|
|
||||||
|
# Webhook subscription-set changed — published by the CRUD router after any
|
||||||
|
# create / update / delete on WebhookSubscription so the webhook worker can
|
||||||
|
# reload its in-memory subscription list and re-subscribe to the new union
|
||||||
|
# of patterns. Payload is currently empty; consumers only need the signal.
|
||||||
|
WEBHOOK_SUBSCRIPTIONS_CHANGED = "system.webhook.subscriptions_changed"
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Builders ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def topology_mutation(topology_id: str, state: str) -> str:
    """Return the topic ``topology.<id>.mutation.<state>``.

    Both arguments are checked with :func:`_reject_tokens` before the topic
    is assembled, so an empty or malformed segment raises ``ValueError``
    instead of producing a broken topic.  *state* is expected to be one of
    the ``MUTATION_*`` constants.
    """
    segments = (topology_id, state)
    _reject_tokens(*segments)
    root = f"{TOPOLOGY}.{topology_id}"
    return f"{root}.mutation.{state}"
|
||||||
|
|
||||||
|
|
||||||
|
def topology_status(topology_id: str) -> str:
    """Return the topic ``topology.<id>.status``.

    *topology_id* is validated as a single topic segment first; a rejected
    id raises ``ValueError``.
    """
    _reject_tokens(topology_id)
    leaf = TOPOLOGY_STATUS
    return f"{TOPOLOGY}.{topology_id}.{leaf}"
|
||||||
|
|
||||||
|
|
||||||
|
def decky(decky_id: str, event_type: str) -> str:
    """Return the topic ``decky.<id>.<event_type>``.

    *event_type* is typically ``DECKY_STATE`` or ``DECKY_TRAFFIC``.  Both
    arguments must be plain segments; anything :func:`_reject_tokens`
    refuses raises ``ValueError``.
    """
    segments = (decky_id, event_type)
    _reject_tokens(*segments)
    root = f"{DECKY}.{decky_id}"
    return f"{root}.{event_type}"
|
||||||
|
|
||||||
|
|
||||||
|
def decky_mutation(decky_id: str) -> str:
    """Return the topic ``decky.<id>.mutation``.

    *decky_id* is validated as a single topic segment first; a rejected id
    raises ``ValueError``.
    """
    _reject_tokens(decky_id)
    leaf = DECKY_MUTATION
    return f"{DECKY}.{decky_id}.{leaf}"
|
||||||
|
|
||||||
|
|
||||||
|
def system(event_type: str) -> str:
    """Return the topic ``system.<event_type>``.

    Only emptiness is checked.  *event_type* is allowed to contain dots
    (e.g. ``bus.health``) because the leaves passed here are already
    constants; this helper merely adds the ``system`` prefix.  A falsy
    *event_type* raises ``ValueError``.
    """
    if event_type:
        return f"{SYSTEM}.{event_type}"
    raise ValueError("system topic requires a non-empty event_type")
|
||||||
|
|
||||||
|
|
||||||
|
def credential(event_type: str) -> str:
    """Return the topic ``credential.<event_type>``.

    *event_type* is typically :data:`CREDENTIAL_CAPTURED` or
    :data:`CREDENTIAL_REUSE_DETECTED`.  As with :func:`system`, dotted
    leaves such as ``reuse.detected`` are accepted; only a falsy value is
    refused, with ``ValueError``.
    """
    if event_type:
        return f"{CREDENTIAL}.{event_type}"
    raise ValueError("credential topic requires a non-empty event_type")
|
||||||
|
|
||||||
|
|
||||||
|
def attacker(event_type: str) -> str:
    """Return the topic ``attacker.<event_type>``.

    *event_type* is typically one of ``ATTACKER_OBSERVED``,
    ``ATTACKER_SCORED``, ``ATTACKER_SESSION_STARTED`` or
    ``ATTACKER_SESSION_ENDED``.  As with :func:`system`, dotted leaves such
    as ``session.started`` are accepted; only a falsy value is refused,
    with ``ValueError``.
    """
    if event_type:
        return f"{ATTACKER}.{event_type}"
    raise ValueError("attacker topic requires a non-empty event_type")
|
||||||
|
|
||||||
|
|
||||||
|
def campaign(event_type: str) -> str:
    """Return the topic ``campaign.<event_type>``.

    *event_type* is typically one of :data:`CAMPAIGN_FORMED`,
    :data:`CAMPAIGN_IDENTITY_ASSIGNED`, :data:`CAMPAIGN_MERGED` or
    :data:`CAMPAIGN_UNMERGED`.  As with :func:`system`, dotted leaves such
    as ``identity.assigned`` are accepted; only a falsy value is refused,
    with ``ValueError``.
    """
    if event_type:
        return f"{CAMPAIGN}.{event_type}"
    raise ValueError("campaign topic requires a non-empty event_type")
|
||||||
|
|
||||||
|
|
||||||
|
def identity(event_type: str) -> str:
    """Return the topic ``identity.<event_type>``.

    *event_type* is typically one of :data:`IDENTITY_FORMED`,
    :data:`IDENTITY_OBSERVATION_LINKED`, :data:`IDENTITY_MERGED` or
    :data:`IDENTITY_UNMERGED`.  As with :func:`system`, dotted leaves such
    as ``observation.linked`` are accepted; only a falsy value is refused,
    with ``ValueError``.
    """
    if event_type:
        return f"{IDENTITY}.{event_type}"
    raise ValueError("identity topic requires a non-empty event_type")
|
||||||
|
|
||||||
|
|
||||||
|
def orchestrator(event_type: str, decky_id: str) -> str:
    """Return the topic ``orchestrator.<event_type>.<decky_id>``.

    *event_type* should be one of the ``ORCHESTRATOR_*`` leaves
    (:data:`ORCHESTRATOR_TRAFFIC`, :data:`ORCHESTRATOR_FILE`,
    :data:`ORCHESTRATOR_EMAIL`).  The destination decky always occupies the
    third token, which lets a per-decky subscriber use the pattern
    ``orchestrator.*.<decky_uuid>``.  Both arguments must be plain
    segments; anything :func:`_reject_tokens` refuses raises
    ``ValueError``.
    """
    segments = (event_type, decky_id)
    _reject_tokens(*segments)
    root = f"{ORCHESTRATOR}.{event_type}"
    return f"{root}.{decky_id}"
|
||||||
|
|
||||||
|
|
||||||
|
def canary(token_id: str, event_type: str) -> str:
    """Return the topic ``canary.<token_id>.<event_type>``.

    *event_type* should be one of :data:`CANARY_PLACED`,
    :data:`CANARY_TRIGGERED` or :data:`CANARY_REVOKED`.  The token id always
    occupies the second token, so a per-token subscriber can use
    ``canary.<token_id>.>`` while fleet-wide consumers (webhook fanout,
    correlator) use ``canary.>``.  Both arguments must be plain segments;
    anything :func:`_reject_tokens` refuses raises ``ValueError``.
    """
    segments = (token_id, event_type)
    _reject_tokens(*segments)
    root = f"{CANARY}.{token_id}"
    return f"{root}.{event_type}"
|
||||||
|
|
||||||
|
|
||||||
|
def system_health(worker: str) -> str:
    """Return the topic ``system.<worker>.health``.

    Worker heartbeats are nested under ``system`` so that a consumer can
    watch every worker with ``system.*.health`` or a single one with, say,
    ``system.mutator.health``.  *worker* is validated as an ordinary
    segment (no dots, wildcards or whitespace); a rejected name raises
    ``ValueError``.
    """
    _reject_tokens(worker)
    leaf = SYSTEM_HEALTH
    return f"{SYSTEM}.{worker}.{leaf}"
|
||||||
|
|
||||||
|
|
||||||
|
def system_control(worker: str) -> str:
    """Return the topic ``system.<worker>.control``.

    Admin-originated stop (and, eventually, start) intents are published on
    this topic; each worker subscribes to its own address and reacts.
    Payload shape::

        {"action": "stop", "requested_by": "<username>", "ts": <unix>}

    *action* must be :data:`WORKER_CONTROL_STOP` or
    :data:`WORKER_CONTROL_START`; the listener ignores any other value.
    *worker* follows the same segment rules as :func:`system_health`, and a
    rejected name raises ``ValueError``.
    """
    _reject_tokens(worker)
    leaf = SYSTEM_CONTROL
    return f"{SYSTEM}.{worker}.{leaf}"
|
||||||
|
|
||||||
|
|
||||||
|
def _reject_tokens(*parts: str) -> None:
|
||||||
|
"""Reject topic segments that would break NATS-style tokenization.
|
||||||
|
|
||||||
|
Dots, wildcards, whitespace, and empty strings in a *segment* would
|
||||||
|
silently corrupt the hierarchy (e.g. ``topology.a.b.status`` for a
|
||||||
|
``topology_id`` of ``"a.b"``). Raise early at the builder instead of
|
||||||
|
shipping a malformed topic to the wire.
|
||||||
|
"""
|
||||||
|
for p in parts:
|
||||||
|
if not p:
|
||||||
|
raise ValueError("topic segment must not be empty")
|
||||||
|
if "." in p or "*" in p or ">" in p or any(c.isspace() for c in p):
|
||||||
|
raise ValueError(
|
||||||
|
f"topic segment {p!r} may not contain '.', '*', '>', or whitespace"
|
||||||
|
)
|
||||||
257
decnet/bus/unix_client.py
Normal file
257
decnet/bus/unix_client.py
Normal file
@@ -0,0 +1,257 @@
|
|||||||
|
"""UNIX-socket client — :class:`UnixSocketBus` implementation of :class:`BaseBus`.
|
||||||
|
|
||||||
|
Holds one open socket to the local :class:`~decnet.bus.unix_server.BusServer`.
|
||||||
|
Operations:
|
||||||
|
|
||||||
|
* :meth:`publish` writes a single ``PUB`` frame and returns; no ack.
|
||||||
|
* :meth:`subscribe` writes a ``SUB`` frame and returns a
|
||||||
|
:class:`~decnet.bus.base.Subscription` backed by an :class:`asyncio.Queue`
|
||||||
|
that the background reader task feeds.
|
||||||
|
|
||||||
|
One background reader task per bus instance dispatches incoming ``EVT``
|
||||||
|
frames to every registered subscription whose pattern matches the topic.
|
||||||
|
On connection drop or close, every subscription is woken via a sentinel so
|
||||||
|
iterators unblock cleanly; callers see :class:`StopAsyncIteration` from the
|
||||||
|
``async for`` loop.
|
||||||
|
|
||||||
|
No auto-reconnect in MVP. If the server restarts, callers must
|
||||||
|
:meth:`close` the bus and construct a new one. This mirrors how other
|
||||||
|
DECNET workers handle their dependencies — the systemd ``Restart=on-failure``
|
||||||
|
supervision above us is the retry loop.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from decnet.bus import protocol
|
||||||
|
from decnet.bus.base import (
|
||||||
|
BaseBus,
|
||||||
|
Event,
|
||||||
|
Subscription,
|
||||||
|
_CLOSE_SENTINEL,
|
||||||
|
matches,
|
||||||
|
)
|
||||||
|
from decnet.bus.fake import _enqueue_drop_oldest as _enqueue_event_drop_oldest
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
|
log = get_logger("bus.client")
|
||||||
|
|
||||||
|
_INBOUND_QUEUE_SIZE = 1024
|
||||||
|
|
||||||
|
|
||||||
|
class _UnixSubscription(Subscription):
    """Subscription whose events arrive through an :class:`asyncio.Queue`.

    The owning :class:`UnixSocketBus` pushes matching events (and, on
    shutdown, the close sentinel) into the queue; iteration pulls from it.
    """

    def __init__(
        self,
        bus: "UnixSocketBus",
        pattern: str,
        queue: "asyncio.Queue[Any]",
    ) -> None:
        super().__init__(pattern)
        self._queue = queue
        self._bus = bus

    async def __anext__(self) -> Event:
        """Return the next queued event, or stop once closed.

        Iteration ends when the subscription is already marked closed or
        when the close sentinel is pulled from the queue.
        """
        if not self._closed:
            nxt = await self._queue.get()
            if nxt is not _CLOSE_SENTINEL:
                return nxt
        raise StopAsyncIteration

    async def _aclose(self) -> None:
        """Detach from the bus, then try to wake a waiting consumer."""
        await self._bus._unregister(self)
        # Best effort: a full queue means nobody is blocked on ``get()``.
        with contextlib.suppress(asyncio.QueueFull):
            self._queue.put_nowait(_CLOSE_SENTINEL)
|
||||||
|
|
||||||
|
|
||||||
|
class UnixSocketBus(BaseBus):
    """Client handle for a local :class:`BusServer`.

    One instance per process typically; multiple instances simply open
    multiple sockets to the same server.  Connection is lazy — the first
    :meth:`connect`, :meth:`publish` or :meth:`subscribe` opens the socket.
    """

    def __init__(
        self,
        socket_path: pathlib.Path | str,
        *,
        client_name: str | None = None,
    ) -> None:
        self._path = pathlib.Path(socket_path)
        self._client_name = client_name or f"decnet-bus-client[{os.getpid()}]"
        self._reader: asyncio.StreamReader | None = None
        self._writer: asyncio.StreamWriter | None = None
        self._reader_task: asyncio.Task[None] | None = None
        self._subs: list[_UnixSubscription] = []
        # Serializes connect() so concurrent lazy connects (e.g. several
        # subscribe() calls issued back to back) open exactly one socket.
        self._lock = asyncio.Lock()
        # Serializes frame writes so frames from different tasks never
        # interleave on the wire.
        self._write_lock = asyncio.Lock()
        self._closed = False
        # Sticky flag: the first publish-on-closed-bus call logs at
        # WARNING so operators see that a publish was dropped; subsequent
        # calls on the same instance log at DEBUG only to prevent a
        # log flood when stream threads drain after close.  The bus is
        # critical infra, so the first warning is non-negotiable.
        self._closed_publish_warned = False
        # Strong references to in-flight SUB tasks; the event loop itself
        # only keeps weak references to tasks.
        self._pending_tasks: set[asyncio.Task[None]] = set()

    # ─── Lifecycle ──────────────────────────────────────────────────────────

    async def connect(self) -> None:
        """Open the socket, send ``HELLO`` and start the reader task.

        Returns immediately when a connection already exists.  Concurrent
        callers are serialized so only one socket is ever opened.

        Raises ``RuntimeError`` when the bus has been closed; errors from
        opening the socket propagate unchanged.
        """
        if self._writer is not None:
            return
        if self._closed:
            raise RuntimeError("connect on closed bus")
        async with self._lock:
            # Re-check under the lock: another task may have connected, or
            # close() may have run, while we were waiting for it.
            if self._writer is not None:
                return
            if self._closed:
                raise RuntimeError("connect on closed bus")
            reader, writer = await asyncio.open_unix_connection(str(self._path))
            if self._closed:
                # close() ran while the socket was being opened; don't leak it.
                writer.close()
                raise RuntimeError("connect on closed bus")
            self._reader, self._writer = reader, writer
            await self._send(protocol.encode(protocol.HELLO, args=self._client_name))
            self._reader_task = asyncio.create_task(self._reader_loop())
            log.debug("bus.client: connected to %s as %s", self._path, self._client_name)

    async def close(self) -> None:
        """Shut the client down; safe to call more than once.

        Sends a best-effort ``BYE``, stops the reader task, closes the
        socket and wakes every subscription so ``async for`` loops exit.
        """
        if self._closed:
            return
        self._closed = True

        # Best-effort BYE — we don't care if it fails.
        if self._writer is not None and not self._writer.is_closing():
            with contextlib.suppress(Exception):
                await self._send(protocol.encode(protocol.BYE))

        if self._reader_task is not None:
            self._reader_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await self._reader_task
            self._reader_task = None

        if self._writer is not None:
            with contextlib.suppress(Exception):
                self._writer.close()
                await self._writer.wait_closed()
            self._writer = None
            self._reader = None

        # Wake every subscription so `async for` exits.
        for sub in list(self._subs):
            self._wake(sub)
        self._subs.clear()

    # ─── Pub/Sub ────────────────────────────────────────────────────────────

    async def publish(
        self,
        topic: str,
        payload: dict[str, Any],
        *,
        event_type: str = "",
    ) -> None:
        """Send one ``PUB`` frame for *topic*; there is no acknowledgement.

        Publishing on a closed bus drops the event and logs instead of
        raising.  A connection error while sending is logged as a warning
        and swallowed.
        """
        if self._closed:
            # Degrade gracefully: the DB is the source of truth, the bus
            # is only the notification layer.  Raising here made every
            # caller via publish_safely flood the logs once per stream
            # line during shutdown races.  First drop warns loudly;
            # subsequent drops on the same instance are DEBUG-only.
            if not self._closed_publish_warned:
                self._closed_publish_warned = True
                log.warning(
                    "bus.client: publish on closed bus dropped topic=%s "
                    "(further drops on this instance logged at DEBUG)",
                    topic,
                )
            else:
                log.debug("bus.client: publish on closed bus dropped topic=%s", topic)
            return
        if self._writer is None:
            await self.connect()
        body = Event(topic=topic, payload=payload, type=event_type).to_dict()
        try:
            await self._send(protocol.encode(protocol.PUB, args=topic, body=body))
        except ConnectionError as exc:  # BrokenPipeError is a ConnectionError
            # Bus loss is a logged warning, never a publisher crash.  The
            # DB-as-source-of-truth invariant means the work is already
            # persisted; the missing event is just a missed notification.
            log.warning("bus.client: publish failed: %s", exc)

    def subscribe(self, pattern: str) -> Subscription:
        """Register a local subscription for *pattern* and return it.

        The ``SUB`` frame is sent from a background task so this method can
        stay synchronous, matching the BaseBus signature.  Raises
        ``RuntimeError`` when the bus has been closed.
        """
        if self._closed:
            raise RuntimeError("subscribe on closed bus")
        queue: asyncio.Queue[Any] = asyncio.Queue(maxsize=_INBOUND_QUEUE_SIZE)
        sub = _UnixSubscription(self, pattern, queue)
        self._subs.append(sub)
        # Schedule the SUB frame asynchronously so subscribe() stays sync.
        # The caller will shortly `async with` / `async for` the
        # subscription, which will run the event loop and pick this task up.
        task = asyncio.ensure_future(self._send_sub(pattern))
        # Hold a reference until the task finishes so it cannot be
        # garbage-collected mid-flight.
        self._pending_tasks.add(task)
        task.add_done_callback(self._pending_tasks.discard)
        return sub

    async def _send_sub(self, pattern: str) -> None:
        """Connect if necessary and send ``SUB``; failures are only logged."""
        try:
            if self._writer is None:
                await self.connect()
            await self._send(protocol.encode(protocol.SUB, args=pattern))
        except Exception as exc:  # pragma: no cover - network paths in live tests
            log.warning("bus.client: SUB %s failed: %s", pattern, exc)

    async def _unregister(self, sub: _UnixSubscription) -> None:
        """Forget *sub*; send ``UNSUB`` if its pattern is no longer wanted."""
        try:
            self._subs.remove(sub)
        except ValueError:
            return
        # Tell the server we no longer want events for this pattern if no
        # other local subscription still wants it.
        if not any(s.pattern == sub.pattern for s in self._subs):
            with contextlib.suppress(Exception):
                await self._send(protocol.encode(protocol.UNSUB, args=sub.pattern))

    # ─── Internal I/O ───────────────────────────────────────────────────────

    @staticmethod
    def _wake(sub: _UnixSubscription) -> None:
        """Put the close sentinel on *sub*'s queue, making room if needed.

        When the queue is full the oldest pending event is evicted so the
        sentinel is not lost; otherwise a consumer would drain the queue
        and then wait forever for an item that never arrives.
        """
        queue = sub._queue
        try:
            queue.put_nowait(_CLOSE_SENTINEL)
        except asyncio.QueueFull:
            with contextlib.suppress(asyncio.QueueEmpty):
                queue.get_nowait()
            with contextlib.suppress(asyncio.QueueFull):
                queue.put_nowait(_CLOSE_SENTINEL)

    async def _send(self, frame_bytes: bytes) -> None:
        """Write one encoded frame and drain, under the write lock.

        Raises ``ConnectionError`` when there is no open connection.
        """
        if self._writer is None:
            raise ConnectionError("bus.client: not connected")
        async with self._write_lock:
            self._writer.write(frame_bytes)
            await self._writer.drain()

    async def _reader_loop(self) -> None:
        """Read frames until EOF, ``BYE`` or an error; dispatch ``EVT`` frames.

        Whatever ends the loop, every registered subscription is woken with
        the close sentinel on the way out.
        """
        if self._reader is None:
            return
        try:
            while True:
                frame = await protocol.read_frame(self._reader)
                if frame is None:
                    break
                if frame.verb != protocol.EVT:
                    # Clients only ever legitimately receive EVT (or BYE).
                    if frame.verb == protocol.BYE:
                        break
                    log.warning("bus.client: unexpected verb from server: %s", frame.verb)
                    continue
                topic = frame.args
                data = protocol.decode_body(frame.body) if frame.body else {}
                event = Event.from_dict(topic, data)
                self._dispatch(event)
        except protocol.ProtocolError as exc:
            log.warning("bus.client: protocol error: %s", exc)
        except (asyncio.IncompleteReadError, ConnectionError):
            pass
        except asyncio.CancelledError:
            raise
        except Exception:  # pragma: no cover
            log.exception("bus.client: reader loop crashed")
        finally:
            # Server-side close — wake every subscription.
            for sub in list(self._subs):
                self._wake(sub)

    def _dispatch(self, event: Event) -> None:
        """Enqueue *event* on every subscription whose pattern matches."""
        for sub in self._subs:
            if matches(sub.pattern, event.topic):
                _enqueue_event_drop_oldest(sub._queue, event)
|
||||||
309
decnet/bus/unix_server.py
Normal file
309
decnet/bus/unix_server.py
Normal file
@@ -0,0 +1,309 @@
|
|||||||
|
"""UNIX-socket server for the DECNET bus.
|
||||||
|
|
||||||
|
One :class:`BusServer` per host. Accepts local connections on a UNIX-domain
|
||||||
|
socket; each connection may:
|
||||||
|
|
||||||
|
* publish events (``PUB`` frames) that the server fans out to all matching
|
||||||
|
subscribers on other connections, and
|
||||||
|
* subscribe to patterns (``SUB`` frames) and receive matching events as
|
||||||
|
``EVT`` frames.
|
||||||
|
|
||||||
|
Authorization is socket file permissions (0660, group=``decnet`` if that
|
||||||
|
POSIX group exists, else the server process's own group). Anything the
|
||||||
|
kernel lets ``connect()`` is trusted — there is no verb-level auth. This
|
||||||
|
matches the "local processes on the same host" threat model; cross-host
|
||||||
|
federation is out of scope (see DEBT-029).
|
||||||
|
|
||||||
|
Backpressure is per-connection, drop-oldest: if a subscriber can't drain its
|
||||||
|
outbound queue fast enough, the server discards the oldest pending event
|
||||||
|
rather than blocking publishers. The bus is at-most-once by contract, so
|
||||||
|
drops are acceptable; stalled publishers are not.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
import grp
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from decnet.bus import protocol
|
||||||
|
from decnet.bus.base import Event, matches
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
|
log = get_logger("bus.server")
|
||||||
|
|
||||||
|
_SOCKET_MODE = 0o660
|
||||||
|
_DEFAULT_GROUP = "decnet"
|
||||||
|
_OUTBOUND_QUEUE_SIZE = 1024
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)
|
||||||
|
class _Connection:
|
||||||
|
"""Per-connection server state."""
|
||||||
|
|
||||||
|
writer: asyncio.StreamWriter
|
||||||
|
peer_name: str = "<unknown>"
|
||||||
|
patterns: set[str] = field(default_factory=set)
|
||||||
|
outbound: asyncio.Queue[bytes] = field(
|
||||||
|
default_factory=lambda: asyncio.Queue(maxsize=_OUTBOUND_QUEUE_SIZE)
|
||||||
|
)
|
||||||
|
closed: bool = False
|
||||||
|
|
||||||
|
|
||||||
|
class BusServer:
    """Serve a UNIX-socket bus on *socket_path*.

    Lifecycle: construct → :meth:`start` → :meth:`serve_forever` (or rely
    on :meth:`start` returning once bound) → :meth:`close` for teardown.
    Safe to :meth:`close` multiple times.
    """

    def __init__(
        self,
        socket_path: pathlib.Path | str,
        *,
        group: str | None = _DEFAULT_GROUP,
        mode: int = _SOCKET_MODE,
    ) -> None:
        self._path = pathlib.Path(socket_path)
        self._group = group
        self._mode = mode
        self._server: asyncio.base_events.Server | None = None
        self._connections: set[_Connection] = set()
        self._closed = False

    # ─── Lifecycle ──────────────────────────────────────────────────────────

    async def start(self) -> None:
        """Bind the socket and begin accepting connections.

        Removes any stale socket file at *socket_path* first (common case:
        the previous worker crashed without cleaning up).  The parent
        directory must already exist; we do NOT create it blindly because
        the chosen directory (typically ``/run/decnet``) may require
        systemd ``RuntimeDirectory=`` to set up.

        Does nothing when the server is already started.  Raises
        ``FileNotFoundError`` when the parent directory is missing.
        """
        if self._server is not None:
            return

        parent = self._path.parent
        if not parent.exists():
            raise FileNotFoundError(
                f"bus socket parent directory {parent} does not exist; "
                f"create it with systemd RuntimeDirectory= or mkdir"
            )

        # Clean up a stale socket from a previous crash.  If a live server
        # is actually listening there, ``bind()`` below will fail — we do
        # not try to detect live vs. stale ourselves.
        with contextlib.suppress(FileNotFoundError):
            if self._path.is_socket():
                self._path.unlink()

        self._server = await asyncio.start_unix_server(
            self._handle_connection, path=str(self._path),
        )
        _chmod_and_chown(self._path, self._mode, self._group)
        log.info("bus.server: listening on %s (mode=%o group=%s)",
                 self._path, self._mode, self._group or "<inherit>")

    async def serve_forever(self) -> None:
        """Accept connections until cancelled.

        Raises ``RuntimeError`` when :meth:`start` has not been called.
        """
        if self._server is None:
            raise RuntimeError("BusServer not started")
        async with self._server:
            await self._server.serve_forever()

    async def close(self) -> None:
        """Stop listening, drop every connection and remove the socket file.

        Idempotent: calls after the first return immediately.
        """
        if self._closed:
            return
        self._closed = True

        server = self._server
        if server is not None:
            # Stop accepting new connections; existing ones stay open.
            server.close()

        # Drain every live connection BEFORE waiting on the server: since
        # Python 3.12 ``wait_closed()`` only returns once all active
        # connections have finished, so waiting first can hang for as long
        # as any client stays connected.
        for conn in list(self._connections):
            await self._close_connection(conn)
        self._connections.clear()

        if server is not None:
            with contextlib.suppress(Exception):
                await server.wait_closed()
            self._server = None

        with contextlib.suppress(FileNotFoundError):
            self._path.unlink()
        log.info("bus.server: closed")

    # ─── Internal publish fan-out ───────────────────────────────────────────

    async def publish(self, topic: str, payload: dict[str, Any], event_type: str = "") -> None:
        """Server-side publish helper — used by the worker to emit
        ``system.bus.health`` heartbeats without opening a client loop."""
        event = Event(topic=topic, payload=payload, type=event_type)
        self._fanout(event)

    # ─── Connection handler ─────────────────────────────────────────────────

    async def _handle_connection(
        self,
        reader: asyncio.StreamReader,
        writer: asyncio.StreamWriter,
    ) -> None:
        """Run one client connection from accept to teardown.

        Starts the per-connection writer task, reads frames until the peer
        leaves or misbehaves, then always closes the connection and stops
        the writer task.
        """
        conn = _Connection(writer=writer)
        self._connections.add(conn)
        writer_task = asyncio.create_task(self._writer_loop(conn))
        try:
            await self._reader_loop(conn, reader)
        except protocol.ProtocolError as exc:
            log.warning("bus.server: protocol error from %s: %s", conn.peer_name, exc)
        except (asyncio.IncompleteReadError, ConnectionError) as exc:
            log.debug("bus.server: %s disconnected: %s", conn.peer_name, exc)
        except Exception:  # pragma: no cover - defensive
            log.exception("bus.server: unhandled error in connection")
        finally:
            await self._close_connection(conn)
            self._connections.discard(conn)
            writer_task.cancel()
            with contextlib.suppress(asyncio.CancelledError):
                await writer_task

    async def _reader_loop(
        self, conn: _Connection, reader: asyncio.StreamReader,
    ) -> None:
        """Read and dispatch frames until end of stream or a ``BYE`` frame."""
        while True:
            frame = await protocol.read_frame(reader)
            if frame is None:
                return
            await self._dispatch(conn, frame)
            if frame.verb == protocol.BYE:
                return

    async def _dispatch(self, conn: _Connection, frame: protocol.Frame) -> None:
        """Apply a single client frame to *conn*.

        Handles ``HELLO``, ``SUB``, ``UNSUB``, ``PUB`` and ``BYE``.  Raises
        :class:`protocol.ProtocolError` for a ``SUB`` without a pattern, a
        ``PUB`` without a topic, or any other verb.
        """
        if frame.verb == protocol.HELLO:
            conn.peer_name = frame.args or conn.peer_name
            log.debug("bus.server: HELLO from %s", conn.peer_name)
            return
        if frame.verb == protocol.SUB:
            pattern = frame.args
            if not pattern:
                raise protocol.ProtocolError("SUB requires a pattern")
            conn.patterns.add(pattern)
            log.debug("bus.server: %s SUB %s", conn.peer_name, pattern)
            return
        if frame.verb == protocol.UNSUB:
            conn.patterns.discard(frame.args)
            return
        if frame.verb == protocol.PUB:
            topic = frame.args
            if not topic:
                raise protocol.ProtocolError("PUB requires a topic")
            data = protocol.decode_body(frame.body) if frame.body else {}
            event = Event(
                topic=topic,
                payload=data.get("payload", {}) or {},
                type=data.get("type", "") or "",
            )
            self._fanout(event, origin=conn)
            return
        if frame.verb == protocol.BYE:
            return
        # EVT is server-to-client only; receiving one is a protocol violation.
        raise protocol.ProtocolError(f"unexpected verb {frame.verb!r} from client")

    def _fanout(self, event: Event, *, origin: _Connection | None = None) -> None:
        """Enqueue *event* as an EVT frame on every matching connection.

        We do NOT deliver back to the originating connection (a publisher
        does not receive its own event).  Encoding happens once per event,
        not once per subscriber.  An event that cannot be encoded is logged
        and dropped.
        """
        try:
            frame_bytes = protocol.encode(
                protocol.EVT, args=event.topic, body=event.to_dict(),
            )
        except protocol.ProtocolError:
            log.exception("bus.server: failed to encode EVT for topic=%s", event.topic)
            return

        for conn in self._connections:
            if conn is origin or conn.closed:
                continue
            if not any(matches(p, event.topic) for p in conn.patterns):
                continue
            _enqueue_drop_oldest(conn.outbound, frame_bytes, event.topic)

    async def _writer_loop(self, conn: _Connection) -> None:
        """Serialize writes onto *conn*'s socket.

        One writer task per connection so a slow peer only blocks its own
        queue, not the fan-out loop.  The queue is bounded with drop-oldest
        policy applied at enqueue time (see :func:`_enqueue_drop_oldest`).
        """
        try:
            while not conn.closed:
                data = await conn.outbound.get()
                conn.writer.write(data)
                await conn.writer.drain()
        except (ConnectionError, BrokenPipeError):
            log.debug("bus.server: %s writer: peer closed", conn.peer_name)
        except asyncio.CancelledError:
            pass
        except Exception:  # pragma: no cover - defensive
            log.exception("bus.server: writer loop crashed for %s", conn.peer_name)

    async def _close_connection(self, conn: _Connection) -> None:
        """Mark *conn* closed and close its writer; repeat calls are no-ops."""
        if conn.closed:
            return
        conn.closed = True
        with contextlib.suppress(Exception):
            conn.writer.close()
            await conn.writer.wait_closed()
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _chmod_and_chown(path: pathlib.Path, mode: int, group: str | None) -> None:
|
||||||
|
"""Apply socket file perms and best-effort group ownership.
|
||||||
|
|
||||||
|
If *group* is ``None`` or the named group does not exist, we leave the
|
||||||
|
socket owned by the current process group. This keeps the server
|
||||||
|
usable on dev boxes that don't have a ``decnet`` group set up.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
os.chmod(path, mode)
|
||||||
|
except OSError as exc:
|
||||||
|
log.warning("bus.server: chmod(%s, %o) failed: %s", path, mode, exc)
|
||||||
|
|
||||||
|
if not group:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
gid = grp.getgrnam(group).gr_gid
|
||||||
|
except KeyError:
|
||||||
|
log.debug("bus.server: group %r not found, leaving socket group unchanged", group)
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
os.chown(path, -1, gid)
|
||||||
|
except PermissionError:
|
||||||
|
# Dev box running as an unprivileged user can't chown. Log once at
|
||||||
|
# debug and move on — the socket is still usable by the owner.
|
||||||
|
log.debug("bus.server: chown(%s, gid=%d) denied; leaving as-is", path, gid)
|
||||||
|
except OSError as exc:
|
||||||
|
log.warning("bus.server: chown(%s, gid=%d) failed: %s", path, gid, exc)
|
||||||
|
|
||||||
|
|
||||||
|
def _enqueue_drop_oldest(
|
||||||
|
queue: "asyncio.Queue[bytes]", data: bytes, topic: str,
|
||||||
|
) -> None:
|
||||||
|
"""Drop-oldest backpressure — mirrors :func:`decnet.bus.fake._enqueue_drop_oldest`."""
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
queue.put_nowait(data)
|
||||||
|
return
|
||||||
|
except asyncio.QueueFull:
|
||||||
|
try:
|
||||||
|
queue.get_nowait()
|
||||||
|
log.warning("bus.server: subscriber queue full, dropped event topic=%s", topic)
|
||||||
|
except asyncio.QueueEmpty:
|
||||||
|
return
|
||||||
121
decnet/bus/worker.py
Normal file
121
decnet/bus/worker.py
Normal file
@@ -0,0 +1,121 @@
|
|||||||
|
"""``decnet bus`` worker entrypoint.
|
||||||
|
|
||||||
|
Starts a :class:`~decnet.bus.unix_server.BusServer` on the configured UNIX
|
||||||
|
socket and serves forever, emitting a ``system.bus.health`` heartbeat on
|
||||||
|
its own bus every :data:`HEARTBEAT_INTERVAL_SEC` seconds so liveness-aware
|
||||||
|
consumers (dashboards, watchdogs) can tell the bus is up without polling
|
||||||
|
the filesystem.
|
||||||
|
|
||||||
|
Cross-host federation is **out of scope** for the MVP; each host runs its
|
||||||
|
own bus independently. See DEBT-029 for the deferred ``--bridge-tcp``
|
||||||
|
mode that would proxy the socket over the swarm mTLS channel.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import signal
|
||||||
|
import time
|
||||||
|
|
||||||
|
from decnet.bus import topics
|
||||||
|
from decnet.bus.unix_server import BusServer
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
|
||||||
|
log = get_logger("bus.worker")
|
||||||
|
|
||||||
|
HEARTBEAT_INTERVAL_SEC = 10
|
||||||
|
|
||||||
|
|
||||||
|
async def bus_worker(
    socket_path: str | pathlib.Path,
    *,
    group: str | None = "decnet",
    heartbeat_interval: int = HEARTBEAT_INTERVAL_SEC,
) -> None:
    """Run the bus server until cancelled or SIGTERM/SIGINT is received.

    The parent directory of *socket_path* is checked with
    :func:`_ensure_parent` before the server is built: an existing parent
    is used as-is, a missing parent under the current user's home
    directory is created, and any other missing parent raises
    :class:`FileNotFoundError` (in prod, systemd's
    ``RuntimeDirectory=decnet`` is expected to provide it).

    Args:
        socket_path: Filesystem path of the UNIX socket to serve on.
        group: Group name forwarded to :class:`BusServer`.
        heartbeat_interval: Seconds between ``system.bus.health`` events.
    """
    sock = pathlib.Path(socket_path)
    _ensure_parent(sock)

    bus = BusServer(sock, group=group)
    await bus.start()
    log.info("bus.worker: pid=%d socket=%s", os.getpid(), sock)

    shutdown = asyncio.Event()
    _install_signal_handlers(shutdown)

    # Heartbeat first, then the serve loop — same creation order as the
    # cancellation/drain order below.
    background = (
        asyncio.create_task(_heartbeat_loop(bus, heartbeat_interval)),
        asyncio.create_task(bus.serve_forever()),
    )

    try:
        await shutdown.wait()
        log.info("bus.worker: shutdown signal received")
    finally:
        for job in background:
            job.cancel()
        for job in background:
            # Drain each task; anything it raises while shutting down is
            # deliberately discarded so the server is always closed.
            try:
                await job
            except (asyncio.CancelledError, Exception):  # noqa: BLE001 - draining shutdown
                pass
        await bus.close()
        log.info("bus.worker: stopped")
|
||||||
|
|
||||||
|
|
||||||
|
async def _heartbeat_loop(server: BusServer, interval: int) -> None:
    """Publish ``system.bus.health`` on the server's own fan-out, forever.

    Each event carries the worker ``pid``, ``uptime_sec`` (seconds since
    this loop started, rounded to milliseconds) and ``ts`` (wall-clock
    epoch seconds at publish time).

    Args:
        server: Bus server whose ``publish`` coroutine is awaited.
        interval: Seconds to sleep between heartbeats.

    The loop only ends through cancellation; publish failures are logged
    and the loop carries on.
    """
    # Uptime is a duration, so measure it on the monotonic clock: the
    # wall clock can jump (NTP step, manual change) and would yield
    # negative or inflated uptimes. ``ts`` stays wall-clock on purpose.
    started_at = time.monotonic()
    while True:
        try:
            await server.publish(
                topics.system(topics.SYSTEM_BUS_HEALTH),
                {
                    "pid": os.getpid(),
                    "uptime_sec": round(time.monotonic() - started_at, 3),
                    "ts": time.time(),
                },
                event_type=topics.SYSTEM_BUS_HEALTH,
            )
        except Exception:  # pragma: no cover - heartbeat must never kill the worker
            log.exception("bus.worker: heartbeat publish failed")
        await asyncio.sleep(interval)
|
||||||
|
|
||||||
|
|
||||||
|
def _install_signal_handlers(stop_event: asyncio.Event) -> None:
|
||||||
|
loop = asyncio.get_running_loop()
|
||||||
|
for sig in (signal.SIGTERM, signal.SIGINT):
|
||||||
|
try:
|
||||||
|
loop.add_signal_handler(sig, stop_event.set)
|
||||||
|
except (NotImplementedError, RuntimeError):
|
||||||
|
# add_signal_handler is not supported on Windows / in some
|
||||||
|
# test harnesses where the loop is running in a non-main thread.
|
||||||
|
# The worker still exits via KeyboardInterrupt bubbling up.
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_parent(path: pathlib.Path) -> None:
|
||||||
|
parent = path.parent
|
||||||
|
if parent.exists():
|
||||||
|
return
|
||||||
|
# Dev-box convenience: if the parent is the user's ``~/.decnet`` dir,
|
||||||
|
# create it. We do not auto-mkdir ``/run/decnet`` — that's systemd's job
|
||||||
|
# and silently creating it as the wrong user would cause permission
|
||||||
|
# confusion later.
|
||||||
|
home_prefix = pathlib.Path.home() / ".decnet"
|
||||||
|
try:
|
||||||
|
parent.relative_to(home_prefix.parent)
|
||||||
|
except ValueError:
|
||||||
|
raise FileNotFoundError(
|
||||||
|
f"bus socket parent {parent} does not exist; create it first"
|
||||||
|
)
|
||||||
|
parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["bus_worker", "HEARTBEAT_INTERVAL_SEC"]
|
||||||
37
decnet/canary/__init__.py
Normal file
37
decnet/canary/__init__.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
"""Canary tokens — decoy artifacts planted in decky filesystems.
|
||||||
|
|
||||||
|
Public surface is exported here so callers can ``from decnet.canary
|
||||||
|
import CanaryArtifact, get_generator, get_instrumenter`` without
|
||||||
|
knowing the submodule layout. Concrete generators / instrumenters
|
||||||
|
live under :mod:`decnet.canary.generators` and
|
||||||
|
:mod:`decnet.canary.instrumenters` respectively; the factory keeps
|
||||||
|
import-time cost down by deferring those imports until first use
|
||||||
|
(same pattern as :mod:`decnet.intel.factory`).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.canary.base import (
|
||||||
|
CanaryArtifact,
|
||||||
|
CanaryContext,
|
||||||
|
CanaryGenerator,
|
||||||
|
CanaryInstrumenter,
|
||||||
|
)
|
||||||
|
from decnet.canary.factory import (
|
||||||
|
KNOWN_GENERATORS,
|
||||||
|
KNOWN_INSTRUMENTERS,
|
||||||
|
get_generator,
|
||||||
|
get_instrumenter,
|
||||||
|
pick_instrumenter_for_mime,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"CanaryArtifact",
|
||||||
|
"CanaryContext",
|
||||||
|
"CanaryGenerator",
|
||||||
|
"CanaryInstrumenter",
|
||||||
|
"KNOWN_GENERATORS",
|
||||||
|
"KNOWN_INSTRUMENTERS",
|
||||||
|
"get_generator",
|
||||||
|
"get_instrumenter",
|
||||||
|
"pick_instrumenter_for_mime",
|
||||||
|
]
|
||||||
145
decnet/canary/base.py
Normal file
145
decnet/canary/base.py
Normal file
@@ -0,0 +1,145 @@
|
|||||||
|
"""Canary generator / instrumenter ABCs and the artifact dataclass.
|
||||||
|
|
||||||
|
Two flavors of producer share the same return shape:
|
||||||
|
|
||||||
|
* :class:`CanaryGenerator` synthesises a fake artifact from scratch
|
||||||
|
(e.g. a plausible ``~/.aws/credentials`` block, a ``.git/config``
|
||||||
|
pointing at an attacker-bait remote URL). Operators don't supply
|
||||||
|
any input.
|
||||||
|
|
||||||
|
* :class:`CanaryInstrumenter` mutates an operator-uploaded blob to
|
||||||
|
embed the callback (HTTP slug + DNS host). The original blob bytes
|
||||||
|
are passed in; the instrumenter returns the mutated version.
|
||||||
|
|
||||||
|
Both return a :class:`CanaryArtifact` — the planter doesn't care
|
||||||
|
which path produced it. Same dataclass keeps the planter's
|
||||||
|
docker-exec injector trivial.
|
||||||
|
|
||||||
|
ABCs intentionally do not include I/O — generators and instrumenters
|
||||||
|
are pure functions of (slug, host, blob?). All filesystem work
|
||||||
|
happens in :mod:`decnet.canary.planter` and :mod:`decnet.canary.storage`.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CanaryContext:
    """Callback coordinates handed to every generator / instrumenter.

    ``callback_token`` is the unique slug. It appears verbatim in HTTP
    URLs (``https://<host>/c/<callback_token>``) and as the leftmost
    DNS label (``<callback_token>.canary.<dns_zone>``), so one slug
    resolves to one :class:`CanaryToken` row whichever path the
    attacker tripped.

    ``http_base`` and ``dns_zone`` come from the canary worker's
    public-facing config (``DECNET_CANARY_HTTP_BASE``,
    ``DECNET_CANARY_DNS_ZONE``). When DNS isn't deployed ``dns_zone``
    is empty, and instrumenters that only have a DNS surface (e.g. an
    artifact whose only realistic embed point is a hostname) raise.
    """

    # Unique slug identifying the token.
    callback_token: str
    # e.g. "https://canary.example.test" — no trailing slash
    http_base: str
    # e.g. "canary.example.test"; "" disables DNS embeds
    dns_zone: str = ""
    # "linux" | "windows" — drives default username, path style
    persona: str = "linux"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CanaryArtifact:
    """What a generator / instrumenter hands back: bytes plus placement."""

    path: str
    """Absolute path inside the target container."""

    content: bytes
    """Final bytes that hit the decky filesystem.

    Always raw bytes — the planter base64-encodes for the wire so
    binary blobs (DOCX/PNG/PDF) survive ``docker exec sh -c`` safely.
    """

    mode: int = 0o600
    """Unix file mode.

    ``0600`` by default because most realistic canary placements
    (``~/.aws/credentials``, ``.env``, ``id_rsa``) are operator-only.
    Honeydocs in user docs folders should pass ``0o644``.
    """

    mtime_offset: int = 0
    """Seconds relative to *now* for the planted file's mtime.

    Negative values backdate the file so it doesn't look like it
    appeared the moment the decky was deployed. ``-86400 * 90`` (90
    days ago) is a common choice for ``honeydoc`` artifacts; ``0``
    means "stamp it now," which is fine for ``aws_creds``-like files
    that would plausibly be touched recently.
    """

    instrumenter: Optional[str] = None
    """Identifier of the instrumenter that produced this artifact (for
    upload-driven tokens). Mirrored into ``CanaryToken.instrumenter``.
    Mutually exclusive with :attr:`generator`.
    """

    generator: Optional[str] = None
    """Identifier of the generator that produced this artifact (for
    synthesised tokens). Mirrored into ``CanaryToken.generator``.
    Mutually exclusive with :attr:`instrumenter`.
    """

    notes: list[str] = field(default_factory=list)
    """Human-readable notes about the embedding (e.g. "DOCX: injected
    1×1 remote image at relsId rId99"). Surfaced in the API
    ``preview`` response so the operator sees what we did before
    planting. Never leaked to the attacker-facing surface.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class CanaryGenerator(ABC):
    """Abstract base for producers that build a fake artifact from scratch."""

    #: short tag — matches ``CanaryToken.generator``
    name: str

    @abstractmethod
    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build and return the artifact for *ctx*.

        Implementations MUST NOT do I/O and MUST be deterministic for the
        same ``(callback_token, http_base, dns_zone, persona)``, so that
        re-seeding from :attr:`CanaryToken.secret_seed` produces
        byte-identical output and the planter is naturally idempotent.
        """
|
||||||
|
|
||||||
|
|
||||||
|
class CanaryInstrumenter(ABC):
    """Abstract base for producers that embed a callback in an uploaded blob."""

    #: short tag — matches ``CanaryToken.instrumenter``
    name: str

    #: MIME prefixes this instrumenter handles, e.g.
    #: ``("application/pdf",)`` or ``("text/",)``. Intended for the
    #: factory's dispatch on the sniffed content-type.
    #: NOTE(review): the matching logic is not in this module — confirm
    #: whether it is a prefix or a sub-string match.
    mime_prefixes: tuple[str, ...] = ()

    @abstractmethod
    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return an artifact holding *blob* with the callback embedded.

        Implementations MUST raise :class:`InstrumenterRejectedError`
        when the blob can't be safely mutated (corrupt zip, encrypted
        PDF, etc.), so the API can surface a 400 with the specific
        reason rather than silently shipping the original bytes.
        """
|
||||||
|
|
||||||
|
|
||||||
|
class InstrumenterRejectedError(ValueError):
    """Signals that an instrumenter refused to mutate its input blob.

    Subclasses :class:`ValueError`, so handlers catching ``ValueError``
    also catch it.
    """
|
||||||
181
decnet/canary/cultivator.py
Normal file
181
decnet/canary/cultivator.py
Normal file
@@ -0,0 +1,181 @@
|
|||||||
|
"""Realism contract adapter for canary generators.
|
||||||
|
|
||||||
|
Stage 7 of the realism migration. The orchestrator's planner picks a
|
||||||
|
``canary_*`` :class:`~decnet.realism.taxonomy.ContentClass` 1–3% of
|
||||||
|
the time on file ticks; this module turns that pick into a
|
||||||
|
:class:`~decnet.canary.base.CanaryArtifact` (bytes the SSH driver
|
||||||
|
plants) plus a persisted :class:`~decnet.web.db.models.CanaryToken`
|
||||||
|
row so the canary worker recognises the slug when an attacker trips
|
||||||
|
it.
|
||||||
|
|
||||||
|
What this is NOT: it doesn't pick *when* canaries fire — that's the
|
||||||
|
realism planner's job. It doesn't decide *where* on the filesystem
|
||||||
|
the canary lands beyond what realism naming + persona conventions
|
||||||
|
already produce. It's a thin bytes-and-row factory bolted onto the
|
||||||
|
realism contract.
|
||||||
|
|
||||||
|
Stealth (per ``feedback_stealth.md``): we never leak the
|
||||||
|
``DECNET`` literal into anything that survives to the planted file.
|
||||||
|
The underlying generators are already stealth-clean; this wrapper
|
||||||
|
must not undo that.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
import secrets as _secrets
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any, Optional
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext
|
||||||
|
from decnet.canary.factory import get_generator
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.realism.personas import login_for
|
||||||
|
from decnet.realism.taxonomy import ContentClass, Plan
|
||||||
|
|
||||||
|
log = get_logger("canary.cultivator")
|
||||||
|
|
||||||
|
|
||||||
|
# Realism content class -> canary generator name. Values are names
# accepted by :func:`decnet.canary.factory.get_generator` (see
# :data:`decnet.canary.factory.KNOWN_GENERATORS`).
_CLASS_TO_GENERATOR: dict[ContentClass, str] = {
    ContentClass.CANARY_AWS_CREDS: "aws_creds",
    ContentClass.CANARY_ENV_FILE: "env_file",
    ContentClass.CANARY_GIT_CONFIG: "git_config",
    ContentClass.CANARY_SSH_KEY: "ssh_key",
    ContentClass.CANARY_HONEYDOC: "honeydoc",
    ContentClass.CANARY_HONEYDOC_DOCX: "honeydoc_docx",
    ContentClass.CANARY_HONEYDOC_PDF: "honeydoc_pdf",
    ContentClass.CANARY_MYSQL_DUMP: "mysql_dump",
}


# Generator name -> CanaryKind string. The trip surface (HTTP slug
# callback / DNS resolution / passive bait) determines how the canary
# worker matches an attacker callback to the token. Aligned with
# :data:`decnet.web.db.models.canary.CanaryKind`.
_GENERATOR_TO_KIND: dict[str, str] = {
    "aws_creds": "aws_passive",  # no embedded callback; passive bait
    "env_file": "http",
    "git_config": "http",
    "honeydoc": "http",
    "honeydoc_docx": "http",
    "honeydoc_pdf": "http",
    "ssh_key": "dns",  # trip is DNS resolution of host comment
    "mysql_dump": "dns",  # trip is DNS resolution of subdomain
}


# Placement path template per content class. The realism planner doesn't
# know decoy-realistic credential locations (``~/.aws/credentials``,
# ``~/.git/config``), so they are pinned here to put the artifact where
# an attacker would actually look. ``{persona}`` is filled with the
# persona's login name by :func:`_path_for`.
_DEFAULT_PATH: dict[ContentClass, str] = {
    ContentClass.CANARY_AWS_CREDS: "/home/{persona}/.aws/credentials",
    ContentClass.CANARY_ENV_FILE: "/home/{persona}/app/.env",
    ContentClass.CANARY_GIT_CONFIG: "/home/{persona}/.git/config",
    ContentClass.CANARY_SSH_KEY: "/home/{persona}/.ssh/id_rsa",
    ContentClass.CANARY_HONEYDOC: "/home/{persona}/Documents/notes.html",
    ContentClass.CANARY_HONEYDOC_DOCX: "/home/{persona}/Documents/Q3-Operations-Review.docx",
    ContentClass.CANARY_HONEYDOC_PDF: "/home/{persona}/Documents/Q3-Operations-Review.pdf",
    ContentClass.CANARY_MYSQL_DUMP: "/var/backups/db_backup.sql",
}
|
||||||
|
|
||||||
|
|
||||||
|
def _path_for(plan: Plan) -> str:
    """Return the filesystem path where the canary for *plan* is placed.

    Canary placements follow stronger conventions
    (``~/.aws/credentials``, ``~/.ssh/id_rsa``) than the realism namer
    produces, so a :data:`_DEFAULT_PATH` template for the plan's content
    class takes precedence, with ``{persona}`` replaced by
    ``login_for(plan.persona)``. Without a template the planner-provided
    ``plan.target_path`` is returned unchanged.
    """
    template = _DEFAULT_PATH.get(plan.content_class)
    return (
        plan.target_path
        if template is None
        else template.format(persona=login_for(plan.persona))
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _new_callback_token() -> str:
|
||||||
|
"""16 url-safe bytes — same shape canary slug fields use elsewhere."""
|
||||||
|
return _secrets.token_urlsafe(16)
|
||||||
|
|
||||||
|
|
||||||
|
async def cultivate(
    plan: Plan,
    repo: Any,
    *,
    http_base: Optional[str] = None,
    dns_zone: Optional[str] = None,
    created_by: str = "system",
) -> CanaryArtifact:
    """Turn a canary *plan* into a persisted token plus a plantable artifact.

    Mints a callback token, builds a :class:`CanaryContext`, asks the
    generator mapped to the plan's content class for bytes, stores a
    ``canary_tokens`` row through *repo* so the canary worker can
    attribute callbacks, and returns the artifact for the SSH driver to
    plant.

    Args:
        plan: Realism plan; its ``content_class`` must be a canary class.
        repo: Object exposing an awaitable ``create_canary_token(dict)``.
        http_base: Callback base URL; falls back to the
            ``DECNET_CANARY_HTTP_BASE`` env var when falsy.
        dns_zone: Callback DNS zone; falls back to the
            ``DECNET_CANARY_DNS_ZONE`` env var when falsy.
        created_by: Stored in the token row's ``created_by`` field.

    Returns:
        A new :class:`CanaryArtifact` carrying the generator's output with
        ``path`` set to the chosen placement path.

    Raises:
        ValueError: ``plan.content_class`` is not a canary class.
        KeyError: No generator is mapped for ``plan.content_class``.

    Whatever the selected generator raises (e.g. for a missing callback
    host) propagates unchanged; per the original author the planner's
    caller logs it and falls back to a non-canary plan.
    """
    content_class = plan.content_class
    if not content_class.is_canary():
        raise ValueError(
            f"cultivate() called with non-canary content_class="
            f"{content_class!r}"
        )
    gen_name = _CLASS_TO_GENERATOR.get(content_class)
    if gen_name is None:
        raise KeyError(
            f"no canary generator mapped for content_class="
            f"{content_class!r}"
        )

    callback_token = _new_callback_token()
    base_url = http_base or os.environ.get("DECNET_CANARY_HTTP_BASE", "")
    zone = dns_zone or os.environ.get("DECNET_CANARY_DNS_ZONE", "")
    ctx = CanaryContext(
        callback_token=callback_token,
        http_base=base_url,
        dns_zone=zone,
        persona="linux",  # all our deckies are POSIX in MVP
    )
    produced = get_generator(gen_name).generate(ctx)

    # The generator's own ``path`` is not used (author's note: generators
    # leave it empty for the planter to fill). The realism-derived path
    # feeds both the token row and the returned artifact.
    placement_path = _path_for(plan)

    # Persist the row BEFORE planting, so a callback that fires during
    # the plant itself (improbable but possible — DOCX viewers can
    # preview autoplay-style) can still be attributed.
    token_row = {
        "kind": _GENERATOR_TO_KIND.get(gen_name, "http"),
        "decky_name": plan.decky_name,
        "instrumenter": None,
        "generator": gen_name,
        "placement_path": placement_path,
        "callback_token": callback_token,
        "secret_seed": callback_token,  # deterministic re-seed compatible
        "placed_at": datetime.now(timezone.utc),
        "created_by": created_by,
        "state": "planted",
    }
    await repo.create_canary_token(token_row)

    # Hand back a copy with the placement path instead of mutating the
    # generator's object.
    return CanaryArtifact(
        path=placement_path,
        content=produced.content,
        mode=produced.mode,
        mtime_offset=produced.mtime_offset,
        instrumenter=produced.instrumenter,
        generator=produced.generator,
        notes=list(produced.notes),
    )
|
||||||
207
decnet/canary/dns_server.py
Normal file
207
decnet/canary/dns_server.py
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
"""Minimal authoritative DNS server for canary tokens (stdlib only).
|
||||||
|
|
||||||
|
We don't need a full resolver — only enough to:
|
||||||
|
|
||||||
|
1. Decode an inbound query's qname.
|
||||||
|
2. If the qname matches ``<slug>.<canary_zone>``, log the callback,
|
||||||
|
publish ``canary.<token_id>.triggered`` on the bus, and return a
|
||||||
|
plausible A record (any RFC-5737 reserved address would do; we
|
||||||
|
use 192.0.2.1) so the attacker's resolver doesn't loop on
|
||||||
|
NXDOMAIN.
|
||||||
|
3. For unknown qnames return NXDOMAIN.
|
||||||
|
|
||||||
|
DNS-over-UDP wire format is well-trodden: 12-byte header + name
|
||||||
|
labels + qtype + qclass. We implement just the bits we need.
|
||||||
|
|
||||||
|
This module deliberately avoids the ``dnslib`` PyPI package so the
|
||||||
|
canary worker has no extra dependency surface. If we ever need
|
||||||
|
EDNS0, DNSSEC, or other niceties we'll swap to dnslib then.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import struct
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Awaitable, Callable, Optional, Tuple
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class DNSQuery:
    """Immutable view of a parsed DNS question plus the header fields we keep."""

    # Transaction id from the header; echoed in the response.
    txid: int
    # Question name: lowercase, no trailing dot
    qname: str
    # Question type code (1 = A).
    qtype: int
    # Question class code (1 = IN).
    qclass: int
    # Raw 16-bit header flags word as received.
    flags: int
|
||||||
|
|
||||||
|
|
||||||
|
def _decode_name(buf: bytes, offset: int) -> Tuple[str, int]:
|
||||||
|
"""Return ``(qname_lowercase_no_dot, bytes_consumed)``.
|
||||||
|
|
||||||
|
Supports compressed pointers (RFC 1035 §4.1.4). Doesn't recurse —
|
||||||
|
we walk the pointer chain iteratively with a hop cap to avoid
|
||||||
|
pointer-loop DoS.
|
||||||
|
"""
|
||||||
|
labels: list[str] = []
|
||||||
|
pos = offset
|
||||||
|
consumed = 0
|
||||||
|
jumped = False
|
||||||
|
hops = 0
|
||||||
|
while True:
|
||||||
|
if pos >= len(buf):
|
||||||
|
raise ValueError("truncated DNS name")
|
||||||
|
length = buf[pos]
|
||||||
|
if length == 0:
|
||||||
|
pos += 1
|
||||||
|
if not jumped:
|
||||||
|
consumed = pos - offset
|
||||||
|
break
|
||||||
|
if (length & 0xC0) == 0xC0:
|
||||||
|
# Compression pointer.
|
||||||
|
if pos + 1 >= len(buf):
|
||||||
|
raise ValueError("truncated DNS pointer")
|
||||||
|
ptr = ((length & 0x3F) << 8) | buf[pos + 1]
|
||||||
|
if not jumped:
|
||||||
|
consumed = (pos + 2) - offset
|
||||||
|
pos = ptr
|
||||||
|
jumped = True
|
||||||
|
hops += 1
|
||||||
|
if hops > 10:
|
||||||
|
raise ValueError("DNS pointer loop")
|
||||||
|
continue
|
||||||
|
pos += 1
|
||||||
|
if pos + length > len(buf):
|
||||||
|
raise ValueError("truncated DNS label")
|
||||||
|
labels.append(buf[pos:pos + length].decode("ascii", "replace"))
|
||||||
|
pos += length
|
||||||
|
return ".".join(labels).lower(), consumed
|
||||||
|
|
||||||
|
|
||||||
|
def parse_query(packet: bytes) -> DNSQuery:
    """Parse the header and the single question of a DNS query packet.

    Only the transaction id, the flags word and the question count are
    read from the header; the answer / authority / additional counts and
    any bytes after the question are ignored.

    Raises:
        ValueError: The packet is shorter than the 12-byte header, does
            not carry exactly one question, or is truncated inside the
            question (including name-decoding errors).
    """
    header_len = 12
    if len(packet) < header_len:
        raise ValueError("DNS packet too short")
    txid, flags, qdcount = struct.unpack_from("!HHH", packet, 0)
    if qdcount != 1:
        raise ValueError(f"expected 1 question, got {qdcount}")

    qname, name_len = _decode_name(packet, header_len)
    # QTYPE and QCLASS (2 bytes each) directly follow the encoded name.
    fixed_at = header_len + name_len
    if fixed_at + 4 > len(packet):
        raise ValueError("truncated DNS qtype/qclass")
    qtype, qclass = struct.unpack_from("!HH", packet, fixed_at)
    return DNSQuery(
        txid=txid, qname=qname, qtype=qtype, qclass=qclass, flags=flags,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _encode_name(name: str) -> bytes:
|
||||||
|
out = bytearray()
|
||||||
|
for label in name.split("."):
|
||||||
|
if not label:
|
||||||
|
continue
|
||||||
|
b = label.encode("ascii", "replace")
|
||||||
|
out.append(len(b))
|
||||||
|
out.extend(b)
|
||||||
|
out.append(0)
|
||||||
|
return bytes(out)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_response(
    query: DNSQuery,
    *,
    rcode: int = 0,
    answer_ip: Optional[str] = None,
) -> bytes:
    """Encode the DNS response packet for *query*.

    The response echoes the transaction id and the question. It carries
    exactly one A record (class IN, TTL 60) when *answer_ip* is given,
    the question type is A (1) and *rcode* is 0; otherwise the answer
    section is empty.

    Args:
        query: The parsed query being answered.
        rcode: Response code; 0 = NOERROR, 3 = NXDOMAIN. Only the low
            4 bits are used.
        answer_ip: Dotted-quad IPv4 address for the A record.

    Returns:
        The wire-format response.

    Raises:
        ValueError: *answer_ip* is needed for the answer but is not four
            dot-separated integers in 0..255.
    """
    include_answer = bool(answer_ip) and query.qtype == 1 and rcode == 0

    # QR (response) + AA (authoritative); RA stays clear. OPCODE and RD
    # are copied from the query as RFC 1035 §4.1.1 requires. rcode is
    # masked so an out-of-range value cannot spill into other flag bits.
    flags = 0x8400 | (query.flags & 0x7900) | (rcode & 0x000F)
    header = struct.pack(
        "!HHHHHH", query.txid, flags, 1, int(include_answer), 0, 0,
    )
    question = _encode_name(query.qname) + struct.pack(
        "!HH", query.qtype, query.qclass,
    )
    if not include_answer:
        return header + question

    octets = answer_ip.split(".")
    if len(octets) != 4:
        # RDLENGTH below is fixed at 4; refuse to emit RDATA of another
        # size, which would be a malformed packet.
        raise ValueError(
            f"answer_ip must be a dotted-quad IPv4 address, got {answer_ip!r}"
        )
    rdata = bytes(int(octet) for octet in octets)
    # NAME is a compression pointer back to the question name, which
    # always starts at offset 12 (right after the header). Then
    # TYPE=A, CLASS=IN, TTL=60, RDLENGTH=4.
    answer = struct.pack("!HHHIH", 0xC000 | 12, 1, 1, 60, 4) + rdata
    return header + question + answer
|
||||||
|
|
||||||
|
|
||||||
|
# Hook signature: receives the matched slug, the parsed query, and the
# sender's IP address (``addr[0]`` of the datagram); returns nothing.
# The worker uses it to persist a CanaryTrigger row and publish the
# bus event.
|
||||||
|
TriggerHook = Callable[[str, DNSQuery, str], Awaitable[None]]
|
||||||
|
|
||||||
|
|
||||||
|
class CanaryDNSProtocol(asyncio.DatagramProtocol):
    """asyncio UDP endpoint answering canary DNS callbacks.

    Built from the canary zone (``"canary.example.test"``) and a
    coroutine function invoked when a query name matches
    ``<slug>.<zone>``. The hook runs as its own task on the event loop;
    the receive path answers first and never waits for it.
    """

    def __init__(
        self,
        zone: str,
        hook: TriggerHook,
        *,
        answer_ip: str = "192.0.2.1",
    ) -> None:
        """Store the configuration.

        Args:
            zone: Canary zone; case and leading/trailing dots are
                normalised away. An empty zone matches nothing.
            hook: Called as ``hook(slug, query, source_ip)`` per match.
            answer_ip: IPv4 address returned in A answers for matches.
        """
        # Normalise: lowercase, no leading/trailing dot.
        self._zone = zone.lower().strip(".")
        self._suffix = "." + self._zone if self._zone else ""
        self._hook = hook
        self._answer_ip = answer_ip
        self._transport: Optional[asyncio.DatagramTransport] = None
        # Strong references to in-flight hook tasks. The event loop only
        # keeps weak references to tasks, so an otherwise unreferenced
        # task can be garbage-collected before it finishes.
        self._hook_tasks: set[asyncio.Task[None]] = set()

    def connection_made(self, transport) -> None:  # type: ignore[override]
        """Remember the transport so responses can be sent."""
        self._transport = transport  # type: ignore[assignment]

    def datagram_received(  # type: ignore[override]
        self, data: bytes, addr: Tuple[str, int],
    ) -> None:
        """Answer one inbound datagram and fire the hook on a match."""
        try:
            query = parse_query(data)
        except ValueError:
            # Malformed query — drop silently. Returning a FORMERR
            # would tip off the attacker that *something* is listening
            # on this port; the stealth posture (feedback_stealth)
            # prefers radio silence on parse errors.
            return
        if query.flags & 0x8000:
            # QR bit set: this is a response, not a query. Answering it
            # would let a spoofed source bounce packets between two
            # servers indefinitely, so drop it.
            return
        slug = self._slug_for(query.qname)
        if slug is None:
            # Unknown name — NXDOMAIN.
            self._send(addr, _build_response(query, rcode=3))
            return
        # Known name — answer with our sinkhole IP, then fire the hook.
        self._send(addr, _build_response(query, answer_ip=self._answer_ip))
        task = asyncio.create_task(self._hook(slug, query, addr[0]))
        self._hook_tasks.add(task)
        task.add_done_callback(self._hook_tasks.discard)

    def _slug_for(self, qname: str) -> Optional[str]:
        """Return the slug if *qname* is ``<slug>.<zone>``, else ``None``."""
        if not self._zone or not qname.endswith(self._suffix):
            return None
        slug = qname[: -len(self._suffix)]
        # Single-label slug only; multi-label means the attacker is
        # querying a sub-resource we don't model.
        if not slug or "." in slug:
            return None
        return slug

    def _send(self, addr: Tuple[str, int], packet: bytes) -> None:
        """Send *packet* to *addr*; a no-op before ``connection_made``."""
        if self._transport is not None:
            self._transport.sendto(packet, addr)
|
||||||
141
decnet/canary/factory.py
Normal file
141
decnet/canary/factory.py
Normal file
@@ -0,0 +1,141 @@
|
|||||||
|
"""Generator and instrumenter factories.
|
||||||
|
|
||||||
|
Same lazy-import pattern as :mod:`decnet.intel.factory` — concrete
|
||||||
|
implementations stay un-imported until first use so importing
|
||||||
|
:mod:`decnet.canary` from a CLI subcommand doesn't drag in
|
||||||
|
``pikepdf`` / ``python-docx`` / ``Pillow`` for callers that only
|
||||||
|
need the model layer.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryGenerator, CanaryInstrumenter
|
||||||
|
|
||||||
|
# Names accepted by get_generator().
KNOWN_GENERATORS: Tuple[str, ...] = (
    "git_config",
    "env_file",
    "ssh_key",
    "aws_creds",
    "honeydoc",
    "honeydoc_docx",
    "honeydoc_pdf",
    "mysql_dump",
)

# Names accepted by get_instrumenter().
KNOWN_INSTRUMENTERS: Tuple[str, ...] = (
    "docx",
    "xlsx",
    "pdf",
    "html",
    "image",
    "plain",
    "passthrough",
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_generator(name: str) -> CanaryGenerator:
    """Instantiate the generator registered under ``name``.

    The concrete module is imported only inside the matching branch, so
    unused generators are never loaded.

    Raises:
        ValueError: ``name`` is not a known generator — lets a typo in an
            API request surface as a 400 rather than silently producing
            nothing.
    """
    impl: type[CanaryGenerator]
    if name == "git_config":
        from decnet.canary.generators.git_config import GitConfigGenerator
        impl = GitConfigGenerator
    elif name == "env_file":
        from decnet.canary.generators.env_file import EnvFileGenerator
        impl = EnvFileGenerator
    elif name == "ssh_key":
        from decnet.canary.generators.ssh_key import SSHKeyGenerator
        impl = SSHKeyGenerator
    elif name == "aws_creds":
        from decnet.canary.generators.aws_creds import AWSCredsGenerator
        impl = AWSCredsGenerator
    elif name == "honeydoc":
        from decnet.canary.generators.honeydoc import HoneydocGenerator
        impl = HoneydocGenerator
    elif name == "honeydoc_docx":
        from decnet.canary.generators.honeydoc_docx import HoneydocDocxGenerator
        impl = HoneydocDocxGenerator
    elif name == "honeydoc_pdf":
        from decnet.canary.generators.honeydoc_pdf import HoneydocPdfGenerator
        impl = HoneydocPdfGenerator
    elif name == "mysql_dump":
        from decnet.canary.generators.mysql_dump import MySQLDumpGenerator
        impl = MySQLDumpGenerator
    else:
        raise ValueError(
            f"Unknown canary generator: {name!r}. Known: {KNOWN_GENERATORS}"
        )
    return impl()
|
||||||
|
|
||||||
|
|
||||||
|
def get_instrumenter(name: str) -> CanaryInstrumenter:
    """Instantiate the built-in instrumenter registered as ``name``.

    The implementation module is imported inside the selected branch
    only, so modules for the other formats are never loaded as a side
    effect of this call.

    Raises:
        ValueError: ``name`` matches none of the built-in instrumenters.
    """
    instrumenter: CanaryInstrumenter
    if name == "docx":
        from decnet.canary.instrumenters.docx import DocxInstrumenter

        instrumenter = DocxInstrumenter()
    elif name == "xlsx":
        from decnet.canary.instrumenters.xlsx import XlsxInstrumenter

        instrumenter = XlsxInstrumenter()
    elif name == "pdf":
        from decnet.canary.instrumenters.pdf import PdfInstrumenter

        instrumenter = PdfInstrumenter()
    elif name == "html":
        from decnet.canary.instrumenters.html import HtmlInstrumenter

        instrumenter = HtmlInstrumenter()
    elif name == "image":
        from decnet.canary.instrumenters.image import ImageInstrumenter

        instrumenter = ImageInstrumenter()
    elif name == "plain":
        from decnet.canary.instrumenters.plain import PlainInstrumenter

        instrumenter = PlainInstrumenter()
    elif name == "passthrough":
        from decnet.canary.instrumenters.passthrough import PassthroughInstrumenter

        instrumenter = PassthroughInstrumenter()
    else:
        raise ValueError(
            f"Unknown canary instrumenter: {name!r}. Known: {KNOWN_INSTRUMENTERS}"
        )
    return instrumenter
|
||||||
|
|
||||||
|
|
||||||
|
# MIME → instrumenter routing table. Rows are ``(prefix, instrumenter)``
# pairs scanned from the first row down; the first prefix that matches
# wins, so a specific type (``text/html``) has to be listed before any
# broader prefix that would also match it (``text/``).
_MIME_DISPATCH: tuple[tuple[str, str], ...] = (
    # Office Open XML. DOCX and XLSX are both zip containers with
    # different inner trees, so they are told apart by their MIME
    # string rather than by opening the archive.
    ("application/vnd.openxmlformats-officedocument.wordprocessingml.document", "docx"),
    ("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", "xlsx"),
    ("application/pdf", "pdf"),
    ("text/html", "html"),
    ("application/xhtml+xml", "html"),
    ("image/png", "image"),
    ("image/jpeg", "image"),
    ("image/gif", "image"),
    # Plain-text family (config files, .env, .ini, .yaml, .json, source
    # code): every remaining ``text/*`` type plus a few structured-text
    # ``application/*`` types all route to the ``plain`` instrumenter.
    ("text/", "plain"),
    ("application/json", "plain"),
    ("application/x-yaml", "plain"),
    ("application/yaml", "plain"),
    ("application/toml", "plain"),
)


def pick_instrumenter_for_mime(content_type: str) -> str:
    """Map a sniffed MIME type to the name of the instrumenter to use.

    Matching is case-insensitive and prefix-based against
    ``_MIME_DISPATCH``, so trailing parameters such as
    ``; charset=utf-8`` do not prevent a match.

    Returns ``"passthrough"`` when ``content_type`` is empty or matches
    no table row, i.e. content with no dedicated embedder such as
    archives or executables. ``passthrough`` is documented as supporting
    only DNS-callback tokens, a constraint the API is expected to show
    the operator before they pick a kind.
    """
    fallback = "passthrough"
    if not content_type:
        return fallback
    normalised = content_type.lower()
    candidates = (
        instrumenter
        for prefix, instrumenter in _MIME_DISPATCH
        if normalised.startswith(prefix)
    )
    # First matching row wins; an exhausted scan yields the fallback.
    return next(candidates, fallback)
|
||||||
7
decnet/canary/generators/__init__.py
Normal file
7
decnet/canary/generators/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
"""Built-in canary generators (synthesised fake artifacts).
|
||||||
|
|
||||||
|
Concrete classes live in sibling modules and are imported lazily by
|
||||||
|
:func:`decnet.canary.factory.get_generator` to keep the import-time
|
||||||
|
cost of :mod:`decnet.canary` cheap for callers that only need the
|
||||||
|
ABCs.
|
||||||
|
"""
|
||||||
86
decnet/canary/generators/aws_creds.py
Normal file
86
decnet/canary/generators/aws_creds.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""Fake ``~/.aws/credentials`` block (passive bait).
|
||||||
|
|
||||||
|
This is the **passive** variant — no callback wiring. An attacker
|
||||||
|
who exfils these keys can't trip a detection unless we run a real
|
||||||
|
AWS account with a deny-all CloudTrail listener (post-v1). The
|
||||||
|
realism is the point: the file looks like a routinely used credentials
|
||||||
|
file, so the rest of the decky's persona feels lived-in.
|
||||||
|
|
||||||
|
If the operator picks ``kind="aws_passive"`` we accept that no slug
|
||||||
|
will be embedded. If they pick ``kind="http"`` or ``kind="dns"`` for
|
||||||
|
this generator, the API will reject the combination with a 400 — AWS
|
||||||
|
keys have no plausible field where a URL or hostname survives a
|
||||||
|
``grep -E '[A-Z0-9]{20}'`` smell test.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
from secrets import token_urlsafe
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
|
||||||
|
|
||||||
|
# Stable AWS-style key body derived from the slug. Keeping the
|
||||||
|
# generator deterministic (per-slug) means re-seeding produces the
|
||||||
|
# same bytes — the planter is naturally idempotent and an operator
|
||||||
|
# who runs ``decnet canary verify`` can re-derive the expected file
|
||||||
|
# without touching the DB.
|
||||||
|
|
||||||
|
def _fake_access_key(seed: str) -> str:
    """Derive a deterministic 20-character ``AKIA…`` access-key lookalike.

    The 16 characters after the ``AKIA`` prefix are the leading hex
    digits of SHA-256(``seed``), upper-cased, so equal seeds always
    produce equal keys.
    """
    digest_hex = hashlib.sha256(seed.encode()).hexdigest()
    return "AKIA" + digest_hex[:16].upper()
|
||||||
|
|
||||||
|
|
||||||
|
def _fake_secret_key(seed: str) -> str:
    """Derive a deterministic 40-character secret-key lookalike.

    The value is the first 40 characters of the standard base64
    encoding of SHA-256(``"secret:" + seed``). No randomness is
    involved, so the same seed always yields the same key.
    """
    # Imported here, as the original does, rather than at module level.
    import base64

    digest = hashlib.sha256(("secret:" + seed).encode()).digest()
    # A 32-byte digest encodes to 44 base64 characters; keep the first 40.
    encoded = base64.b64encode(digest).decode()
    return encoded[:40]
|
||||||
|
|
||||||
|
|
||||||
|
class AWSCredsGenerator(CanaryGenerator):
    """Generator for a two-profile fake AWS credentials file.

    Both profiles' keys are derived from ``ctx.callback_token`` via
    ``_fake_access_key`` / ``_fake_secret_key``. No callback URL or
    hostname is written into the file.
    """

    name = "aws_creds"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the credentials artifact for ``ctx``.

        The returned artifact has an empty ``path``; the caller
        (planter) is expected to fill it from
        ``CanaryToken.placement_path``.
        """
        slug = ctx.callback_token
        lines = [
            "[default]",
            f"aws_access_key_id = {_fake_access_key(slug)}",
            f"aws_secret_access_key = {_fake_secret_key(slug)}",
            "region = us-east-1",
            "",
            "[prod]",
            f"aws_access_key_id = {_fake_access_key('prod-' + slug)}",
            f"aws_secret_access_key = {_fake_secret_key('prod-' + slug)}",
            "region = us-west-2",
        ]
        text = "\n".join(lines) + "\n"
        two_weeks = 86400 * 14  # seconds; back-dates the file so it looks lived-in
        return CanaryArtifact(
            path="",
            content=text.encode("utf-8"),
            mode=0o600,
            mtime_offset=-two_weeks,
            generator=self.name,
            notes=[
                "fake AWS keys; no callback embedded — passive bait only",
                f"derived deterministically from slug={slug}",
            ],
        )
|
||||||
|
|
||||||
|
|
||||||
|
# The two private key helpers are listed in ``__all__`` deliberately so
# that other modules (the instrumenters/passthrough module is the
# intended consumer) can reuse them through this module's export list.
__all__ = ["AWSCredsGenerator", "_fake_access_key", "_fake_secret_key"]


# ``token_urlsafe`` is imported at the top of this module but never
# called; the throwaway binding below references it once so that
# unused-import checks stay quiet.
_ = token_urlsafe
|
||||||
56
decnet/canary/generators/env_file.py
Normal file
56
decnet/canary/generators/env_file.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
"""Fake ``.env`` with embedded callback URLs.
|
||||||
|
|
||||||
|
Modern web stacks read environment variables for everything from
|
||||||
|
database DSNs to webhook URLs, so dropping a few realistic-looking
|
||||||
|
``KEY=value`` pairs alongside the canary URL is unremarkable. The
|
||||||
|
slug appears in two fields:
|
||||||
|
|
||||||
|
* ``API_BASE_URL`` — the obvious one; an attacker scripting against
|
||||||
|
the credentials hits the worker on first invocation.
|
||||||
|
* ``WEBHOOK_NOTIFY_URL`` — secondary, in case the attacker greps for
|
||||||
|
``WEBHOOK`` and pivots there.
|
||||||
|
|
||||||
|
Other fields (``DB_PASSWORD``, ``REDIS_URL``, ``JWT_SECRET``) are
|
||||||
|
plausible but inert — they're realism filler, not detection
|
||||||
|
mechanisms.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
|
||||||
|
|
||||||
|
def _stable_token(seed: str, prefix: str = "") -> str:
    """Return the first 32 hex digits of SHA-256(``prefix + seed``).

    ``prefix`` lets callers derive several unrelated-looking values from
    the same seed.
    """
    material = (prefix + seed).encode()
    return hashlib.sha256(material).hexdigest()[:32]
|
||||||
|
|
||||||
|
|
||||||
|
class EnvFileGenerator(CanaryGenerator):
    """Generator for a fake ``.env`` file carrying the callback URL.

    ``API_BASE_URL`` and ``WEBHOOK_NOTIFY_URL`` embed the callback URL;
    the remaining secrets are inert filler derived from the slug via
    ``_stable_token``.
    """

    name = "env_file"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Render the ``.env`` artifact for ``ctx`` (``path`` left empty)."""
        # Tolerate an http_base configured with or without a trailing slash.
        root = ctx.http_base.rstrip("/")
        slug = ctx.callback_token
        api_url = f"{root}/c/{slug}"
        webhook_url = f"{api_url}/webhook"
        lines = [
            "# Production environment — DO NOT COMMIT",
            f"API_BASE_URL={api_url}",
            f"WEBHOOK_NOTIFY_URL={webhook_url}",
            f"DB_PASSWORD={_stable_token(slug, 'db:')}",
            f"REDIS_URL=redis://:{_stable_token(slug, 'redis:')[:16]}@redis.internal:6379/0",
            f"JWT_SECRET={_stable_token(slug, 'jwt:')}",
            "LOG_LEVEL=info",
            "ENVIRONMENT=production",
        ]
        text = "\n".join(lines) + "\n"
        one_week = 86400 * 7  # seconds; file appears last edited a week ago
        return CanaryArtifact(
            path="",
            content=text.encode("utf-8"),
            mode=0o600,
            mtime_offset=-one_week,
            generator=self.name,
            notes=[
                f"API_BASE_URL embeds {api_url}",
                f"WEBHOOK_NOTIFY_URL embeds {webhook_url}",
            ],
        )
|
||||||
53
decnet/canary/generators/git_config.py
Normal file
53
decnet/canary/generators/git_config.py
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
"""Fake ``.git/config`` with an attacker-bait remote URL.
|
||||||
|
|
||||||
|
The ``[remote "origin"]`` ``url`` field is the natural place to embed
|
||||||
|
an HTTP-callback URL: it's normal for git remotes to be HTTPS, the
|
||||||
|
URL is read by every git command an attacker runs (``git pull``,
|
||||||
|
``git fetch``, ``git remote -v``), and the slug fits naturally as
|
||||||
|
part of a path.
|
||||||
|
|
||||||
|
The generator emits a plausible private-mirror remote (``git.<org>``
|
||||||
|
or the canary host's hostname) so an attacker doesn't immediately
|
||||||
|
recognise it as a honeypot. The slug ends up in the URL path:
|
||||||
|
|
||||||
|
[remote "origin"]
|
||||||
|
url = https://canary.example.test/c/<slug>/repo.git
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class GitConfigGenerator(CanaryGenerator):
    """Generator for a fake ``.git/config`` whose origin URL is the bait."""

    name = "git_config"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Render the git config artifact for ``ctx`` (``path`` left empty).

        The remote URL has the form ``<http_base>/c/<slug>/repo.git``:
        the ``/c/`` prefix is what the worker is meant to route on, and
        the slug stays greppable inside the path.
        """
        # http_base may be configured with or without a trailing slash.
        root = ctx.http_base.rstrip("/")
        slug = ctx.callback_token
        remote_url = f"{root}/c/{slug}/repo.git"
        lines = [
            "[core]",
            "\trepositoryformatversion = 0",
            "\tfilemode = true",
            "\tbare = false",
            "\tlogallrefupdates = true",
            '[remote "origin"]',
            f"\turl = {remote_url}",
            "\tfetch = +refs/heads/*:refs/remotes/origin/*",
            '[branch "main"]',
            "\tremote = origin",
            "\tmerge = refs/heads/main",
        ]
        text = "\n".join(lines) + "\n"
        one_month = 86400 * 30  # seconds; repo appears checked out a month ago
        return CanaryArtifact(
            path="",
            content=text.encode("utf-8"),
            mode=0o644,
            mtime_offset=-one_month,
            generator=self.name,
            notes=[f"git remote 'origin' embeds {remote_url}"],
        )
|
||||||
61
decnet/canary/generators/honeydoc.py
Normal file
61
decnet/canary/generators/honeydoc.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
"""Built-in honeydoc — a minimal HTML "report" with a tracking pixel.
|
||||||
|
|
||||||
|
This is the *fallback* honeydoc used when the operator hasn't
|
||||||
|
uploaded a real document. The HTML instrumenter handles operator
|
||||||
|
uploads via :mod:`decnet.canary.instrumenters.html`; this generator
|
||||||
|
exists so the deploy-time baseline can plant *something* convincing
|
||||||
|
without first prompting the operator to drop a file.
|
||||||
|
|
||||||
|
The realism here is intentionally modest: a Documents-folder HTML
|
||||||
|
page with internal-looking content and a 1×1 remote image at the
|
||||||
|
bottom whose ``src`` is the canary callback URL. Most desktop
|
||||||
|
HTML renderers fetch the image as soon as the file is opened in a
|
||||||
|
browser preview, so opening the doc trips the callback.
|
||||||
|
|
||||||
|
Operators who want a richer artifact should upload their own DOCX
|
||||||
|
or PDF; the corresponding instrumenter embeds the same callback in
|
||||||
|
the appropriate format.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
|
||||||
|
|
||||||
|
class HoneydocGenerator(CanaryGenerator):
    """Generator for the built-in HTML honeydoc with a 1x1 remote image."""

    name = "honeydoc"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Render the HTML artifact for ``ctx`` (``path`` left empty).

        The page's last body element is an ``<img>`` whose ``src`` is
        the callback URL ``<http_base>/c/<slug>``.
        """
        root = ctx.http_base.rstrip("/")
        slug = ctx.callback_token
        pixel_url = f"{root}/c/{slug}"
        lines = [
            "<!DOCTYPE html>",
            '<html lang="en">',
            "<head>",
            '<meta charset="utf-8">',
            "<title>Q3 Operations Review — DRAFT</title>",
            "</head>",
            "<body>",
            "<h1>Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)</h1>",
            "<p>Forecast and remediation timeline below. Numbers are",
            "preliminary and subject to revision before the all-hands.</p>",
            "<table>",
            "<tr><th>Region</th><th>Incidents</th><th>MTTR (h)</th></tr>",
            "<tr><td>us-east</td><td>14</td><td>3.2</td></tr>",
            "<tr><td>us-west</td><td>9</td><td>4.7</td></tr>",
            "<tr><td>eu-central</td><td>22</td><td>2.1</td></tr>",
            "</table>",
            '<p>Internal contact: <a href="mailto:secops@internal">secops@internal</a></p>',
            # The tracking pixel: fetching this image is the callback.
            f'<img src="{pixel_url}" width="1" height="1" alt="">',
            "</body>",
            "</html>",
        ]
        markup = "\n".join(lines) + "\n"
        three_weeks = 86400 * 21  # seconds
        return CanaryArtifact(
            path="",
            content=markup.encode("utf-8"),
            mode=0o644,  # world-readable, as documents typically are
            mtime_offset=-three_weeks,
            generator=self.name,
            notes=[f"tracking pixel src={pixel_url}"],
        )
|
||||||
133
decnet/canary/generators/honeydoc_docx.py
Normal file
133
decnet/canary/generators/honeydoc_docx.py
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
"""Real-DOCX honeydoc generator.
|
||||||
|
|
||||||
|
Synthesises a minimal but structurally valid DOCX from scratch via
|
||||||
|
stdlib :mod:`zipfile`, then uses the same external-image relationship
|
||||||
|
trick that powers :mod:`decnet.canary.instrumenters.docx` to embed
|
||||||
|
the callback URL. No python-docx dependency.
|
||||||
|
|
||||||
|
The output opens cleanly in Word / LibreOffice; both fetch the
|
||||||
|
external image relationship on document load.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import zipfile
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
from decnet.canary.instrumenters.docx import _drawing, _next_rid
|
||||||
|
|
||||||
|
|
||||||
|
# ``[Content_Types].xml`` part: default types for .xml / .rels members
# plus the override that marks word/document.xml as the main document.
_CONTENT_TYPES = (
    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    b'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types">'
    b'<Default Extension="xml" ContentType="application/xml"/>'
    b'<Default Extension="rels" ContentType="application/vnd.openxmlformats-package.relationships+xml"/>'
    b'<Override PartName="/word/document.xml" '
    b'ContentType="application/vnd.openxmlformats-officedocument.wordprocessingml.document.main+xml"/>'
    b'</Types>'
)

# ``_rels/.rels`` part: a single relationship pointing at word/document.xml.
_PACKAGE_RELS = (
    b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
    b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
    b'<Relationship Id="rId1" '
    b'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument" '
    b'Target="word/document.xml"/>'
    b'</Relationships>'
)

# Document text, one entry per paragraph; an empty string is rendered
# as an empty paragraph (a blank line in the document).
_BODY_PARAGRAPHS = (
    "Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)",
    "",
    (
        "Forecast and remediation timeline below. Numbers are preliminary "
        "and subject to revision before the all-hands."
    ),
    "",
    "Region Incidents MTTR (h)",
    "us-east 14 3.2",
    "us-west 9 4.7",
    "eu-central 22 2.1",
    "",
    "Internal contact: secops@internal",
)
|
||||||
|
|
||||||
|
|
||||||
|
def _document_xml(rid_with_drawing: str | None = None) -> bytes:
    """Serialise ``word/document.xml`` for the synthesised DOCX.

    Every non-empty entry of ``_BODY_PARAGRAPHS`` becomes one paragraph
    holding a single text run (escaped with ``_xml_escape``); empty
    entries become empty paragraphs.

    Args:
        rid_with_drawing: Relationship id of the external image. When
            truthy, the markup returned by ``_drawing(rid)`` is appended
            after the paragraphs so the body references that
            relationship; when ``None`` or empty, nothing is appended.

    Returns:
        The complete document part, encoded to bytes.
    """
    run_open = '<w:p><w:r><w:t xml:space="preserve">'
    run_close = "</w:t></w:r></w:p>"
    rendered = [
        run_open + _xml_escape(text) + run_close if text else "<w:p/>"
        for text in _BODY_PARAGRAPHS
    ]
    image_markup = _drawing(rid_with_drawing).decode() if rid_with_drawing else ""
    pieces = [
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>',
        '<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">',
        "<w:body>",
        *rendered,
        image_markup,
        "</w:body>",
        "</w:document>",
    ]
    return "".join(pieces).encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _xml_escape(s: str) -> str:
    """Escape ``s`` for use as XML character data.

    Replaces ``&``, ``<`` and ``>`` with their predefined XML entities
    (``&amp;``, ``&lt;``, ``&gt;``). The ampersand is handled first, so
    the ampersands introduced by the other two replacements are not
    escaped a second time.

    The previous body replaced each of these characters with itself,
    which left the text unescaped and would produce malformed XML for
    any paragraph containing ``&`` or ``<``.

    Quotes are left alone: the result is only suitable for element text,
    not for attribute values.
    """
    # Function-scope import keeps this block self-contained; the stdlib
    # helper escapes exactly the three characters listed above.
    from xml.sax.saxutils import escape

    return escape(s)
|
||||||
|
|
||||||
|
|
||||||
|
def _document_rels(rid: str, url: str) -> bytes:
    """Serialise ``word/_rels/document.xml.rels`` with one external image.

    Args:
        rid: Relationship id to declare (e.g. ``"rId1"``).
        url: Target of the external image relationship, i.e. the
            callback URL.

    Returns:
        The relationships part, encoded to bytes.

    Both values are escaped for use inside a double-quoted XML
    attribute. Without that, a URL containing ``&`` (a query string),
    ``<`` or ``"`` would yield a part that XML parsers reject. Values
    without those characters are emitted unchanged.
    """
    # Function-scope import keeps this block self-contained.
    from xml.sax.saxutils import escape

    # escape() covers &, < and >; the extra entity covers the quote
    # character that delimits the attribute values below.
    quote_entity = {'"': "&quot;"}
    safe_rid = escape(rid, quote_entity)
    safe_url = escape(url, quote_entity)
    return (
        '<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
        '<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
        f'<Relationship Id="{safe_rid}" '
        'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" '
        f'Target="{safe_url}" TargetMode="External"/>'
        '</Relationships>'
    ).encode()
|
||||||
|
|
||||||
|
|
||||||
|
class HoneydocDocxGenerator(CanaryGenerator):
    """Generator that synthesises a four-part DOCX with an external image.

    The archive holds ``[Content_Types].xml``, ``_rels/.rels``,
    ``word/document.xml`` and ``word/_rels/document.xml.rels``; the
    last one declares an external image relationship whose target is
    the callback URL.
    """

    name = "honeydoc_docx"

    # Timestamp written on every ZIP member. ``ZipFile.writestr`` given a
    # plain member name stamps the current local time, which made the
    # archive bytes differ from run to run for the same context. A fixed
    # value keeps the output reproducible, in line with the per-slug
    # determinism the sibling generators document. (1980-01-01 is the
    # earliest date the ZIP format can represent.)
    _MEMBER_TIMESTAMP = (1980, 1, 1, 0, 0, 0)

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the DOCX artifact for ``ctx`` (``path`` left empty)."""
        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        # The synthesised file has exactly one document-level
        # relationship, so any unused id would do. The instrumenter's
        # allocator is run against an empty relationships skeleton so the
        # id is chosen the same way as for operator-uploaded DOCX files.
        skeleton = (
            b'<?xml version="1.0" encoding="UTF-8" standalone="yes"?>'
            b'<Relationships xmlns="http://schemas.openxmlformats.org/package/2006/relationships">'
            b'</Relationships>'
        )
        rid = _next_rid(skeleton)

        members = (
            ("[Content_Types].xml", _CONTENT_TYPES),
            ("_rels/.rels", _PACKAGE_RELS),
            ("word/document.xml", _document_xml(rid)),
            ("word/_rels/document.xml.rels", _document_rels(rid, url)),
        )

        out = io.BytesIO()
        with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf:
            for member_name, payload in members:
                info = zipfile.ZipInfo(member_name, date_time=self._MEMBER_TIMESTAMP)
                # With an explicit ZipInfo, writestr() takes compression
                # and permissions from the info object instead of from
                # the archive defaults, so restate both here.
                info.compress_type = zipfile.ZIP_DEFLATED
                # Same permission bits CPython's writestr() assigns to a
                # regular file when it is given a plain member name.
                info.external_attr = 0o600 << 16
                zf.writestr(info, payload)

        return CanaryArtifact(
            path="",
            content=out.getvalue(),
            mode=0o644,
            mtime_offset=-86400 * 21,  # 3 weeks ago
            generator=self.name,
            notes=[
                "synthesised DOCX with realistic Q3 review body",
                f"external-image relationship {rid} -> {url}",
            ],
        )
|
||||||
127
decnet/canary/generators/honeydoc_pdf.py
Normal file
127
decnet/canary/generators/honeydoc_pdf.py
Normal file
@@ -0,0 +1,127 @@
|
|||||||
|
"""Real-PDF honeydoc generator (uses :mod:`pikepdf`).
|
||||||
|
|
||||||
|
Builds a one-page PDF with the same Q3-review body as the HTML/DOCX
|
||||||
|
flavors and installs an ``/OpenAction`` ``/URI`` action on the
|
||||||
|
catalog so most viewers fire the callback the moment the document
|
||||||
|
opens.
|
||||||
|
|
||||||
|
Pikepdf is now a hard dependency for this generator (the operator
|
||||||
|
installed it explicitly so we can use it). We still surface a
|
||||||
|
clear :class:`InstrumenterRejectedError` when imports fail, so a
|
||||||
|
deployment without pikepdf can fall back to the DOCX or HTML
|
||||||
|
generators rather than crashing the API.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
|
||||||
|
from decnet.canary.base import (
|
||||||
|
CanaryArtifact,
|
||||||
|
CanaryContext,
|
||||||
|
CanaryGenerator,
|
||||||
|
InstrumenterRejectedError,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# Page text as (line, font size in points) pairs, top to bottom. An
# empty line still advances the text position, producing a blank row.
_BODY_LINES = (
    ("Q3 Operations Review (DRAFT — DO NOT DISTRIBUTE)", 14),
    ("", 12),
    ("Forecast and remediation timeline below.", 11),
    ("Numbers are preliminary, subject to revision.", 11),
    ("", 12),
    ("Region Incidents MTTR (h)", 11),
    ("us-east 14 3.2", 11),
    ("us-west 9 4.7", 11),
    ("eu-central 22 2.1", 11),
    ("", 12),
    ("Internal contact: secops@internal", 11),
)
|
||||||
|
|
||||||
|
|
||||||
|
class HoneydocPdfGenerator(CanaryGenerator):
    """Generator that builds a one-page PDF with a URI open action."""

    name = "honeydoc_pdf"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the PDF artifact for ``ctx`` (``path`` left empty).

        Raises:
            InstrumenterRejectedError: :mod:`pikepdf` cannot be imported.
        """
        # pikepdf is imported lazily so that its absence surfaces as a
        # domain error naming the alternatives, not a bare ImportError.
        try:
            from pikepdf import Pdf, Name, Dictionary, String  # type: ignore[import-not-found]
        except ImportError as e:
            raise InstrumenterRejectedError(
                "honeydoc_pdf requires pikepdf; install it (`pip install "
                "pikepdf`) or pick honeydoc / honeydoc_docx instead."
            ) from e

        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"

        # One text object for the whole page. The cursor starts at
        # (72, 750) on a 612 x 792 pt page (PDF's origin is bottom-left)
        # and drops 18 pt before every line after the first.
        ops: list[str] = ["BT /F1 12 Tf 72 750 Td"]
        for index, (text, point_size) in enumerate(_BODY_LINES):
            if index:
                ops.append("0 -18 Td")
            ops.extend((f"/F1 {point_size} Tf", f"({_pdf_escape(text)}) Tj"))
        ops.append("ET")
        content_bytes = "\n".join(ops).encode("latin-1")

        pdf = Pdf.new()
        # Helvetica is referenced by name only (it is one of the standard
        # PDF base fonts), so no font program is embedded.
        font = pdf.make_indirect(Dictionary(
            Type=Name("/Font"),
            Subtype=Name("/Type1"),
            BaseFont=Name("/Helvetica"),
        ))
        content_stream = pdf.make_stream(content_bytes)

        page = pdf.add_blank_page(page_size=(612, 792))
        page[Name("/Resources")] = Dictionary(Font=Dictionary(F1=font))
        page[Name("/Contents")] = content_stream

        # Catalog-level /OpenAction with a /URI action: intended to make
        # the viewer request the callback URL when the document is
        # opened. Some viewers ask the user before fetching.
        pdf.Root[Name("/OpenAction")] = Dictionary(
            Type=Name("/Action"),
            S=Name("/URI"),
            URI=String(url),
        )

        buffer = io.BytesIO()
        pdf.save(buffer)
        return CanaryArtifact(
            path="",
            content=buffer.getvalue(),
            mode=0o644,
            mtime_offset=-86400 * 21,  # 3 weeks ago
            generator=self.name,
            notes=[
                "synthesised one-page PDF with realistic Q3 review body",
                f"/OpenAction /URI -> {url}",
            ],
        )
|
||||||
|
|
||||||
|
|
||||||
|
def _pdf_escape(s: str) -> str:
    """Escape ``s`` for placement inside a PDF ``( … )`` literal string.

    Backslashes and both parentheses receive a leading backslash, and
    each em-dash is downgraded to ``--``. All other characters are
    returned unchanged.
    """
    # A single translate() pass is equivalent to replacing the backslash
    # first and the parentheses afterwards: the backslashes added in
    # front of parentheses are never themselves re-escaped.
    table = str.maketrans({
        "\\": "\\\\",
        "(": "\\(",
        ")": "\\)",
        "—": "--",
    })
    return s.translate(table)
|
||||||
190
decnet/canary/generators/mysql_dump.py
Normal file
190
decnet/canary/generators/mysql_dump.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
"""Fake ``mysqldump`` output that phones home on import.
|
||||||
|
|
||||||
|
Mirrors the Canarytokens.org MySQL-dump trick. When a victim runs
|
||||||
|
``mysql < dump.sql``, the trailer block executes a base64-obfuscated
|
||||||
|
``CHANGE REPLICATION SOURCE TO`` against ``<slug>.<dns_zone>``
|
||||||
|
followed by ``START REPLICA``. The victim's MySQL daemon then:
|
||||||
|
|
||||||
|
1. Resolves the slug subdomain via DNS — this is the trip our
|
||||||
|
:mod:`decnet.canary.dns_server` already detects.
|
||||||
|
2. Opens a TCP replica handshake on port 3306, sending its own
|
||||||
|
``@@hostname`` and ``@@lc_time_names`` smuggled into the
|
||||||
|
``SOURCE_USER`` field via ``CONCAT``. Capturing those bytes
|
||||||
|
requires a MySQL handshake responder on the worker — out of scope
|
||||||
|
for v1; the DNS lookup alone is sufficient for detection.
|
||||||
|
|
||||||
|
The base64 wrapper is the camouflage: a plain ``grep canary dump.sql``
|
||||||
|
finds nothing. The slug only materialises when the victim's server
|
||||||
|
runs ``PREPARE … FROM @s2``.
|
||||||
|
|
||||||
|
Because the trip surface is DNS, this generator REQUIRES a non-empty
|
||||||
|
``dns_zone``. The slug must appear as the leftmost label of the
|
||||||
|
hostname so a single DNS query identifies the token; the http_base
|
||||||
|
host is not slug-bearing and can't substitute.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
|
||||||
|
|
||||||
|
def _stable_hex(seed: str, prefix: str = "", length: int = 16) -> str:
    """Return the first ``length`` hex digits of SHA-256(``prefix + seed``).

    A SHA-256 hex digest has 64 characters, so a larger ``length``
    simply returns the whole digest.
    """
    material = (prefix + seed).encode()
    return hashlib.sha256(material).hexdigest()[:length]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_replica_payload(slug: str, dns_zone: str) -> str:
    """Return the SQL statement that is later base64-wrapped.

    The statement assigns ``@bb`` a ``CONCAT(...)`` of three pieces: a
    ``CHANGE REPLICATION SOURCE TO`` prefix ending in an open
    ``SOURCE_USER='<slug>`` value, the server variables
    ``@@lc_time_names`` and ``@@hostname``, and the closing ``';``. The
    victim's locale and hostname are therefore appended to the replica
    user name when the statement is evaluated. ``SOURCE_HOST`` is
    ``<slug>.<dns_zone>``.
    """
    replica_host = f"{slug}.{dns_zone}"
    options = ", ".join((
        "SOURCE_PASSWORD='replica-pw'",
        "SOURCE_RETRY_COUNT=1",
        "SOURCE_PORT=3306",
        f"SOURCE_HOST='{replica_host}'",
        "SOURCE_SSL=0",
        # Left unterminated on purpose: CONCAT appends the server
        # variables and then the closing quote.
        f"SOURCE_USER='{slug}",
    ))
    return (
        'SET @bb = CONCAT("CHANGE REPLICATION SOURCE TO '
        + options
        + '", @@lc_time_names, @@hostname, "\';");'
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_trailer(slug: str, dns_zone: str) -> str:
    """Return the SQL trailer block that is appended to the fake dump.

    The statement from ``_build_replica_payload`` is base64-encoded so
    the slug and host never appear in clear text in the file. The
    trailer decodes it into ``@s2`` and runs it as ``stmt1`` (which
    assigns ``@bb``), then runs ``@bb`` as ``stmt2`` and issues
    ``START REPLICA``.
    """
    payload = _build_replica_payload(slug, dns_zone)
    payload_b64 = base64.b64encode(payload.encode("utf-8")).decode("ascii")
    statements = (
        f"SET @b = '{payload_b64}';",
        "SET @s2 = FROM_BASE64(@b);",
        "PREPARE stmt1 FROM @s2;",
        "EXECUTE stmt1;",
        "PREPARE stmt2 FROM @bb;",
        "EXECUTE stmt2;",
        "START REPLICA;",
    )
    # One statement per line, each line newline-terminated.
    return "".join(f"{statement}\n" for statement in statements)
|
||||||
|
|
||||||
|
|
||||||
|
class MySQLDumpGenerator(CanaryGenerator):
    """Generate a fake ``mysqldump`` file with a replication-callback trailer."""

    name = "mysql_dump"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the dump artifact for *ctx*.

        Raises:
            ValueError: if ``ctx.dns_zone`` is empty; the callback host is
                ``<slug>.<dns_zone>`` and cannot be built without it.
        """
        zone = ctx.dns_zone
        if not zone:
            raise ValueError(
                "mysql_dump requires a non-empty dns_zone — the trip "
                "surface is a DNS lookup of <slug>.<dns_zone>."
            )
        slug = ctx.callback_token
        callback_host = f"{slug}.{zone}"

        # Filler rows are derived from the slug, so the same context always
        # yields byte-identical output (planter idempotency contract).
        u1_hash = _stable_hex(slug, "u1:", 32)
        u2_hash = _stable_hex(slug, "u2:", 32)
        api_token = _stable_hex(slug, "api:", 40)

        # Synthesised SQL bait: this process never executes it. Values are
        # spliced in with str.replace() rather than f-strings so bandit's
        # B608 heuristic does not false-positive on "INSERT INTO" + var.
        users_insert = (
            "INSERT INTO `users` VALUES "  # nosec B608
            "(1,'alice@app.internal','$2y$10${u1a}.{u1b}','2024-11-12 09:13:44'),"
            "(2,'bob@app.internal','$2y$10${u2a}.{u2b}','2025-02-03 17:42:08');\n"
        )
        hash_fields = (
            ("{u1a}", u1_hash[:22]),
            ("{u1b}", u1_hash[22:]),
            ("{u2a}", u2_hash[:22]),
            ("{u2b}", u2_hash[22:]),
        )
        for marker, value in hash_fields:
            users_insert = users_insert.replace(marker, value)

        api_keys_insert = (
            "INSERT INTO `api_keys` VALUES (1,1,'{tok}');\n"  # nosec B608
        ).replace("{tok}", api_token)

        preamble = (
            "-- MySQL dump 10.13 Distrib 8.0.35, for Linux (x86_64)\n"
            "--\n"
            "-- Host: db-prod-01 Database: app_production\n"
            "-- ------------------------------------------------------\n"
            "-- Server version\t8.0.35\n"
            "\n"
            "/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;\n"
            "/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;\n"
            "/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;\n"
            "/*!50503 SET NAMES utf8mb4 */;\n"
            "/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;\n"
            "/*!40103 SET TIME_ZONE='+00:00' */;\n"
            "/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;\n"
            "/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;\n"
            "/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;\n"
            "/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;\n"
            "\n"
        )
        users_table = "".join((
            "--\n"
            "-- Table structure for table `users`\n"
            "--\n"
            "\n"
            "DROP TABLE IF EXISTS `users`;\n"
            "CREATE TABLE `users` (\n"
            " `id` int unsigned NOT NULL AUTO_INCREMENT,\n"
            " `email` varchar(255) NOT NULL,\n"
            " `password_hash` char(60) NOT NULL,\n"
            " `created_at` datetime NOT NULL,\n"
            " PRIMARY KEY (`id`),\n"
            " UNIQUE KEY `uniq_email` (`email`)\n"
            ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;\n"
            "\n"
            "LOCK TABLES `users` WRITE;\n",
            users_insert,
            "UNLOCK TABLES;\n"
            "\n",
        ))
        api_keys_table = "".join((
            "--\n"
            "-- Table structure for table `api_keys`\n"
            "--\n"
            "\n"
            "DROP TABLE IF EXISTS `api_keys`;\n"
            "CREATE TABLE `api_keys` (\n"
            " `id` int unsigned NOT NULL AUTO_INCREMENT,\n"
            " `user_id` int unsigned NOT NULL,\n"
            " `token` char(40) NOT NULL,\n"
            " PRIMARY KEY (`id`)\n"
            ") ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;\n"
            "\n"
            "LOCK TABLES `api_keys` WRITE;\n",
            api_keys_insert,
            "UNLOCK TABLES;\n"
            "\n",
        ))

        # The replication trailer sits between the table data and the usual
        # "restore session variables" footer.
        trailer_replica = _build_trailer(slug, zone)
        trailer_close = (
            "\n"
            "/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;\n"
            "/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;\n"
            "/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;\n"
            "/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;\n"
            "/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;\n"
            "/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;\n"
            "/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;\n"
            "/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;\n"
            "\n"
            "-- Dump completed\n"
        )

        body = "".join(
            (preamble, users_table, api_keys_table, trailer_replica, trailer_close)
        )

        return CanaryArtifact(
            path="",
            content=body.encode("utf-8"),
            mode=0o600,
            mtime_offset=-86400 * 7,  # last week's backup
            generator=self.name,
            notes=[
                f"replica payload phones home to {callback_host}:3306 on import",
                "base64-wrapped PREPARE/EXECUTE block hides the slug from grep",
                "@@hostname and @@lc_time_names smuggled into SOURCE_USER",
            ],
        )
|
||||||
68
decnet/canary/generators/ssh_key.py
Normal file
68
decnet/canary/generators/ssh_key.py
Normal file
@@ -0,0 +1,68 @@
|
|||||||
|
"""Fake SSH private key with the callback host in the comment.
|
||||||
|
|
||||||
|
OpenSSH private keys carry a free-form comment field — typically
|
||||||
|
``user@host`` — that's preserved across rounds of ``ssh-keygen -p``.
|
||||||
|
We embed the canary host as the ``user@host`` so an attacker who
|
||||||
|
imports the key into their own keyring or runs ``ssh-keygen -lf`` on
|
||||||
|
it sees a hostname they may then try to reach.
|
||||||
|
|
||||||
|
The key bytes themselves are syntactically valid (PEM envelope, base64
|
||||||
|
body) but cryptographically junk — the body is a deterministic SHA-256
|
||||||
|
hash of the slug repeated to the right length. We don't ship a real
|
||||||
|
RSA/Ed25519 key because (a) we don't want a real private key sitting
|
||||||
|
on disk pretending to be valuable, and (b) the attacker ``cat``-ing
|
||||||
|
the file or running ``ssh -i`` will trigger the callback regardless
|
||||||
|
of cryptographic validity.
|
||||||
|
|
||||||
|
The DNS-callback variant uses ``<slug>.canary.<dns_zone>`` as the
|
||||||
|
hostname so a bare ``ssh-keygen -lf`` on the file resolves a unique
|
||||||
|
subdomain even if the attacker never hits HTTP.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import base64
|
||||||
|
import hashlib
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryGenerator
|
||||||
|
|
||||||
|
|
||||||
|
def _fake_key_body(seed: str) -> str:
    """Return a deterministic base64 block shaped like an OpenSSH key body.

    The 32-byte SHA-256 digest of *seed* is repeated to 768 bytes, which
    base64-encodes to 1024 characters with no padding. Wrapped at 70
    characters per line, that is 14 full lines plus one 44-character line.
    """
    digest = hashlib.sha256(seed.encode()).digest()
    filler = digest * 24  # 32 bytes * 24 == 768 bytes
    text = base64.b64encode(filler).decode()

    width = 70
    rows = []
    start = 0
    while start < len(text):
        rows.append(text[start:start + width])
        start += width
    return "\n".join(rows)
|
||||||
|
|
||||||
|
|
||||||
|
class SSHKeyGenerator(CanaryGenerator):
    """Generate a fake OpenSSH private key file carrying a callback host."""

    name = "ssh_key"

    def generate(self, ctx: CanaryContext) -> CanaryArtifact:
        """Build the key artifact for *ctx*.

        The trailing ``# deploy@<host>`` line uses ``<slug>.<dns_zone>``
        when a DNS zone is configured. Otherwise it uses the hostname parsed
        from ``ctx.http_base``, or ``deploy.local`` if none can be parsed.
        """
        token = ctx.callback_token
        key_body = _fake_key_body(token)

        if ctx.dns_zone:
            comment_host = f"{token}.{ctx.dns_zone}"
        else:
            from urllib.parse import urlparse

            comment_host = urlparse(ctx.http_base).hostname or "deploy.local"
        host_comment = f"deploy@{comment_host}"

        file_lines = [
            "-----BEGIN OPENSSH PRIVATE KEY-----",
            key_body,
            "-----END OPENSSH PRIVATE KEY-----",
            f"# {host_comment}",
        ]
        content = "\n".join(file_lines) + "\n"

        return CanaryArtifact(
            path="",
            content=content.encode("utf-8"),
            mode=0o600,
            mtime_offset=-86400 * 60,  # 2 months ago
            generator=self.name,
            notes=[f"comment line embeds {host_comment}"],
        )
|
||||||
4
decnet/canary/instrumenters/__init__.py
Normal file
4
decnet/canary/instrumenters/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
"""Built-in canary instrumenters (operator-uploaded artifact mutation).
|
||||||
|
|
||||||
|
Lazy-imported by :func:`decnet.canary.factory.get_instrumenter`.
|
||||||
|
"""
|
||||||
147
decnet/canary/instrumenters/docx.py
Normal file
147
decnet/canary/instrumenters/docx.py
Normal file
@@ -0,0 +1,147 @@
|
|||||||
|
"""DOCX instrumenter — inject a remote image into the body.
|
||||||
|
|
||||||
|
DOCX files are zip archives carrying ``word/document.xml`` (the body)
|
||||||
|
and ``word/_rels/document.xml.rels`` (the relationship table that
|
||||||
|
maps ``rId`` references to URLs). We:
|
||||||
|
|
||||||
|
1. Add a new relationship of type ``image`` whose target is the
|
||||||
|
canary callback URL and ``TargetMode="External"``.
|
||||||
|
2. Add a tiny ``<w:drawing>`` element referencing that ``rId`` at
|
||||||
|
the end of ``word/document.xml`` (just before ``</w:body>``).
|
||||||
|
|
||||||
|
Word and LibreOffice both fetch external image relationships when
|
||||||
|
the document is opened (subject to the user's "trusted source"
|
||||||
|
toggle, which most enterprise environments disable in favour of
|
||||||
|
"warn but allow").
|
||||||
|
|
||||||
|
We use stdlib ``zipfile`` only — no python-docx dependency — because
|
||||||
|
the surface we touch is two small XML files and we don't need any of
|
||||||
|
the higher-level abstractions.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import re
|
||||||
|
import zipfile
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from decnet.canary.base import (
|
||||||
|
CanaryArtifact,
|
||||||
|
CanaryContext,
|
||||||
|
CanaryInstrumenter,
|
||||||
|
InstrumenterRejectedError,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_RELS_END = re.compile(rb"</Relationships\s*>", re.IGNORECASE)
|
||||||
|
_BODY_END = re.compile(rb"</w:body\s*>", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
def _next_rid(rels_xml: bytes) -> str:
    """Return an ``rId<N>`` that is not already used in *rels_xml*.

    Candidates start at ``rId900`` and go up to ``rId9998``, so the chosen
    id sits well above the ids a relationships part normally uses.

    XML allows attribute values in either single or double quotes, with
    optional whitespace around ``=``. The scan accepts all of those forms so
    an existing id is never missed.

    Raises:
        InstrumenterRejectedError: if every candidate id is already taken.
    """
    taken = {
        m.group(1).decode()
        for m in re.finditer(rb"""Id\s*=\s*["'](rId\d+)["']""", rels_xml)
    }
    for n in range(900, 9999):
        candidate = f"rId{n}"
        if candidate not in taken:
            return candidate
    raise InstrumenterRejectedError("DOCX has too many relationships to allocate a new rId")
|
||||||
|
|
||||||
|
|
||||||
|
def _inject_relationship(rels_xml: bytes, rid: str, url: str) -> bytes:
    """Insert an external image relationship just before ``</Relationships>``.

    Args:
        rels_xml: Raw bytes of a ``.rels`` part.
        rid: Relationship id to assign to the new entry.
        url: Target URL. It is XML-escaped before being placed in the
            ``Target`` attribute, so ``&``, ``<``, ``>`` or ``"`` in the URL
            cannot corrupt the relationships part.

    Returns:
        The relationships XML with the new ``<Relationship>`` element added.

    Raises:
        InstrumenterRejectedError: if no closing ``</Relationships>`` tag
            is found.
    """
    from xml.sax.saxutils import escape

    match = _RELS_END.search(rels_xml)
    if not match:
        raise InstrumenterRejectedError(
            "DOCX rels file has no </Relationships>; refusing to mutate"
        )
    # escape() handles &, < and >; the extra entity covers the attribute quote.
    safe_url = escape(url, {'"': "&quot;"})
    rel = (
        f'<Relationship Id="{rid}" '
        f'Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" '
        f'Target="{safe_url}" TargetMode="External"/>'
    ).encode()
    return rels_xml[:match.start()] + rel + rels_xml[match.start():]
|
||||||
|
|
||||||
|
|
||||||
|
def _drawing(rid: str) -> bytes:
    """Return a ``<w:p>`` paragraph holding a linked picture for *rid*.

    The picture's extents are 1 x 1 so it takes up essentially no space.
    The ``<a:blip>`` element references the relationship through
    ``r:link``, i.e. a linked rather than embedded image.
    """
    opening = (
        '<w:p><w:r><w:drawing>'
        '<wp:inline xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing">'
        '<wp:extent cx="1" cy="1"/><wp:docPr id="1" name="canary"/>'
        '<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">'
        '<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">'
        '<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">'
        '<pic:nvPicPr><pic:cNvPr id="1" name="canary"/><pic:cNvPicPr/></pic:nvPicPr>'
        '<pic:blipFill>'
    )
    blip = (
        f'<a:blip xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" r:link="{rid}"/>'
    )
    closing = (
        '<a:stretch><a:fillRect/></a:stretch>'
        '</pic:blipFill>'
        '<pic:spPr><a:xfrm><a:off x="0" y="0"/><a:ext cx="1" cy="1"/></a:xfrm>'
        '<a:prstGeom prst="rect"><a:avLst/></a:prstGeom></pic:spPr>'
        '</pic:pic></a:graphicData></a:graphic></wp:inline>'
        '</w:drawing></w:r></w:p>'
    )
    return "".join((opening, blip, closing)).encode()
|
||||||
|
|
||||||
|
|
||||||
|
def _inject_drawing(document_xml: bytes, rid: str) -> bytes:
    """Insert the canary drawing paragraph just before ``</w:body>``.

    Raises:
        InstrumenterRejectedError: if *document_xml* has no closing
            ``</w:body>`` tag.
    """
    closing = _BODY_END.search(document_xml)
    if closing is None:
        raise InstrumenterRejectedError("DOCX document.xml has no </w:body>")
    cut = closing.start()
    head, tail = document_xml[:cut], document_xml[cut:]
    return head + _drawing(rid) + tail
|
||||||
|
|
||||||
|
|
||||||
|
def _mutate(blob: bytes, url: str) -> Tuple[bytes, str]:
    """Return a copy of the DOCX *blob* with an external image for *url*.

    The relationships part gets a new external image relationship, and the
    document body gets a drawing that references it. Every member is
    written back with its original ``ZipInfo``, so timestamps, per-member
    compression and attributes carry over. Writing by name instead would
    stamp the two modified members with the current time.

    Returns:
        ``(mutated_bytes, rid)``, where *rid* is the relationship id that
        was allocated.

    Raises:
        InstrumenterRejectedError: if *blob* is not a valid zip, lacks one
            of the two required members, or either injection is rejected.
    """
    rels_name = "word/_rels/document.xml.rels"
    doc_name = "word/document.xml"

    try:
        with zipfile.ZipFile(io.BytesIO(blob), "r") as zf:
            try:
                rels = zf.read(rels_name)
                doc = zf.read(doc_name)
            except KeyError as e:
                raise InstrumenterRejectedError(
                    f"DOCX missing expected member: {e.args[0]!r}"
                ) from e
            members = [(zi, zf.read(zi.filename)) for zi in zf.infolist()]
    except zipfile.BadZipFile as e:
        raise InstrumenterRejectedError("uploaded blob is not a valid DOCX zip") from e

    rid = _next_rid(rels)
    replacements = {
        rels_name: _inject_relationship(rels, rid, url),
        doc_name: _inject_drawing(doc, rid),
    }

    out = io.BytesIO()
    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf_out:
        for zi, data in members:
            # Passing the ZipInfo (not the bare name) preserves the member's
            # metadata; writestr() recomputes size and CRC from the data.
            zf_out.writestr(zi, replacements.get(zi.filename, data))
    return out.getvalue(), rid
|
||||||
|
|
||||||
|
|
||||||
|
class DocxInstrumenter(CanaryInstrumenter):
    """Instrument uploaded DOCX files with an external-image callback."""

    name = "docx"
    mime_prefixes = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    )

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return the mutated document as an artifact placed at *target_path*.

        The callback URL is ``<http_base>/c/<callback_token>``, with any
        trailing slash on ``http_base`` removed first.
        """
        base = ctx.http_base.rstrip("/")
        url = f"{base}/c/{ctx.callback_token}"
        mutated, rid = _mutate(blob, url)
        note = f"injected external-image relationship {rid} -> {url}"
        return CanaryArtifact(
            path=target_path,
            content=mutated,
            mode=0o644,
            mtime_offset=-86400 * 14,
            instrumenter=self.name,
            notes=[note],
        )
|
||||||
45
decnet/canary/instrumenters/html.py
Normal file
45
decnet/canary/instrumenters/html.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""HTML instrumenter — append a 1×1 tracking pixel.
|
||||||
|
|
||||||
|
Stdlib-only. We don't parse the HTML; we just inject the ``<img>``
|
||||||
|
tag immediately before the closing ``</body>`` (or, failing that, at
|
||||||
|
the end of the document). Most renderers that support remote images
|
||||||
|
(email previewers, IDE doc previews, browsers) will fetch it as
|
||||||
|
soon as the document is opened.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryInstrumenter
|
||||||
|
|
||||||
|
|
||||||
|
_BODY_CLOSE = re.compile(rb"</body\s*>", re.IGNORECASE)
|
||||||
|
|
||||||
|
|
||||||
|
class HtmlInstrumenter(CanaryInstrumenter):
    """Instrument HTML documents with a hidden 1x1 callback image."""

    name = "html"
    mime_prefixes = ("text/html", "application/xhtml+xml")

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Insert the pixel before the first ``</body>``, else append it.

        When no closing body tag is found, the pixel goes at the end of the
        document, after a newline is added if the blob lacks one.
        """
        src = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        pixel = (
            f'<img src="{src}" width="1" height="1" '
            'alt="" style="display:none">\n'
        ).encode()

        closing = _BODY_CLOSE.search(blob)
        if closing is None:
            terminated = blob if blob.endswith(b"\n") else blob + b"\n"
            out = terminated + pixel
            note = "appended 1x1 pixel (no </body> found)"
        else:
            cut = closing.start()
            out = blob[:cut] + pixel + blob[cut:]
            note = "injected 1x1 pixel before </body>"

        return CanaryArtifact(
            path=target_path,
            content=out,
            mode=0o644,
            mtime_offset=-86400 * 7,
            instrumenter=self.name,
            notes=[note, f"pixel src={src}"],
        )
|
||||||
72
decnet/canary/instrumenters/image.py
Normal file
72
decnet/canary/instrumenters/image.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""Image instrumenter — requires :mod:`PIL` (optional dependency).
|
||||||
|
|
||||||
|
For PNG/JPEG/GIF we append a tEXt/EXIF chunk carrying the slug so
|
||||||
|
``exiftool`` / ``identify -verbose`` surface the slug, then route the
|
||||||
|
detection via a sibling **plain-text companion file**. The image
|
||||||
|
itself can't really embed an HTTP fetcher — image decoders don't
|
||||||
|
run network requests on decode — so the realistic detection surface
|
||||||
|
is "attacker exfils the image, runs metadata tools on it, hits our
|
||||||
|
URL when curious about the embedded marker."
|
||||||
|
|
||||||
|
When Pillow isn't installed we reject and direct the operator to
|
||||||
|
``passthrough`` (which preserves the bytes; the slug then lives in
|
||||||
|
the filename only).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
|
||||||
|
from decnet.canary.base import (
|
||||||
|
CanaryArtifact,
|
||||||
|
CanaryContext,
|
||||||
|
CanaryInstrumenter,
|
||||||
|
InstrumenterRejectedError,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class ImageInstrumenter(CanaryInstrumenter):
    """Re-encode PNG/JPEG/GIF uploads with the callback URL in their metadata."""

    name = "image"
    mime_prefixes = ("image/png", "image/jpeg", "image/gif")

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return a re-encoded copy of *blob* carrying the canary marker.

        * PNG: two text chunks, ``Comment`` (the URL) and ``X-Canary`` (the
          token).
        * JPEG: the URL as the JPEG comment.
        * GIF: the URL as the GIF comment; animated GIFs keep all frames.
        * Any other detected format is re-encoded without a marker, and the
          artifact note says so.

        Raises:
            InstrumenterRejectedError: if Pillow is not installed, or if
                the image cannot be decoded or re-encoded.
        """
        try:
            from PIL import Image, PngImagePlugin  # type: ignore[import-not-found]
        except ImportError as e:
            raise InstrumenterRejectedError(
                "image instrumenter requires Pillow; install it (`pip "
                "install Pillow`) or re-upload the artifact with "
                "kind=passthrough so it ships unmodified."
            ) from e

        slug_url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        embedded_note = (
            f"image metadata carries {slug_url} (slug={ctx.callback_token})"
        )
        try:
            with Image.open(io.BytesIO(blob)) as img:
                fmt = (img.format or "").upper()
                buf_out = io.BytesIO()
                if fmt == "PNG":
                    meta = PngImagePlugin.PngInfo()
                    meta.add_text("Comment", f"reference: {slug_url}")
                    meta.add_text("X-Canary", ctx.callback_token)
                    img.save(buf_out, format="PNG", pnginfo=meta)
                    note = embedded_note
                elif fmt in ("JPEG", "JPG"):
                    # Pillow encodes JPEG comments via the ``comment`` kwarg.
                    img.save(buf_out, format="JPEG", comment=slug_url.encode())
                    note = embedded_note
                elif fmt == "GIF":
                    # Without save_all Pillow writes only the current frame,
                    # which would flatten an animated GIF to a still image.
                    img.save(
                        buf_out,
                        format="GIF",
                        comment=slug_url.encode(),
                        save_all=bool(getattr(img, "is_animated", False)),
                    )
                    note = embedded_note
                else:
                    # Unrecognised format: re-encode as-is and report
                    # truthfully that no marker was embedded.
                    img.save(buf_out, format=fmt or "PNG")
                    note = (
                        f"re-encoded as {fmt or 'PNG'} without metadata; "
                        f"slug={ctx.callback_token} is not embedded"
                    )
            mutated = buf_out.getvalue()
        except Exception as e:
            raise InstrumenterRejectedError(f"failed to instrument image: {e!s}") from e

        return CanaryArtifact(
            path=target_path,
            content=mutated,
            mode=0o644,
            mtime_offset=-86400 * 30,
            instrumenter=self.name,
            notes=[note],
        )
|
||||||
37
decnet/canary/instrumenters/passthrough.py
Normal file
37
decnet/canary/instrumenters/passthrough.py
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
"""Passthrough instrumenter — bytes go to disk unchanged.
|
||||||
|
|
||||||
|
Used as the dispatch fallback for content types we can't safely
|
||||||
|
mutate (random binary blobs, container images, archives we don't
|
||||||
|
recognise). In passthrough mode the only callback surface is the
|
||||||
|
:attr:`CanaryToken.placement_path` itself: the operator must use a
|
||||||
|
DNS-callback token whose slug appears in the filename, so a
|
||||||
|
listing/access at the OS level resolves the slug as part of the
|
||||||
|
path (e.g. ``/etc/<slug>.canary.example.test/secrets.bin``) when
|
||||||
|
the attacker greps for hostnames in their loot.
|
||||||
|
|
||||||
|
The instrumenter does not enforce that — the API does, when it sees
|
||||||
|
``instrumenter=passthrough`` with ``kind=http`` it returns 400.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryInstrumenter
|
||||||
|
|
||||||
|
|
||||||
|
class PassthroughInstrumenter(CanaryInstrumenter):
    """Fallback instrumenter that ships the uploaded bytes unmodified."""

    name = "passthrough"
    mime_prefixes = ()  # dispatched by fallback in pick_instrumenter_for_mime

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Wrap *blob* unchanged in an artifact at *target_path*.

        *ctx* is not consulted; nothing is embedded in the content.
        """
        note = (
            "passthrough: bytes unchanged — only DNS-callback tokens "
            "trip detection (slug must live in the placement path)"
        )
        return CanaryArtifact(
            path=target_path,
            content=blob,
            mode=0o644,
            mtime_offset=-86400 * 7,
            instrumenter=self.name,
            notes=[note],
        )
|
||||||
76
decnet/canary/instrumenters/pdf.py
Normal file
76
decnet/canary/instrumenters/pdf.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
"""PDF instrumenter — requires :mod:`pikepdf` (optional dependency).
|
||||||
|
|
||||||
|
PDF embedding is non-trivial: the cleanest place to put a callback
|
||||||
|
is an ``/AA`` (additional actions) ``/O`` (open) entry on the
|
||||||
|
catalog or a ``/URI`` action on a link annotation. Either path
|
||||||
|
needs proper xref-table updates — pikepdf handles that for us.
|
||||||
|
|
||||||
|
If pikepdf isn't available in the environment the instrumenter
|
||||||
|
raises :class:`InstrumenterRejectedError` so the API can return a
|
||||||
|
clear 400 directing the operator to either install pikepdf or
|
||||||
|
re-upload as ``passthrough``.
|
||||||
|
|
||||||
|
We don't ship a stdlib fallback because every "naive" PDF mutation
|
||||||
|
I'm aware of (appending raw bytes, splicing into the trailer, etc.)
|
||||||
|
breaks the document's xref table and trips a "file is corrupt"
|
||||||
|
warning in modern viewers — which the attacker will absolutely
|
||||||
|
notice.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.canary.base import (
|
||||||
|
CanaryArtifact,
|
||||||
|
CanaryContext,
|
||||||
|
CanaryInstrumenter,
|
||||||
|
InstrumenterRejectedError,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class PdfInstrumenter(CanaryInstrumenter):
    """Instrument PDFs by setting a ``/URI`` action as the ``/OpenAction``."""

    name = "pdf"
    mime_prefixes = ("application/pdf",)

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return *blob* re-saved with an open action pointing at the callback.

        Any ``/OpenAction`` already on the document catalog is replaced.

        Raises:
            InstrumenterRejectedError: if pikepdf is not installed, or if
                opening, modifying or saving the PDF fails.
        """
        try:
            import pikepdf  # type: ignore[import-not-found]
        except ImportError as e:
            raise InstrumenterRejectedError(
                "PDF instrumenter requires pikepdf; install it (`pip "
                "install pikepdf`) or re-upload the artifact with "
                "kind=passthrough so it ships unmodified."
            ) from e
        import io

        url = f"{ctx.http_base.rstrip('/')}/c/{ctx.callback_token}"
        try:
            source = io.BytesIO(blob)
            sink = io.BytesIO()
            with pikepdf.open(source) as pdf:
                # A URI action on document open: viewers that prompt before
                # fetching still surface the URL; auto-allow viewers fetch
                # it silently.
                pdf.Root[pikepdf.Name("/OpenAction")] = pikepdf.Dictionary(
                    Type=pikepdf.Name("/Action"),
                    S=pikepdf.Name("/URI"),
                    URI=pikepdf.String(url),
                )
                pdf.save(sink)
            mutated = sink.getvalue()
        except Exception as e:
            raise InstrumenterRejectedError(
                f"failed to instrument PDF: {e!s}"
            ) from e

        return CanaryArtifact(
            path=target_path,
            content=mutated,
            mode=0o644,
            mtime_offset=-86400 * 14,
            instrumenter=self.name,
            notes=[f"installed /OpenAction /URI -> {url}"],
        )
|
||||||
79
decnet/canary/instrumenters/plain.py
Normal file
79
decnet/canary/instrumenters/plain.py
Normal file
@@ -0,0 +1,79 @@
|
|||||||
|
"""Plain-text / config-file instrumenter.
|
||||||
|
|
||||||
|
Two embedding strategies, picked in order:
|
||||||
|
|
||||||
|
1. **Token substitution.** If the blob contains the literal
|
||||||
|
placeholder ``{{CANARY_URL}}`` or ``{{CANARY_HOST}}``, replace it.
|
||||||
|
This gives operators full control over where the slug lands —
|
||||||
|
they can pre-edit the file with placeholders before uploading.
|
||||||
|
2. **Append.** Otherwise, append a comment line that mentions the
|
||||||
|
callback URL. The comment style adapts to the file's apparent
|
||||||
|
syntax (``#`` for shell/yaml/python/dockerfile, ``//`` for json5/
|
||||||
|
javascript-ish, ``;`` for ini).
|
||||||
|
|
||||||
|
Operators who want neither behavior should upload the file as
|
||||||
|
``passthrough``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext, CanaryInstrumenter
|
||||||
|
|
||||||
|
|
||||||
|
# Byte sequences suggesting a "//"-commented (JavaScript-like) file.
_SLASH_HINTS = (b"//", b"function ", b"const ", b"let ", b"var ")
# Byte sequences suggesting an INI-style file with ";" comments.
_SEMI_HINTS = (b"[default]", b"[section]", b"\n[")


def _comment_prefix(blob: bytes) -> bytes:
    """Guess the line-comment prefix for *blob* from its first 512 bytes.

    Returns ``b"; "`` when an INI-style hint is present, ``b"// "`` when a
    JavaScript-style hint is present, and ``b"# "`` otherwise.

    URL scheme separators (``https://``) are neutralised before looking for
    ``//``. Without that, a YAML or shell file that merely contains a URL
    would be mistaken for a ``//``-commented file.
    """
    head = blob[:512]
    if any(h in head for h in _SEMI_HINTS):
        return b"; "
    # "://" -> ":" removes the double slash of URLs without joining the
    # neighbouring text into something that could match a keyword hint.
    code_like = head.replace(b"://", b":")
    if any(h in code_like for h in _SLASH_HINTS):
        return b"// "
    # Default to # — the most common comment glyph across config files
    # we'd plausibly canary.
    return b"# "
|
||||||
|
|
||||||
|
|
||||||
|
class PlainInstrumenter(CanaryInstrumenter):
    """Instrument text/config files by placeholder substitution or a comment."""

    name = "plain"
    mime_prefixes = ("text/", "application/json", "application/yaml", "application/toml")

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Embed the callback into *blob*.

        ``{{CANARY_URL}}`` is replaced with the callback URL. When a DNS
        zone is configured, ``{{CANARY_HOST}}`` is replaced with
        ``<token>.<dns_zone>``. If neither substitution happened, a comment
        line mentioning the callback URL is appended instead.
        """
        token = ctx.callback_token
        callback_url = f"{ctx.http_base.rstrip('/')}/c/{token}".encode()
        if ctx.dns_zone:
            callback_host = f"{token}.{ctx.dns_zone}".encode()
        else:
            callback_host = b""

        substitutions = (
            (b"{{CANARY_URL}}", callback_url),
            (b"{{CANARY_HOST}}", callback_host),
        )
        notes: list[str] = []
        out = blob
        for marker, value in substitutions:
            # An empty value (no DNS zone) leaves that placeholder untouched.
            if value and marker in blob:
                out = out.replace(marker, value)
                notes.append(f"substituted {marker.decode()} -> {value.decode()}")

        if not notes:
            # No placeholders — append a comment line at the end.
            if not out.endswith(b"\n"):
                out = out + b"\n"
            comment_line = b"".join((
                b"\n",
                _comment_prefix(blob),
                b"see ",
                callback_url,
                b" for the latest version\n",
            ))
            out = out + comment_line
            notes.append(
                f"appended comment line carrying {callback_url.decode()}"
            )

        return CanaryArtifact(
            path=target_path,
            content=out,
            mode=0o644,
            mtime_offset=-86400 * 7,
            instrumenter=self.name,
            notes=notes,
        )
|
||||||
95
decnet/canary/instrumenters/xlsx.py
Normal file
95
decnet/canary/instrumenters/xlsx.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""XLSX instrumenter — embed an external-image link.
|
||||||
|
|
||||||
|
XLSX is structurally identical to DOCX (Office Open XML zip). The
|
||||||
|
injection target is the workbook's relationships file
|
||||||
|
(``xl/_rels/workbook.xml.rels``). We add an external image
|
||||||
|
relationship there; Excel/LibreOffice fetch external images on
|
||||||
|
workbook open in the same way Word does.
|
||||||
|
|
||||||
|
We don't inject a ``<drawing>`` element into a sheet because that
|
||||||
|
requires touching ``xl/worksheets/sheetN.xml`` *and* allocating a new
|
||||||
|
``xl/drawings/drawingN.xml`` part — much higher chance of mangling
|
||||||
|
the file. An orphan external image relationship is enough: many
|
||||||
|
Office viewers fetch all relationships at open time regardless of
|
||||||
|
whether they're referenced from a sheet.
|
||||||
|
|
||||||
|
If the operator wants a stronger trigger (image visible in the
|
||||||
|
sheet, fetched even by viewers that lazy-load external resources)
|
||||||
|
they should embed the slug as a hyperlink cell content via the
|
||||||
|
``plain``/``passthrough`` instrumenters.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import io
|
||||||
|
import zipfile
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
from decnet.canary.base import (
|
||||||
|
CanaryArtifact,
|
||||||
|
CanaryContext,
|
||||||
|
CanaryInstrumenter,
|
||||||
|
InstrumenterRejectedError,
|
||||||
|
)
|
||||||
|
from decnet.canary.instrumenters.docx import _inject_relationship, _next_rid
|
||||||
|
|
||||||
|
|
||||||
|
_RELS_PATHS = (
|
||||||
|
"xl/_rels/workbook.xml.rels",
|
||||||
|
"xl/_rels/sharedStrings.xml.rels",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _mutate(blob: bytes, url: str) -> Tuple[bytes, str, str]:
    """Return a copy of the XLSX *blob* with an external image relationship.

    The relationships file to modify is the first entry of ``_RELS_PATHS``
    that exists in the archive. The tuple order is the priority, regardless
    of the order in which members are stored in the zip.

    Returns:
        ``(mutated_bytes, rid, rels_path)``: the new archive, the allocated
        relationship id, and the member that was modified.

    Raises:
        InstrumenterRejectedError: if *blob* is not a valid zip, none of
            the candidate relationships files is present, or the injection
            is rejected.
    """
    try:
        with zipfile.ZipFile(io.BytesIO(blob), "r") as zf:
            members = [(zi, zf.read(zi.filename)) for zi in zf.infolist()]
    except zipfile.BadZipFile as e:
        raise InstrumenterRejectedError("uploaded blob is not a valid XLSX zip") from e

    present = {zi.filename for zi, _ in members}
    target_rels = next((path for path in _RELS_PATHS if path in present), None)
    if target_rels is None:
        raise InstrumenterRejectedError(
            "XLSX has no workbook relationships file to mutate"
        )

    rid = ""
    out_members = []
    for zi, data in members:
        if zi.filename == target_rels:
            rid = _next_rid(data)
            data = _inject_relationship(data, rid, url)
        out_members.append((zi, data))

    out = io.BytesIO()
    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zf_out:
        for zi, data in out_members:
            # Reusing each ZipInfo keeps the member's original metadata.
            zf_out.writestr(zi, data)
    return out.getvalue(), rid, target_rels
|
||||||
|
|
||||||
|
|
||||||
|
class XlsxInstrumenter(CanaryInstrumenter):
    """Instrumenter for operator-uploaded ``.xlsx`` workbooks.

    Builds the callback URL from the context and delegates the zip
    surgery to :func:`_mutate`.
    """

    name = "xlsx"
    mime_prefixes = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
    )

    def instrument(
        self, blob: bytes, ctx: CanaryContext, *, target_path: str,
    ) -> CanaryArtifact:
        """Return a :class:`CanaryArtifact` wrapping the mutated workbook.

        The callback URL is ``<http_base>/c/<callback_token>`` with any
        trailing slashes stripped from the base. Errors raised by
        :func:`_mutate` propagate to the caller.
        """
        base = ctx.http_base.rstrip("/")
        url = f"{base}/c/{ctx.callback_token}"
        mutated, rid, target_rels = _mutate(blob, url)
        note = (
            f"injected external-image relationship {rid} into {target_rels} -> {url}"
        )
        return CanaryArtifact(
            path=target_path,
            content=mutated,
            mode=0o644,
            # Backdate by 14 days (offset is in seconds).
            mtime_offset=-86400 * 14,
            instrumenter=self.name,
            notes=[note],
        )
|
||||||
82
decnet/canary/paths.py
Normal file
82
decnet/canary/paths.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
"""Persona-aware path resolution for canary artifacts.
|
||||||
|
|
||||||
|
Linux-persona deckies use POSIX-shaped paths under ``/home/<user>``.
|
||||||
|
"Windows" personas (still Linux containers under the hood — see
|
||||||
|
:mod:`decnet.archetypes`) use Windows-shaped paths under
|
||||||
|
``/home/<user>/AppData/...`` so an attacker browsing the filesystem
|
||||||
|
through a planted RDP/SMB session sees the right shape.
|
||||||
|
|
||||||
|
The persona lookup is best-effort: callers pass the
|
||||||
|
:attr:`decnet.archetypes.Archetype.nmap_os` value (``"linux"`` or
|
||||||
|
``"windows"``); unknown personas fall through to ``"linux"``.
|
||||||
|
Operators can always override by passing an explicit
|
||||||
|
``placement_path`` when creating a token.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
DEFAULT_LINUX_USER = "admin"
|
||||||
|
DEFAULT_WINDOWS_USER = "Administrator"
|
||||||
|
|
||||||
|
# Canonical placements for the synthesizer-driven baseline tokens.
|
||||||
|
# Operators can override per-token via the API, but these are the
|
||||||
|
# defaults the deploy-time seed uses.
|
||||||
|
_LINUX_DEFAULTS: dict[str, str] = {
|
||||||
|
"git_config": "/home/{user}/.git/config",
|
||||||
|
"env_file": "/home/{user}/.env",
|
||||||
|
"ssh_key": "/home/{user}/.ssh/id_rsa",
|
||||||
|
"aws_creds": "/home/{user}/.aws/credentials",
|
||||||
|
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
|
||||||
|
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
|
||||||
|
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
|
||||||
|
}
|
||||||
|
|
||||||
|
_WINDOWS_DEFAULTS: dict[str, str] = {
|
||||||
|
"git_config": "/home/{user}/AppData/Local/Programs/Git/etc/gitconfig",
|
||||||
|
"env_file": "/home/{user}/Desktop/prod.env",
|
||||||
|
"ssh_key": "/home/{user}/.ssh/id_rsa", # OpenSSH on Windows uses the same path
|
||||||
|
"aws_creds": "/home/{user}/.aws/credentials",
|
||||||
|
"honeydoc": "/home/{user}/Documents/quarterly_report.html",
|
||||||
|
"honeydoc_docx": "/home/{user}/Documents/quarterly_report.docx",
|
||||||
|
"honeydoc_pdf": "/home/{user}/Documents/quarterly_report.pdf",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def default_user(persona: str) -> str:
    """Return the default account name used in placement paths.

    ``"windows"`` maps to ``DEFAULT_WINDOWS_USER``; every other value
    (including unknown personas) maps to ``DEFAULT_LINUX_USER``.
    """
    if persona == "windows":
        return DEFAULT_WINDOWS_USER
    return DEFAULT_LINUX_USER
|
||||||
|
|
||||||
|
|
||||||
|
def default_path_for(generator: str, persona: str = "linux") -> str:
    """Return the default placement path for *generator* under *persona*.

    The Windows table is used only when ``persona == "windows"``; any
    other persona uses the Linux table. The ``{user}`` placeholder in
    the template is filled with :func:`default_user`. A generator
    missing from the table yields ``/tmp/<generator>.canary`` instead of
    raising, so the caller always gets a path.
    """
    defaults = _WINDOWS_DEFAULTS if persona == "windows" else _LINUX_DEFAULTS
    template = defaults.get(generator)
    if template:
        return template.format(user=default_user(persona))
    # Defensive fallback: the original comment states the API rejects
    # unknown generators upstream, so this should not normally be hit.
    return f"/tmp/{generator}.canary"  # nosec B108 — placement inside attacker-facing decoy container, not host /tmp
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_placement(path: str) -> str:
    """Validate an operator-supplied placement path.

    Rejects, in order: empty or relative paths, paths containing a NUL
    byte, paths containing ``\\n`` or ``\\r``, and paths with a ``..``
    path *segment*. A name that merely ends in two dots (for example
    ``/srv/archive../x``) is not a traversal segment and is accepted.

    Other shell metacharacters are not rejected here; callers that
    build shell commands from the path must quote it themselves.

    Returns *path* unchanged when valid.

    Raises :class:`ValueError` with a human-readable message otherwise,
    so the API layer can turn it into a client error.
    """
    if not path or not path.startswith("/"):
        raise ValueError("placement_path must be absolute (start with '/')")
    if "\x00" in path:
        raise ValueError("placement_path may not contain NUL")
    if "\n" in path or "\r" in path:
        raise ValueError("placement_path may not contain newlines")
    # Compare whole segments: a substring test for "../" would also
    # reject harmless names such as "archive../".
    if ".." in path.split("/"):
        raise ValueError("placement_path may not contain '..' segments")
    return path
|
||||||
301
decnet/canary/planter.py
Normal file
301
decnet/canary/planter.py
Normal file
@@ -0,0 +1,301 @@
|
|||||||
|
"""Plant / revoke canary artifacts inside running decky containers.
|
||||||
|
|
||||||
|
Single entry point per operation:
|
||||||
|
|
||||||
|
* :func:`plant` writes a :class:`CanaryArtifact` into one decky's
|
||||||
|
filesystem via ``docker exec`` (mirroring the SSH driver's
|
||||||
|
``_run_file`` pattern), backdates the mtime, sets the requested
|
||||||
|
mode, and publishes ``canary.{token_id}.placed`` on the bus.
|
||||||
|
* :func:`revoke` unlinks the file (best-effort) and publishes
|
||||||
|
``canary.{token_id}.revoked``.
|
||||||
|
* :func:`seed_baseline` is the deploy-hook helper: synthesises the
|
||||||
|
configured baseline set for one decky, persists rows, plants each.
|
||||||
|
Failures are logged but do **not** abort the deploy (the deployer
|
||||||
|
hook calls this best-effort).
|
||||||
|
|
||||||
|
We don't reuse :class:`SSHDriver` directly because the orchestrator
|
||||||
|
driver is tied to its action types (``FileAction`` carries str
|
||||||
|
content; canary content is bytes). The planter takes the same
|
||||||
|
shape but speaks bytes-via-base64 over the wire.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
import shlex
|
||||||
|
import time
|
||||||
|
from secrets import token_urlsafe
|
||||||
|
from typing import Any, Iterable, Optional
|
||||||
|
|
||||||
|
from decnet.bus import topics
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.canary.base import CanaryArtifact, CanaryContext
|
||||||
|
from decnet.canary.factory import get_generator
|
||||||
|
from decnet.canary.paths import default_path_for
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("canary.planter")
|
||||||
|
|
||||||
|
_DOCKER = "docker"
|
||||||
|
_TIMEOUT = 8.0
|
||||||
|
# Container suffix — matches the orchestrator SSH driver's convention
|
||||||
|
# (``<decky_name>-ssh``). Canary placement always happens through the
|
||||||
|
# ssh container because every decky has one and it carries the most
|
||||||
|
# realistic filesystem layout.
|
||||||
|
_SSH_CONTAINER_SUFFIX = "-ssh"
|
||||||
|
|
||||||
|
|
||||||
|
def _container_for(decky_name: str) -> str:
|
||||||
|
return f"{decky_name}{_SSH_CONTAINER_SUFFIX}"
|
||||||
|
|
||||||
|
|
||||||
|
def _dirname(path: str) -> str:
|
||||||
|
idx = path.rfind("/")
|
||||||
|
if idx <= 0:
|
||||||
|
return "/"
|
||||||
|
return path[:idx]
|
||||||
|
|
||||||
|
|
||||||
|
async def _run(
    argv: list[str], *, stdin_bytes: Optional[bytes] = None,
) -> tuple[int, str, str]:
    """Run *argv* as a subprocess and capture its output.

    :param argv: Program and arguments, executed without a shell.
    :param stdin_bytes: When not ``None``, a stdin pipe is attached and
        these bytes are fed to the child.
    :returns: ``(returncode, stdout, stderr)`` with both streams decoded
        as UTF-8 (undecodable bytes replaced). Two synthetic codes are
        used: ``127`` when the executable cannot be found and ``124``
        when the child exceeds ``_TIMEOUT`` seconds.

    This function does not raise for a missing binary or a timeout;
    both are reported through the return value.
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            *argv,
            stdin=asyncio.subprocess.PIPE if stdin_bytes is not None else None,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as exc:
        return 127, "", f"argv[0] not found: {exc}"
    try:
        stdout, stderr = await asyncio.wait_for(
            proc.communicate(input=stdin_bytes), timeout=_TIMEOUT,
        )
    except asyncio.TimeoutError:
        try:
            proc.kill()
        except ProcessLookupError:
            # Child exited between the timeout firing and the kill.
            pass
        # Reap the child so it does not linger as a zombie and its
        # pipe transports are released before we return.
        await proc.wait()
        return 124, "", "timeout"
    return (
        proc.returncode if proc.returncode is not None else -1,
        stdout.decode("utf-8", "replace"),
        stderr.decode("utf-8", "replace"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_plant_command(artifact: CanaryArtifact) -> tuple[str, bytes]:
    """Build the shell script and stdin payload that plant *artifact*.

    The content is base64-encoded here and returned as the second
    element so the caller can stream it over stdin; the script decodes
    it with ``base64 -d``, which keeps the bytes out of argv. The
    script chains four steps with ``&&``: create the parent directory,
    decode into the target path, ``chmod`` to the octal form of
    ``artifact.mode``, then ``touch -d @<ts>`` where ``<ts>`` is the
    current time plus ``artifact.mtime_offset``. Every path is passed
    through :func:`shlex.quote`.

    Returns ``(script, base64_payload)``.
    """
    payload = base64.b64encode(artifact.content)
    stamp = int(time.time() + artifact.mtime_offset)
    octal_mode = oct(artifact.mode)[2:]  # drop the "0o" prefix
    dest = shlex.quote(artifact.path)
    parent = shlex.quote(_dirname(artifact.path))
    script = " && ".join((
        f"mkdir -p {parent}",
        f"base64 -d > {dest}",
        f"chmod {octal_mode} {dest}",
        f"touch -d @{stamp} {dest}",
    ))
    return script, payload
|
||||||
|
|
||||||
|
|
||||||
|
async def _publish(
|
||||||
|
bus: Optional[BaseBus], topic: str, payload: dict[str, Any],
|
||||||
|
) -> None:
|
||||||
|
"""Best-effort publish — never raises.
|
||||||
|
|
||||||
|
When ``bus`` is None we resolve via :func:`get_bus`; either way
|
||||||
|
bus-side failures are logged and swallowed (delivery is at-most-once
|
||||||
|
by contract; the DB row is source of truth).
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
owns_bus = bus is None
|
||||||
|
target = bus if bus is not None else get_bus()
|
||||||
|
if owns_bus:
|
||||||
|
await target.connect()
|
||||||
|
await target.publish(topic, payload)
|
||||||
|
if owns_bus:
|
||||||
|
await target.close()
|
||||||
|
except Exception as e: # noqa: BLE001
|
||||||
|
log.warning("canary bus publish failed topic=%s err=%s", topic, e)
|
||||||
|
|
||||||
|
|
||||||
|
async def plant(
    decky_name: str,
    artifact: CanaryArtifact,
    *,
    token_uuid: str,
    repo: Optional[BaseRepository] = None,
    publish: bool = True,
    bus: Optional[BaseBus] = None,
) -> tuple[bool, Optional[str]]:
    """Write *artifact* into the decky's ssh container via ``docker exec``.

    :param decky_name: Decky whose ``<name>-ssh`` container is targeted.
    :param artifact: Artifact to write; ``artifact.path`` must be
        non-empty.
    :param token_uuid: Token row to update and to key the bus topic on.
    :param repo: When given, the token state is set to ``planted`` on
        success or ``failed`` (with the error text) otherwise.
    :param publish: When true, a ``CANARY_PLACED`` event is published
        after a successful write.
    :param bus: Optional bus forwarded to :func:`_publish`.
    :returns: ``(success, error_or_none)``. The error is the first 256
        characters of stripped stderr, or ``rc=<code>`` when stderr is
        empty.

    Docker failures are reported through the return value rather than
    raised.
    """
    if not artifact.path:
        err = "planter requires a non-empty artifact.path"
        log.warning("canary.plant skipped: %s decky=%s token=%s", err, decky_name, token_uuid)
        if repo is not None:
            await repo.update_canary_token_state(token_uuid, "failed", err)
        return False, err

    script, encoded = _build_plant_command(artifact)
    # ``-i`` keeps stdin attached so base64 -d inside the container can
    # consume the encoded payload streamed from the host.
    command = [_DOCKER, "exec", "-i", _container_for(decky_name), "sh", "-c", script]
    rc, _stdout, stderr = await _run(command, stdin_bytes=encoded)

    if rc == 0:
        if repo is not None:
            await repo.update_canary_token_state(token_uuid, "planted", None)
        if publish:
            await _publish(bus, topics.canary(token_uuid, topics.CANARY_PLACED), {
                "token_id": token_uuid,
                "decky_name": decky_name,
                "placement_path": artifact.path,
                "instrumenter": artifact.instrumenter,
                "generator": artifact.generator,
            })
        return True, None

    error = stderr.strip()[:256] or f"rc={rc}"
    if repo is not None:
        await repo.update_canary_token_state(token_uuid, "failed", error)
    log.warning(
        "canary.plant failed decky=%s token=%s rc=%d stderr=%r",
        decky_name, token_uuid, rc, stderr[:120],
    )
    return False, error
|
||||||
|
|
||||||
|
|
||||||
|
async def revoke(
    decky_name: str,
    placement_path: str,
    *,
    token_uuid: str,
    repo: Optional[BaseRepository] = None,
    publish: bool = True,
    bus: Optional[BaseBus] = None,
) -> tuple[bool, Optional[str]]:
    """Remove a planted file with ``rm -f`` and record the revocation.

    :returns: ``(success, error_or_none)`` where ``success`` means the
        ``docker exec`` call exited with status 0. ``rm -f`` also exits
        0 for a file that is already gone.

    The token row (when *repo* is given) is moved to ``revoked`` and
    the ``CANARY_REVOKED`` event (when *publish* is true) is emitted
    regardless of whether the removal succeeded. On failure the error
    text is stored with the state.
    """
    command = [
        _DOCKER, "exec", _container_for(decky_name),
        "sh", "-c", f"rm -f {shlex.quote(placement_path)}",
    ]
    rc, _stdout, stderr = await _run(command)
    removed = rc == 0
    error = None if removed else (stderr.strip()[:256] or f"rc={rc}")

    if repo is not None:
        # ``error`` is already None on success, so it can be passed as-is.
        await repo.update_canary_token_state(token_uuid, "revoked", error)

    if publish:
        event = {
            "token_id": token_uuid,
            "decky_name": decky_name,
            "placement_path": placement_path,
        }
        await _publish(bus, topics.canary(token_uuid, topics.CANARY_REVOKED), event)

    return removed, error
|
||||||
|
|
||||||
|
|
||||||
|
def _baseline_set() -> Iterable[str]:
|
||||||
|
"""Return the configured baseline generator names.
|
||||||
|
|
||||||
|
Honors ``DECNET_CANARY_BASELINE`` (comma-separated). Default is
|
||||||
|
a sensible mix that exercises every callback-bearing generator
|
||||||
|
plus a passive aws_creds drop for realism.
|
||||||
|
"""
|
||||||
|
raw = os.environ.get(
|
||||||
|
"DECNET_CANARY_BASELINE",
|
||||||
|
"git_config,env_file,honeydoc,aws_creds",
|
||||||
|
)
|
||||||
|
return [n.strip() for n in raw.split(",") if n.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _ctx_for(slug: str) -> CanaryContext:
    """Build the :class:`CanaryContext` for *slug* from the environment.

    ``DECNET_CANARY_HTTP_BASE`` defaults to ``http://localhost:8088``
    and ``DECNET_CANARY_DNS_ZONE`` defaults to the empty string.
    """
    env = os.environ
    return CanaryContext(
        callback_token=slug,
        http_base=env.get("DECNET_CANARY_HTTP_BASE", "http://localhost:8088"),
        dns_zone=env.get("DECNET_CANARY_DNS_ZONE", ""),
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def seed_baseline(
    decky_name: str,
    repo: BaseRepository,
    *,
    persona: str = "linux",
    created_by: str = "system",
    bus: Optional[BaseBus] = None,
) -> list[dict[str, Any]]:
    """Create and plant the configured baseline canaries on one decky.

    For every name from :func:`_baseline_set` a fresh callback slug and
    token UUID are generated, the artifact is synthesised, a token row
    is persisted and :func:`plant` is invoked. Generator names that
    :func:`get_generator` rejects with :class:`ValueError` are logged
    and skipped.

    :returns: One summary dict per token row created (``token_uuid``,
        ``generator``, ``kind``, ``callback_token``,
        ``placement_path``), whether or not planting succeeded.
    """
    from uuid import uuid4

    seeded: list[dict[str, Any]] = []
    for gen_name in _baseline_set():
        try:
            generator = get_generator(gen_name)
        except ValueError:
            log.warning("canary.seed_baseline: unknown generator %r — skipping", gen_name)
            continue

        slug = token_urlsafe(16)
        artifact = generator.generate(_ctx_for(slug))
        artifact.path = default_path_for(gen_name, persona)
        if gen_name == "aws_creds":
            kind = "aws_passive"
        else:
            kind = "http"
        token_uuid = str(uuid4())

        # Persist before planting so plant() has a row to update.
        # NOTE(review): the row starts as "planted", so a crash between
        # this insert and plant() finishing would leave it "planted",
        # not "failed" — confirm that is acceptable.
        row = {
            "uuid": token_uuid,
            "kind": kind,
            "decky_name": decky_name,
            "blob_uuid": None,
            "instrumenter": None,
            "generator": gen_name,
            "placement_path": artifact.path,
            "callback_token": slug,
            "secret_seed": slug,
            "created_by": created_by,
            "state": "planted",  # optimistic — plant() flips to failed on error
        }
        await repo.create_canary_token(row)
        await plant(
            decky_name, artifact,
            token_uuid=token_uuid, repo=repo, publish=True, bus=bus,
        )
        seeded.append({
            "token_uuid": token_uuid,
            "generator": gen_name,
            "kind": kind,
            "callback_token": slug,
            "placement_path": artifact.path,
        })
    return seeded
|
||||||
89
decnet/canary/storage.py
Normal file
89
decnet/canary/storage.py
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
"""Filesystem store for operator-uploaded canary blobs.
|
||||||
|
|
||||||
|
Blobs live under ``/var/lib/decnet/canary/blobs/<sha256>`` (override
|
||||||
|
via ``DECNET_CANARY_BLOB_DIR``) and are deduplicated by content hash.
|
||||||
|
The DB table :class:`decnet.web.db.models.CanaryBlob` mirrors
|
||||||
|
metadata; the bytes are read on demand at instrumentation time, so
|
||||||
|
the API process never holds large operator uploads in memory longer
|
||||||
|
than the request itself.
|
||||||
|
|
||||||
|
Refcount-aware deletion is enforced at the DB layer (see
|
||||||
|
:meth:`decnet.web.db.repository.BaseRepository.delete_canary_blob`);
|
||||||
|
this module only provides write/read/unlink primitives keyed by
|
||||||
|
sha256.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Tuple
|
||||||
|
|
||||||
|
|
||||||
|
def blob_dir() -> Path:
    """Return the root directory under which canary blobs are stored.

    Taken from ``DECNET_CANARY_BLOB_DIR`` when set, otherwise
    ``/var/lib/decnet/canary/blobs``. The directory is not created
    here.
    """
    return Path(os.getenv("DECNET_CANARY_BLOB_DIR", "/var/lib/decnet/canary/blobs"))
|
||||||
|
|
||||||
|
|
||||||
|
def _path_for(sha256: str) -> Path:
    """Map a digest to ``<blob_dir>/<d[0:2]>/<d[2:4]>/<d>``.

    The two-level fan-out keeps any single directory from collecting
    every blob. Only the length is checked here.

    Raises :class:`ValueError` when *sha256* is shorter than 4
    characters.
    """
    # NOTE(review): the value is not checked to be hex; presumably
    # callers only pass digests produced by write_blob — confirm.
    if len(sha256) < 4:
        raise ValueError("sha256 must be at least 4 chars")
    outer, inner = sha256[:2], sha256[2:4]
    return blob_dir().joinpath(outer, inner, sha256)
|
||||||
|
|
||||||
|
|
||||||
|
def write_blob(content: bytes) -> Tuple[str, Path, int]:
    """Persist *content* under its sha256-derived path.

    Idempotent by digest: when a file already exists at the target
    path nothing is written (its bytes are not compared). Otherwise
    the content goes to a uniquely named ``.part`` sibling and is moved
    into place with :func:`os.replace`, so readers never see a
    half-written blob and concurrent writers of the same content cannot
    clobber each other's temp file.

    :returns: ``(sha256_hex, path, size_bytes)``.
    :raises OSError: when the directory or file cannot be written. The
        temp file is removed before the error propagates.
    """
    sha = hashlib.sha256(content).hexdigest()
    target = _path_for(sha)
    target.parent.mkdir(parents=True, exist_ok=True)
    if not target.exists():
        # A per-writer suffix avoids sharing one "<sha>.part" between
        # concurrent uploads of the same bytes.
        tmp = target.with_name(
            f"{target.name}.{os.getpid()}.{os.urandom(4).hex()}.part"
        )
        try:
            # "x" fails instead of silently reusing an existing file.
            with open(tmp, "xb") as handle:
                handle.write(content)
            os.replace(tmp, target)
        except BaseException:
            try:
                os.unlink(tmp)
            except FileNotFoundError:
                pass
            raise
    return sha, target, len(content)
|
||||||
|
|
||||||
|
|
||||||
|
def read_blob(sha256: str) -> bytes:
    """Return the stored bytes for the blob keyed by *sha256*.

    Raises :class:`FileNotFoundError` when no file exists at the
    derived path (for example after the store was pruned by hand), and
    :class:`ValueError` when the digest is too short for
    :func:`_path_for`.
    """
    location = _path_for(sha256)
    return location.read_bytes()
|
||||||
|
|
||||||
|
|
||||||
|
def unlink_blob(sha256: str) -> bool:
    """Delete the on-disk bytes for *sha256*.

    Returns ``True`` when a file was removed and ``False`` when it was
    already absent. Only the file is touched; the matching database row
    is handled elsewhere.
    """
    location = _path_for(sha256)
    try:
        location.unlink()
    except FileNotFoundError:
        return False
    else:
        return True
|
||||||
254
decnet/canary/worker.py
Normal file
254
decnet/canary/worker.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
"""``decnet canary`` worker — HTTP + DNS callback receivers.
|
||||||
|
|
||||||
|
Two surfaces, one process:
|
||||||
|
|
||||||
|
* **HTTP** — a tiny FastAPI app on its own port (default 8088). The
|
||||||
|
only useful route is ``GET /c/{slug}`` which looks up the slug in
|
||||||
|
the canary token table, persists a :class:`CanaryTrigger` row,
|
||||||
|
publishes ``canary.<token_id>.triggered`` on the bus, and returns
|
||||||
|
a 1×1 transparent GIF with status 200, whether or not the slug is
|
||||||
|
known (the handler does not inspect the ``Accept`` header).
|
||||||
|
* **DNS** — an authoritative UDP server (default port 5353; override
|
||||||
|
via ``DECNET_CANARY_DNS_PORT``) for ``*.<canary_zone>``. Same lookup + persist +
|
||||||
|
publish flow, plus a sinkhole A record so the attacker's resolver
|
||||||
|
doesn't loop on NXDOMAIN.
|
||||||
|
|
||||||
|
Both surfaces are **stealth** by policy
|
||||||
|
(:mod:`feedback_stealth`): no DECNET strings in headers / banners /
|
||||||
|
error pages. The HTTP app strips the default ``Server: uvicorn``
|
||||||
|
header in middleware; FastAPI's docs/openapi UI is disabled because
|
||||||
|
discovering them would tip off the attacker that this is a honeypot.
|
||||||
|
|
||||||
|
The worker is supervised by its own systemd unit
|
||||||
|
(``decnet-canary.service``); like every other DECNET worker, it
|
||||||
|
crashes loudly rather than masking failures.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import os
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from fastapi import FastAPI, Request, Response
|
||||||
|
|
||||||
|
from decnet.bus import topics
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.canary.dns_server import CanaryDNSProtocol, DNSQuery
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.web.db.factory import get_repository
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("canary.worker")
|
||||||
|
|
||||||
|
# 1×1 transparent GIF — public-domain canonical bytes. Returning the
|
||||||
|
# same image every time is fine: the body has no information the
|
||||||
|
# attacker shouldn't see, and image clients cache it.
|
||||||
|
_TRANSPARENT_GIF = bytes.fromhex(
|
||||||
|
"47494638396101000100800100000000ffffff21f90401000001002c00000000010001000002024401003b"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _http_base() -> str:
|
||||||
|
return os.environ.get("DECNET_CANARY_HTTP_BASE", "http://localhost:8088").rstrip("/")
|
||||||
|
|
||||||
|
|
||||||
|
def _dns_zone() -> str:
|
||||||
|
return os.environ.get("DECNET_CANARY_DNS_ZONE", "").strip(".").lower()
|
||||||
|
|
||||||
|
|
||||||
|
def _http_port() -> int:
|
||||||
|
return int(os.environ.get("DECNET_CANARY_HTTP_PORT", "8088"))
|
||||||
|
|
||||||
|
|
||||||
|
def _dns_port() -> int:
|
||||||
|
# Default 5353 (mDNS-ish, non-privileged) — operators pin :53 via
|
||||||
|
# NAT or a CAP_NET_BIND_SERVICE-enabled unit.
|
||||||
|
return int(os.environ.get("DECNET_CANARY_DNS_PORT", "5353"))
|
||||||
|
|
||||||
|
|
||||||
|
def _dns_bind() -> str:
|
||||||
|
return os.environ.get("DECNET_CANARY_DNS_BIND", "0.0.0.0") # nosec B104 — attacker-facing decoy listener, internet exposure is the design
|
||||||
|
|
||||||
|
|
||||||
|
def _http_bind() -> str:
|
||||||
|
return os.environ.get("DECNET_CANARY_HTTP_BIND", "0.0.0.0") # nosec B104 — same rationale
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------- HTTP surface --------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _build_app(repo: BaseRepository, bus: BaseBus) -> FastAPI:
    """Assemble the callback-receiver FastAPI application.

    The app has an empty title and no docs / redoc / openapi routes. It
    registers:

    * a middleware that forces ``Server: nginx`` on every response and
      removes ``x-process-time`` when present;
    * ``GET /c/{slug}``, which records the hit via :func:`_record_hit`
      and always answers with the transparent GIF;
    * ``GET /``, which answers with an empty 404.
    """
    app = FastAPI(
        title="",  # don't leak "DECNET" in OpenAPI
        docs_url=None,
        redoc_url=None,
        openapi_url=None,
    )

    @app.middleware("http")
    async def _stealth_headers(request: Request, call_next):
        response: Response = await call_next(request)
        headers = response.headers
        # Present a generic Server line instead of the framework's.
        headers["Server"] = "nginx"
        # Don't leak request id / process id headers.
        if "x-process-time" in headers:
            del headers["x-process-time"]
        return response

    @app.get("/c/{slug}")
    async def callback(slug: str, request: Request) -> Response:
        hit = {
            "slug": slug,
            "src_ip": _client_ip(request),
            "user_agent": request.headers.get("user-agent"),
            "request_path": str(request.url.path),
            "dns_qname": None,
            "raw_headers": dict(request.headers),
        }
        await _record_hit(repo, bus, **hit)
        # Same 200 + GIF for known and unknown slugs: the status code
        # and body must not reveal whether a slug exists.
        return Response(content=_TRANSPARENT_GIF, media_type="image/gif")

    @app.get("/")
    async def root() -> Response:
        # Bare root answers with an empty 404, posing as a static-file
        # host that only happens to serve /c/<slug>.
        return Response(status_code=404)

    # NOTE(review): paths not registered above get the framework's
    # default not-found response — confirm it does not fingerprint the
    # stack.
    return app
|
||||||
|
|
||||||
|
|
||||||
|
def _client_ip(request: Request) -> str:
|
||||||
|
# Honor X-Forwarded-For if the operator deployed behind a reverse
|
||||||
|
# proxy. Take the leftmost address in the chain; everything after
|
||||||
|
# is upstream-proxy noise.
|
||||||
|
fwd = request.headers.get("x-forwarded-for")
|
||||||
|
if fwd:
|
||||||
|
return fwd.split(",", 1)[0].strip()
|
||||||
|
if request.client:
|
||||||
|
return request.client.host
|
||||||
|
return "0.0.0.0" # nosec B104 — sentinel for "unknown remote"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------- shared persistence -------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _record_hit(
    repo: BaseRepository,
    bus: BaseBus,
    *,
    slug: str,
    src_ip: str,
    user_agent: Optional[str],
    request_path: Optional[str],
    dns_qname: Optional[str],
    raw_headers: Optional[dict],
) -> None:
    """Persist a trigger for *slug* and announce it on the bus.

    Unknown slugs return immediately: nothing is stored or published.
    For a known slug a trigger row is recorded with the current UTC
    time (``raw_headers`` defaults to ``{}``), then a
    ``CANARY_TRIGGERED`` event is published. Publish failures are
    logged and swallowed; repository errors propagate.
    """
    token = await repo.get_canary_token_by_slug(slug)
    if token is None:
        return

    token_uuid = token["uuid"]
    trigger_row = {
        "token_uuid": token_uuid,
        "occurred_at": datetime.now(timezone.utc),
        "src_ip": src_ip,
        "user_agent": user_agent,
        "request_path": request_path,
        "dns_qname": dns_qname,
        "raw_headers": raw_headers or {},
    }
    trigger_id = await repo.record_canary_trigger(trigger_row)

    try:
        # The event is built inside the ``try`` so a failure while
        # assembling it is also treated as best-effort.
        event = {
            "token_id": token_uuid,
            "trigger_id": trigger_id,
            "decky_name": token["decky_name"],
            "src_ip": src_ip,
            "user_agent": user_agent,
            "request_path": request_path,
            "dns_qname": dns_qname,
        }
        await bus.publish(
            topics.canary(token_uuid, topics.CANARY_TRIGGERED), event,
        )
    except Exception as e:  # noqa: BLE001 — best effort
        log.warning("canary.triggered publish failed slug=%s err=%s", slug, e)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------- DNS surface --------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def _start_dns_server(
    repo: BaseRepository, bus: BaseBus, *, loop: asyncio.AbstractEventLoop,
) -> Optional[asyncio.DatagramTransport]:
    """Start the UDP DNS listener, or return ``None`` when disabled.

    The listener is disabled when :func:`_dns_zone` is empty. Otherwise
    a datagram endpoint is bound to ``(_dns_bind(), _dns_port())``
    using :class:`CanaryDNSProtocol`, whose hook forwards every
    matching query to :func:`_record_hit`.

    :returns: The transport, so the caller can close it on shutdown.
    """
    zone = _dns_zone()
    if not zone:
        log.info("canary.dns disabled (DECNET_CANARY_DNS_ZONE unset)")
        return None

    async def _hook(slug: str, query: DNSQuery, src_ip: str) -> None:
        # DNS hits carry no HTTP metadata, only the queried name.
        await _record_hit(
            repo,
            bus,
            slug=slug,
            src_ip=src_ip,
            user_agent=None,
            request_path=None,
            dns_qname=query.qname,
            raw_headers=None,
        )

    def _make_protocol() -> CanaryDNSProtocol:
        return CanaryDNSProtocol(zone, _hook)

    transport, _proto = await loop.create_datagram_endpoint(
        _make_protocol,
        local_addr=(_dns_bind(), _dns_port()),
    )
    log.info("canary.dns listening zone=%s port=%d", zone, _dns_port())
    return transport  # type: ignore[return-value]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------- entry point --------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def run() -> None:
    """Worker entry point — kicked off by ``decnet canary``.

    Initialises the repository, connects the bus, starts the optional
    DNS listener and then serves the HTTP app with uvicorn until it
    exits. On the way out the DNS transport (if any) and the bus are
    closed.
    """
    import uvicorn

    repo = get_repository()
    await repo.initialize()
    bus = get_bus()
    await bus.connect()

    server = uvicorn.Server(
        uvicorn.Config(
            _build_app(repo, bus),
            host=_http_bind(),
            port=_http_port(),
            log_level="warning",
            access_log=False,  # stealth: no per-request lines
            server_header=False,  # we set Server: nginx in middleware
        )
    )
    dns_transport = await _start_dns_server(
        repo, bus, loop=asyncio.get_running_loop(),
    )
    try:
        await server.serve()
    finally:
        if dns_transport is not None:
            dns_transport.close()
        await bus.close()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Synchronous CLI entry point: drive :func:`run` to completion.

    Hands the ``run()`` coroutine to ``asyncio.run`` and returns when it
    finishes.
    """
    worker = run()
    asyncio.run(worker)
|
||||||
@@ -21,18 +21,27 @@ import typer
|
|||||||
from . import (
|
from . import (
|
||||||
agent,
|
agent,
|
||||||
api,
|
api,
|
||||||
|
bus,
|
||||||
|
canary,
|
||||||
db,
|
db,
|
||||||
deploy,
|
deploy,
|
||||||
forwarder,
|
forwarder,
|
||||||
|
geoip,
|
||||||
|
init,
|
||||||
inventory,
|
inventory,
|
||||||
lifecycle,
|
lifecycle,
|
||||||
listener,
|
listener,
|
||||||
|
orchestrator,
|
||||||
profiler,
|
profiler,
|
||||||
|
realism,
|
||||||
|
reconciler,
|
||||||
sniffer,
|
sniffer,
|
||||||
swarm,
|
swarm,
|
||||||
swarmctl,
|
swarmctl,
|
||||||
|
topology,
|
||||||
updater,
|
updater,
|
||||||
web,
|
web,
|
||||||
|
webhook,
|
||||||
workers,
|
workers,
|
||||||
)
|
)
|
||||||
from .gating import _gate_commands_by_mode
|
from .gating import _gate_commands_by_mode
|
||||||
@@ -49,7 +58,8 @@ for _mod in (
|
|||||||
api, swarmctl, agent, updater, listener, forwarder,
|
api, swarmctl, agent, updater, listener, forwarder,
|
||||||
swarm,
|
swarm,
|
||||||
deploy, lifecycle, workers, inventory,
|
deploy, lifecycle, workers, inventory,
|
||||||
web, profiler, sniffer, db,
|
web, profiler, orchestrator, realism, reconciler, sniffer, db,
|
||||||
|
topology, bus, geoip, init, webhook, canary,
|
||||||
):
|
):
|
||||||
_mod.register(app)
|
_mod.register(app)
|
||||||
|
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ def register(app: typer.Typer) -> None:
|
|||||||
with `decnet forwarder --daemon …`. Pass --no-forwarder to skip.
|
with `decnet forwarder --daemon …`. Pass --no-forwarder to skip.
|
||||||
"""
|
"""
|
||||||
from decnet.agent import server as _agent_server
|
from decnet.agent import server as _agent_server
|
||||||
from decnet.env import DECNET_SWARM_MASTER_HOST, DECNET_INGEST_LOG_FILE
|
from decnet.env import DECNET_SWARM_MASTER_HOST, DECNET_AGENT_LOG_FILE
|
||||||
from decnet.swarm import pki as _pki
|
from decnet.swarm import pki as _pki
|
||||||
|
|
||||||
resolved_dir = _pathlib.Path(agent_dir) if agent_dir else _pki.DEFAULT_AGENT_DIR
|
resolved_dir = _pathlib.Path(agent_dir) if agent_dir else _pki.DEFAULT_AGENT_DIR
|
||||||
@@ -44,7 +44,7 @@ def register(app: typer.Typer) -> None:
|
|||||||
"--master-host", DECNET_SWARM_MASTER_HOST,
|
"--master-host", DECNET_SWARM_MASTER_HOST,
|
||||||
"--master-port", str(int(os.environ.get("DECNET_SWARM_SYSLOG_PORT", "6514"))),
|
"--master-port", str(int(os.environ.get("DECNET_SWARM_SYSLOG_PORT", "6514"))),
|
||||||
"--agent-dir", str(resolved_dir),
|
"--agent-dir", str(resolved_dir),
|
||||||
"--log-file", str(DECNET_INGEST_LOG_FILE),
|
"--log-file", str(DECNET_AGENT_LOG_FILE),
|
||||||
"--daemon",
|
"--daemon",
|
||||||
]
|
]
|
||||||
try:
|
try:
|
||||||
|
|||||||
45
decnet/cli/bus.py
Normal file
45
decnet/cli/bus.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from . import utils as _utils
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Attach the ``bus`` command to the given Typer application."""

    @app.command(name="bus")
    def bus_cmd(
        socket_path: str = typer.Option(
            None,
            "--socket",
            "-s",
            help="UNIX socket path (defaults to DECNET_BUS_SOCKET env var, "
                 "then /run/decnet/bus.sock, then ~/.decnet/bus.sock).",
        ),
        group: str = typer.Option(
            "decnet",
            "--group",
            "-g",
            help="POSIX group to chown the socket to (falls back to process "
                 "group if the named group does not exist).",
        ),
        heartbeat: int = typer.Option(
            10,
            "--heartbeat",
            "-H",
            help="Seconds between system.bus.health heartbeat events.",
        ),
        daemon: bool = typer.Option(
            False,
            "--daemon",
            "-d",
            help="Detach to background as a daemon process.",
        ),
    ) -> None:
        """Run the DECNET ServiceBus worker (host-local UNIX-socket pub/sub)."""
        # Heavy imports are deferred to command invocation time.
        import asyncio

        from decnet.bus.factory import _default_socket_path
        from decnet.bus.worker import bus_worker

        # An explicit --socket wins; otherwise use the factory's default.
        sock = socket_path if socket_path else _default_socket_path()

        if daemon:
            log.info("bus daemonizing socket=%s", sock)
            _utils._daemonize()

        log.info("bus starting socket=%s group=%s heartbeat=%ds", sock, group, heartbeat)
        console.print(f"[bold cyan]Bus starting[/] (socket: {sock}, heartbeat: {heartbeat}s)")

        try:
            asyncio.run(bus_worker(sock, group=group, heartbeat_interval=heartbeat))
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop a foreground worker.
            console.print("\n[yellow]Bus stopped.[/]")
|
||||||
42
decnet/cli/canary.py
Normal file
42
decnet/cli/canary.py
Normal file
@@ -0,0 +1,42 @@
|
|||||||
|
"""``decnet canary`` — HTTP + DNS callback receiver for canary tokens.
|
||||||
|
|
||||||
|
Worker process. Mirrors the shape of :mod:`decnet.cli.webhook`: a
|
||||||
|
``@app.command(name="canary")`` Typer entry point that delegates to
|
||||||
|
:func:`decnet.canary.worker.run`.
|
||||||
|
|
||||||
|
Not master-only — any host that hosts deckies can run its own
|
||||||
|
canary worker (the bus events stay local; the webhook worker on
|
||||||
|
each host fans them out to SIEMs independently per the design
|
||||||
|
in ``development/let-s-move-to-the-enumerated-pike.md``).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from . import utils as _utils
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Attach the ``canary`` command to the given Typer application."""

    @app.command(name="canary")
    def canary_cmd(
        daemon: bool = typer.Option(
            False,
            "--daemon",
            "-d",
            help="Detach to background as a daemon process",
        ),
    ) -> None:
        """Run the canary HTTP + DNS callback receiver."""
        # Deferred imports: only paid when the command actually runs.
        import asyncio

        from decnet.canary.worker import run as run_worker

        if daemon:
            log.info("canary daemonizing")
            _utils._daemonize()

        log.info("canary starting")
        console.print("[bold cyan]Canary callback receiver starting[/]")

        try:
            asyncio.run(run_worker())
        except KeyboardInterrupt:
            # Ctrl-C is the normal way to stop a foreground worker.
            console.print("\n[yellow]Canary worker stopped.[/]")
|
||||||
@@ -8,19 +8,29 @@ from rich.table import Table
|
|||||||
from .utils import console, log
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
_DB_RESET_TABLES: tuple[str, ...] = (
|
def _decnet_tables() -> tuple[str, ...]:
|
||||||
# Order matters for DROP TABLE: child FKs first.
|
"""Every DECNET-managed table, ordered child-first for DROP safety.
|
||||||
# - attacker_behavior FK-references attackers.
|
|
||||||
# - decky_shards FK-references swarm_hosts.
|
Source is ``SQLModel.metadata.sorted_tables`` — the same registry that
|
||||||
"attacker_behavior",
|
drives ``create_all`` — so adding a new model automatically enrolls
|
||||||
"attackers",
|
its table in ``db-reset`` with no manual step. (Previous hardcoded
|
||||||
"logs",
|
list drifted multiple times; ``webhook_subscriptions`` /
|
||||||
"bounty",
|
``session_profile`` / ``smtp_targets`` all got missed.)
|
||||||
"state",
|
|
||||||
"users",
|
``sorted_tables`` returns parent-first (topological order that makes
|
||||||
"decky_shards",
|
``CREATE`` safe). For ``DROP`` we need the reverse: children first,
|
||||||
"swarm_hosts",
|
so FK constraints drop before their parents. ``SET FOREIGN_KEY_CHECKS
|
||||||
)
|
= 0`` below makes this order-insensitive for MySQL, but the reverse
|
||||||
|
order keeps the code honest for any backend that doesn't support
|
||||||
|
disabling the FK check.
|
||||||
|
"""
|
||||||
|
from sqlmodel import SQLModel
|
||||||
|
# Importing the models package registers every table on SQLModel.metadata.
|
||||||
|
import decnet.web.db.models # noqa: F401
|
||||||
|
|
||||||
|
return tuple(
|
||||||
|
t.name for t in reversed(SQLModel.metadata.sorted_tables)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def _db_reset_mysql_async(dsn: str, mode: str, confirm: bool) -> None:
|
async def _db_reset_mysql_async(dsn: str, mode: str, confirm: bool) -> None:
|
||||||
@@ -32,10 +42,11 @@ async def _db_reset_mysql_async(dsn: str, mode: str, confirm: bool) -> None:
|
|||||||
|
|
||||||
db_name = urlparse(dsn).path.lstrip("/") or "(default)"
|
db_name = urlparse(dsn).path.lstrip("/") or "(default)"
|
||||||
engine = create_async_engine(dsn)
|
engine = create_async_engine(dsn)
|
||||||
|
tables = _decnet_tables()
|
||||||
try:
|
try:
|
||||||
rows: dict[str, int] = {}
|
rows: dict[str, int] = {}
|
||||||
async with engine.connect() as conn:
|
async with engine.connect() as conn:
|
||||||
for tbl in _DB_RESET_TABLES:
|
for tbl in tables:
|
||||||
try:
|
try:
|
||||||
result = await conn.execute(text(f"SELECT COUNT(*) FROM `{tbl}`")) # nosec B608
|
result = await conn.execute(text(f"SELECT COUNT(*) FROM `{tbl}`")) # nosec B608
|
||||||
rows[tbl] = result.scalar() or 0
|
rows[tbl] = result.scalar() or 0
|
||||||
@@ -58,7 +69,7 @@ async def _db_reset_mysql_async(dsn: str, mode: str, confirm: bool) -> None:
|
|||||||
|
|
||||||
async with engine.begin() as conn:
|
async with engine.begin() as conn:
|
||||||
await conn.execute(text("SET FOREIGN_KEY_CHECKS = 0"))
|
await conn.execute(text("SET FOREIGN_KEY_CHECKS = 0"))
|
||||||
for tbl in _DB_RESET_TABLES:
|
for tbl in tables:
|
||||||
if rows.get(tbl, -1) < 0:
|
if rows.get(tbl, -1) < 0:
|
||||||
continue
|
continue
|
||||||
if mode == "truncate":
|
if mode == "truncate":
|
||||||
|
|||||||
@@ -29,9 +29,11 @@ MASTER_ONLY_COMMANDS: frozenset[str] = frozenset({
|
|||||||
"api", "swarmctl", "deploy", "redeploy", "teardown",
|
"api", "swarmctl", "deploy", "redeploy", "teardown",
|
||||||
"mutate", "listener", "profiler",
|
"mutate", "listener", "profiler",
|
||||||
"services", "distros", "correlate", "archetypes", "web",
|
"services", "distros", "correlate", "archetypes", "web",
|
||||||
"db-reset",
|
"db-reset", "init", "webhook", "clusterer", "campaign-clusterer",
|
||||||
})
|
})
|
||||||
MASTER_ONLY_GROUPS: frozenset[str] = frozenset({"swarm"})
|
MASTER_ONLY_GROUPS: frozenset[str] = frozenset(
|
||||||
|
{"swarm", "topology", "geoip", "realism"}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _agent_mode_active() -> bool:
|
def _agent_mode_active() -> bool:
|
||||||
|
|||||||
59
decnet/cli/geoip.py
Normal file
59
decnet/cli/geoip.py
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
"""GeoIP CLI — refresh and lookup subcommands (master-only).
|
||||||
|
|
||||||
|
Usage::
|
||||||
|
|
||||||
|
decnet geoip refresh # re-download RIR files and rebuild the index
|
||||||
|
decnet geoip lookup 8.8.8.8 # one-shot IP -> country dump
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from .gating import _require_master_mode
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
_group = typer.Typer(
|
||||||
|
name="geoip",
|
||||||
|
help="GeoIP provider management (master only).",
|
||||||
|
no_args_is_help=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("refresh")
def _refresh() -> None:
    """Force re-download of the GeoIP provider data and rebuild the index."""
    _require_master_mode("geoip refresh")

    # Deferred imports: only paid when the subcommand actually runs.
    from decnet.geoip import get_lookup
    from decnet.geoip.factory import get_provider

    provider = get_provider()
    log.info("geoip: forcing refresh via %s provider", provider.name)
    console.print(f"[bold cyan]Refreshing {provider.name} GeoIP data…[/]")

    try:
        lookup = get_lookup(force_refresh=True)
    except Exception as exc:  # noqa: BLE001
        # Any failure is reported on the console and becomes exit status 1.
        console.print(f"[red]refresh failed: {exc}[/]")
        raise typer.Exit(1) from exc

    range_count = len(lookup)
    console.print(
        f"[green]OK[/] {provider.name} index rebuilt ({range_count} ranges)."
    )
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("lookup")
def _lookup(
    ip: str = typer.Argument(..., help="IP address to resolve."),
) -> None:
    """Print the country code for an IP (or 'unknown')."""
    _require_master_mode("geoip lookup")

    # Deferred import: only paid when the subcommand actually runs.
    from decnet.geoip import enrich_ip

    country, origin = enrich_ip(ip)
    if country is not None:
        console.print(f"{ip} [green]cc={country}[/] source={origin}")
        return

    # No match is not an error: report it and exit with status 0.
    console.print(f"{ip} [yellow]unknown[/]")
    raise typer.Exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Mount the ``geoip`` sub-application on the given Typer application."""
    app.add_typer(_group, name="geoip")
|
||||||
843
decnet/cli/init.py
Normal file
843
decnet/cli/init.py
Normal file
@@ -0,0 +1,843 @@
|
|||||||
|
"""
|
||||||
|
`decnet init` — one-shot master-host bootstrap.
|
||||||
|
|
||||||
|
Idempotent: running it twice is a no-op on already-configured items.
|
||||||
|
Takes a freshly ``pip install``'d DECNET and turns it into a ready-to-
|
||||||
|
run master host: creates the ``decnet`` system user/group, installs
|
||||||
|
the systemd units + polkit rule + tmpfiles.d entry, seeds the
|
||||||
|
directory layout, drops a placeholder config, and starts the
|
||||||
|
``decnet.target`` grouping unit.
|
||||||
|
|
||||||
|
Requires root. Uses ``subprocess.run`` (never ``shell=True``) for every
|
||||||
|
privileged call so the full argv surface is auditable.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import grp
|
||||||
|
import hashlib
|
||||||
|
import os
|
||||||
|
import pwd
|
||||||
|
import shutil
|
||||||
|
import subprocess # nosec B404
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Callable, List, Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from jinja2 import Environment, FileSystemLoader, StrictUndefined
|
||||||
|
|
||||||
|
import decnet as _decnet_pkg
|
||||||
|
from .gating import _require_master_mode
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
_CONFIG_PLACEHOLDER = """\
|
||||||
|
# /etc/decnet/decnet.ini — DECNET host config.
|
||||||
|
#
|
||||||
|
# Every key is OPTIONAL. Absent keys fall through to env-var defaults
|
||||||
|
# defined in decnet/env.py. Real env vars always win over this file
|
||||||
|
# (precedence: env > INI > default), so systemd EnvironmentFile= and
|
||||||
|
# one-off `DECNET_FOO=bar decnet ...` invocations always take effect.
|
||||||
|
#
|
||||||
|
# Secrets (JWT, admin password, DB password) intentionally DO NOT
|
||||||
|
# live here. Put them in /opt/decnet/.env.local or the systemd
|
||||||
|
# EnvironmentFile= — never in a group-readable INI.
|
||||||
|
|
||||||
|
[decnet]
|
||||||
|
# mode = master # or "agent"
|
||||||
|
|
||||||
|
# [api]
|
||||||
|
# host = 127.0.0.1
|
||||||
|
# port = 8000
|
||||||
|
|
||||||
|
# [web]
|
||||||
|
# host = 127.0.0.1
|
||||||
|
# port = 8080
|
||||||
|
# admin-user = admin
|
||||||
|
# cors-origins = http://localhost:8080 # comma-separated
|
||||||
|
|
||||||
|
# [database]
|
||||||
|
# type = sqlite # or "mysql"
|
||||||
|
# url = mysql+asyncmy://user@host:3306/decnet # if set, wins over host/port/name/user
|
||||||
|
# host = localhost
|
||||||
|
# port = 3306
|
||||||
|
# name = decnet
|
||||||
|
# user = decnet
|
||||||
|
|
||||||
|
# [bus]
|
||||||
|
# enabled = true
|
||||||
|
# type = unix # or "fake"
|
||||||
|
# socket = /run/decnet/bus.sock
|
||||||
|
# group = decnet
|
||||||
|
|
||||||
|
# [swarm]
|
||||||
|
# master-host = 10.0.0.1
|
||||||
|
# syslog-port = 6514
|
||||||
|
# swarmctl-port = 8770
|
||||||
|
|
||||||
|
# [logging]
|
||||||
|
# system-log = /var/log/decnet/decnet.system.log
|
||||||
|
# ingest-log = /var/log/decnet/decnet.log
|
||||||
|
# agent-log = /var/log/decnet/agent.log
|
||||||
|
|
||||||
|
# [ingester]
|
||||||
|
# batch-size = 100
|
||||||
|
# batch-max-wait-ms = 250
|
||||||
|
|
||||||
|
# [tracing]
|
||||||
|
# enabled = false
|
||||||
|
# otel-endpoint = http://localhost:4317
|
||||||
|
|
||||||
|
# [agent]
|
||||||
|
# Managed by the enroll bundle — do NOT edit by hand on an agent host.
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _deploy_root() -> Path:
    """Return the ``deploy/`` directory that sits next to the ``decnet`` package.

    The directory is looked up as a sibling of the package directory and is
    accepted only if it contains ``decnet.target``.

    Raises:
        RuntimeError: when that marker file is missing; the message points
            the operator at an editable install from a git checkout.
    """
    package_dir = Path(_decnet_pkg.__file__).resolve().parent
    deploy_dir = package_dir.parent / "deploy"
    if (deploy_dir / "decnet.target").is_file():
        return deploy_dir
    raise RuntimeError(
        f"cannot locate deploy/ directory (looked at {deploy_dir}); "
        "are you on a wheel install that didn't bundle deploy/? "
        "use `pip install -e .` from a git checkout"
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _sha256(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The whole file is read into memory in one call.
    """
    return hashlib.sha256(path.read_bytes()).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
|
def _run(argv: List[str], *, dry_run: bool) -> None:
    """Execute *argv* as a subprocess, or just print it in dry-run mode.

    A non-zero exit raises ``subprocess.CalledProcessError`` (``check=True``).
    """
    if not dry_run:
        log.info("init: exec %s", argv)
        subprocess.run(argv, check=True)  # nosec B603
        return
    command_line = " ".join(argv)
    console.print(f" [dim]would run:[/] {command_line}")
|
||||||
|
|
||||||
|
|
||||||
|
def _step(label: str, action: Callable[[], str]) -> bool:
    """Invoke *action* and print one checklist line describing the outcome.

    *action* returns an outcome string. A value starting with ``"skip:"``
    is printed as ``[SKIP] <label> (<reason>)``; anything else is printed
    as ``[ OK ] <label>``. If *action* raises, ``[FAIL] <label>: <err>`` is
    printed and the exception propagates unchanged.

    Returns:
        ``True`` whenever *action* completed without raising.
    """
    try:
        outcome = action()
    except Exception as exc:  # noqa: BLE001
        console.print(f"[red][FAIL][/] {label}: {exc}")
        raise

    if not outcome.startswith("skip:"):
        console.print(f"[green][ OK ][/] {label}")
        return True

    # Everything after the "skip:" prefix is the human-readable reason.
    _, _, tail = outcome.partition("skip:")
    reason = tail.strip()
    console.print(f"[yellow][SKIP][/] {label} ({reason})")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_group(group: str, *, dry_run: bool) -> str:
    """Create the system group *group* unless it already exists.

    Returns a ``"skip: ..."`` outcome when the group is already present,
    otherwise ``"ok"`` after running (or dry-running) ``groupadd --system``.
    """
    try:
        grp.getgrnam(group)
    except KeyError:
        # Unknown group: create it.
        _run(["groupadd", "--system", group], dry_run=dry_run)
        return "ok"
    return f"skip: group {group} already exists"
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_user(user: str, group: str, install_dir: str, *, dry_run: bool) -> str:
    """Create the system account *user* unless it already exists.

    The account is created with *group* as its primary group, *install_dir*
    as its home directory and ``/usr/sbin/nologin`` as its shell.

    Returns a ``"skip: ..."`` outcome when the user is already present,
    otherwise ``"ok"`` after running (or dry-running) ``useradd``.
    """
    try:
        pwd.getpwnam(user)
    except KeyError:
        useradd_argv = [
            "useradd", "--system",
            "--gid", group,
            "--home-dir", install_dir,
            "--shell", "/usr/sbin/nologin",
            "--comment", "DECNET honeypot",
            user,
        ]
        _run(useradd_argv, dry_run=dry_run)
        return "ok"
    return f"skip: user {user} already exists"
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_dir(
    path: Path, *, mode: int, owner: str, group: str, dry_run: bool
) -> str:
    """Make sure directory *path* exists with the requested mode and owner.

    In dry-run mode nothing is touched; the intended action is printed.
    Otherwise the directory (and any parents) is created, then mode and
    ownership are applied on a best-effort basis.

    Returns a ``"skip: ..."`` outcome if *path* existed beforehand, else
    ``"ok"``.
    """
    already_there = path.exists()

    if dry_run:
        console.print(
            f" [dim]would ensure dir:[/] {path} (mode={oct(mode)}, "
            f"owner={owner}:{group})"
        )
        if already_there:
            return "skip: dry-run"
        return "ok"

    path.mkdir(parents=True, exist_ok=True)
    try:
        os.chmod(path, mode)
        os.chown(path, pwd.getpwnam(owner).pw_uid, grp.getgrnam(group).gr_gid)
    except (KeyError, PermissionError):
        # Owner/group may not exist yet, or we are not root (--prefix
        # tests). Creating the directory is the part that matters; the
        # permission bits are applied on a real root run.
        pass

    if already_there:
        return f"skip: {path} already present"
    return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def _ensure_config(path: Path, group: str, *, dry_run: bool) -> str:
    """Drop the placeholder config at *path* unless a file is already there.

    An existing file is never overwritten. On a fresh write the file is set
    to mode ``0o640`` and chowned to uid 0 and *group* on a best-effort
    basis.

    Returns a ``"skip: ..."`` outcome when *path* already exists, else
    ``"ok"`` (also in dry-run mode, where only the intent is printed).
    """
    if path.exists():
        return f"skip: {path} already present"
    if dry_run:
        console.print(f" [dim]would write:[/] {path}")
        return "ok"

    path.parent.mkdir(parents=True, exist_ok=True)
    # The placeholder text contains non-ASCII punctuation, so pin UTF-8:
    # with the locale default, a non-UTF-8 locale could raise
    # UnicodeEncodeError or write differently encoded bytes.
    path.write_text(_CONFIG_PLACEHOLDER, encoding="utf-8")
    try:
        os.chmod(path, 0o640)
        gid = grp.getgrnam(group).gr_gid
        os.chown(path, 0, gid)
    except (KeyError, PermissionError):
        # Best effort: the group may not exist yet, or we are not root.
        pass
    return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def _copy_if_changed(
    src: Path, dst: Path, *, mode: int, force: bool, dry_run: bool
) -> str:
    """Copy *src* to *dst* unless the destination already has the same bytes.

    The comparison uses SHA-256 digests and is bypassed when *force* is
    set. After a copy, *mode* and root ownership are applied on a
    best-effort basis.

    Returns a ``"skip: ..."`` outcome when nothing needed copying, else
    ``"ok"`` (also in dry-run mode, where only the intent is printed).
    """
    unchanged = dst.exists() and not force and _sha256(src) == _sha256(dst)
    if unchanged:
        return f"skip: {dst} up to date"

    if dry_run:
        console.print(f" [dim]would install:[/] {src} -> {dst} (mode={oct(mode)})")
        return "ok"

    dst.parent.mkdir(parents=True, exist_ok=True)
    shutil.copy2(src, dst)
    try:
        os.chmod(dst, mode)
        os.chown(dst, 0, 0)
    except PermissionError:
        # Not running as root; the copy itself has still happened.
        pass
    return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def _render_template(src: Path, context: dict[str, str]) -> str:
    """Render the Jinja2 template file *src* with *context* and return the text.

    The environment uses ``StrictUndefined``, so a variable missing from
    *context* raises instead of rendering as an empty string. A typo in a
    template therefore fails loudly rather than producing a broken unit.
    """
    jinja_env = Environment(
        loader=FileSystemLoader(str(src.parent)),
        undefined=StrictUndefined,
        keep_trailing_newline=True,
        autoescape=False,  # nosec B701 — rendering systemd INI, not HTML
    )
    return jinja_env.get_template(src.name).render(**context)
|
||||||
|
|
||||||
|
|
||||||
|
def _write_rendered_if_changed(
    src: Path, dst: Path, rendered: str, *, mode: int, force: bool, dry_run: bool
) -> str:
    """Write *rendered* to *dst* unless *dst* already holds exactly that content.

    The digest comparison is between the rendered output and the on-disk
    bytes, not between the template source and the on-disk bytes, so
    re-runs with the same render context are idempotent. *src* is only
    used in the dry-run message.

    Returns a ``"skip: ..."`` outcome when the file is already current,
    else ``"ok"`` (also in dry-run mode, where only the intent is printed).
    """
    payload = rendered.encode("utf-8")

    if not force and dst.exists():
        on_disk_digest = hashlib.sha256(dst.read_bytes()).hexdigest()
        wanted_digest = hashlib.sha256(payload).hexdigest()
        if on_disk_digest == wanted_digest:
            return f"skip: {dst} up to date"

    if dry_run:
        console.print(f" [dim]would render:[/] {src} -> {dst} (mode={oct(mode)})")
        return "ok"

    dst.parent.mkdir(parents=True, exist_ok=True)
    dst.write_bytes(payload)
    try:
        os.chmod(dst, mode)
        os.chown(dst, 0, 0)
    except PermissionError:
        # Not running as root; the write itself has still happened.
        pass
    return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def _resolve_venv_dir(install_dir: str, explicit: str | None) -> str:
    """Choose the virtualenv directory the systemd units should run from.

    Search order:
      1. *explicit* (the ``--venv-dir`` flag); must start with ``/`` and is
         the only candidate when given.
      2. ``VIRTUAL_ENV`` from the environment, but only when it is located
         under *install_dir*, so a venv outside the install tree is never
         baked into a system service.
      3. ``{install_dir}/venv``.
      4. ``{install_dir}/.venv``, ``.311``, ``.312``, ``.313``, in that order.

    The first candidate that contains ``bin/decnet`` as a regular file wins.

    Raises:
        RuntimeError: if *explicit* is not absolute, or if no candidate
            contains ``bin/decnet``; the message lists every path searched.
    """
    base = Path(install_dir)

    if explicit:
        if not explicit.startswith("/"):
            raise RuntimeError(
                f"--venv-dir must be an absolute path, got {explicit!r}"
            )
        search_order: list[Path] = [Path(explicit)]
    else:
        search_order = []
        active = os.environ.get("VIRTUAL_ENV")
        if active:
            active_path = Path(active)
            try:
                active_path.relative_to(base)
            except ValueError:
                # VIRTUAL_ENV is outside install_dir — don't bake a
                # user-home venv into a root-owned systemd unit.
                pass
            else:
                search_order.append(active_path)
        search_order.append(base / "venv")
        search_order.extend(
            base / dirname for dirname in (".venv", ".311", ".312", ".313")
        )

    hit = next(
        (cand for cand in search_order if (cand / "bin" / "decnet").is_file()),
        None,
    )
    if hit is not None:
        return str(hit)

    searched = ", ".join(str(cand) for cand in search_order)
    raise RuntimeError(
        "Could not find a DECNET venv. Create one first (e.g. "
        f"`python -m venv {base}/venv && "
        f"{base}/venv/bin/pip install -e {base}[dev]`) "
        "or pass --venv-dir. Searched: " + searched
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _install_units(
    deploy: Path,
    systemd_dir: Path,
    *,
    install_dir: str,
    venv_dir: str,
    user: str,
    group: str,
    force: bool,
    dry_run: bool,
) -> str:
    """Install the systemd unit files from *deploy* into *systemd_dir*.

    Every ``decnet-*.service.j2`` template is rendered with the install
    dir, venv dir, user and group, then written under its name minus the
    ``.j2`` suffix. ``decnet.target`` is copied verbatim.

    Returns a ``"skip: ..."`` outcome when no file needed writing,
    otherwise ``"ok (<written>/<total> installed)"``.
    """
    render_context = {
        "install_dir": install_dir,
        "venv_dir": venv_dir,
        "user": user,
        "group": group,
    }
    unit_templates = sorted(deploy.glob("decnet-*.service.j2"))
    verbatim_files = [deploy / "decnet.target"]

    outcomes: list[str] = []
    for template in unit_templates:
        text = _render_template(template, render_context)
        # decnet-api.service.j2 → decnet-api.service
        unit_name = template.name[: -len(".j2")]
        outcomes.append(
            _write_rendered_if_changed(
                template, systemd_dir / unit_name, text,
                mode=0o644, force=force, dry_run=dry_run,
            )
        )
    for plain in verbatim_files:
        outcomes.append(
            _copy_if_changed(
                plain, systemd_dir / plain.name,
                mode=0o644, force=force, dry_run=dry_run,
            )
        )

    total = len(unit_templates) + len(verbatim_files)
    touched = sum(1 for outcome in outcomes if not outcome.startswith("skip:"))
    if touched == 0:
        return f"skip: {total} unit files up to date"
    return f"ok ({touched}/{total} installed)"
|
||||||
|
|
||||||
|
|
||||||
|
def _install_polkit(
    deploy: Path, rules_dir: Path, *, group: str, force: bool, dry_run: bool
) -> str:
    """Render the polkit rule template for *group* and install it in *rules_dir*.

    The template is ``polkit/50-decnet-workers.rules.j2`` under *deploy*;
    the installed file drops the ``.j2`` suffix. The rule is rendered with
    the same POSIX group that was passed via ``--group``.

    Raises:
        RuntimeError: if the template file is missing.

    Returns the outcome string of the underlying write.
    """
    template = deploy / "polkit" / "50-decnet-workers.rules.j2"
    if not template.is_file():
        raise RuntimeError(f"missing polkit rule template at {template}")

    # 50-decnet-workers.rules.j2 → 50-decnet-workers.rules
    destination = rules_dir / template.name[: -len(".j2")]
    rule_text = _render_template(template, {"group": group})
    return _write_rendered_if_changed(
        template, destination, rule_text,
        mode=0o644, force=force, dry_run=dry_run,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _run_allow_fail(argv: List[str], *, dry_run: bool) -> str:
    """Run *argv* like ``_run`` does, but treat a non-zero exit as a skip.

    Used where failure is acceptable, e.g. stopping or disabling a unit
    that is already absent during deinit.

    Returns ``"ok"`` on exit status 0 (and in dry-run mode), otherwise a
    ``"skip: rc=<code> ..."`` outcome.
    """
    if dry_run:
        command_line = " ".join(argv)
        console.print(f" [dim]would run (allow fail):[/] {command_line}")
        return "ok"

    log.info("init: exec (allow fail) %s", argv)
    completed = subprocess.run(argv, check=False)  # nosec B603
    if completed.returncode == 0:
        return "ok"
    return f"skip: rc={completed.returncode} (already absent)"
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_file(path: Path, *, dry_run: bool) -> str:
    """Delete the file or symlink at *path* if there is one.

    The symlink check means a dangling symlink still counts as present and
    gets removed.

    Returns a ``"skip: ..."`` outcome when nothing is there, else ``"ok"``
    (also in dry-run mode, where only the intent is printed).
    """
    present = path.exists() or path.is_symlink()
    if not present:
        return f"skip: {path} already absent"
    if not dry_run:
        path.unlink()
        return "ok"
    console.print(f" [dim]would remove:[/] {path}")
    return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def _uninstall_units(systemd_dir: Path, *, dry_run: bool) -> str:
    """Delete every ``decnet-*.service`` file and ``decnet.target`` in *systemd_dir*.

    In dry-run mode the files are only listed; they still count towards
    the reported total.

    Returns a ``"skip: ..."`` outcome when no matching file exists,
    otherwise ``"ok (<count> removed)"``.
    """
    doomed = sorted(systemd_dir.glob("decnet-*.service"))
    target_unit = systemd_dir / "decnet.target"
    if target_unit.exists():
        doomed.append(target_unit)

    if not doomed:
        return "skip: no decnet unit files present"

    for unit_path in doomed:
        if dry_run:
            console.print(f" [dim]would remove:[/] {unit_path}")
        else:
            unit_path.unlink()
    return f"ok ({len(doomed)} removed)"
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_user(user: str, *, dry_run: bool) -> str:
    """Delete the account *user* with ``userdel`` if it exists.

    Returns a ``"skip: ..."`` outcome when the account is unknown;
    otherwise the outcome of the tolerant ``userdel`` invocation.
    """
    try:
        pwd.getpwnam(user)
    except KeyError:
        return f"skip: user {user} already absent"
    else:
        # userdel exits non-zero if the user still owns running processes;
        # that is left to the operator, hence the allow-fail runner.
        return _run_allow_fail(["userdel", user], dry_run=dry_run)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_group(group: str, *, dry_run: bool) -> str:
    """Delete the group *group* with ``groupdel`` if it exists.

    Returns a ``"skip: ..."`` outcome when the group is unknown; otherwise
    the outcome of the tolerant ``groupdel`` invocation.
    """
    try:
        grp.getgrnam(group)
    except KeyError:
        return f"skip: group {group} already absent"
    else:
        return _run_allow_fail(["groupdel", group], dry_run=dry_run)
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_dir_if_present(
    path: Path, *, dry_run: bool, recursive: bool = False
) -> str:
    """Remove the directory *path* if it exists and report the outcome.

    Args:
        path: Directory to remove.
        dry_run: Only print the intended action; change nothing.
        recursive: Delete the whole tree (``rm -rf`` style) instead of
            requiring the directory to be empty (``rmdir`` style).

    Returns:
        ``"ok"`` on success (or in dry-run mode), otherwise a ``"skip: ..."``
        string explaining why nothing, or not everything, was removed.
    """
    if not path.exists():
        return f"skip: {path} already absent"
    if dry_run:
        verb = "would rm -rf" if recursive else "would rmdir"
        console.print(f" [dim]{verb}:[/] {path}")
        return "ok"
    if recursive:
        # Best-effort delete: rmtree swallows every error here, so verify
        # the result afterwards instead of reporting "ok" for a tree that
        # is still on disk (permission errors, path is a symlink, ...).
        shutil.rmtree(path, ignore_errors=True)
        if path.exists() or path.is_symlink():
            return f"skip: {path} could not be fully removed"
        return "ok"
    try:
        path.rmdir()
    except OSError as exc:
        return f"skip: {path} not empty ({exc.strerror})"
    return "ok"
|
||||||
|
|
||||||
|
|
||||||
|
def _install_tmpfiles(
    deploy: Path, tmpfiles_dir: Path, *, force: bool, dry_run: bool
) -> str:
    """Install ``<deploy>/tmpfiles.d/decnet.conf`` into *tmpfiles_dir* and apply it.

    Returns the status string produced by ``_copy_if_changed``.

    Raises:
        RuntimeError: the source entry is missing from the deploy tree.
    """
    source = deploy / "tmpfiles.d" / "decnet.conf"
    if not source.is_file():
        raise RuntimeError(f"missing tmpfiles.d entry at {source}")

    installed = tmpfiles_dir / source.name
    outcome = _copy_if_changed(
        source, installed,
        mode=0o644, force=force, dry_run=dry_run,
    )
    # Apply immediately so /run/decnet exists before daemon-reload.
    _run(["systemd-tmpfiles", "--create", str(installed)], dry_run=dry_run)
    return outcome
|
||||||
|
|
||||||
|
|
||||||
|
def _install_logrotate(
    deploy: Path, logrotate_dir: Path, *, force: bool, dry_run: bool
) -> str:
    """Install the bundled logrotate config as ``<logrotate_dir>/decnet``.

    The ingester / forwarder keep their log files open from Python, so the
    shipped config relies on ``copytruncate`` instead of rename+create.
    Without a rotation rule /var/log/decnet/ grows without bound, and one
    noisy day of attacker traffic can fill the disk of a small VPS. This is
    best-effort: a host without logrotate (rare on systemd distros) still
    boots fine, the operator just has to wire up rotation themselves.

    Returns the status string produced by ``_copy_if_changed``.

    Raises:
        RuntimeError: the config is missing from the deploy tree.
    """
    source = deploy / "logrotate.d" / "decnet"
    if not source.is_file():
        raise RuntimeError(f"missing logrotate config at {source}")

    destination = logrotate_dir / source.name
    return _copy_if_changed(
        source, destination,
        mode=0o644, force=force, dry_run=dry_run,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Register the ``init`` command (bootstrap and ``--deinit`` teardown) on *app*."""

    @app.command(name="init")
    def init_cmd(
        dry_run: bool = typer.Option(
            False, "--dry-run",
            help="Print every action; make no changes.",
        ),
        no_start: bool = typer.Option(
            False, "--no-start",
            help="Install everything but don't `systemctl enable --now decnet.target`.",
        ),
        force: bool = typer.Option(
            False, "--force",
            help="Overwrite unit / polkit / tmpfiles entries even if identical.",
        ),
        deinit: bool = typer.Option(
            False, "--deinit",
            help="Undo a previous init: stop + disable decnet.target, remove "
                 "unit files, polkit rule, tmpfiles.d entry, /etc/decnet. "
                 "Preserves /var/lib/decnet, /var/log/decnet, and the "
                 "service user/group — pass --purge to remove those too.",
        ),
        purge: bool = typer.Option(
            False, "--purge",
            help="With --deinit, also wipe /var/lib/decnet, "
                 "/var/log/decnet, AND the service user/group. "
                 "Destructive — operator data is gone, and if --user "
                 "points at your own login account, that account goes "
                 "with it. Only use when the user/group was created by "
                 "`decnet init` in the first place.",
        ),
        user: str = typer.Option(
            "decnet", "--user",
            help="System user to own DECNET processes.",
        ),
        group: str = typer.Option(
            "decnet", "--group",
            help="Primary group of the DECNET user.",
        ),
        install_dir: str = typer.Option(
            "/opt/decnet", "--install-dir",
            help="Absolute path where DECNET is installed. Default "
                 "/opt/decnet; distros that reserve /opt can point this "
                 "at /srv/decnet, /usr/local/decnet, etc. Gets rendered "
                 "into every systemd unit via Jinja2 and used as the "
                 "decnet user's home directory.",
        ),
        venv_dir: Optional[str] = typer.Option(
            None, "--venv-dir",
            help="Absolute path to the Python venv systemd should "
                 "ExecStart from. If omitted, auto-detected in order: "
                 "$VIRTUAL_ENV (if under --install-dir), "
                 "{install-dir}/venv, then {install-dir}/{.venv,.311,"
                 ".312,.313}. Init aborts if none exists.",
        ),
        prefix: str = typer.Option(
            "", "--prefix", hidden=True,
            help="Filesystem prefix for tests (e.g. tmp_path). Empty = real root.",
        ),
    ) -> None:
        """One-shot bootstrap of a DECNET master host.

        Creates the `decnet` user/group, installs systemd units,
        polkit rules, tmpfiles.d entries, seeds directories and
        drops a placeholder config, then starts decnet.target.
        """
        _require_master_mode("init")

        if purge and not deinit:
            console.print("[red]--purge only applies with --deinit[/]")
            raise typer.Exit(1)

        # Name used in every pre-flight error so deinit failures are not
        # reported as "decnet init: ...".
        verb = "deinit" if deinit else "init"

        # Root check — skip when --prefix is set (tests don't run as root).
        if not prefix and os.geteuid() != 0:
            console.print(f"[red]decnet {verb}: must run as root (use sudo)[/]")
            raise typer.Exit(1)

        if not install_dir.startswith("/"):
            console.print(
                f"[red]decnet {verb}: --install-dir must be absolute, got {install_dir!r}[/]"
            )
            raise typer.Exit(1)
        # Strip leading slash so pfx-joining works under --prefix test mode
        # (Path(pfx) / "/opt/decnet" == Path("/opt/decnet"), dropping pfx).
        _install_rel = install_dir.lstrip("/")

        if deinit:
            required_tools = ("systemctl",)
            # userdel / groupdel are only invoked by the --purge steps, so a
            # plain --deinit must not fail on hosts that lack them.
            if purge:
                required_tools += ("userdel", "groupdel")
        else:
            required_tools = (
                "systemctl", "useradd", "groupadd", "systemd-tmpfiles",
            )
        if not dry_run:
            for tool in required_tools:
                if shutil.which(tool) is None:
                    console.print(f"[red]decnet {verb}: {tool!r} is required on PATH[/]")
                    raise typer.Exit(1)

        pfx = Path(prefix) if prefix else Path("/")
        systemd_dir = pfx / "etc/systemd/system"
        polkit_dir = pfx / "etc/polkit-1/rules.d"
        tmpfiles_dir = pfx / "etc/tmpfiles.d"
        logrotate_dir = pfx / "etc/logrotate.d"
        etc_decnet = pfx / "etc/decnet"

        def _run_ok(argv: list[str]) -> str:
            # Adapter for _step: execute argv through _run, discard its
            # return value and report "ok". Errors raised by _run propagate.
            _run(argv, dry_run=dry_run)
            return "ok"

        if deinit:
            console.print(
                f"[bold cyan]DECNET deinit[/] "
                f"(dry_run={dry_run}, purge={purge})"
            )
            _step(
                "systemctl stop + disable decnet.target",
                lambda: _run_allow_fail(
                    ["systemctl", "disable", "--now", "decnet.target"],
                    dry_run=dry_run,
                ),
            )
            _step(
                "remove systemd unit files",
                lambda: _uninstall_units(systemd_dir, dry_run=dry_run),
            )
            _step(
                "remove polkit rule",
                lambda: _remove_file(
                    polkit_dir / "50-decnet-workers.rules",
                    dry_run=dry_run,
                ),
            )
            _step(
                "remove tmpfiles.d entry",
                lambda: _remove_file(
                    tmpfiles_dir / "decnet.conf",
                    dry_run=dry_run,
                ),
            )
            _step(
                "remove logrotate config",
                lambda: _remove_file(
                    logrotate_dir / "decnet",
                    dry_run=dry_run,
                ),
            )
            _step(
                "systemctl daemon-reload",
                lambda: _run_ok(["systemctl", "daemon-reload"]),
            )
            _step(
                f"remove {etc_decnet / 'decnet.ini'}",
                lambda: _remove_file(etc_decnet / "decnet.ini", dry_run=dry_run),
            )
            # Legacy name from pre-domain-sections placeholder era.
            # Harmless if absent (the _remove_file step logs skip).
            _step(
                f"remove legacy {etc_decnet / 'config.ini'}",
                lambda: _remove_file(etc_decnet / "config.ini", dry_run=dry_run),
            )
            # The three directories below are removed non-recursively, so
            # each one is skipped if anything is still inside it.
            _step(
                f"remove {etc_decnet}",
                lambda: _remove_dir_if_present(etc_decnet, dry_run=dry_run),
            )
            _step(
                f"remove {pfx / 'run/decnet'}",
                lambda: _remove_dir_if_present(
                    pfx / "run/decnet", dry_run=dry_run,
                ),
            )
            _step(
                f"remove {pfx / _install_rel}",
                lambda: _remove_dir_if_present(
                    pfx / _install_rel, dry_run=dry_run,
                ),
            )
            if purge:
                _step(
                    f"purge {pfx / 'var/lib/decnet'}",
                    lambda: _remove_dir_if_present(
                        pfx / "var/lib/decnet",
                        dry_run=dry_run, recursive=True,
                    ),
                )
                _step(
                    f"purge {pfx / 'var/log/decnet'}",
                    lambda: _remove_dir_if_present(
                        pfx / "var/log/decnet",
                        dry_run=dry_run, recursive=True,
                    ),
                )
            else:
                console.print(
                    f"[dim]preserved {pfx / 'var/lib/decnet'} and "
                    f"{pfx / 'var/log/decnet'} (operator data); "
                    "re-run with --purge to remove.[/]"
                )
            # User / group removal is also gated on --purge. In dev the
            # operator may have passed their own login user via
            # `--user $USER` to avoid ownership churn; an unconditional
            # `userdel anti` during deinit would nuke their account.
            if purge:
                _step(
                    f"remove user {user!r}",
                    lambda: _remove_user(user, dry_run=dry_run),
                )
                _step(
                    f"remove group {group!r}",
                    lambda: _remove_group(group, dry_run=dry_run),
                )
            else:
                console.print(
                    f"[dim]preserved user {user!r} and group {group!r}; "
                    "re-run with --purge to remove (only do this if "
                    "they were created by `decnet init`).[/]"
                )
            console.print("[bold green]DECNET deinit complete.[/]")
            return

        try:
            deploy = _deploy_root()
        except RuntimeError as exc:
            console.print(f"[red]decnet init: {exc}[/]")
            raise typer.Exit(1) from exc

        # Resolve venv BEFORE any file writes — fails loud if the
        # operator hasn't created one yet, instead of shipping broken
        # systemd units that journalctl spams forever. Skipped under
        # --prefix (test mode) because the test harness doesn't build a
        # real venv and the rendered string is asserted on directly.
        if prefix:
            resolved_venv = venv_dir or f"{install_dir}/venv"
        else:
            try:
                resolved_venv = _resolve_venv_dir(install_dir, venv_dir)
            except RuntimeError as exc:
                console.print(f"[red]decnet init: {exc}[/]")
                raise typer.Exit(1) from exc
            console.print(f"[dim]using venv: {resolved_venv}[/]")

        # (path, mode, owner, group) for every directory init guarantees.
        dirs = [
            (pfx / _install_rel, 0o755, user, group),
            (pfx / "var/lib/decnet", 0o750, user, group),
            (pfx / "var/lib/decnet/geoip", 0o755, user, group),
            (pfx / "var/log/decnet", 0o750, user, group),
            (etc_decnet, 0o755, "root", group),
            (pfx / "run/decnet", 0o755, "root", group),
        ]

        console.print(
            f"[bold cyan]DECNET init[/] "
            f"(dry_run={dry_run}, no_start={no_start}, force={force})"
        )

        _step(
            f"ensure group {group!r}",
            lambda: _ensure_group(group, dry_run=dry_run),
        )
        _step(
            f"ensure user {user!r}",
            lambda: _ensure_user(user, group, install_dir, dry_run=dry_run),
        )
        for path, mode, d_owner, d_group in dirs:
            # Defaults bind the loop variables at definition time, which
            # sidesteps the late-binding closure pitfall.
            _step(
                f"ensure dir {path}",
                lambda p=path, m=mode, o=d_owner, g=d_group:
                    _ensure_dir(p, mode=m, owner=o, group=g, dry_run=dry_run),
            )
        _step(
            f"write {etc_decnet / 'decnet.ini'}",
            lambda: _ensure_config(etc_decnet / "decnet.ini", group, dry_run=dry_run),
        )
        _step(
            "install systemd units",
            lambda: _install_units(
                deploy, systemd_dir,
                install_dir=install_dir, venv_dir=resolved_venv,
                user=user, group=group,
                force=force, dry_run=dry_run,
            ),
        )
        _step(
            "install polkit rule",
            lambda: _install_polkit(
                deploy, polkit_dir, group=group,
                force=force, dry_run=dry_run,
            ),
        )
        _step(
            "install tmpfiles.d entry",
            lambda: _install_tmpfiles(
                deploy, tmpfiles_dir, force=force, dry_run=dry_run,
            ),
        )
        _step(
            "install logrotate config",
            lambda: _install_logrotate(
                deploy, logrotate_dir, force=force, dry_run=dry_run,
            ),
        )
        _step(
            "systemctl daemon-reload",
            lambda: _run_ok(["systemctl", "daemon-reload"]),
        )

        if no_start:
            console.print("[yellow]--no-start: skipping decnet.target start[/]")
            return

        try:
            _step(
                "systemctl enable --now decnet.target",
                lambda: _run_ok(["systemctl", "enable", "--now", "decnet.target"]),
            )
        except subprocess.CalledProcessError as exc:
            console.print(
                f"[red]decnet.target failed to start (rc={exc.returncode}); "
                "inspect `systemctl status decnet.target` and individual "
                "`decnet-*.service` units.[/]"
            )
            raise typer.Exit(1) from exc

        console.print("[bold green]DECNET init complete.[/] "
                      "Check `decnet status` or the Workers panel.")
        sys.stdout.flush()
|
||||||
@@ -55,15 +55,65 @@ def register(app: typer.Typer) -> None:
|
|||||||
|
|
||||||
@app.command()
|
@app.command()
|
||||||
def status() -> None:
|
def status() -> None:
|
||||||
"""Show running deckies and their status."""
|
"""Show running deckies and the state of every ``decnet-*`` unit.
|
||||||
|
|
||||||
|
Prefers systemd (``systemctl list-units 'decnet-*.service'``) so
|
||||||
|
agents, masters and mixed hosts all get one consistent view of
|
||||||
|
what's installed, loaded, and active. Falls back to the psutil
|
||||||
|
cmdline registry on boxes without systemd (dev laptops, CI
|
||||||
|
containers, non-systemd init) so `decnet status` is still useful
|
||||||
|
there.
|
||||||
|
"""
|
||||||
log.info("status command invoked")
|
log.info("status command invoked")
|
||||||
from decnet.engine import status as _status
|
from decnet.engine import status as _status
|
||||||
_status()
|
_status()
|
||||||
|
|
||||||
|
units = _utils._systemd_units()
|
||||||
|
if units is not None:
|
||||||
|
_render_systemd_units(units)
|
||||||
|
else:
|
||||||
|
_render_psutil_fallback()
|
||||||
|
|
||||||
|
def _render_systemd_units(units: list[dict]) -> None:
    """Print a table with one row per systemd unit in *units*.

    Each unit dict is read through the keys ``unit``, ``load``, ``active``,
    ``sub`` and ``description``; missing keys render as empty cells. An
    empty list prints a hint to run ``decnet init`` instead of a table.
    """
    if not units:
        console.print(
            "[yellow]No decnet-* systemd units loaded. "
            "Run `sudo decnet init` to install them.[/]"
        )
        return

    # Colour per "active" state; anything unlisted is shown in yellow.
    colour_for = {"active": "green", "failed": "red"}

    def _active_style(active: str) -> str:
        return f"[{colour_for.get(active, 'yellow')}]{active}[/]"

    svc_table = Table(title="DECNET Services (systemd)", show_lines=True)
    svc_table.add_column("Unit", style="bold cyan")
    for heading in ("Load", "Active", "Sub"):
        svc_table.add_column(heading)
    svc_table.add_column("Description", style="dim")

    ordered = sorted(units, key=lambda entry: entry.get("unit", ""))
    for entry in ordered:
        svc_table.add_row(
            entry.get("unit", ""),
            entry.get("load", ""),
            _active_style(entry.get("active", "")),
            entry.get("sub", ""),
            entry.get("description", ""),
        )
    console.print(svc_table)
|
||||||
|
|
||||||
|
def _render_psutil_fallback() -> None:
|
||||||
registry = _utils._service_registry(str(DECNET_INGEST_LOG_FILE))
|
registry = _utils._service_registry(str(DECNET_INGEST_LOG_FILE))
|
||||||
if _agent_mode_active():
|
if _agent_mode_active():
|
||||||
registry = [r for r in registry if r[0] not in {"Mutator", "Profiler", "API"}]
|
registry = [r for r in registry if r[0] not in {"Mutator", "Profiler", "API"}]
|
||||||
svc_table = Table(title="DECNET Services", show_lines=True)
|
svc_table = Table(
|
||||||
|
title="DECNET Services (psutil fallback — systemd unavailable)",
|
||||||
|
show_lines=True,
|
||||||
|
)
|
||||||
svc_table.add_column("Service", style="bold cyan")
|
svc_table.add_column("Service", style="bold cyan")
|
||||||
svc_table.add_column("Status")
|
svc_table.add_column("Status")
|
||||||
svc_table.add_column("PID", style="dim")
|
svc_table.add_column("PID", style="dim")
|
||||||
|
|||||||
55
decnet/cli/orchestrator.py
Normal file
55
decnet/cli/orchestrator.py
Normal file
@@ -0,0 +1,55 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from . import utils as _utils
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Register the ``orchestrate`` command on *app*."""

    @app.command(name="orchestrate")
    def orchestrate_cmd(
        interval: int = typer.Option(
            60, "--interval", "-i",
            help="Seconds between synthetic activity ticks",
        ),
        daemon: bool = typer.Option(
            False, "--daemon", "-d",
            help="Detach to background as a daemon process",
        ),
        llm: Optional[bool] = typer.Option(
            None, "--llm/--no-llm",
            help=(
                "Enable / disable LLM enrichment of user-class file "
                "bodies. Default reads $DECNET_REALISM_LLM (any "
                "non-empty value enables; 'off' / unset disables)."
            ),
        ),
    ) -> None:
        """Inject synthetic life (inter-decky traffic + file ops + email) into the fleet."""
        # Imported inside the command rather than at module import time.
        import asyncio
        from decnet.orchestrator import orchestrator_worker
        from decnet.web.dependencies import repo

        if daemon:
            log.info("orchestrator daemonizing interval=%d", interval)
            _utils._daemonize()

        # Tri-state flag: None means neither --llm nor --no-llm was given.
        if llm is None:
            llm_label = "default"
        elif llm:
            llm_label = "on"
        else:
            llm_label = "off"
        log.info("orchestrator starting interval=%d llm=%s", interval, llm_label)
        console.print(
            f"[bold cyan]Orchestrator starting[/] (interval: {interval}s)"
        )

        async def _serve() -> None:
            await repo.initialize()
            await orchestrator_worker(repo, interval=interval, llm_enabled=llm)

        try:
            asyncio.run(_serve())
        except KeyboardInterrupt:
            console.print("\n[yellow]Orchestrator stopped.[/]")
|
||||||
111
decnet/cli/realism.py
Normal file
111
decnet/cli/realism.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
"""``decnet realism ...`` — content-engine maintenance commands.
|
||||||
|
|
||||||
|
After stage 5 of the realism migration, this is the only remaining
|
||||||
|
CLI surface from the realism library / former emailgen. ``decnet
|
||||||
|
realism run`` does not exist (the orchestrator runs the unified
|
||||||
|
worker via ``decnet orchestrate``); the only sub-command is
|
||||||
|
``import-personas``, which validates + installs the host-wide global
|
||||||
|
persona pool consumed by fleet (MACVLAN/IPVLAN) and SWARM-shard
|
||||||
|
deckies.
|
||||||
|
|
||||||
|
Topology personas live on ``Topology.email_personas`` and are
|
||||||
|
managed via the dashboard or the topology API; this command does
|
||||||
|
not touch them.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from .gating import _require_master_mode
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Mount the ``realism`` sub-application and its ``import-personas`` command on *app*."""
    realism_app = typer.Typer(
        name="realism",
        help=(
            "Maintain the realism content engine (persona pool import, "
            "future content-class tuning)."
        ),
    )
    app.add_typer(realism_app, name="realism")

    @realism_app.command("import-personas")
    def realism_import_personas(
        path: Path = typer.Argument(
            ..., exists=True, file_okay=True, dir_okay=False, readable=True,
            help="JSON file containing a list of EmailPersona objects",
        ),
        output: Optional[Path] = typer.Option(
            None, "--output", "-o",
            help=(
                "Override the destination path. Defaults to the canonical "
                "global pool (DECNET_REALISM_PERSONAS, /etc/decnet/"
                "email_personas.json, or ~/.decnet/email_personas.json)."
            ),
        ),
    ) -> None:
        """Validate + install a personas JSON file as the global pool.

        Use this when deploying with IMAP/POP3 services on fleet
        (MACVLAN/IPVLAN) or SWARM-shard mail deckies — those have no
        parent topology row, so they read this host-wide list.
        MazeNET topology mail deckies use ``Topology.email_personas``
        instead and this command does not touch them.
        """
        _require_master_mode("realism import-personas")
        from decnet.realism import personas_pool as global_pool
        from decnet.realism.personas import parse_personas

        try:
            raw = path.read_text(encoding="utf-8")
        except OSError as exc:
            console.print(f"[red]Cannot read {path}:[/] {exc}")
            raise typer.Exit(code=1) from exc

        try:
            payload = json.loads(raw)
        except json.JSONDecodeError as exc:
            console.print(f"[red]Invalid JSON in {path}:[/] {exc}")
            raise typer.Exit(code=1) from exc
        if not isinstance(payload, list):
            console.print(
                f"[red]{path} must contain a JSON list of personas, "
                f"got {type(payload).__name__}[/]"
            )
            raise typer.Exit(code=1)

        personas = parse_personas(payload)
        if not personas:
            console.print(
                f"[red]No valid personas in {path}.[/] "
                "Check the schema (name, email, role, tone, mannerisms)."
            )
            raise typer.Exit(code=1)
        if len(personas) < 2:
            # Warn but continue: a short pool is still imported.
            console.print(
                f"[yellow]Warning: only {len(personas)} valid persona(s) — "
                "the worker requires at least 2 to send mail; importing "
                "anyway in case more are added later.[/]"
            )

        dest = output or global_pool.resolve_path()
        # Serialize before touching the filesystem so a serialization error
        # cannot leave a freshly created but empty destination directory.
        serialized = json.dumps(
            [p.model_dump(exclude_none=False) for p in personas],
            indent=2,
            ensure_ascii=False,
        )
        try:
            dest.parent.mkdir(parents=True, exist_ok=True)
            dest.write_text(serialized, encoding="utf-8")
        except OSError as exc:
            # Mirror the read-side handling: a clean message and exit code
            # instead of a traceback (e.g. no permission on the target dir).
            console.print(f"[red]Cannot write {dest}:[/] {exc}")
            raise typer.Exit(code=1) from exc
        global_pool.reset_cache()
        console.print(
            f"[green]Imported {len(personas)} personas to[/] {dest}"
        )
        if path != dest:
            log.info("realism import-personas src=%s dest=%s", path, dest)
|
||||||
62
decnet/cli/reconciler.py
Normal file
62
decnet/cli/reconciler.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from . import utils as _utils
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Register the ``reconcile`` command on *app*."""

    @app.command(name="reconcile")
    def reconcile_cmd(
        once: bool = typer.Option(
            False, "--once",
            help="Run a single reconcile pass and exit (no daemon loop).",
        ),
        interval: int = typer.Option(
            30, "--interval", "-i",
            help="Seconds between reconcile passes (ignored with --once).",
        ),
        daemon: bool = typer.Option(
            False, "--daemon", "-d",
            help="Detach to background as a daemon process (long-lived only).",
        ),
    ) -> None:
        """Converge fleet state across decnet-state.json, the DB, and docker."""
        import asyncio
        from decnet.web.dependencies import repo

        if once:
            # Single pass: print the counters and return without looping
            # (--daemon and --interval are not consulted on this path).
            from decnet.fleet.reconciler import reconcile_once

            async def _single_pass() -> None:
                await repo.initialize()
                tally = await reconcile_once(repo)
                summary = " ".join(
                    f"{key}={tally[key]}"
                    for key in ("inserted", "deleted", "state_updated")
                )
                console.print(f"[bold cyan]reconcile:[/] {summary}")

            asyncio.run(_single_pass())
            return

        from decnet.fleet.reconciler_worker import fleet_reconciler_worker

        if daemon:
            log.info("reconciler daemonizing interval=%d", interval)
            _utils._daemonize()

        log.info("reconciler starting interval=%d", interval)
        console.print(
            f"[bold cyan]Fleet reconciler starting[/] (interval: {interval}s)"
        )

        async def _loop() -> None:
            await repo.initialize()
            await fleet_reconciler_worker(repo, interval=interval)

        try:
            asyncio.run(_loop())
        except KeyboardInterrupt:
            console.print("\n[yellow]Reconciler stopped.[/]")
|
||||||
348
decnet/cli/topology.py
Normal file
348
decnet/cli/topology.py
Normal file
@@ -0,0 +1,348 @@
|
|||||||
|
"""MazeNET topology CLI: generate / deploy / teardown / list / show."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import typer
|
||||||
|
from rich.console import Console
|
||||||
|
from rich.table import Table
|
||||||
|
|
||||||
|
from decnet.topology.config import TopologyConfig
|
||||||
|
from decnet.topology.generator import generate
|
||||||
|
from decnet.topology.persistence import hydrate, persist
|
||||||
|
from decnet.topology.status import TopologyStatus
|
||||||
|
|
||||||
|
from .gating import _require_master_mode
|
||||||
|
|
||||||
|
_console = Console()
|
||||||
|
|
||||||
|
_group = typer.Typer(
|
||||||
|
name="topology",
|
||||||
|
help="MazeNET nested-topology commands (DECNET master only).",
|
||||||
|
no_args_is_help=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def _repo():
    """Return the repository from the DB factory after awaiting its ``initialize()``."""
    # Imported at call time rather than at module import.
    from decnet.web.db.factory import get_repository

    repository = get_repository()
    await repository.initialize()
    return repository
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("generate")
def _generate(
    name: str = typer.Option(..., "--name", help="Topology name"),
    depth: int = typer.Option(3, "--depth", min=1, max=16),
    branching: int = typer.Option(2, "--branching", min=1, max=8),
    deckies_per_lan: str = typer.Option(
        "1-3",
        "--deckies-per-lan",
        help="Min-max deckies per LAN, e.g. 1-3",
    ),
    bridge_forward_probability: float = typer.Option(1.0, "--bridge-forward-p", min=0.0, max=1.0),
    cross_edge_probability: float = typer.Option(0.0, "--cross-edge-p", min=0.0, max=1.0),
    services: Optional[str] = typer.Option(None, "--services", help="Comma-separated explicit services"),
    randomize_services: bool = typer.Option(True, "--randomize-services/--no-randomize-services"),
    seed: Optional[int] = typer.Option(None, "--seed", min=0),
) -> None:
    """Generate a topology plan and persist it as pending."""
    _require_master_mode("topology generate")

    # "MIN-MAX" -> two ints. Both a non-numeric part and a missing "-"
    # surface as ValueError (int() and tuple unpacking respectively).
    try:
        lo, hi = (int(x) for x in deckies_per_lan.split("-", 1))
    except ValueError as exc:
        _console.print("[red]--deckies-per-lan must be formatted as MIN-MAX, e.g. 1-3.[/]")
        raise typer.Exit(1) from exc

    # Blank entries (e.g. from a trailing comma) are dropped.
    services_explicit = (
        [s.strip() for s in services.split(",") if s.strip()] if services else None
    )

    try:
        cfg = TopologyConfig(
            name=name,
            depth=depth,
            branching_factor=branching,
            deckies_per_lan_min=lo,
            deckies_per_lan_max=hi,
            bridge_forward_probability=bridge_forward_probability,
            cross_edge_probability=cross_edge_probability,
            services_explicit=services_explicit,
            # An explicit service list switches randomization off.
            randomize_services=randomize_services and not services_explicit,
            seed=seed,
        )
    except ValueError as exc:
        _console.print(f"[red]{exc}[/]")
        raise typer.Exit(1) from exc

    plan = generate(cfg)

    async def _go() -> str:
        repo = await _repo()
        return await persist(repo, plan)

    tid = asyncio.run(_go())
    _console.print(f"[green]Topology persisted as pending[/] — id=[bold]{tid}[/]")
    _console.print(
        f" LANs: {len(plan.lans)} deckies: {len(plan.deckies)} edges: {len(plan.edges)}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("list")
def _list() -> None:
    """List all topologies."""
    _require_master_mode("topology list")

    async def _fetch() -> list[dict]:
        repo = await _repo()
        return await repo.list_topologies()

    topologies = asyncio.run(_fetch())
    if not topologies:
        _console.print("[yellow]No topologies.[/]")
        return

    # These four keys are read with [] (must be present); created_at is
    # read with .get() and falls back to an empty cell.
    required = ("id", "name", "mode", "status")
    table = Table(title="DECNET / MazeNET Topologies")
    for heading in (*required, "created_at"):
        table.add_column(heading)
    for row in topologies:
        cells = [str(row[key]) for key in required]
        cells.append(str(row.get("created_at", "")))
        table.add_row(*cells)
    _console.print(table)
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("show")
def _show(topology_id: str = typer.Argument(..., help="Topology id")) -> None:
    """Print a structured summary of a topology."""
    # Output: one header line for the topology, then one section per LAN
    # listing every decky attached to it (name, IP, services, edge tags).
    # Exits with status 1 when ``hydrate`` returns ``None``.
    _require_master_mode("topology show")

    async def _go():
        repo = await _repo()
        return await hydrate(repo, topology_id)

    hydrated = asyncio.run(_go())
    if hydrated is None:
        _console.print(f"[red]No such topology: {topology_id}[/]")
        raise typer.Exit(1)

    topo = hydrated["topology"]
    _console.print(
        f"[bold]{topo['name']}[/] id={topo['id']} status={topo['status']}"
        f" mode={topo['mode']}"
    )

    def _decky_name(d: dict) -> str:
        # Preference order: configured name, row-level name, then the uuid.
        cfg = d.get("decky_config") or {}
        return cfg.get("name") or d.get("name") or d["uuid"]

    # Index deckies by uuid once so each edge resolves with a dict lookup
    # instead of rescanning the whole decky list. ``setdefault`` keeps the
    # first row seen for a uuid, which is what a first-match scan would pick.
    deckies_by_uuid: dict[str, dict] = {}
    for d in hydrated["deckies"]:
        deckies_by_uuid.setdefault(d["uuid"], d)

    # Group edges under the LAN they belong to, preserving input order.
    edges_by_lan: dict[str, list[dict]] = {}
    for e in hydrated["edges"]:
        edges_by_lan.setdefault(e["lan_id"], []).append(e)

    for lan in hydrated["lans"]:
        dmz_tag = " [dim](DMZ)[/]" if lan["is_dmz"] else ""
        _console.print(f"\n[cyan]LAN[/] {lan['name']} {lan['subnet']}{dmz_tag}")
        for e in edges_by_lan.get(lan["id"], []):
            decky = deckies_by_uuid.get(e["decky_uuid"])
            if decky is None:
                # Edge points at a decky that was not hydrated; nothing to show.
                continue
            cfg = decky.get("decky_config") or {}
            name = _decky_name(decky)
            # Per-LAN IP from the config wins over the row-level ip.
            ip = (cfg.get("ips_by_lan") or {}).get(lan["name"]) or decky.get("ip") or "?"
            tags = []
            if e["is_bridge"]:
                tags.append("bridge")
            if e["forwards_l3"]:
                tags.append("L3-forward")
            tag_s = f" [yellow]({', '.join(tags)})[/]" if tags else ""
            svcs = ",".join(cfg.get("services") or decky.get("services") or []) or "-"
            _console.print(f" • {name} {ip} svcs={svcs}{tag_s}")
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("deploy")
def _deploy(
    topology_id: str = typer.Argument(..., help="Topology id (must be pending)"),
    dry_run: bool = typer.Option(False, "--dry-run", help="Write compose + create nets, skip containers"),
) -> None:
    """Deploy a pending topology."""
    _require_master_mode("topology deploy")
    # Imported only after the mode gate has passed, so a refused invocation
    # never loads the deployer module.
    from decnet.engine.deployer import deploy_topology

    async def _deploy_once() -> None:
        # Resolve the repository, then hand the deployer the id and dry-run flag.
        await deploy_topology(await _repo(), topology_id, dry_run=dry_run)

    asyncio.run(_deploy_once())
    # Reached only when deploy_topology returned without raising.
    _console.print(f"[green]Topology {topology_id} deployed.[/]")
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("teardown")
def _teardown(
    topology_id: str = typer.Argument(..., help="Topology id"),
) -> None:
    """Tear down a topology. Legal from active|degraded|failed|deploying."""
    _require_master_mode("topology teardown")
    # Imported only after the mode gate has passed.
    from decnet.engine.deployer import teardown_topology

    async def _teardown_once() -> None:
        # Resolve the repository, then delegate the teardown to the deployer.
        await teardown_topology(await _repo(), topology_id)

    asyncio.run(_teardown_once())
    # Reached only when teardown_topology returned without raising.
    _console.print(f"[green]Topology {topology_id} torn down.[/]")
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("delete")
def _delete(
    topology_id: str = typer.Argument(..., help="Topology id"),
    force: bool = typer.Option(
        False,
        "--force",
        help="Skip the confirmation prompt (required for non-interactive use).",
    ),
) -> None:
    """Delete a topology and all its children (LANs, deckies, edges, mutations).

    Refuses while containers are running — teardown first.
    """
    _require_master_mode("topology delete")

    # Statuses in which the delete is refused.
    blocking_states = {
        TopologyStatus.DEPLOYING,
        TopologyStatus.ACTIVE,
        TopologyStatus.DEGRADED,
        TopologyStatus.TEARING_DOWN,
    }

    async def _attempt() -> tuple[bool, Optional[str]]:
        # Returns (deleted, reason). ``reason`` is "not-found", the blocking
        # status rendered as a string, or None when the cascade was attempted.
        repo = await _repo()
        topo = await repo.get_topology(topology_id)
        if topo is None:
            return False, "not-found"
        status = topo["status"]
        if status in blocking_states:
            return False, str(status)
        return await repo.delete_topology_cascade(topology_id), None

    # The prompt runs before any repository access; --force bypasses it.
    if not force:
        confirmed = typer.confirm(
            f"Delete topology {topology_id} and all its children? This cannot be undone.",
            default=False,
        )
        if not confirmed:
            _console.print("[yellow]Cancelled.[/]")
            raise typer.Exit(0)

    ok, reason = asyncio.run(_attempt())

    if reason is None and ok:
        _console.print(f"[green]Topology {topology_id} deleted.[/]")
        return

    # Every remaining outcome is a failure: pick the message, then exit 1.
    if reason == "not-found":
        failure = f"[red]No such topology: {topology_id}[/]"
    elif reason is not None:
        failure = (
            f"[red]Cannot delete while status={reason!r}. Run "
            f"[bold]decnet topology teardown {topology_id}[/] first.[/]"
        )
    else:
        failure = f"[red]Delete failed: {topology_id}[/]"
    _console.print(failure)
    raise typer.Exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("mutate")
def _mutate(
    topology_id: str = typer.Argument(..., help="Topology id (active or degraded)"),
    op: str = typer.Argument(
        ...,
        help=(
            "One of: add_lan, remove_lan, add_decky, attach_decky, "
            "detach_decky, remove_decky, update_decky, update_lan"
        ),
    ),
    payload_json: str = typer.Option(
        "{}",
        "--payload-json",
        help="JSON payload for the op (see mutator.ops for keys)",
    ),
    expected_version: Optional[int] = typer.Option(
        None,
        "--expected-version",
        help="Optimistic-concurrency guard; enqueue fails with a "
        "VersionConflict if the topology has since been mutated.",
    ),
) -> None:
    """Enqueue a live mutation. The mutator's watch loop applies it."""
    # Exits with status 1 (after printing a red message) when --payload-json
    # is not valid JSON or does not decode to a JSON object.
    _require_master_mode("topology mutate")
    import json

    try:
        payload = json.loads(payload_json)
    except ValueError as e:
        _console.print(f"[red]Invalid JSON: {e}[/]")
        raise typer.Exit(1)

    # The option's help describes payloads by their keys, so anything other
    # than a JSON object (list, string, number, null) is rejected here
    # instead of being queued.
    if not isinstance(payload, dict):
        _console.print(
            "[red]Invalid payload: expected a JSON object, "
            f"got {type(payload).__name__}[/]"
        )
        raise typer.Exit(1)

    async def _go() -> str:
        repo = await _repo()
        return await repo.enqueue_topology_mutation(
            topology_id, op, payload, expected_version=expected_version,
        )

    mid = asyncio.run(_go())
    _console.print(
        f"[green]Mutation enqueued[/] — id=[bold]{mid}[/] op={op} "
        f"(watch for state=applied on [cyan]topology mutations {topology_id}[/])"
    )
|
||||||
|
|
||||||
|
|
||||||
|
@_group.command("mutations")
def _mutations(
    topology_id: str = typer.Argument(..., help="Topology id"),
    state: Optional[str] = typer.Option(
        None,
        "--state",
        help="Filter to one of pending|applying|applied|failed",
    ),
) -> None:
    """List queued/applied mutations for a topology."""
    _require_master_mode("topology mutations")

    async def _fetch() -> list[dict]:
        # The state filter is passed straight through to the repository.
        return await (await _repo()).list_topology_mutations(topology_id, state=state)

    mutations = asyncio.run(_fetch())
    if not mutations:
        _console.print("[yellow]No mutations.[/]")
        return

    table = Table(title=f"Mutations — topology {topology_id}")
    for heading in ("id", "op", "state", "requested_at", "applied_at", "reason"):
        table.add_column(heading)

    for row in mutations:
        cells = [
            # id / op / state are looked up with [] and so must be present.
            str(row["id"]),
            str(row["op"]),
            str(row["state"]),
            # A missing requested_at becomes an empty cell.
            str(row.get("requested_at", "")),
            # ``or ""`` also blanks a stored None (or any other falsy value).
            str(row.get("applied_at") or ""),
            str(row.get("reason") or ""),
        ]
        table.add_row(*cells)

    _console.print(table)
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    """Mount the topology command group on *app* under the name ``topology``."""
    app.add_typer(_group, name="topology")
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["register", "TopologyStatus"]
|
||||||
@@ -134,6 +134,46 @@ def _service_registry(log_file: str) -> list[tuple[str, callable, list[str]]]:
|
|||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _systemd_units(pattern: str = "decnet-*.service") -> list[dict] | None:
|
||||||
|
"""Return state of every systemd unit matching *pattern*, or ``None``
|
||||||
|
when systemctl is unavailable (non-systemd host, container lab,
|
||||||
|
PATH-stripped env, user-manager unreachable).
|
||||||
|
|
||||||
|
Output shape mirrors ``systemctl list-units --output=json``: each
|
||||||
|
dict has ``unit``, ``load``, ``active``, ``sub``, ``description``.
|
||||||
|
Empty list = systemd works but no matching units are loaded (fresh
|
||||||
|
host that never ran ``decnet init``).
|
||||||
|
"""
|
||||||
|
import json # local import — avoids paying it on every CLI startup
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
if not shutil.which("systemctl"):
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
proc = subprocess.run( # nosec B603 B607 — fixed argv, no shell
|
||||||
|
[
|
||||||
|
"systemctl", "list-units",
|
||||||
|
"--type=service", "--all",
|
||||||
|
"--no-legend", "--no-pager",
|
||||||
|
"--output=json",
|
||||||
|
pattern,
|
||||||
|
],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5,
|
||||||
|
check=False,
|
||||||
|
)
|
||||||
|
except (OSError, subprocess.SubprocessError):
|
||||||
|
return None
|
||||||
|
if proc.returncode != 0:
|
||||||
|
return None
|
||||||
|
try:
|
||||||
|
data = json.loads(proc.stdout or "[]")
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
return None
|
||||||
|
return data if isinstance(data, list) else None
|
||||||
|
|
||||||
|
|
||||||
def _kill_all_services() -> None:
|
def _kill_all_services() -> None:
|
||||||
"""Find and kill all running DECNET microservice processes."""
|
"""Find and kill all running DECNET microservice processes."""
|
||||||
registry = _service_registry(str(DECNET_INGEST_LOG_FILE))
|
registry = _service_registry(str(DECNET_INGEST_LOG_FILE))
|
||||||
|
|||||||
@@ -2,17 +2,33 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import typer
|
import typer
|
||||||
|
|
||||||
from decnet.env import DECNET_API_PORT, DECNET_WEB_HOST, DECNET_WEB_PORT
|
from decnet.env import DECNET_API_HOST, DECNET_API_PORT, DECNET_WEB_HOST, DECNET_WEB_PORT
|
||||||
|
|
||||||
from . import utils as _utils
|
from . import utils as _utils
|
||||||
from .utils import console, log
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def _proxy_target(api_host: str) -> str:
|
||||||
|
"""Resolve the host the web proxy should connect to.
|
||||||
|
|
||||||
|
The API binds at ``DECNET_API_HOST``; when that's a wildcard
|
||||||
|
(``0.0.0.0`` / ``::``) we still connect over loopback because the
|
||||||
|
web and API run in the same host. When the operator binds the API
|
||||||
|
to a specific address (e.g. a Tailscale IP), the API is *only*
|
||||||
|
reachable there — loopback is closed — so the proxy must follow.
|
||||||
|
"""
|
||||||
|
wildcard = {"0.0.0.0", "::", ""} # nosec B104 — comparison only
|
||||||
|
if api_host in wildcard:
|
||||||
|
return "127.0.0.1"
|
||||||
|
return api_host
|
||||||
|
|
||||||
|
|
||||||
def register(app: typer.Typer) -> None:
|
def register(app: typer.Typer) -> None:
|
||||||
@app.command(name="web")
|
@app.command(name="web")
|
||||||
def serve_web(
|
def serve_web(
|
||||||
web_port: int = typer.Option(DECNET_WEB_PORT, "--web-port", help="Port to serve the DECNET Web Dashboard"),
|
web_port: int = typer.Option(DECNET_WEB_PORT, "--web-port", help="Port to serve the DECNET Web Dashboard"),
|
||||||
host: str = typer.Option(DECNET_WEB_HOST, "--host", help="Host IP to serve the Web Dashboard"),
|
host: str = typer.Option(DECNET_WEB_HOST, "--host", help="Host IP to serve the Web Dashboard"),
|
||||||
|
api_host: str = typer.Option(DECNET_API_HOST, "--api-host", help="Host the DECNET API is listening on (loopback for wildcard binds)"),
|
||||||
api_port: int = typer.Option(DECNET_API_PORT, "--api-port", help="Port the DECNET API is listening on"),
|
api_port: int = typer.Option(DECNET_API_PORT, "--api-port", help="Port the DECNET API is listening on"),
|
||||||
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
||||||
) -> None:
|
) -> None:
|
||||||
@@ -33,8 +49,13 @@ def register(app: typer.Typer) -> None:
|
|||||||
console.print(f"[red]Frontend build not found at {dist_dir}. Make sure you run 'npm run build' inside 'decnet_web'.[/]")
|
console.print(f"[red]Frontend build not found at {dist_dir}. Make sure you run 'npm run build' inside 'decnet_web'.[/]")
|
||||||
raise typer.Exit(1)
|
raise typer.Exit(1)
|
||||||
|
|
||||||
|
_api_target = _proxy_target(api_host)
|
||||||
|
|
||||||
if daemon:
|
if daemon:
|
||||||
log.info("web daemonizing host=%s port=%d api_port=%d", host, web_port, api_port)
|
log.info(
|
||||||
|
"web daemonizing host=%s port=%d api_target=%s:%d",
|
||||||
|
host, web_port, _api_target, api_port,
|
||||||
|
)
|
||||||
_utils._daemonize()
|
_utils._daemonize()
|
||||||
|
|
||||||
_api_port = api_port
|
_api_port = api_port
|
||||||
@@ -67,6 +88,18 @@ def register(app: typer.Typer) -> None:
|
|||||||
return
|
return
|
||||||
self.send_error(405)
|
self.send_error(405)
|
||||||
|
|
||||||
|
def do_PATCH(self):
|
||||||
|
if self.path.startswith("/api/"):
|
||||||
|
self._proxy("PATCH")
|
||||||
|
return
|
||||||
|
self.send_error(405)
|
||||||
|
|
||||||
|
def do_OPTIONS(self):
|
||||||
|
if self.path.startswith("/api/"):
|
||||||
|
self._proxy("OPTIONS")
|
||||||
|
return
|
||||||
|
self.send_error(405)
|
||||||
|
|
||||||
def _proxy(self, method: str) -> None:
|
def _proxy(self, method: str) -> None:
|
||||||
content_length = int(self.headers.get("Content-Length", 0))
|
content_length = int(self.headers.get("Content-Length", 0))
|
||||||
body = self.rfile.read(content_length) if content_length else None
|
body = self.rfile.read(content_length) if content_length else None
|
||||||
@@ -75,7 +108,7 @@ def register(app: typer.Typer) -> None:
|
|||||||
if k.lower() not in ("host", "connection")}
|
if k.lower() not in ("host", "connection")}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
conn = http.client.HTTPConnection("127.0.0.1", _api_port, timeout=120)
|
conn = http.client.HTTPConnection(_api_target, _api_port, timeout=120)
|
||||||
conn.request(method, self.path, body=body, headers=forward)
|
conn.request(method, self.path, body=body, headers=forward)
|
||||||
resp = conn.getresponse()
|
resp = conn.getresponse()
|
||||||
|
|
||||||
@@ -113,7 +146,7 @@ def register(app: typer.Typer) -> None:
|
|||||||
socketserver.TCPServer.allow_reuse_address = True
|
socketserver.TCPServer.allow_reuse_address = True
|
||||||
with socketserver.ThreadingTCPServer((host, web_port), SPAHTTPRequestHandler) as httpd:
|
with socketserver.ThreadingTCPServer((host, web_port), SPAHTTPRequestHandler) as httpd:
|
||||||
console.print(f"[green]Serving DECNET Web Dashboard on http://{host}:{web_port}[/]")
|
console.print(f"[green]Serving DECNET Web Dashboard on http://{host}:{web_port}[/]")
|
||||||
console.print(f"[dim]Proxying /api/* → http://127.0.0.1:{_api_port}[/]")
|
console.print(f"[dim]Proxying /api/* → http://{_api_target}:{_api_port}[/]")
|
||||||
try:
|
try:
|
||||||
httpd.serve_forever()
|
httpd.serve_forever()
|
||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
|
|||||||
35
decnet/cli/webhook.py
Normal file
35
decnet/cli/webhook.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import typer
|
||||||
|
|
||||||
|
from . import utils as _utils
|
||||||
|
from .utils import console, log
|
||||||
|
|
||||||
|
|
||||||
|
def register(app: typer.Typer) -> None:
    # Registers the ``webhook`` command on *app*.

    @app.command(name="webhook")
    def webhook_cmd(
        daemon: bool = typer.Option(
            False, "--daemon", "-d", help="Detach to background as a daemon process"
        ),
    ) -> None:
        """Run the webhook dispatcher — bus consumer → external HTTP egress."""
        # Imports are local to the command so they only load when it runs.
        import asyncio
        from decnet.web.dependencies import repo
        from decnet.webhook import webhook_worker

        if daemon:
            # Log before detaching so the message is emitted by the
            # invoking process.
            log.info("webhook daemonizing")
            _utils._daemonize()

        log.info("webhook starting")
        console.print("[bold cyan]Webhook dispatcher starting[/]")

        async def _serve() -> None:
            # The repository is initialised before the worker is started.
            await repo.initialize()
            await webhook_worker(repo)

        try:
            asyncio.run(_serve())
        except KeyboardInterrupt:
            # Ctrl+C ends the command with a notice instead of a traceback.
            console.print("\n[yellow]Webhook worker stopped.[/]")
|
||||||
@@ -82,61 +82,216 @@ def register(app: typer.Typer) -> None:
|
|||||||
|
|
||||||
asyncio.run(_run())
|
asyncio.run(_run())
|
||||||
|
|
||||||
@app.command(name="correlate")
|
@app.command(name="enrich")
|
||||||
def correlate(
|
def enrich(
|
||||||
log_file: Optional[str] = typer.Option(None, "--log-file", "-f", help="Path to DECNET syslog file to analyse"),
|
poll_interval_secs: float = typer.Option(
|
||||||
min_deckies: int = typer.Option(2, "--min-deckies", "-m", help="Minimum number of distinct deckies an IP must touch to be reported"),
|
60.0, "--poll-interval", "-i",
|
||||||
output: str = typer.Option("table", "--output", "-o", help="Output format: table | json | syslog"),
|
help="Slow-tick fallback when the bus is idle or unavailable (seconds)",
|
||||||
emit_syslog: bool = typer.Option(False, "--emit-syslog", help="Also print traversal events as RFC 5424 lines (for SIEM piping)"),
|
),
|
||||||
daemon: bool = typer.Option(False, "--daemon", "-d", help="Detach to background as a daemon process"),
|
ttl_hours: int = typer.Option(
|
||||||
|
24, "--ttl-hours",
|
||||||
|
help="Cache lifetime per attacker IP — re-firings inside the window short-circuit before any HTTP egress",
|
||||||
|
),
|
||||||
|
daemon: bool = typer.Option(
|
||||||
|
False, "--daemon", "-d",
|
||||||
|
help="Detach to background as a daemon process",
|
||||||
|
),
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Analyse logs for cross-decky traversals and print the attacker movement graph."""
|
"""Threat-intel enrichment worker — fan out per attacker IP across
|
||||||
import sys
|
configured providers (GreyNoise, AbuseIPDB, abuse.ch Feodo Tracker
|
||||||
import json as _json
|
+ ThreatFox), cache the verdict in ``attacker_intel``, and publish
|
||||||
from pathlib import Path
|
``attacker.intel.enriched`` for SIEM-bound webhook consumers.
|
||||||
from decnet.correlation.engine import CorrelationEngine
|
"""
|
||||||
|
import asyncio
|
||||||
|
from decnet.intel.worker import run_intel_loop
|
||||||
|
from decnet.web.dependencies import repo
|
||||||
|
|
||||||
if daemon:
|
if daemon:
|
||||||
log.info("correlate daemonizing log_file=%s", log_file)
|
log.info(
|
||||||
|
"enrich daemonizing poll=%s ttl_hours=%d",
|
||||||
|
poll_interval_secs, ttl_hours,
|
||||||
|
)
|
||||||
_utils._daemonize()
|
_utils._daemonize()
|
||||||
|
|
||||||
engine = CorrelationEngine()
|
log.info(
|
||||||
|
"enrich command invoked poll=%s ttl_hours=%d",
|
||||||
|
poll_interval_secs, ttl_hours,
|
||||||
|
)
|
||||||
|
console.print(
|
||||||
|
f"[bold cyan]Intel enrichment starting[/] "
|
||||||
|
f"poll={poll_interval_secs}s ttl={ttl_hours}h"
|
||||||
|
)
|
||||||
|
console.print("[dim]Press Ctrl+C to stop[/]")
|
||||||
|
|
||||||
if log_file:
|
async def _run() -> None:
|
||||||
path = Path(log_file)
|
await repo.initialize()
|
||||||
if not path.exists():
|
await run_intel_loop(
|
||||||
console.print(f"[red]Log file not found: {log_file}[/]")
|
repo,
|
||||||
raise typer.Exit(1)
|
poll_interval_secs=poll_interval_secs,
|
||||||
engine.ingest_file(path)
|
ttl_hours=ttl_hours,
|
||||||
elif not sys.stdin.isatty():
|
)
|
||||||
for line in sys.stdin:
|
|
||||||
engine.ingest(line)
|
|
||||||
else:
|
|
||||||
console.print("[red]Provide --log-file or pipe log data via stdin.[/]")
|
|
||||||
raise typer.Exit(1)
|
|
||||||
|
|
||||||
traversals = engine.traversals(min_deckies)
|
try:
|
||||||
|
asyncio.run(_run())
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
console.print("\n[yellow]Intel enrichment stopped.[/]")
|
||||||
|
|
||||||
if output == "json":
|
@app.command(name="reuse-correlate")
|
||||||
console.print_json(_json.dumps(engine.report_json(min_deckies), indent=2))
|
def reuse_correlate(
|
||||||
elif output == "syslog":
|
min_targets: int = typer.Option(
|
||||||
for line in engine.traversal_syslog_lines(min_deckies):
|
2, "--min-targets", "-m",
|
||||||
typer.echo(line)
|
help="Minimum distinct (decky, service) targets a secret must hit before a CredentialReuse row is persisted",
|
||||||
else:
|
),
|
||||||
if not traversals:
|
poll_interval_secs: float = typer.Option(
|
||||||
console.print(
|
60.0, "--poll-interval", "-i",
|
||||||
f"[yellow]No traversals detected "
|
help="Slow-tick fallback when the bus is idle or unavailable (seconds)",
|
||||||
f"(min_deckies={min_deckies}, events_indexed={engine.events_indexed}).[/]"
|
),
|
||||||
)
|
daemon: bool = typer.Option(
|
||||||
else:
|
False, "--daemon", "-d",
|
||||||
console.print(engine.report_table(min_deckies))
|
help="Detach to background as a daemon process",
|
||||||
console.print(
|
),
|
||||||
f"[dim]Parsed {engine.lines_parsed} lines · "
|
) -> None:
|
||||||
f"indexed {engine.events_indexed} events · "
|
"""Long-running credential-reuse correlator.
|
||||||
f"{len(engine.all_attackers())} unique IPs · "
|
|
||||||
f"[bold]{len(traversals)}[/] traversal(s)[/]"
|
|
||||||
)
|
|
||||||
|
|
||||||
if emit_syslog:
|
Watches the bus for ``credential.captured`` and ``attacker.observed``
|
||||||
for line in engine.traversal_syslog_lines(min_deckies):
|
events, re-runs the reuse pass on each wake, and publishes
|
||||||
typer.echo(line)
|
``credential.reuse.detected`` for every new or grown
|
||||||
|
``CredentialReuse`` row.
|
||||||
|
"""
|
||||||
|
import asyncio
|
||||||
|
from decnet.correlation.reuse_worker import run_reuse_loop
|
||||||
|
from decnet.web.dependencies import repo
|
||||||
|
|
||||||
|
if daemon:
|
||||||
|
log.info(
|
||||||
|
"reuse-correlate daemonizing min_targets=%d poll=%s",
|
||||||
|
min_targets, poll_interval_secs,
|
||||||
|
)
|
||||||
|
_utils._daemonize()
|
||||||
|
|
||||||
|
log.info(
|
||||||
|
"reuse-correlate command invoked min_targets=%d poll=%s",
|
||||||
|
min_targets, poll_interval_secs,
|
||||||
|
)
|
||||||
|
console.print(
|
||||||
|
f"[bold cyan]Reuse correlator starting[/] "
|
||||||
|
f"min_targets={min_targets} poll={poll_interval_secs}s"
|
||||||
|
)
|
||||||
|
console.print("[dim]Press Ctrl+C to stop[/]")
|
||||||
|
|
||||||
|
async def _run() -> None:
|
||||||
|
await repo.initialize()
|
||||||
|
await run_reuse_loop(
|
||||||
|
repo,
|
||||||
|
poll_interval_secs=poll_interval_secs,
|
||||||
|
min_targets=min_targets,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
asyncio.run(_run())
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
console.print("\n[yellow]Reuse correlator stopped.[/]")
|
||||||
|
|
||||||
|
@app.command(name="clusterer")
def clusterer(
    poll_interval_secs: float = typer.Option(
        60.0, "--poll-interval", "-i",
        help="Slow-tick fallback when the bus is idle or unavailable (seconds)",
    ),
    daemon: bool = typer.Option(
        False, "--daemon", "-d",
        help="Detach to background as a daemon process",
    ),
) -> None:
    """Identity-resolution clusterer.

    Bus-woken on ``attacker.observed`` and ``attacker.scored``;
    builds a similarity graph over observations, runs
    connected-components, writes ``attacker_identities`` rows, and
    publishes ``identity.formed`` / ``identity.observation.linked``
    / ``identity.merged`` / ``identity.unmerged``.
    """
    # Imports are local to the command so they only load when it runs.
    import asyncio
    from decnet.cli.gating import _require_master_mode
    from decnet.clustering.worker import run_clusterer_loop
    from decnet.web.dependencies import repo

    # The mode gate runs before any daemonizing or logging.
    _require_master_mode("clusterer")

    if daemon:
        log.info("clusterer daemonizing poll=%s", poll_interval_secs)
        _utils._daemonize()

    log.info("clusterer command invoked poll=%s", poll_interval_secs)
    console.print(
        f"[bold cyan]Identity clusterer starting[/] poll={poll_interval_secs}s"
    )
    console.print("[dim]Press Ctrl+C to stop[/]")

    async def _serve() -> None:
        # The repository is initialised before the loop is entered.
        await repo.initialize()
        await run_clusterer_loop(repo, poll_interval_secs=poll_interval_secs)

    try:
        asyncio.run(_serve())
    except KeyboardInterrupt:
        # Ctrl+C ends the command with a notice instead of a traceback.
        console.print("\n[yellow]Identity clusterer stopped.[/]")
|
||||||
|
|
||||||
|
@app.command(name="campaign-clusterer")
def campaign_clusterer(
    poll_interval_secs: float = typer.Option(
        60.0, "--poll-interval", "-i",
        help="Slow-tick fallback when the bus is idle or unavailable (seconds)",
    ),
    daemon: bool = typer.Option(
        False, "--daemon", "-d",
        help="Detach to background as a daemon process",
    ),
) -> None:
    """Campaign clusterer — groups identities into operations.

    Bus-woken on ``identity.>`` (any identity-layer change is
    potential input); reads ``AttackerIdentity`` rows, runs
    connected-components over the campaign-level similarity graph
    (phase-handoff / shared-infra / temporal-overlap / cohort),
    writes ``campaigns`` rows + sets ``attacker_identities.campaign_id``,
    and publishes ``campaign.formed`` / ``campaign.identity.assigned``
    / ``campaign.merged`` / ``campaign.unmerged`` plus the cross-family
    ``identity.campaign.assigned`` so identity-side subscribers see
    the badge update.
    """
    # Imports are local to the command so they only load when it runs.
    import asyncio
    from decnet.cli.gating import _require_master_mode
    from decnet.clustering.campaign.worker import run_campaign_clusterer_loop
    from decnet.web.dependencies import repo

    # The mode gate runs before any daemonizing or logging.
    _require_master_mode("campaign-clusterer")

    if daemon:
        log.info("campaign-clusterer daemonizing poll=%s", poll_interval_secs)
        _utils._daemonize()

    log.info("campaign-clusterer command invoked poll=%s", poll_interval_secs)
    console.print(
        f"[bold cyan]Campaign clusterer starting[/] poll={poll_interval_secs}s"
    )
    console.print("[dim]Press Ctrl+C to stop[/]")

    async def _serve() -> None:
        # The repository is initialised before the loop is entered.
        await repo.initialize()
        await run_campaign_clusterer_loop(
            repo, poll_interval_secs=poll_interval_secs,
        )

    try:
        asyncio.run(_serve())
    except KeyboardInterrupt:
        # Ctrl+C ends the command with a notice instead of a traceback.
        console.print("\n[yellow]Campaign clusterer stopped.[/]")
|
||||||
|
|||||||
1
decnet/clustering/__init__.py
Normal file
1
decnet/clustering/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""Campaign clustering — see development/CAMPAIGN_CLUSTERING.md."""
|
||||||
83
decnet/clustering/base.py
Normal file
83
decnet/clustering/base.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
"""Identity-resolution clusterer protocol.
|
||||||
|
|
||||||
|
Each concrete clusterer (``decnet.clustering.impl.connected_components``,
|
||||||
|
and any future variant) implements this. Callers must obtain the active
|
||||||
|
clusterer via :func:`decnet.clustering.factory.get_clusterer` — never
|
||||||
|
instantiate a concrete class directly.
|
||||||
|
|
||||||
|
The clusterer mirrors the provider-subpackage convention used by
|
||||||
|
:mod:`decnet.bus` and :mod:`decnet.web.db`: ``base.py`` defines the
|
||||||
|
protocol, ``factory.py`` dispatches on ``DECNET_CLUSTERER_TYPE``, and
|
||||||
|
``impl/`` holds concrete implementations.
|
||||||
|
|
||||||
|
Distinct from the ``tests/factories/campaign_factory.py`` namespace —
|
||||||
|
that's the synthetic-data DSL used by the fixture suite. The clusterer
|
||||||
|
here is the production worker that the fixture suite *gates*.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ClusterResult:
    """What one clusterer ``tick`` changed, for the worker shell to publish.

    Each list feeds one bus topic (``identity.formed`` /
    ``identity.observation.linked`` / ``identity.merged`` /
    ``identity.unmerged``). Per the clusterer contract the DB writes are
    already committed when this object is returned, so a lost publish
    costs only UI latency.
    """

    # One dict per newly created identity:
    #   {"identity_uuid": str, "observation_uuids": [str, ...]}
    identities_formed: list[dict[str, Any]] = field(default_factory=list)

    # One dict per observation attached to an existing identity:
    #   {"identity_uuid": str, "observation_uuid": str}
    observations_linked: list[dict[str, Any]] = field(default_factory=list)

    # One dict per merge:
    #   {"winner_uuid": str, "loser_uuid": str}
    identities_merged: list[dict[str, Any]] = field(default_factory=list)

    # One dict per revoked merge (contradicting evidence re-split a
    # previously merged pair):
    #   {"resurrected_uuid": str, "former_winner_uuid": str}
    # Reserved for the revocable-merge work; the skeleton clusterer never
    # produces these. Subscribers on ``identity.>`` should still handle
    # them from day one — see ``identity.unmerged`` in decnet.bus.topics.
    identities_unmerged: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class Clusterer(ABC):
    """Base class every identity-resolution clusterer derives from.

    The contract is a single coroutine, ``tick``: read pending
    observations from the repo, run one clustering pass, commit
    ``attacker_identities`` rows and set ``attackers.identity_id``, then
    return a :class:`ClusterResult` describing those side-effects so the
    worker shell can publish them.

    ``tick`` MUST NOT raise. One bad pass is not allowed to crash the
    worker: implementations log internal failures and return an empty
    :class:`ClusterResult` instead.
    """

    # Short tag for this implementation. It surfaces in logs and is the
    # value ``DECNET_CLUSTERER_TYPE`` is matched against for factory dispatch.
    name: str

    @abstractmethod
    async def tick(self, repo: BaseRepository) -> ClusterResult:
        """Run one clustering pass against *repo*; see the class docstring."""
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["Clusterer", "ClusterResult"]
|
||||||
5
decnet/clustering/campaign/__init__.py
Normal file
5
decnet/clustering/campaign/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""Campaign clusterer — groups resolved identities into operations.
|
||||||
|
|
||||||
|
The layer above identity resolution. See
|
||||||
|
``development/CAMPAIGN_CLUSTERING.md`` for the signal taxonomy.
|
||||||
|
"""
|
||||||
66
decnet/clustering/campaign/base.py
Normal file
66
decnet/clustering/campaign/base.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
"""Campaign clusterer protocol — layer above identity resolution.
|
||||||
|
|
||||||
|
Mirrors :mod:`decnet.clustering.base` for the layer above. Each concrete
|
||||||
|
campaign clusterer implements :class:`CampaignClusterer`; callers obtain
|
||||||
|
the active instance via
|
||||||
|
:func:`decnet.clustering.campaign.factory.get_campaign_clusterer`.
|
||||||
|
|
||||||
|
The result shape parallels :class:`ClusterResult` but speaks campaign
|
||||||
|
vocabulary: campaigns formed, identities assigned, campaigns merged,
|
||||||
|
campaigns unmerged.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CampaignClusterResult:
    """What a single campaign-clusterer ``tick`` changed.

    The worker shell reads this to publish bus events
    (``campaign.formed`` / ``campaign.identity.assigned`` /
    ``campaign.merged`` / ``campaign.unmerged`` plus the cross-family
    ``identity.campaign.assigned``). By the time a result is returned
    the corresponding DB writes have already been committed.
    """

    #: Newly created campaigns. Entry shape:
    #: ``{"campaign_uuid": str, "identity_uuids": [str, ...]}``.
    campaigns_formed: list[dict[str, Any]] = field(default_factory=list)

    #: Identities attached to a campaign. Entry shape:
    #: ``{"campaign_uuid": str, "identity_uuid": str,
    #: "prior_campaign_uuid": Optional[str]}``.
    identities_assigned: list[dict[str, Any]] = field(default_factory=list)

    #: Campaign merges. Entry shape:
    #: ``{"winner_uuid": str, "loser_uuid": str}``.
    campaigns_merged: list[dict[str, Any]] = field(default_factory=list)

    #: Revoked campaign merges. Entry shape:
    #: ``{"resurrected_uuid": str, "former_winner_uuid": str}``.
    campaigns_unmerged: list[dict[str, Any]] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
class CampaignClusterer(ABC):
    """Base class every campaign clusterer derives from.

    Same single-method shape as :class:`Clusterer`. A ``tick`` reads
    identities from the repo, projects them to a campaign-level feature
    shape, clusters them, commits ``campaigns`` rows and sets
    ``attacker_identities.campaign_id``, then returns a
    :class:`CampaignClusterResult` summarising those side-effects.

    ``tick`` MUST NOT raise: one bad pass is not allowed to take the
    worker down.
    """

    #: Short tag identifying the concrete implementation.
    name: str

    @abstractmethod
    async def tick(self, repo: BaseRepository) -> CampaignClusterResult:
        """Execute one campaign clustering pass."""
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["CampaignClusterer", "CampaignClusterResult"]
|
||||||
31
decnet/clustering/campaign/factory.py
Normal file
31
decnet/clustering/campaign/factory.py
Normal file
@@ -0,0 +1,31 @@
|
|||||||
|
"""Campaign-clusterer factory.
|
||||||
|
|
||||||
|
Mirrors :mod:`decnet.clustering.factory` for the campaign layer.
|
||||||
|
Configuration knob ``DECNET_CAMPAIGN_CLUSTERER_TYPE``; default
|
||||||
|
``"connected_components"``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from decnet.clustering.campaign.base import CampaignClusterer
|
||||||
|
|
||||||
|
# Implementation tags the factory can build, and the one used when the
# environment does not name any.
_KNOWN: tuple[str, ...] = ("connected_components",)
_DEFAULT = "connected_components"


def get_campaign_clusterer() -> CampaignClusterer:
    """Build the campaign clusterer selected by the environment.

    The tag is read from ``DECNET_CAMPAIGN_CLUSTERER_TYPE`` (falling
    back to the default when the variable is unset), then stripped and
    lower-cased before dispatch.

    Raises:
        ValueError: the tag does not name a known implementation.
    """
    requested = os.environ.get("DECNET_CAMPAIGN_CLUSTERER_TYPE", _DEFAULT)
    name = requested.strip().lower()

    if name != "connected_components":
        raise ValueError(
            f"Unknown campaign clusterer: {name!r}. Known: {_KNOWN}"
        )

    # Imported only once the tag has matched, so the implementation
    # module is not loaded for a rejected configuration.
    from decnet.clustering.campaign.impl.connected_components import (
        ConnectedComponentsCampaignClusterer,
    )

    return ConnectedComponentsCampaignClusterer()
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["get_campaign_clusterer"]
|
||||||
0
decnet/clustering/campaign/impl/__init__.py
Normal file
0
decnet/clustering/campaign/impl/__init__.py
Normal file
304
decnet/clustering/campaign/impl/connected_components.py
Normal file
304
decnet/clustering/campaign/impl/connected_components.py
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
"""Connected-components campaign clusterer (v1).
|
||||||
|
|
||||||
|
Builds a similarity graph over identities (the layer below — already
|
||||||
|
clustered from raw observations), runs union-find over edges that pass
|
||||||
|
:data:`CAMPAIGN_EDGE_THRESHOLD`, and writes one ``campaigns`` row per
|
||||||
|
component.
|
||||||
|
|
||||||
|
Mirror of :mod:`decnet.clustering.impl.connected_components` for the
|
||||||
|
layer above. Same revocable-merge discipline: identities stay FK'd to
|
||||||
|
their original campaign row throughout, soft pointers via
|
||||||
|
``campaigns.merged_into_uuid``.
|
||||||
|
|
||||||
|
**Time-agnostic.** Edges depend only on pairwise relative offsets —
|
||||||
|
fixture F7 (slow_burn) invariant carries forward to this layer.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import uuid as _uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any, Iterable, Optional
|
||||||
|
|
||||||
|
from decnet.clustering.campaign.base import (
|
||||||
|
CampaignClusterer,
|
||||||
|
CampaignClusterResult,
|
||||||
|
)
|
||||||
|
from decnet.clustering.campaign.impl.similarity import (
|
||||||
|
CAMPAIGN_EDGE_THRESHOLD,
|
||||||
|
IdentityFeatures,
|
||||||
|
combined_campaign_weight,
|
||||||
|
)
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("clustering.campaign.connected_components")
|
||||||
|
|
||||||
|
|
||||||
|
def cluster_identities(
    features: Iterable[IdentityFeatures],
) -> dict[str, str]:
    """Label identities by connected component of the similarity graph.

    Every unordered pair is scored with :func:`combined_campaign_weight`;
    pairs scoring at least :data:`CAMPAIGN_EDGE_THRESHOLD` are joined in
    a union-find forest. No DB, clock or I/O is touched here.

    Returns:
        ``{identity_uuid: cluster_id}``. Cluster ids are opaque strings;
        an identity with no edges gets an id derived from its own uuid.
    """
    nodes = list(features)

    # Union-find forest: every identity starts as its own root.
    root_of: dict[str, str] = {}
    for node in nodes:
        root_of[node.identity_uuid] = node.identity_uuid

    def _root(key: str) -> str:
        # Walk to the root, halving the path on the way up.
        while True:
            up = root_of[key]
            if up == key:
                return key
            root_of[key] = root_of[up]
            key = root_of[key]

    for position, left in enumerate(nodes):
        for right in nodes[position + 1:]:
            if combined_campaign_weight(left, right) >= CAMPAIGN_EDGE_THRESHOLD:
                left_root = _root(left.identity_uuid)
                right_root = _root(right.identity_uuid)
                if left_root != right_root:
                    # The left root is always attached under the right root.
                    root_of[left_root] = right_root

    labels: dict[str, str] = {}
    for node in nodes:
        labels[node.identity_uuid] = f"cmp-{_root(node.identity_uuid)}"
    return labels
|
||||||
|
|
||||||
|
|
||||||
|
def from_identity_row(row: dict[str, Any]) -> IdentityFeatures:
    """Turn one identity projection row into an :class:`IdentityFeatures`.

    ``row`` is expected to be the dict shape produced by
    ``BaseRepository.list_identities_for_clustering``. Only three keys
    are read here: ``uuid`` (required), plus ``payload_simhashes`` and
    ``c2_endpoints`` (JSON ``list[str]`` or null).

    Every other :class:`IdentityFeatures` field keeps its default. In
    particular the phase-handoff fields stay empty until the
    production-row adapter learns to mine logs for per-decky phase
    sequences (TODO.md "production-side payload + C2 + commands
    joins"); the fixture path exercises the full feature set via
    :func:`from_synthetic_identity`.
    """
    return IdentityFeatures(
        identity_uuid=row["uuid"],
        payload_hashes=frozenset(
            _parse_json_list(row.get("payload_simhashes")),
        ),
        c2_endpoints=frozenset(
            _parse_json_list(row.get("c2_endpoints")),
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_json_list(raw: Optional[str | list[Any]]) -> list[str]:
    """Decode a JSON-list column into a list of strings, never raising.

    Args:
        raw: The column value. Normally a JSON-encoded list (or
            ``None`` / empty). A value that is already a ``list`` or
            ``tuple`` is accepted as-is instead of being discarded.

    Returns:
        The list elements converted with ``str``, with ``None``
        entries dropped. Anything that is empty, undecodable, or does
        not decode to a list yields ``[]``.
    """
    if not raw:
        return []

    if isinstance(raw, (list, tuple)):
        # Already decoded: feeding this to json.loads would raise
        # TypeError and silently throw the data away.
        decoded: Any = list(raw)
    else:
        try:
            decoded = json.loads(raw)
        except (TypeError, ValueError, RecursionError):
            # RecursionError covers pathologically nested input; the
            # caller must not see an exception from a bad column.
            return []

    if not isinstance(decoded, list):
        return []
    return [str(item) for item in decoded if item is not None]
|
||||||
|
|
||||||
|
|
||||||
|
class ConnectedComponentsCampaignClusterer(CampaignClusterer):
    """Connected-components campaign clusterer.

    Each ``tick`` loads identity and campaign rows from the repo, labels
    the active identities with :func:`cluster_identities`, and
    reconciles the resulting components against the stored
    ``campaign_id`` / ``merged_into_uuid`` state.
    """

    name = "connected_components"

    async def tick(self, repo: BaseRepository) -> CampaignClusterResult:
        """Run one campaign clustering pass and report what changed.

        Failures are logged rather than raised. A failure while reading
        rows, or while projecting / clustering them, ends the pass with
        an empty result (nothing has been written at that point). A
        failure of an individual write skips just that campaign or
        identity, and the event for it is not reported.

        Args:
            repo: Repository used for all reads and writes.

        Returns:
            The campaigns formed, identities assigned, campaigns merged
            and campaigns unmerged during this pass.
        """
        try:
            rows = await repo.list_identities_for_clustering()
        except Exception:  # noqa: BLE001
            log.exception("campaign clusterer: failed to read identities")
            return CampaignClusterResult()

        if not rows:
            return CampaignClusterResult()

        # Pre-compute the campaign merge chain so an identity's
        # "effective" campaign follows merged_into_uuid up to the winner.
        try:
            all_campaigns = await repo.list_all_campaigns()
        except Exception:  # noqa: BLE001
            log.exception("campaign clusterer: failed to read campaigns")
            return CampaignClusterResult()

        # Chain building, projection and clustering are pure, but they
        # index into repo-supplied rows (e.g. ``row["uuid"]``) and can
        # raise on malformed data. ``tick`` must not raise, so guard
        # them; no write has happened yet, so an empty result is exact.
        try:
            campaign_chain = _build_merge_chain(all_campaigns)

            # Project + cluster. Skip identities that are themselves
            # merged out — their winner is the active row and gets
            # clustered on its own. This keeps the campaign graph from
            # double-counting.
            active_rows = [r for r in rows if not r.get("merged_into_uuid")]
            feature_list: list[IdentityFeatures] = [
                from_identity_row(r) for r in active_rows
            ]
            row_by_uuid: dict[str, dict[str, Any]] = {
                r["uuid"]: r for r in active_rows
            }
            labels = cluster_identities(feature_list)
        except Exception:  # noqa: BLE001
            log.exception(
                "campaign clusterer: failed to project or cluster identities",
            )
            return CampaignClusterResult()

        # Group identities by predicted cluster.
        components: dict[str, list[str]] = {}
        for identity_uuid, cluster_id in labels.items():
            components.setdefault(cluster_id, []).append(identity_uuid)

        result = CampaignClusterResult()
        now = datetime.now(timezone.utc)

        # Pass 1 — per-component reconciliation: form, link, merge.
        for member_ids in components.values():
            literal_campaign_ids = {
                row_by_uuid[m]["campaign_id"] for m in member_ids
                if row_by_uuid[m].get("campaign_id")
            }
            effective_ids = {
                campaign_chain.get(c, c) for c in literal_campaign_ids
            }
            unassigned = [
                m for m in member_ids
                if not row_by_uuid[m].get("campaign_id")
            ]

            if not effective_ids:
                # No member belongs to a campaign yet: create one and
                # link every member to it.
                campaign_uuid = str(_uuid.uuid4())
                try:
                    await repo.create_campaign({
                        "uuid": campaign_uuid,
                        "schema_version": 1,
                        "first_seen_at": now,
                        "last_seen_at": now,
                        "created_at": now,
                        "updated_at": now,
                        "identity_count": len(member_ids),
                    })
                except Exception:  # noqa: BLE001
                    log.exception(
                        "campaign clusterer: failed to create campaign for "
                        "component %s", member_ids,
                    )
                    continue

                linked: list[str] = []
                for identity_uuid in member_ids:
                    if await _link(repo, identity_uuid, campaign_uuid):
                        linked.append(identity_uuid)
                if linked:
                    result.campaigns_formed.append({
                        "campaign_uuid": campaign_uuid,
                        "identity_uuids": linked,
                    })
                continue

            # At least one existing campaign: the smallest effective
            # uuid wins and every other effective campaign is pointed
            # at it.
            winner_uuid = min(effective_ids)
            losers = effective_ids - {winner_uuid}

            for loser_uuid in losers:
                try:
                    await repo.update_campaign_merged_into(
                        loser_uuid, winner_uuid,
                    )
                except Exception:  # noqa: BLE001
                    log.exception(
                        "campaign clusterer: failed to merge %s -> %s",
                        loser_uuid, winner_uuid,
                    )
                    continue
                campaign_chain[loser_uuid] = winner_uuid
                result.campaigns_merged.append({
                    "winner_uuid": winner_uuid,
                    "loser_uuid": loser_uuid,
                })

            for identity_uuid in unassigned:
                if await _link(repo, identity_uuid, winner_uuid):
                    result.identities_assigned.append({
                        "campaign_uuid": winner_uuid,
                        "identity_uuid": identity_uuid,
                        "prior_campaign_uuid": None,
                    })

        # Pass 2 — revocable-merge undo for campaigns. Same shape as
        # the identity-side check: if a merged-out campaign's
        # identities no longer cluster with the winner's, revoke.
        identities_by_literal_campaign: dict[str, list[str]] = {}
        for identity_uuid, r in row_by_uuid.items():
            cid = r.get("campaign_id")
            if cid:
                identities_by_literal_campaign.setdefault(cid, []).append(
                    identity_uuid,
                )

        for campaign_row in all_campaigns:
            # Only rows that were already merged out when this pass
            # started are candidates for an unmerge.
            if not campaign_row.get("merged_into_uuid"):
                continue
            loser_uuid = campaign_row["uuid"]
            winner_uuid = campaign_chain.get(loser_uuid, loser_uuid)
            if winner_uuid == loser_uuid:
                continue
            loser_idents = identities_by_literal_campaign.get(loser_uuid, [])
            winner_idents = identities_by_literal_campaign.get(winner_uuid, [])
            if not loser_idents or not winner_idents:
                # Nothing to compare on one side; leave the merge alone.
                continue
            loser_clusters = {labels[i] for i in loser_idents if i in labels}
            winner_clusters = {labels[i] for i in winner_idents if i in labels}
            if loser_clusters & winner_clusters:
                continue
            try:
                await repo.update_campaign_merged_into(loser_uuid, None)
            except Exception:  # noqa: BLE001
                log.exception(
                    "campaign clusterer: failed to unmerge %s from %s",
                    loser_uuid, winner_uuid,
                )
                continue
            campaign_chain[loser_uuid] = loser_uuid
            result.campaigns_unmerged.append({
                "resurrected_uuid": loser_uuid,
                "former_winner_uuid": winner_uuid,
            })

        return result
|
||||||
|
|
||||||
|
|
||||||
|
def _build_merge_chain(
    rows: list[dict[str, Any]],
) -> dict[str, str]:
    """Map every campaign uuid to where its ``merged_into_uuid`` trail ends.

    For each row the ``merged_into_uuid`` pointers are followed for at
    most eight hops. The walk stops early at a row with no pointer, a
    row pointing at itself, or a uuid that has no row in ``rows`` (that
    dangling uuid is then the result). The hop cap bounds the walk when
    pointers form a cycle.

    Returns:
        ``{uuid: resolved_uuid}`` with one entry per distinct ``uuid``
        in ``rows``; an unmerged campaign maps to itself.
    """
    hop_limit = 8

    index: dict[str, dict[str, Any]] = {}
    for entry in rows:
        index[entry["uuid"]] = entry

    resolved: dict[str, str] = {}
    for start in index:
        node = start
        hops = 0
        while hops < hop_limit and node in index:
            target = index[node].get("merged_into_uuid")
            if not target or target == node:
                break
            node = target
            hops += 1
        resolved[start] = node
    return resolved
|
||||||
|
|
||||||
|
|
||||||
|
async def _link(
    repo: BaseRepository, identity_uuid: str, campaign_uuid: str,
) -> bool:
    """Point one identity at a campaign, reporting success as a bool.

    Any exception from the repo call is logged and swallowed so a
    single failed write does not abort the caller's loop.

    Returns:
        ``True`` when the repo call completed, ``False`` when it raised.
    """
    try:
        await repo.set_identity_campaign_id(identity_uuid, campaign_uuid)
    except Exception:  # noqa: BLE001
        log.exception(
            "campaign clusterer: failed to link identity=%s -> campaign=%s",
            identity_uuid, campaign_uuid,
        )
        return False
    else:
        return True
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ConnectedComponentsCampaignClusterer",
|
||||||
|
"cluster_identities",
|
||||||
|
"from_identity_row",
|
||||||
|
]
|
||||||
441
decnet/clustering/campaign/impl/similarity.py
Normal file
441
decnet/clustering/campaign/impl/similarity.py
Normal file
@@ -0,0 +1,441 @@
|
|||||||
|
"""Similarity-graph primitives for the campaign clusterer.
|
||||||
|
|
||||||
|
The campaign clusterer reads ``AttackerIdentity`` rows (the layer below)
|
||||||
|
and groups them into operations. The graph it builds is **not** the
|
||||||
|
identity-level graph: identity-level signals don't translate 1:1, and
|
||||||
|
some that get vetoed at identity level (shared infra) are the *primary
|
||||||
|
positive signal* at campaign level.
|
||||||
|
|
||||||
|
Mirror of ``decnet.clustering.impl.similarity`` for the
|
||||||
|
identity layer; see that module for the four-tier identity taxonomy.
|
||||||
|
|
||||||
|
**Time-agnostic.** Same F7 invariant as the identity layer — edges
|
||||||
|
MUST depend only on *pairwise relative* offsets, never on absolute
|
||||||
|
clocks. Shift two identities' session windows by the same Δ and the
|
||||||
|
edge weights MUST be identical. The temporal-overlap edge below uses
|
||||||
|
this invariant explicitly.
|
||||||
|
|
||||||
|
**Edge families** (from ``development/CAMPAIGN_CLUSTERING.md``):
|
||||||
|
|
||||||
|
* **Phase-handoff** — A ends in ``COMMAND_AND_CONTROL`` / ``PERSISTENCE``
|
||||||
|
on decky D, B begins ``DISCOVERY`` / ``LATERAL_MOVEMENT`` on D
|
||||||
|
within window W. Load-bearing for fixture F5 (multi_operator) — the
|
||||||
|
signal the identity-side fingerprint-disagreement veto deliberately
|
||||||
|
doesn't try to be.
|
||||||
|
* **Shared-infra** — Jaccard over aggregated payload-hashes /
|
||||||
|
C2-endpoints / decky-set across the identities' member observations.
|
||||||
|
Vetoed at identity level (``ed32358``); primary positive signal here.
|
||||||
|
* **Temporal overlap** — sessions inside a bounded *relative* window.
|
||||||
|
Campaigns are operations and operations have bounded duration;
|
||||||
|
overlap of distinct identities on shared infra is the canonical
|
||||||
|
co-op pattern.
|
||||||
|
* **Cohort** — ASN-cohort + tooling-cohort weak signals. Defeated alone
|
||||||
|
(per F2); useful as supporting weight only.
|
||||||
|
|
||||||
|
The functions are pure (no DB, no I/O); the worker maps identities into
|
||||||
|
:class:`IdentityFeatures` once per tick and feeds these into the graph
|
||||||
|
builder in a sibling module.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Mapping, Optional
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Identity-level projection ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class IdentityFeatures:
    """The slice of an :class:`AttackerIdentity` the campaign graph needs.

    The worker builds one per identity (tests build one per fixture
    identity through :func:`from_synthetic_identity`). The projection
    is kept deliberately small so that schema drift on the identity
    layer does not leak into the campaign-graph code.
    """

    #: Stable ID — in production this is ``AttackerIdentity.uuid``.
    identity_uuid: str

    #: Every ASN seen across the identity's member observations. One
    #: rotating actor (F2) shows up in many ASNs; the cohort signal is
    #: the overlap between two such *sets*.
    asn_cohort: frozenset[int] = field(default_factory=frozenset)

    #: Tooling labels (e.g. ``"hydra"``, ``"hping"``) inferred from
    #: fingerprints / commands. Empty until tooling-attribution lands.
    tooling_cohort: frozenset[str] = field(default_factory=frozenset)

    #: Payload hashes aggregated across member observations.
    payload_hashes: frozenset[str] = field(default_factory=frozenset)

    #: C2 endpoints aggregated across member observations.
    c2_endpoints: frozenset[str] = field(default_factory=frozenset)

    #: IDs of the deckies this identity touched, aggregated.
    decky_set: frozenset[str] = field(default_factory=frozenset)

    #: ``(decky_id, UKCPhase.value)`` → ordered command sequence seen
    #: on that decky in that phase. The phase-handoff edge needs it,
    #: with the decky as join key. Empty when ``commands_by_phase`` is
    #: unavailable on the production-row adapter (deferred per TODO.md
    #: until log-mining lands).
    commands_by_phase_on_decky: Mapping[
        tuple[str, str], tuple[str, ...]
    ] = field(default_factory=dict)

    #: One ``(start_ts, end_ts)`` pair per session, in seconds since
    #: epoch. Used ONLY for pairwise relative deltas and never compared
    #: to an absolute clock: the F7 (slow_burn) invariance check
    #: verifies that adding the same Δ to every entry on both sides
    #: leaves the edge weight unchanged.
    session_windows: tuple[tuple[float, float], ...] = ()

    #: ``decky_id`` → last UKC phase observed on that decky (the
    #: "from" side of a phase handoff).
    last_phase_per_decky: Mapping[str, str] = field(default_factory=dict)

    #: ``decky_id`` → first UKC phase observed on that decky (the
    #: "to" side of a phase handoff).
    first_phase_per_decky: Mapping[str, str] = field(default_factory=dict)

    #: ``decky_id`` → timestamp of the last activity on that decky.
    #: Combined with :attr:`first_seen_per_decky` of the other identity
    #: to get a pairwise handoff gap (no absolute clock involved).
    last_seen_per_decky: Mapping[str, float] = field(default_factory=dict)

    #: ``decky_id`` → timestamp of the first activity on that decky.
    first_seen_per_decky: Mapping[str, float] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Phase-handoff edge ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
#: *Handoff-out* phases: operator A has finished setting up a foothold
#: and the next operator can step in. Taken from the STAGE_IN tail
#: (PERSISTENCE / COMMAND_AND_CONTROL) of the UKC vocabulary. Growing
#: this set is a tunable knob.
HANDOFF_OUT_PHASES: frozenset[str] = frozenset(
    ("command_and_control", "persistence"),
)

#: *Handoff-in* phases: operator B picks up a prepared foothold and
#: starts operating through the network. This is the STAGE_THROUGH
#: head (DISCOVERY / LATERAL_MOVEMENT).
HANDOFF_IN_PHASES: frozenset[str] = frozenset(
    ("discovery", "lateral_movement"),
)

#: Default handoff window, in seconds: the "B starts within W of A's
#: end" guard. The bound is relative to the pair, so the fixture F7
#: invariant still holds — shifting both timestamps keeps the gap.
DEFAULT_HANDOFF_WINDOW_S: float = 24 * 60 * 60.0  # 24h
|
||||||
|
|
||||||
|
|
||||||
|
def phase_handoff_weight(
    a: IdentityFeatures,
    b: IdentityFeatures,
    window_s: float = DEFAULT_HANDOFF_WINDOW_S,
) -> float:
    """Score the phase-handoff edge between two identities (F5 signal).

    The result is ``1.0`` when some decky D satisfies either direction:

    * A → B: A's last phase on D is in :data:`HANDOFF_OUT_PHASES`, B's
      first phase on D is in :data:`HANDOFF_IN_PHASES`, and B's first
      activity on D falls within ``window_s`` AFTER A's last activity
      on D; or
    * B → A: the same with the roles swapped.

    Otherwise the result is ``0.0``. The window test is applied to the
    *gap* between the two timestamps (one subtraction), which is
    pairwise-relative, so F7 invariance holds.
    """
    a_to_b = _directed_handoff(a, b, window_s)
    b_to_a = _directed_handoff(b, a, window_s)
    return a_to_b if a_to_b >= b_to_a else b_to_a
|
||||||
|
|
||||||
|
|
||||||
|
def _directed_handoff(
    out: IdentityFeatures, in_: IdentityFeatures, window_s: float,
) -> float:
    """Score a one-way handoff from ``out`` to ``in_``.

    Returns ``1.0`` as soon as one decky is found where ``out`` ended
    in a handoff-out phase, ``in_`` began in a handoff-in phase, both
    timestamps are known, and ``in_`` started between ``0`` and
    ``window_s`` seconds (inclusive) after ``out`` finished. Returns
    ``0.0`` when no decky qualifies.
    """
    arrivals = in_.first_phase_per_decky
    for decky, leaving_phase in out.last_phase_per_decky.items():
        # Only deckies present on both sides can carry a handoff.
        if decky not in arrivals:
            continue
        if leaving_phase not in HANDOFF_OUT_PHASES:
            continue
        if arrivals.get(decky) not in HANDOFF_IN_PHASES:
            continue

        left_at = out.last_seen_per_decky.get(decky)
        arrived_at = in_.first_seen_per_decky.get(decky)
        if left_at is None or arrived_at is None:
            continue

        # A negative gap means in_ was already active before out left.
        if 0.0 <= arrived_at - left_at <= window_s:
            return 1.0
    return 0.0
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Shared-infra edge ──────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def shared_infra_weight(a: IdentityFeatures, b: IdentityFeatures) -> float:
    """Jaccard similarity over payload hashes ∪ C2 endpoints.

    Each identity contributes one set: its payload hashes together
    with its C2 endpoints. The score is the size of the intersection of
    the two sets divided by the size of their union, or ``0.0`` when
    both identities have nothing to contribute.

    ``decky_set`` is left out on purpose. Decky overlap is a *fleet
    scarcity* artifact — with a small fleet many distinct campaigns hit
    the same deckies — and would fuse F1's two unrelated campaigns on
    shared targeting. Payload hashes and C2 endpoints are operational
    artifacts that distinct campaigns rarely share. Decky overlap is
    scored in :func:`cohort_weight` instead, where the weak-tier
    multiplier prevents F1-style false merges.

    At identity level this signal is vetoed by the
    fingerprint-disagreement rule (``ed32358``); at campaign level it
    is the *primary* positive signal, since distinct identities sharing
    payload + C2 is the canonical co-op pattern (F5 multi_operator).
    """
    left = a.payload_hashes | a.c2_endpoints
    right = b.payload_hashes | b.c2_endpoints
    everything = left | right
    if not everything:
        # Both sides empty: there is no evidence either way.
        return 0.0
    return len(left & right) / len(everything)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Temporal-overlap edge ──────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def temporal_overlap_weight(
    a: IdentityFeatures, b: IdentityFeatures,
) -> float:
    """Fraction of A's session time that coincides with B's sessions.

    Every A window is intersected with every B window; the summed
    intersection length is divided by A's total session length and
    capped at ``1.0``. The score is ``0.0`` when either identity has no
    session windows or when A's total session length is not positive.
    Note that the denominator is A's time only, so swapping the
    arguments can give a different value.

    Only differences between timestamps are used, so shifting every
    timestamp on both sides by the same Δ leaves the score unchanged
    (the F7 fixture's invariant).

    Two non-cooperating actors with bounded operations rarely overlap
    by chance, while co-op campaigns overlap heavily. On its own the
    signal is weak (one overlapping minute means little); combined
    with shared-infra or handoff it pulls a pair over threshold.
    """
    windows_a = a.session_windows
    windows_b = b.session_windows
    if not (windows_a and windows_b):
        return 0.0

    total_a = sum(stop - begin for begin, stop in windows_a)
    if total_a <= 0:
        return 0.0

    shared = 0.0
    for begin_a, stop_a in windows_a:
        for begin_b, stop_b in windows_b:
            begin = max(begin_a, begin_b)
            stop = min(stop_a, stop_b)
            if stop <= begin:
                # Disjoint (or merely touching) windows add nothing.
                continue
            shared += stop - begin
    return min(1.0, shared / total_a)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Cohort edges ───────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def cohort_weight(a: IdentityFeatures, b: IdentityFeatures) -> float:
    """Weak signal: Jaccard over ASN cohort, tooling cohort and deckies.

    Each identity's ASNs, tooling labels and decky IDs are tagged by
    family and pooled into one set; the score is the intersection size
    over the union size, or ``0.0`` when both pools are empty.

    F2's failure mode (one identity rotating across many ASNs) does not
    apply at *campaign* level, but several identities cooperating out
    of the same hosting cohort is plausible co-op evidence. Decky
    overlap is scored here rather than in :func:`shared_infra_weight`
    because decky scarcity in a small honeypot fleet would otherwise
    fuse unrelated campaigns hitting the same SSH targets (F1
    shared_wordlist).

    Weak by design: the tier multiplier applied in the combined weight
    keeps this signal from crossing threshold alone.
    """

    def _pool(feat: IdentityFeatures) -> frozenset:
        # The family tag keeps e.g. ASN 1 and a tool named "1" apart.
        tagged: set = set()
        tagged.update(("asn", str(asn)) for asn in feat.asn_cohort)
        tagged.update(("tool", tool) for tool in feat.tooling_cohort)
        tagged.update(("decky", decky) for decky in feat.decky_set)
        return frozenset(tagged)

    pool_a = _pool(a)
    pool_b = _pool(b)
    combined = pool_a | pool_b
    if not combined:
        return 0.0
    return len(pool_a & pool_b) / len(combined)
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Combined campaign-level weight ─────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
#: Per-tier multipliers for the campaign graph. The values are tuned so
#: that:
#:
#: * Phase-handoff alone (max 1.0) crosses threshold — a clean
#:   F5-style handoff is enough evidence by itself.
#: * Shared-infra alone (max 1.0) crosses threshold — payload+C2
#:   overlap is the canonical co-op signal (the intended pass
#:   condition for F5 multi_operator; decky overlap was deliberately
#:   moved into :func:`cohort_weight` to avoid F1's false merge on
#:   shared targeting).
#: * Temporal overlap alone (max 1.0) yields 0.4 — supporting weight.
#: * Cohort alone (max 1.0) yields 0.1 — which defeats both F1's
#:   shared-decky failure mode and F2's rotating-ASN one.
#:
#: F1 shared_wordlist: payload+C2 is empty on both sides, so
#: shared_infra is 0; ASN+decky overlap fires cohort, but at 0.1 it
#: stays well below threshold. F2 vpn_hopping is folded by the identity
#: layer first, so the campaign clusterer sees one identity → one
#: campaign.
CAMPAIGN_TIER_WEIGHTS: dict[str, float] = dict(
    phase_handoff=1.0,
    shared_infra=1.0,
    temporal_overlap=0.4,
    cohort=0.1,
)

#: Minimum combined campaign-edge weight for an edge to survive into
#: the similarity graph.
CAMPAIGN_EDGE_THRESHOLD: float = 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def combined_campaign_weight(
    a: IdentityFeatures,
    b: IdentityFeatures,
    *,
    handoff_window_s: float = DEFAULT_HANDOFF_WINDOW_S,
) -> float:
    """Return the weighted sum of the four campaign tier scores for a pair.

    Each tier score (phase handoff, shared infra, temporal overlap,
    cohort) is multiplied by its entry in :data:`CAMPAIGN_TIER_WEIGHTS`
    and the four products are added together.

    The campaign-clusterer worker compares the result against
    :data:`CAMPAIGN_EDGE_THRESHOLD` to decide whether to draw an edge.
    Pure / time-agnostic — F7 invariant preserved.

    Args:
        a: Features of the first identity.
        b: Features of the second identity.
        handoff_window_s: Window forwarded to :func:`phase_handoff_weight`.

    Returns:
        The combined edge weight for ``(a, b)``.
    """
    weights = CAMPAIGN_TIER_WEIGHTS
    # Terms are computed in the same order they are summed so the
    # floating-point result does not depend on evaluation order.
    handoff_term = weights["phase_handoff"] * phase_handoff_weight(
        a, b, handoff_window_s,
    )
    infra_term = weights["shared_infra"] * shared_infra_weight(a, b)
    overlap_term = weights["temporal_overlap"] * temporal_overlap_weight(a, b)
    cohort_term = weights["cohort"] * cohort_weight(a, b)
    return handoff_term + infra_term + overlap_term + cohort_term
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Adapter for synthetic-fixture tests ────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def from_synthetic_identity(att, identity_uuid: Optional[str] = None) -> IdentityFeatures:  # type: ignore[no-untyped-def]
    """Project a ``SyntheticAttacker`` onto an :class:`IdentityFeatures`.

    One ``SyntheticAttacker`` is treated as one identity, which is
    adequate for fixture validation where the campaign clusterer reads
    identities rather than raw observations. The worker's production-row
    adapter (commit 3) builds the same shape from real
    ``AttackerIdentity`` rows + their member observations.

    The adapter lives here so test code doesn't import the factory shape
    into the production module — it is a documented integration point.

    Args:
        att: The synthetic attacker; ``asn``, ``sessions`` and
            ``attacker_id`` are read from it.
        identity_uuid: Identity id to stamp on the result; falls back to
            ``att.attacker_id`` when falsy.

    Returns:
        The populated :class:`IdentityFeatures` (``tooling_cohort`` is
        always empty on this path).
    """
    hashes: set[str] = set()
    callbacks: set[str] = set()
    deckies: set[str] = set()
    asns: set[int] = set()
    if att.asn is not None:
        asns.add(att.asn)

    commands: dict[tuple[str, str], list[str]] = {}
    last_phase: dict[str, str] = {}
    first_phase: dict[str, str] = {}
    last_seen: dict[str, float] = {}
    first_seen: dict[str, float] = {}
    windows: list[tuple[float, float]] = []

    # SyntheticSession order is the campaign DSL's emission order, which
    # is monotonically time-ordered by construction. First/last phase
    # per decky is derived from that ordering.
    for session in att.sessions:
        if session.payload_hash:
            hashes.add(session.payload_hash)
        if session.c2_callback:
            callbacks.add(session.c2_callback)

        decky = getattr(session, "decky", None) or getattr(session, "decky_id", None)
        if decky:
            deckies.add(decky)

        # SyntheticSession exposes ``started_at`` (datetime) +
        # ``duration_s``; the production-row adapter (commit 3) gets
        # ``start_ts``/``end_ts`` directly. Both shapes are accepted.
        started_at = getattr(session, "started_at", None)
        duration_s = getattr(session, "duration_s", None)
        if started_at is None:
            ts_start = getattr(session, "start_ts", None)
            ts_end = getattr(session, "end_ts", None)
        else:
            ts_start = started_at.timestamp()
            ts_end = ts_start + (float(duration_s) if duration_s else 0.0)
        if ts_start is not None and ts_end is not None:
            windows.append((float(ts_start), float(ts_end)))

        phase_value = session.phase.value if hasattr(session, "phase") else None
        if not (decky and phase_value):
            # Per-decky bookkeeping needs both a decky and a phase.
            continue

        if session.commands:
            commands.setdefault((decky, phase_value), []).extend(session.commands)

        if decky not in first_phase:
            first_phase[decky] = phase_value
            if ts_start is not None:
                first_seen[decky] = float(ts_start)

        last_phase[decky] = phase_value
        # Prefer the session end; fall back to its start when no end is known.
        closing_ts = ts_end if ts_end is not None else ts_start
        if closing_ts is not None:
            last_seen[decky] = float(closing_ts)

    frozen_commands = {key: tuple(cmds) for key, cmds in commands.items()}
    return IdentityFeatures(
        identity_uuid=identity_uuid or att.attacker_id,
        asn_cohort=frozenset(asns),
        tooling_cohort=frozenset(),
        payload_hashes=frozenset(hashes),
        c2_endpoints=frozenset(callbacks),
        decky_set=frozenset(deckies),
        commands_by_phase_on_decky=frozen_commands,
        session_windows=tuple(windows),
        last_phase_per_decky=dict(last_phase),
        first_phase_per_decky=dict(first_phase),
        last_seen_per_decky=dict(last_seen),
        first_seen_per_decky=dict(first_seen),
    )
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"IdentityFeatures",
|
||||||
|
"phase_handoff_weight",
|
||||||
|
"shared_infra_weight",
|
||||||
|
"temporal_overlap_weight",
|
||||||
|
"cohort_weight",
|
||||||
|
"combined_campaign_weight",
|
||||||
|
"from_synthetic_identity",
|
||||||
|
"HANDOFF_OUT_PHASES",
|
||||||
|
"HANDOFF_IN_PHASES",
|
||||||
|
"DEFAULT_HANDOFF_WINDOW_S",
|
||||||
|
"CAMPAIGN_TIER_WEIGHTS",
|
||||||
|
"CAMPAIGN_EDGE_THRESHOLD",
|
||||||
|
]
|
||||||
191
decnet/clustering/campaign/worker.py
Normal file
191
decnet/clustering/campaign/worker.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
"""Long-running campaign-clusterer worker.
|
||||||
|
|
||||||
|
Mirrors :mod:`decnet.clustering.worker` for the layer above. Bus-woken
|
||||||
|
on ``identity.>`` (not ``attacker.>`` — the campaign clusterer reads
|
||||||
|
identities, not raw observations); falls back to a 60s slow-tick poll
|
||||||
|
when the bus is unavailable.
|
||||||
|
|
||||||
|
Publishes the four ``campaign.*`` events plus the cross-family
|
||||||
|
``identity.campaign.assigned`` so existing identity-stream subscribers
|
||||||
|
see campaign-id changes without subscribing to ``campaign.>``.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from decnet.bus import topics as _topics
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.bus.publish import (
|
||||||
|
publish_safely,
|
||||||
|
run_control_listener_signal as _run_control_listener_signal,
|
||||||
|
run_health_heartbeat as _run_health_heartbeat,
|
||||||
|
)
|
||||||
|
from decnet.clustering.campaign.base import (
|
||||||
|
CampaignClusterer,
|
||||||
|
CampaignClusterResult,
|
||||||
|
)
|
||||||
|
from decnet.clustering.campaign.factory import get_campaign_clusterer
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("clustering.campaign.worker")
|
||||||
|
|
||||||
|
_DEFAULT_POLL_SECS = 60.0
|
||||||
|
_WORKER_NAME = "campaign-clusterer"
|
||||||
|
|
||||||
|
|
||||||
|
async def run_campaign_clusterer_loop(
    repo: BaseRepository,
    *,
    poll_interval_secs: float = _DEFAULT_POLL_SECS,
    clusterer: Optional[CampaignClusterer] = None,
    shutdown: Optional[asyncio.Event] = None,
) -> None:
    """Run the campaign clusterer until cancelled or ``shutdown`` is set.

    Each iteration runs one ``clusterer.tick(repo)``, publishes the
    result via :func:`_publish_result`, then sleeps until a bus wake-up,
    a shutdown request, or ``poll_interval_secs`` elapses — whichever
    comes first.

    Args:
        repo: Repository handed to every ``clusterer.tick`` call.
        poll_interval_secs: Upper bound on the sleep between ticks.
        clusterer: Implementation to drive; defaults to
            :func:`get_campaign_clusterer`.
        shutdown: Optional event that requests a graceful stop. Setting
            it wakes the loop immediately instead of waiting out the
            current poll interval.

    A failing tick is logged and treated as an empty result.
    ``asyncio.CancelledError`` / ``KeyboardInterrupt`` raised inside the
    loop are logged and swallowed, so the coroutine returns normally.
    Background tasks and the bus connection are always torn down on exit.
    """
    if clusterer is None:
        clusterer = get_campaign_clusterer()
    log.info(
        "campaign-clusterer started impl=%s poll_interval_secs=%s",
        clusterer.name, poll_interval_secs,
    )

    bus: Optional[BaseBus] = None
    wake = asyncio.Event()
    wake_tasks: list[asyncio.Task] = []
    heartbeat_task: Optional[asyncio.Task] = None
    try:
        candidate = get_bus(client_name=_WORKER_NAME)
        await candidate.connect()
        # Only publish through the bus once the connection succeeded.
        bus = candidate
        # Wake on any identity-layer event — formed / linked / merged /
        # unmerged all change the input set the campaign clusterer
        # operates over.
        wake_tasks.append(asyncio.create_task(
            _wake_on(bus, wake, f"{_topics.IDENTITY}.>"),
        ))
        heartbeat_task = asyncio.create_task(
            _run_health_heartbeat(bus, _WORKER_NAME),
        )
        wake_tasks.append(asyncio.create_task(
            _run_control_listener_signal(bus, _WORKER_NAME),
        ))
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "campaign-clusterer: bus unavailable, running in poll-only "
            "mode: %s", exc,
        )

    if shutdown is None:
        shutdown = asyncio.Event()

    async def _wake_on_shutdown(stop: asyncio.Event) -> None:
        # Without this, a shutdown request would go unnoticed until the
        # current poll sleep timed out (up to ``poll_interval_secs``).
        await stop.wait()
        wake.set()

    # Tracked in ``wake_tasks`` so the ``finally`` block cancels and reaps it.
    wake_tasks.append(asyncio.create_task(_wake_on_shutdown(shutdown)))

    try:
        while not shutdown.is_set():
            try:
                result = await clusterer.tick(repo)
            except Exception:  # noqa: BLE001
                # One bad tick must not kill the worker; publish nothing.
                log.exception("campaign-clusterer: tick failed")
                result = CampaignClusterResult()

            await _publish_result(bus, result)

            try:
                await asyncio.wait_for(
                    wake.wait(), timeout=float(poll_interval_secs),
                )
            except asyncio.TimeoutError:
                # Slow-tick fallback: no wake-up arrived within the interval.
                pass
            wake.clear()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("campaign-clusterer stopped")
    finally:
        for t in wake_tasks:
            t.cancel()
        if heartbeat_task is not None:
            heartbeat_task.cancel()
        # Await every cancelled task so none is left pending at exit.
        for t in (*wake_tasks, heartbeat_task):
            if t is None:
                continue
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await t
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def _publish_result(
    bus: Optional[BaseBus], result: CampaignClusterResult,
) -> None:
    """Fan a ``CampaignClusterResult`` out to the bus.

    Publishes one ``campaign.*`` event per formed / assigned / merged /
    unmerged entry, plus a cross-family ``identity.campaign.assigned``
    event for every identity that gained a campaign.
    """

    async def announce_assignment(
        identity_uuid, campaign_uuid, prior_campaign_uuid,
    ) -> None:
        # Cross-family notification on the identity topic family.
        await publish_safely(
            bus,
            _topics.identity(_topics.IDENTITY_CAMPAIGN_ASSIGNED),
            {
                "identity_uuid": identity_uuid,
                "campaign_uuid": campaign_uuid,
                "prior_campaign_uuid": prior_campaign_uuid,
            },
            event_type=_topics.IDENTITY_CAMPAIGN_ASSIGNED,
        )

    for formed in result.campaigns_formed:
        await publish_safely(
            bus,
            _topics.campaign(_topics.CAMPAIGN_FORMED),
            formed,
            event_type=_topics.CAMPAIGN_FORMED,
        )
        # Also fire identity.campaign.assigned per identity so the
        # existing identity SSE stream sees the badge update. A freshly
        # formed campaign has no prior campaign to report.
        for identity_uuid in formed.get("identity_uuids", []):
            await announce_assignment(
                identity_uuid, formed["campaign_uuid"], None,
            )

    for assigned in result.identities_assigned:
        await publish_safely(
            bus,
            _topics.campaign(_topics.CAMPAIGN_IDENTITY_ASSIGNED),
            assigned,
            event_type=_topics.CAMPAIGN_IDENTITY_ASSIGNED,
        )
        await announce_assignment(
            assigned["identity_uuid"],
            assigned["campaign_uuid"],
            assigned.get("prior_campaign_uuid"),
        )

    # Merges and un-merges share the same publish shape.
    for payloads, event_name in (
        (result.campaigns_merged, _topics.CAMPAIGN_MERGED),
        (result.campaigns_unmerged, _topics.CAMPAIGN_UNMERGED),
    ):
        for payload in payloads:
            await publish_safely(
                bus,
                _topics.campaign(event_name),
                payload,
                event_type=event_name,
            )
|
||||||
|
|
||||||
|
|
||||||
|
async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
|
||||||
|
try:
|
||||||
|
sub = bus.subscribe(pattern)
|
||||||
|
async with sub:
|
||||||
|
async for _event in sub:
|
||||||
|
wake.set()
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except Exception as exc: # noqa: BLE001
|
||||||
|
log.warning(
|
||||||
|
"campaign-clusterer: subscriber for %s died (%s); falling back "
|
||||||
|
"to poll", pattern, exc,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["run_campaign_clusterer_loop"]
|
||||||
46
decnet/clustering/factory.py
Normal file
46
decnet/clustering/factory.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""Clusterer factory.
|
||||||
|
|
||||||
|
Returns the active :class:`~decnet.clustering.base.Clusterer` instance.
|
||||||
|
Mirrors :mod:`decnet.bus.factory` and :mod:`decnet.web.db.factory`:
|
||||||
|
callers obtain the clusterer via :func:`get_clusterer` rather than
|
||||||
|
importing a concrete impl directly.
|
||||||
|
|
||||||
|
Configuration knobs (env-overridable):
|
||||||
|
|
||||||
|
* ``DECNET_CLUSTERER_TYPE`` — which implementation to use. Default
|
||||||
|
``"connected_components"``. Unknown values raise :class:`ValueError`
|
||||||
|
so a typo in ``decnet.ini`` surfaces immediately rather than silently
|
||||||
|
falling back.
|
||||||
|
|
||||||
|
The ``connected_components`` implementation is the v1 production
|
||||||
|
clusterer. Other implementations (e.g. an HDBSCAN variant) can land
|
||||||
|
here later without churning callers.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
|
from decnet.clustering.base import Clusterer
|
||||||
|
|
||||||
|
_KNOWN_CLUSTERERS = ("connected_components",)
|
||||||
|
_DEFAULT_CLUSTERER = "connected_components"
|
||||||
|
|
||||||
|
|
||||||
|
def get_clusterer() -> Clusterer:
    """Return the clusterer selected by ``DECNET_CLUSTERER_TYPE``.

    The value is stripped and lower-cased before matching; when the
    variable is unset the default clusterer name is used.

    The concrete implementation is imported lazily so the base module
    stays free of implementation-specific dependencies.

    Raises:
        ValueError: If the configured name is not a known clusterer.
    """
    requested = os.environ.get("DECNET_CLUSTERER_TYPE", _DEFAULT_CLUSTERER)
    name = requested.strip().lower()
    if name != "connected_components":
        raise ValueError(
            f"Unknown clusterer: {name!r}. Known: {_KNOWN_CLUSTERERS}"
        )

    from decnet.clustering.impl.connected_components import (
        ConnectedComponentsClusterer,
    )

    return ConnectedComponentsClusterer()
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["get_clusterer"]
|
||||||
6
decnet/clustering/impl/__init__.py
Normal file
6
decnet/clustering/impl/__init__.py
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
"""Concrete clusterer implementations.
|
||||||
|
|
||||||
|
Each module here contains exactly one :class:`~decnet.clustering.base.Clusterer`
|
||||||
|
subclass. New implementations register themselves in
|
||||||
|
:func:`decnet.clustering.factory.get_clusterer`.
|
||||||
|
"""
|
||||||
379
decnet/clustering/impl/connected_components.py
Normal file
379
decnet/clustering/impl/connected_components.py
Normal file
@@ -0,0 +1,379 @@
|
|||||||
|
"""Connected-components identity clusterer (v1).
|
||||||
|
|
||||||
|
Builds a similarity graph over observations (per-IP attacker rows),
|
||||||
|
runs union-find over edges that pass a confidence threshold, and writes
|
||||||
|
one ``attacker_identities`` row per component.
|
||||||
|
|
||||||
|
**v1 signal coverage (this commit):**
|
||||||
|
|
||||||
|
* High-weight tier: JA3 / HASSH / payload-hash / C2-endpoint exact
|
||||||
|
match (alone enough to cluster). The production tick currently sees
|
||||||
|
JA3 + HASSH only — payload + C2 require log mining and join in
|
||||||
|
later commits. The fixture tests exercise the full high-weight set
|
||||||
|
through the in-memory path.
|
||||||
|
|
||||||
|
Subsequent commits add medium / low / very-low tier edges, phase-
|
||||||
|
handoff edges, and revocable merges. Edges MUST stay time-agnostic
|
||||||
|
— fixture 7 forbids recency-decay clustering.
|
||||||
|
|
||||||
|
**v1 behavior:**
|
||||||
|
|
||||||
|
The clusterer assigns identities to NULL observations, merges existing
|
||||||
|
identities when a single predicted component spans them, and revokes
|
||||||
|
prior merges when the predicted component splits a merged-out identity
|
||||||
|
away from its winner. Observations stay FK'd to their original identity
|
||||||
|
row throughout — merges are soft pointers via
|
||||||
|
``attacker_identities.merged_into_uuid``, never observation re-points.
|
||||||
|
That keeps the audit trail intact and lets cached subscribers resolve
|
||||||
|
merged-out UUIDs through the chain.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import uuid as _uuid
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from typing import Any, Iterable, Optional
|
||||||
|
|
||||||
|
from decnet.clustering.base import Clusterer, ClusterResult
|
||||||
|
from decnet.clustering.impl.similarity import (
|
||||||
|
EDGE_THRESHOLD,
|
||||||
|
Observation,
|
||||||
|
combined_edge_weight,
|
||||||
|
)
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.profiler.identity_rollup import extract_fp_summaries
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("clustering.connected_components")
|
||||||
|
|
||||||
|
|
||||||
|
def cluster_observations(
    observations: Iterable[Observation],
) -> dict[str, str]:
    """Label observations by connected component of the similarity graph.

    Two observations are joined when ``combined_edge_weight`` for the
    pair is at least ``EDGE_THRESHOLD``; components are the transitive
    closure of those joins (union-find).

    Pure: no DB, no clock, no I/O. Both the fixture-validation tests and
    the production ``tick`` consume this, and the mapping is a
    deterministic function of the input set + edge function.

    An observation with no qualifying edge still receives its own
    cluster id, so callers can tell "isolated observation" apart from
    "merged into nothing."

    Returns:
        ``{observation_id: cluster_id}``. Cluster ids are opaque strings
        — callers must not rely on their format.
    """
    nodes = list(observations)

    # Every observation starts out as the root of its own component.
    parent: dict[str, str] = {}
    for node in nodes:
        parent[node.observation_id] = node.observation_id

    def find(node_id: str) -> str:
        # First walk up to the root, then re-point the visited path at it.
        root = node_id
        while parent[root] != root:
            root = parent[root]
        while parent[node_id] != root:
            next_id = parent[node_id]
            parent[node_id] = root
            node_id = next_id
        return root

    # Compare every unordered pair exactly once.
    for offset, left in enumerate(nodes, start=1):
        for right in nodes[offset:]:
            if combined_edge_weight(left, right) >= EDGE_THRESHOLD:
                left_root = find(left.observation_id)
                right_root = find(right.observation_id)
                if left_root != right_root:
                    parent[left_root] = right_root

    # The component root doubles as the cluster id, so two runs over the
    # same input produce the same labels (handy for assertions).
    labels: dict[str, str] = {}
    for node in nodes:
        labels[node.observation_id] = f"cc-{find(node.observation_id)}"
    return labels
|
||||||
|
|
||||||
|
|
||||||
|
def from_attacker_row(row: dict[str, Any]) -> Observation:
    """Project an ``Attacker`` row dict into an :class:`Observation`.

    Pulls JA3 / HASSH out of the ``Attacker.fingerprints`` JSON list
    (one entry per fingerprint event the prober collected). Multiple
    JA3s on a single observation are flattened to a single value —
    the last one in the list — because :class:`Observation` is a
    single-row projection; an observation that exhibits two distinct
    JA3s across its lifetime is a wire-level oddity that the clusterer
    treats by keeping the latest. The identity row itself can store the
    full list across observations.

    Payload + C2 + commands are left empty — log mining lands in
    later commits. The function shape doesn't change when they do.

    Args:
        row: Attacker row; ``uuid`` is required, ``fingerprints`` (JSON
            string or iterable of entries) and ``asn`` are optional.

    Returns:
        An :class:`Observation` carrying the row's uuid, last-seen JA3 /
        HASSH (or ``None``), and ASN.

    Raises:
        KeyError: If ``row`` has no ``uuid``.

    A missing, unparsable, or non-array ``fingerprints`` value is treated
    as "no fingerprints" rather than failing the projection.
    """
    raw = row.get("fingerprints") or "[]"
    try:
        entries = json.loads(raw) if isinstance(raw, str) else list(raw)
    except (TypeError, ValueError):
        entries = []
    if not isinstance(entries, list):
        # Valid JSON that is not an array (``null``, a number, an object)
        # would otherwise blow up — or iterate nonsense — in the loop below.
        entries = []

    ja3: Optional[str] = None
    hassh: Optional[str] = None
    for entry in entries:
        if not isinstance(entry, dict):
            continue
        kind = entry.get("kind")
        # Accept either key spelling for the fingerprint value.
        h = entry.get("hash") or entry.get("value")
        if not h:
            continue
        # Later entries overwrite earlier ones: last value wins.
        if kind == "ja3":
            ja3 = h
        elif kind == "hassh":
            hassh = h

    return Observation(
        observation_id=row["uuid"],
        ja3=ja3,
        hassh=hassh,
        asn=row.get("asn"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ConnectedComponentsClusterer(Clusterer):
    """Connected-components clusterer over the similarity graph.

    See module docstring for v1 signal coverage and behavior notes.
    """

    name = "connected_components"

    async def tick(self, repo: BaseRepository) -> ClusterResult:
        """Run one clustering pass and reconcile it against stored identities.

        Reads every attacker row and identity row from ``repo``, clusters
        the rows with :func:`cluster_observations`, then:

        * Pass 1 — for each predicted component, either mints a new
          identity (no member has one yet) or picks the smallest
          effective identity uuid as winner, soft-merges the others into
          it, and links still-unassigned observations to the winner.
        * Pass 2 — for each identity that was already merged out when the
          tick started, clears the merge if none of its observations
          share a predicted cluster with the winner's observations.

        Returns:
            A :class:`ClusterResult` listing identities formed, merged and
            unmerged plus observations linked during this tick. An empty
            result is returned when there are no rows or a read fails.

        Repository errors are logged and skipped per operation; they never
        propagate out of the tick.
        """
        try:
            rows = await repo.list_attackers_for_clustering()
        except Exception:  # noqa: BLE001
            log.exception("clusterer: failed to read attackers")
            return ClusterResult()

        if not rows:
            return ClusterResult()

        # Build the merge chain so a row's "effective" identity follows
        # merged_into_uuid up to the canonical winner. Pre-computing it
        # lets us reason about post-merge identity membership in one
        # place. ``identity_chain[u]`` is the canonical winner for
        # identity ``u`` (or ``u`` itself if not merged out).
        try:
            all_identities = await repo.list_all_identities()
        except Exception:  # noqa: BLE001
            log.exception("clusterer: failed to read identities")
            return ClusterResult()
        identity_chain = _build_merge_chain(all_identities)

        # Project + cluster.
        observations: list[Observation] = []
        row_by_id: dict[str, dict[str, Any]] = {}
        for r in rows:
            obs = from_attacker_row(r)
            observations.append(obs)
            row_by_id[obs.observation_id] = r
        labels = cluster_observations(observations)

        # Group observations by predicted cluster.
        components: dict[str, list[str]] = {}
        for obs_id, cluster_id in labels.items():
            components.setdefault(cluster_id, []).append(obs_id)

        result = ClusterResult()
        # Single timestamp shared by every identity row minted in this tick.
        now = datetime.now(timezone.utc)

        # Pass 1 — per-component reconciliation: form, link, merge.
        for member_ids in components.values():
            # Identity ids exactly as stored on the member rows...
            literal_ids = {
                row_by_id[m]["identity_id"] for m in member_ids
                if row_by_id[m].get("identity_id")
            }
            # ...and the same ids resolved through the merge chain.
            effective_ids = {identity_chain.get(i, i) for i in literal_ids}
            unassigned = [
                m for m in member_ids
                if not row_by_id[m].get("identity_id")
            ]

            if not effective_ids:
                # Fresh component — mint a new identity.
                identity_uuid = str(_uuid.uuid4())
                try:
                    await repo.create_attacker_identity({
                        "uuid": identity_uuid,
                        "schema_version": 1,
                        "first_seen_at": now,
                        "last_seen_at": now,
                        "created_at": now,
                        "updated_at": now,
                        "observation_count": len(member_ids),
                    })
                except Exception:  # noqa: BLE001
                    log.exception(
                        "clusterer: failed to create identity for component %s",
                        member_ids,
                    )
                    continue

                # Only observations whose link write succeeded are reported.
                linked: list[str] = []
                for obs_id in member_ids:
                    if await _link(repo, obs_id, identity_uuid):
                        linked.append(obs_id)
                if linked:
                    result.identities_formed.append({
                        "identity_uuid": identity_uuid,
                        "observation_uuids": linked,
                    })
                # NOTE(review): this view of the file has no indentation;
                # the roll-up is assumed to run regardless of ``linked``,
                # mirroring the merge branch below — confirm.
                await _roll_up_fingerprints(
                    repo, identity_uuid, [row_by_id[m] for m in member_ids],
                )
                continue

            # Deterministic winner so two clusterer runs produce the
            # same merge direction. Sorting by uuid string is stable
            # and doesn't depend on row insertion order.
            winner_uuid = min(effective_ids)
            losers = effective_ids - {winner_uuid}

            for loser_uuid in losers:
                try:
                    await repo.update_identity_merged_into(loser_uuid, winner_uuid)
                except Exception:  # noqa: BLE001
                    log.exception(
                        "clusterer: failed to merge %s -> %s",
                        loser_uuid, winner_uuid,
                    )
                    continue
                # Keep the in-memory chain in step with the write just made.
                identity_chain[loser_uuid] = winner_uuid
                result.identities_merged.append({
                    "winner_uuid": winner_uuid,
                    "loser_uuid": loser_uuid,
                })

            # Link any unassigned observations in the component to the
            # winner so a subsequent tick sees a single-identity
            # component and skips this branch entirely.
            for obs_id in unassigned:
                if await _link(repo, obs_id, winner_uuid):
                    result.observations_linked.append({
                        "identity_uuid": winner_uuid,
                        "observation_uuid": obs_id,
                    })

            # Re-roll the winner's fingerprint summary across every
            # observation now in this component (including the loser
            # side — the merge unifies their evidence even though the
            # loser's identity row stays FK'd via merged_into_uuid).
            await _roll_up_fingerprints(
                repo, winner_uuid, [row_by_id[m] for m in member_ids],
            )

        # Pass 2 — revocable-merge undo. For each currently-merged-out
        # identity, check whether its observations still cluster with
        # the winner's. If not, the merge is contradicted by new
        # evidence — clear merged_into_uuid and emit identity.unmerged.
        # Observations FK'd to the resurrected loser stay where they
        # were; the chain just stops following.
        #
        # Grouping uses ``identity_id`` as read at the start of the tick,
        # i.e. it does not reflect links written during pass 1.
        observations_by_literal_identity: dict[str, list[str]] = {}
        for obs_id, r in row_by_id.items():
            iid = r.get("identity_id")
            if iid:
                observations_by_literal_identity.setdefault(iid, []).append(obs_id)

        for identity_row in all_identities:
            # Only identities already merged out when the tick began qualify.
            if not identity_row.get("merged_into_uuid"):
                continue
            loser_uuid = identity_row["uuid"]
            winner_uuid = identity_chain.get(loser_uuid, loser_uuid)
            if winner_uuid == loser_uuid:
                continue  # broken chain — paranoia
            loser_obs = observations_by_literal_identity.get(loser_uuid, [])
            winner_obs = observations_by_literal_identity.get(winner_uuid, [])
            if not loser_obs or not winner_obs:
                # No observations either side — can't disprove the merge.
                continue
            loser_clusters = {labels[o] for o in loser_obs}
            winner_clusters = {labels[o] for o in winner_obs}
            if loser_clusters & winner_clusters:
                continue  # still co-clustered with winner — merge stands
            try:
                await repo.update_identity_merged_into(loser_uuid, None)
            except Exception:  # noqa: BLE001
                log.exception(
                    "clusterer: failed to unmerge %s from %s",
                    loser_uuid, winner_uuid,
                )
                continue
            # The resurrected identity is its own canonical winner again.
            identity_chain[loser_uuid] = loser_uuid
            result.identities_unmerged.append({
                "resurrected_uuid": loser_uuid,
                "former_winner_uuid": winner_uuid,
            })

        return result
|
||||||
|
|
||||||
|
|
||||||
|
def _build_merge_chain(
|
||||||
|
identities: list[dict[str, Any]],
|
||||||
|
) -> dict[str, str]:
|
||||||
|
"""Build a uuid → canonical-winner map from a list of identity rows.
|
||||||
|
|
||||||
|
Follows ``merged_into_uuid`` to a fixed point per identity, with a
|
||||||
|
hop cap to defend against accidental cycles. The returned dict
|
||||||
|
contains an entry for every identity uuid (mapping to itself if
|
||||||
|
not merged out).
|
||||||
|
"""
|
||||||
|
_MAX_HOPS = 8
|
||||||
|
by_uuid: dict[str, dict[str, Any]] = {i["uuid"]: i for i in identities}
|
||||||
|
chain: dict[str, str] = {}
|
||||||
|
for uuid_ in by_uuid:
|
||||||
|
cur = uuid_
|
||||||
|
for _ in range(_MAX_HOPS):
|
||||||
|
row = by_uuid.get(cur)
|
||||||
|
if row is None:
|
||||||
|
break
|
||||||
|
nxt = row.get("merged_into_uuid")
|
||||||
|
if not nxt or nxt == cur:
|
||||||
|
break
|
||||||
|
cur = nxt
|
||||||
|
chain[uuid_] = cur
|
||||||
|
return chain
|
||||||
|
|
||||||
|
|
||||||
|
async def _link(
|
||||||
|
repo: BaseRepository, observation_uuid: str, identity_uuid: str,
|
||||||
|
) -> bool:
|
||||||
|
"""Set ``attackers.identity_id`` and return ``True`` on success.
|
||||||
|
|
||||||
|
Wraps the repo call so the tick body stays linear and exception
|
||||||
|
handling is consistent across the form / link / merge branches.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
await repo.set_attacker_identity_id(observation_uuid, identity_uuid)
|
||||||
|
return True
|
||||||
|
except Exception: # noqa: BLE001
|
||||||
|
log.exception(
|
||||||
|
"clusterer: failed to link obs=%s -> identity=%s",
|
||||||
|
observation_uuid, identity_uuid,
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
async def _roll_up_fingerprints(
    repo: BaseRepository,
    identity_uuid: str,
    member_rows: list[dict[str, Any]],
) -> None:
    """Write the members' fingerprint summaries onto the identity row.

    The summaries come from ``extract_fp_summaries(member_rows)`` and are
    passed as keyword arguments to ``repo.update_identity_fingerprints``.
    Best-effort: a write failure is logged but never breaks the clusterer
    tick — the columns just stay stale until the next pass.
    """
    fp_columns = extract_fp_summaries(member_rows)
    try:
        await repo.update_identity_fingerprints(identity_uuid, **fp_columns)
    except Exception:  # noqa: BLE001
        log.exception(
            "clusterer: failed to roll up fingerprints for identity=%s",
            identity_uuid,
        )
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ConnectedComponentsClusterer",
|
||||||
|
"cluster_observations",
|
||||||
|
"from_attacker_row",
|
||||||
|
]
|
||||||
313
decnet/clustering/impl/similarity.py
Normal file
313
decnet/clustering/impl/similarity.py
Normal file
@@ -0,0 +1,313 @@
|
|||||||
|
"""Similarity-graph primitives for the connected-components clusterer.
|
||||||
|
|
||||||
|
Each function takes two :class:`Observation` projections and returns a
|
||||||
|
similarity score in ``[0.0, 1.0]``. The connected-components impl
|
||||||
|
(landing in subsequent commits) decides how to combine these into a
|
||||||
|
single edge weight, applies a threshold, and runs union-find.
|
||||||
|
|
||||||
|
**Time-agnostic.** Edges MUST NOT depend on observation timestamps.
|
||||||
|
Fixture 7 (``slow_burn``) proves recency-decay clustering fragments
|
||||||
|
multi-month APT campaigns; the production graph cannot silently expire
|
||||||
|
old edges. Timestamps are still useful for *audit* (the ``first_seen``
|
||||||
|
on the resulting identity row) but never for *similarity*.
|
||||||
|
|
||||||
|
**Weight tiers** (from `development/IDENTITY_RESOLUTION.md`):
|
||||||
|
|
||||||
|
* High — JA3 / HASSH / payload-hash / C2-callback exact match. Stable
|
||||||
|
signals an attacker can't cheaply rotate. A single high-tier match
|
||||||
|
supports identity strongly.
|
||||||
|
* Medium — command-sequence Jaccard, bucketed by UKC phase. Tooling
|
||||||
|
habits leak through command order; phase-bucketing avoids comparing
|
||||||
|
a Discovery cmd-list to an Exploitation one.
|
||||||
|
* Low — credential-attempt-set Jaccard. Defeated alone by fixture 1
|
||||||
|
(``shared_wordlist``) where two campaigns share rockyou but diverge
|
||||||
|
on infra.
|
||||||
|
* Very low — ASN match. Defeated alone by fixture 2 (``vpn_hopping``)
|
||||||
|
where one identity rotates across many ASNs.
|
||||||
|
|
||||||
|
The functions are pure (no DB, no I/O); the worker maps observations
|
||||||
|
into :class:`Observation` once per tick and feeds these into the
|
||||||
|
graph builder.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Mapping, Optional
|
||||||
|
|
||||||
|
# ─── Observation projection ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(frozen=True)
class Observation:
    """Minimal projection of a per-IP attacker observation.

    Built once per ``Attacker`` row by the worker (or per
    ``SyntheticAttacker`` in tests via :func:`from_synthetic`).
    Keeping the projection tight isolates the graph code from schema
    drift on either side.

    All set-typed fields are :class:`frozenset` so they hash and so
    callers don't accidentally mutate them mid-pass. Instances are
    hashable: every field except ``commands_by_phase`` feeds the
    generated ``__hash__``, while equality still compares all fields.
    """

    observation_id: str
    """Stable ID — for production, the ``Attacker.uuid``; for tests,
    the ``SyntheticAttacker.attacker_id``."""

    ja3: Optional[str] = None
    hassh: Optional[str] = None
    asn: Optional[int] = None

    payload_hashes: frozenset[str] = field(default_factory=frozenset)
    c2_endpoints: frozenset[str] = field(default_factory=frozenset)
    credentials: frozenset[tuple[str, str]] = field(default_factory=frozenset)

    # ``hash=False`` keeps the frozen dataclass hashable: the value is a
    # plain ``dict`` (unhashable), and without the exclusion the generated
    # ``__hash__`` raises ``TypeError`` for every instance. The field still
    # participates in ``__eq__``, so equal observations keep equal hashes.
    commands_by_phase: Mapping[str, tuple[str, ...]] = field(
        default_factory=dict, hash=False,
    )
    """``UKCPhase.value`` → ordered command sequence observed in that
    phase. Empty dict when no command-bearing sessions were seen."""
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Edge functions ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _fingerprints_fully_disagree(a: Observation, b: Observation) -> bool:
    """Report whether every comparable fingerprint slot differs.

    A slot (JA3, HASSH) is *comparable* only when both observations
    carry a non-null value for it. The result is ``True`` when at least
    one slot is comparable and none of the comparable slots agree.

    It is ``False`` when nothing is comparable (no evidence of
    disagreement, so no veto) and ``False`` as soon as any comparable
    slot agrees.

    Used as a soft veto on shared C2 / payload signals: distinct stable
    TLS + SSH stacks make a shared endpoint a *campaign*-level signal
    (cooperating operators) rather than an identity-level one — see
    fixture 5 (``multi_operator``).
    """
    slot_pairs = ((a.ja3, b.ja3), (a.hassh, b.hassh))
    # One verdict per comparable slot: True means "this slot disagrees".
    disagreements = [
        mine != theirs
        for mine, theirs in slot_pairs
        if mine is not None and theirs is not None
    ]
    return bool(disagreements) and all(disagreements)
|
||||||
|
|
||||||
|
|
||||||
|
def high_weight_edge(a: Observation, b: Observation) -> float:
    """Score the high tier: JA3 / HASSH / payload-hash / C2-endpoint match.

    Returns ``1.0`` when any of the four exact-match signals agrees
    (non-null / non-empty on both sides) and ``0.0`` otherwise. A single
    high-tier agreement is by design enough to support identity — these
    are the signals the design doc calls "stable signals an attacker
    can't cheaply rotate."

    **Fingerprint-disagreement veto.** Payload and C2 are infra signals
    that cooperating operators (different identities) can share, while
    JA3 / HASSH are tooling signals that differ between distinct tool
    stacks. When every comparable fingerprint slot disagrees, the
    payload / C2 contribution is dropped to zero so a campaign-level
    co-op signal cannot fuse two identities. Fixture 5
    (``multi_operator``) is the canonical case: shared stage-1 payload
    and C2, distinct JA3/HASSH per operator — must stay two identities.
    A JA3 or HASSH agreement returns ``1.0`` before the veto is even
    consulted.

    JA4 is expected to join this tier as a sibling of JA3 once the
    prober emits it; the function shape doesn't change.
    """
    # Tooling fingerprints first: any exact, non-null match wins outright.
    for mine, theirs in ((a.ja3, b.ja3), (a.hassh, b.hassh)):
        if mine is not None and mine == theirs:
            return 1.0

    # Stable-tool disagreement vetoes the shared-infra signals below.
    if _fingerprints_fully_disagree(a, b):
        return 0.0

    # An empty set on either side yields an empty intersection.
    shares_payload = bool(a.payload_hashes & b.payload_hashes)
    shares_c2 = bool(a.c2_endpoints & b.c2_endpoints)
    return 1.0 if (shares_payload or shares_c2) else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
def medium_weight_edge(a: Observation, b: Observation) -> float:
    """Score the medium tier: phase-bucketed command Jaccard.

    For every UKC phase present on both observations, the command
    sequences are collapsed to sets (order and multiplicity are
    ignored) and their Jaccard similarity is computed. The result is
    the **maximum** over the shared phases, so one strong phase match
    is not averaged away by another phase where the actors diverge.

    Commands are only ever compared within the same phase; a Discovery
    command list is never matched against an Exploitation one.

    Returns ``0.0`` when no phase is shared, or when every shared phase
    has no commands on either side.
    """
    common_phases = set(a.commands_by_phase).intersection(b.commands_by_phase)
    scores = []
    for phase in common_phases:
        left = set(a.commands_by_phase[phase])
        right = set(b.commands_by_phase[phase])
        either = left | right
        if either:  # skip phases that are empty on both sides
            scores.append(len(left & right) / len(either))
    return max(scores, default=0.0)
|
||||||
|
|
||||||
|
|
||||||
|
def low_weight_edge(a: Observation, b: Observation) -> float:
    """Score the low tier: Jaccard of attempted credential pairs.

    Compares the sets of ``(username, password)`` tuples. Two unrelated
    campaigns burning the same wordlist score high here — fixture 1
    shows the signal is dangerous in isolation — so the combined weight
    must never let this tier alone push a pair over threshold.

    Returns ``0.0`` when either side attempted no credentials.
    """
    mine, theirs = a.credentials, b.credentials
    if not (mine and theirs):
        return 0.0
    # Both sides are non-empty here, so the union cannot be empty.
    return len(mine & theirs) / len(mine | theirs)
|
||||||
|
|
||||||
|
|
||||||
|
def very_low_weight_edge(a: Observation, b: Observation) -> float:
    """Score the very-low tier: ASN equality.

    Returns ``1.0`` only when both observations carry a non-null ASN
    and the two values are equal; ``0.0`` in every other case. Fixture 2
    (``vpn_hopping``) shows ASN-only clustering is a failure mode — one
    identity legitimately rotates across many ASNs — so the tier weight
    keeps ASN agreement alone well below threshold.
    """
    both_known = a.asn is not None and b.asn is not None
    return 1.0 if both_known and a.asn == b.asn else 0.0
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Combined weight ────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
#: Tier multipliers applied to the per-tier edge scores when combining
#: into a single weight. Tuned so that:
#:
#:   * High-tier agreement alone (1.0) reaches the 1.0 threshold.
#:   * Medium-tier alone (max 1.0) yields 0.6 — below threshold.
#:   * Low-tier alone (max 1.0) yields 0.2 — defeats fixture 1's
#:     credential-overlap-only failure mode.
#:   * Very-low alone (max 1.0) yields 0.05 — defeats fixture 2's
#:     ASN-rotation failure mode.
#:
#: Even with medium, low and very-low all at their maximum the sum is
#: 0.85, so the three weaker tiers together still stay under threshold.
#:
#: The ratio between tiers matters more than the absolute values: a
#: tier should never combine its way past threshold without help from
#: a stronger one.
TIER_WEIGHTS: dict[str, float] = {
    "high": 1.0,
    "medium": 0.6,
    "low": 0.2,
    "very_low": 0.05,
}

#: Threshold a combined edge weight must meet to survive into the
#: similarity graph. The connected-components impl drops anything
#: under this before running union-find.
EDGE_THRESHOLD: float = 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def combined_edge_weight(a: Observation, b: Observation) -> float:
    """Combine the four tier scores into one edge weight.

    Each per-tier function returns a score in ``[0, 1]``; every score
    is multiplied by its entry in :data:`TIER_WEIGHTS` and the four
    products are added. The weighting lets stronger tiers dominate
    without letting weaker ones combine their way past threshold.

    The connected-components clusterer compares the result against
    :data:`EDGE_THRESHOLD` to decide whether to draw an edge. The
    function is pure and time-agnostic — fixture 7 forbids
    recency-decay weighting.

    A tier whose inputs are missing on either observation contributes
    ``0.0``, because the underlying tier function returns ``0.0`` in
    that case.
    """
    high = TIER_WEIGHTS["high"] * high_weight_edge(a, b)
    medium = TIER_WEIGHTS["medium"] * medium_weight_edge(a, b)
    low = TIER_WEIGHTS["low"] * low_weight_edge(a, b)
    very_low = TIER_WEIGHTS["very_low"] * very_low_weight_edge(a, b)
    # Plain left-to-right addition, strongest tier first.
    return high + medium + low + very_low
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Adapter for the synthetic-corpus tests ─────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def from_synthetic(att) -> Observation:  # type: ignore[no-untyped-def]
    """Project a ``SyntheticAttacker`` onto an :class:`Observation`.

    Walks ``att.sessions`` once and aggregates, across all sessions:

    * every truthy ``payload_hash`` and ``c2_callback``,
    * every entry of ``credentials_tried`` (each coerced to a tuple),
    * the session's ``commands``, appended in session order to the
      bucket keyed by ``phase.value``.

    ``ja3``, ``hassh``, ``asn`` and ``attacker_id`` are copied straight
    from *att*.

    The adapter lives in this module so test code doesn't have to
    import the factory shape into production code; callers import it
    lazily.
    """
    hashes: set[str] = set()
    endpoints: set[str] = set()
    cred_pairs: set[tuple[str, str]] = set()
    phase_commands: dict[str, list[str]] = {}

    for session in att.sessions:
        if session.payload_hash:
            hashes.add(session.payload_hash)
        if session.c2_callback:
            endpoints.add(session.c2_callback)
        cred_pairs.update(tuple(pair) for pair in session.credentials_tried)
        if session.commands:
            # The phase is only consulted for command-bearing sessions.
            bucket = phase_commands.setdefault(session.phase.value, [])
            bucket.extend(session.commands)

    frozen_commands = {
        phase: tuple(cmds) for phase, cmds in phase_commands.items()
    }
    return Observation(
        observation_id=att.attacker_id,
        ja3=att.ja3,
        hassh=att.hassh,
        asn=att.asn,
        payload_hashes=frozenset(hashes),
        c2_endpoints=frozenset(endpoints),
        credentials=frozenset(cred_pairs),
        commands_by_phase=frozen_commands,
    )
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"Observation",
|
||||||
|
"high_weight_edge",
|
||||||
|
"medium_weight_edge",
|
||||||
|
"low_weight_edge",
|
||||||
|
"very_low_weight_edge",
|
||||||
|
"combined_edge_weight",
|
||||||
|
"from_synthetic",
|
||||||
|
"EDGE_THRESHOLD",
|
||||||
|
"TIER_WEIGHTS",
|
||||||
|
]
|
||||||
108
decnet/clustering/ukc.py
Normal file
108
decnet/clustering/ukc.py
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
"""
|
||||||
|
Unified Kill Chain phase vocabulary (Pols, 2017).
|
||||||
|
|
||||||
|
Used as the canonical phase enum for campaign clustering and (eventually)
|
||||||
|
the MITRE ATT&CK / TTPs-tagging worker. UKC tactic names map cleanly onto
|
||||||
|
ATT&CK tactics, so emitting these labels in synthetic data and runtime
|
||||||
|
phase inference avoids a renaming pass when TTP-tagging lands.
|
||||||
|
|
||||||
|
A honeypot does not observe the entire chain. Pre-target phases (OSINT
|
||||||
|
reconnaissance, resource development, weaponization, social engineering)
|
||||||
|
happen before any decky is touched. The DSL allows the full enum so a
|
||||||
|
campaign spec can describe an end-to-end story; the synthetic generator
|
||||||
|
emits no events for unobservable phases.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from enum import Enum
|
||||||
|
|
||||||
|
|
||||||
|
class UKCPhase(str, Enum):
    """Unified Kill Chain phases, grouped by stage.

    Members are ``str`` subclasses, so each one compares equal to its
    lowercase snake_case value and can be looked up from that value
    with ``UKCPhase("discovery")``.
    """

    # Stage "In": initial foothold.
    RECONNAISSANCE = "reconnaissance"
    RESOURCE_DEVELOPMENT = "resource_development"
    WEAPONIZATION = "weaponization"
    DELIVERY = "delivery"
    SOCIAL_ENGINEERING = "social_engineering"
    EXPLOITATION = "exploitation"
    PERSISTENCE = "persistence"
    DEFENSE_EVASION = "defense_evasion"
    COMMAND_AND_CONTROL = "command_and_control"

    # Stage "Through": network propagation.
    PIVOTING = "pivoting"
    DISCOVERY = "discovery"
    PRIVILEGE_ESCALATION = "privilege_escalation"
    EXECUTION = "execution"
    CREDENTIAL_ACCESS = "credential_access"
    LATERAL_MOVEMENT = "lateral_movement"

    # Stage "Out": action on objectives.
    COLLECTION = "collection"
    EXFILTRATION = "exfiltration"
    IMPACT = "impact"
    OBJECTIVES = "objectives"
|
||||||
|
|
||||||
|
|
||||||
|
# Phases a honeypot can plausibly observe. Pre-target phases are excluded:
# OSINT recon, infrastructure-stand-up, payload authoring, and human-target
# manipulation all happen before the attacker touches a decky. The synthetic
# generator validates campaign specs against this set and warns (but does
# not error) on unobservable phases — a campaign can describe them; we just
# emit no events.
#
# Concretely this is every UKCPhase member except RECONNAISSANCE,
# RESOURCE_DEVELOPMENT, WEAPONIZATION and SOCIAL_ENGINEERING.
OBSERVABLE_PHASES: frozenset[UKCPhase] = frozenset({
    UKCPhase.DELIVERY,
    UKCPhase.EXPLOITATION,
    UKCPhase.PERSISTENCE,
    UKCPhase.DEFENSE_EVASION,
    UKCPhase.COMMAND_AND_CONTROL,
    UKCPhase.PIVOTING,
    UKCPhase.DISCOVERY,
    UKCPhase.PRIVILEGE_ESCALATION,
    UKCPhase.EXECUTION,
    UKCPhase.CREDENTIAL_ACCESS,
    UKCPhase.LATERAL_MOVEMENT,
    UKCPhase.COLLECTION,
    UKCPhase.EXFILTRATION,
    UKCPhase.IMPACT,
    UKCPhase.OBJECTIVES,
})
|
||||||
|
|
||||||
|
|
||||||
|
# Stage groupings — useful for the multi_operator fixture (operators tend
# to split along the In / Through / Out boundary) and for downstream
# UI rendering of campaign timelines.
#
# The three sets are disjoint and together contain every UKCPhase member,
# mirroring the In / Through / Out comment sections of the enum.
STAGE_IN: frozenset[UKCPhase] = frozenset({
    UKCPhase.RECONNAISSANCE,
    UKCPhase.RESOURCE_DEVELOPMENT,
    UKCPhase.WEAPONIZATION,
    UKCPhase.DELIVERY,
    UKCPhase.SOCIAL_ENGINEERING,
    UKCPhase.EXPLOITATION,
    UKCPhase.PERSISTENCE,
    UKCPhase.DEFENSE_EVASION,
    UKCPhase.COMMAND_AND_CONTROL,
})

STAGE_THROUGH: frozenset[UKCPhase] = frozenset({
    UKCPhase.PIVOTING,
    UKCPhase.DISCOVERY,
    UKCPhase.PRIVILEGE_ESCALATION,
    UKCPhase.EXECUTION,
    UKCPhase.CREDENTIAL_ACCESS,
    UKCPhase.LATERAL_MOVEMENT,
})

STAGE_OUT: frozenset[UKCPhase] = frozenset({
    UKCPhase.COLLECTION,
    UKCPhase.EXFILTRATION,
    UKCPhase.IMPACT,
    UKCPhase.OBJECTIVES,
})
|
||||||
|
|
||||||
|
|
||||||
|
def stage_of(phase: UKCPhase) -> str:
    """Map *phase* to its stage label: ``'in'``, ``'through'`` or ``'out'``.

    Membership is tested against ``STAGE_IN`` first, then
    ``STAGE_THROUGH``; anything found in neither set is reported as
    ``'out'``.
    """
    if phase in STAGE_IN:
        return "in"
    return "through" if phase in STAGE_THROUGH else "out"
|
||||||
180
decnet/clustering/worker.py
Normal file
180
decnet/clustering/worker.py
Normal file
@@ -0,0 +1,180 @@
|
|||||||
|
"""Long-running identity-resolution clusterer worker.
|
||||||
|
|
||||||
|
Runs :meth:`Clusterer.tick` on bus-wake or slow-tick fallback. Mirrors
|
||||||
|
:mod:`decnet.intel.worker` and :mod:`decnet.correlation.reuse_worker`:
|
||||||
|
woken on ``attacker.observed`` and ``attacker.scored`` for sub-second
|
||||||
|
latency, falls back to a 60s poll when the bus is unavailable.
|
||||||
|
|
||||||
|
The clusterer itself owns its DB writes (``attacker_identities`` +
|
||||||
|
``attackers.identity_id`` updates). The worker shell is responsible only
|
||||||
|
for:
|
||||||
|
|
||||||
|
* lifecycle (bus connect, heartbeat, control listener, clean shutdown),
|
||||||
|
* publishing ``identity.formed`` / ``identity.observation.linked`` /
|
||||||
|
``identity.merged`` / ``identity.unmerged`` from the
|
||||||
|
:class:`ClusterResult` returned by ``tick``.
|
||||||
|
|
||||||
|
The skeleton ``ConnectedComponentsClusterer.tick`` returns an empty
|
||||||
|
result, so this worker runs but emits no identity events until edges
|
||||||
|
are wired in.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from decnet.bus import topics as _topics
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.bus.publish import (
|
||||||
|
publish_safely,
|
||||||
|
run_control_listener_signal as _run_control_listener_signal,
|
||||||
|
run_health_heartbeat as _run_health_heartbeat,
|
||||||
|
)
|
||||||
|
from decnet.clustering.base import Clusterer, ClusterResult
|
||||||
|
from decnet.clustering.factory import get_clusterer
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("clustering.worker")
|
||||||
|
|
||||||
|
_DEFAULT_POLL_SECS = 60.0
|
||||||
|
|
||||||
|
|
||||||
|
async def run_clusterer_loop(
    repo: BaseRepository,
    *,
    poll_interval_secs: float = _DEFAULT_POLL_SECS,
    clusterer: Optional[Clusterer] = None,
    shutdown: Optional[asyncio.Event] = None,
) -> None:
    """Run the identity clusterer until cancelled.

    Each iteration awaits ``clusterer.tick(repo)``, publishes the
    returned :class:`ClusterResult` via :func:`_publish_result`, then
    sleeps until either a bus subscriber sets the wake event or
    *poll_interval_secs* elapses.

    *clusterer* defaults to :func:`get_clusterer` — tests pass a fake.
    *shutdown* is an optional external stop signal; it is checked at the
    top of each iteration only, so a stop request takes effect once the
    current wait finishes. The loop also exits cleanly on
    :class:`asyncio.CancelledError` and :class:`KeyboardInterrupt`
    (both are caught and logged, not re-raised).

    If the bus cannot be set up the loop still runs, in poll-only mode.
    A tick that raises is logged and treated as an empty result.
    """
    if clusterer is None:
        clusterer = get_clusterer()
    log.info(
        "clusterer started impl=%s poll_interval_secs=%s",
        clusterer.name, poll_interval_secs,
    )

    # ``bus`` stays None unless connect() succeeds; everything downstream
    # (publishing, cleanup) keys off that.
    bus: Optional[BaseBus] = None
    wake = asyncio.Event()
    wake_tasks: list[asyncio.Task] = []
    heartbeat_task: Optional[asyncio.Task] = None
    try:
        candidate = get_bus(client_name="clusterer")
        await candidate.connect()
        bus = candidate
        # One subscriber task per wake topic; each sets ``wake`` on an event.
        wake_tasks.append(asyncio.create_task(
            _wake_on(bus, wake, _topics.attacker(_topics.ATTACKER_OBSERVED)),
        ))
        wake_tasks.append(asyncio.create_task(
            _wake_on(bus, wake, _topics.attacker(_topics.ATTACKER_SCORED)),
        ))
        heartbeat_task = asyncio.create_task(
            _run_health_heartbeat(bus, "clusterer"),
        )
        # The control listener is tracked in ``wake_tasks`` only so it is
        # cancelled and awaited together with them in the ``finally`` below.
        wake_tasks.append(asyncio.create_task(
            _run_control_listener_signal(bus, "clusterer"),
        ))
    except Exception as exc:  # noqa: BLE001
        # Any failure during bus setup degrades to poll-only mode.
        log.warning(
            "clusterer: bus unavailable, running in poll-only mode: %s", exc,
        )

    if shutdown is None:
        shutdown = asyncio.Event()

    try:
        while not shutdown.is_set():
            try:
                result = await clusterer.tick(repo)
            except Exception:  # noqa: BLE001
                # A failed tick publishes nothing; the loop keeps running.
                log.exception("clusterer: tick failed")
                result = ClusterResult()

            await _publish_result(bus, result)

            # Sleep until woken by the bus or until the poll interval
            # elapses, whichever comes first.
            try:
                await asyncio.wait_for(
                    wake.wait(), timeout=float(poll_interval_secs),
                )
            except asyncio.TimeoutError:
                pass
            wake.clear()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("clusterer stopped")
    finally:
        # Cancel every background task first, then await each one so the
        # cancellations are actually processed before the bus is closed.
        for t in wake_tasks:
            t.cancel()
        if heartbeat_task is not None:
            heartbeat_task.cancel()
        for t in (*wake_tasks, heartbeat_task):
            if t is None:
                continue
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await t
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()
|
||||||
|
|
||||||
|
|
||||||
|
async def _publish_result(bus: Optional[BaseBus], result: ClusterResult) -> None:
    """Publish every event carried by *result*, one bus message each.

    The four ``ClusterResult`` collections are drained in a fixed order
    — formed, observation-linked, merged, unmerged — and each entry is
    handed to ``publish_safely`` on the topic built by
    ``_topics.identity(<event type>)`` with the matching ``event_type``.
    *bus* is passed through unchanged, including when it is ``None``.
    """
    fanout = (
        (result.identities_formed, _topics.IDENTITY_FORMED),
        (result.observations_linked, _topics.IDENTITY_OBSERVATION_LINKED),
        (result.identities_merged, _topics.IDENTITY_MERGED),
        (result.identities_unmerged, _topics.IDENTITY_UNMERGED),
    )
    for payloads, event_type in fanout:
        for payload in payloads:
            await publish_safely(
                bus,
                _topics.identity(event_type),
                payload,
                event_type=event_type,
            )
|
||||||
|
|
||||||
|
|
||||||
|
async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
    """Set *wake* for every event delivered on *pattern*.

    Subscribes via ``bus.subscribe(pattern)`` and sets the event once
    per received message; the message content itself is ignored.

    Cancellation is propagated to the caller. Any other subscriber
    error is logged as a warning and ends this task quietly — the main
    loop's poll-interval timeout then acts as the fallback wake source.
    """
    try:
        subscription = bus.subscribe(pattern)
        async with subscription:
            async for _ in subscription:
                wake.set()
    except asyncio.CancelledError:
        raise
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "clusterer: subscriber for %s died (%s); falling back to poll",
            pattern, exc,
        )
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["run_clusterer_loop"]
|
||||||
@@ -7,6 +7,7 @@ The ingester tails the .json file; rsyslog can consume the .log file independent
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import contextlib
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
@@ -15,11 +16,23 @@ import time
|
|||||||
from concurrent.futures import ThreadPoolExecutor
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Optional
|
from typing import Any, Callable, Optional
|
||||||
|
|
||||||
|
from decnet.bus import topics as _topics
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.bus.publish import (
|
||||||
|
make_thread_safe_publisher,
|
||||||
|
run_control_listener_signal,
|
||||||
|
run_health_heartbeat,
|
||||||
|
)
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer, inject_context as _inject_ctx
|
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer, inject_context as _inject_ctx
|
||||||
|
|
||||||
|
# Collector publish signature: ``publish_fn(parsed_event_dict)``. Callable
|
||||||
|
# from the container-stream threads; the worker wraps it around a thread-safe
|
||||||
|
# bus publisher that marshals onto the asyncio loop.
|
||||||
|
CollectorPublishFn = Callable[[dict[str, Any]], None]
|
||||||
|
|
||||||
logger = get_logger("collector")
|
logger = get_logger("collector")
|
||||||
|
|
||||||
# ─── Ingestion rate limiter ───────────────────────────────────────────────────
|
# ─── Ingestion rate limiter ───────────────────────────────────────────────────
|
||||||
@@ -127,6 +140,22 @@ _IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_
|
|||||||
# as one unit; we only care about IP-shaped fields here anyway.
|
# as one unit; we only care about IP-shaped fields here anyway.
|
||||||
_MSG_KV_RE = re.compile(r'(\w+)=(\S+)')
|
_MSG_KV_RE = re.compile(r'(\w+)=(\S+)')
|
||||||
|
|
||||||
|
# Native sshd / pam syslog lines arrive without an SD block and without
|
||||||
|
# key=value pairs. The remote address shows up as free prose:
|
||||||
|
# "Failed password for root from 1.2.3.4 port 42772 ssh2"
|
||||||
|
# "Connection from 1.2.3.4 port 42772 on 10.0.0.2 port 22"
|
||||||
|
# "pam_unix(sshd:auth): authentication failure; … rhost=1.2.3.4 user=root"
|
||||||
|
# Anchored patterns first so we never confuse the attacker with the
|
||||||
|
# local listener IP ("on 10.0.0.2"). Bare IP scan is the last resort.
|
||||||
|
_IPV4 = r"\d{1,3}(?:\.\d{1,3}){3}"
|
||||||
|
_IPV6 = r"[0-9a-fA-F:]+:[0-9a-fA-F:]+"
|
||||||
|
_IP = rf"(?:{_IPV4}|{_IPV6})"
|
||||||
|
_MSG_IP_ANCHORED_RE = re.compile(
|
||||||
|
rf"\b(?:from|rhost[:=]|client[:=]|src[:=])\s*({_IP})",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
_MSG_IP_BARE_RE = re.compile(rf"\b({_IPV4})\b")
|
||||||
|
|
||||||
|
|
||||||
def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
|
def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
|
||||||
"""
|
"""
|
||||||
@@ -173,6 +202,19 @@ def parse_rfc5424(line: str) -> Optional[dict[str, Any]]:
|
|||||||
attacker_ip = v
|
attacker_ip = v
|
||||||
break
|
break
|
||||||
|
|
||||||
|
# Final fallback for native syslog producers that emit free-form prose
|
||||||
|
# (notably sshd and pam_unix routed via rsyslog without the relay@55555
|
||||||
|
# SD wrapper). Prefer anchored matches so the local listener address in
|
||||||
|
# "Connection from X port Y on Z port 22" never wins over X.
|
||||||
|
if attacker_ip == "Unknown" and msg:
|
||||||
|
anchored = _MSG_IP_ANCHORED_RE.search(msg)
|
||||||
|
if anchored:
|
||||||
|
attacker_ip = anchored.group(1)
|
||||||
|
else:
|
||||||
|
bare = _MSG_IP_BARE_RE.search(msg)
|
||||||
|
if bare:
|
||||||
|
attacker_ip = bare.group(1)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ts_formatted = datetime.fromisoformat(ts_raw).strftime("%Y-%m-%d %H:%M:%S")
|
ts_formatted = datetime.fromisoformat(ts_raw).strftime("%Y-%m-%d %H:%M:%S")
|
||||||
except ValueError:
|
except ValueError:
|
||||||
@@ -210,14 +252,67 @@ def _load_service_container_names() -> set[str]:
|
|||||||
return names
|
return names
|
||||||
|
|
||||||
|
|
||||||
|
_TOPOLOGY_SERVICE_LABEL = "decnet.topology.service"
|
||||||
|
_FLEET_SERVICE_LABEL = "decnet.fleet.service"
|
||||||
|
|
||||||
|
|
||||||
|
def _has_decnet_service_label(labels: Optional[dict]) -> bool:
    """Tell whether *labels* marks a DECNET service container.

    A container qualifies when either the MazeNET topology label
    (``decnet.topology.service``, set by ``decnet/topology/compose.py``)
    or the fleet label (``decnet.fleet.service``, set by
    ``decnet/composer.py``) has the exact string value ``"true"``.
    ``None`` or an empty mapping yields ``False``.

    Label-based detection is the canonical path: it is stateless and
    avoids the race between ``docker compose up`` and the
    ``decnet-state.json`` write that previously caused freshly-deployed
    fleet containers to be silently dropped by the docker-events
    watcher.
    """
    if not labels:
        return False
    service_keys = (_TOPOLOGY_SERVICE_LABEL, _FLEET_SERVICE_LABEL)
    return any(labels.get(key) == "true" for key in service_keys)
|
||||||
|
|
||||||
|
|
||||||
def is_service_container(container) -> bool:
|
def is_service_container(container) -> bool:
|
||||||
"""Return True if this Docker container is a known DECNET service container."""
|
"""Return True if this Docker container is a known DECNET service container.
|
||||||
name = (container if isinstance(container, str) else container.name).lstrip("/")
|
|
||||||
|
Label-based detection is preferred (works for both fleet and MazeNET
|
||||||
|
topology containers without touching decnet-state.json). The
|
||||||
|
state-file name match remains as a fallback so containers built from
|
||||||
|
older composes — which predate the ``decnet.fleet.service`` label —
|
||||||
|
are still picked up.
|
||||||
|
"""
|
||||||
|
if isinstance(container, str):
|
||||||
|
return container.lstrip("/") in _load_service_container_names()
|
||||||
|
labels: Optional[dict] = None
|
||||||
|
attrs = getattr(container, "attrs", None)
|
||||||
|
if isinstance(attrs, dict):
|
||||||
|
labels = (attrs.get("Config") or {}).get("Labels")
|
||||||
|
if labels is None:
|
||||||
|
labels = getattr(container, "labels", None)
|
||||||
|
if _has_decnet_service_label(labels):
|
||||||
|
return True
|
||||||
|
# Fallback: legacy containers without labels still match by name.
|
||||||
|
name = container.name.lstrip("/")
|
||||||
return name in _load_service_container_names()
|
return name in _load_service_container_names()
|
||||||
|
|
||||||
|
|
||||||
def is_service_event(attrs: dict) -> bool:
|
def is_service_event(attrs: dict) -> bool:
|
||||||
"""Return True if a Docker start event is for a known DECNET service container."""
|
"""Return True if a Docker start event is for a known DECNET service container.
|
||||||
|
|
||||||
|
Docker start-event attrs flatten every container label alongside the
|
||||||
|
``name``/``image`` keys — no separate ``labels`` sub-dict — so label
|
||||||
|
detection happens directly on ``attrs``.
|
||||||
|
|
||||||
|
Prefer the label path because it's race-free with respect to the
|
||||||
|
``decnet-state.json`` write that ``decnet deploy`` performs around
|
||||||
|
``docker compose up``: a freshly-started container's start event can
|
||||||
|
arrive before the state file has been updated, and the legacy
|
||||||
|
name-based fallback would then drop the event.
|
||||||
|
"""
|
||||||
|
if _has_decnet_service_label(attrs):
|
||||||
|
return True
|
||||||
name = attrs.get("name", "").lstrip("/")
|
name = attrs.get("name", "").lstrip("/")
|
||||||
return name in _load_service_container_names()
|
return name in _load_service_container_names()
|
||||||
|
|
||||||
@@ -244,7 +339,12 @@ def _reopen_if_needed(path: Path, fh: Optional[Any]) -> Any:
|
|||||||
|
|
||||||
|
|
||||||
@_traced("collector.stream_container")
|
@_traced("collector.stream_container")
|
||||||
def _stream_container(container_id: str, log_path: Path, json_path: Path) -> None:
|
def _stream_container(
|
||||||
|
container_id: str,
|
||||||
|
log_path: Path,
|
||||||
|
json_path: Path,
|
||||||
|
publish_fn: CollectorPublishFn | None = None,
|
||||||
|
) -> None:
|
||||||
"""Stream logs from one container and append to the host log files."""
|
"""Stream logs from one container and append to the host log files."""
|
||||||
import docker # type: ignore[import]
|
import docker # type: ignore[import]
|
||||||
|
|
||||||
@@ -279,6 +379,13 @@ def _stream_container(container_id: str, log_path: Path, json_path: Path) -> Non
|
|||||||
jf = _reopen_if_needed(json_path, jf)
|
jf = _reopen_if_needed(json_path, jf)
|
||||||
jf.write(json.dumps(parsed) + "\n")
|
jf.write(json.dumps(parsed) + "\n")
|
||||||
jf.flush()
|
jf.flush()
|
||||||
|
if publish_fn is not None:
|
||||||
|
try:
|
||||||
|
publish_fn(parsed)
|
||||||
|
except Exception as exc:
|
||||||
|
logger.debug(
|
||||||
|
"collector: bus publish failed: %s", exc,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"collector: rate-limited decky=%s service=%s type=%s attacker=%s",
|
"collector: rate-limited decky=%s service=%s type=%s attacker=%s",
|
||||||
@@ -298,6 +405,41 @@ def _stream_container(container_id: str, log_path: Path, json_path: Path) -> Non
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
# ─── Bus plumbing ─────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _make_system_log_publisher(
|
||||||
|
bus: Any, loop: asyncio.AbstractEventLoop,
|
||||||
|
) -> CollectorPublishFn:
|
||||||
|
"""Factory: returns a ``publish_fn(parsed)`` for use by stream threads.
|
||||||
|
|
||||||
|
When *bus* is ``None`` the returned callable is a no-op, so the stream
|
||||||
|
thread can call it unconditionally. Otherwise each call is marshalled
|
||||||
|
onto *loop* (the asyncio event loop that owns the bus socket) via
|
||||||
|
``make_thread_safe_publisher``.
|
||||||
|
"""
|
||||||
|
raw_publish = make_thread_safe_publisher(bus, loop) if bus is not None else None
|
||||||
|
if raw_publish is None:
|
||||||
|
return lambda _parsed: None
|
||||||
|
|
||||||
|
topic = _topics.system(_topics.SYSTEM_LOG)
|
||||||
|
|
||||||
|
def _publish(parsed: dict[str, Any]) -> None:
|
||||||
|
event_type = parsed.get("event_type", "")
|
||||||
|
raw_publish(
|
||||||
|
topic,
|
||||||
|
{
|
||||||
|
"decky": parsed.get("decky", ""),
|
||||||
|
"service": parsed.get("service", ""),
|
||||||
|
"event_type": event_type,
|
||||||
|
"attacker_ip": parsed.get("attacker_ip", "Unknown"),
|
||||||
|
"timestamp": parsed.get("timestamp", ""),
|
||||||
|
},
|
||||||
|
event_type,
|
||||||
|
)
|
||||||
|
|
||||||
|
return _publish
|
||||||
|
|
||||||
|
|
||||||
# ─── Async collector ──────────────────────────────────────────────────────────
|
# ─── Async collector ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
async def log_collector_worker(log_file: str) -> None:
|
async def log_collector_worker(log_file: str) -> None:
|
||||||
@@ -317,6 +459,38 @@ async def log_collector_worker(log_file: str) -> None:
|
|||||||
active: dict[str, asyncio.Task[None]] = {}
|
active: dict[str, asyncio.Task[None]] = {}
|
||||||
loop = asyncio.get_running_loop()
|
loop = asyncio.get_running_loop()
|
||||||
|
|
||||||
|
# Optional bus wiring — per-line system.log publish. Fan-in from many
|
||||||
|
# container-stream threads is handled by make_thread_safe_publisher,
|
||||||
|
# which marshals each publish onto this loop.
|
||||||
|
bus = None
|
||||||
|
try:
|
||||||
|
bus = get_bus(client_name="collector")
|
||||||
|
await bus.connect()
|
||||||
|
except Exception as exc:
|
||||||
|
logger.warning("collector: bus unavailable, continuing without publish: %s", exc)
|
||||||
|
bus = None
|
||||||
|
|
||||||
|
_publish_log = _make_system_log_publisher(bus, loop)
|
||||||
|
|
||||||
|
# Workers panel health heartbeat + bus-driven stop control. The
|
||||||
|
# heartbeat beacons on system.collector.health every 30s; the
|
||||||
|
# control listener translates a bus stop intent into a SIGTERM to
|
||||||
|
# this process (collector's main loop is a blocking thread pool, so
|
||||||
|
# self-signalling is cleaner than threading a shutdown event).
|
||||||
|
heartbeat_task = asyncio.create_task(run_health_heartbeat(bus, "collector"))
|
||||||
|
control_task = asyncio.create_task(run_control_listener_signal(bus, "collector"))
|
||||||
|
|
||||||
|
# Periodic re-scan of running containers. Belt to the event-watcher's
|
||||||
|
# suspenders: if dockerd or the SDK ever drops a start event during a
|
||||||
|
# reconnect window (the retry loop in ``_watch_events`` covers the
|
||||||
|
# restart itself, but events fired *during* the gap are lost), this
|
||||||
|
# loop picks up the orphan within ``RECONCILE_INTERVAL_S``. Also
|
||||||
|
# prunes finished futures so ``active`` doesn't accumulate over the
|
||||||
|
# agent's lifetime as topology mutations churn containers.
|
||||||
|
_reconcile_interval_s = float(
|
||||||
|
os.environ.get("DECNET_COLLECTOR_RECONCILE_S", "30")
|
||||||
|
)
|
||||||
|
|
||||||
# Dedicated thread pool so long-running container log streams don't
|
# Dedicated thread pool so long-running container log streams don't
|
||||||
# saturate the default asyncio executor and starve short-lived
|
# saturate the default asyncio executor and starve short-lived
|
||||||
# to_thread() calls elsewhere (e.g. load_state in the web API).
|
# to_thread() calls elsewhere (e.g. load_state in the web API).
|
||||||
@@ -329,7 +503,7 @@ async def log_collector_worker(log_file: str) -> None:
|
|||||||
active[container_id] = asyncio.ensure_future(
|
active[container_id] = asyncio.ensure_future(
|
||||||
loop.run_in_executor(
|
loop.run_in_executor(
|
||||||
collector_pool, _stream_container,
|
collector_pool, _stream_container,
|
||||||
container_id, log_path, json_path,
|
container_id, log_path, json_path, _publish_log,
|
||||||
),
|
),
|
||||||
loop=loop,
|
loop=loop,
|
||||||
)
|
)
|
||||||
@@ -339,20 +513,73 @@ async def log_collector_worker(log_file: str) -> None:
|
|||||||
logger.info("collector started log_path=%s", log_path)
|
logger.info("collector started log_path=%s", log_path)
|
||||||
client = docker.from_env()
|
client = docker.from_env()
|
||||||
|
|
||||||
|
async def _reconcile_loop() -> None:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
await asyncio.sleep(_reconcile_interval_s)
|
||||||
|
# Drop done futures so the dict's bounded by the
|
||||||
|
# current container count, not lifetime churn.
|
||||||
|
for cid in [c for c, t in active.items() if t.done()]:
|
||||||
|
active.pop(cid, None)
|
||||||
|
containers = await loop.run_in_executor(
|
||||||
|
collector_pool,
|
||||||
|
lambda: list(client.containers.list()),
|
||||||
|
)
|
||||||
|
for container in containers:
|
||||||
|
if container.id in active:
|
||||||
|
continue
|
||||||
|
if is_service_container(container):
|
||||||
|
_spawn(container.id, container.name.lstrip("/"))
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
raise
|
||||||
|
except Exception as exc: # noqa: BLE001 — keep loop alive across SDK transients
|
||||||
|
logger.warning("collector: reconcile pass failed: %s", exc)
|
||||||
|
|
||||||
|
reconcile_task = asyncio.create_task(_reconcile_loop())
|
||||||
|
|
||||||
for container in client.containers.list():
|
for container in client.containers.list():
|
||||||
if is_service_container(container):
|
if is_service_container(container):
|
||||||
_spawn(container.id, container.name.lstrip("/"))
|
_spawn(container.id, container.name.lstrip("/"))
|
||||||
|
|
||||||
def _watch_events() -> None:
|
def _watch_events() -> None:
|
||||||
for event in client.events(
|
# The dockerd event stream is the fast path for picking up
|
||||||
decode=True,
|
# newly-started service containers. It can break in two ways:
|
||||||
filters={"type": "container", "event": "start"},
|
# (a) dockerd restart / reload severs the long-poll, (b) the
|
||||||
):
|
# SDK's JSON-stream decoder occasionally raises on a partial
|
||||||
attrs = event.get("Actor", {}).get("Attributes", {})
|
# frame. Both used to make this thread return cleanly, leaving
|
||||||
cid = event.get("id", "")
|
# the collector "running" with no event subscription — future
|
||||||
name = attrs.get("name", "")
|
# container starts were silently dropped until an operator
|
||||||
if cid and is_service_event(attrs):
|
# restarted the unit. Retry with exponential backoff (cap at
|
||||||
loop.call_soon_threadsafe(_spawn, cid, name)
|
# 30s, matching the heartbeat cadence) so dockerd hiccups are
|
||||||
|
# invisible to the operator. The reconcile loop is the safety
|
||||||
|
# net for any events lost during the reconnect window.
|
||||||
|
backoff = 1.0
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
for event in client.events(
|
||||||
|
decode=True,
|
||||||
|
filters={"type": "container", "event": "start"},
|
||||||
|
):
|
||||||
|
attrs = event.get("Actor", {}).get("Attributes", {})
|
||||||
|
cid = event.get("id", "")
|
||||||
|
name = attrs.get("name", "")
|
||||||
|
if cid and is_service_event(attrs):
|
||||||
|
loop.call_soon_threadsafe(_spawn, cid, name)
|
||||||
|
# Clean iterator exhaustion: real dockerd doesn't
|
||||||
|
# close the stream voluntarily, so this only
|
||||||
|
# happens in tests with mocked iterators or in
|
||||||
|
# genuinely unrecoverable daemon states. Either
|
||||||
|
# way, returning lets the worker shut down
|
||||||
|
# cleanly — the reconciler is the safety net for
|
||||||
|
# productive cases.
|
||||||
|
return
|
||||||
|
except Exception as exc: # noqa: BLE001 — SDK leaks bare Exceptions on stream-decode errors
|
||||||
|
logger.warning(
|
||||||
|
"collector: event stream broke (%s: %s); reconnecting in %.1fs",
|
||||||
|
type(exc).__name__, exc, backoff,
|
||||||
|
)
|
||||||
|
time.sleep(backoff)
|
||||||
|
backoff = min(backoff * 2, 30.0)
|
||||||
|
|
||||||
await loop.run_in_executor(collector_pool, _watch_events)
|
await loop.run_in_executor(collector_pool, _watch_events)
|
||||||
|
|
||||||
@@ -366,3 +593,15 @@ async def log_collector_worker(log_file: str) -> None:
|
|||||||
logger.error("collector error: %s", exc)
|
logger.error("collector error: %s", exc)
|
||||||
finally:
|
finally:
|
||||||
collector_pool.shutdown(wait=False)
|
collector_pool.shutdown(wait=False)
|
||||||
|
# `reconcile_task` may not exist if startup failed before
|
||||||
|
# `client = docker.from_env()` returned; tolerate that.
|
||||||
|
_maintenance_tasks = [heartbeat_task, control_task]
|
||||||
|
if "reconcile_task" in locals():
|
||||||
|
_maintenance_tasks.append(reconcile_task)
|
||||||
|
for t in _maintenance_tasks:
|
||||||
|
t.cancel()
|
||||||
|
with contextlib.suppress(Exception, asyncio.CancelledError):
|
||||||
|
await t
|
||||||
|
if bus is not None:
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
await bus.close()
|
||||||
|
|||||||
@@ -91,6 +91,19 @@ def generate_compose(config: DecnetConfig) -> dict:
|
|||||||
# Rotate Docker logs so disk usage is bounded
|
# Rotate Docker logs so disk usage is bounded
|
||||||
fragment["logging"] = _DOCKER_LOGGING
|
fragment["logging"] = _DOCKER_LOGGING
|
||||||
|
|
||||||
|
# Stamp DECNET ownership labels so the collector's docker-events
|
||||||
|
# watcher can identify newly-started containers without consulting
|
||||||
|
# decnet-state.json (which is written and read out-of-band with
|
||||||
|
# `docker compose up`, leaving a race window where freshly started
|
||||||
|
# containers were silently ignored).
|
||||||
|
labels = dict(fragment.get("labels") or {})
|
||||||
|
labels.update({
|
||||||
|
"decnet.fleet.service": "true",
|
||||||
|
"decnet.fleet.decky": decky.name,
|
||||||
|
"decnet.fleet.service_name": svc_name,
|
||||||
|
})
|
||||||
|
fragment["labels"] = labels
|
||||||
|
|
||||||
services[f"{decky.name}-{svc_name}"] = fragment
|
services[f"{decky.name}-{svc_name}"] = fragment
|
||||||
|
|
||||||
# Network definitions
|
# Network definitions
|
||||||
|
|||||||
@@ -82,19 +82,33 @@ def _configure_logging(dev: bool) -> None:
|
|||||||
_in_pytest = any(k.startswith("PYTEST") for k in os.environ)
|
_in_pytest = any(k.startswith("PYTEST") for k in os.environ)
|
||||||
if not _in_pytest:
|
if not _in_pytest:
|
||||||
_log_path = os.environ.get("DECNET_SYSTEM_LOGS", "decnet.system.log")
|
_log_path = os.environ.get("DECNET_SYSTEM_LOGS", "decnet.system.log")
|
||||||
file_handler = InodeAwareRotatingFileHandler(
|
# Never let file-handler attach failure kill the process. The
|
||||||
_log_path,
|
# stream handler above is already installed, so losing the file
|
||||||
mode="a",
|
# handler just means 'tail syslog / journalctl instead' — the
|
||||||
maxBytes=10 * 1024 * 1024, # 10 MB
|
# daemon itself must keep running. This path trips most
|
||||||
backupCount=5,
|
# commonly under systemd with ProtectSystem=full + ProtectHome=
|
||||||
encoding="utf-8",
|
# read-only when an operator hasn't passed a writable
|
||||||
)
|
# DECNET_SYSTEM_LOGS yet.
|
||||||
file_handler.setFormatter(fmt)
|
try:
|
||||||
root.addHandler(file_handler)
|
file_handler = InodeAwareRotatingFileHandler(
|
||||||
# Drop root ownership when invoked via sudo so non-root follow-up
|
_log_path,
|
||||||
# commands (e.g. `decnet api` after `sudo decnet deploy`) can append.
|
mode="a",
|
||||||
from decnet.privdrop import chown_to_invoking_user
|
maxBytes=10 * 1024 * 1024, # 10 MB
|
||||||
chown_to_invoking_user(_log_path)
|
backupCount=5,
|
||||||
|
encoding="utf-8",
|
||||||
|
)
|
||||||
|
file_handler.setFormatter(fmt)
|
||||||
|
root.addHandler(file_handler)
|
||||||
|
# Drop root ownership when invoked via sudo so non-root follow-up
|
||||||
|
# commands (e.g. `decnet api` after `sudo decnet deploy`) can append.
|
||||||
|
from decnet.privdrop import chown_to_invoking_user
|
||||||
|
chown_to_invoking_user(_log_path)
|
||||||
|
except OSError as exc:
|
||||||
|
logging.getLogger(__name__).warning(
|
||||||
|
"could not open %s (%s); continuing with stderr-only logging. "
|
||||||
|
"Set DECNET_SYSTEM_LOGS to a writable path to silence this.",
|
||||||
|
_log_path, exc,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
_dev = os.environ.get("DECNET_DEVELOPER", "").lower() == "true"
|
_dev = os.environ.get("DECNET_DEVELOPER", "").lower() == "true"
|
||||||
|
|||||||
@@ -9,31 +9,71 @@ had been exported by the shell.
|
|||||||
Shape::
|
Shape::
|
||||||
|
|
||||||
[decnet]
|
[decnet]
|
||||||
mode = agent # or "master"
|
mode = master # or "agent"
|
||||||
log-directory = /var/log/decnet
|
|
||||||
disallow-master = true
|
[api]
|
||||||
|
host = 127.0.0.1
|
||||||
|
port = 8000
|
||||||
|
|
||||||
|
[web]
|
||||||
|
host = 127.0.0.1
|
||||||
|
port = 8080
|
||||||
|
admin-user = admin
|
||||||
|
cors-origins = http://localhost:8080
|
||||||
|
|
||||||
|
[database]
|
||||||
|
type = sqlite # or "mysql"
|
||||||
|
url = mysql+asyncmy://user@host:3306/decnet # wins over host/port/name/user
|
||||||
|
host = localhost
|
||||||
|
port = 3306
|
||||||
|
name = decnet
|
||||||
|
user = decnet
|
||||||
|
|
||||||
|
[bus]
|
||||||
|
enabled = true
|
||||||
|
type = unix # or "fake"
|
||||||
|
socket = /run/decnet/bus.sock
|
||||||
|
group = decnet
|
||||||
|
|
||||||
|
[swarm]
|
||||||
|
master-host = 10.0.0.1 # required on agents
|
||||||
|
syslog-port = 6514
|
||||||
|
swarmctl-port = 8770
|
||||||
|
|
||||||
|
[logging]
|
||||||
|
system-log = /var/log/decnet/decnet.system.log
|
||||||
|
ingest-log = /var/log/decnet/decnet.log
|
||||||
|
agent-log = /var/log/decnet/agent.log
|
||||||
|
|
||||||
|
[ingester]
|
||||||
|
batch-size = 100
|
||||||
|
batch-max-wait-ms = 250
|
||||||
|
|
||||||
|
[tracing]
|
||||||
|
enabled = false
|
||||||
|
otel-endpoint = http://localhost:4317
|
||||||
|
|
||||||
[agent]
|
[agent]
|
||||||
master-host = 192.168.1.50
|
# Written by the enroll bundle on agent hosts — don't hand-edit.
|
||||||
master-port = 8770
|
host-uuid = ...
|
||||||
agent-port = 8765
|
master-host = ...
|
||||||
agent-dir = /home/anti/.decnet/agent
|
|
||||||
...
|
|
||||||
|
|
||||||
[master]
|
The ``[decnet]`` and role-specific ``[agent]`` / ``[master]`` sections
|
||||||
api-host = 0.0.0.0
|
use auto kebab-to-snake translation (``master-host`` → ``DECNET_MASTER_HOST``).
|
||||||
swarmctl-port = 8770
|
The domain sections (``[api]``, ``[web]``, etc.) use an explicit key map
|
||||||
listener-port = 6514
|
so ``[web] admin-user`` resolves to ``DECNET_ADMIN_USER`` without silently
|
||||||
...
|
renaming the env-var contract consumers already import from ``decnet.env``.
|
||||||
|
|
||||||
Only the section matching `mode` is loaded. The other section is
|
Secrets (``DECNET_JWT_SECRET``, ``DECNET_ADMIN_PASSWORD``,
|
||||||
ignored silently so an agent host never reads master secrets (and
|
``DECNET_DB_PASSWORD``) are deliberately NOT in the domain map. They
|
||||||
vice versa). Keys are converted to SCREAMING_SNAKE_CASE and prefixed
|
belong in ``.env.local`` / systemd ``EnvironmentFile=`` so they never
|
||||||
with ``DECNET_`` — e.g. ``master-host`` → ``DECNET_MASTER_HOST``.
|
hit the dashboard, never end up in `config.ini`-style diffs, and never
|
||||||
|
get group-readable alongside tunables.
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import configparser
|
import configparser
|
||||||
|
import logging
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
@@ -41,10 +81,62 @@ from typing import Optional
|
|||||||
|
|
||||||
DEFAULT_CONFIG_PATH = Path("/etc/decnet/decnet.ini")
|
DEFAULT_CONFIG_PATH = Path("/etc/decnet/decnet.ini")
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
# The [decnet] section keys are role-agnostic and always exported.
|
# The [decnet] section keys are role-agnostic and always exported.
|
||||||
_COMMON_KEYS = frozenset({"mode", "disallow-master", "log-directory"})
|
_COMMON_KEYS = frozenset({"mode", "disallow-master", "log-directory"})
|
||||||
|
|
||||||
|
|
||||||
|
# Explicit INI-key → env-var mapping for the domain sections. Kept
|
||||||
|
# separate from the role-specific [agent] / [master] loader so the
|
||||||
|
# admin-facing section layout ([web] admin-user) can diverge from the
|
||||||
|
# env-var name (DECNET_ADMIN_USER) without breaking any consumer.
|
||||||
|
_DOMAIN_MAP: dict[str, dict[str, str]] = {
|
||||||
|
"api": {
|
||||||
|
"host": "DECNET_API_HOST",
|
||||||
|
"port": "DECNET_API_PORT",
|
||||||
|
},
|
||||||
|
"web": {
|
||||||
|
"host": "DECNET_WEB_HOST",
|
||||||
|
"port": "DECNET_WEB_PORT",
|
||||||
|
"admin-user": "DECNET_ADMIN_USER",
|
||||||
|
"cors-origins": "DECNET_CORS_ORIGINS",
|
||||||
|
},
|
||||||
|
"database": {
|
||||||
|
"type": "DECNET_DB_TYPE",
|
||||||
|
"url": "DECNET_DB_URL",
|
||||||
|
"host": "DECNET_DB_HOST",
|
||||||
|
"port": "DECNET_DB_PORT",
|
||||||
|
"name": "DECNET_DB_NAME",
|
||||||
|
"user": "DECNET_DB_USER",
|
||||||
|
},
|
||||||
|
"bus": {
|
||||||
|
"enabled": "DECNET_BUS_ENABLED",
|
||||||
|
"type": "DECNET_BUS_TYPE",
|
||||||
|
"socket": "DECNET_BUS_SOCKET",
|
||||||
|
"group": "DECNET_BUS_GROUP",
|
||||||
|
},
|
||||||
|
"swarm": {
|
||||||
|
"master-host": "DECNET_SWARM_MASTER_HOST",
|
||||||
|
"syslog-port": "DECNET_SWARM_SYSLOG_PORT",
|
||||||
|
"swarmctl-port": "DECNET_SWARMCTL_PORT",
|
||||||
|
},
|
||||||
|
"logging": {
|
||||||
|
"system-log": "DECNET_SYSTEM_LOGS",
|
||||||
|
"ingest-log": "DECNET_INGEST_LOG_FILE",
|
||||||
|
"agent-log": "DECNET_AGENT_LOG_FILE",
|
||||||
|
},
|
||||||
|
"ingester": {
|
||||||
|
"batch-size": "DECNET_BATCH_SIZE",
|
||||||
|
"batch-max-wait-ms": "DECNET_BATCH_MAX_WAIT_MS",
|
||||||
|
},
|
||||||
|
"tracing": {
|
||||||
|
"enabled": "DECNET_DEVELOPER_TRACING",
|
||||||
|
"otel-endpoint": "DECNET_OTEL_ENDPOINT",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def _key_to_env(key: str) -> str:
|
def _key_to_env(key: str) -> str:
|
||||||
return "DECNET_" + key.replace("-", "_").upper()
|
return "DECNET_" + key.replace("-", "_").upper()
|
||||||
|
|
||||||
@@ -66,7 +158,14 @@ def load_ini_config(path: Optional[Path] = None) -> Optional[Path]:
|
|||||||
if not path.is_file():
|
if not path.is_file():
|
||||||
return None
|
return None
|
||||||
|
|
||||||
parser = configparser.ConfigParser()
|
# The docstring at the top of this module advertises inline ``#`` and
|
||||||
|
# ``;`` comments (e.g. ``mode = master # or "agent"``). Python's
|
||||||
|
# ``configparser`` only recognises those when ``inline_comment_prefixes``
|
||||||
|
# is set explicitly — without it, the comment becomes part of the value
|
||||||
|
# and downstream validators reject it ("mode must be 'agent' or 'master',
|
||||||
|
# got 'master # or \"agent\"'"). Match what the docs
|
||||||
|
# promise.
|
||||||
|
parser = configparser.ConfigParser(inline_comment_prefixes=("#", ";"))
|
||||||
parser.read(path)
|
parser.read(path)
|
||||||
|
|
||||||
# [decnet] first — mode/disallow-master/log-directory. These seed the
|
# [decnet] first — mode/disallow-master/log-directory. These seed the
|
||||||
@@ -81,10 +180,29 @@ def load_ini_config(path: Optional[Path] = None) -> Optional[Path]:
|
|||||||
f"decnet.ini: [decnet] mode must be 'agent' or 'master', got '{mode}'"
|
f"decnet.ini: [decnet] mode must be 'agent' or 'master', got '{mode}'"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Role-specific section.
|
# Role-specific section — kebab→SCREAMING_SNAKE auto-translation.
|
||||||
|
# Kept for backwards compatibility with the enroll-bundle [agent]
|
||||||
|
# writer (decnet/web/router/swarm_mgmt/api_enroll_bundle.py).
|
||||||
section = mode
|
section = mode
|
||||||
if parser.has_section(section):
|
if parser.has_section(section):
|
||||||
for key, value in parser.items(section):
|
for key, value in parser.items(section):
|
||||||
os.environ.setdefault(_key_to_env(key), value)
|
os.environ.setdefault(_key_to_env(key), value)
|
||||||
|
|
||||||
|
# Domain sections — explicit key map; loaded regardless of mode.
|
||||||
|
# Unknown keys inside a known section log a WARNING so operator
|
||||||
|
# typos are visible; unknown sections are silently ignored (so the
|
||||||
|
# file format can grow without breaking older loaders).
|
||||||
|
for section_name, key_map in _DOMAIN_MAP.items():
|
||||||
|
if not parser.has_section(section_name):
|
||||||
|
continue
|
||||||
|
for key, value in parser.items(section_name):
|
||||||
|
env_name = key_map.get(key)
|
||||||
|
if env_name is None:
|
||||||
|
log.warning(
|
||||||
|
"decnet.ini: unknown key [%s] %s — ignored",
|
||||||
|
section_name, key,
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
os.environ.setdefault(env_name, value)
|
||||||
|
|
||||||
return path
|
return path
|
||||||
|
|||||||
@@ -24,26 +24,68 @@ from __future__ import annotations
|
|||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
|
|
||||||
from decnet.correlation.graph import AttackerTraversal, TraversalHop
|
from decnet.correlation.graph import AttackerTraversal, MutationMarker, TraversalHop
|
||||||
from decnet.correlation.parser import LogEvent, parse_line
|
from decnet.correlation.parser import LogEvent, parse_line
|
||||||
from decnet.logging.syslog_formatter import (
|
from decnet.logging.syslog_formatter import (
|
||||||
SEVERITY_WARNING,
|
SEVERITY_WARNING,
|
||||||
format_rfc5424,
|
format_rfc5424,
|
||||||
)
|
)
|
||||||
|
from decnet.logging import get_logger
|
||||||
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
|
from decnet.telemetry import traced as _traced, get_tracer as _get_tracer
|
||||||
|
|
||||||
|
log = get_logger("correlation.engine")
|
||||||
|
|
||||||
|
|
||||||
|
# Decky-name prefix reserved for DECNET's own infrastructure workers
|
||||||
|
# that log attacker IPs without representing actual decoy hops. The
|
||||||
|
# prober is the canonical example: when it fingerprints an attacker's
|
||||||
|
# externally-exposed services, it writes events with
|
||||||
|
# ``hostname=decnet-prober`` and ``target_ip=<attacker IP>``. The parser
|
||||||
|
# pulls ``target_ip`` into ``attacker_ip`` so the prober event is
|
||||||
|
# co-indexed with that attacker — but it's outbound recon from the
|
||||||
|
# master, not the attacker traversing into another decoy. Excluding the
|
||||||
|
# whole ``decnet-*`` namespace from distinct-decky counts and hop paths
|
||||||
|
# avoids labelling every fingerprinted attacker as a "traversal."
|
||||||
|
_INTERNAL_DECKY_PREFIX = "decnet-"
|
||||||
|
|
||||||
|
|
||||||
|
def _is_internal_decky(name: str) -> bool:
|
||||||
|
"""True if ``name`` is a DECNET internal worker (prober, etc.) — not a real decoy."""
|
||||||
|
return bool(name) and name.startswith(_INTERNAL_DECKY_PREFIX)
|
||||||
|
|
||||||
|
|
||||||
|
# ``publish_fn(event_type, payload_dict)``. Sync to avoid rippling
|
||||||
|
# ``async`` through every call site of :meth:`CorrelationEngine.ingest`;
|
||||||
|
# the caller wraps bus-publish via
|
||||||
|
# :func:`decnet.bus.publish.make_thread_safe_publisher`, which is safe to
|
||||||
|
# invoke from any thread including the event-loop thread.
|
||||||
|
CorrelationPublishFn = Callable[[str, dict[str, Any]], None]
|
||||||
|
|
||||||
|
|
||||||
class CorrelationEngine:
|
class CorrelationEngine:
|
||||||
def __init__(self) -> None:
|
def __init__(
|
||||||
|
self,
|
||||||
|
*,
|
||||||
|
publish_fn: CorrelationPublishFn | None = None,
|
||||||
|
) -> None:
|
||||||
# attacker_ip → chronological list of events (only events with an IP)
|
# attacker_ip → chronological list of events (only events with an IP)
|
||||||
self._events: dict[str, list[LogEvent]] = defaultdict(list)
|
self._events: dict[str, list[LogEvent]] = defaultdict(list)
|
||||||
|
# decky_name → chronological list of mutation events. Sibling
|
||||||
|
# index to ``_events``; traversals() joins them by time window.
|
||||||
|
self._mutations: dict[str, list[LogEvent]] = defaultdict(list)
|
||||||
# Total lines parsed (including no-IP and non-DECNET lines)
|
# Total lines parsed (including no-IP and non-DECNET lines)
|
||||||
self.lines_parsed: int = 0
|
self.lines_parsed: int = 0
|
||||||
# Total events indexed (had an attacker_ip)
|
# Total events indexed (had an attacker_ip)
|
||||||
self.events_indexed: int = 0
|
self.events_indexed: int = 0
|
||||||
|
# Total mutation events indexed (kind="mutation")
|
||||||
|
self.mutations_indexed: int = 0
|
||||||
|
# Optional bus hook — invoked on first-sighting of an attacker IP.
|
||||||
|
# Always fires exactly once per IP for the lifetime of the engine.
|
||||||
|
self._publish_fn = publish_fn
|
||||||
|
|
||||||
# ------------------------------------------------------------------ #
|
# ------------------------------------------------------------------ #
|
||||||
# Ingestion #
|
# Ingestion #
|
||||||
@@ -60,9 +102,28 @@ class CorrelationEngine:
|
|||||||
event = parse_line(line)
|
event = parse_line(line)
|
||||||
if event is None:
|
if event is None:
|
||||||
return None
|
return None
|
||||||
|
if event.kind == "mutation":
|
||||||
|
self._mutations[event.decky].append(event)
|
||||||
|
self.mutations_indexed += 1
|
||||||
|
return event
|
||||||
if event.attacker_ip:
|
if event.attacker_ip:
|
||||||
|
first_sighting = event.attacker_ip not in self._events
|
||||||
self._events[event.attacker_ip].append(event)
|
self._events[event.attacker_ip].append(event)
|
||||||
self.events_indexed += 1
|
self.events_indexed += 1
|
||||||
|
if first_sighting and self._publish_fn is not None:
|
||||||
|
try:
|
||||||
|
self._publish_fn(
|
||||||
|
"observed",
|
||||||
|
{
|
||||||
|
"attacker_ip": event.attacker_ip,
|
||||||
|
"decky": event.decky,
|
||||||
|
"service": event.service,
|
||||||
|
"event_type": event.event_type,
|
||||||
|
"first_seen": event.timestamp.isoformat(),
|
||||||
|
},
|
||||||
|
)
|
||||||
|
except Exception as exc:
|
||||||
|
log.warning("correlation publish hook failed: %s", exc)
|
||||||
return event
|
return event
|
||||||
|
|
||||||
@_traced("correlation.ingest_file")
|
@_traced("correlation.ingest_file")
|
||||||
@@ -94,14 +155,36 @@ class CorrelationEngine:
|
|||||||
"""
|
"""
|
||||||
result: list[AttackerTraversal] = []
|
result: list[AttackerTraversal] = []
|
||||||
for ip, events in self._events.items():
|
for ip, events in self._events.items():
|
||||||
if len({e.decky for e in events}) < min_deckies:
|
# Exclude internal-infrastructure events (e.g. prober) from
|
||||||
|
# distinct-decky counting and the hop list. They aren't
|
||||||
|
# attacker movement — they're outbound recon co-indexed by
|
||||||
|
# attacker IP. Without this filter every fingerprinted
|
||||||
|
# attacker shows up as a 2-decky "traversal" with a bogus
|
||||||
|
# ``dmz-gateway → decnet-prober`` path.
|
||||||
|
decoy_events = [e for e in events if not _is_internal_decky(e.decky)]
|
||||||
|
if len({e.decky for e in decoy_events}) < min_deckies:
|
||||||
continue
|
continue
|
||||||
hops = sorted(
|
hops = sorted(
|
||||||
(TraversalHop(e.timestamp, e.decky, e.service, e.event_type)
|
(TraversalHop(e.timestamp, e.decky, e.service, e.event_type)
|
||||||
for e in events),
|
for e in decoy_events),
|
||||||
key=lambda h: h.timestamp,
|
key=lambda h: h.timestamp,
|
||||||
)
|
)
|
||||||
result.append(AttackerTraversal(attacker_ip=ip, hops=hops))
|
# Per-attacker mutation markers: any mutation on a touched
|
||||||
|
# decky between first_seen and last_seen. Window is
|
||||||
|
# inclusive on both ends so a creation-at-T0 + first-contact-
|
||||||
|
# at-T0 race still attaches the marker.
|
||||||
|
first_ts = hops[0].timestamp
|
||||||
|
last_ts = hops[-1].timestamp
|
||||||
|
touched = {h.decky for h in hops}
|
||||||
|
markers: list[MutationMarker] = []
|
||||||
|
for decky in touched:
|
||||||
|
for mev in self._mutations.get(decky, ()):
|
||||||
|
if first_ts <= mev.timestamp <= last_ts:
|
||||||
|
markers.append(_marker_from_event(mev))
|
||||||
|
markers.sort(key=lambda m: m.timestamp)
|
||||||
|
result.append(AttackerTraversal(
|
||||||
|
attacker_ip=ip, hops=hops, mutations_during=markers,
|
||||||
|
))
|
||||||
return sorted(result, key=lambda t: t.first_seen)
|
return sorted(result, key=lambda t: t.first_seen)
|
||||||
|
|
||||||
def all_attackers(self) -> dict[str, int]:
|
def all_attackers(self) -> dict[str, int]:
|
||||||
@@ -156,6 +239,62 @@ class CorrelationEngine:
|
|||||||
"traversals": [t.to_dict() for t in self.traversals(min_deckies)],
|
"traversals": [t.to_dict() for t in self.traversals(min_deckies)],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# Credential reuse #
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
async def correlate_credential_reuse(
    self,
    repo: Any,
    min_targets: int = 2,
) -> list[dict[str, Any]]:
    """Persist cross-target credential reuse and report what changed.

    Candidate groups come from
    ``repo.find_credential_reuse_candidates(min_targets)``. Each group
    carries ``secret_sha256``, ``secret_kind``, ``principal`` and a
    ``credentials`` list; every credential in the group is pushed through
    ``repo.upsert_credential_reuse`` with the group's key, so all of them
    land on the same ``CredentialReuse`` row.

    Args:
        repo: Repository object exposing the two coroutines named above.
        min_targets: Threshold forwarded unchanged to the candidate query.

    Returns:
        At most one row per group: the last non-``None`` upsert result,
        with ``inserted`` / ``changed`` overwritten by the group-wide
        aggregate of those flags. Groups where no upsert reported either
        flag are omitted, so the caller can publish one
        ``credential.reuse.detected`` per returned row without re-querying.
    """
    findings: list[dict[str, Any]] = []
    for group in await repo.find_credential_reuse_candidates(min_targets):
        # All credentials of a group hit the same reuse row, so several
        # upserts may flip the flags along the way. Track the aggregate and
        # keep only the final row; publishing every intermediate result
        # would emit N partial events with stale target counts.
        latest: dict[str, Any] | None = None
        inserted_any = False
        changed_any = False
        for cred in group["credentials"]:
            outcome = await repo.upsert_credential_reuse(
                secret_sha256=group["secret_sha256"],
                secret_kind=group["secret_kind"],
                principal=group["principal"],
                attacker_uuid=cred.get("attacker_uuid"),
                attacker_ip=cred["attacker_ip"],
                decky=cred["decky_name"],
                service=cred["service"],
                # Missing / falsy attempt counts are treated as one attempt.
                attempt_count=int(cred.get("attempt_count") or 1),
            )
            if outcome is None:
                continue
            latest = outcome
            if not inserted_any:
                inserted_any = bool(outcome.get("inserted"))
            if not changed_any:
                changed_any = bool(outcome.get("changed"))

        if latest is None or not (inserted_any or changed_any):
            continue
        latest["inserted"] = inserted_any
        latest["changed"] = changed_any
        findings.append(latest)
    return findings
|
||||||
|
|
||||||
@_traced("correlation.traversal_syslog_lines")
|
@_traced("correlation.traversal_syslog_lines")
|
||||||
def traversal_syslog_lines(self, min_deckies: int = 2) -> list[str]:
|
def traversal_syslog_lines(self, min_deckies: int = 2) -> list[str]:
|
||||||
"""
|
"""
|
||||||
@@ -187,6 +326,26 @@ class CorrelationEngine:
|
|||||||
# Helpers #
|
# Helpers #
|
||||||
# ------------------------------------------------------------------ #
|
# ------------------------------------------------------------------ #
|
||||||
|
|
||||||
|
def _marker_from_event(event: LogEvent) -> MutationMarker:
    """Convert a parsed ``decky_mutated`` log event into a :class:`MutationMarker`.

    ``old_services`` / ``new_services`` arrive in ``event.fields`` as
    comma-joined strings (RFC 5424 SD params have no native list type), so
    they are split back into lists here. Empty segments are discarded,
    which means a missing or empty field yields an empty list. A missing
    ``trigger`` field becomes the empty string.
    """
    params = event.fields

    def _services(key: str) -> list[str]:
        # filter(None, ...) drops the empty strings produced by "" or ",,".
        return list(filter(None, params.get(key, "").split(",")))

    return MutationMarker(
        timestamp=event.timestamp,
        decky=event.decky,
        old_services=_services("old_services"),
        new_services=_services("new_services"),
        trigger=params.get("trigger", ""),
    )
|
||||||
|
|
||||||
|
|
||||||
def _fmt_duration(seconds: float) -> str:
|
def _fmt_duration(seconds: float) -> str:
|
||||||
if seconds < 60:
|
if seconds < 60:
|
||||||
return f"{seconds:.0f}s"
|
return f"{seconds:.0f}s"
|
||||||
|
|||||||
113
decnet/correlation/event_kinds.py
Normal file
113
decnet/correlation/event_kinds.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
"""Classify RFC 5424 event_type strings as interaction vs. scan vs. noise.
|
||||||
|
|
||||||
|
Used by:
|
||||||
|
- The attacker detail endpoint to split services into "scanned" and
|
||||||
|
"interacted with" buckets, distinguishing port scanners from
|
||||||
|
attackers who actually engaged.
|
||||||
|
- The profiler worker to filter command-family events when extracting
|
||||||
|
executed-command history.
|
||||||
|
|
||||||
|
Classification is conservative: an unknown event_type defaults to
|
||||||
|
``scan`` rather than ``interaction``. That way a new service template
|
||||||
|
emitting a fresh verb shows up as "scanned" on the dashboard — visible
|
||||||
|
but not over-credited. Adding it to ``INTERACTION_EVENT_TYPES`` is
|
||||||
|
always a deliberate promotion.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
|
# Event types showing the attacker went past reconnaissance: ran a
# command, sent mail, moved a file, used a pub/sub topic. One such event
# from an attacker is enough to mark the service as "interacted with".
INTERACTION_EVENT_TYPES: frozenset[str] = frozenset({
    # Shell / command family. This vocabulary was lifted from the
    # profiler's command-extraction set; this module now owns it.
    "command",
    "exec",
    "query",
    "input",
    "shell_input",
    "execute",
    "run",
    "sql_query",
    "redis_command",
    "ldap_search",
    # SMTP engagement: from MAIL FROM / RCPT TO onwards the attacker is
    # trying to send mail rather than banner-grabbing. message_accepted
    # marks the DATA commit.
    "mail_from",
    "rcpt_to",
    "rcpt_denied",
    "message_accepted",
    # File / payload activity.
    "file_captured",
    "upload",
    "download_attempt",
    "retr",  # FTP retrieve
    # Pub/sub operational use (as opposed to merely connecting).
    "publish",
    "subscribe",
    # A recorded TTY session always counts: sessrec only writes when
    # there was PTY input.
    "session_recorded",
})


# DECNET-internal or protocol-framework noise rather than attacker-caused
# signal. Services seen only through these are dropped from both buckets.
NOISE_EVENT_TYPES: frozenset[str] = frozenset({
    "startup",
    "shutdown",
    "config_error",
    "parse_error",
    "unknown_packet",
    "unknown_opcode",
    "unknown_command",
    "protocol_error",
})


EventKind = Literal["interaction", "scan", "noise"]


def classify_event(event_type: str) -> EventKind:
    """Map one ``event_type`` string to its kind label.

    Anything not listed in ``INTERACTION_EVENT_TYPES`` or
    ``NOISE_EVENT_TYPES`` is reported as ``"scan"``.
    """
    if event_type in INTERACTION_EVENT_TYPES:
        return "interaction"
    return "noise" if event_type in NOISE_EVENT_TYPES else "scan"


def bucket_services(
    pairs: list[tuple[str, str]],
) -> dict[str, list[str]]:
    """Split distinct service names into "interacted" and "scanned" lists.

    *pairs* holds ``(service, event_type)`` tuples, e.g. the rows of a
    ``SELECT DISTINCT service, event_type`` query. Each service keeps the
    strongest kind seen across its events (interaction > scan > noise).

    Returns:
        ``{"interacted": [...], "scanned": [...]}`` with both lists sorted.
        A service appears in at most one list; services whose events are
        all noise appear in neither.
    """
    rank = {"noise": 0, "scan": 1, "interaction": 2}
    strongest: dict[str, EventKind] = {}
    for service, event_type in pairs:
        kind = classify_event(event_type)
        held = strongest.get(service)
        # Only ever upgrade: a later, weaker event never demotes a service.
        if held is None or rank[kind] > rank[held]:
            strongest[service] = kind

    def _with_kind(wanted: str) -> list[str]:
        return sorted(name for name, kind in strongest.items() if kind == wanted)

    return {"interacted": _with_kind("interaction"), "scanned": _with_kind("scan")}
|
||||||
@@ -8,10 +8,29 @@ by reading the unique decky sequence from the hop list.
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MutationMarker:
    """One substrate change on a decky, recorded alongside an attacker's hops.

    The mutator (or deploy/teardown) emits these and the correlation engine
    attaches them to a traversal, so that ``AttackerTraversal.to_dict()``
    can interleave them chronologically with the hops. An interaction with
    ``decky-03@T5``, a mutation at ``T6`` and another interaction at ``T7``
    then reads as a substrate transition mid-session instead of a silent
    discontinuity.
    """

    # When the mutation happened.
    timestamp: datetime
    # Name of the decky that was mutated.
    decky: str
    # Service names before the mutation.
    old_services: list[str]
    # Service names after the mutation.
    new_services: list[str]
    # What caused it: creation | retirement | scheduled | operator | …
    trigger: str
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class TraversalHop:
|
class TraversalHop:
|
||||||
"""A single event in an attacker's traversal through the deception network."""
|
"""A single event in an attacker's traversal through the deception network."""
|
||||||
@@ -31,6 +50,10 @@ class AttackerTraversal:
|
|||||||
|
|
||||||
attacker_ip: str
|
attacker_ip: str
|
||||||
hops: list[TraversalHop] # chronologically sorted
|
hops: list[TraversalHop] # chronologically sorted
|
||||||
|
# Substrate-change markers on deckies this attacker touched, bounded
|
||||||
|
# by first_seen/last_seen. Empty for legacy attacker-only ingest;
|
||||||
|
# populated once mutation events flow through the engine.
|
||||||
|
mutations_during: list[MutationMarker] = field(default_factory=list)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def first_seen(self) -> datetime:
|
def first_seen(self) -> datetime:
|
||||||
@@ -62,6 +85,35 @@ class AttackerTraversal:
|
|||||||
"""Human-readable traversal path: decky-01 → decky-03 → decky-07"""
|
"""Human-readable traversal path: decky-01 → decky-03 → decky-07"""
|
||||||
return " → ".join(self.deckies)
|
return " → ".join(self.deckies)
|
||||||
|
|
||||||
|
def timeline(self) -> list[dict]:
    """Return hops and mutation markers merged into one chronological list.

    Every entry is a plain dict with a ``kind`` discriminant (``"hop"`` or
    ``"mutation"``) and an ISO-formatted ``timestamp`` so JSON consumers
    can render the two kinds differently. No filtering happens here: all
    of ``self.hops`` and ``self.mutations_during`` are included. The sort
    is stable, so on equal timestamps hops come before mutations.
    """
    hop_entries = [
        (
            hop.timestamp,
            {
                "kind": "hop",
                "timestamp": hop.timestamp.isoformat(),
                "decky": hop.decky,
                "service": hop.service,
                "event_type": hop.event_type,
            },
        )
        for hop in self.hops
    ]
    mutation_entries = [
        (
            marker.timestamp,
            {
                "kind": "mutation",
                "timestamp": marker.timestamp.isoformat(),
                "decky": marker.decky,
                "old_services": marker.old_services,
                "new_services": marker.new_services,
                "trigger": marker.trigger,
            },
        )
        for marker in self.mutations_during
    ]
    # Sort on the datetime only; the dict payloads are never compared.
    ordered = sorted(hop_entries + mutation_entries, key=lambda pair: pair[0])
    return [payload for _, payload in ordered]
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"attacker_ip": self.attacker_ip,
|
"attacker_ip": self.attacker_ip,
|
||||||
@@ -81,4 +133,15 @@ class AttackerTraversal:
|
|||||||
}
|
}
|
||||||
for h in self.hops
|
for h in self.hops
|
||||||
],
|
],
|
||||||
|
"mutations_during": [
|
||||||
|
{
|
||||||
|
"timestamp": m.timestamp.isoformat(),
|
||||||
|
"decky": m.decky,
|
||||||
|
"old_services": m.old_services,
|
||||||
|
"new_services": m.new_services,
|
||||||
|
"trigger": m.trigger,
|
||||||
|
}
|
||||||
|
for m in self.mutations_during
|
||||||
|
],
|
||||||
|
"timeline": self.timeline(),
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,8 +17,9 @@ The attacker IP may appear under several field names depending on service:
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass, field
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from typing import Literal
|
||||||
|
|
||||||
# RFC 5424 line structure
|
# RFC 5424 line structure
|
||||||
_RFC5424_RE = re.compile(
|
_RFC5424_RE = re.compile(
|
||||||
@@ -40,6 +41,23 @@ _PARAM_RE = re.compile(r'(\w+)="((?:[^"\\]|\\.)*)"')
|
|||||||
# Field names to probe for attacker IP, in priority order
|
# Field names to probe for attacker IP, in priority order
|
||||||
_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip")
|
_IP_FIELDS = ("src_ip", "src", "client_ip", "remote_ip", "remote_addr", "target_ip", "ip")
|
||||||
|
|
||||||
|
# Native syslog producers (sshd, pam_unix routed through rsyslog) emit
|
||||||
|
# free prose with no SD block. Pull the remote address out of idiomatic
|
||||||
|
# anchors first ("from <ip>", "rhost=<ip>"), then fall back to the first
|
||||||
|
# IPv4 in the line. Anchored matches keep us from picking the local
|
||||||
|
# listener in "Connection from X port Y on Z port 22".
|
||||||
|
_IPV4 = r"\d{1,3}(?:\.\d{1,3}){3}"
|
||||||
|
_IPV6 = r"[0-9a-fA-F:]+:[0-9a-fA-F:]+"
|
||||||
|
_IP_RE = rf"(?:{_IPV4}|{_IPV6})"
|
||||||
|
_MSG_IP_ANCHORED_RE = re.compile(
|
||||||
|
rf"\b(?:from|rhost[:=]|client[:=]|src[:=])\s*({_IP_RE})",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
_MSG_IP_BARE_RE = re.compile(rf"\b({_IPV4})\b")
|
||||||
|
|
||||||
|
|
||||||
|
EventKind = Literal["attacker", "mutation"]
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class LogEvent:
|
class LogEvent:
|
||||||
@@ -52,6 +70,12 @@ class LogEvent:
|
|||||||
attacker_ip: str | None # extracted from SD params; None if not present
|
attacker_ip: str | None # extracted from SD params; None if not present
|
||||||
fields: dict[str, str] # all structured data params
|
fields: dict[str, str] # all structured data params
|
||||||
raw: str # original log line (stripped)
|
raw: str # original log line (stripped)
|
||||||
|
# ``attacker`` = service-emitted event keyed on a source IP (the
|
||||||
|
# existing correlation input). ``mutation`` = ``mutator`` worker
|
||||||
|
# event — same RFC 5424 wire format but routed into a separate
|
||||||
|
# per-decky index so substrate transitions can be interleaved into
|
||||||
|
# attacker traversals without polluting the per-IP event stream.
|
||||||
|
kind: EventKind = field(default="attacker")
|
||||||
|
|
||||||
|
|
||||||
def _parse_sd_params(sd_rest: str) -> dict[str, str]:
|
def _parse_sd_params(sd_rest: str) -> dict[str, str]:
|
||||||
@@ -66,10 +90,17 @@ def _parse_sd_params(sd_rest: str) -> dict[str, str]:
|
|||||||
return params
|
return params
|
||||||
|
|
||||||
|
|
||||||
def _extract_attacker_ip(fields: dict[str, str], msg: str = "") -> str | None:
    """Return the attacker address for one log event, or ``None``.

    Lookup order:

    1. The first name from ``_IP_FIELDS`` present in *fields*; its value is
       returned as-is.
    2. The first *valid* address captured by ``_MSG_IP_ANCHORED_RE`` in
       *msg* (e.g. ``from <ip>``, ``rhost=<ip>``).
    3. The first *valid* address captured by ``_MSG_IP_BARE_RE`` in *msg*.

    Candidates taken from the free-text message are checked with
    :func:`ipaddress.ip_address`. The message patterns are deliberately
    loose, so without this check a MAC address or clock time following
    ``from`` (``from 00:11:22:33:44:55``), or dotted numbers with
    out-of-range octets, would be indexed as an attacker IP. Rejected
    candidates are skipped and the search continues with the next match.

    Args:
        fields: Structured-data params of the event.
        msg: Free-text message part of the line; may be empty.
    """
    import ipaddress  # local import keeps this helper self-contained

    for fname in _IP_FIELDS:
        if fname in fields:
            return fields[fname]
    if not msg:
        return None

    def _normalise(candidate: str) -> str | None:
        # The IPv6 pattern can swallow a trailing ":" (as in "<addr>: text"),
        # so also try the candidate with trailing colons removed.
        for text in (candidate, candidate.rstrip(":")):
            try:
                ipaddress.ip_address(text)
            except ValueError:
                continue
            return text
        return None

    # Anchored matches first: they avoid picking the local listener in
    # lines like "Connection from X port Y on Z port 22".
    for pattern in (_MSG_IP_ANCHORED_RE, _MSG_IP_BARE_RE):
        for match in pattern.finditer(msg):
            address = _normalise(match.group(1))
            if address is not None:
                return address
    return None
|
||||||
|
|
||||||
|
|
||||||
@@ -99,7 +130,20 @@ def parse_line(line: str) -> LogEvent | None:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
fields = _parse_sd_params(sd_rest)
|
fields = _parse_sd_params(sd_rest)
|
||||||
attacker_ip = _extract_attacker_ip(fields)
|
if sd_rest.startswith("-"):
|
||||||
|
msg = sd_rest[1:].lstrip()
|
||||||
|
else:
|
||||||
|
tail = re.search(r'\]\s+(.+)$', sd_rest)
|
||||||
|
msg = tail.group(1).strip() if tail else ""
|
||||||
|
attacker_ip = _extract_attacker_ip(fields, msg)
|
||||||
|
|
||||||
|
# Mutator-emitted transitions arrive on the same ingest stream but
|
||||||
|
# belong in the substrate-state index, not the per-IP attacker one.
|
||||||
|
kind: EventKind = (
|
||||||
|
"mutation"
|
||||||
|
if service == "mutator" and event_type == "decky_mutated"
|
||||||
|
else "attacker"
|
||||||
|
)
|
||||||
|
|
||||||
return LogEvent(
|
return LogEvent(
|
||||||
timestamp=timestamp,
|
timestamp=timestamp,
|
||||||
@@ -109,4 +153,5 @@ def parse_line(line: str) -> LogEvent | None:
|
|||||||
attacker_ip=attacker_ip,
|
attacker_ip=attacker_ip,
|
||||||
fields=fields,
|
fields=fields,
|
||||||
raw=line,
|
raw=line,
|
||||||
|
kind=kind,
|
||||||
)
|
)
|
||||||
|
|||||||
153
decnet/correlation/reuse_worker.py
Normal file
153
decnet/correlation/reuse_worker.py
Normal file
@@ -0,0 +1,153 @@
|
|||||||
|
"""Long-running credential-reuse correlator.
|
||||||
|
|
||||||
|
Loops :meth:`CorrelationEngine.correlate_credential_reuse` over the
|
||||||
|
credentials table and publishes ``credential.reuse.detected`` for every
|
||||||
|
new or grown ``CredentialReuse`` row. Mirrors the mutator's bus-wake +
|
||||||
|
slow-tick pattern from :mod:`decnet.mutator.engine`: woken on
|
||||||
|
``credential.captured`` and ``attacker.observed`` for sub-second latency,
|
||||||
|
falls back to a 60s poll if the bus is unavailable.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import contextlib
|
||||||
|
|
||||||
|
from decnet.bus import topics as _topics
|
||||||
|
from decnet.bus.base import BaseBus
|
||||||
|
from decnet.bus.factory import get_bus
|
||||||
|
from decnet.bus.publish import (
|
||||||
|
publish_safely,
|
||||||
|
run_control_listener_signal as _run_control_listener_signal,
|
||||||
|
run_health_heartbeat as _run_health_heartbeat,
|
||||||
|
)
|
||||||
|
from decnet.correlation.engine import CorrelationEngine
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.web.db.repository import BaseRepository
|
||||||
|
|
||||||
|
log = get_logger("correlation.reuse_worker")
|
||||||
|
|
||||||
|
_DEFAULT_POLL_SECS = 60.0
|
||||||
|
_DEFAULT_MIN_TARGETS = 2
|
||||||
|
|
||||||
|
|
||||||
|
async def run_reuse_loop(
    repo: BaseRepository,
    *,
    poll_interval_secs: float = _DEFAULT_POLL_SECS,
    min_targets: int = _DEFAULT_MIN_TARGETS,
    shutdown: asyncio.Event | None = None,
) -> None:
    """Run the credential-reuse correlator until stopped or cancelled.

    Each tick calls :meth:`CorrelationEngine.correlate_credential_reuse`
    and publishes one ``credential.reuse.detected`` event per returned
    row. Between ticks the loop sleeps until a bus wake-up arrives, the
    *shutdown* event is set, or *poll_interval_secs* elapse, whichever
    comes first.

    Args:
        repo: Repository handed to the correlation engine.
        poll_interval_secs: Maximum idle time between two ticks.
        min_targets: Minimum number of distinct ``(decky, service)`` pairs
            a secret must touch before it is persisted as a reuse finding.
        shutdown: Optional external stop signal. Setting it ends the loop
            promptly, without waiting out the poll interval.

    The loop also exits cleanly on ``CancelledError`` and
    ``KeyboardInterrupt``. If the bus cannot be set up the correlator runs
    in poll-only mode. A failing tick is logged and treated as "no
    results"; it does not stop the loop.
    """
    log.info(
        "reuse correlator started poll_interval_secs=%s min_targets=%s",
        poll_interval_secs, min_targets,
    )

    bus: BaseBus | None = None
    wake = asyncio.Event()
    wake_tasks: list[asyncio.Task] = []
    heartbeat_task: asyncio.Task | None = None
    try:
        candidate = get_bus(client_name="reuse-correlator")
        await candidate.connect()
        # Only adopt the bus once connect() succeeded, so the cleanup
        # below never closes a half-initialised client.
        bus = candidate
        wake_tasks.append(asyncio.create_task(
            _wake_on(bus, wake, _topics.credential(_topics.CREDENTIAL_CAPTURED)),
        ))
        wake_tasks.append(asyncio.create_task(
            _wake_on(bus, wake, _topics.attacker(_topics.ATTACKER_OBSERVED)),
        ))
        heartbeat_task = asyncio.create_task(
            _run_health_heartbeat(bus, "reuse-correlator"),
        )
        wake_tasks.append(asyncio.create_task(
            _run_control_listener_signal(bus, "reuse-correlator"),
        ))
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "reuse correlator: bus unavailable, running in poll-only mode: %s",
            exc,
        )

    engine = CorrelationEngine()
    if shutdown is None:
        shutdown = asyncio.Event()

    try:
        while not shutdown.is_set():
            try:
                results = await engine.correlate_credential_reuse(
                    repo, min_targets=min_targets,
                )
            except Exception:  # noqa: BLE001
                log.exception("reuse correlator: tick failed")
                results = []

            for row in results:
                await publish_safely(
                    bus,
                    _topics.credential(_topics.CREDENTIAL_REUSE_DETECTED),
                    {
                        "id": row.get("id"),
                        "secret_kind": row.get("secret_kind"),
                        "target_count": row.get("target_count"),
                        "attacker_uuids": row.get("attacker_uuids"),
                        "attacker_ips": row.get("attacker_ips"),
                        "deckies": row.get("deckies"),
                        "services": row.get("services"),
                    },
                    event_type=_topics.CREDENTIAL_REUSE_DETECTED,
                )

            # Waiting on ``wake`` alone would leave a shutdown request
            # unnoticed for up to a full poll interval.
            await _sleep_until_wake_or_stop(
                wake, shutdown, float(poll_interval_secs),
            )
            # Clear after waking: events that arrive during the next tick
            # set the flag again and make the following wait return at once.
            wake.clear()
    except (asyncio.CancelledError, KeyboardInterrupt):
        log.info("reuse correlator stopped")
    finally:
        for t in wake_tasks:
            t.cancel()
        if heartbeat_task is not None:
            heartbeat_task.cancel()
        # Await the cancelled tasks so they finish before the bus closes.
        for t in (*wake_tasks, heartbeat_task):
            if t is None:
                continue
            with contextlib.suppress(asyncio.CancelledError, Exception):
                await t
        if bus is not None:
            with contextlib.suppress(Exception):
                await bus.close()


async def _sleep_until_wake_or_stop(
    wake: asyncio.Event,
    shutdown: asyncio.Event,
    timeout: float,
) -> None:
    """Return once *wake* or *shutdown* is set, or after *timeout* seconds."""
    waiters = [
        asyncio.ensure_future(wake.wait()),
        asyncio.ensure_future(shutdown.wait()),
    ]
    try:
        await asyncio.wait(
            waiters, timeout=timeout, return_when=asyncio.FIRST_COMPLETED,
        )
    finally:
        # asyncio.wait() never cancels its awaitables; do it here so no
        # pending waiter outlives this call (also on cancellation).
        for waiter in waiters:
            waiter.cancel()
        await asyncio.gather(*waiters, return_exceptions=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def _wake_on(bus: BaseBus, wake: asyncio.Event, pattern: str) -> None:
    """Set *wake* for every event delivered on the *pattern* subscription.

    Runs until the subscription ends or the task is cancelled. Any other
    error is logged as a warning and ends this coroutine without
    re-raising; the caller's poll interval then acts as the fallback.
    Cancellation is always propagated.
    """
    try:
        subscription = bus.subscribe(pattern)
        async with subscription:
            async for _ in subscription:
                wake.set()
    except asyncio.CancelledError:
        # Never swallow cancellation; the owner awaits this task on stop.
        raise
    except Exception as exc:  # noqa: BLE001
        log.warning(
            "reuse correlator: subscriber for %s died (%s); falling back to poll",
            pattern, exc,
        )
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = ["run_reuse_loop"]
|
||||||
@@ -2,41 +2,67 @@
|
|||||||
Deploy, teardown, and status via Docker SDK + subprocess docker compose.
|
Deploy, teardown, and status via Docker SDK + subprocess docker compose.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
import shutil
|
import shutil
|
||||||
import subprocess # nosec B404
|
import subprocess # nosec B404
|
||||||
import time
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import anyio
|
||||||
import docker
|
import docker
|
||||||
from rich.console import Console
|
from rich.console import Console
|
||||||
from rich.table import Table
|
from rich.table import Table
|
||||||
|
|
||||||
|
from decnet.topology.hashing import canonical_hash
|
||||||
|
|
||||||
from decnet.logging import get_logger
|
from decnet.logging import get_logger
|
||||||
from decnet.telemetry import traced as _traced
|
from decnet.telemetry import traced as _traced
|
||||||
from decnet.config import DecnetConfig, clear_state, load_state, save_state
|
from decnet.config import DecnetConfig, clear_state, load_state, save_state
|
||||||
from decnet.composer import write_compose
|
from decnet.composer import write_compose
|
||||||
from decnet.network import (
|
from decnet.network import (
|
||||||
MACVLAN_NETWORK_NAME,
|
MACVLAN_NETWORK_NAME,
|
||||||
|
create_bridge_network,
|
||||||
create_ipvlan_network,
|
create_ipvlan_network,
|
||||||
create_macvlan_network,
|
create_macvlan_network,
|
||||||
get_host_ip,
|
get_host_ip,
|
||||||
ips_to_range,
|
ips_to_range,
|
||||||
|
remove_bridge_network,
|
||||||
remove_macvlan_network,
|
remove_macvlan_network,
|
||||||
setup_host_ipvlan,
|
setup_host_ipvlan,
|
||||||
setup_host_macvlan,
|
setup_host_macvlan,
|
||||||
teardown_host_ipvlan,
|
teardown_host_ipvlan,
|
||||||
teardown_host_macvlan,
|
teardown_host_macvlan,
|
||||||
)
|
)
|
||||||
|
from decnet.topology.compose import (
|
||||||
|
_network_name as _topology_network_name,
|
||||||
|
write_topology_compose,
|
||||||
|
)
|
||||||
|
from decnet.topology.persistence import hydrate, transition_status
|
||||||
|
from decnet.topology.status import TopologyStatus
|
||||||
|
from decnet.topology.validate import (
|
||||||
|
ValidationError,
|
||||||
|
check_no_host_port_collision,
|
||||||
|
errors as _validation_errors,
|
||||||
|
validate as _validate_topology,
|
||||||
|
)
|
||||||
|
|
||||||
log = get_logger("engine")
|
log = get_logger("engine")
|
||||||
console = Console()
|
console = Console()
|
||||||
COMPOSE_FILE = Path("decnet-compose.yml")
|
COMPOSE_FILE = Path("decnet-compose.yml")
|
||||||
_CANONICAL_LOGGING = Path(__file__).parent.parent / "templates" / "syslog_bridge.py"
|
_CANONICAL_LOGGING = Path(__file__).parent.parent / "templates" / "syslog_bridge.py"
|
||||||
|
_CANONICAL_INSTANCE_SEED = Path(__file__).parent.parent / "templates" / "instance_seed.py"
|
||||||
|
_CANONICAL_SESSREC_DIR = Path(__file__).parent.parent / "templates" / "_shared" / "sessrec"
|
||||||
|
_SESSREC_SERVICES = {"ssh", "telnet"}
|
||||||
|
_CANONICAL_AUTH_HELPER_DIR = Path(__file__).parent.parent / "templates" / "_shared" / "auth-helper"
|
||||||
|
_AUTH_HELPER_SERVICES = {"ssh", "telnet"}
|
||||||
|
_CANONICAL_NTLMSSP = Path(__file__).parent.parent / "templates" / "_shared" / "ntlmssp.py"
|
||||||
|
_NTLMSSP_SERVICES = {"smb", "rdp"}
|
||||||
|
|
||||||
|
|
||||||
def _sync_logging_helper(config: DecnetConfig) -> None:
|
def _sync_logging_helper(config: DecnetConfig) -> None:
|
||||||
"""Copy the canonical syslog_bridge.py into every active template build context."""
|
"""Copy canonical shared helpers into every active template build context."""
|
||||||
from decnet.services.registry import get_service
|
from decnet.services.registry import get_service
|
||||||
|
shared_files = [_CANONICAL_LOGGING, _CANONICAL_INSTANCE_SEED]
|
||||||
seen: set[Path] = set()
|
seen: set[Path] = set()
|
||||||
for decky in config.deckies:
|
for decky in config.deckies:
|
||||||
for svc_name in decky.services:
|
for svc_name in decky.services:
|
||||||
@@ -47,9 +73,94 @@ def _sync_logging_helper(config: DecnetConfig) -> None:
|
|||||||
if ctx is None or ctx in seen:
|
if ctx is None or ctx in seen:
|
||||||
continue
|
continue
|
||||||
seen.add(ctx)
|
seen.add(ctx)
|
||||||
dest = ctx / "syslog_bridge.py"
|
for src in shared_files:
|
||||||
if not dest.exists() or dest.read_bytes() != _CANONICAL_LOGGING.read_bytes():
|
dest = ctx / src.name
|
||||||
shutil.copy2(_CANONICAL_LOGGING, dest)
|
if not dest.exists() or dest.read_bytes() != src.read_bytes():
|
||||||
|
shutil.copy2(src, dest)
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_auth_helper_sources(config: DecnetConfig) -> None:
    """Mirror ``auth-helper.c`` into each SSH/Telnet build context.

    For every service in ``_AUTH_HELPER_SERVICES`` used by a decky in
    *config*, the canonical source is copied to ``<context>/auth-helper/``.
    Each build context is handled once, and a file is only rewritten when
    it is missing or its bytes differ from the canonical copy.

    Per the original notes, the helper is a static cred-capture binary
    built in a multi-stage Dockerfile layer, and the same source serves
    both the sshd and the telnet login PAM stacks.
    """
    from decnet.services.registry import get_service

    canonical_sources = [_CANONICAL_AUTH_HELPER_DIR / "auth-helper.c"]
    handled: set[Path] = set()
    # Lazy generator: services are visited in the same decky-by-decky
    # order as a nested loop would.
    wanted = (
        name
        for decky in config.deckies
        for name in decky.services
        if name in _AUTH_HELPER_SERVICES
    )
    for svc_name in wanted:
        svc = get_service(svc_name)
        if svc is None:
            continue
        ctx = svc.dockerfile_context()
        if ctx is None or ctx in handled:
            continue
        handled.add(ctx)

        target_dir = ctx / "auth-helper"
        target_dir.mkdir(exist_ok=True)
        for src in canonical_sources:
            dest = target_dir / src.name
            if dest.exists() and dest.read_bytes() == src.read_bytes():
                continue  # already up to date
            shutil.copy2(src, dest)
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_ntlmssp_sources(config: DecnetConfig) -> None:
    """Mirror the canonical ``ntlmssp.py`` into each SMB/RDP build context.

    Only services named in ``_NTLMSSP_SERVICES`` are considered. Each
    Dockerfile context is handled once; the file is copied only when the
    destination is absent or its bytes differ from ``_CANONICAL_NTLMSSP``.
    """
    from decnet.services.registry import get_service

    visited: set[Path] = set()
    for decky in config.deckies:
        for name in decky.services:
            if name not in _NTLMSSP_SERVICES:
                continue
            service = get_service(name)
            # Unknown services have no build context to sync into.
            build_ctx = None if service is None else service.dockerfile_context()
            if build_ctx is None or build_ctx in visited:
                continue
            visited.add(build_ctx)
            target = build_ctx / _CANONICAL_NTLMSSP.name
            up_to_date = (
                target.exists()
                and target.read_bytes() == _CANONICAL_NTLMSSP.read_bytes()
            )
            if not up_to_date:
                shutil.copy2(_CANONICAL_NTLMSSP, target)
|
||||||
|
|
||||||
|
|
||||||
|
def _sync_sessrec_sources(config: DecnetConfig) -> None:
    """Mirror ``sessrec.c`` and its ``Makefile`` into ``sessrec/`` build dirs.

    Applies to services named in ``_SESSREC_SERVICES``. Each Dockerfile
    context is visited once; a ``sessrec/`` directory is created inside it
    and each source file is copied only when missing or byte-different.
    """
    from decnet.services.registry import get_service

    sessrec_files = [
        _CANONICAL_SESSREC_DIR / filename
        for filename in ("sessrec.c", "Makefile")
    ]
    visited: set[Path] = set()
    for decky in config.deckies:
        for name in decky.services:
            if name not in _SESSREC_SERVICES:
                continue
            service = get_service(name)
            # Unknown services have no build context to sync into.
            build_ctx = None if service is None else service.dockerfile_context()
            if build_ctx is None or build_ctx in visited:
                continue
            visited.add(build_ctx)
            target_dir = build_ctx / "sessrec"
            target_dir.mkdir(exist_ok=True)
            for origin in sessrec_files:
                target = target_dir / origin.name
                up_to_date = (
                    target.exists()
                    and target.read_bytes() == origin.read_bytes()
                )
                if not up_to_date:
                    shutil.copy2(origin, target)
|
||||||
|
|
||||||
|
|
||||||
def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None:
|
def _compose(*args: str, compose_file: Path = COMPOSE_FILE, env: dict | None = None) -> None:
|
||||||
@@ -83,6 +194,127 @@ _PERMANENT_ERRORS = (
|
|||||||
"repository does not exist",
|
"repository does not exist",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Signature of a wedged buildx. The phrase is what buildx itself emits
# when its activity-file write fails. Pairing it with "read-only file
# system" avoids false-positives on stderr that merely mentions the
# activity dir path for unrelated reasons.
# Both markers are matched against lower-cased stderr, so they must stay
# lower-case.
_BUILDX_WEDGE_SIGNATURE = "failed to update builder last activity time"
_BUILDX_EROFS_SIGNATURE = "read-only file system"

# Count at or above which we consider buildx's bind-mount table
# pathological (the preflight check compares with ``>=``).
# A healthy daemon has 0; a couple is transient during a build. At 10
# or more you're seeing accumulation from a previous failed run.
_BUILDKIT_MOUNT_THRESHOLD = 10
|
||||||
|
|
||||||
|
|
||||||
|
def _count_leaked_buildkit_mounts() -> int:
    """Count buildkit bind-mounts visible in this process's mount table.

    Best-effort diagnostic: scans ``/proc/self/mounts`` for lines that
    mention the buildkit tmp-mount path.

    Returns:
        The number of matching mount lines, or 0 when the file cannot be
        read, so that this diagnostic never blocks a deploy by itself.
    """
    marker = "/var/lib/docker/tmp/buildkit-mount"
    try:
        # errors="replace": mount paths are arbitrary byte strings, and a
        # strict UTF-8 decode would raise UnicodeDecodeError (a ValueError,
        # not an OSError), which would escape the best-effort guard below.
        with open(
            "/proc/self/mounts", "r", encoding="utf-8", errors="replace"
        ) as fh:
            return sum(1 for line in fh if marker in line)
    except OSError:
        return 0
|
||||||
|
|
||||||
|
|
||||||
|
def _format_subprocess_error(exc: BaseException) -> str:
    """Stringify an exception, keeping a CalledProcessError's stderr.

    ``str(CalledProcessError)`` only says "Command ... returned non-zero
    exit status N" and drops stderr, which is where the buildx recovery
    hint is stored. This helper appends the stripped stderr so status
    reasons and log lines carry the payload.

    Args:
        exc: Any exception. Only ``subprocess.CalledProcessError`` gets
            special treatment.

    Returns:
        ``"<str(exc)>: <stderr>"`` when ``exc`` is a CalledProcessError
        with non-blank stderr, otherwise ``str(exc)``.
    """
    if not isinstance(exc, subprocess.CalledProcessError):
        return str(exc)
    stderr = exc.stderr or ""
    # stderr is bytes when the process was run without text=True; decode
    # it so the message does not end up as a "b'...'" repr.
    if isinstance(stderr, (bytes, bytearray)):
        stderr = stderr.decode("utf-8", errors="replace")
    stderr = stderr.strip()
    return f"{exc}: {stderr}" if stderr else str(exc)
|
||||||
|
|
||||||
|
|
||||||
|
def _buildx_recovery_hint(*, leaked_mounts: int, original_stderr: str = "") -> str:
    """Build the operator-facing recovery text for a wedged buildx.

    One of three recipes is selected:

    * **Sandboxed home** — no leaked mounts and the stderr mentions both
      ``/home/`` and ``.docker/buildx``: advise redirecting
      ``DOCKER_CONFIG`` / ``BUILDX_CONFIG`` in the systemd unit.
    * **Leaked mounts** — ``leaked_mounts > 0``: advise stopping Docker,
      unmounting the buildkit mounts and starting clean.
    * **Driver corruption** — everything else: advise rebuilding the
      builder state.

    Args:
        leaked_mounts: Number of leaked buildkit bind-mounts detected.
        original_stderr: Raw stderr of the failed command; when non-empty
            it is appended (stripped) as a final section.

    Returns:
        The sections joined by blank lines: headline, recipe, wiki
        pointer, and optionally the original error.
    """
    headline = (
        "Buildx is wedged — Docker's build driver can no longer write "
        "its activity file (spurious 'read-only file system' error)."
    )

    # With zero leaked mounts and a path under /home in the error, the
    # unit's namespace is what blocks the write.
    home_sandboxed = (
        leaked_mounts == 0
        and "/home/" in original_stderr
        and ".docker/buildx" in original_stderr
    )

    if home_sandboxed:
        recipe = (
            "Path is under /home but no mounts are leaked — the API "
            "unit is running with ProtectHome=read-only and docker CLI "
            "can't write its activity file inside the user's HOME.\n"
            "Recovery (in the systemd unit):\n"
            " Environment=DOCKER_CONFIG=<install_dir>/.docker\n"
            " Environment=BUILDX_CONFIG=<install_dir>/.docker/buildx\n"
            "Then: sudo systemctl daemon-reload && sudo systemctl restart decnet-api\n"
            "(Already wired into deploy/decnet-api.service.j2 — re-run\n"
            "`decnet init` to refresh the installed unit, then restart.)"
        )
    elif leaked_mounts > 0:
        recipe = (
            f"Detected {leaked_mounts} leaked buildkit bind-mounts — "
            "prune+restart alone won't evict them.\n"
            "Recovery:\n"
            " 1. sudo systemctl stop docker.socket docker.service\n"
            " 2. sudo pkill -9 -f buildkitd; sudo pkill -9 -f containerd-shim\n"
            " 3. for m in $(mount | awk '$3 ~ /buildkit-mount/ {print $3}'); do sudo umount -l \"$m\"; done\n"
            " 4. rm -rf ~/.docker/buildx/activity\n"
            " 5. sudo systemctl start docker\n"
            " 6. docker buildx use default # bundled builder is reserved-named; switch to it"
        )
    else:
        recipe = (
            "No leaked mounts (count=0) — the buildx driver state "
            "itself is inconsistent.\n"
            "Recovery:\n"
            " 1. rm -rf ~/.docker/buildx/activity ~/.docker/buildx/instances/*\n"
            " 2. docker buildx create --name decnet-builder --use --bootstrap\n"
            " (the name 'default' is reserved by Docker — pick anything else)\n"
            " 3. docker buildx inspect"
        )

    sections = [
        headline,
        recipe,
        "See wiki: Troubleshooting → 'Buildx leaked mounts'.",
    ]
    if original_stderr:
        sections.append(f"Original error:\n{original_stderr.strip()}")
    return "\n\n".join(sections)
|
||||||
|
|
||||||
|
|
||||||
@_traced("engine.compose_with_retry")
|
@_traced("engine.compose_with_retry")
|
||||||
def _compose_with_retry(
|
def _compose_with_retry(
|
||||||
@@ -101,6 +333,21 @@ def _compose_with_retry(
|
|||||||
# "project name must not be empty".
|
# "project name must not be empty".
|
||||||
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
|
cmd = ["docker", "compose", "-p", "decnet", "-f", str(compose_file), *args]
|
||||||
merged = {**os.environ, **(env or {})}
|
merged = {**os.environ, **(env or {})}
|
||||||
|
|
||||||
|
# Preflight: if buildx already looks wedged before the first attempt,
|
||||||
|
# refuse to start — retrying just leaks more mounts. Only applies to
|
||||||
|
# build-bearing invocations ("up --build", "build"); "down" etc. are
|
||||||
|
# unaffected by buildx state.
|
||||||
|
is_build_cmd = any(a in args for a in ("--build", "build"))
|
||||||
|
if is_build_cmd:
|
||||||
|
leaked = _count_leaked_buildkit_mounts()
|
||||||
|
if leaked >= _BUILDKIT_MOUNT_THRESHOLD:
|
||||||
|
hint = _buildx_recovery_hint(leaked_mounts=leaked)
|
||||||
|
log.error("preflight: buildx wedge detected (%d mounts) — refusing to deploy", leaked)
|
||||||
|
raise subprocess.CalledProcessError(
|
||||||
|
returncode=1, cmd=cmd, output="", stderr=hint,
|
||||||
|
)
|
||||||
|
|
||||||
for attempt in range(1, retries + 1):
|
for attempt in range(1, retries + 1):
|
||||||
result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603
|
result = subprocess.run(cmd, capture_output=True, text=True, env=merged) # nosec B603
|
||||||
if result.returncode == 0:
|
if result.returncode == 0:
|
||||||
@@ -114,6 +361,24 @@ def _compose_with_retry(
|
|||||||
if any(pat in stderr_lower for pat in _PERMANENT_ERRORS):
|
if any(pat in stderr_lower for pat in _PERMANENT_ERRORS):
|
||||||
console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}")
|
console.print(f"[red]Permanent Docker error — not retrying:[/]\n{result.stderr.strip()}")
|
||||||
raise last_exc
|
raise last_exc
|
||||||
|
# Wedge match needs BOTH the buildx-specific phrase AND the
|
||||||
|
# EROFS marker — otherwise unrelated stderr that mentions the
|
||||||
|
# activity dir false-positives.
|
||||||
|
if (
|
||||||
|
_BUILDX_WEDGE_SIGNATURE in stderr_lower
|
||||||
|
and _BUILDX_EROFS_SIGNATURE in stderr_lower
|
||||||
|
):
|
||||||
|
leaked = _count_leaked_buildkit_mounts()
|
||||||
|
hint = _buildx_recovery_hint(
|
||||||
|
leaked_mounts=leaked,
|
||||||
|
original_stderr=result.stderr or "",
|
||||||
|
)
|
||||||
|
console.print(f"[red]{hint}[/]")
|
||||||
|
log.error("buildx wedge detected mid-build (%d mounts) — not retrying", leaked)
|
||||||
|
raise subprocess.CalledProcessError(
|
||||||
|
returncode=result.returncode, cmd=cmd,
|
||||||
|
output=result.stdout, stderr=hint,
|
||||||
|
)
|
||||||
if attempt < retries:
|
if attempt < retries:
|
||||||
console.print(
|
console.print(
|
||||||
f"[yellow]docker compose {' '.join(args)} failed "
|
f"[yellow]docker compose {' '.join(args)} failed "
|
||||||
@@ -131,6 +396,135 @@ def _compose_with_retry(
|
|||||||
raise last_exc
|
raise last_exc
|
||||||
|
|
||||||
|
|
||||||
|
def _emit_lifecycle_event(
    *,
    decky_name: str,
    old_services: list[str],
    new_services: list[str],
    trigger: str,
) -> None:
    """Fire a ``decky_mutated`` event from a sync code path.

    ``emit_decky_mutated`` is a coroutine function, while the deploy and
    teardown callers are synchronous. The coroutine is driven through
    ``_run_async`` rather than a bare ``asyncio.run``: when the caller is
    itself executing inside a running event loop, ``asyncio.run`` raises
    ``RuntimeError`` and the event would be dropped with only a warning.

    The bus is passed as ``None``. The import is lazy to dodge the
    circular dependency between ``decnet.mutator`` and this module.

    Soft-fails: any exception is logged as a warning and swallowed so that
    event emission can never abort a deploy or teardown.

    Args:
        decky_name: Name of the decky the event is about.
        old_services: Services before the change.
        new_services: Services after the change.
        trigger: Reason for the event (callers here pass ``"creation"``
            or ``"retirement"``).
    """
    try:
        from decnet.mutator.events import emit_decky_mutated

        def _make_event_coro():
            # Built lazily so the coroutine is created on the thread that
            # actually runs it.
            return emit_decky_mutated(
                bus=None,
                decky=decky_name,
                old_services=old_services,
                new_services=new_services,
                trigger=trigger,  # type: ignore[arg-type]
            )

        _run_async(_make_event_coro)
    except Exception as exc:  # noqa: BLE001
        log.warning("lifecycle event emission failed decky=%s trigger=%s: %s",
                    decky_name, trigger, exc)
|
||||||
|
|
||||||
|
|
||||||
|
def _run_async(coro_factory) -> None:
    """Drive a coroutine to completion from synchronous code.

    ``asyncio.run`` refuses to run when an event loop is already running
    on the current thread, so the coroutine is always executed on a
    freshly started thread with its own loop, and the caller blocks until
    that thread finishes.

    Args:
        coro_factory: Zero-argument callable returning the coroutine to
            run. It is invoked on the worker thread.

    Raises:
        BaseException: Whatever the factory or the coroutine raised is
            re-raised on the calling thread.
    """
    import threading

    failures: list[BaseException] = []

    def _worker() -> None:
        try:
            asyncio.run(coro_factory())
        except BaseException as caught:  # noqa: BLE001
            # Hand the failure back to the calling thread.
            failures.append(caught)

    worker = threading.Thread(target=_worker, daemon=False)
    worker.start()
    worker.join()
    if not failures:
        return
    raise failures[0]
|
||||||
|
|
||||||
|
|
||||||
|
def _mirror_fleet_deploy_to_db(config: DecnetConfig) -> None:
    """Upsert one ``fleet_deckies`` row per decky, then seed canaries.

    Best-effort: any failure (import, repository, DB) is logged as a
    warning and swallowed, so a DB problem never aborts a deploy. Every
    row is written with ``state="running"``.

    After each upsert a canary baseline is seeded for the decky. The
    persona is ``"windows"`` when the decky's ``nmap_os`` (or, failing
    that, ``archetype_os``) is a string starting with "win"
    (case-insensitive), otherwise ``"linux"``. A seeding failure is logged
    and does not stop the remaining deckies.
    """
    try:
        from decnet.web.db.factory import get_repository
        from decnet.web.db.models import LOCAL_HOST_SENTINEL

        repo = get_repository()

        async def _mirror_all() -> None:
            from decnet.canary import planter as _canary_planter

            for decky in config.deckies:
                row = {
                    "host_uuid": decky.host_uuid or LOCAL_HOST_SENTINEL,
                    "name": decky.name,
                    "services": list(decky.services),
                    "decky_config": decky.model_dump(mode="json"),
                    "decky_ip": decky.ip,
                    "state": "running",
                }
                await repo.upsert_fleet_decky(row)
                # Canary seeding is isolated in its own try so that one
                # bad decky cannot abort the loop.
                try:
                    dumped = decky.model_dump(mode="json")
                    os_hint = dumped.get("nmap_os") or dumped.get("archetype_os")
                    looks_windows = (
                        isinstance(os_hint, str)
                        and os_hint.lower().startswith("win")
                    )
                    await _canary_planter.seed_baseline(
                        decky.name,
                        repo,
                        persona="windows" if looks_windows else "linux",
                    )
                except Exception as exc:  # noqa: BLE001
                    log.warning(
                        "canary baseline seed failed (best-effort) decky=%s err=%s",
                        decky.name, exc,
                    )

        _run_async(_mirror_all)
    except Exception as exc:  # noqa: BLE001
        log.warning("fleet DB mirror (deploy) failed (best-effort): %s", exc)
|
||||||
|
|
||||||
|
|
||||||
|
def _mirror_fleet_teardown_to_db(deckies) -> None:
    """Delete the ``fleet_deckies`` row of each given decky.

    Best-effort: any failure is logged as a warning and swallowed so a DB
    problem never aborts a teardown. Deckies without a ``host_uuid`` are
    addressed with ``LOCAL_HOST_SENTINEL``.
    """
    try:
        from decnet.web.db.factory import get_repository
        from decnet.web.db.models import LOCAL_HOST_SENTINEL

        repo = get_repository()

        async def _delete_all() -> None:
            for decky in deckies:
                owner = decky.host_uuid or LOCAL_HOST_SENTINEL
                await repo.delete_fleet_decky(host_uuid=owner, name=decky.name)

        _run_async(_delete_all)
    except Exception as exc:  # noqa: BLE001
        log.warning("fleet DB mirror (teardown) failed (best-effort): %s", exc)
|
||||||
|
|
||||||
|
|
||||||
@_traced("engine.deploy")
|
@_traced("engine.deploy")
|
||||||
def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False, parallel: bool = False) -> None:
|
def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False, parallel: bool = False) -> None:
|
||||||
log.info("deployment started n_deckies=%d interface=%s subnet=%s dry_run=%s", len(config.deckies), config.interface, config.subnet, dry_run)
|
log.info("deployment started n_deckies=%d interface=%s subnet=%s dry_run=%s", len(config.deckies), config.interface, config.subnet, dry_run)
|
||||||
@@ -165,6 +559,9 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False,
|
|||||||
setup_host_macvlan(config.interface, host_ip, decky_range)
|
setup_host_macvlan(config.interface, host_ip, decky_range)
|
||||||
|
|
||||||
_sync_logging_helper(config)
|
_sync_logging_helper(config)
|
||||||
|
_sync_sessrec_sources(config)
|
||||||
|
_sync_auth_helper_sources(config)
|
||||||
|
_sync_ntlmssp_sources(config)
|
||||||
|
|
||||||
compose_path = write_compose(config, COMPOSE_FILE)
|
compose_path = write_compose(config, COMPOSE_FILE)
|
||||||
console.print(f"[bold cyan]Compose file written[/] → {compose_path}")
|
console.print(f"[bold cyan]Compose file written[/] → {compose_path}")
|
||||||
@@ -175,6 +572,18 @@ def deploy(config: DecnetConfig, dry_run: bool = False, no_cache: bool = False,
|
|||||||
return
|
return
|
||||||
|
|
||||||
save_state(config, compose_path)
|
save_state(config, compose_path)
|
||||||
|
_mirror_fleet_deploy_to_db(config)
|
||||||
|
|
||||||
|
# Emit one creation event per decky so the correlation graph has a
|
||||||
|
# well-formed lifecycle start (old_services=[] ⇒ new_services=<initial>).
|
||||||
|
# Bus is None here — the syslog line is what the correlator consumes.
|
||||||
|
for decky in config.deckies:
|
||||||
|
_emit_lifecycle_event(
|
||||||
|
decky_name=decky.name,
|
||||||
|
old_services=[],
|
||||||
|
new_services=list(decky.services),
|
||||||
|
trigger="creation",
|
||||||
|
)
|
||||||
|
|
||||||
# Pre-up cleanup: a prior half-failed `up` can leave containers still
|
# Pre-up cleanup: a prior half-failed `up` can leave containers still
|
||||||
# holding the IPs/ports this run wants, which surfaces as the recurring
|
# holding the IPs/ports this run wants, which surfaces as the recurring
|
||||||
@@ -226,9 +635,23 @@ def teardown(decky_id: str | None = None) -> None:
|
|||||||
if not svc_names:
|
if not svc_names:
|
||||||
log.warning("teardown: decky %s has no services to stop", decky_id)
|
log.warning("teardown: decky %s has no services to stop", decky_id)
|
||||||
return
|
return
|
||||||
|
_emit_lifecycle_event(
|
||||||
|
decky_name=decky.name,
|
||||||
|
old_services=list(decky.services),
|
||||||
|
new_services=[],
|
||||||
|
trigger="retirement",
|
||||||
|
)
|
||||||
_compose("stop", *svc_names, compose_file=compose_path)
|
_compose("stop", *svc_names, compose_file=compose_path)
|
||||||
_compose("rm", "-f", *svc_names, compose_file=compose_path)
|
_compose("rm", "-f", *svc_names, compose_file=compose_path)
|
||||||
|
_mirror_fleet_teardown_to_db([decky])
|
||||||
else:
|
else:
|
||||||
|
for decky in config.deckies:
|
||||||
|
_emit_lifecycle_event(
|
||||||
|
decky_name=decky.name,
|
||||||
|
old_services=list(decky.services),
|
||||||
|
new_services=[],
|
||||||
|
trigger="retirement",
|
||||||
|
)
|
||||||
_compose("down", compose_file=compose_path)
|
_compose("down", compose_file=compose_path)
|
||||||
|
|
||||||
ip_list = [d.ip for d in config.deckies]
|
ip_list = [d.ip for d in config.deckies]
|
||||||
@@ -239,6 +662,7 @@ def teardown(decky_id: str | None = None) -> None:
|
|||||||
teardown_host_macvlan(decky_range)
|
teardown_host_macvlan(decky_range)
|
||||||
remove_macvlan_network(client)
|
remove_macvlan_network(client)
|
||||||
clear_state()
|
clear_state()
|
||||||
|
_mirror_fleet_teardown_to_db(config.deckies)
|
||||||
|
|
||||||
net_driver = "IPvlan" if config.ipvlan else "MACVLAN"
|
net_driver = "IPvlan" if config.ipvlan else "MACVLAN"
|
||||||
log.info("teardown complete all deckies removed network_driver=%s", net_driver)
|
log.info("teardown complete all deckies removed network_driver=%s", net_driver)
|
||||||
@@ -281,6 +705,301 @@ def status() -> None:
|
|||||||
console.print(table)
|
console.print(table)
|
||||||
|
|
||||||
|
|
||||||
|
def _teardown_order(lans: list[dict]) -> list[str]:
    """Return LAN names in leaf-first (DMZ-last) teardown order.

    The generator names LANs in BFS order (``LAN-00`` = DMZ root, then
    children, then grandchildren), so descending name order is a
    leaf-first ordering of the tree.

    Digit runs inside a name are compared numerically rather than
    character by character. A plain string sort would put ``LAN-100``
    before ``LAN-99`` and break the ordering once a numeric suffix grows
    wider than its neighbours.

    Args:
        lans: LAN dicts; only the ``"name"`` key is read.

    Returns:
        The LAN names, highest BFS number first.
    """
    import re

    def _bfs_key(name: str) -> tuple[list, str]:
        # re.split with a capturing group alternates text / digits, so
        # odd indexes are always digit runs and same-index chunks always
        # share a type. The raw name breaks ties such as "LAN-1"/"LAN-01".
        chunks = re.split(r"([0-9]+)", name)
        return [int(c) if i % 2 else c for i, c in enumerate(chunks)], name

    return sorted((lan["name"] for lan in lans), key=_bfs_key, reverse=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _topology_compose_path(topology_id: str) -> Path:
    """Return the relative compose-file path for a topology.

    The file name embeds the first eight characters of ``topology_id``.
    """
    short_id = topology_id[:8]
    return Path(f"decnet-topology-{short_id}-compose.yml")
|
||||||
|
|
||||||
|
|
||||||
|
async def _resolve_swarm_host(repo, host_uuid: str) -> dict:
    """Look up a swarm host by UUID, failing loudly when it is unknown.

    Args:
        repo: Repository exposing ``get_swarm_host_by_uuid``.
        host_uuid: UUID of the host a topology is pinned to.

    Returns:
        The host record returned by the repository.

    Raises:
        ValueError: The repository has no host with that UUID.
    """
    record = await repo.get_swarm_host_by_uuid(host_uuid)
    if record is not None:
        return record
    raise ValueError(
        f"topology pinned to unknown swarm host {host_uuid!r}"
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def _deploy_on_agent(repo, topology_id: str, hydrated: dict) -> None:
    """Apply a topology on the agent named by its ``target_host_uuid``.

    The topology is moved to DEPLOYING, the hydrated blob and its
    canonical hash are pushed to the agent, and the status ends up ACTIVE
    on success. On any exception the status is set to FAILED with the
    formatted error as reason, and the exception is re-raised.

    ``AgentClient`` is imported locally to avoid a circular import.

    Raises:
        ValueError: The pinned host is unknown (raised before any status
            change).
        Exception: Whatever the agent call raised.
    """
    from decnet.swarm.client import AgentClient

    agent_host = await _resolve_swarm_host(
        repo, hydrated["topology"]["target_host_uuid"]
    )
    blob_hash = canonical_hash(hydrated)

    await transition_status(repo, topology_id, TopologyStatus.DEPLOYING)
    try:
        async with AgentClient(host=agent_host) as agent:
            await agent.apply_topology(hydrated, blob_hash)
    except Exception as exc:
        log.error(
            "topology %s agent-apply failed on %s: %s",
            topology_id, agent_host.get("name"), exc,
        )
        await transition_status(
            repo, topology_id, TopologyStatus.FAILED,
            reason=_format_subprocess_error(exc),
        )
        raise
    else:
        await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
        log.info(
            "topology %s deployed on agent %s (hash=%s)",
            topology_id, agent_host.get("name"), blob_hash[:12],
        )
|
||||||
|
|
||||||
|
|
||||||
|
async def resync_agent_topology(repo, topology_id: str) -> None:
    """Re-push an agent-targeted topology without touching its status.

    Hydrates the topology, resolves its pinned agent and sends the
    hydrated blob together with its canonical hash. Unlike the initial
    deploy, no status transition is performed here.

    Raises:
        ValueError: The topology does not exist, has no
            ``target_host_uuid``, or is pinned to an unknown host.
    """
    from decnet.swarm.client import AgentClient

    hydrated = await hydrate(repo, topology_id)
    if hydrated is None:
        raise ValueError(f"topology {topology_id!r} not found")

    pinned_uuid = hydrated["topology"].get("target_host_uuid")
    if not pinned_uuid:
        raise ValueError(
            f"topology {topology_id!r} has no target_host_uuid; "
            "resync is agent-only"
        )

    agent_host = await _resolve_swarm_host(repo, pinned_uuid)
    blob_hash = canonical_hash(hydrated)
    async with AgentClient(host=agent_host) as agent:
        await agent.apply_topology(hydrated, blob_hash)

    log.info(
        "topology %s resynced to agent %s (hash=%s)",
        topology_id, agent_host.get("name"), blob_hash[:12],
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def _teardown_on_agent(repo, topology_id: str, hydrated: dict) -> None:
    """Tear a topology down on the agent named by its ``target_host_uuid``.

    The status moves to TEARING_DOWN, the agent is asked to tear the
    topology down, and the status is then set to TORN_DOWN. A failure of
    the agent call is only logged as a warning; the status still ends up
    TORN_DOWN.

    Raises:
        ValueError: The pinned host is unknown (raised before any status
            change).
    """
    from decnet.swarm.client import AgentClient

    agent_host = await _resolve_swarm_host(
        repo, hydrated["topology"]["target_host_uuid"]
    )

    await transition_status(repo, topology_id, TopologyStatus.TEARING_DOWN)
    try:
        async with AgentClient(host=agent_host) as agent:
            await agent.teardown_topology(topology_id)
    except Exception as exc:
        # Deliberately non-fatal: the master-side record is retired anyway.
        log.warning(
            "topology %s agent-teardown failed on %s (continuing): %s",
            topology_id, agent_host.get("name"), exc,
        )

    await transition_status(repo, topology_id, TopologyStatus.TORN_DOWN)
    log.info("topology %s torn down on agent %s", topology_id, agent_host.get("name"))
|
||||||
|
|
||||||
|
|
||||||
|
def _warn_if_userland_proxy_enabled(hydrated: dict) -> None:
    """Log a soft warning when Docker's userland proxy is switched on.

    Nothing happens unless at least one decky in ``hydrated["deckies"]``
    has a truthy ``forwards_l3`` in its ``decky_config``. The daemon info
    is then queried; any failure doing so is silently ignored. A warning
    is logged when the info reports ``UserlandProxy`` (or
    ``Userland Proxy``) as truthy.
    """
    for entry in hydrated.get("deckies", []):
        if (entry.get("decky_config") or {}).get("forwards_l3"):
            break
    else:
        # No decky forwards traffic, so the proxy setting is irrelevant.
        return

    try:
        daemon_info = docker.from_env().info()
    except Exception:
        return

    proxy_on = daemon_info.get("UserlandProxy") or daemon_info.get("Userland Proxy")
    if not proxy_on:
        return
    log.warning(
        "[USERLAND_PROXY] docker-proxy is enabled; attacker source IPs "
        "will appear as the bridge gateway. Set "
        '"userland-proxy": false in /etc/docker/daemon.json to preserve '
        "real source IPs."
    )
|
||||||
|
|
||||||
|
|
||||||
|
@_traced("engine.deploy_topology")
async def deploy_topology(repo, topology_id: str, *, dry_run: bool = False) -> None:
    """Deploy a persisted MazeNET topology.

    Creates one Docker bridge network per LAN, writes a per-topology
    compose file and brings everything up. The status is set to ACTIVE on
    success. On failure, the Docker state created by this attempt is
    rolled back, the status is set to FAILED and the error is re-raised.

    Topologies with a ``target_host_uuid`` are handed off to the pinned
    agent instead of being deployed locally.

    Args:
        repo: Repository holding the topology.
        topology_id: Id of the topology to deploy.
        dry_run: When true, only write the compose file; neither the
            status nor Docker is touched.

    Raises:
        ValueError: The topology does not exist.
        ValidationError: Validation reported errors (status untouched).
        Exception: Whatever made the live deploy fail, after rollback.
    """
    hydrated = await hydrate(repo, topology_id)
    if hydrated is None:
        raise ValueError(f"topology {topology_id!r} not found")

    # Precondition: validate before any status transition or Docker call.
    # Errors bubble up as ValidationError and leave status untouched.
    issues = _validate_topology(hydrated)
    if _validation_errors(issues):
        raise ValidationError(issues)

    lans = hydrated["lans"]
    compose_path = _topology_compose_path(topology_id)

    if dry_run:
        # Plan-only: write the compose so operators can diff it, nothing else.
        write_topology_compose(hydrated, compose_path)
        console.print(
            f"[bold cyan]Dry run — topology compose file written[/] → {compose_path}"
        )
        log.info("topology %s dry-run complete", topology_id)
        return

    # Host-state precheck: port collisions are only warned about here so
    # the log line is clear up-front. Only runs at live deploy.
    for w in check_no_host_port_collision(hydrated):
        log.warning("[%s] %s", w.code, w.message)

    _warn_if_userland_proxy_enabled(hydrated)

    # Pinned to an agent? Hand off. Everything below this line is the
    # master-local deploy.
    if hydrated["topology"].get("target_host_uuid"):
        await _deploy_on_agent(repo, topology_id, hydrated)
        return

    await transition_status(repo, topology_id, TopologyStatus.DEPLOYING)

    client = None
    created_networks: list[str] = []
    try:
        # Created inside the try: if the Docker client cannot be built,
        # the topology must end up FAILED instead of stuck in DEPLOYING.
        client = docker.from_env()
        for lan in lans:
            net_name = _topology_network_name(topology_id, lan["name"])
            # Every LAN except the DMZ is created as an internal network.
            internal = not lan["is_dmz"]
            create_bridge_network(
                client, net_name, lan["subnet"], internal=internal
            )
            created_networks.append(net_name)
        write_topology_compose(hydrated, compose_path)
        console.print(
            f"[bold cyan]Topology compose file written[/] → {compose_path}"
        )
        # Offload to a worker thread so the API event loop stays
        # responsive during the build.
        await anyio.to_thread.run_sync(
            lambda: _compose_with_retry(
                "up", "--build", "-d", compose_file=compose_path,
            ),
        )
    except Exception as exc:
        log.error("topology %s deploy failed: %s", topology_id, exc)
        # Roll back any Docker state we created in this attempt so the
        # next deploy doesn't trip over orphan networks or half-started
        # containers. Best-effort: rollback errors must not mask the
        # original deploy failure.
        if compose_path.exists():
            try:
                # Same reasoning as the build: keep the blocking compose
                # call off the event loop.
                await anyio.to_thread.run_sync(
                    lambda: _compose(
                        "down", "--remove-orphans", compose_file=compose_path,
                    ),
                )
            except Exception as rb_exc:  # pragma: no cover
                log.warning(
                    "topology %s rollback compose-down failed: %s",
                    topology_id, rb_exc,
                )
        # created_networks is empty when the client itself failed, so
        # ``client`` is never used while still None.
        for net_name in reversed(created_networks):
            try:
                remove_bridge_network(client, net_name)
            except Exception as rb_exc:  # pragma: no cover
                log.warning(
                    "topology %s rollback network %s removal failed: %s",
                    topology_id, net_name, rb_exc,
                )
        if compose_path.exists():
            try:
                compose_path.unlink()
            except OSError:  # pragma: no cover
                pass
        await transition_status(
            repo, topology_id, TopologyStatus.FAILED,
            reason=_format_subprocess_error(exc),
        )
        raise

    await transition_status(repo, topology_id, TopologyStatus.ACTIVE)
    log.info("topology %s deployed n_lans=%d", topology_id, len(lans))
|
||||||
|
|
||||||
|
|
||||||
|
@_traced("engine.teardown_topology")
async def teardown_topology(repo, topology_id: str) -> None:
    """Dismantle a persisted MazeNET topology and record it as torn down.

    Legal from ``active|degraded|failed|deploying``. A topology that has a
    ``target_host_uuid`` is delegated wholesale to ``_teardown_on_agent``.
    Otherwise the topology is moved to ``TEARING_DOWN``, its compose
    project is brought down (if the compose file still exists), each LAN's
    Docker bridge network is removed in the order given by
    ``_teardown_order``, the compose file is deleted, and the status is
    set to ``TORN_DOWN``.

    Raises:
        ValueError: if no topology with ``topology_id`` can be hydrated.
    """
    snapshot = await hydrate(repo, topology_id)
    if snapshot is None:
        raise ValueError(f"topology {topology_id!r} not found")

    # Topologies pinned to a remote host are torn down by that host's agent.
    remote_host = snapshot["topology"].get("target_host_uuid")
    if remote_host:
        await _teardown_on_agent(repo, topology_id, snapshot)
        return

    await transition_status(repo, topology_id, TopologyStatus.TEARING_DOWN)

    client = docker.from_env()
    compose_path = _topology_compose_path(topology_id)

    def _compose_down() -> None:
        _compose("down", "--remove-orphans", compose_file=compose_path)

    if compose_path.exists():
        try:
            # Blocking subprocess work goes to a worker thread so the
            # event loop keeps serving other requests meanwhile.
            await anyio.to_thread.run_sync(_compose_down)
        except subprocess.CalledProcessError as exc:
            # A failed "down" is not fatal: networks and the compose file
            # are still cleaned up below.
            log.warning(
                "topology %s compose down failed (continuing): %s",
                topology_id, exc,
            )

    for lan_name in _teardown_order(snapshot["lans"]):
        remove_bridge_network(
            client, _topology_network_name(topology_id, lan_name)
        )

    if compose_path.exists():
        compose_path.unlink()

    await transition_status(repo, topology_id, TopologyStatus.TORN_DOWN)
    log.info("topology %s torn down", topology_id)
|
||||||
|
|
||||||
|
|
||||||
def _print_status(config: DecnetConfig) -> None:
|
def _print_status(config: DecnetConfig) -> None:
|
||||||
table = Table(title="Deployed Deckies", show_lines=True)
|
table = Table(title="Deployed Deckies", show_lines=True)
|
||||||
table.add_column("Decky")
|
table.add_column("Decky")
|
||||||
|
|||||||
171
decnet/engine/reaper.py
Normal file
171
decnet/engine/reaper.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
"""Orphan Docker resource reaper for MazeNET topologies.
|
||||||
|
|
||||||
|
Every topology's Docker resources carry the fixed prefix
|
||||||
|
``decnet_t_<first-8-of-topology-uuid>_`` (see
|
||||||
|
:func:`decnet.topology.compose._network_name`). When a topology row is
|
||||||
|
deleted from the DB without a proper teardown — operator error, crashed
|
||||||
|
master, straight ``DELETE FROM topologies`` — the containers and
|
||||||
|
networks linger and steal IPAM pools.
|
||||||
|
|
||||||
|
This module walks the local Docker daemon, extracts the 8-char prefix
|
||||||
|
from every matching container/network, compares against the set of
|
||||||
|
prefixes that *do* map to a known topology, and removes the rest.
|
||||||
|
|
||||||
|
It never touches resources whose prefix matches a live topology, and it
|
||||||
|
never touches non-DECNET resources.
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any, Iterable, Optional
|
||||||
|
|
||||||
|
import docker
|
||||||
|
|
||||||
|
from decnet.logging import get_logger
|
||||||
|
from decnet.network import remove_bridge_network
|
||||||
|
|
||||||
|
log = get_logger("engine.reaper")
|
||||||
|
|
||||||
|
# decnet_t_<8hex>_<anything>. The 8-char prefix is sliced from the
|
||||||
|
# topology UUID in decnet.topology.compose._network_name. Tolerate any
|
||||||
|
# suffix (network name, decky name) after the third underscore.
|
||||||
|
_RESOURCE_NAME_RE = re.compile(r"^decnet_t_([0-9a-f]{8})_")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ReapReport:
    """Result of one reaper sweep: what it saw, what it deleted, what failed."""

    # Sorted prefixes that still map to a topology known to the repo.
    live_prefixes: list[str] = field(default_factory=list)
    # Sorted prefixes found in Docker that no known topology accounts for.
    orphan_prefixes: list[str] = field(default_factory=list)
    # Names of the containers that were successfully force-removed.
    containers_removed: list[str] = field(default_factory=list)
    # Names of the networks that were successfully removed.
    networks_removed: list[str] = field(default_factory=list)
    # One human-readable message per resource (or listing) that failed.
    errors: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Return the report as a plain dict keyed by field name.

        The values are the report's own list objects, not copies.
        """
        keys = (
            "live_prefixes",
            "orphan_prefixes",
            "containers_removed",
            "networks_removed",
            "errors",
        )
        return {key: getattr(self, key) for key in keys}
|
||||||
|
|
||||||
|
|
||||||
|
def _prefix_of(name: str) -> Optional[str]:
    """Return the 8-hex topology prefix at the start of *name*, else ``None``."""
    matched = _RESOURCE_NAME_RE.match(name)
    if matched is None:
        return None
    return matched.group(1)
|
||||||
|
|
||||||
|
|
||||||
|
async def _live_prefixes(repo: Any) -> set[str]:
|
||||||
|
"""Every topology-id prefix the DB still knows about.
|
||||||
|
|
||||||
|
Tearing down only marks ``torn_down``; the row stays around for
|
||||||
|
audit. We consider *every* persisted topology to be live for the
|
||||||
|
reaper so we never race a concurrent teardown / redeploy by nuking
|
||||||
|
its networks mid-flight. Operators who want those resources gone
|
||||||
|
should delete the topology row (which cascades) or run teardown.
|
||||||
|
"""
|
||||||
|
rows = await repo.list_topologies()
|
||||||
|
return {r["id"][:8] for r in rows}
|
||||||
|
|
||||||
|
|
||||||
|
def _orphan_prefixes(
    container_names: Iterable[str],
    network_names: Iterable[str],
    live: set[str],
) -> tuple[set[str], list[str], list[str]]:
    """Return (orphan_prefixes, decnet_containers, decnet_networks).

    A name counts as a DECNET resource when ``_prefix_of`` yields a prefix
    for it. The orphan set is every prefix seen on such a name that is
    not in *live*.

    Pure function — no Docker / repo I/O — so tests can drive it without
    mocking the docker SDK.
    """
    seen: set[str] = set()

    def _keep_decnet(names: Iterable[str]) -> list[str]:
        # Filter to DECNET-patterned names, recording each prefix on the way.
        kept: list[str] = []
        for name in names:
            prefix = _prefix_of(name)
            if prefix is not None:
                kept.append(name)
                seen.add(prefix)
        return kept

    decnet_containers = _keep_decnet(container_names)
    decnet_networks = _keep_decnet(network_names)
    return seen - live, decnet_containers, decnet_networks
|
||||||
|
|
||||||
|
|
||||||
|
async def reap_orphan_topology_resources(
    repo: Any,
    client: Optional[docker.DockerClient] = None,
) -> ReapReport:
    """Remove Docker containers + networks whose topology id is gone.

    Steps:

    1. List every container (including stopped ones) and every network.
    2. Work out which DECNET prefixes are no longer referenced by the
       repo.
    3. Force-remove the orphaned containers first, so the networks can
       drop their endpoints, then remove the orphaned networks.

    A failure on any single resource is recorded in the report and
    logged, but never aborts the sweep. If listing Docker resources
    itself fails, the report is returned with that one error.

    Args:
        repo: Repository used to look up the persisted topologies.
        client: Docker client to use; built with ``docker.from_env()``
            when omitted.

    Returns:
        A ``ReapReport`` describing what was found and removed.
    """
    docker_client = docker.from_env() if client is None else client

    live = await _live_prefixes(repo)
    report = ReapReport(live_prefixes=sorted(live))

    try:
        containers = docker_client.containers.list(all=True)
        networks = docker_client.networks.list()
    except Exception as exc:  # noqa: BLE001
        report.errors.append(f"docker list failed: {exc}")
        return report

    orphans, decnet_containers, decnet_networks = _orphan_prefixes(
        [container.name for container in containers],
        [network.name for network in networks],
        live,
    )
    report.orphan_prefixes = sorted(orphans)

    if not orphans:
        log.info(
            "reaper: no orphans (decnet containers=%d, networks=%d, live=%d)",
            len(decnet_containers), len(decnet_networks), len(live),
        )
        return report

    # Pass 1: containers. Force-remove so we don't hang on a stopped
    # container whose network is about to be killed.
    for container in containers:
        name = container.name
        # Non-DECNET names yield None, which is never in the orphan set.
        if _prefix_of(name) not in orphans:
            continue
        try:
            container.remove(force=True)
            report.containers_removed.append(name)
        except Exception as exc:  # noqa: BLE001
            report.errors.append(f"container {name!r}: {exc}")
            log.warning("reaper: container %s remove failed: %s", name, exc)

    # Pass 2: networks.
    for network in networks:
        name = network.name
        if _prefix_of(name) not in orphans:
            continue
        try:
            remove_bridge_network(docker_client, name)
            report.networks_removed.append(name)
        except Exception as exc:  # noqa: BLE001
            report.errors.append(f"network {name!r}: {exc}")
            log.warning("reaper: network %s remove failed: %s", name, exc)

    log.info(
        "reaper: removed %d container(s), %d network(s) across %d orphan prefix(es)",
        len(report.containers_removed),
        len(report.networks_removed),
        len(report.orphan_prefixes),
    )
    return report
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ReapReport",
|
||||||
|
"reap_orphan_topology_resources",
|
||||||
|
]
|
||||||
123
decnet/env.py
123
decnet/env.py
@@ -70,6 +70,15 @@ DECNET_EMBED_PROFILER: bool = os.environ.get("DECNET_EMBED_PROFILER", "").lower(
|
|||||||
# workers sniffing the same interface — duplicated events and wasted CPU.
|
# workers sniffing the same interface — duplicated events and wasted CPU.
|
||||||
DECNET_EMBED_SNIFFER: bool = os.environ.get("DECNET_EMBED_SNIFFER", "").lower() == "true"
|
DECNET_EMBED_SNIFFER: bool = os.environ.get("DECNET_EMBED_SNIFFER", "").lower() == "true"
|
||||||
|
|
||||||
|
# Set to "true" to embed the Docker log collector inside the API process.
|
||||||
|
# Leave unset (default) when `decnet-collector.service` (or a standalone
|
||||||
|
# `decnet collect --daemon`) is running — embedding both yields two
|
||||||
|
# tailers appending every container log line to the ingest file, which
|
||||||
|
# the ingester then inserts into the DB twice. Single-process dev
|
||||||
|
# setups without systemd units can flip this on to get the old all-in
|
||||||
|
# -one behaviour.
|
||||||
|
DECNET_EMBED_COLLECTOR: bool = os.environ.get("DECNET_EMBED_COLLECTOR", "").lower() == "true"
|
||||||
|
|
||||||
# Set to "true" to mount the Pyinstrument ASGI middleware on the FastAPI app.
|
# Set to "true" to mount the Pyinstrument ASGI middleware on the FastAPI app.
|
||||||
# Produces per-request HTML flamegraphs under ./profiles/. Off by default so
|
# Produces per-request HTML flamegraphs under ./profiles/. Off by default so
|
||||||
# production and normal dev runs pay zero profiling overhead.
|
# production and normal dev runs pay zero profiling overhead.
|
||||||
@@ -84,6 +93,16 @@ DECNET_API_PORT: int = _port("DECNET_API_PORT", 8000)
|
|||||||
# the master's JWT secret being present in the environment.
|
# the master's JWT secret being present in the environment.
|
||||||
DECNET_INGEST_LOG_FILE: str | None = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log")
|
DECNET_INGEST_LOG_FILE: str | None = os.environ.get("DECNET_INGEST_LOG_FILE", "/var/log/decnet/decnet.log")
|
||||||
|
|
||||||
|
# Agent-side RFC 5424 sink written by decnet.collector.worker when run on
|
||||||
|
# a SWARM worker. The forwarder tails this file and ships lines over
|
||||||
|
# syslog-TLS to the master listener. Kept separate from
|
||||||
|
# DECNET_INGEST_LOG_FILE so a workstation-dev box (which may run both the
|
||||||
|
# master and a throwaway agent pointed at itself) can't accidentally
|
||||||
|
# recurse by forwarding its own ingest file back to itself.
|
||||||
|
DECNET_AGENT_LOG_FILE: str = os.environ.get(
|
||||||
|
"DECNET_AGENT_LOG_FILE", "/var/log/decnet/agent.log"
|
||||||
|
)
|
||||||
|
|
||||||
# SWARM log pipeline — RFC 5425 syslog-over-TLS between worker forwarders
|
# SWARM log pipeline — RFC 5425 syslog-over-TLS between worker forwarders
|
||||||
# and the master listener. Plaintext syslog across hosts is forbidden.
|
# and the master listener. Plaintext syslog across hosts is forbidden.
|
||||||
DECNET_SWARM_SYSLOG_PORT: int = _port("DECNET_SWARM_SYSLOG_PORT", 6514)
|
DECNET_SWARM_SYSLOG_PORT: int = _port("DECNET_SWARM_SYSLOG_PORT", 6514)
|
||||||
@@ -121,6 +140,15 @@ DECNET_DISALLOW_MASTER: bool = (
|
|||||||
os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true"
|
os.environ.get("DECNET_DISALLOW_MASTER", "true").lower() == "true"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ServiceBus — host-local UNIX-socket pub/sub. Workers consume via
|
||||||
|
# ``decnet.bus.factory.get_bus()``. Disabled → NullBus (publishes drop,
|
||||||
|
# subscriptions yield nothing) so dev environments without a bus daemon
|
||||||
|
# can still boot. See DEBT-029 for the MVP design.
|
||||||
|
DECNET_BUS_ENABLED: bool = os.environ.get("DECNET_BUS_ENABLED", "true").lower() != "false"
|
||||||
|
DECNET_BUS_TYPE: str = os.environ.get("DECNET_BUS_TYPE", "unix").lower()
|
||||||
|
DECNET_BUS_SOCKET: Optional[str] = os.environ.get("DECNET_BUS_SOCKET")
|
||||||
|
DECNET_BUS_GROUP: str = os.environ.get("DECNET_BUS_GROUP", "decnet")
|
||||||
|
|
||||||
# Tracing — set to "true" to enable OpenTelemetry distributed tracing.
|
# Tracing — set to "true" to enable OpenTelemetry distributed tracing.
|
||||||
# Separate from DECNET_DEVELOPER so tracing can be toggled independently.
|
# Separate from DECNET_DEVELOPER so tracing can be toggled independently.
|
||||||
DECNET_DEVELOPER_TRACING: bool = os.environ.get("DECNET_DEVELOPER_TRACING", "").lower() == "true"
|
DECNET_DEVELOPER_TRACING: bool = os.environ.get("DECNET_DEVELOPER_TRACING", "").lower() == "true"
|
||||||
@@ -146,6 +174,101 @@ _cors_raw: str = os.environ.get("DECNET_CORS_ORIGINS", _cors_default)
|
|||||||
DECNET_CORS_ORIGINS: list[str] = [o.strip() for o in _cors_raw.split(",") if o.strip()]
|
DECNET_CORS_ORIGINS: list[str] = [o.strip() for o in _cors_raw.split(",") if o.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
# Master→worker mTLS hostname verification. Off by default because legacy
|
||||||
|
# enrollments were issued certs with operator-supplied SAN lists that may
|
||||||
|
# not match the URL the master uses to connect; set to "true" on a fresh
|
||||||
|
# production deploy where you control enrollment to get TLS hostname checks
|
||||||
|
# on top of CA + fingerprint pinning.
|
||||||
|
DECNET_VERIFY_HOSTNAME: bool = (
|
||||||
|
os.environ.get("DECNET_VERIFY_HOSTNAME", "false").lower() == "true"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
_LOOPBACK_HOSTS = {"localhost", "127.0.0.1", "::1"}
|
||||||
|
_WILDCARD_BIND_HOSTS = {"0.0.0.0", "::"} # nosec B104 — comparison only
|
||||||
|
|
||||||
|
|
||||||
|
def _origin_host(origin: str) -> str:
|
||||||
|
"""Pull the bare hostname out of a CORS origin (``http(s)://host:port``).
|
||||||
|
|
||||||
|
Returns the full origin lowercased if the URL can't be parsed — the
|
||||||
|
caller treats unrecognised origins as non-loopback, which is the safer
|
||||||
|
default for a public-binding check.
|
||||||
|
"""
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
try:
|
||||||
|
parsed = urlparse(origin)
|
||||||
|
host = (parsed.hostname or "").lower()
|
||||||
|
return host or origin.strip().lower()
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
return origin.strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def validate_public_binding() -> None:
    """Refuse to start the master API/web with a footgun config.

    Three checks, all gated on the API binding being non-loopback (i.e.
    actually exposed to the network):

    * If the CORS allow-list still contains a loopback origin, fail. The
      most common shape of this bug is an operator flipping
      ``DECNET_API_HOST=0.0.0.0`` to "make it work" and forgetting to
      update ``DECNET_CORS_ORIGINS`` — the dashboard then either can't
      talk to the API at all, or worse, they wildcard CORS to paper over
      it.

    * If the canary HTTP base is plaintext (``http://``) and the canary
      host isn't loopback, fail. Canary tokens phone home on trigger;
      plaintext over the public internet leaks the token to anyone on
      the path.

    * If the rate limiter is globally disabled, log loudly. Don't fail —
      operators sometimes want this for benchmarking — but never let it
      slip past unmentioned on a public binding.

    Called from the FastAPI lifespan so it surfaces at startup, not on
    first request. Skipped automatically when running under pytest so
    the test suite doesn't have to set five env vars per fixture.

    Raises:
        ValueError: if the CORS or canary check fails.
    """
    # Any PYTEST* environment variable means we are inside a test run.
    if any(k.startswith("PYTEST") for k in os.environ):
        return
    if DECNET_API_HOST in _LOOPBACK_HOSTS:
        return  # not exposed; nothing to validate

    # The label used in messages is the same for wildcard and specific
    # binds; the former conditional had two identical branches.
    bind_label = "DECNET_API_HOST"

    loopback_origins = [o for o in DECNET_CORS_ORIGINS if _origin_host(o) in _LOOPBACK_HOSTS]
    if loopback_origins:
        raise ValueError(
            f"{bind_label}={DECNET_API_HOST!r} exposes the API to the network, "
            f"but DECNET_CORS_ORIGINS still contains loopback origin(s) "
            f"{loopback_origins!r}. Set DECNET_CORS_ORIGINS to the public "
            f"dashboard URL(s) before starting (e.g. "
            f"DECNET_CORS_ORIGINS=https://dashboard.example.com)."
        )

    canary_base = os.environ.get("DECNET_CANARY_HTTP_BASE", "").strip()
    if canary_base and canary_base.lower().startswith("http://"):
        host = _origin_host(canary_base)
        if host and host not in _LOOPBACK_HOSTS:
            raise ValueError(
                f"DECNET_CANARY_HTTP_BASE={canary_base!r} is plaintext HTTP and "
                f"points at a non-loopback host. Canary triggers carry secrets "
                f"that must not cross the public internet in cleartext — use "
                f"https:// or front the canary endpoint with a TLS proxy."
            )

    limiter_enabled = os.environ.get("DECNET_LIMITER_ENABLED", "true").lower() != "false"
    if not limiter_enabled:
        # Late import to avoid a circular dependency through decnet.logging.
        from decnet.logging import get_logger

        get_logger("env").critical(
            "DECNET_LIMITER_ENABLED=false on a public binding (%s=%s). "
            "Login + write endpoints have no rate limiting — only run this "
            "way for benchmarking or behind an external rate-limiting proxy.",
            bind_label, DECNET_API_HOST,
        )
|
||||||
|
|
||||||
|
|
||||||
def __getattr__(name: str) -> str:
|
def __getattr__(name: str) -> str:
|
||||||
"""Lazy resolution for secrets only the master web/api process needs."""
|
"""Lazy resolution for secrets only the master web/api process needs."""
|
||||||
if name == "DECNET_JWT_SECRET":
|
if name == "DECNET_JWT_SECRET":
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user