refactor(swarm-mgmt): backfill host address from agent's .tgz source IP

This commit is contained in:
2026-04-19 05:20:23 -04:00
parent e32fdf9cbf
commit ff4c993617
4 changed files with 46 additions and 29 deletions

View File

@@ -83,8 +83,6 @@ class EnrollBundleRequest(BaseModel):
description="IP/host the agent will reach back to")
agent_name: str = Field(..., pattern=r"^[a-z0-9][a-z0-9-]{0,62}$",
description="Worker name (DNS-label safe)")
agent_host: str = Field(..., min_length=1, max_length=253,
description="IP/host of the new worker — shown in SwarmHosts and used as cert SAN")
with_updater: bool = Field(
default=True,
description="Include updater cert bundle and auto-start decnet updater on the agent",
@@ -111,6 +109,7 @@ class _Bundle:
sh_path: pathlib.Path
tgz_path: pathlib.Path
expires_at: datetime
host_uuid: str
served: bool = False
@@ -275,9 +274,11 @@ async def create_enroll_bundle(
if existing is not None:
raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled")
# 1. Issue certs (reuses the same code as /swarm/enroll).
# 1. Issue certs (reuses the same code as /swarm/enroll). The worker's own
# address is not known yet — the master learns it when the agent fetches
# the tarball (see get_payload), which also backfills the SwarmHost row.
ca = pki.ensure_ca()
sans = list({req.agent_name, req.agent_host, req.master_host})
sans = list({req.agent_name, req.master_host})
issued = pki.issue_worker_cert(ca, req.agent_name, sans)
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name
pki.write_worker_bundle(issued, bundle_dir)
@@ -301,7 +302,7 @@ async def create_enroll_bundle(
{
"uuid": host_uuid,
"name": req.agent_name,
"address": req.agent_host,
"address": "", # filled in when the agent fetches the .tgz (its source IP)
"agent_port": 8765,
"status": "enrolled",
"client_cert_fingerprint": issued.fingerprint_sha256,
@@ -338,7 +339,9 @@ async def create_enroll_bundle(
os.chmod(sh_path, 0o600)
async with _LOCK:
_BUNDLES[token] = _Bundle(sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at)
_BUNDLES[token] = _Bundle(
sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at, host_uuid=host_uuid,
)
_ensure_sweeper()
log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8])
@@ -380,14 +383,30 @@ async def get_bootstrap(token: str) -> Response:
tags=["Swarm Management"],
include_in_schema=False,
)
async def get_payload(token: str) -> Response:
async def get_payload(
token: str,
request: Request,
repo: BaseRepository = Depends(get_repo),
) -> Response:
async with _LOCK:
b = await _lookup_live(token)
b.served = True
data = b.tgz_path.read_bytes()
host_uuid = b.host_uuid
for p in (b.sh_path, b.tgz_path):
try:
p.unlink()
except FileNotFoundError:
pass
# The agent's first connect-back — its source IP is the reachable address
# the master will later use to probe it. Backfill the SwarmHost row here
# so the operator sees the real address instead of an empty placeholder.
client_host = request.client.host if request.client else ""
if client_host:
try:
await repo.update_swarm_host(host_uuid, {"address": client_host})
except Exception as e: # noqa: BLE001
log.warning("enroll-bundle could not backfill address host=%s err=%s", host_uuid, e)
return Response(content=data, media_type="application/gzip")

View File

@@ -14,7 +14,6 @@ interface BundleResult {
const AgentEnrollment: React.FC = () => {
const [masterHost, setMasterHost] = useState(window.location.hostname);
const [agentName, setAgentName] = useState('');
const [agentHost, setAgentHost] = useState('');
const [withUpdater, setWithUpdater] = useState(true);
const [servicesIni, setServicesIni] = useState<string | null>(null);
const [servicesIniName, setServicesIniName] = useState<string | null>(null);
@@ -49,7 +48,6 @@ const AgentEnrollment: React.FC = () => {
setResult(null);
setError(null);
setAgentName('');
setAgentHost('');
setWithUpdater(true);
setServicesIni(null);
setServicesIniName(null);
@@ -65,7 +63,6 @@ const AgentEnrollment: React.FC = () => {
const res = await api.post('/swarm/enroll-bundle', {
master_host: masterHost,
agent_name: agentName,
agent_host: agentHost,
with_updater: withUpdater,
services_ini: servicesIni,
});
@@ -112,16 +109,6 @@ const AgentEnrollment: React.FC = () => {
required
/>
</label>
<label>
Agent host (IP or DNS of the new worker VM)
<input
type="text"
value={agentHost}
onChange={(e) => setAgentHost(e.target.value)}
placeholder="e.g. 192.168.1.23"
required
/>
</label>
<label>
Agent name (lowercase, digits, dashes)
<input
@@ -152,7 +139,7 @@ const AgentEnrollment: React.FC = () => {
<button
type="submit"
className="control-btn primary"
disabled={submitting || !nameOk || !masterHost || !agentHost}
disabled={submitting || !nameOk || !masterHost}
>
{submitting ? 'Generating…' : 'Generate enrollment bundle'}
</button>

View File

@@ -92,7 +92,7 @@ const SwarmHosts: React.FC = () => {
{h.status === 'active' ? <Wifi size={16} /> : <WifiOff size={16} />} {h.status}
</td>
<td>{h.name}</td>
<td>{h.address}:{h.agent_port}</td>
<td>{h.address ? `${h.address}:${h.agent_port}` : <em>pending first connect</em>}</td>
<td>{h.last_heartbeat ? new Date(h.last_heartbeat).toLocaleString() : '—'}</td>
<td title={h.client_cert_fingerprint}><code>{shortFp(h.client_cert_fingerprint)}</code></td>
<td>{new Date(h.enrolled_at).toLocaleString()}</td>

View File

@@ -33,7 +33,6 @@ async def _post(client, auth_token, **overrides):
body = {
"master_host": "10.0.0.50",
"agent_name": "worker-a",
"agent_host": "10.0.0.100",
"with_updater": True,
}
body.update(overrides)
@@ -97,19 +96,31 @@ async def test_non_admin_forbidden(client, viewer_token):
async def test_no_auth_401(client):
resp = await client.post(
"/api/v1/swarm/enroll-bundle",
json={"master_host": "10.0.0.50", "agent_name": "worker-a", "agent_host": "10.0.0.100"},
json={"master_host": "10.0.0.50", "agent_name": "worker-a"},
)
assert resp.status_code == 401
@pytest.mark.anyio
async def test_host_row_uses_agent_host_not_master_host(client, auth_token):
"""SwarmHosts table should show the worker's own address, not the master's."""
async def test_host_row_address_backfilled_from_tgz_source_ip(client, auth_token):
"""SwarmHosts.address starts blank at enroll time and is populated from
the agent's source IP when it curls the .tgz."""
from decnet.web.dependencies import repo
resp = await _post(client, auth_token, agent_name="addr-test",
master_host="192.168.1.5", agent_host="192.168.1.23")
row = await repo.get_swarm_host_by_uuid(resp.json()["host_uuid"])
assert row["address"] == "192.168.1.23"
master_host="192.168.1.5")
host_uuid = resp.json()["host_uuid"]
token = resp.json()["token"]
row = await repo.get_swarm_host_by_uuid(host_uuid)
assert row["address"] == "" # placeholder until first tgz fetch
tgz = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert tgz.status_code == 200
row = await repo.get_swarm_host_by_uuid(host_uuid)
# The client.host the app sees is whatever httpx's ASGITransport reports as the
# peer address, so any non-empty value proves the backfill path ran.
assert row["address"] != ""
@pytest.mark.anyio