refactor(swarm-mgmt): backfill host address from agent's .tgz source IP

This commit is contained in:
2026-04-19 05:20:23 -04:00
parent e32fdf9cbf
commit ff4c993617
4 changed files with 46 additions and 29 deletions

View File

@@ -83,8 +83,6 @@ class EnrollBundleRequest(BaseModel):
description="IP/host the agent will reach back to") description="IP/host the agent will reach back to")
agent_name: str = Field(..., pattern=r"^[a-z0-9][a-z0-9-]{0,62}$", agent_name: str = Field(..., pattern=r"^[a-z0-9][a-z0-9-]{0,62}$",
description="Worker name (DNS-label safe)") description="Worker name (DNS-label safe)")
agent_host: str = Field(..., min_length=1, max_length=253,
description="IP/host of the new worker — shown in SwarmHosts and used as cert SAN")
with_updater: bool = Field( with_updater: bool = Field(
default=True, default=True,
description="Include updater cert bundle and auto-start decnet updater on the agent", description="Include updater cert bundle and auto-start decnet updater on the agent",
@@ -111,6 +109,7 @@ class _Bundle:
sh_path: pathlib.Path sh_path: pathlib.Path
tgz_path: pathlib.Path tgz_path: pathlib.Path
expires_at: datetime expires_at: datetime
host_uuid: str
served: bool = False served: bool = False
@@ -275,9 +274,11 @@ async def create_enroll_bundle(
if existing is not None: if existing is not None:
raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled") raise HTTPException(status_code=409, detail=f"Worker '{req.agent_name}' is already enrolled")
# 1. Issue certs (reuses the same code as /swarm/enroll). # 1. Issue certs (reuses the same code as /swarm/enroll). The worker's own
# address is not known yet — the master learns it when the agent fetches
# the tarball (see get_payload), which also backfills the SwarmHost row.
ca = pki.ensure_ca() ca = pki.ensure_ca()
sans = list({req.agent_name, req.agent_host, req.master_host}) sans = list({req.agent_name, req.master_host})
issued = pki.issue_worker_cert(ca, req.agent_name, sans) issued = pki.issue_worker_cert(ca, req.agent_name, sans)
bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name bundle_dir = pki.DEFAULT_CA_DIR / "workers" / req.agent_name
pki.write_worker_bundle(issued, bundle_dir) pki.write_worker_bundle(issued, bundle_dir)
@@ -301,7 +302,7 @@ async def create_enroll_bundle(
{ {
"uuid": host_uuid, "uuid": host_uuid,
"name": req.agent_name, "name": req.agent_name,
"address": req.agent_host, "address": "", # filled in when the agent fetches the .tgz (its source IP)
"agent_port": 8765, "agent_port": 8765,
"status": "enrolled", "status": "enrolled",
"client_cert_fingerprint": issued.fingerprint_sha256, "client_cert_fingerprint": issued.fingerprint_sha256,
@@ -338,7 +339,9 @@ async def create_enroll_bundle(
os.chmod(sh_path, 0o600) os.chmod(sh_path, 0o600)
async with _LOCK: async with _LOCK:
_BUNDLES[token] = _Bundle(sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at) _BUNDLES[token] = _Bundle(
sh_path=sh_path, tgz_path=tgz_path, expires_at=expires_at, host_uuid=host_uuid,
)
_ensure_sweeper() _ensure_sweeper()
log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8]) log.info("enroll-bundle created agent=%s master=%s token=%s...", req.agent_name, req.master_host, token[:8])
@@ -380,14 +383,30 @@ async def get_bootstrap(token: str) -> Response:
tags=["Swarm Management"], tags=["Swarm Management"],
include_in_schema=False, include_in_schema=False,
) )
async def get_payload(token: str) -> Response: async def get_payload(
token: str,
request: Request,
repo: BaseRepository = Depends(get_repo),
) -> Response:
async with _LOCK: async with _LOCK:
b = await _lookup_live(token) b = await _lookup_live(token)
b.served = True b.served = True
data = b.tgz_path.read_bytes() data = b.tgz_path.read_bytes()
host_uuid = b.host_uuid
for p in (b.sh_path, b.tgz_path): for p in (b.sh_path, b.tgz_path):
try: try:
p.unlink() p.unlink()
except FileNotFoundError: except FileNotFoundError:
pass pass
# The agent's first connect-back — its source IP is the reachable address
# the master will later use to probe it. Backfill the SwarmHost row here
# so the operator sees the real address instead of an empty placeholder.
client_host = request.client.host if request.client else ""
if client_host:
try:
await repo.update_swarm_host(host_uuid, {"address": client_host})
except Exception as e: # noqa: BLE001
log.warning("enroll-bundle could not backfill address host=%s err=%s", host_uuid, e)
return Response(content=data, media_type="application/gzip") return Response(content=data, media_type="application/gzip")

View File

@@ -14,7 +14,6 @@ interface BundleResult {
const AgentEnrollment: React.FC = () => { const AgentEnrollment: React.FC = () => {
const [masterHost, setMasterHost] = useState(window.location.hostname); const [masterHost, setMasterHost] = useState(window.location.hostname);
const [agentName, setAgentName] = useState(''); const [agentName, setAgentName] = useState('');
const [agentHost, setAgentHost] = useState('');
const [withUpdater, setWithUpdater] = useState(true); const [withUpdater, setWithUpdater] = useState(true);
const [servicesIni, setServicesIni] = useState<string | null>(null); const [servicesIni, setServicesIni] = useState<string | null>(null);
const [servicesIniName, setServicesIniName] = useState<string | null>(null); const [servicesIniName, setServicesIniName] = useState<string | null>(null);
@@ -49,7 +48,6 @@ const AgentEnrollment: React.FC = () => {
setResult(null); setResult(null);
setError(null); setError(null);
setAgentName(''); setAgentName('');
setAgentHost('');
setWithUpdater(true); setWithUpdater(true);
setServicesIni(null); setServicesIni(null);
setServicesIniName(null); setServicesIniName(null);
@@ -65,7 +63,6 @@ const AgentEnrollment: React.FC = () => {
const res = await api.post('/swarm/enroll-bundle', { const res = await api.post('/swarm/enroll-bundle', {
master_host: masterHost, master_host: masterHost,
agent_name: agentName, agent_name: agentName,
agent_host: agentHost,
with_updater: withUpdater, with_updater: withUpdater,
services_ini: servicesIni, services_ini: servicesIni,
}); });
@@ -112,16 +109,6 @@ const AgentEnrollment: React.FC = () => {
required required
/> />
</label> </label>
<label>
Agent host (IP or DNS of the new worker VM)
<input
type="text"
value={agentHost}
onChange={(e) => setAgentHost(e.target.value)}
placeholder="e.g. 192.168.1.23"
required
/>
</label>
<label> <label>
Agent name (lowercase, digits, dashes) Agent name (lowercase, digits, dashes)
<input <input
@@ -152,7 +139,7 @@ const AgentEnrollment: React.FC = () => {
<button <button
type="submit" type="submit"
className="control-btn primary" className="control-btn primary"
disabled={submitting || !nameOk || !masterHost || !agentHost} disabled={submitting || !nameOk || !masterHost}
> >
{submitting ? 'Generating…' : 'Generate enrollment bundle'} {submitting ? 'Generating…' : 'Generate enrollment bundle'}
</button> </button>

View File

@@ -92,7 +92,7 @@ const SwarmHosts: React.FC = () => {
{h.status === 'active' ? <Wifi size={16} /> : <WifiOff size={16} />} {h.status} {h.status === 'active' ? <Wifi size={16} /> : <WifiOff size={16} />} {h.status}
</td> </td>
<td>{h.name}</td> <td>{h.name}</td>
<td>{h.address}:{h.agent_port}</td> <td>{h.address ? `${h.address}:${h.agent_port}` : <em>pending first connect</em>}</td>
<td>{h.last_heartbeat ? new Date(h.last_heartbeat).toLocaleString() : '—'}</td> <td>{h.last_heartbeat ? new Date(h.last_heartbeat).toLocaleString() : '—'}</td>
<td title={h.client_cert_fingerprint}><code>{shortFp(h.client_cert_fingerprint)}</code></td> <td title={h.client_cert_fingerprint}><code>{shortFp(h.client_cert_fingerprint)}</code></td>
<td>{new Date(h.enrolled_at).toLocaleString()}</td> <td>{new Date(h.enrolled_at).toLocaleString()}</td>

View File

@@ -33,7 +33,6 @@ async def _post(client, auth_token, **overrides):
body = { body = {
"master_host": "10.0.0.50", "master_host": "10.0.0.50",
"agent_name": "worker-a", "agent_name": "worker-a",
"agent_host": "10.0.0.100",
"with_updater": True, "with_updater": True,
} }
body.update(overrides) body.update(overrides)
@@ -97,19 +96,31 @@ async def test_non_admin_forbidden(client, viewer_token):
async def test_no_auth_401(client): async def test_no_auth_401(client):
resp = await client.post( resp = await client.post(
"/api/v1/swarm/enroll-bundle", "/api/v1/swarm/enroll-bundle",
json={"master_host": "10.0.0.50", "agent_name": "worker-a", "agent_host": "10.0.0.100"}, json={"master_host": "10.0.0.50", "agent_name": "worker-a"},
) )
assert resp.status_code == 401 assert resp.status_code == 401
@pytest.mark.anyio @pytest.mark.anyio
async def test_host_row_uses_agent_host_not_master_host(client, auth_token): async def test_host_row_address_backfilled_from_tgz_source_ip(client, auth_token):
"""SwarmHosts table should show the worker's own address, not the master's.""" """SwarmHosts.address starts blank at enroll time and is populated from
the agent's source IP when it curls the .tgz."""
from decnet.web.dependencies import repo from decnet.web.dependencies import repo
resp = await _post(client, auth_token, agent_name="addr-test", resp = await _post(client, auth_token, agent_name="addr-test",
master_host="192.168.1.5", agent_host="192.168.1.23") master_host="192.168.1.5")
row = await repo.get_swarm_host_by_uuid(resp.json()["host_uuid"]) host_uuid = resp.json()["host_uuid"]
assert row["address"] == "192.168.1.23" token = resp.json()["token"]
row = await repo.get_swarm_host_by_uuid(host_uuid)
assert row["address"] == "" # placeholder until first tgz fetch
tgz = await client.get(f"/api/v1/swarm/enroll-bundle/{token}.tgz")
assert tgz.status_code == 200
row = await repo.get_swarm_host_by_uuid(host_uuid)
# The TestClient client.host depends on httpx's ASGITransport — any
# non-empty value proves the backfill path ran.
assert row["address"] != ""
@pytest.mark.anyio @pytest.mark.anyio