feat(cloud): single-shot deploy.sh + FROST stack + healthchecks
Stage 5 — make the cloud composition spin up in one command and add
the SensorThings (FROST) stack as a fully segregated tenant.
cloud/deploy.sh — idempotent, 7-step bring-up:
preflight → validate → up + wait → cert state → issue/renew →
service status → endpoint smoke test. Reissues LE cert only when
current issuer no longer matches ACME_CA_URI. Move-aside-then-
restore-on-failure so the bootstrap cert survives a failed certbot.
stacks/frost — new stack, segregated from shared sql/rabbitmq:
- dedicated postgis container (frost-db)
- dedicated internal mosquitto bus (frost-mosquitto)
- frost-http + frost-mqtt on a private frost-internal network,
joined to cloud-app only for nginx ingress at frost.wbd-rd.nl
- shared mosquitto stack deleted; rabbitmq remains the only public
MQTT broker (mqtt.wbd-rd.nl:8883 via stream proxy)
stacks/sql — pg_isready healthcheck so keycloak/gitea/mlflow can gate
on service_healthy via cloud-level depends_on overrides.
stacks/nginx-proxy:
- nginx-init service generates a self-signed bootstrap cert on
fresh deploy so nginx starts before certbot has issued a real one
- frost.wbd-rd.nl vhost (/FROST-Server → frost-http:8080,
/mqtt → frost-mqtt:9876 WebSocket)
stacks/mlflow — custom Dockerfile (upstream + psycopg2-binary) so the
official image can speak to the shared sql backend.
stacks/jupyterhub — DummyAuthenticator stub gated by
JUPYTERHUB_ADMIN_PASSWORD; TODO comments point at OIDC + DockerSpawner.
stacks/rabbitmq — config/{enabled_plugins,rabbitmq.conf} stubs
(management + mqtt plugins, MQTT auth required).
stacks/portainer — ports unpublished; nginx now the only ingress.
stacks/node-red — pin to 4.1 (the floating "4" tag does not exist).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
248
cloud/deploy.sh
Normal file
248
cloud/deploy.sh
Normal file
@@ -0,0 +1,248 @@
|
||||
#!/usr/bin/env bash
|
||||
# cloud/deploy.sh — one-shot bring-up for the cloud composition.
|
||||
#
|
||||
# Idempotent. Safe to rerun. Will reissue the Let's Encrypt cert only when:
|
||||
# - the current cert is the self-signed bootstrap dummy, or
|
||||
# - .env's ACME_CA_URI no longer matches the issuer of the current cert
|
||||
# (e.g. you flipped staging → prod).
|
||||
#
|
||||
# Usage:
|
||||
# cd cloud && ./deploy.sh
|
||||
|
||||
# Strict mode: stop on any unhandled command failure, on use of an unset
# variable, and when any stage of a pipeline fails.
set -euo pipefail

# Run from the directory that contains this script so that ./.env and the
# compose project resolve regardless of the caller's working directory.
# `--` keeps a path beginning with "-" from being parsed as an option.
cd -- "$(dirname -- "$0")"
|
||||
|
||||
# ---------- UI ----------
# ANSI colour codes, enabled only when stdout is a terminal so that piped or
# logged output stays free of escape sequences.
#   B=blue  G=green  Y=yellow  R=red  D=dim  N=reset
if [ -t 1 ]; then
  B=$'\e[34m'; G=$'\e[32m'; Y=$'\e[33m'; R=$'\e[31m'; D=$'\e[2m'; N=$'\e[0m'
else
  B=""; G=""; Y=""; R=""; D=""; N=""
fi

# Progress counter: STEP is the number of the stage most recently announced,
# TOTAL the number of stages shown in the "[n/TOTAL]" banner.
STEP=0; TOTAL=7
|
||||
|
||||
# Announce the next stage: bump the global STEP counter, then print a blank
# line followed by a "[STEP/TOTAL] <title>" banner. All arguments form the title.
step() {
  STEP=$((STEP + 1))
  printf "\n${B}[%d/%d]${N} %s\n" "$STEP" "$TOTAL" "$*"
}
|
||||
# Print a green "[OK]" status line; all arguments are joined into the message.
ok() {
  local msg="$*"
  printf " ${G}[OK]${N} %s\n" "$msg"
}
|
||||
# Print a dimmed "..." progress line; all arguments are joined into the message.
info() {
  local msg="$*"
  printf " ${D}...${N} %s\n" "$msg"
}
|
||||
# Print a yellow "[!]" warning line; all arguments are joined into the message.
warn() {
  local msg="$*"
  printf " ${Y}[!]${N} %s\n" "$msg"
}
|
||||
# Print a red "[X]" failure line; all arguments are joined into the message.
# Only reports — it does not stop the script (see die for that).
fail() {
  local msg="$*"
  printf " ${R}[X]${N} %s\n" "$msg"
}
|
||||
# Report a fatal error through fail, then terminate the script with status 1.
die() {
  fail "$*"
  exit 1
}
|
||||
|
||||
# On any non-zero exit, print a red banner with the exit code and the step
# that was in progress. A zero exit prints nothing.
trap 'status=$?
if [ "$status" -ne 0 ]; then
  printf "\n${R}DEPLOY FAILED${N} (exit $status) at step $STEP/$TOTAL\n"
fi' EXIT
|
||||
|
||||
# Subdomains covered by the SAN cert (kept in lock-step with nginx-proxy vhosts).
# The same list drives certbot's -d arguments and the endpoint smoke test.
HOSTS=(
  git.wbd-rd.nl
  auth.wbd-rd.nl
  dash.wbd-rd.nl
  flow.wbd-rd.nl
  ml.wbd-rd.nl
  hub.wbd-rd.nl
  ops.wbd-rd.nl
  mq.wbd-rd.nl
  ci.wbd-rd.nl
  mqtt.wbd-rd.nl
  portainer.wbd-rd.nl
  frost.wbd-rd.nl
)
|
||||
|
||||
# ---------- 1. Preflight ----------
# Checks the prerequisites before anything is started: a .env file, a docker
# CLI with the compose plugin, and a non-empty value for every variable named
# in REQUIRED. Any failed check ends the script through die.
step "Preflight"

[ -f .env ] || die ".env missing (cp .env.example .env and fill secrets)"
ok ".env present"

command -v docker >/dev/null || die "docker not installed"
docker compose version >/dev/null 2>&1 || die "docker compose plugin missing"
ok "docker $(docker --version | awk '{print $3}' | tr -d ,)"
ok "docker compose $(docker compose version --short)"

# Source .env so the script can read its variables.
# NOTE: `set -a` marks every assignment for export, so these values (secrets
# included) ARE passed to the child processes started below (docker, curl, ...).
set -a
# shellcheck disable=SC1091  # .env is provided by the operator at deploy time
. ./.env
set +a

# Variables that must be non-empty after sourcing .env.
REQUIRED=(
  LETSENCRYPT_EMAIL ACME_CA_URI
  KEYCLOAK_ADMIN_PASSWORD KEYCLOAK_DB_PASSWORD
  SQL_PASSWORD
  GITEA_DB_PASSWORD GITEA_OAUTH_CLIENT_SECRET
  GRAFANA_ADMIN_PASSWORD
  INFLUX_ADMIN_PASSWORD INFLUX_ADMIN_TOKEN
  RABBITMQ_PASSWORD
  JENKINS_ADMIN_PASSWORD
  MLFLOW_DB_PASSWORD
  JUPYTERHUB_ADMIN_PASSWORD
  FROST_DB_PASSWORD
  WG_SERVER_PUBLIC_HOST
)

# Report every empty variable before failing, so one run lists them all.
missing=0
for v in "${REQUIRED[@]}"; do
  # ${!v:-} is indirect expansion: the value of the variable whose name is in v.
  if [ -z "${!v:-}" ]; then
    warn "\$$v is empty in .env"
    missing=$((missing + 1))
  fi
done
[ "$missing" -eq 0 ] || die "$missing required env var(s) empty"
ok "required env vars present"
|
||||
|
||||
# ---------- 2. Validate compose ----------
# Let docker compose parse and validate the composition, then report how many
# services it defines.
step "Validate compose"

docker compose config --quiet || die "docker compose config invalid"
services_total=$(docker compose config --services | wc -l)
ok "compose valid, $services_total services defined"
|
||||
|
||||
# ---------- 3. Bring up containers ----------
# Start (or reconcile) all services in the background; --remove-orphans drops
# containers for services that are no longer in the composition.
step "Bring up containers (docker compose up -d)"

docker compose up -d --remove-orphans
ok "containers requested"
|
||||
|
||||
# Wait for postgres healthy (longest dep — gates keycloak/gitea/mlflow).
# Polls the container's health status up to 60 times, 2 s apart (~120 s).
info "waiting for sql to become healthy ..."
sql_cid=$(docker compose ps -q sql)
[ -n "$sql_cid" ] || die "sql container not found"

for ((i = 1; i <= 60; i++)); do
  # Prints the health status, or "none" when the container has no healthcheck;
  # a failed inspect is treated as "not ready yet".
  state=$(docker inspect \
    --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' \
    "$sql_cid" 2>/dev/null) || state=""

  case "$state" in
    healthy)
      ok "sql healthy (after ${i} probe(s))"
      break
      ;;
    unhealthy)
      die "sql reports unhealthy — check 'docker compose logs sql'"
      ;;
    none)
      warn "sql has no healthcheck — proceeding anyway"
      break
      ;;
    *)
      # "starting", an empty result, or any unexpected value: pause and poll
      # again, so an unknown state cannot burn through all probes instantly.
      sleep 2
      ;;
  esac

  if [ "$i" -eq 60 ]; then
    die "sql did not become healthy within 120s"
  fi
done
|
||||
|
||||
# Wait for nginx accepting on :80 (nginx-init must have produced the bootstrap cert).
# Any HTTP status counts as "up"; only a missing response ("000") is retried.
# Polls up to 30 times, 2 s apart.
info "waiting for nginx :80 ..."
for ((i = 1; i <= 30; i++)); do
  # When the connection fails, curl still writes "000" for %{http_code} AND
  # exits non-zero. Appending a fallback with `|| echo 000` inside the command
  # substitution therefore produced "000000", which compared unequal to "000"
  # and was reported as a responding server. Capture first, then normalise.
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 \
    -H "Host: ping" http://127.0.0.1/ 2>/dev/null) || true
  [[ "$code" =~ ^[0-9]{3}$ ]] || code=000

  if [ "$code" != "000" ]; then
    ok "nginx :80 responding (HTTP $code) after ${i} probe(s)"
    break
  fi

  sleep 2
  if [ "$i" -eq 30 ]; then
    die "nginx :80 unreachable — check 'docker compose logs nginx-init nginx'"
  fi
done
|
||||
|
||||
# ---------- 4. Detect cert state ----------
# Reads the subject and issuer lines of the certificate currently installed
# at CERT_PATH. Either value is left empty when it cannot be read.
step "Inspect TLS cert state"

CERT_PATH=/etc/letsencrypt/live/infra/fullchain.pem

# nginx:1.27-alpine doesn't ship openssl; the certbot image does.
cert_subj=$(docker compose run --rm --entrypoint openssl certbot \
  x509 -in "$CERT_PATH" -noout -subject 2>/dev/null) || cert_subj=""
cert_iss=$(docker compose run --rm --entrypoint openssl certbot \
  x509 -in "$CERT_PATH" -noout -issuer 2>/dev/null) || cert_iss=""
|
||||
|
||||
# Map an ACME directory URL to the CA flavour it designates.
# Arguments: $1 - ACME directory URL
# Outputs:   STAGING when the URL contains "acme-staging", otherwise PROD
target_ca_for_uri() {
  case "$1" in
    *acme-staging*) printf 'STAGING\n' ;;
    *) printf 'PROD\n' ;;
  esac
}

# Classify a certificate from its openssl subject and issuer lines.
# Arguments: $1 - subject line, $2 - issuer line (either may be empty)
# Outputs:   SELFSIGNED | STAGING | PROD | UNKNOWN
classify_cert_ca() {
  local subject=$1 issuer=$2

  # The bootstrap dummy is recognised by its subject, whatever the issuer says.
  case "$subject" in
    *bootstrap-infra*)
      printf 'SELFSIGNED\n'
      return 0
      ;;
  esac

  # Staging markers are tested first: a staging issuer may also contain
  # "Encrypt" and would otherwise be taken for production.
  case "$issuer" in
    *STAGING*|*Fake*|*staging*) printf 'STAGING\n' ;;
    *Encrypt*|*ISRG*|*\ R3*|*\ R10*|*\ R11*|*\ E1*|*\ E5*|*\ E6*) printf 'PROD\n' ;;
    *) printf 'UNKNOWN\n' ;;
  esac
}

# want_ca: the CA selected by .env; cur_ca: the CA behind the installed cert.
want_ca=$(target_ca_for_uri "$ACME_CA_URI")
cur_ca=$(classify_cert_ca "$cert_subj" "$cert_iss")
ok "current cert: $cur_ca / target: $want_ca"
|
||||
|
||||
# Decide what to do: 'none', 'initial' (no certbot lineage yet), or 'renew'
# (lineage exists but wrong CA).
# Arguments: $1 - current CA class, $2 - wanted CA class
# Globals:   action (written), reason (written; empty when action is 'none')
plan_cert_action() {
  local cur=$1 want=$2

  if [ "$cur" = "$want" ]; then
    action="none"
    reason=""
    return 0
  fi

  case "$cur" in
    SELFSIGNED|UNKNOWN)
      action="initial"
      reason="bootstrap → $want"
      ;;
    *)
      action="renew"
      reason="$cur → $want"
      ;;
  esac
}

plan_cert_action "$cur_ca" "$want_ca"
|
||||
|
||||
# ---------- 5. Issue / renew cert ----------
# Acts on the `action` chosen earlier:
#   none    - nothing to do
#   initial - move the existing files aside, then request a fresh certificate
#   renew   - request a replacement for the existing lineage (--force-renewal)
# After a successful issuance nginx is reloaded; on failure the script dies.
step "Cert issuance"

if [ "$action" = "none" ]; then
  ok "no issuance needed (cert matches ACME_CA_URI)"
else
  warn "$reason"

  # One "-d <host>" pair per subdomain on the SAN cert.
  d_args=()
  for h in "${HOSTS[@]}"; do
    d_args+=(-d "$h")
  done

  # For 'initial': move the bootstrap dummy aside into a backup location so certbot
  # can create a fresh lineage. Restore from backup if certbot fails so nginx
  # always has *some* cert available on the next restart.
  if [ "$action" = "initial" ]; then
    info "moving bootstrap cert aside before issuance ..."
    # Backup entries are named after their relative path with "/" turned into
    # "-" (live/infra -> live-infra); the restore step reverses that mapping.
    docker compose run --rm --entrypoint sh certbot -c '
      set -e
      mkdir -p /etc/letsencrypt/_backup
      rm -rf /etc/letsencrypt/_backup/*
      for p in live/infra archive/infra renewal/infra.conf; do
        [ -e "/etc/letsencrypt/$p" ] && mv "/etc/letsencrypt/$p" "/etc/letsencrypt/_backup/$(echo $p | tr / -)" || true
      done
    ' >/dev/null
    force_flag=""
  else
    force_flag="--force-renewal"
  fi

  # $force_flag is deliberately unquoted: when empty it must expand to no
  # argument at all rather than to an empty string.
  # shellcheck disable=SC2086
  if docker compose run --rm --entrypoint certbot certbot \
    certonly --webroot -w /var/www/certbot \
    --server "$ACME_CA_URI" \
    --email "$LETSENCRYPT_EMAIL" --agree-tos --no-eff-email \
    --cert-name infra --non-interactive --keep-until-expiring \
    $force_flag \
    "${d_args[@]}"; then
    ok "cert issued by $want_ca CA"

    # Issuance OK: discard backup
    if [ "$action" = "initial" ]; then
      docker compose run --rm --entrypoint sh certbot -c \
        "rm -rf /etc/letsencrypt/_backup" >/dev/null
    fi

    docker compose exec -T nginx nginx -s reload
    ok "nginx reloaded with new cert"
  else
    # Restore backup so nginx still has a working cert next time it restarts
    if [ "$action" = "initial" ]; then
      warn "restoring bootstrap cert after failed issuance ..."
      # A failed restore must not abort here under `set -e`: the certbot
      # diagnostic below should still be printed.
      docker compose run --rm --entrypoint sh certbot -c '
        for f in /etc/letsencrypt/_backup/*; do
          [ -e "$f" ] || continue
          dest=/etc/letsencrypt/$(basename "$f" | sed "s/-/\//")
          mkdir -p "$(dirname "$dest")"
          mv "$f" "$dest"
        done
        rmdir /etc/letsencrypt/_backup 2>/dev/null || true
      ' >/dev/null || warn "bootstrap cert could not be restored automatically"
    fi
    die "certbot failed — DNS A records pointing at this host?"
  fi
fi
|
||||
|
||||
# ---------- 6. Service status ----------

# Decide whether one "<name> <status>" line from `docker compose ps` describes
# a container that is up.
# Arguments: $1 - the line
# Returns:   0 when up, 1 otherwise
container_line_is_up() {
  case "$1" in
    # Must be tested first: "(unhealthy)" contains the substring "healthy".
    *unhealthy*) return 1 ;;
    # "running"/"healthy" are kept from the original check; " Up " covers the
    # docker-style status ("Up 2 minutes") shown for containers that have no
    # healthcheck, which the original patterns never matched.
    *running*|*healthy*|*' Up '*) return 0 ;;
    *) return 1 ;;
  esac
}

step "Service status"

# Tally the listed containers (total) and those that are up (running); both
# globals are read again by the final summary.
running=0; total=0
while IFS= read -r line; do
  total=$((total + 1))
  if container_line_is_up "$line"; then
    running=$((running + 1))
  fi
done < <(docker compose ps --format '{{.Name}} {{.Status}}')

docker compose ps --format 'table {{.Name}}\t{{.Status}}' | sed 's/^/ /'
ok "$running/$total containers running"
|
||||
|
||||
# ---------- 7. Endpoint smoke test ----------

# Probe https://<host>/ for every entry in HOSTS, with the name pinned to
# 127.0.0.1 and certificate verification off (-k).
# Any HTTP status, 4xx/5xx included, shows that nginx served the vhost; only a
# missing response counts as unreachable.
# Globals: HOSTS (read), reachable (written), unreachable (written)
smoke_test_hosts() {
  local h code
  reachable=0; unreachable=0

  for h in "${HOSTS[@]}"; do
    # curl writes "000" for %{http_code} AND exits non-zero when it gets no
    # response. The former `|| echo 000` inside the substitution yielded
    # "000000", which fell into the catch-all arm and was counted as
    # reachable. Capture first, then normalise anything not a 3-digit code.
    code=$(curl -sk -o /dev/null -w '%{http_code}' --max-time 5 \
      --resolve "$h:443:127.0.0.1" "https://$h/" 2>/dev/null) || true
    [[ "$code" =~ ^[0-9]{3}$ ]] || code=000

    case "$code" in
      000)
        fail "$h → unreachable"
        unreachable=$((unreachable + 1))
        ;;
      2*|3*)
        ok "$h → HTTP $code"
        reachable=$((reachable + 1))
        ;;
      4*)
        ok "$h → HTTP $code (auth gate — vhost OK)"
        reachable=$((reachable + 1))
        ;;
      5*)
        warn "$h → HTTP $code (vhost OK, upstream not ready)"
        reachable=$((reachable + 1))
        ;;
      *)
        warn "$h → HTTP $code"
        reachable=$((reachable + 1))
        ;;
    esac
  done
}

step "Endpoint smoke test (loopback)"
smoke_test_hosts
|
||||
|
||||
# ---------- Summary ----------
# Prints the overall verdict: "DEPLOY OK" only when every endpoint answered and
# every listed container counted as running, otherwise a warning summary.
# NOTE(review): the warning path does not exit non-zero in the lines visible
# here — confirm that is intended for callers that check the exit status.
echo
if [ "$unreachable" -eq 0 ] && [ "$running" -eq "$total" ]; then
  printf "${G}DEPLOY OK${N} — $running/$total containers, $reachable/${#HOSTS[@]} endpoints reachable, cert: $want_ca\n"
else
  printf "${Y}DEPLOY COMPLETED WITH WARNINGS${N} — $running/$total containers, $unreachable unreachable endpoint(s)\n"
fi

# Reminder shown while .env still targets the staging CA.
if [ "$want_ca" = "STAGING" ]; then
  printf "\n${D}Next: when staging looks right, flip ACME_CA_URI to the prod URL in .env and rerun this script.${N}\n"
fi

# Normal completion: remove the EXIT handler installed for failure reporting.
trap - EXIT
|
||||
Reference in New Issue
Block a user