Stage 5 — make the cloud composition spin up in one command and add
the SensorThings (FROST) stack as a fully segregated tenant.
cloud/deploy.sh — idempotent, 7-step bring-up:
preflight → validate → up + wait → cert state → issue/renew →
service status → endpoint smoke test. Reissues the LE cert only when
the current cert is the self-signed bootstrap dummy or its issuer no
longer matches ACME_CA_URI. The bootstrap cert is moved aside before
issuance and restored on failure, so it survives a failed certbot run.
stacks/frost — new stack, segregated from shared sql/rabbitmq:
- dedicated postgis container (frost-db)
- dedicated internal mosquitto bus (frost-mosquitto)
- frost-http + frost-mqtt on a private frost-internal network,
joined to cloud-app only for nginx ingress at frost.wbd-rd.nl
- shared mosquitto stack deleted; rabbitmq remains the only public
MQTT broker (mqtt.wbd-rd.nl:8883 via stream proxy)
stacks/sql — pg_isready healthcheck so keycloak/gitea/mlflow can gate
on service_healthy via cloud-level depends_on overrides.
stacks/nginx-proxy:
- nginx-init service generates a self-signed bootstrap cert on
fresh deploy so nginx starts before certbot has issued a real one
- frost.wbd-rd.nl vhost (/FROST-Server → frost-http:8080,
/mqtt → frost-mqtt:9876 WebSocket)
stacks/mlflow — custom Dockerfile (upstream + psycopg2-binary) so the
official image can speak to the shared sql backend.
stacks/jupyterhub — DummyAuthenticator stub gated by
JUPYTERHUB_ADMIN_PASSWORD; TODO comments point at OIDC + DockerSpawner.
stacks/rabbitmq — config/{enabled_plugins,rabbitmq.conf} stubs
(management + mqtt plugins, MQTT auth required).
stacks/portainer — ports unpublished; nginx now the only ingress.
stacks/node-red — pin to 4.1 (the floating "4" tag does not exist).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
249 lines
8.9 KiB
Bash
249 lines
8.9 KiB
Bash
#!/usr/bin/env bash
# cloud/deploy.sh — one-shot bring-up for the cloud composition.
#
# Idempotent and safe to rerun. The Let's Encrypt cert is reissued only when:
#   - the current cert is the self-signed bootstrap dummy, or
#   - .env's ACME_CA_URI no longer matches the issuer of the current cert
#     (e.g. you flipped staging → prod).
#
# Usage:
#   cd cloud && ./deploy.sh

# Strict mode: abort on errors, on unset variables and on failures inside pipelines.
set -o errexit -o nounset -o pipefail

# Work from the directory that holds this script, whatever the caller's cwd was.
here=$(dirname "$0")
cd "$here"
# ---------- UI ----------
# Colour codes default to empty strings (plain output for pipes and log files)
# and are switched to ANSI escapes only when stdout is a terminal.
B=""; G=""; Y=""; R=""; D=""; N=""
if [ -t 1 ]; then
  B=$'\e[34m'   # blue   — step headers
  G=$'\e[32m'   # green  — success
  Y=$'\e[33m'   # yellow — warnings
  R=$'\e[31m'   # red    — failures
  D=$'\e[2m'    # dim    — progress notes
  N=$'\e[0m'    # reset
fi
# Progress counters: STEP is bumped by step(); TOTAL is the number of stages.
STEP=0
TOTAL=7

# step MSG... — advance the step counter and print a "[n/total] MSG" header.
step() {
  STEP=$((STEP + 1))
  printf '\n%s[%d/%d]%s %s\n' "$B" "$STEP" "$TOTAL" "$N" "$*"
}

# ok MSG... — report a successful check.
ok() {
  printf ' %s[OK]%s %s\n' "$G" "$N" "$*"
}

# info MSG... — report work in progress.
info() {
  printf ' %s...%s %s\n' "$D" "$N" "$*"
}

# warn MSG... — report a non-fatal problem.
warn() {
  printf ' %s[!]%s %s\n' "$Y" "$N" "$*"
}

# fail MSG... — report a failure without exiting.
fail() {
  printf ' %s[X]%s %s\n' "$R" "$N" "$*"
}

# die MSG... — report a failure and exit with status 1.
die() {
  fail "$*"
  exit 1
}
# On any non-zero exit, print a banner naming the exit code and the step that was running.
trap 'rc=$?; if [ "$rc" -ne 0 ]; then printf "\n${R}DEPLOY FAILED${N} (exit $rc) at step $STEP/$TOTAL\n"; fi' EXIT
# Every subdomain that goes on the SAN cert; keep this list in lock-step with
# the nginx-proxy vhosts.
HOSTS=(
  git.wbd-rd.nl
  auth.wbd-rd.nl
  dash.wbd-rd.nl
  flow.wbd-rd.nl
  ml.wbd-rd.nl
  hub.wbd-rd.nl
  ops.wbd-rd.nl
  mq.wbd-rd.nl
  ci.wbd-rd.nl
  mqtt.wbd-rd.nl
  portainer.wbd-rd.nl
  frost.wbd-rd.nl
)
# ---------- 1. Preflight ----------
# Fail fast, before touching any container, if .env or a required tool is missing.
step "Preflight"

[ -f .env ] || die ".env missing (cp .env.example .env and fill secrets)"
ok ".env present"

command -v docker >/dev/null || die "docker not installed"
docker compose version >/dev/null 2>&1 || die "docker compose plugin missing"
# curl drives the nginx readiness probe and the endpoint smoke test further
# down. Both tolerate curl errors, so without this check a missing curl would
# only show up later as a misleading "unreachable" failure.
command -v curl >/dev/null || die "curl not installed"
ok "docker $(docker --version | awk '{print $3}' | tr -d ,)"
ok "docker compose $(docker compose version --short)"
# Load .env into this shell so the checks below (and later steps) can read it.
# allexport also exports every variable to child processes of this script;
# nothing reaches the calling shell because the script runs as its own process.
set -o allexport
source ./.env
set +o allexport

# Variables that must be non-empty before anything is deployed.
REQUIRED=(
  LETSENCRYPT_EMAIL
  ACME_CA_URI
  KEYCLOAK_ADMIN_PASSWORD
  KEYCLOAK_DB_PASSWORD
  SQL_PASSWORD
  GITEA_DB_PASSWORD
  GITEA_OAUTH_CLIENT_SECRET
  GRAFANA_ADMIN_PASSWORD
  INFLUX_ADMIN_PASSWORD
  INFLUX_ADMIN_TOKEN
  RABBITMQ_PASSWORD
  JENKINS_ADMIN_PASSWORD
  MLFLOW_DB_PASSWORD
  JUPYTERHUB_ADMIN_PASSWORD
  FROST_DB_PASSWORD
  WG_SERVER_PUBLIC_HOST
)

# Report every empty variable before giving up, so one run lists them all.
missing=0
for name in "${REQUIRED[@]}"; do
  # ${!name:-} is an indirect lookup that stays safe under `set -u`.
  [ -n "${!name:-}" ] && continue
  warn "\$$name is empty in .env"
  missing=$((missing + 1))
done

if [ "$missing" -ne 0 ]; then
  die "$missing required env var(s) empty"
fi
ok "required env vars present"
# ---------- 2. Validate compose ----------
# Have compose parse the merged configuration before anything is started.
step "Validate compose"

if ! docker compose config --quiet; then
  die "docker compose config invalid"
fi

# One service name per line, so the line count is the service count.
services_total=$(docker compose config --services | wc -l)
ok "compose valid, $services_total services defined"
# ---------- 3. Bring up containers ----------
step "Bring up containers (docker compose up -d)"
docker compose up -d --remove-orphans
ok "containers requested"

# Wait for postgres healthy (longest dep — gates keycloak/gitea/mlflow)
info "waiting for sql to become healthy ..."
sql_cid=$(docker compose ps -q sql)
if [ -z "$sql_cid" ]; then
  die "sql container not found"
fi

# Up to 60 probes, 2 s apart while the container is still starting.
for ((attempt = 1; attempt <= 60; attempt++)); do
  # Prints the health status, or "none" when the container has no healthcheck;
  # an inspect failure leaves the value empty and is retried like "starting".
  health=$(docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$sql_cid" 2>/dev/null || true)

  if [ "$health" = "healthy" ]; then
    ok "sql healthy (after ${attempt} probe(s))"
    break
  elif [ "$health" = "none" ]; then
    warn "sql has no healthcheck — proceeding anyway"
    break
  elif [ "$health" = "unhealthy" ]; then
    die "sql reports unhealthy — check 'docker compose logs sql'"
  elif [ "$health" = "starting" ] || [ -z "$health" ]; then
    sleep 2
  fi

  if [ "$attempt" -eq 60 ]; then
    die "sql did not become healthy within 120s"
  fi
done
# Wait for nginx accepting on :80 (nginx-init must have produced the bootstrap cert)
info "waiting for nginx :80 ..."
for ((i = 1; i <= 30; i++)); do
  # On a connect failure curl still prints "000" through -w and then exits
  # non-zero, so an `|| echo 000` fallback would produce "000000" and make the
  # probe look successful. Ignore curl's exit status and normalise anything
  # that is not a 3-digit code (including empty output) to 000.
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 -H "Host: ping" http://127.0.0.1/ 2>/dev/null) || true
  case "$code" in
    [0-9][0-9][0-9]) ;;
    *) code=000 ;;
  esac

  if [ "$code" != "000" ]; then
    ok "nginx :80 responding (HTTP $code) after ${i} probe(s)"
    break
  fi

  sleep 2
  if [ "$i" -eq 30 ]; then
    die "nginx :80 unreachable — check 'docker compose logs nginx-init nginx'"
  fi
done
# ---------- 4. Detect cert state ----------
step "Inspect TLS cert state"

CERT_PATH=/etc/letsencrypt/live/infra/fullchain.pem

# Print one x509 field of the current cert (pass -subject or -issuer), or an
# empty line when it cannot be read.
# nginx:1.27-alpine doesn't ship openssl; the certbot image does.
read_cert_field() {
  docker compose run --rm --entrypoint openssl certbot \
    x509 -in "$CERT_PATH" -noout "$1" 2>/dev/null || echo ""
}

cert_subj=$(read_cert_field -subject)
cert_iss=$(read_cert_field -issuer)

# Which CA .env asks for: any URI containing "acme-staging" is staging,
# everything else is treated as production.
if [[ "$ACME_CA_URI" == *acme-staging* ]]; then
  want_ca=STAGING
else
  want_ca=PROD
fi

# Which CA issued the cert on disk. The bootstrap dummy is recognised by its
# subject; otherwise the issuer string decides, with staging checked first.
if [[ "$cert_subj" == *bootstrap-infra* ]]; then
  cur_ca=SELFSIGNED
elif [[ "$cert_iss" == *STAGING* || "$cert_iss" == *Fake* || "$cert_iss" == *staging* ]]; then
  cur_ca=STAGING
elif [[ "$cert_iss" == *Encrypt* || "$cert_iss" == *ISRG* \
     || "$cert_iss" == *" R3"* || "$cert_iss" == *" R10"* || "$cert_iss" == *" R11"* \
     || "$cert_iss" == *" E1"* || "$cert_iss" == *" E5"* || "$cert_iss" == *" E6"* ]]; then
  cur_ca=PROD
else
  cur_ca=UNKNOWN
fi
ok "current cert: $cur_ca / target: $want_ca"
# Pick the issuance action for the next step:
#   none    — the cert on disk already comes from the wanted CA
#   initial — no usable certbot lineage yet (bootstrap dummy or unreadable cert)
#   renew   — a lineage exists but was issued by the other CA
case "$cur_ca" in
  "$want_ca")
    action="none"
    reason=""
    ;;
  SELFSIGNED | UNKNOWN)
    action="initial"
    reason="bootstrap → $want_ca"
    ;;
  *)
    action="renew"
    reason="$cur_ca → $want_ca"
    ;;
esac
# ---------- 5. Issue / renew cert ----------
step "Cert issuance"

# le_sh SCRIPT — run a POSIX sh snippet in a throwaway certbot container and
# discard its stdout. The snippets below operate on /etc/letsencrypt inside
# that container.
le_sh() {
  docker compose run --rm --entrypoint sh certbot -c "$1" >/dev/null
}

# Move the current 'infra' cert files (live/, archive/, renewal conf) into
# /etc/letsencrypt/_backup so certbot can create a fresh lineage. Any failing
# command, including mv, makes this function return non-zero.
stash_bootstrap_cert() {
  le_sh '
    set -e
    mkdir -p /etc/letsencrypt/_backup
    rm -rf /etc/letsencrypt/_backup/*
    for p in live/infra archive/infra renewal/infra.conf; do
      if [ -e "/etc/letsencrypt/$p" ]; then
        mv "/etc/letsencrypt/$p" "/etc/letsencrypt/_backup/$(echo "$p" | tr / -)"
      fi
    done
  '
}

# Move everything in /etc/letsencrypt/_backup back to where it came from.
# Backup names encode the path with the first "/" replaced by "-".
restore_bootstrap_cert() {
  le_sh '
    for f in /etc/letsencrypt/_backup/*; do
      [ -e "$f" ] || continue
      dest=/etc/letsencrypt/$(basename "$f" | sed "s/-/\//")
      mkdir -p "$(dirname "$dest")"
      mv "$f" "$dest"
    done
    rmdir /etc/letsencrypt/_backup 2>/dev/null || true
  '
}

if [ "$action" = "none" ]; then
  ok "no issuance needed (cert matches ACME_CA_URI)"
else
  warn "$reason"

  # The certbot command line is built as an array so optional flags and host
  # names never depend on unquoted word-splitting.
  certbot_args=(
    certonly --webroot -w /var/www/certbot
    --server "$ACME_CA_URI"
    --email "$LETSENCRYPT_EMAIL" --agree-tos --no-eff-email
    --cert-name infra --non-interactive --keep-until-expiring
  )

  # For 'initial': move the bootstrap dummy aside into a backup location so
  # certbot can create a fresh lineage. It is restored if anything fails, so
  # nginx always has *some* cert available on its next restart.
  if [ "$action" = "initial" ]; then
    info "moving bootstrap cert aside before issuance ..."
    if ! stash_bootstrap_cert; then
      # A partial move would leave nginx without a cert; put back what was moved.
      warn "restoring bootstrap cert after failed move ..."
      restore_bootstrap_cert || warn "could not restore bootstrap cert from /etc/letsencrypt/_backup"
      die "could not move bootstrap cert aside"
    fi
  else
    certbot_args+=(--force-renewal)
  fi

  for h in "${HOSTS[@]}"; do
    certbot_args+=(-d "$h")
  done

  if docker compose run --rm --entrypoint certbot certbot "${certbot_args[@]}"; then
    ok "cert issued by $want_ca CA"

    # Issuance OK: discard the backup. A leftover backup is harmless, so a
    # failure here must not stop nginx from picking up the new cert.
    if [ "$action" = "initial" ]; then
      le_sh "rm -rf /etc/letsencrypt/_backup" \
        || warn "could not remove /etc/letsencrypt/_backup"
    fi

    docker compose exec -T nginx nginx -s reload
    ok "nginx reloaded with new cert"
  else
    # Restore the backup so nginx still has a working cert next time it restarts.
    if [ "$action" = "initial" ]; then
      warn "restoring bootstrap cert after failed issuance ..."
      restore_bootstrap_cert || warn "could not restore bootstrap cert from /etc/letsencrypt/_backup"
    fi
    die "certbot failed — DNS A records pointing at this host?"
  fi
fi
# ---------- 6. Service status ----------
step "Service status"

# count_running_containers — read "STATE STATUS..." lines on stdin and set the
# globals `total` (lines seen) and `running` (containers that are up).
# A container counts as running when its state is exactly "running" and its
# status text does not carry the "(unhealthy)" marker. Matching on the status
# text alone would miss containers without a healthcheck ("Up 2 minutes")
# and would count "(unhealthy)" as healthy.
count_running_containers() {
  local state status
  running=0
  total=0
  while read -r state status; do
    [ -n "$state" ] || continue
    total=$((total + 1))
    if [ "$state" = "running" ]; then
      case "$status" in
        *"(unhealthy)"*) ;;
        *) running=$((running + 1)) ;;
      esac
    fi
  done
}

# NOTE(review): without --all, `docker compose ps` is documented to list only
# running containers, so `total` will not include exited ones — confirm that
# this is the intended denominator.
count_running_containers < <(docker compose ps --format '{{.State}} {{.Status}}')

docker compose ps --format 'table {{.Name}}\t{{.Status}}' | sed 's/^/ /'
ok "$running/$total containers running"
# ---------- 7. Endpoint smoke test ----------
step "Endpoint smoke test (loopback)"

# probe_https_code HOST — print the HTTP status returned for https://HOST/
# with HOST:443 pinned to 127.0.0.1, or 000 when no response was received.
# On a connect failure curl already prints "000" through -w before exiting
# non-zero, so appending a fallback "000" would give "000000" and the host
# would be counted as reachable. curl's exit status is ignored and anything
# that is not a 3-digit code (including empty output) is normalised to 000.
probe_https_code() {
  local host=$1
  local code
  code=$(curl -sk -o /dev/null -w '%{http_code}' --max-time 5 \
    --resolve "$host:443:127.0.0.1" "https://$host/" 2>/dev/null) || true
  case "$code" in
    [0-9][0-9][0-9]) ;;
    *) code=000 ;;
  esac
  printf '%s\n' "$code"
}

reachable=0; unreachable=0
for h in "${HOSTS[@]}"; do
  code=$(probe_https_code "$h")
  # Any HTTP answer, even 4xx/5xx, shows the vhost is wired up; only 000 is a miss.
  case "$code" in
    2*|3*) ok "$h → HTTP $code"; reachable=$((reachable+1));;
    4*) ok "$h → HTTP $code (auth gate — vhost OK)"; reachable=$((reachable+1));;
    5*) warn "$h → HTTP $code (vhost OK, upstream not ready)"; reachable=$((reachable+1));;
    000) fail "$h → unreachable"; unreachable=$((unreachable+1));;
    *) warn "$h → HTTP $code"; reachable=$((reachable+1));;
  esac
done
# ---------- Summary ----------
# One closing line: green when every endpoint answered and every listed
# container is running, yellow otherwise. The script exits 0 in both cases.
echo
if [ "$unreachable" -eq 0 ] && [ "$running" -eq "$total" ]; then
  printf '%sDEPLOY OK%s — %s/%s containers, %s/%s endpoints reachable, cert: %s\n' \
    "$G" "$N" "$running" "$total" "$reachable" "${#HOSTS[@]}" "$want_ca"
else
  printf '%sDEPLOY COMPLETED WITH WARNINGS%s — %s/%s containers, %s unreachable endpoint(s)\n' \
    "$Y" "$N" "$running" "$total" "$unreachable"
fi

# After a staging run, remind the operator how to move on to production.
if [ "$want_ca" = "STAGING" ]; then
  printf '\n%sNext: when staging looks right, flip ACME_CA_URI to the prod URL in .env and rerun this script.%s\n' \
    "$D" "$N"
fi

# Reached the end: remove the failure banner trap before exiting.
trap - EXIT