Files
infra/cloud/deploy.sh
znetsixe f69453df99 refactor(dns): rename frost.wbd-rd.nl → sta.wbd-rd.nl; drop redundant portainer.wbd-rd.nl
Match the short-functional naming convention used by the other vhosts
(git, auth, dash, flow, ml, hub, ops, mq, ci, mqtt). FROST implements
OGC SensorThings API, so `sta` is the natural fit.

portainer.wbd-rd.nl is dropped from deploy.sh HOSTS — there is no
nginx vhost for it; portainer is already served via ops.wbd-rd.nl.

DNS prereq for first deploy is now: create one new A record for
sta.wbd-rd.nl → cloud public IP. All other short subdomains already
point correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-21 16:46:32 +02:00

249 lines
8.8 KiB
Bash

#!/usr/bin/env bash
# cloud/deploy.sh — one-shot bring-up for the cloud composition.
#
# Idempotent and safe to rerun. The Let's Encrypt cert is reissued only when:
#   - the current cert is the self-signed bootstrap dummy, or
#   - the ACME_CA_URI in .env no longer matches the issuer of the current
#     cert (e.g. after flipping staging → prod).
#
# Usage:
#   cd cloud && ./deploy.sh

# Strict mode: abort on command failure, on unset variables, and when any
# stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Work from the directory containing this script so relative paths such as
# .env resolve no matter where the script was invoked from.
cd "$(dirname "$0")"
# ---------- UI ----------
# ANSI colour codes are only set when stdout is a terminal; otherwise they are
# empty strings so redirected output carries no escape sequences.
if [ -t 1 ]; then
  B=$'\e[34m'
  G=$'\e[32m'
  Y=$'\e[33m'
  R=$'\e[31m'
  D=$'\e[2m'
  N=$'\e[0m'
else
  B="" G="" Y="" R="" D="" N=""
fi

# Progress counter: STEP is advanced by step(); TOTAL is the number of steps.
STEP=0
TOTAL=7

# step <title...> — bump the counter and print a "[n/TOTAL] title" header.
step() {
  STEP=$((STEP + 1))
  printf '\n%s[%d/%d]%s %s\n' "$B" "$STEP" "$TOTAL" "$N" "$*"
}

# ok <msg...> — success line.
ok() {
  printf ' %s[OK]%s %s\n' "$G" "$N" "$*"
}

# info <msg...> — dimmed progress line.
info() {
  printf ' %s...%s %s\n' "$D" "$N" "$*"
}

# warn <msg...> — non-fatal warning line.
warn() {
  printf ' %s[!]%s %s\n' "$Y" "$N" "$*"
}

# fail <msg...> — error line; does not exit.
fail() {
  printf ' %s[X]%s %s\n' "$R" "$N" "$*"
}

# die <msg...> — print an error line via fail() and exit with status 1.
die() {
  fail "$*"
  exit 1
}

# On any non-zero exit, print a banner naming the exit code and the step that
# was in progress. The trap body is evaluated at exit time.
trap '
  rc=$?
  if [ "$rc" -ne 0 ]; then
    printf "\n%sDEPLOY FAILED%s (exit %s) at step %s/%s\n" "$R" "$N" "$rc" "$STEP" "$TOTAL"
  fi
' EXIT
# Hostnames included in the SAN certificate. Keep this list in lock-step with
# the nginx-proxy vhosts; it is also the list probed by the smoke test.
HOSTS=(
  git.wbd-rd.nl
  auth.wbd-rd.nl
  dash.wbd-rd.nl
  flow.wbd-rd.nl
  ml.wbd-rd.nl
  hub.wbd-rd.nl
  ops.wbd-rd.nl
  mq.wbd-rd.nl
  ci.wbd-rd.nl
  mqtt.wbd-rd.nl
  sta.wbd-rd.nl
)
# ---------- 1. Preflight ----------
step "Preflight"

if [ ! -f .env ]; then
  die ".env missing (cp .env.example .env and fill secrets)"
fi
ok ".env present"

if ! command -v docker >/dev/null; then
  die "docker not installed"
fi
if ! docker compose version >/dev/null 2>&1; then
  die "docker compose plugin missing"
fi
ok "docker $(docker --version | awk '{print $3}' | tr -d ,)"
ok "docker compose $(docker compose version --short)"

# Load .env into this shell so its variables can be read below. While
# `set -a` is on, every assignment is auto-exported, so the values are also
# visible to child processes started by this script.
set -a
. ./.env
set +a

# Variables that must be non-empty before anything is started.
REQUIRED=(
  LETSENCRYPT_EMAIL ACME_CA_URI
  KEYCLOAK_ADMIN_PASSWORD KEYCLOAK_DB_PASSWORD
  SQL_PASSWORD
  GITEA_DB_PASSWORD GITEA_OAUTH_CLIENT_SECRET
  GRAFANA_ADMIN_PASSWORD
  INFLUX_ADMIN_PASSWORD INFLUX_ADMIN_TOKEN
  RABBITMQ_PASSWORD
  JENKINS_ADMIN_PASSWORD
  MLFLOW_DB_PASSWORD
  JUPYTERHUB_ADMIN_PASSWORD
  FROST_DB_PASSWORD
  WG_SERVER_PUBLIC_HOST
)

# Report every empty variable before giving up, rather than stopping at the
# first one.
missing=0
for name in "${REQUIRED[@]}"; do
  # ${!name:-} is an indirect lookup that tolerates unset variables under `set -u`.
  [ -n "${!name:-}" ] && continue
  warn "\$$name is empty in .env"
  missing=$((missing + 1))
done
if [ "$missing" -ne 0 ]; then
  die "$missing required env var(s) empty"
fi
ok "required env vars present"
# ---------- 2. Validate compose ----------
step "Validate compose"

# --quiet only validates the configuration; nothing is printed on success.
if ! docker compose config --quiet; then
  die "docker compose config invalid"
fi

# `config --services` prints one service name per line, so the line count is
# the number of services.
services_total=$(docker compose config --services | wc -l)
ok "compose valid, $services_total services defined"
# ---------- 3. Bring up containers ----------
step "Bring up containers (docker compose up -d)"
docker compose up -d --remove-orphans
ok "containers requested"

# Wait for postgres healthy (longest dep — gates keycloak/gitea/mlflow).
# Polls the container's health status up to 60 times, 2s apart.
info "waiting for sql to become healthy ..."
sql_cid=$(docker compose ps -q sql)
[ -n "$sql_cid" ] || die "sql container not found"
sql_ready=0
for ((i = 1; i <= 60; i++)); do
  # Prints the health status, or "none" when the container defines no
  # healthcheck; an inspect failure yields an empty string.
  state=$(docker inspect --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' "$sql_cid" 2>/dev/null || echo "")
  case "$state" in
    healthy)
      ok "sql healthy (after ${i} probe(s))"
      sql_ready=1
      break
      ;;
    unhealthy)
      die "sql reports unhealthy — check 'docker compose logs sql'"
      ;;
    none)
      warn "sql has no healthcheck — proceeding anyway"
      sql_ready=1
      break
      ;;
    *)
      # "starting", an inspect failure, or any unexpected value: keep polling
      # at the normal pace instead of spinning through the remaining probes.
      sleep 2
      ;;
  esac
done
[ "$sql_ready" -eq 1 ] || die "sql did not become healthy within 120s"

# Wait for nginx accepting on :80 (nginx-init must have produced the bootstrap cert)
info "waiting for nginx :80 ..."
nginx_ready=0
for ((i = 1; i <= 30; i++)); do
  # curl prints "000" for %{http_code} when no HTTP response was received, and
  # it does so even when it exits non-zero. Appending a second "000" fallback
  # inside the substitution would produce "000000" and be mistaken for a
  # reply, so the failure status is ignored and the value normalised instead.
  code=$(curl -s -o /dev/null -w '%{http_code}' --max-time 2 -H "Host: ping" http://127.0.0.1/ 2>/dev/null) || true
  [[ "$code" =~ ^[0-9]{3}$ ]] || code=000
  if [ "$code" != "000" ]; then
    ok "nginx :80 responding (HTTP $code) after ${i} probe(s)"
    nginx_ready=1
    break
  fi
  sleep 2
done
[ "$nginx_ready" -eq 1 ] || die "nginx :80 unreachable — check 'docker compose logs nginx-init nginx'"
# ---------- 4. Detect cert state ----------
step "Inspect TLS cert state"
CERT_PATH=/etc/letsencrypt/live/infra/fullchain.pem

# nginx:1.27-alpine doesn't ship openssl; the certbot image does.
# Subject and issuer are read in one throw-away container run. A failure
# (e.g. no cert at CERT_PATH) leaves both values empty.
cert_info=$(docker compose run --rm --entrypoint openssl certbot \
  x509 -in "$CERT_PATH" -noout -subject -issuer 2>/dev/null || echo "")
cert_subj=""
cert_iss=""
while IFS= read -r cert_line; do
  case "$cert_line" in
    subject*) cert_subj=$cert_line ;;
    issuer*) cert_iss=$cert_line ;;
  esac
done <<<"$cert_info"

# Which CA the configuration asks for, judged by the ACME directory URL.
case "$ACME_CA_URI" in
  *acme-staging*) want_ca=STAGING ;;
  *) want_ca=PROD ;;
esac

# Which CA issued the cert currently on disk. The bootstrap dummy is
# recognised by its subject; everything else by issuer substrings.
cur_ca=UNKNOWN
case "$cert_subj" in
  *bootstrap-infra*) cur_ca=SELFSIGNED ;;
esac
if [ "$cur_ca" = "UNKNOWN" ]; then
  case "$cert_iss" in
    *STAGING* | *Fake* | *staging*) cur_ca=STAGING ;;
    *Encrypt* | *ISRG* | *\ R3* | *\ R10* | *\ R11* | *\ E1* | *\ E5* | *\ E6*) cur_ca=PROD ;;
  esac
fi
ok "current cert: $cur_ca / target: $want_ca"

# Decide what to do: 'none', 'initial' (no certbot lineage yet), or 'renew'
# (lineage exists but wrong CA). An UNKNOWN cert (unreadable, or an issuer
# that matched none of the patterns) is handled like the bootstrap dummy.
if [ "$cur_ca" = "$want_ca" ]; then
  action="none"
  reason=""
elif [ "$cur_ca" = "SELFSIGNED" ] || [ "$cur_ca" = "UNKNOWN" ]; then
  action="initial"
  reason="bootstrap → $want_ca"
else
  action="renew"
  reason="${cur_ca} → ${want_ca}"
fi
# ---------- 5. Issue / renew cert ----------
step "Cert issuance"
if [ "$action" = "none" ]; then
  ok "no issuance needed (cert matches ACME_CA_URI)"
else
  warn "$reason"

  # certbot arguments are collected in an array so the optional flag never
  # depends on unquoted word-splitting.
  certbot_args=(
    certonly --webroot -w /var/www/certbot
    --server "$ACME_CA_URI"
    --email "$LETSENCRYPT_EMAIL" --agree-tos --no-eff-email
    --cert-name infra --non-interactive --keep-until-expiring
  )

  # For 'initial': move the bootstrap dummy aside into a backup location so certbot
  # can create a fresh lineage. Restore from backup if certbot fails so nginx
  # always has *some* cert available on the next restart.
  if [ "$action" = "initial" ]; then
    info "moving bootstrap cert aside before issuance ..."
    # Inside the container script, `|| true` keeps `set -e` from aborting
    # when one of the three paths does not exist.
    docker compose run --rm --entrypoint sh certbot -c '
set -e
mkdir -p /etc/letsencrypt/_backup
rm -rf /etc/letsencrypt/_backup/*
for p in live/infra archive/infra renewal/infra.conf; do
[ -e "/etc/letsencrypt/$p" ] && mv "/etc/letsencrypt/$p" "/etc/letsencrypt/_backup/$(echo $p | tr / -)" || true
done
' >/dev/null
  else
    # 'renew': a lineage from the other CA already exists, so force a reissue.
    certbot_args+=(--force-renewal)
  fi

  # One -d per hostname in the SAN cert.
  for h in "${HOSTS[@]}"; do
    certbot_args+=(-d "$h")
  done

  if docker compose run --rm --entrypoint certbot certbot "${certbot_args[@]}"; then
    ok "cert issued by $want_ca CA"
    # Issuance OK: discard backup. A cleanup failure must not stop nginx from
    # picking up the new cert; the directory is emptied again at the start of
    # the next 'initial' issuance anyway.
    if [ "$action" = "initial" ]; then
      docker compose run --rm --entrypoint sh certbot -c \
        "rm -rf /etc/letsencrypt/_backup" >/dev/null ||
        warn "could not remove /etc/letsencrypt/_backup"
    fi
    docker compose exec -T nginx nginx -s reload
    ok "nginx reloaded with new cert"
  else
    # Restore backup so nginx still has a working cert next time it restarts.
    # Backup entries were named by replacing "/" with "-", so the first "-" is
    # turned back into "/" to recover the original path.
    if [ "$action" = "initial" ]; then
      warn "restoring bootstrap cert after failed issuance ..."
      # A failed restore is reported but must not pre-empt the die below,
      # which carries the actual cause of the failure.
      docker compose run --rm --entrypoint sh certbot -c '
for f in /etc/letsencrypt/_backup/*; do
[ -e "$f" ] || continue
dest=/etc/letsencrypt/$(basename "$f" | sed "s/-/\//")
mkdir -p "$(dirname "$dest")"
mv "$f" "$dest"
done
rmdir /etc/letsencrypt/_backup 2>/dev/null || true
' >/dev/null ||
        warn "restore failed — bootstrap cert may still be in /etc/letsencrypt/_backup"
    fi
    die "certbot failed — DNS A records pointing at this host?"
  fi
fi
# ---------- 6. Service status ----------
step "Service status"

# Count listed containers (total) and those that are up and not failing their
# healthcheck (running). Each line is "<name> <status text>"; the status is
# matched on its own so a container name can never influence the result.
running=0
total=0
while read -r _name status; do
  total=$((total + 1))
  case "$status" in
    # Must come first: "(unhealthy)" contains the substring "healthy".
    *unhealthy* | *Paused*) ;;
    # "Up 2 minutes", "Up 2 minutes (healthy)", or a "running ..." status.
    Up* | *running* | *healthy*) running=$((running + 1)) ;;
  esac
done < <(docker compose ps --format '{{.Name}} {{.Status}}')

docker compose ps --format 'table {{.Name}}\t{{.Status}}' | sed 's/^/ /'
ok "$running/$total containers running"
# ---------- 7. Endpoint smoke test ----------
step "Endpoint smoke test (loopback)"

# Probe every vhost over HTTPS on the loopback address. --resolve pins the
# hostname to 127.0.0.1 so no DNS is needed; -k accepts whatever cert is served.
# Any HTTP status counts as "vhost reachable"; only a missing response does not.
reachable=0
unreachable=0
for h in "${HOSTS[@]}"; do
  # curl prints "000" for %{http_code} when it got no HTTP response, even when
  # it exits non-zero. Adding another "000" fallback inside the substitution
  # would give "000000", which matches no failure pattern, so curl's exit
  # status is ignored and the value normalised to three digits instead.
  code=$(curl -sk -o /dev/null -w '%{http_code}' --max-time 5 \
    --resolve "$h:443:127.0.0.1" "https://$h/" 2>/dev/null) || true
  [[ "$code" =~ ^[0-9]{3}$ ]] || code=000
  case "$code" in
    000)
      fail "$h → unreachable"
      unreachable=$((unreachable + 1))
      ;;
    2* | 3*)
      ok "$h → HTTP $code"
      reachable=$((reachable + 1))
      ;;
    4*)
      ok "$h → HTTP $code (auth gate — vhost OK)"
      reachable=$((reachable + 1))
      ;;
    5*)
      warn "$h → HTTP $code (vhost OK, upstream not ready)"
      reachable=$((reachable + 1))
      ;;
    *)
      warn "$h → HTTP $code"
      reachable=$((reachable + 1))
      ;;
  esac
done
# ---------- Summary ----------
# Final verdict line. Values are passed as printf arguments rather than
# embedded in the format string, and the headline is separated from the counts.
echo
if [ "$unreachable" -eq 0 ] && [ "$running" -eq "$total" ]; then
  printf '%sDEPLOY OK%s — %s/%s containers, %s/%s endpoints reachable, cert: %s\n' \
    "$G" "$N" "$running" "$total" "$reachable" "${#HOSTS[@]}" "$want_ca"
else
  printf '%sDEPLOY COMPLETED WITH WARNINGS%s — %s/%s containers, %s unreachable endpoint(s)\n' \
    "$Y" "$N" "$running" "$total" "$unreachable"
fi

if [ "$want_ca" = "STAGING" ]; then
  printf '\n%sNext: when staging looks right, flip ACME_CA_URI to the prod URL in .env and rerun this script.%s\n' \
    "$D" "$N"
fi

# The run completed: clear the EXIT trap.
trap - EXIT