#!/usr/bin/env bash
# cloud/deploy.sh — one-shot bring-up for the cloud composition.
#
# Idempotent. Safe to rerun. Will reissue the Let's Encrypt cert only when:
#   - the current cert is the self-signed bootstrap dummy, or
#   - .env's ACME_CA_URI no longer matches the issuer of the current cert
#     (e.g. you flipped staging → prod).
#
# Usage:
#   cd cloud && ./deploy.sh

set -euo pipefail
cd "$(dirname "$0")"

# ---------- UI ----------
# Colour escapes are only used when stdout is a terminal.
if [ -t 1 ]; then
  B=$'\e[34m'; G=$'\e[32m'; Y=$'\e[33m'; R=$'\e[31m'; D=$'\e[2m'; N=$'\e[0m'
else
  B=""; G=""; Y=""; R=""; D=""; N=""
fi

STEP=0; TOTAL=7

# step MSG…  — advance the step counter and print a "[n/TOTAL] MSG" header.
step() { STEP=$((STEP+1)); printf "\n${B}[%d/%d]${N} %s\n" "$STEP" "$TOTAL" "$*"; }
# ok / info / warn / fail MSG…  — print one tagged status line.
ok()   { printf " ${G}[OK]${N} %s\n" "$*"; }
info() { printf " ${D}...${N} %s\n" "$*"; }
warn() { printf " ${Y}[!]${N} %s\n" "$*"; }
fail() { printf " ${R}[X]${N} %s\n" "$*"; }
# die MSG…  — print a failure line and exit with status 1.
die()  { fail "$*"; exit 1; }

# On any non-zero exit, report the exit code and the step that was running.
# Written as an explicit `if` so the trap body itself never ends on a failed test.
trap 'rc=$?; if [ "$rc" -ne 0 ]; then printf "\n${R}DEPLOY FAILED${N} (exit %d) at step %d/%d\n" "$rc" "$STEP" "$TOTAL"; fi' EXIT

# Subdomains covered by the SAN cert (kept in lock-step with nginx-proxy vhosts)
HOSTS=(
  git.wbd-rd.nl
  auth.wbd-rd.nl
  dash.wbd-rd.nl
  flow.wbd-rd.nl
  ml.wbd-rd.nl
  hub.wbd-rd.nl
  ops.wbd-rd.nl
  mq.wbd-rd.nl
  ci.wbd-rd.nl
  mqtt.wbd-rd.nl
  sta.wbd-rd.nl
)

# ---------- 1. Preflight ----------
step "Preflight"
[ -f .env ] || die ".env missing (cp .env.example .env and fill secrets)"
ok ".env present"
command -v docker >/dev/null || die "docker not installed"
docker compose version >/dev/null 2>&1 || die "docker compose plugin missing"
# curl drives the nginx readiness probe and the endpoint smoke test later in
# this script; check for it up front so a missing binary is reported as such.
command -v curl >/dev/null || die "curl not installed"
ok "docker $(docker --version | awk '{print $3}' | tr -d ,)"
ok "docker compose $(docker compose version --short)"

# Source .env so the script can read its variables. Note that `set -a` also
# exports them, so every child process started below inherits them.
# shellcheck source=/dev/null
set -a; . ./.env; set +a

REQUIRED=(
  LETSENCRYPT_EMAIL ACME_CA_URI
  KEYCLOAK_ADMIN_PASSWORD KEYCLOAK_DB_PASSWORD
  SQL_PASSWORD
  GITEA_DB_PASSWORD GITEA_OAUTH_CLIENT_SECRET
  GRAFANA_ADMIN_PASSWORD
  INFLUX_ADMIN_PASSWORD INFLUX_ADMIN_TOKEN
  RABBITMQ_PASSWORD
  JENKINS_ADMIN_PASSWORD
  MLFLOW_DB_PASSWORD
  JUPYTERHUB_ADMIN_PASSWORD
  FROST_DB_PASSWORD
  WG_SERVER_PUBLIC_HOST
)
# Report every empty/unset variable before failing, not just the first one.
missing=0
for v in "${REQUIRED[@]}"; do
  if [ -z "${!v:-}" ]; then
    warn "\$$v is empty in .env"
    missing=$((missing+1))
  fi
done
[ "$missing" -eq 0 ] || die "$missing required env var(s) empty"
ok "required env vars present"

# ---------- 2. Validate compose ----------
step "Validate compose"
docker compose config --quiet || die "docker compose config invalid"
services_total=$(docker compose config --services | wc -l)
ok "compose valid, $services_total services defined"

# ---------- 3. Bring up containers ----------
step "Bring up containers (docker compose up -d)"
docker compose up -d --remove-orphans
ok "containers requested"

# Wait for postgres healthy (longest dep — gates keycloak/gitea/mlflow)
info "waiting for sql to become healthy ..."
# `|| true` keeps a failing lookup from aborting under `set -e`, so the
# explanatory die below is what the operator sees.
sql_cid=$(docker compose ps -q sql || true)
[ -n "$sql_cid" ] || die "sql container not found"

# Up to 60 probes, 2 s apart. Any state other than healthy/none/unhealthy
# (including a failed inspect) waits and retries.
sql_ready=0
for ((i = 1; i <= 60; i++)); do
  state=$(docker inspect \
    --format '{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}' \
    "$sql_cid" 2>/dev/null || echo "")
  case "$state" in
    healthy)
      ok "sql healthy (after ${i} probe(s))"
      sql_ready=1
      break
      ;;
    none)
      warn "sql has no healthcheck — proceeding anyway"
      sql_ready=1
      break
      ;;
    unhealthy)
      die "sql reports unhealthy — check 'docker compose logs sql'"
      ;;
    *)
      sleep 2
      ;;
  esac
done
[ "$sql_ready" -eq 1 ] || die "sql did not become healthy within 120s"

# Wait for nginx accepting on :80 (nginx-init must have produced the bootstrap cert)
info "waiting for nginx :80 ..."
# http_code CURL_ARGS…
# Print the HTTP status curl observed for the request described by CURL_ARGS,
# or 000 when no HTTP response was received.
# curl's -w '%{http_code}' already prints "000" when the connection fails, so
# no second "000" may be appended on failure (that would produce "000000",
# which matches neither the "000" test nor the 000) case arm below).
http_code() {
  local code
  code=$(curl -s -o /dev/null -w '%{http_code}' "$@" 2>/dev/null) || true
  printf '%s\n' "${code:-000}"
}

# Up to 30 probes, 2 s apart; any HTTP status at all counts as "nginx is up".
nginx_up=0
for ((i = 1; i <= 30; i++)); do
  code=$(http_code --max-time 2 -H "Host: ping" http://127.0.0.1/)
  if [ "$code" != "000" ]; then
    ok "nginx :80 responding (HTTP $code) after ${i} probe(s)"
    nginx_up=1
    break
  fi
  sleep 2
done
[ "$nginx_up" -eq 1 ] || die "nginx :80 unreachable — check 'docker compose logs nginx-init nginx'"

# ---------- 4. Detect cert state ----------
step "Inspect TLS cert state"
CERT_PATH=/etc/letsencrypt/live/infra/fullchain.pem
# nginx:1.27-alpine doesn't ship openssl; the certbot image does.
# Subject and issuer are read in a single container run; an unreadable cert
# leaves both empty, which ends up as cur_ca=UNKNOWN below.
cert_info=$(docker compose run --rm --entrypoint openssl certbot \
  x509 -in "$CERT_PATH" -noout -subject -issuer 2>/dev/null || echo "")
cert_subj=$(grep '^subject' <<<"$cert_info" || true)
cert_iss=$(grep '^issuer' <<<"$cert_info" || true)

# Target CA is derived from the ACME directory URL configured in .env.
case "$ACME_CA_URI" in
  *acme-staging*) want_ca=STAGING ;;
  *)              want_ca=PROD ;;
esac

# Current CA: the bootstrap dummy is recognised by its subject, everything
# else by substrings of the issuer line. Staging patterns are tested first.
cur_ca=UNKNOWN
case "$cert_subj" in
  *bootstrap-infra*) cur_ca=SELFSIGNED ;;
esac
if [ "$cur_ca" = "UNKNOWN" ]; then
  case "$cert_iss" in
    *STAGING*|*Fake*|*staging*) cur_ca=STAGING ;;
    *Encrypt*|*ISRG*|*\ R3*|*\ R10*|*\ R11*|*\ E1*|*\ E5*|*\ E6*) cur_ca=PROD ;;
  esac
fi
ok "current cert: $cur_ca / target: $want_ca"

# Decide what to do: 'none', 'initial' (no certbot lineage yet), or 'renew' (lineage exists but wrong CA).
if [ "$cur_ca" = "$want_ca" ]; then
  action="none"; reason=""
elif [ "$cur_ca" = "SELFSIGNED" ] || [ "$cur_ca" = "UNKNOWN" ]; then
  action="initial"; reason="bootstrap → $want_ca"
else
  action="renew"; reason="$cur_ca → $want_ca"
fi

# ---------- 5. Issue / renew cert ----------
step "Cert issuance"
if [ "$action" = "none" ]; then
  ok "no issuance needed (cert matches ACME_CA_URI)"
else
  warn "$reason"

  # The certbot command line is built as an array so optional flags need no
  # unquoted expansion.
  certbot_args=(
    certonly --webroot -w /var/www/certbot
    --server "$ACME_CA_URI"
    --email "$LETSENCRYPT_EMAIL" --agree-tos --no-eff-email
    --cert-name infra --non-interactive --keep-until-expiring
  )

  # For 'initial': move the bootstrap dummy aside into a backup location so certbot
  # can create a fresh lineage. Restore from backup if certbot fails so nginx
  # always has *some* cert available on the next restart.
  if [ "$action" = "initial" ]; then
    info "moving bootstrap cert aside before issuance ..."
    # An explicit `if` (not `test && mv || true`) so that a failing mv aborts
    # via the inner `set -e` while a missing path is simply skipped.
    docker compose run --rm --entrypoint sh certbot -c '
      set -e
      mkdir -p /etc/letsencrypt/_backup
      rm -rf /etc/letsencrypt/_backup/*
      for p in live/infra archive/infra renewal/infra.conf; do
        if [ -e "/etc/letsencrypt/$p" ]; then
          mv "/etc/letsencrypt/$p" "/etc/letsencrypt/_backup/$(echo "$p" | tr / -)"
        fi
      done
    ' >/dev/null
  else
    certbot_args+=(--force-renewal)
  fi

  for h in "${HOSTS[@]}"; do
    certbot_args+=(-d "$h")
  done

  if docker compose run --rm --entrypoint certbot certbot "${certbot_args[@]}"; then
    ok "cert issued by $want_ca CA"
    # Issuance OK: discard backup
    if [ "$action" = "initial" ]; then
      docker compose run --rm --entrypoint sh certbot -c \
        "rm -rf /etc/letsencrypt/_backup" >/dev/null
    fi
    docker compose exec -T nginx nginx -s reload
    ok "nginx reloaded with new cert"
  else
    # Restore backup so nginx still has a working cert next time it restarts
    if [ "$action" = "initial" ]; then
      warn "restoring bootstrap cert after failed issuance ..."
      # Backup entries are named "<dir>-<name>"; the first "-" is turned back
      # into "/" to recover the original path. A failed restore only warns,
      # so the die message below is still printed.
      docker compose run --rm --entrypoint sh certbot -c '
        for f in /etc/letsencrypt/_backup/*; do
          [ -e "$f" ] || continue
          dest=/etc/letsencrypt/$(basename "$f" | sed "s/-/\//")
          mkdir -p "$(dirname "$dest")"
          mv "$f" "$dest"
        done
        rmdir /etc/letsencrypt/_backup 2>/dev/null || true
      ' >/dev/null || warn "could not restore bootstrap cert from /etc/letsencrypt/_backup"
    fi
    die "certbot failed — DNS A records pointing at this host?"
  fi
fi

# ---------- 6. Service status ----------
step "Service status"
# Count on .State ("running") rather than on .Status text ("Up 2 minutes
# (healthy)"): the status text does not contain "running", and "(unhealthy)"
# would match a *healthy* pattern. A running container whose status reports
# unhealthy is not counted as running.
running=0; total=0
while read -r c_name c_state c_status; do
  [ -n "$c_name" ] || continue
  total=$((total+1))
  if [ "$c_state" = "running" ] && [[ "$c_status" != *unhealthy* ]]; then
    running=$((running+1))
  fi
done < <(docker compose ps --format '{{.Name}} {{.State}} {{.Status}}')
docker compose ps --format 'table {{.Name}}\t{{.Status}}' | sed 's/^/ /'
ok "$running/$total containers running"

# ---------- 7. Endpoint smoke test ----------
step "Endpoint smoke test (loopback)"
# Each vhost is requested over HTTPS against 127.0.0.1 (via --resolve) with
# certificate verification disabled (-k). Any HTTP status counts as reachable;
# only "no response at all" (000) counts as unreachable.
reachable=0; unreachable=0
for h in "${HOSTS[@]}"; do
  code=$(http_code -k --max-time 5 --resolve "$h:443:127.0.0.1" "https://$h/")
  case "$code" in
    2*|3*) ok "$h → HTTP $code"; reachable=$((reachable+1)) ;;
    4*)    ok "$h → HTTP $code (auth gate — vhost OK)"; reachable=$((reachable+1)) ;;
    5*)    warn "$h → HTTP $code (vhost OK, upstream not ready)"; reachable=$((reachable+1)) ;;
    000)   fail "$h → unreachable"; unreachable=$((unreachable+1)) ;;
    *)     warn "$h → HTTP $code"; reachable=$((reachable+1)) ;;
  esac
done

# ---------- Summary ----------
echo
if [ "$unreachable" -eq 0 ] && [ "$running" -eq "$total" ]; then
  printf "${G}DEPLOY OK${N} — %s/%s containers, %s/%s endpoints reachable, cert: %s\n" \
    "$running" "$total" "$reachable" "${#HOSTS[@]}" "$want_ca"
else
  printf "${Y}DEPLOY COMPLETED WITH WARNINGS${N} — %s/%s containers, %s unreachable endpoint(s)\n" \
    "$running" "$total" "$unreachable"
fi

if [ "$want_ca" = "STAGING" ]; then
  printf "\n${D}Next: when staging looks right, flip ACME_CA_URI to the prod URL in .env and rerun this script.${N}\n"
fi

# Normal completion: drop the failure banner trap.
trap - EXIT