fix(deploy): move router config write to after health check passes

The router service has no Compose profile, so it is always included in `up -d --wait`. Writing the new target's config BEFORE the wait caused the router to become unhealthy if the new slot failed, leaving the router in a broken state for the next deploy attempt. Now the router keeps its old config (pointing to the still-running old slot) during the health check wait, so it stays healthy throughout. The new config is written, and `nginx -s reload` is triggered, only after the new slot passes its health check. This is the correct blue-green pattern. Also add `retries: 3` and `start_period: 10s` to the router health check for resilience against transient startup failures. The failure-path log dump also changes from `logs "${TARGET}-app" --tail=100` to `logs --tail=100 2>&1`, dropping the single-service filter and folding stderr into stdout. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-22 13:22:50 +01:00
parent e39eaefb43
commit 5f7e8f1200
2 changed files with 23 additions and 21 deletions
--- a/deploy.sh
+++ b/deploy.sh
@@ -35,7 +35,27 @@ echo "==> Running migrations..."
 $COMPOSE --profile "$TARGET" run --rm "${TARGET}-app" \
    python -m padelnomics.migrations.migrate

-# ── Write router config (before starting, so nginx -t passes) ──
+# ── Start & health check ───────────────────────────────────
+# Router config is NOT written yet — router keeps old config so it stays
+# healthy while we wait for the new slot to pass its own health check.
+
+echo "==> Starting $TARGET (waiting for health check)..."
+if ! $COMPOSE --profile "$TARGET" up -d --wait; then
+    echo "!!! Health check failed — dumping logs"
+    $COMPOSE logs --tail=100 2>&1 || true
+    echo "!!! Rolling back"
+    $COMPOSE stop "${TARGET}-app" "${TARGET}-worker" "${TARGET}-scheduler"
+    LATEST=$($COMPOSE run --rm --entrypoint "" "${TARGET}-app" \
+        sh -c "ls -t /app/data/app.db.pre-deploy-* 2>/dev/null | head -1")
+    if [ -n "$LATEST" ]; then
+        echo "==> Restoring database from ${LATEST}..."
+        $COMPOSE run --rm --entrypoint "" "${TARGET}-app" \
+            sh -c "cp '${LATEST}' /app/data/app.db"
+    fi
+    exit 1
+fi
+
+# ── Write router config and reload (new slot is healthy) ────

 echo "==> Writing router config for $TARGET..."
 mkdir -p "$(dirname "$ROUTER_CONF")"
@@ -57,26 +77,6 @@ server {
 }
 NGINX

-# ── Start & health check ───────────────────────────────────
-
-echo "==> Starting $TARGET (waiting for health check)..."
-if ! $COMPOSE --profile "$TARGET" up -d --wait; then
-    echo "!!! Health check failed — dumping logs"
-    $COMPOSE logs "${TARGET}-app" --tail=100 || true
-    echo "!!! Rolling back"
-    $COMPOSE stop "${TARGET}-app" "${TARGET}-worker" "${TARGET}-scheduler"
-    LATEST=$($COMPOSE run --rm --entrypoint "" "${TARGET}-app" \
-        sh -c "ls -t /app/data/app.db.pre-deploy-* 2>/dev/null | head -1")
-    if [ -n "$LATEST" ]; then
-        echo "==> Restoring database from ${LATEST}..."
-        $COMPOSE run --rm --entrypoint "" "${TARGET}-app" \
-            sh -c "cp '${LATEST}' /app/data/app.db"
-    fi
-    exit 1
-fi
-
-# ── Reload router to pick up new upstream ───────────────────
-
 echo "==> Reloading router..."
 $COMPOSE exec router nginx -s reload

--- a/docker-compose.prod.yml
+++ b/docker-compose.prod.yml
@@ -14,6 +14,8 @@ services:
      test: ["CMD", "nginx", "-t"]
      interval: 30s
      timeout: 5s
+      retries: 3
+      start_period: 10s

  litestream:
    image: litestream/litestream:latest