fix(supervisor): improve alert messages with category prefix and error snippet
Each alert now includes a neutral category tag ([extract], [transform], [export], [deploy], [supervisor]) and the first line of the error, so notifications are actionable without revealing tech stack details on the public free ntfy tier. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -192,9 +192,9 @@ def run_workflow(conn, workflow: dict) -> None:
|
||||
entry_fn = getattr(module, entry_name)
|
||||
entry_fn()
|
||||
logger.info("Workflow %s completed successfully", workflow["name"])
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
logger.exception("Workflow %s failed", workflow["name"])
|
||||
send_alert(f"Workflow '{workflow['name']}' failed")
|
||||
send_alert(f"[extract] {type(exc).__name__}: {str(exc)[:100]}")
|
||||
raise
|
||||
|
||||
|
||||
@@ -233,8 +233,8 @@ def run_due_workflows(conn, workflows: list[dict]) -> bool:
|
||||
# Transform + Export + Deploy
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> bool:
|
||||
"""Run a shell command. Returns True on success."""
|
||||
def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> tuple[bool, str]:
|
||||
"""Run a shell command. Returns (success, error_snippet)."""
|
||||
logger.info("Shell: %s", cmd)
|
||||
result = subprocess.run(
|
||||
cmd, shell=True, capture_output=True, text=True, timeout=timeout_seconds
|
||||
@@ -242,29 +242,31 @@ def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> bo
|
||||
if result.returncode != 0:
|
||||
logger.error("Shell failed (rc=%d): %s\nstdout: %s\nstderr: %s",
|
||||
result.returncode, cmd, result.stdout[-500:], result.stderr[-500:])
|
||||
return False
|
||||
return True
|
||||
raw = (result.stderr or result.stdout).strip()
|
||||
snippet = next((ln.strip() for ln in raw.splitlines() if ln.strip()), raw)[:120]
|
||||
return False, snippet
|
||||
return True, ""
|
||||
|
||||
|
||||
def run_transform() -> None:
|
||||
"""Run SQLMesh — it evaluates model staleness internally."""
|
||||
logger.info("Running SQLMesh transform")
|
||||
ok = run_shell(
|
||||
ok, err = run_shell(
|
||||
"uv run sqlmesh -p transform/sqlmesh_padelnomics plan prod --auto-apply",
|
||||
)
|
||||
if not ok:
|
||||
send_alert("SQLMesh transform failed")
|
||||
send_alert(f"[transform] {err}")
|
||||
|
||||
|
||||
def run_export() -> None:
|
||||
"""Export serving tables to analytics.duckdb."""
|
||||
logger.info("Exporting serving tables")
|
||||
ok = run_shell(
|
||||
ok, err = run_shell(
|
||||
f"DUCKDB_PATH={DUCKDB_PATH} SERVING_DUCKDB_PATH={SERVING_DUCKDB_PATH} "
|
||||
f"uv run python src/padelnomics/export_serving.py"
|
||||
)
|
||||
if not ok:
|
||||
send_alert("Serving export failed")
|
||||
send_alert(f"[export] {err}")
|
||||
|
||||
|
||||
def web_code_changed() -> bool:
|
||||
@@ -365,11 +367,11 @@ def tick() -> None:
|
||||
# Deploy web app if code changed
|
||||
if os.getenv("SUPERVISOR_GIT_PULL") and web_code_changed():
|
||||
logger.info("Web code changed — deploying")
|
||||
ok = run_shell("./deploy.sh")
|
||||
ok, err = run_shell("./deploy.sh")
|
||||
if ok:
|
||||
send_alert("Deploy succeeded")
|
||||
send_alert("[deploy] ok")
|
||||
else:
|
||||
send_alert("Deploy FAILED — check journalctl -u padelnomics-supervisor")
|
||||
send_alert(f"[deploy] failed: {err}")
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
@@ -386,9 +388,9 @@ def supervisor_loop() -> None:
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Supervisor stopped (KeyboardInterrupt)")
|
||||
break
|
||||
except Exception:
|
||||
except Exception as exc:
|
||||
logger.exception("Supervisor tick failed — backing off %ds", BACKOFF_SECONDS)
|
||||
send_alert("Supervisor tick failed")
|
||||
send_alert(f"[supervisor] {type(exc).__name__}: {str(exc)[:100]}")
|
||||
time.sleep(BACKOFF_SECONDS)
|
||||
else:
|
||||
time.sleep(TICK_INTERVAL_SECONDS)
|
||||
|
||||
Reference in New Issue
Block a user