From 9ea4f09600bc84b31805b9ee29b2f6b3f232b787 Mon Sep 17 00:00:00 2001 From: Deeman Date: Sat, 28 Feb 2026 12:28:44 +0100 Subject: [PATCH] fix(supervisor): improve alert messages with category prefix and error snippet Mirrors the same fix applied to padelnomics. Each alert now includes a neutral category tag ([extract], [transform], [export], [deploy], [supervisor]) and the first line of the error for quick diagnosis without revealing tech stack details on the public free ntfy tier. Co-Authored-By: Claude Sonnet 4.6 --- src/materia/supervisor.py | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/materia/supervisor.py b/src/materia/supervisor.py index fadfc25..7ee1e2e 100644 --- a/src/materia/supervisor.py +++ b/src/materia/supervisor.py @@ -181,9 +181,9 @@ def run_workflow(conn, workflow: dict) -> None: entry_fn = getattr(module, entry_name) entry_fn() logger.info("Workflow %s completed successfully", workflow["name"]) - except Exception: + except Exception as exc: logger.exception("Workflow %s failed", workflow["name"]) - send_alert(f"Workflow '{workflow['name']}' failed") + send_alert(f"[extract] {type(exc).__name__}: {str(exc)[:100]}") raise @@ -222,8 +222,8 @@ def run_due_workflows(conn, workflows: list[dict]) -> bool: # Transform + Export + Deploy # --------------------------------------------------------------------------- -def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> bool: - """Run a shell command. Returns True on success.""" +def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> tuple[bool, str]: + """Run a shell command. Returns (success, error_snippet).""" logger.info("Shell: %s", cmd) result = subprocess.run( cmd, shell=True, capture_output=True, text=True, timeout=timeout_seconds @@ -233,27 +233,29 @@ def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> bo "Shell failed (rc=%d): %s\nstdout: %s\nstderr: %s", result.returncode, cmd, result.stdout[-500:], result.stderr[-500:], ) - return False - return True + raw = (result.stderr or result.stdout).strip() + snippet = next((ln.strip() for ln in raw.splitlines() if ln.strip()), raw)[:120] + return False, snippet + return True, "" def run_transform() -> None: """Run SQLMesh — evaluates model staleness internally.""" logger.info("Running SQLMesh transform") - ok = run_shell("uv run sqlmesh -p transform/sqlmesh_materia plan prod --auto-apply") + ok, err = run_shell("uv run sqlmesh -p transform/sqlmesh_materia plan prod --auto-apply") if not ok: - send_alert("SQLMesh transform failed") + send_alert(f"[transform] {err}") def run_export() -> None: """Export serving tables to analytics.duckdb.""" logger.info("Exporting serving tables") - ok = run_shell( + ok, err = run_shell( f"DUCKDB_PATH={DUCKDB_PATH} SERVING_DUCKDB_PATH={SERVING_DUCKDB_PATH} " f"uv run materia pipeline run export_serving" ) if not ok: - send_alert("Serving export failed") + send_alert(f"[export] {err}") def web_code_changed() -> bool: @@ -353,11 +355,11 @@ def tick() -> None: # Deploy web app if code changed if os.getenv("SUPERVISOR_GIT_PULL") and web_code_changed(): logger.info("Web code changed — deploying") - ok = run_shell("./deploy.sh") + ok, err = run_shell("./deploy.sh") if ok: - send_alert("Deploy succeeded") + send_alert("[deploy] ok") else: - send_alert("Deploy FAILED — check journalctl -u materia-supervisor") + send_alert(f"[deploy] failed: {err}") finally: conn.close() @@ -374,9 +376,9 @@ def supervisor_loop() -> None: except KeyboardInterrupt: logger.info("Supervisor stopped (KeyboardInterrupt)") break - except Exception: + except Exception as exc: logger.exception("Supervisor tick failed — backing off %ds", BACKOFF_SECONDS) - send_alert("Supervisor tick failed") + send_alert(f"[supervisor] {type(exc).__name__}: {str(exc)[:100]}") time.sleep(BACKOFF_SECONDS) else: time.sleep(TICK_INTERVAL_SECONDS)