feat(pseo): add content/health.py — gap detection, freshness, health checks
New module with pure async query functions for the pSEO Engine dashboard: - get_template_stats() — article counts by status/language per template - get_template_freshness() — compare _serving_meta.json vs last article gen - get_content_gaps() — DuckDB rows with no matching article per language - check_hreflang_orphans() — published articles missing a sibling language - check_missing_build_files() — published articles with no HTML on disk - check_broken_scenario_refs() — articles referencing non-existent scenarios - get_all_health_issues() — runs all checks, returns counts + detail lists Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
378
web/src/padelnomics/content/health.py
Normal file
378
web/src/padelnomics/content/health.py
Normal file
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
pSEO Engine health checks and content gap queries.
|
||||
|
||||
All functions are async, pure queries — no side effects.
|
||||
Used by the pSEO Engine admin dashboard.
|
||||
|
||||
Functions overview:
|
||||
get_template_stats() — article counts per status/language for one template
|
||||
get_template_freshness() — compare _serving_meta.json timestamp vs last article generation
|
||||
get_content_gaps() — DuckDB rows with no matching article for a template+language
|
||||
check_hreflang_orphans() — published articles missing a sibling language
|
||||
check_missing_build_files()— published articles whose HTML file is absent from disk
|
||||
check_broken_scenario_refs()— articles referencing [scenario:slug] that doesn't exist
|
||||
get_all_health_issues() — run all checks, return counts + details
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from ..analytics import fetch_analytics
|
||||
from ..core import fetch_all
|
||||
|
||||
# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# Directory where generate_articles() writes HTML + markdown source files.
# check_missing_build_files() and check_broken_scenario_refs() read from here.
BUILD_DIR = Path("data/content/_build")

# Pattern matching [scenario:slug] and [scenario:slug:section] markers.
# group(1) captures the scenario slug; the optional ":section" suffix is matched
# but not captured.
_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]")
|
||||
|
||||
|
||||
def _validate_table_name(data_table: str) -> None:
|
||||
"""Guard against SQL injection in table names."""
|
||||
assert re.match(r"^[a-z_][a-z0-9_.]*$", data_table), (
|
||||
f"Invalid table name: {data_table}"
|
||||
)
|
||||
|
||||
|
||||
def _read_serving_meta() -> dict:
|
||||
"""Read _serving_meta.json written by export_serving.py. Returns {} if absent."""
|
||||
serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")
|
||||
meta_path = Path(serving_path).parent / "_serving_meta.json"
|
||||
if not meta_path.exists():
|
||||
return {}
|
||||
try:
|
||||
return json.loads(meta_path.read_text())
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
|
||||
def _parse_dt(s: str | None) -> datetime | None:
|
||||
"""Parse an ISO datetime string to a naive UTC datetime. Returns None on failure."""
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
dt = datetime.fromisoformat(s)
|
||||
# Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly.
|
||||
return dt.replace(tzinfo=None)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
|
||||
# ── Template statistics ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_template_stats(template_slug: str) -> dict:
    """Aggregate article counts for one template, overall and per language.

    Returns:
        {
            "total": N,
            "published": N,
            "draft": N,
            "scheduled": N,
            "by_language": {"en": {"total": N, "published": N, ...}, ...},
        }
    """
    grouped = await fetch_all(
        "SELECT status, language, COUNT(*) as cnt FROM articles"
        " WHERE template_slug = ? GROUP BY status, language",
        (template_slug,),
    )

    def _empty_bucket() -> dict:
        # Fresh counter dict so language buckets never share state.
        return {"total": 0, "published": 0, "draft": 0, "scheduled": 0}

    stats: dict = {**_empty_bucket(), "by_language": {}}
    for row in grouped:
        count = row["cnt"]
        status = row["status"]
        lang = row["language"]

        stats["total"] += count
        # Unknown statuses still count toward totals but get no own bucket.
        if status in stats:
            stats[status] += count

        lang_bucket = stats["by_language"].setdefault(lang, _empty_bucket())
        lang_bucket["total"] += count
        if status in lang_bucket:
            lang_bucket[status] += count

    return stats
|
||||
|
||||
|
||||
# ── Data freshness ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_template_freshness(templates: list[dict]) -> list[dict]:
    """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template.

    Returns one dict per template:
        {
            "slug": str,
            "name": str,
            "data_table": str,
            "exported_at_utc": str | None,  # from _serving_meta.json
            "last_generated": str | None,   # max(updated_at) in articles
            "row_count": int | None,        # DuckDB row count from meta
            "status": "fresh" | "stale" | "no_articles" | "no_data",
        }

    Freshness semantics:
        "fresh"       — articles generated after last data export (up to date)
        "stale"       — data export is newer than last article generation (regen needed)
        "no_articles" — DuckDB data exists but no articles generated yet
        "no_data"     — _serving_meta.json absent (export_serving not yet run)
    """
    meta = _read_serving_meta()
    exported_at_str = meta.get("exported_at_utc")
    exported_at = _parse_dt(exported_at_str)
    tables_meta = meta.get("tables", {})

    out: list[dict] = []
    for tpl in templates:
        slug = tpl["slug"]
        data_table = tpl.get("data_table", "")
        # _serving_meta.json keys its tables dict without a schema prefix,
        # e.g. "serving.pseo_city_costs_de" → "pseo_city_costs_de".
        table_key = data_table.rsplit(".", 1)[-1]

        rows = await fetch_all(
            "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles"
            " WHERE template_slug = ?",
            (slug,),
        )
        last_gen_str = rows[0]["last_gen"] if rows else None
        last_gen = _parse_dt(last_gen_str)

        if not exported_at_str:
            status = "no_data"
        elif last_gen is None:
            status = "no_articles"
        elif exported_at is not None and exported_at > last_gen:
            # New data available — articles haven't been regenerated against it yet.
            status = "stale"
        else:
            status = "fresh"

        out.append({
            "slug": slug,
            "name": tpl.get("name", slug),
            "data_table": data_table,
            "exported_at_utc": exported_at_str,
            "last_generated": last_gen_str,
            "row_count": tables_meta.get(table_key, {}).get("row_count"),
            "status": status,
        })

    return out
|
||||
|
||||
|
||||
# ── Content gaps ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_content_gaps(
    template_slug: str,
    data_table: str,
    natural_key: str,
    languages: list[str],
    limit: int = 200,
) -> list[dict]:
    """Return DuckDB rows that have no matching article for at least one language.

    The article slug is constructed as: "{template_slug}-{lang}-{natural_key_value}"
    This lets us efficiently detect gaps without rendering URL patterns.

    Args:
        template_slug: Template whose articles are checked.
        data_table: DuckDB table holding the source rows (validated identifier).
        natural_key: Column in ``data_table`` that uniquely identifies a row.
        languages: Languages the template is expected to cover.
        limit: Maximum number of DuckDB rows to scan.

    Returns:
        List of DuckDB rows (as dicts), each with two extra keys:
            "_natural_key": str — the natural key value for this row
            "_missing_languages": list[str] — languages with no article

    Raises:
        ValueError: If ``languages`` is empty. A malformed ``data_table``
            also raises via _validate_table_name().
    """
    # Raise explicitly instead of using `assert`: asserts are stripped under
    # `python -O`, which would silently skip this validation.
    if not languages:
        raise ValueError("languages must not be empty")
    _validate_table_name(data_table)

    # Fetch all article slugs for this template to determine which rows exist.
    slug_rows = await fetch_all(
        "SELECT slug, language FROM articles WHERE template_slug = ?",
        (template_slug,),
    )

    # Build lookup: (lang, natural_key_value) present in articles.
    prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages}
    existing: set[tuple[str, str]] = set()
    for r in slug_rows:
        lang = r["language"]
        prefix = prefix_by_lang.get(lang)
        if prefix is None:
            continue  # Article in a language this template no longer covers.
        if r["slug"].startswith(prefix):
            existing.add((lang, r["slug"][len(prefix):]))

    # Table name is validated above; LIMIT stays parameterized.
    duckdb_rows = await fetch_analytics(
        f"SELECT * FROM {data_table} LIMIT ?",
        [limit],
    )

    gaps = []
    for row in duckdb_rows:
        nk_val = str(row.get(natural_key, ""))
        missing = [lang for lang in languages if (lang, nk_val) not in existing]
        if missing:
            gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing})

    return gaps
|
||||
|
||||
|
||||
# ── Health checks ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def check_hreflang_orphans(templates: list[dict]) -> list[dict]:
    """Find published articles missing a sibling language their template expects.

    Example: if a template generates EN + DE and the EN article exists but DE
    does not, that url_path is an hreflang orphan.

    Returns list of dicts:
        {
            "template_slug": str,
            "url_path": str,
            "present_languages": list[str],
            "missing_languages": list[str],
        }
    """
    issues: list[dict] = []
    for tpl in templates:
        wanted = set(tpl.get("languages", ["en"]))
        # A single-language template can never produce an orphan.
        if len(wanted) < 2:
            continue

        incomplete = await fetch_all(
            """SELECT url_path,
                      GROUP_CONCAT(language) as langs,
                      COUNT(DISTINCT language) as lang_count
               FROM articles
               WHERE template_slug = ? AND status = 'published'
               GROUP BY url_path
               HAVING COUNT(DISTINCT language) < ?""",
            (tpl["slug"], len(wanted)),
        )
        for row in incomplete:
            have = set(row["langs"].split(","))
            issues.append({
                "template_slug": tpl["slug"],
                "url_path": row["url_path"],
                "present_languages": sorted(have),
                "missing_languages": sorted(wanted - have),
            })

    return issues
|
||||
|
||||
|
||||
async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]:
    """Find published articles whose HTML file is absent from disk.

    Expected path: {build_dir or BUILD_DIR}/{language}/{slug}.html

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "template_slug", "expected_path"}
    """
    root = build_dir or BUILD_DIR
    published = await fetch_all(
        "SELECT id, slug, language, url_path, template_slug FROM articles"
        " WHERE status = 'published'",
    )

    absent: list[dict] = []
    for art in published:
        expected = root / art["language"] / f"{art['slug']}.html"
        if expected.exists():
            continue
        record = {key: art[key] for key in ("id", "slug", "language", "url_path", "template_slug")}
        record["expected_path"] = str(expected)
        absent.append(record)

    return absent
|
||||
|
||||
|
||||
async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]:
    """pSEO articles referencing [scenario:slug] markers that don't exist.

    Reads markdown source from BUILD_DIR/{language}/md/{slug}.md.
    Only checks published articles with a template_slug (pSEO-generated).

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]}
    """
    bd = build_dir or BUILD_DIR

    scenario_rows = await fetch_all("SELECT slug FROM published_scenarios")
    valid_slugs = {r["slug"] for r in scenario_rows}

    articles = await fetch_all(
        "SELECT id, slug, language, url_path FROM articles"
        " WHERE status = 'published' AND template_slug IS NOT NULL",
    )

    broken = []
    for a in articles:
        md_path = bd / a["language"] / "md" / f"{a['slug']}.md"
        if not md_path.exists():
            continue
        # Read with an explicit encoding: the default is locale-dependent
        # (e.g. cp1252 on Windows), which can fail or mangle UTF-8 markdown.
        # Assumes the generator writes UTF-8 — confirm against generate_articles().
        markdown = md_path.read_text(encoding="utf-8")
        refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)}
        missing_refs = sorted(refs - valid_slugs)
        if missing_refs:
            broken.append({
                "id": a["id"],
                "slug": a["slug"],
                "language": a["language"],
                "url_path": a["url_path"],
                "broken_scenario_refs": missing_refs,
            })

    return broken
|
||||
|
||||
|
||||
# ── Aggregate check ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_all_health_issues(
    templates: list[dict],
    build_dir: Path | None = None,
) -> dict:
    """Run all health checks, return issue counts and full detail lists.

    Returns:
        {
            "hreflang_orphans": [...],
            "missing_build_files": [...],
            "broken_scenario_refs": [...],
            "counts": {
                "hreflang_orphans": N,
                "missing_build_files": N,
                "broken_scenario_refs": N,
                "total": N,
            },
        }
    """
    hreflang = await check_hreflang_orphans(templates)
    no_file = await check_missing_build_files(build_dir)
    bad_refs = await check_broken_scenario_refs(build_dir)

    counts = {
        "hreflang_orphans": len(hreflang),
        "missing_build_files": len(no_file),
        "broken_scenario_refs": len(bad_refs),
    }
    counts["total"] = sum(counts.values())

    return {
        "hreflang_orphans": hreflang,
        "missing_build_files": no_file,
        "broken_scenario_refs": bad_refs,
        "counts": counts,
    }
|
||||
Reference in New Issue
Block a user