feat(pseo): add content/health.py — gap detection, freshness, health checks

New module with pure async query functions for the pSEO Engine dashboard:
- get_template_stats() — article counts by status/language per template
- get_template_freshness() — compare _serving_meta.json vs last article gen
- get_content_gaps() — DuckDB rows with no matching article per language
- check_hreflang_orphans() — published articles missing a sibling language
- check_missing_build_files() — published articles with no HTML on disk
- check_broken_scenario_refs() — articles referencing non-existent scenarios
- get_all_health_issues() — runs all checks, returns counts + detail lists

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 18:21:34 +01:00
parent 97c3aafea8
commit 567100076f

View File

@@ -0,0 +1,378 @@
"""
pSEO Engine health checks and content gap queries.
All functions are async, pure queries — no side effects.
Used by the pSEO Engine admin dashboard.
Functions overview:
get_template_stats() — article counts per status/language for one template
get_template_freshness() — compare _serving_meta.json timestamp vs last article generation
get_content_gaps() — DuckDB rows with no matching article for a template+language
check_hreflang_orphans() — published articles missing a sibling language
check_missing_build_files()— published articles whose HTML file is absent from disk
check_broken_scenario_refs()— articles referencing [scenario:slug] that doesn't exist
get_all_health_issues() — run all checks, return counts + details
"""
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from ..analytics import fetch_analytics
from ..core import fetch_all
logger = logging.getLogger(__name__)
# Directory where generate_articles() writes HTML + markdown source files.
BUILD_DIR = Path("data/content/_build")
# Pattern matching [scenario:slug] and [scenario:slug:section] markers.
_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]")
def _validate_table_name(data_table: str) -> None:
"""Guard against SQL injection in table names."""
assert re.match(r"^[a-z_][a-z0-9_.]*$", data_table), (
f"Invalid table name: {data_table}"
)
def _read_serving_meta() -> dict:
"""Read _serving_meta.json written by export_serving.py. Returns {} if absent."""
serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")
meta_path = Path(serving_path).parent / "_serving_meta.json"
if not meta_path.exists():
return {}
try:
return json.loads(meta_path.read_text())
except (json.JSONDecodeError, OSError):
return {}
def _parse_dt(s: str | None) -> datetime | None:
"""Parse an ISO datetime string to a naive UTC datetime. Returns None on failure."""
if not s:
return None
try:
dt = datetime.fromisoformat(s)
# Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly.
return dt.replace(tzinfo=None)
except (ValueError, TypeError):
return None
# ── Template statistics ───────────────────────────────────────────────────────
async def get_template_stats(template_slug: str) -> dict:
    """Aggregate article counts for one template.

    Returns a dict of the shape:
        {
            "total": N,
            "published": N,
            "draft": N,
            "scheduled": N,
            "by_language": {"en": {"total": N, "published": N, ...}, ...},
        }
    Unknown statuses still count toward "total" (overall and per language)
    but get no dedicated bucket.
    """
    rows = await fetch_all(
        "SELECT status, language, COUNT(*) as cnt FROM articles"
        " WHERE template_slug = ? GROUP BY status, language",
        (template_slug,),
    )

    def _empty_bucket() -> dict:
        return {"total": 0, "published": 0, "draft": 0, "scheduled": 0}

    stats: dict = {**_empty_bucket(), "by_language": {}}
    for row in rows:
        count = row["cnt"]
        status = row["status"]
        lang_bucket = stats["by_language"].setdefault(row["language"], _empty_bucket())
        stats["total"] += count
        lang_bucket["total"] += count
        # Only tally recognised statuses; anything else counts toward totals only.
        if status in stats:
            stats[status] += count
        if status in lang_bucket:
            lang_bucket[status] += count
    return stats
# ── Data freshness ────────────────────────────────────────────────────────────
async def get_template_freshness(templates: list[dict]) -> list[dict]:
    """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template.

    Returns one dict per template:
        {
            "slug": str,
            "name": str,
            "data_table": str,
            "exported_at_utc": str | None,   # from _serving_meta.json
            "last_generated": str | None,    # max(updated_at) in articles
            "row_count": int | None,         # DuckDB row count from meta
            "status": "fresh" | "stale" | "no_articles" | "no_data",
        }

    Freshness semantics:
        "fresh"       — articles generated after the last data export
        "stale"       — data export is newer than the last article generation
        "no_articles" — DuckDB data exists but no articles generated yet
        "no_data"     — _serving_meta.json absent (export_serving not yet run)
    """
    meta = _read_serving_meta()
    exported_raw = meta.get("exported_at_utc")
    exported_dt = _parse_dt(exported_raw)
    tables_meta = meta.get("tables", {})

    report: list[dict] = []
    for tpl in templates:
        slug = tpl["slug"]
        data_table = tpl.get("data_table", "")
        # _serving_meta.json keys tables without their schema prefix:
        # "serving.pseo_city_costs_de" → "pseo_city_costs_de".
        table_key = data_table.rsplit(".", 1)[-1]

        rows = await fetch_all(
            "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles"
            " WHERE template_slug = ?",
            (slug,),
        )
        last_gen_raw = rows[0]["last_gen"] if rows else None
        last_gen_dt = _parse_dt(last_gen_raw)

        if not exported_raw:
            status = "no_data"
        elif last_gen_dt is None:
            status = "no_articles"
        elif exported_dt and exported_dt > last_gen_dt:
            # New data landed after the last generation — regen needed.
            status = "stale"
        else:
            status = "fresh"

        report.append({
            "slug": slug,
            "name": tpl.get("name", slug),
            "data_table": data_table,
            "exported_at_utc": exported_raw,
            "last_generated": last_gen_raw,
            "row_count": tables_meta.get(table_key, {}).get("row_count"),
            "status": status,
        })
    return report
# ── Content gaps ──────────────────────────────────────────────────────────────
async def get_content_gaps(
    template_slug: str,
    data_table: str,
    natural_key: str,
    languages: list[str],
    limit: int = 200,
) -> list[dict]:
    """Return DuckDB rows that have no matching article for at least one language.

    The article slug is constructed as "{template_slug}-{lang}-{natural_key_value}",
    which lets us detect gaps via slug prefixes without rendering URL patterns.

    Args:
        template_slug: template whose articles are checked.
        data_table: DuckDB table holding the source rows (validated identifier).
        natural_key: column in data_table uniquely identifying each row.
        languages: languages every row is expected to have an article in.
        limit: maximum number of DuckDB rows to scan.

    Returns:
        DuckDB rows (as dicts) with two extra keys:
            "_natural_key": str — the natural key value for this row
            "_missing_languages": list[str] — languages with no article

    Raises:
        ValueError: if languages is empty.
    """
    # Raise explicitly instead of `assert`: asserts are stripped under
    # `python -O`, which would silently report zero gaps for every row.
    if not languages:
        raise ValueError("languages must not be empty")
    _validate_table_name(data_table)

    # Fetch all article slugs for this template to determine which rows exist.
    slug_rows = await fetch_all(
        "SELECT slug, language FROM articles WHERE template_slug = ?",
        (template_slug,),
    )
    # Build lookup: (lang, natural_key_value) pairs that already have articles.
    prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages}
    existing: set[tuple[str, str]] = set()
    for r in slug_rows:
        lang = r["language"]
        prefix = prefix_by_lang.get(lang)
        if prefix is None or not r["slug"].startswith(prefix):
            continue
        existing.add((lang, r["slug"][len(prefix):]))

    # data_table was validated as a bare identifier above, so the f-string
    # interpolation cannot inject SQL; LIMIT stays parameterized.
    duckdb_rows = await fetch_analytics(
        f"SELECT * FROM {data_table} LIMIT ?",
        [limit],
    )
    gaps = []
    for row in duckdb_rows:
        nk_val = str(row.get(natural_key, ""))
        missing = [lang for lang in languages if (lang, nk_val) not in existing]
        if missing:
            gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing})
    return gaps
# ── Health checks ─────────────────────────────────────────────────────────────
async def check_hreflang_orphans(templates: list[dict]) -> list[dict]:
    """Published articles missing a sibling language expected by their template.

    Example: a template generating EN + DE. If the EN article exists for a
    url_path but DE is absent, that article is an hreflang orphan.

    Returns list of dicts:
        {
            "template_slug": str,
            "url_path": str,
            "present_languages": list[str],
            "missing_languages": list[str],
        }
    """
    orphans: list[dict] = []
    for tpl in templates:
        expected_langs = set(tpl.get("languages", ["en"]))
        # A single-language template cannot have orphans.
        if len(expected_langs) < 2:
            continue
        # Group published articles by url_path; keep groups covering fewer
        # languages than the template expects.
        incomplete = await fetch_all(
            """SELECT url_path,
                      GROUP_CONCAT(language) as langs,
                      COUNT(DISTINCT language) as lang_count
               FROM articles
               WHERE template_slug = ? AND status = 'published'
               GROUP BY url_path
               HAVING COUNT(DISTINCT language) < ?""",
            (tpl["slug"], len(expected_langs)),
        )
        for row in incomplete:
            present_langs = set(row["langs"].split(","))
            orphans.append({
                "template_slug": tpl["slug"],
                "url_path": row["url_path"],
                "present_languages": sorted(present_langs),
                "missing_languages": sorted(expected_langs - present_langs),
            })
    return orphans
async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]:
    """Published articles whose HTML file is absent from disk.

    Expected path: BUILD_DIR/{language}/{slug}.html

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "template_slug", "expected_path"}
    """
    root = build_dir or BUILD_DIR
    published = await fetch_all(
        "SELECT id, slug, language, url_path, template_slug FROM articles"
        " WHERE status = 'published'",
    )
    absent: list[dict] = []
    for art in published:
        expected = root / art["language"] / f"{art['slug']}.html"
        if expected.exists():
            continue
        record = {
            key: art[key]
            for key in ("id", "slug", "language", "url_path", "template_slug")
        }
        record["expected_path"] = str(expected)
        absent.append(record)
    return absent
async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]:
    """pSEO articles referencing [scenario:slug] markers that don't exist.

    Reads markdown source from BUILD_DIR/{language}/md/{slug}.md.
    Only checks published articles with a template_slug (pSEO-generated).
    Articles whose markdown source is missing or unreadable are skipped.

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]}
    """
    bd = build_dir or BUILD_DIR
    scenario_rows = await fetch_all("SELECT slug FROM published_scenarios")
    valid_slugs = {r["slug"] for r in scenario_rows}
    articles = await fetch_all(
        "SELECT id, slug, language, url_path FROM articles"
        " WHERE status = 'published' AND template_slug IS NOT NULL",
    )
    broken = []
    for a in articles:
        md_path = bd / a["language"] / "md" / f"{a['slug']}.md"
        # Explicit UTF-8: read_text() without an encoding uses the locale
        # codec and can raise UnicodeDecodeError on non-UTF-8 platforms.
        # The try/except also closes the gap where the file vanishes between
        # an exists() check and the read — an absent source means "nothing
        # to check", same as before.
        try:
            markdown = md_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)}
        missing_refs = sorted(refs - valid_slugs)
        if missing_refs:
            broken.append({
                "id": a["id"],
                "slug": a["slug"],
                "language": a["language"],
                "url_path": a["url_path"],
                "broken_scenario_refs": missing_refs,
            })
    return broken
# ── Aggregate check ───────────────────────────────────────────────────────────
async def get_all_health_issues(
    templates: list[dict],
    build_dir: Path | None = None,
) -> dict:
    """Run all health checks, return issue counts and full detail lists.

    Returns:
        {
            "hreflang_orphans": [...],
            "missing_build_files": [...],
            "broken_scenario_refs": [...],
            "counts": {
                "hreflang_orphans": N,
                "missing_build_files": N,
                "broken_scenario_refs": N,
                "total": N,
            },
        }
    """
    # Checks run sequentially; each one issues its own DB queries.
    details = {
        "hreflang_orphans": await check_hreflang_orphans(templates),
        "missing_build_files": await check_missing_build_files(build_dir),
        "broken_scenario_refs": await check_broken_scenario_refs(build_dir),
    }
    counts = {name: len(issues) for name, issues in details.items()}
    counts["total"] = sum(counts.values())
    return {**details, "counts": counts}