"""
pSEO Engine health checks and content gap queries.

All functions are async, pure queries — no side effects.
Used by the pSEO Engine admin dashboard.

Functions overview:
    get_template_stats()        — article counts per status/language for one template
    get_template_freshness()    — compare _serving_meta.json timestamp vs last article generation
    get_content_gaps()          — DuckDB rows with no matching article for a template+language
    check_hreflang_orphans()    — published articles missing a sibling language
    check_missing_build_files() — published articles whose HTML file is absent from disk
    check_broken_scenario_refs()— articles referencing [scenario:slug] that doesn't exist
    get_all_health_issues()     — run all checks, return counts + details
"""
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path

from ..analytics import fetch_analytics
from ..core import fetch_all

logger = logging.getLogger(__name__)

# Directory where generate_articles() writes HTML + markdown source files.
BUILD_DIR = Path("data/content/_build")

# Pattern matching [scenario:slug] and [scenario:slug:section] markers.
_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]")


def _validate_table_name(data_table: str) -> None:
    """Guard against SQL injection in table names.

    Raises:
        ValueError: if data_table is not a plain [schema.]table identifier.
    """
    # Must be a real raise, not an assert: asserts are stripped under
    # `python -O`, which would silently disable this injection guard.
    if not re.match(r"^[a-z_][a-z0-9_.]*$", data_table):
        raise ValueError(f"Invalid table name: {data_table}")


def _read_serving_meta() -> dict:
    """Read _serving_meta.json written by export_serving.py. Returns {} if absent."""
    serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")
    meta_path = Path(serving_path).parent / "_serving_meta.json"
    if not meta_path.exists():
        return {}
    try:
        # Explicit encoding: the exporter writes UTF-8; don't depend on the
        # platform default.
        return json.loads(meta_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, OSError):
        # Treat a corrupt/unreadable meta file the same as an absent one.
        return {}


def _parse_dt(s: str | None) -> datetime | None:
    """Parse an ISO datetime string to a naive UTC datetime.

    Returns None on failure (None/empty input or unparseable string).
    """
    if not s:
        return None
    try:
        dt = datetime.fromisoformat(s)
        # Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly.
        return dt.replace(tzinfo=None)
    except (ValueError, TypeError):
        return None


# ── Template statistics ───────────────────────────────────────────────────────


async def get_template_stats(template_slug: str) -> dict:
    """Article counts for a template: total, published, draft, scheduled, by language.

    Returns:
        {
            "total": N,
            "published": N,
            "draft": N,
            "scheduled": N,
            "by_language": {"en": {"total": N, "published": N, ...}, ...},
        }
    """
    rows = await fetch_all(
        "SELECT status, language, COUNT(*) as cnt FROM articles"
        " WHERE template_slug = ? GROUP BY status, language",
        (template_slug,),
    )
    stats: dict = {"total": 0, "published": 0, "draft": 0, "scheduled": 0, "by_language": {}}
    for r in rows:
        cnt = r["cnt"]
        status = r["status"]
        lang = r["language"]

        stats["total"] += cnt
        # Only count statuses we track; unknown statuses still add to "total".
        if status in stats:
            stats[status] += cnt

        if lang not in stats["by_language"]:
            stats["by_language"][lang] = {"total": 0, "published": 0, "draft": 0, "scheduled": 0}
        stats["by_language"][lang]["total"] += cnt
        if status in stats["by_language"][lang]:
            stats["by_language"][lang][status] += cnt

    return stats


# ── Data freshness ────────────────────────────────────────────────────────────


async def get_template_freshness(templates: list[dict]) -> list[dict]:
    """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template.

    Returns list of dicts — one per template:
        {
            "slug": str,
            "name": str,
            "data_table": str,
            "exported_at_utc": str | None,  # from _serving_meta.json
            "last_generated": str | None,   # max(updated_at) in articles
            "row_count": int | None,        # DuckDB row count from meta
            "status": "fresh" | "stale" | "no_articles" | "no_data",
        }

    Freshness semantics:
        "fresh"       — articles generated after last data export (up to date)
        "stale"       — data export is newer than last article generation (regen needed)
        "no_articles" — DuckDB data exists but no articles generated yet
        "no_data"     — _serving_meta.json absent (export_serving not yet run)
    """
    meta = _read_serving_meta()
    exported_at_str = meta.get("exported_at_utc")
    exported_at = _parse_dt(exported_at_str)
    table_meta = meta.get("tables", {})

    result = []
    for t in templates:
        slug = t["slug"]
        data_table = t.get("data_table", "")
        # Strip schema prefix to match the key in _serving_meta.json tables dict.
        # e.g. "serving.pseo_city_costs_de" → "pseo_city_costs_de"
        # (split()[-1] already returns the whole string when there is no dot)
        table_key = data_table.split(".")[-1]

        rows = await fetch_all(
            "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles"
            " WHERE template_slug = ?",
            (slug,),
        )
        last_gen_str = rows[0]["last_gen"] if rows else None
        last_gen = _parse_dt(last_gen_str)

        row_count = table_meta.get(table_key, {}).get("row_count")

        if not exported_at_str:
            status = "no_data"
        elif last_gen is None:
            status = "no_articles"
        elif exported_at and last_gen and exported_at > last_gen:
            # New data available — articles haven't been regenerated against it yet.
            status = "stale"
        else:
            status = "fresh"

        result.append({
            "slug": slug,
            "name": t.get("name", slug),
            "data_table": data_table,
            "exported_at_utc": exported_at_str,
            "last_generated": last_gen_str,
            "row_count": row_count,
            "status": status,
        })

    return result


# ── Content gaps ──────────────────────────────────────────────────────────────


async def get_content_gaps(
    template_slug: str,
    data_table: str,
    natural_key: str,
    languages: list[str],
    limit: int = 200,
) -> list[dict]:
    """Return DuckDB rows that have no matching article for at least one language.

    The article slug is constructed as: "{template_slug}-{lang}-{natural_key_value}"
    This lets us efficiently detect gaps without rendering URL patterns.

    Returns list of dicts — each is the DuckDB row with two extra keys:
        "_natural_key": str — the natural key value for this row
        "_missing_languages": list[str] — languages with no article

    Raises:
        ValueError: if languages is empty or data_table is not a valid identifier.
    """
    # Real raises, not asserts — validation must survive `python -O`.
    if not languages:
        raise ValueError("languages must not be empty")
    _validate_table_name(data_table)

    # Fetch all article slugs for this template to determine which rows exist.
    slug_rows = await fetch_all(
        "SELECT slug, language FROM articles WHERE template_slug = ?",
        (template_slug,),
    )

    # Build lookup: (lang, natural_key_value) → True
    prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages}
    existing: set[tuple[str, str]] = set()
    for r in slug_rows:
        lang = r["language"]
        if lang not in prefix_by_lang:
            continue
        prefix = prefix_by_lang[lang]
        if r["slug"].startswith(prefix):
            nk_val = r["slug"][len(prefix):]
            existing.add((lang, nk_val))

    # data_table is validated above — safe to interpolate into the query.
    duckdb_rows = await fetch_analytics(
        f"SELECT * FROM {data_table} LIMIT ?",
        [limit],
    )

    gaps = []
    for row in duckdb_rows:
        nk_val = str(row.get(natural_key, ""))
        missing = [lang for lang in languages if (lang, nk_val) not in existing]
        if missing:
            gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing})

    return gaps


# ── Health checks ─────────────────────────────────────────────────────────────


async def check_hreflang_orphans(templates: list[dict]) -> list[dict]:
    """Published articles missing a sibling language expected by their template.

    For example: city-cost-de generates EN + DE. If the EN article exists but
    DE is absent, that article is an hreflang orphan.

    Returns list of dicts:
        {
            "template_slug": str,
            "url_path": str,
            "present_languages": list[str],
            "missing_languages": list[str],
        }
    """
    orphans = []
    for t in templates:
        expected = set(t.get("languages", ["en"]))
        if len(expected) < 2:
            continue  # Single-language template — no orphans possible.

        rows = await fetch_all(
            """SELECT url_path,
                      GROUP_CONCAT(language) as langs,
                      COUNT(DISTINCT language) as lang_count
               FROM articles
               WHERE template_slug = ?
                 AND status = 'published'
               GROUP BY url_path
               HAVING COUNT(DISTINCT language) < ?""",
            (t["slug"], len(expected)),
        )
        for r in rows:
            # set() dedupes in case GROUP_CONCAT repeats a language.
            present = set(r["langs"].split(","))
            missing = sorted(expected - present)
            orphans.append({
                "template_slug": t["slug"],
                "url_path": r["url_path"],
                "present_languages": sorted(present),
                "missing_languages": missing,
            })

    return orphans


async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]:
    """Published articles whose HTML file is absent from disk.

    Expected path: BUILD_DIR/{language}/{slug}.html

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "template_slug", "expected_path"}
    """
    bd = build_dir or BUILD_DIR
    rows = await fetch_all(
        "SELECT id, slug, language, url_path, template_slug FROM articles"
        " WHERE status = 'published'",
    )
    missing = []
    for r in rows:
        path = bd / r["language"] / f"{r['slug']}.html"
        if not path.exists():
            missing.append({
                "id": r["id"],
                "slug": r["slug"],
                "language": r["language"],
                "url_path": r["url_path"],
                "template_slug": r["template_slug"],
                "expected_path": str(path),
            })
    return missing


async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]:
    """pSEO articles referencing [scenario:slug] markers that don't exist.

    Reads markdown source from BUILD_DIR/{language}/md/{slug}.md.
    Only checks published articles with a template_slug (pSEO-generated).

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]}
    """
    bd = build_dir or BUILD_DIR

    scenario_rows = await fetch_all("SELECT slug FROM published_scenarios")
    valid_slugs = {r["slug"] for r in scenario_rows}

    articles = await fetch_all(
        "SELECT id, slug, language, url_path FROM articles"
        " WHERE status = 'published' AND template_slug IS NOT NULL",
    )

    broken = []
    for a in articles:
        md_path = bd / a["language"] / "md" / f"{a['slug']}.md"
        if not md_path.exists():
            # Missing source file is reported by check_missing_build_files,
            # not here — skip silently.
            continue
        markdown = md_path.read_text(encoding="utf-8")
        refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)}
        missing_refs = sorted(refs - valid_slugs)
        if missing_refs:
            broken.append({
                "id": a["id"],
                "slug": a["slug"],
                "language": a["language"],
                "url_path": a["url_path"],
                "broken_scenario_refs": missing_refs,
            })

    return broken


# ── Aggregate check ───────────────────────────────────────────────────────────


async def get_all_health_issues(
    templates: list[dict],
    build_dir: Path | None = None,
) -> dict:
    """Run all health checks, return issue counts and full detail lists.

    Returns:
        {
            "hreflang_orphans": [...],
            "missing_build_files": [...],
            "broken_scenario_refs": [...],
            "counts": {
                "hreflang_orphans": N,
                "missing_build_files": N,
                "broken_scenario_refs": N,
                "total": N,
            },
        }
    """
    # Run sequentially — the shared DB layer isn't known to be safe for
    # concurrent queries, and these checks are dashboard-speed anyway.
    orphans = await check_hreflang_orphans(templates)
    missing_files = await check_missing_build_files(build_dir)
    broken_refs = await check_broken_scenario_refs(build_dir)

    return {
        "hreflang_orphans": orphans,
        "missing_build_files": missing_files,
        "broken_scenario_refs": broken_refs,
        "counts": {
            "hreflang_orphans": len(orphans),
            "missing_build_files": len(missing_files),
            "broken_scenario_refs": len(broken_refs),
            "total": len(orphans) + len(missing_files) + len(broken_refs),
        },
    }