feat(pseo): add content/health.py — gap detection, freshness, health checks
New module with pure async query functions for the pSEO Engine dashboard:
- get_template_stats() — article counts by status/language per template
- get_template_freshness() — compare _serving_meta.json vs last article gen
- get_content_gaps() — DuckDB rows with no matching article per language
- check_hreflang_orphans() — published articles missing a sibling language
- check_missing_build_files() — published articles with no HTML on disk
- check_broken_scenario_refs() — articles referencing non-existent scenarios
- get_all_health_issues() — runs all checks, returns counts + detail lists

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
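For orientation, a minimal sketch of how the admin dashboard might call the new module. The import path, the shape of the template config, and the database wiring behind fetch_all/fetch_analytics are assumptions; only the function names and dict keys come from health.py itself.

import asyncio

from padelnomics.content.health import get_all_health_issues, get_template_freshness

# Hypothetical template config; the keys mirror what health.py reads
# (slug, name, data_table, languages).
TEMPLATES = [
    {
        "slug": "city-cost-de",
        "name": "City court costs (DE)",
        "data_table": "serving.pseo_city_costs_de",
        "languages": ["en", "de"],
    },
]

async def main() -> None:
    freshness = await get_template_freshness(TEMPLATES)
    issues = await get_all_health_issues(TEMPLATES)
    for row in freshness:
        print(f"{row['slug']}: {row['status']} (rows={row['row_count']})")
    print("total health issues:", issues["counts"]["total"])

asyncio.run(main())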
web/src/padelnomics/content/health.py (new file, 378 lines)
@@ -0,0 +1,378 @@
"""
pSEO Engine health checks and content gap queries.

All functions are async, pure queries — no side effects.
Used by the pSEO Engine admin dashboard.

Functions overview:
    get_template_stats()         — article counts per status/language for one template
    get_template_freshness()     — compare _serving_meta.json timestamp vs last article generation
    get_content_gaps()           — DuckDB rows with no matching article for a template+language
    check_hreflang_orphans()     — published articles missing a sibling language
    check_missing_build_files()  — published articles whose HTML file is absent from disk
    check_broken_scenario_refs() — articles referencing [scenario:slug] that doesn't exist
    get_all_health_issues()      — run all checks, return counts + details
"""

import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path

from ..analytics import fetch_analytics
from ..core import fetch_all

logger = logging.getLogger(__name__)

# Directory where generate_articles() writes HTML + markdown source files.
BUILD_DIR = Path("data/content/_build")

# Pattern matching [scenario:slug] and [scenario:slug:section] markers.
_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]")
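# Illustration (hypothetical slug): both marker forms capture only the slug part:
#   _SCENARIO_REF_RE.findall("[scenario:club-membership-roi]")       -> ["club-membership-roi"]
#   _SCENARIO_REF_RE.findall("[scenario:club-membership-roi:costs]") -> ["club-membership-roi"]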


def _validate_table_name(data_table: str) -> None:
    """Guard against SQL injection in table names."""
    assert re.match(r"^[a-z_][a-z0-9_.]*$", data_table), (
        f"Invalid table name: {data_table}"
    )


def _read_serving_meta() -> dict:
    """Read _serving_meta.json written by export_serving.py. Returns {} if absent."""
    serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")
    meta_path = Path(serving_path).parent / "_serving_meta.json"
    if not meta_path.exists():
        return {}
    try:
        return json.loads(meta_path.read_text())
    except (json.JSONDecodeError, OSError):
        return {}


def _parse_dt(s: str | None) -> datetime | None:
    """Parse an ISO datetime string to a naive UTC datetime. Returns None on failure."""
    if not s:
        return None
    try:
        dt = datetime.fromisoformat(s)
        # Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly.
        return dt.replace(tzinfo=None)
    except (ValueError, TypeError):
        return None
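
# _parse_dt behaviour at a glance (values follow directly from the code above):
#   _parse_dt("2024-06-01T03:15:00+00:00") -> datetime(2024, 6, 1, 3, 15)  (tzinfo stripped)
#   _parse_dt("2024-06-02 10:00:00")       -> datetime(2024, 6, 2, 10, 0)
#   _parse_dt("not-a-date"), _parse_dt(None) -> None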


# ── Template statistics ───────────────────────────────────────────────────────


async def get_template_stats(template_slug: str) -> dict:
    """Article counts for a template: total, published, draft, scheduled, by language.

    Returns:
        {
            "total": N,
            "published": N,
            "draft": N,
            "scheduled": N,
            "by_language": {"en": {"total": N, "published": N, ...}, ...},
        }
    """
    rows = await fetch_all(
        "SELECT status, language, COUNT(*) as cnt FROM articles"
        " WHERE template_slug = ? GROUP BY status, language",
        (template_slug,),
    )
    stats: dict = {"total": 0, "published": 0, "draft": 0, "scheduled": 0, "by_language": {}}
    for r in rows:
        cnt = r["cnt"]
        status = r["status"]
        lang = r["language"]

        stats["total"] += cnt
        if status in stats:
            stats[status] += cnt

        if lang not in stats["by_language"]:
            stats["by_language"][lang] = {"total": 0, "published": 0, "draft": 0, "scheduled": 0}
        stats["by_language"][lang]["total"] += cnt
        if status in stats["by_language"][lang]:
            stats["by_language"][lang][status] += cnt

    return stats
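
# Example result for a hypothetical template with 3 EN articles (2 published,
# 1 draft) and 2 published DE articles:
#   {"total": 5, "published": 4, "draft": 1, "scheduled": 0,
#    "by_language": {"en": {"total": 3, "published": 2, "draft": 1, "scheduled": 0},
#                    "de": {"total": 2, "published": 2, "draft": 0, "scheduled": 0}}}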


# ── Data freshness ────────────────────────────────────────────────────────────


async def get_template_freshness(templates: list[dict]) -> list[dict]:
    """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template.

    Returns list of dicts — one per template:
        {
            "slug": str,
            "name": str,
            "data_table": str,
            "exported_at_utc": str | None,  # from _serving_meta.json
            "last_generated": str | None,   # max(updated_at) in articles
            "row_count": int | None,        # DuckDB row count from meta
            "status": "fresh" | "stale" | "no_articles" | "no_data",
        }

    Freshness semantics:
        "fresh"       — articles generated after last data export (up to date)
        "stale"       — data export is newer than last article generation (regen needed)
        "no_articles" — DuckDB data exists but no articles generated yet
        "no_data"     — _serving_meta.json absent (export_serving not yet run)
    """
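    # Status decision, illustrated with hypothetical timestamps:
    #   exported_at_utc=2024-06-01T03:00, last_generated=2024-06-02T10:00 -> "fresh"
    #   exported_at_utc=2024-06-03T03:00, last_generated=2024-06-02T10:00 -> "stale"
    #   exported_at_utc present but no articles yet                       -> "no_articles"
    #   _serving_meta.json missing                                        -> "no_data"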
    meta = _read_serving_meta()
    exported_at_str = meta.get("exported_at_utc")
    exported_at = _parse_dt(exported_at_str)
    table_meta = meta.get("tables", {})

    result = []
    for t in templates:
        slug = t["slug"]
        data_table = t.get("data_table", "")
        # Strip schema prefix to match the key in _serving_meta.json tables dict.
        # e.g. "serving.pseo_city_costs_de" → "pseo_city_costs_de"
        table_key = data_table.split(".")[-1] if "." in data_table else data_table

        rows = await fetch_all(
            "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles"
            " WHERE template_slug = ?",
            (slug,),
        )
        last_gen_str = rows[0]["last_gen"] if rows else None
        last_gen = _parse_dt(last_gen_str)

        row_count = table_meta.get(table_key, {}).get("row_count")

        if not exported_at_str:
            status = "no_data"
        elif last_gen is None:
            status = "no_articles"
        elif exported_at and last_gen and exported_at > last_gen:
            # New data available — articles haven't been regenerated against it yet.
            status = "stale"
        else:
            status = "fresh"

        result.append({
            "slug": slug,
            "name": t.get("name", slug),
            "data_table": data_table,
            "exported_at_utc": exported_at_str,
            "last_generated": last_gen_str,
            "row_count": row_count,
            "status": status,
        })

    return result


# ── Content gaps ──────────────────────────────────────────────────────────────


async def get_content_gaps(
    template_slug: str,
    data_table: str,
    natural_key: str,
    languages: list[str],
    limit: int = 200,
) -> list[dict]:
    """Return DuckDB rows that have no matching article for at least one language.

    The article slug is constructed as: "{template_slug}-{lang}-{natural_key_value}"
    This lets us efficiently detect gaps without rendering URL patterns.

    Returns list of dicts — each is the DuckDB row with two extra keys:
        "_natural_key": str — the natural key value for this row
        "_missing_languages": list[str] — languages with no article
    """
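    # Example slug (hypothetical values): template "city-cost-de", language "en",
    # natural key "berlin" -> expected article slug "city-cost-de-en-berlin".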
    assert languages, "languages must not be empty"
    _validate_table_name(data_table)

    # Fetch all article slugs for this template to determine which rows exist.
    slug_rows = await fetch_all(
        "SELECT slug, language FROM articles WHERE template_slug = ?",
        (template_slug,),
    )

    # Build a lookup set of (lang, natural_key_value) pairs that already have articles.
    prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages}
    existing: set[tuple[str, str]] = set()
    for r in slug_rows:
        lang = r["language"]
        if lang not in prefix_by_lang:
            continue
        prefix = prefix_by_lang[lang]
        if r["slug"].startswith(prefix):
            nk_val = r["slug"][len(prefix):]
            existing.add((lang, nk_val))

    duckdb_rows = await fetch_analytics(
        f"SELECT * FROM {data_table} LIMIT ?",
        [limit],
    )

    gaps = []
    for row in duckdb_rows:
        nk_val = str(row.get(natural_key, ""))
        missing = [lang for lang in languages if (lang, nk_val) not in existing]
        if missing:
            gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing})

    return gaps


# ── Health checks ─────────────────────────────────────────────────────────────


async def check_hreflang_orphans(templates: list[dict]) -> list[dict]:
    """Published articles missing a sibling language expected by their template.

    For example: city-cost-de generates EN + DE. If the EN article exists but
    DE is absent, that article is an hreflang orphan.

    Returns list of dicts:
        {
            "template_slug": str,
            "url_path": str,
            "present_languages": list[str],
            "missing_languages": list[str],
        }
    """
    orphans = []
    for t in templates:
        expected = set(t.get("languages", ["en"]))
        if len(expected) < 2:
            continue  # Single-language template — no orphans possible.

        rows = await fetch_all(
            """SELECT url_path,
                      GROUP_CONCAT(language) as langs,
                      COUNT(DISTINCT language) as lang_count
               FROM articles
               WHERE template_slug = ? AND status = 'published'
               GROUP BY url_path
               HAVING COUNT(DISTINCT language) < ?""",
            (t["slug"], len(expected)),
        )
        for r in rows:
            present = set(r["langs"].split(","))
            missing = sorted(expected - present)
            orphans.append({
                "template_slug": t["slug"],
                "url_path": r["url_path"],
                "present_languages": sorted(present),
                "missing_languages": missing,
            })

    return orphans


async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]:
    """Published articles whose HTML file is absent from disk.

    Expected path: BUILD_DIR/{language}/{slug}.html

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "template_slug", "expected_path"}
    """
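    # Example (hypothetical slug): the English article "city-cost-de-en-berlin"
    # is expected at data/content/_build/en/city-cost-de-en-berlin.html.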
    bd = build_dir or BUILD_DIR
    rows = await fetch_all(
        "SELECT id, slug, language, url_path, template_slug FROM articles"
        " WHERE status = 'published'",
    )
    missing = []
    for r in rows:
        path = bd / r["language"] / f"{r['slug']}.html"
        if not path.exists():
            missing.append({
                "id": r["id"],
                "slug": r["slug"],
                "language": r["language"],
                "url_path": r["url_path"],
                "template_slug": r["template_slug"],
                "expected_path": str(path),
            })
    return missing


async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]:
    """pSEO articles referencing [scenario:slug] markers that don't exist.

    Reads markdown source from BUILD_DIR/{language}/md/{slug}.md.
    Only checks published articles with a template_slug (pSEO-generated).

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]}
    """
    bd = build_dir or BUILD_DIR

    scenario_rows = await fetch_all("SELECT slug FROM published_scenarios")
    valid_slugs = {r["slug"] for r in scenario_rows}

    articles = await fetch_all(
        "SELECT id, slug, language, url_path FROM articles"
        " WHERE status = 'published' AND template_slug IS NOT NULL",
    )

    broken = []
    for a in articles:
        md_path = bd / a["language"] / "md" / f"{a['slug']}.md"
        if not md_path.exists():
            continue
        markdown = md_path.read_text()
        refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)}
        missing_refs = sorted(refs - valid_slugs)
        if missing_refs:
            broken.append({
                "id": a["id"],
                "slug": a["slug"],
                "language": a["language"],
                "url_path": a["url_path"],
                "broken_scenario_refs": missing_refs,
            })

    return broken


# ── Aggregate check ───────────────────────────────────────────────────────────


async def get_all_health_issues(
    templates: list[dict],
    build_dir: Path | None = None,
) -> dict:
    """Run all health checks, return issue counts and full detail lists.

    Returns:
        {
            "hreflang_orphans": [...],
            "missing_build_files": [...],
            "broken_scenario_refs": [...],
            "counts": {
                "hreflang_orphans": N,
                "missing_build_files": N,
                "broken_scenario_refs": N,
                "total": N,
            },
        }
    """
    orphans = await check_hreflang_orphans(templates)
    missing_files = await check_missing_build_files(build_dir)
    broken_refs = await check_broken_scenario_refs(build_dir)

    return {
        "hreflang_orphans": orphans,
        "missing_build_files": missing_files,
        "broken_scenario_refs": broken_refs,
        "counts": {
            "hreflang_orphans": len(orphans),
            "missing_build_files": len(missing_files),
            "broken_scenario_refs": len(broken_refs),
            "total": len(orphans) + len(missing_files) + len(broken_refs),
        },
    }