feat(pseo): add content/health.py — gap detection, freshness, health checks
New module with pure async query functions for the pSEO Engine dashboard: - get_template_stats() — article counts by status/language per template - get_template_freshness() — compare _serving_meta.json vs last article gen - get_content_gaps() — DuckDB rows with no matching article per language - check_hreflang_orphans() — published articles missing a sibling language - check_missing_build_files() — published articles with no HTML on disk - check_broken_scenario_refs() — articles referencing non-existent scenarios - get_all_health_issues() — runs all checks, returns counts + detail lists Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
378
web/src/padelnomics/content/health.py
Normal file
378
web/src/padelnomics/content/health.py
Normal file
@@ -0,0 +1,378 @@
|
||||
"""
|
||||
pSEO Engine health checks and content gap queries.
|
||||
|
||||
All functions are async, pure queries — no side effects.
|
||||
Used by the pSEO Engine admin dashboard.
|
||||
|
||||
Functions overview:
|
||||
get_template_stats() — article counts per status/language for one template
|
||||
get_template_freshness() — compare _serving_meta.json timestamp vs last article generation
|
||||
get_content_gaps() — DuckDB rows with no matching article for a template+language
|
||||
check_hreflang_orphans() — published articles missing a sibling language
|
||||
check_missing_build_files()— published articles whose HTML file is absent from disk
|
||||
check_broken_scenario_refs()— articles referencing [scenario:slug] that doesn't exist
|
||||
get_all_health_issues() — run all checks, return counts + details
|
||||
"""
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from ..analytics import fetch_analytics
|
||||
from ..core import fetch_all
|
||||
|
||||
# Module-level logger, named after this module per project convention.
logger = logging.getLogger(__name__)

# Directory where generate_articles() writes HTML + markdown source files.
# check_missing_build_files() and check_broken_scenario_refs() read from here.
BUILD_DIR = Path("data/content/_build")

# Pattern matching [scenario:slug] and [scenario:slug:section] markers.
# group(1) captures the scenario slug; the optional ":section" suffix is matched
# but not captured.
_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]")
|
||||
|
||||
|
||||
def _validate_table_name(data_table: str) -> None:
|
||||
"""Guard against SQL injection in table names."""
|
||||
assert re.match(r"^[a-z_][a-z0-9_.]*$", data_table), (
|
||||
f"Invalid table name: {data_table}"
|
||||
)
|
||||
|
||||
|
||||
def _read_serving_meta() -> dict:
|
||||
"""Read _serving_meta.json written by export_serving.py. Returns {} if absent."""
|
||||
serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")
|
||||
meta_path = Path(serving_path).parent / "_serving_meta.json"
|
||||
if not meta_path.exists():
|
||||
return {}
|
||||
try:
|
||||
return json.loads(meta_path.read_text())
|
||||
except (json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
|
||||
def _parse_dt(s: str | None) -> datetime | None:
|
||||
"""Parse an ISO datetime string to a naive UTC datetime. Returns None on failure."""
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
dt = datetime.fromisoformat(s)
|
||||
# Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly.
|
||||
return dt.replace(tzinfo=None)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
|
||||
# ── Template statistics ───────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_template_stats(template_slug: str) -> dict:
    """Aggregate article counts for one template, overall and per language.

    Returns:
        {
            "total": N,
            "published": N,
            "draft": N,
            "scheduled": N,
            "by_language": {"en": {"total": N, "published": N, ...}, ...},
        }
    """
    grouped = await fetch_all(
        "SELECT status, language, COUNT(*) as cnt FROM articles"
        " WHERE template_slug = ? GROUP BY status, language",
        (template_slug,),
    )

    def _empty_bucket() -> dict:
        # Fresh counter dict so language buckets never share state.
        return {"total": 0, "published": 0, "draft": 0, "scheduled": 0}

    stats: dict = {**_empty_bucket(), "by_language": {}}
    for row in grouped:
        count = row["cnt"]
        status = row["status"]
        lang = row["language"]

        stats["total"] += count
        # Unknown statuses still count toward totals but get no own bucket.
        if status in stats:
            stats[status] += count

        lang_bucket = stats["by_language"].setdefault(lang, _empty_bucket())
        lang_bucket["total"] += count
        if status in lang_bucket:
            lang_bucket[status] += count

    return stats
|
||||
|
||||
|
||||
# ── Data freshness ────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_template_freshness(templates: list[dict]) -> list[dict]:
    """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template.

    Returns one dict per template:
        {
            "slug": str,
            "name": str,
            "data_table": str,
            "exported_at_utc": str | None,  # from _serving_meta.json
            "last_generated": str | None,   # max(updated_at) in articles
            "row_count": int | None,        # DuckDB row count from meta
            "status": "fresh" | "stale" | "no_articles" | "no_data",
        }

    Freshness semantics:
        "fresh"       — articles generated after last data export (up to date)
        "stale"       — data export is newer than last article generation (regen needed)
        "no_articles" — DuckDB data exists but no articles generated yet
        "no_data"     — _serving_meta.json absent (export_serving not yet run)
    """
    meta = _read_serving_meta()
    exported_at_str = meta.get("exported_at_utc")
    exported_at = _parse_dt(exported_at_str)
    tables_meta = meta.get("tables", {})

    out: list[dict] = []
    for tpl in templates:
        slug = tpl["slug"]
        data_table = tpl.get("data_table", "")
        # _serving_meta.json keys its tables dict without a schema prefix,
        # e.g. "serving.pseo_city_costs_de" → "pseo_city_costs_de".
        table_key = data_table.rsplit(".", 1)[-1]

        rows = await fetch_all(
            "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles"
            " WHERE template_slug = ?",
            (slug,),
        )
        last_gen_str = rows[0]["last_gen"] if rows else None
        last_gen = _parse_dt(last_gen_str)

        if not exported_at_str:
            status = "no_data"
        elif last_gen is None:
            status = "no_articles"
        elif exported_at is not None and exported_at > last_gen:
            # New data available — articles haven't been regenerated against it yet.
            status = "stale"
        else:
            status = "fresh"

        out.append({
            "slug": slug,
            "name": tpl.get("name", slug),
            "data_table": data_table,
            "exported_at_utc": exported_at_str,
            "last_generated": last_gen_str,
            "row_count": tables_meta.get(table_key, {}).get("row_count"),
            "status": status,
        })

    return out
|
||||
|
||||
|
||||
# ── Content gaps ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_content_gaps(
    template_slug: str,
    data_table: str,
    natural_key: str,
    languages: list[str],
    limit: int = 200,
) -> list[dict]:
    """Return DuckDB rows that have no matching article for at least one language.

    The article slug is constructed as: "{template_slug}-{lang}-{natural_key_value}"
    This lets us efficiently detect gaps without rendering URL patterns.

    Args:
        template_slug: Template whose articles are checked.
        data_table: DuckDB table holding the source rows (validated identifier).
        natural_key: Column in ``data_table`` that uniquely identifies a row.
        languages: Languages the template is expected to cover.
        limit: Maximum number of DuckDB rows to scan.

    Returns:
        List of DuckDB rows (as dicts), each with two extra keys:
            "_natural_key": str — the natural key value for this row
            "_missing_languages": list[str] — languages with no article

    Raises:
        ValueError: If ``languages`` is empty. A malformed ``data_table``
            also raises via _validate_table_name().
    """
    # Raise explicitly instead of using `assert`: asserts are stripped under
    # `python -O`, which would silently skip this validation.
    if not languages:
        raise ValueError("languages must not be empty")
    _validate_table_name(data_table)

    # Fetch all article slugs for this template to determine which rows exist.
    slug_rows = await fetch_all(
        "SELECT slug, language FROM articles WHERE template_slug = ?",
        (template_slug,),
    )

    # Build lookup: (lang, natural_key_value) present in articles.
    prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages}
    existing: set[tuple[str, str]] = set()
    for r in slug_rows:
        lang = r["language"]
        prefix = prefix_by_lang.get(lang)
        if prefix is None:
            continue  # Article in a language this template no longer covers.
        if r["slug"].startswith(prefix):
            existing.add((lang, r["slug"][len(prefix):]))

    # Table name is validated above; LIMIT stays parameterized.
    duckdb_rows = await fetch_analytics(
        f"SELECT * FROM {data_table} LIMIT ?",
        [limit],
    )

    gaps = []
    for row in duckdb_rows:
        nk_val = str(row.get(natural_key, ""))
        missing = [lang for lang in languages if (lang, nk_val) not in existing]
        if missing:
            gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing})

    return gaps
|
||||
|
||||
|
||||
# ── Health checks ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def check_hreflang_orphans(templates: list[dict]) -> list[dict]:
    """Find published articles missing a sibling language their template expects.

    Example: if a template generates EN + DE and the EN article exists but DE
    does not, that url_path is an hreflang orphan.

    Returns list of dicts:
        {
            "template_slug": str,
            "url_path": str,
            "present_languages": list[str],
            "missing_languages": list[str],
        }
    """
    issues: list[dict] = []
    for tpl in templates:
        wanted = set(tpl.get("languages", ["en"]))
        # A single-language template can never produce an orphan.
        if len(wanted) < 2:
            continue

        incomplete = await fetch_all(
            """SELECT url_path,
                      GROUP_CONCAT(language) as langs,
                      COUNT(DISTINCT language) as lang_count
               FROM articles
               WHERE template_slug = ? AND status = 'published'
               GROUP BY url_path
               HAVING COUNT(DISTINCT language) < ?""",
            (tpl["slug"], len(wanted)),
        )
        for row in incomplete:
            have = set(row["langs"].split(","))
            issues.append({
                "template_slug": tpl["slug"],
                "url_path": row["url_path"],
                "present_languages": sorted(have),
                "missing_languages": sorted(wanted - have),
            })

    return issues
|
||||
|
||||
|
||||
async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]:
    """Find published articles whose HTML file is absent from disk.

    Expected path: {build_dir or BUILD_DIR}/{language}/{slug}.html

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "template_slug", "expected_path"}
    """
    root = build_dir or BUILD_DIR
    published = await fetch_all(
        "SELECT id, slug, language, url_path, template_slug FROM articles"
        " WHERE status = 'published'",
    )

    absent: list[dict] = []
    for art in published:
        expected = root / art["language"] / f"{art['slug']}.html"
        if expected.exists():
            continue
        record = {key: art[key] for key in ("id", "slug", "language", "url_path", "template_slug")}
        record["expected_path"] = str(expected)
        absent.append(record)

    return absent
|
||||
|
||||
|
||||
async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]:
    """pSEO articles referencing [scenario:slug] markers that don't exist.

    Reads markdown source from BUILD_DIR/{language}/md/{slug}.md.
    Only checks published articles with a template_slug (pSEO-generated).

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]}
    """
    bd = build_dir or BUILD_DIR

    scenario_rows = await fetch_all("SELECT slug FROM published_scenarios")
    valid_slugs = {r["slug"] for r in scenario_rows}

    articles = await fetch_all(
        "SELECT id, slug, language, url_path FROM articles"
        " WHERE status = 'published' AND template_slug IS NOT NULL",
    )

    broken = []
    for a in articles:
        md_path = bd / a["language"] / "md" / f"{a['slug']}.md"
        if not md_path.exists():
            continue
        # Read with an explicit encoding: the default is locale-dependent
        # (e.g. cp1252 on Windows), which can fail or mangle UTF-8 markdown.
        # Assumes the generator writes UTF-8 — confirm against generate_articles().
        markdown = md_path.read_text(encoding="utf-8")
        refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)}
        missing_refs = sorted(refs - valid_slugs)
        if missing_refs:
            broken.append({
                "id": a["id"],
                "slug": a["slug"],
                "language": a["language"],
                "url_path": a["url_path"],
                "broken_scenario_refs": missing_refs,
            })

    return broken
|
||||
|
||||
|
||||
# ── Aggregate check ───────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
async def get_all_health_issues(
    templates: list[dict],
    build_dir: Path | None = None,
) -> dict:
    """Run all health checks, return issue counts and full detail lists.

    Returns:
        {
            "hreflang_orphans": [...],
            "missing_build_files": [...],
            "broken_scenario_refs": [...],
            "counts": {
                "hreflang_orphans": N,
                "missing_build_files": N,
                "broken_scenario_refs": N,
                "total": N,
            },
        }
    """
    hreflang = await check_hreflang_orphans(templates)
    no_file = await check_missing_build_files(build_dir)
    bad_refs = await check_broken_scenario_refs(build_dir)

    counts = {
        "hreflang_orphans": len(hreflang),
        "missing_build_files": len(no_file),
        "broken_scenario_refs": len(bad_refs),
    }
    counts["total"] = sum(counts.values())

    return {
        "hreflang_orphans": hreflang,
        "missing_build_files": no_file,
        "broken_scenario_refs": bad_refs,
        "counts": counts,
    }
|
||||
Reference in New Issue
Block a user