feat(pseo): add content/health.py — gap detection, freshness, health checks

New module with pure async query functions for the pSEO Engine dashboard:
- get_template_stats() — article counts by status/language per template
- get_template_freshness() — compare _serving_meta.json vs last article gen
- get_content_gaps() — DuckDB rows with no matching article per language
- check_hreflang_orphans() — published articles missing a sibling language
- check_missing_build_files() — published articles with no HTML on disk
- check_broken_scenario_refs() — articles referencing non-existent scenarios
- get_all_health_issues() — runs all checks, returns counts + detail lists

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 18:21:34 +01:00
parent 97c3aafea8
commit 567100076f

View File

@@ -0,0 +1,378 @@
"""
pSEO Engine health checks and content gap queries.
All functions are async, pure queries — no side effects.
Used by the pSEO Engine admin dashboard.
Functions overview:
get_template_stats() — article counts per status/language for one template
get_template_freshness() — compare _serving_meta.json timestamp vs last article generation
get_content_gaps() — DuckDB rows with no matching article for a template+language
check_hreflang_orphans() — published articles missing a sibling language
check_missing_build_files()— published articles whose HTML file is absent from disk
check_broken_scenario_refs()— articles referencing [scenario:slug] that doesn't exist
get_all_health_issues() — run all checks, return counts + details
"""
import json
import logging
import os
import re
from datetime import datetime
from pathlib import Path
from ..analytics import fetch_analytics
from ..core import fetch_all
logger = logging.getLogger(__name__)
# Directory where generate_articles() writes HTML + markdown source files.
BUILD_DIR = Path("data/content/_build")
# Pattern matching [scenario:slug] and [scenario:slug:section] markers.
_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]")
def _validate_table_name(data_table: str) -> None:
"""Guard against SQL injection in table names."""
assert re.match(r"^[a-z_][a-z0-9_.]*$", data_table), (
f"Invalid table name: {data_table}"
)
def _read_serving_meta() -> dict:
"""Read _serving_meta.json written by export_serving.py. Returns {} if absent."""
serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb")
meta_path = Path(serving_path).parent / "_serving_meta.json"
if not meta_path.exists():
return {}
try:
return json.loads(meta_path.read_text())
except (json.JSONDecodeError, OSError):
return {}
def _parse_dt(s: str | None) -> datetime | None:
"""Parse an ISO datetime string to a naive UTC datetime. Returns None on failure."""
if not s:
return None
try:
dt = datetime.fromisoformat(s)
# Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly.
return dt.replace(tzinfo=None)
except (ValueError, TypeError):
return None
# ── Template statistics ───────────────────────────────────────────────────────
async def get_template_stats(template_slug: str) -> dict:
    """Aggregate article counts for one template.

    Returns a dict of the shape:
        {
            "total": N,
            "published": N,
            "draft": N,
            "scheduled": N,
            "by_language": {"en": {"total": N, "published": N, ...}, ...},
        }
    Unknown statuses still count toward "total" (overall and per language)
    but get no dedicated bucket.
    """
    rows = await fetch_all(
        "SELECT status, language, COUNT(*) as cnt FROM articles"
        " WHERE template_slug = ? GROUP BY status, language",
        (template_slug,),
    )

    def _empty_bucket() -> dict:
        return {"total": 0, "published": 0, "draft": 0, "scheduled": 0}

    stats: dict = {**_empty_bucket(), "by_language": {}}
    for row in rows:
        count = row["cnt"]
        status = row["status"]
        lang_bucket = stats["by_language"].setdefault(row["language"], _empty_bucket())
        stats["total"] += count
        lang_bucket["total"] += count
        # Only tally recognised statuses; anything else counts toward totals only.
        if status in stats:
            stats[status] += count
        if status in lang_bucket:
            lang_bucket[status] += count
    return stats
# ── Data freshness ────────────────────────────────────────────────────────────
async def get_template_freshness(templates: list[dict]) -> list[dict]:
    """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template.

    Returns one dict per template:
        {
            "slug": str,
            "name": str,
            "data_table": str,
            "exported_at_utc": str | None,   # from _serving_meta.json
            "last_generated": str | None,    # max(updated_at) in articles
            "row_count": int | None,         # DuckDB row count from meta
            "status": "fresh" | "stale" | "no_articles" | "no_data",
        }

    Freshness semantics:
        "fresh"       — articles generated after the last data export
        "stale"       — data export is newer than the last article generation
        "no_articles" — DuckDB data exists but no articles generated yet
        "no_data"     — _serving_meta.json absent (export_serving not yet run)
    """
    meta = _read_serving_meta()
    exported_raw = meta.get("exported_at_utc")
    exported_dt = _parse_dt(exported_raw)
    tables_meta = meta.get("tables", {})

    report: list[dict] = []
    for tpl in templates:
        slug = tpl["slug"]
        data_table = tpl.get("data_table", "")
        # _serving_meta.json keys tables without their schema prefix:
        # "serving.pseo_city_costs_de" → "pseo_city_costs_de".
        table_key = data_table.rsplit(".", 1)[-1]

        rows = await fetch_all(
            "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles"
            " WHERE template_slug = ?",
            (slug,),
        )
        last_gen_raw = rows[0]["last_gen"] if rows else None
        last_gen_dt = _parse_dt(last_gen_raw)

        if not exported_raw:
            status = "no_data"
        elif last_gen_dt is None:
            status = "no_articles"
        elif exported_dt and exported_dt > last_gen_dt:
            # New data landed after the last generation — regen needed.
            status = "stale"
        else:
            status = "fresh"

        report.append({
            "slug": slug,
            "name": tpl.get("name", slug),
            "data_table": data_table,
            "exported_at_utc": exported_raw,
            "last_generated": last_gen_raw,
            "row_count": tables_meta.get(table_key, {}).get("row_count"),
            "status": status,
        })
    return report
# ── Content gaps ──────────────────────────────────────────────────────────────
async def get_content_gaps(
    template_slug: str,
    data_table: str,
    natural_key: str,
    languages: list[str],
    limit: int = 200,
) -> list[dict]:
    """Return DuckDB rows that have no matching article for at least one language.

    The article slug is constructed as "{template_slug}-{lang}-{natural_key_value}",
    which lets us detect gaps via slug prefixes without rendering URL patterns.

    Args:
        template_slug: template whose articles are checked.
        data_table: DuckDB table holding the source rows (validated identifier).
        natural_key: column in data_table uniquely identifying each row.
        languages: languages every row is expected to have an article in.
        limit: maximum number of DuckDB rows to scan.

    Returns:
        DuckDB rows (as dicts) with two extra keys:
            "_natural_key": str — the natural key value for this row
            "_missing_languages": list[str] — languages with no article

    Raises:
        ValueError: if languages is empty.
    """
    # Raise explicitly instead of `assert`: asserts are stripped under
    # `python -O`, which would silently report zero gaps for every row.
    if not languages:
        raise ValueError("languages must not be empty")
    _validate_table_name(data_table)

    # Fetch all article slugs for this template to determine which rows exist.
    slug_rows = await fetch_all(
        "SELECT slug, language FROM articles WHERE template_slug = ?",
        (template_slug,),
    )
    # Build lookup: (lang, natural_key_value) pairs that already have articles.
    prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages}
    existing: set[tuple[str, str]] = set()
    for r in slug_rows:
        lang = r["language"]
        prefix = prefix_by_lang.get(lang)
        if prefix is None or not r["slug"].startswith(prefix):
            continue
        existing.add((lang, r["slug"][len(prefix):]))

    # data_table was validated as a bare identifier above, so the f-string
    # interpolation cannot inject SQL; LIMIT stays parameterized.
    duckdb_rows = await fetch_analytics(
        f"SELECT * FROM {data_table} LIMIT ?",
        [limit],
    )
    gaps = []
    for row in duckdb_rows:
        nk_val = str(row.get(natural_key, ""))
        missing = [lang for lang in languages if (lang, nk_val) not in existing]
        if missing:
            gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing})
    return gaps
# ── Health checks ─────────────────────────────────────────────────────────────
async def check_hreflang_orphans(templates: list[dict]) -> list[dict]:
    """Published articles missing a sibling language expected by their template.

    Example: a template generating EN + DE. If the EN article exists for a
    url_path but DE is absent, that article is an hreflang orphan.

    Returns list of dicts:
        {
            "template_slug": str,
            "url_path": str,
            "present_languages": list[str],
            "missing_languages": list[str],
        }
    """
    orphans: list[dict] = []
    for tpl in templates:
        expected_langs = set(tpl.get("languages", ["en"]))
        # A single-language template cannot have orphans.
        if len(expected_langs) < 2:
            continue
        # Group published articles by url_path; keep groups covering fewer
        # languages than the template expects.
        incomplete = await fetch_all(
            """SELECT url_path,
                      GROUP_CONCAT(language) as langs,
                      COUNT(DISTINCT language) as lang_count
               FROM articles
               WHERE template_slug = ? AND status = 'published'
               GROUP BY url_path
               HAVING COUNT(DISTINCT language) < ?""",
            (tpl["slug"], len(expected_langs)),
        )
        for row in incomplete:
            present_langs = set(row["langs"].split(","))
            orphans.append({
                "template_slug": tpl["slug"],
                "url_path": row["url_path"],
                "present_languages": sorted(present_langs),
                "missing_languages": sorted(expected_langs - present_langs),
            })
    return orphans
async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]:
    """Published articles whose HTML file is absent from disk.

    Expected path: BUILD_DIR/{language}/{slug}.html

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "template_slug", "expected_path"}
    """
    root = build_dir or BUILD_DIR
    published = await fetch_all(
        "SELECT id, slug, language, url_path, template_slug FROM articles"
        " WHERE status = 'published'",
    )
    absent: list[dict] = []
    for art in published:
        expected = root / art["language"] / f"{art['slug']}.html"
        if expected.exists():
            continue
        record = {
            key: art[key]
            for key in ("id", "slug", "language", "url_path", "template_slug")
        }
        record["expected_path"] = str(expected)
        absent.append(record)
    return absent
async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]:
    """pSEO articles referencing [scenario:slug] markers that don't exist.

    Reads markdown source from BUILD_DIR/{language}/md/{slug}.md.
    Only checks published articles with a template_slug (pSEO-generated).
    Articles whose markdown source is missing or unreadable are skipped.

    Returns list of dicts:
        {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]}
    """
    bd = build_dir or BUILD_DIR
    scenario_rows = await fetch_all("SELECT slug FROM published_scenarios")
    valid_slugs = {r["slug"] for r in scenario_rows}
    articles = await fetch_all(
        "SELECT id, slug, language, url_path FROM articles"
        " WHERE status = 'published' AND template_slug IS NOT NULL",
    )
    broken = []
    for a in articles:
        md_path = bd / a["language"] / "md" / f"{a['slug']}.md"
        # Explicit UTF-8: read_text() without an encoding uses the locale
        # codec and can raise UnicodeDecodeError on non-UTF-8 platforms.
        # The try/except also closes the gap where the file vanishes between
        # an exists() check and the read — an absent source means "nothing
        # to check", same as before.
        try:
            markdown = md_path.read_text(encoding="utf-8")
        except (OSError, UnicodeDecodeError):
            continue
        refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)}
        missing_refs = sorted(refs - valid_slugs)
        if missing_refs:
            broken.append({
                "id": a["id"],
                "slug": a["slug"],
                "language": a["language"],
                "url_path": a["url_path"],
                "broken_scenario_refs": missing_refs,
            })
    return broken
# ── Aggregate check ───────────────────────────────────────────────────────────
async def get_all_health_issues(
    templates: list[dict],
    build_dir: Path | None = None,
) -> dict:
    """Run all health checks, return issue counts and full detail lists.

    Returns:
        {
            "hreflang_orphans": [...],
            "missing_build_files": [...],
            "broken_scenario_refs": [...],
            "counts": {
                "hreflang_orphans": N,
                "missing_build_files": N,
                "broken_scenario_refs": N,
                "total": N,
            },
        }
    """
    # Checks run sequentially; each one issues its own DB queries.
    details = {
        "hreflang_orphans": await check_hreflang_orphans(templates),
        "missing_build_files": await check_missing_build_files(build_dir),
        "broken_scenario_refs": await check_broken_scenario_refs(build_dir),
    }
    counts = {name: len(issues) for name, issues in details.items()}
    counts["total"] = sum(counts.values())
    return {**details, "counts": counts}