diff --git a/CHANGELOG.md b/CHANGELOG.md index 8184e00..0e4a58b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,16 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] ### Added +- **pSEO Engine admin tab** (`/admin/pseo`) — operational visibility for the programmatic SEO system: + - **Content gap detection** — queries DuckDB serving tables vs SQLite articles to find rows with no matching article per language; per-template HTMX-loaded gap list + - **Data freshness signals** — compares `_serving_meta.json` export timestamp vs `MAX(updated_at)` in articles; per-template status: 🟢 Fresh / 🟡 Stale / 🟣 No articles / ⚫ No data + - **Article health checks** (HTMX partial) — hreflang orphans (EN exists, DE missing), missing HTML build files, broken `[scenario:slug]` references in article markdown + - **Generation job monitoring** — live progress bars polling every 2s while jobs run; stops polling on completion; error drilldown via `
`; dedicated `/admin/pseo/jobs` list page + - **`_serving_meta.json`** — written by `export_serving.py` after atomic rename; records `exported_at_utc` and per-table row counts; drives freshness signals in pSEO Engine dashboard + - **Progress tracking columns** on `tasks` table (migration 0021): `progress_current`, `progress_total`, `error_log`; `generate_articles()` writes progress every 50 articles and on completion + - 45 new tests covering all health functions + pSEO routes (access control, rendering, gap detection, generate-gaps POST, job status HTMX polling) + + - **Dual market score system** — split the single market score into two branded scores: - **padelnomics Marktreife-Score™** (market maturity): existing score, refined — only for cities with ≥1 padel venue. Adds ×0.85 saturation discount when `venues_per_100k > 8`. diff --git a/PROJECT.md b/PROJECT.md index 2bb85a1..c1bf877 100644 --- a/PROJECT.md +++ b/PROJECT.md @@ -107,6 +107,7 @@ - [x] Task queue management (list, retry, delete) - [x] Lead funnel stats on admin dashboard - [x] Email hub (`/admin/emails`) — sent log, inbox, compose, audiences, delivery event tracking via Resend webhooks +- [x] **pSEO Engine tab** (`/admin/pseo`) — content gap detection, data freshness signals, article health checks (hreflang orphans, missing build files, broken scenario refs), generation job monitoring with live progress bars ### SEO & Legal - [x] Sitemap (both language variants, `` on all entries) @@ -136,6 +137,7 @@ ## In Progress 🔄 - [ ] **Dual market score system** — Marktreife-Score + Marktpotenzial-Score + expanded data pipeline (merging to master) +- [ ] **pSEO Engine** — implemented (worktree `pseo-engine`), pending merge to master --- diff --git a/src/padelnomics/export_serving.py b/src/padelnomics/export_serving.py index 9d79df6..d6ea835 100644 --- a/src/padelnomics/export_serving.py +++ b/src/padelnomics/export_serving.py @@ -24,9 +24,12 @@ Usage: uv run python -m
padelnomics.export_serving """ +import json import logging import os import re +from datetime import UTC, datetime +from pathlib import Path import duckdb @@ -45,6 +48,8 @@ def export_serving() -> None: # (rename across filesystems is not atomic on Linux). tmp_path = os.path.join(os.path.dirname(os.path.abspath(serving_path)), "_export.duckdb") + table_counts: dict[str, int] = {} + src = duckdb.connect(pipeline_path, read_only=True) try: # SQLMesh creates serving views that reference "local".sqlmesh__serving.* @@ -81,6 +86,7 @@ def export_serving() -> None: dst.execute(f"CREATE OR REPLACE TABLE serving.{logical_name} AS SELECT * FROM _src") dst.unregister("_src") row_count = dst.sql(f"SELECT count(*) FROM serving.{logical_name}").fetchone()[0] + table_counts[logical_name] = row_count logger.info(f" serving.{logical_name}: {row_count:,} rows") finally: dst.close() @@ -91,6 +97,16 @@ def export_serving() -> None: os.rename(tmp_path, serving_path) logger.info(f"Serving DB atomically updated: {serving_path}") + # Write freshness metadata so the pSEO dashboard can show data age without + # querying file mtimes (which are unreliable after rclone syncs). + meta_path = Path(serving_path).parent / "_serving_meta.json" + meta = { + "exported_at_utc": datetime.now(tz=UTC).isoformat(), + "tables": {name: {"row_count": count} for name, count in table_counts.items()}, + } + meta_path.write_text(json.dumps(meta)) + logger.info("Wrote serving metadata: %s", meta_path) + if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(asctime)s %(name)s %(levelname)s %(message)s") diff --git a/web/src/padelnomics/admin/pseo_routes.py b/web/src/padelnomics/admin/pseo_routes.py new file mode 100644 index 0000000..af7fdf0 --- /dev/null +++ b/web/src/padelnomics/admin/pseo_routes.py @@ -0,0 +1,209 @@ +""" +pSEO Engine admin blueprint. 
+ +Operational visibility for the programmatic SEO system: + /admin/pseo/ β†’ dashboard (template stats, freshness, recent jobs) + /admin/pseo/health β†’ HTMX partial: health issues + /admin/pseo/gaps/ β†’ HTMX partial: content gaps for one template + /admin/pseo/gaps//generate β†’ POST: enqueue gap-fill job + /admin/pseo/jobs β†’ recent generation jobs + /admin/pseo/jobs//status β†’ HTMX polled: progress bar for one job + +Registered as a standalone blueprint so admin/routes.py (already ~2,100 lines) +stays focused on its own domain. +""" +from datetime import date +from pathlib import Path + +from quart import Blueprint, flash, redirect, render_template, url_for + +from ..auth.routes import role_required +from ..content import discover_templates, load_template +from ..content.health import ( + get_all_health_issues, + get_content_gaps, + get_template_freshness, + get_template_stats, +) +from ..core import csrf_protect, fetch_all, fetch_one + +bp = Blueprint( + "pseo", + __name__, + template_folder=str(Path(__file__).parent / "templates"), + url_prefix="/admin/pseo", +) + + +@bp.before_request +async def _inject_sidebar_data(): + """Load unread inbox count for the admin sidebar badge.""" + from quart import g + + try: + row = await fetch_one("SELECT COUNT(*) as cnt FROM inbound_emails WHERE is_read = 0") + g.admin_unread_count = row["cnt"] if row else 0 + except Exception: + g.admin_unread_count = 0 + + +@bp.context_processor +def _admin_context(): + """Expose admin-specific variables to all pSEO templates.""" + from quart import g + + return {"unread_count": getattr(g, "admin_unread_count", 0)} + + +# ── Dashboard ──────────────────────────────────────────────────────────────── + + +@bp.route("/") +@role_required("admin") +async def pseo_dashboard(): + """pSEO Engine dashboard: template stats, freshness, recent jobs.""" + templates = discover_templates() + + freshness = await get_template_freshness(templates) + freshness_by_slug = {f["slug"]: f for f in 
freshness} + + template_rows = [] + for t in templates: + stats = await get_template_stats(t["slug"]) + template_rows.append({ + "template": t, + "stats": stats, + "freshness": freshness_by_slug.get(t["slug"], {}), + }) + + total_articles = sum(r["stats"]["total"] for r in template_rows) + total_published = sum(r["stats"]["published"] for r in template_rows) + stale_count = sum(1 for f in freshness if f["status"] == "stale") + + # Recent generation jobs β€” enough for the dashboard summary. + jobs = await fetch_all( + "SELECT id, task_name, status, progress_current, progress_total," + " error, error_log, created_at, completed_at" + " FROM tasks WHERE task_name = 'generate_articles'" + " ORDER BY created_at DESC LIMIT 5", + ) + + return await render_template( + "admin/pseo_dashboard.html", + template_rows=template_rows, + total_articles=total_articles, + total_published=total_published, + total_templates=len(templates), + stale_count=stale_count, + jobs=jobs, + admin_page="pseo", + ) + + +# ── Health checks (HTMX partial) ───────────────────────────────────────────── + + +@bp.route("/health") +@role_required("admin") +async def pseo_health(): + """HTMX partial: all health issue lists.""" + templates = discover_templates() + health = await get_all_health_issues(templates) + return await render_template("admin/pseo_health.html", health=health) + + +# ── Content gaps (HTMX partial + generate action) ──────────────────────────── + + +@bp.route("/gaps/") +@role_required("admin") +async def pseo_gaps_template(slug: str): + """HTMX partial: content gaps for a specific template.""" + try: + config = load_template(slug) + except (AssertionError, FileNotFoundError): + return "Template not found", 404 + + gaps = await get_content_gaps( + template_slug=slug, + data_table=config["data_table"], + natural_key=config["natural_key"], + languages=config["languages"], + ) + return await render_template( + "admin/pseo_gaps.html", + template=config, + gaps=gaps, + ) + + 
+@bp.route("/gaps//generate", methods=["POST"]) +@role_required("admin") +@csrf_protect +async def pseo_generate_gaps(slug: str): + """Enqueue a generation job limited to filling gaps for this template.""" + from ..worker import enqueue + + try: + config = load_template(slug) + except (AssertionError, FileNotFoundError): + await flash("Template not found.", "error") + return redirect(url_for("pseo.pseo_dashboard")) + + gaps = await get_content_gaps( + template_slug=slug, + data_table=config["data_table"], + natural_key=config["natural_key"], + languages=config["languages"], + ) + + if not gaps: + await flash(f"No gaps found for '{config['name']}' β€” nothing to generate.", "info") + return redirect(url_for("pseo.pseo_dashboard")) + + await enqueue("generate_articles", { + "template_slug": slug, + "start_date": date.today().isoformat(), + "articles_per_day": 500, + "limit": 500, + }) + await flash( + f"Queued generation for {len(gaps)} missing articles in '{config['name']}'.", + "success", + ) + return redirect(url_for("pseo.pseo_dashboard")) + + +# ── Generation job monitoring ───────────────────────────────────────────────── + + +@bp.route("/jobs") +@role_required("admin") +async def pseo_jobs(): + """Full list of recent article generation jobs.""" + jobs = await fetch_all( + "SELECT id, task_name, status, progress_current, progress_total," + " error, error_log, created_at, completed_at" + " FROM tasks WHERE task_name = 'generate_articles'" + " ORDER BY created_at DESC LIMIT 20", + ) + return await render_template( + "admin/pseo_jobs.html", + jobs=jobs, + admin_page="pseo", + ) + + +@bp.route("/jobs//status") +@role_required("admin") +async def pseo_job_status(job_id: int): + """HTMX polled endpoint: progress bar for a running generation job.""" + job = await fetch_one( + "SELECT id, status, progress_current, progress_total, error, error_log," + " created_at, completed_at" + " FROM tasks WHERE id = ?", + (job_id,), + ) + if not job: + return "Job not found", 404 + 
return await render_template("admin/pseo_job_status.html", job=job) diff --git a/web/src/padelnomics/admin/templates/admin/base_admin.html b/web/src/padelnomics/admin/templates/admin/base_admin.html index 3ca1820..687335e 100644 --- a/web/src/padelnomics/admin/templates/admin/base_admin.html +++ b/web/src/padelnomics/admin/templates/admin/base_admin.html @@ -95,6 +95,12 @@ Templates +
pSEO
+ + + pSEO Engine + +
Email
diff --git a/web/src/padelnomics/admin/templates/admin/pseo_dashboard.html b/web/src/padelnomics/admin/templates/admin/pseo_dashboard.html new file mode 100644 index 0000000..212883b --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/pseo_dashboard.html @@ -0,0 +1,195 @@ +{% extends "admin/base_admin.html" %} +{% set admin_page = "pseo" %} + +{% block title %}pSEO Engine - {{ config.APP_NAME }}{% endblock %} + +{% block admin_head %} + +{% endblock %} + +{% block admin_content %} +
+
+

pSEO Engine

+

Operational dashboard for programmatic SEO

+
+
All Jobs +
+ + +
+
+

Total Articles

+

{{ total_articles }}

+

{{ total_published }} published

+
+
+

Templates

+

{{ total_templates }}

+
+
+

Stale Templates

+

+ {{ stale_count }} +

+

data newer than articles

+
+
+

Health Checks

+

—

+

see Health section below

+
+
+ + +
+
+ Templates + Click "Gaps" to load missing articles per template +
+
+ + + + + + + + + + + + + {% for r in template_rows %} + {% set t = r.template %} + {% set stats = r.stats %} + {% set fr = r.freshness %} + + + + + + + + + + + + {% endfor %} + +
TemplateData rowsArticles ENArticles DEFreshnessActions
+ {{ t.name }}
+ {{ t.slug }} +
{{ fr.row_count if fr.row_count is not none else 'β€”' }}{{ stats.by_language.get('en', {}).get('total', 0) }}{{ stats.by_language.get('de', {}).get('total', 0) }} + {% set status = fr.status | default('no_data') %} + + {% if status == 'fresh' %}🟢 Fresh + {% elif status == 'stale' %}🟡 Stale + {% elif status == 'no_articles' %}🟣 No articles + {% else %}⚪ No data + {% endif %} + + + +
+ + +
+
+
+ +
+
+
+
+ + +{% if jobs %} +
+
+ Recent Generation Jobs + View all → +
+
+ + + + + + + + + + + {% for job in jobs %} + + + + + + + {% endfor %} + +
JobStatusProgressStarted
+ #{{ job.id }} + {% if job.payload %} + β€” {{ (job.payload | fromjson).get('template_slug', '') }} + {% endif %} + + {% if job.status == 'complete' %} + Complete + {% elif job.status == 'failed' %} + Failed + {% elif job.status == 'pending' %} + Running + {% else %} + {{ job.status }} + {% endif %} + + {% if job.progress_total and job.progress_total > 0 %} +
+
+
+
+ {{ job.progress_current }}/{{ job.progress_total }} +
+ {% else %} + β€” + {% endif %} +
{{ job.created_at | default('') | truncate(16, True, '') }}
+
+
+{% endif %} + + +
+
+

Loading health checks…

+
+
+{% endblock %} diff --git a/web/src/padelnomics/admin/templates/admin/pseo_gaps.html b/web/src/padelnomics/admin/templates/admin/pseo_gaps.html new file mode 100644 index 0000000..779ff87 --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/pseo_gaps.html @@ -0,0 +1,43 @@ +{# HTMX partial β€” rendered inside the gaps panel for one template. + Loaded via GET /admin/pseo/gaps/. #} + +{% if not gaps %} +

✓ No gaps — all {{ template.name }} rows have articles.

+{% else %} +
+ {{ gaps | length }} missing row{{ 's' if gaps | length != 1 else '' }} +
+ + +
+
+
+ + + + + + {% for key in (gaps[0].keys() | list | reject('equalto', '_natural_key') | reject('equalto', '_missing_languages') | list)[:4] %} + + {% endfor %} + + + + {% for gap in gaps[:100] %} + + + + {% for key in (gap.keys() | list | reject('equalto', '_natural_key') | reject('equalto', '_missing_languages') | list)[:4] %} + + {% endfor %} + + {% endfor %} + {% if gaps | length > 100 %} + + + + {% endif %} + +
{{ template.natural_key }}Missing languages{{ key }}
{{ gap._natural_key }}{{ gap._missing_languages | join(', ') }}{{ gap[key] | truncate(30) if gap[key] is string else gap[key] }}
… and {{ gaps | length - 100 }} more rows
+
+{% endif %} diff --git a/web/src/padelnomics/admin/templates/admin/pseo_health.html b/web/src/padelnomics/admin/templates/admin/pseo_health.html new file mode 100644 index 0000000..2d2335b --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/pseo_health.html @@ -0,0 +1,99 @@ +{# HTMX partial β€” loaded by pseo_dashboard.html and /admin/pseo/health directly. + When loaded via HTMX (hx-swap="outerHTML"), renders a full card. + When loaded standalone (full page), also works since it just outputs HTML. #} + +
+
+ Health Checks + {{ health.counts.total }} issue{{ 's' if health.counts.total != 1 else '' }} +
+ + {% if health.counts.total == 0 %} +

✓ No issues found — all articles are healthy.

+ {% else %} + + + {% if health.hreflang_orphans %} +
+ + ⚠ Hreflang orphans ({{ health.counts.hreflang_orphans }}) + β€” articles missing a sibling language + +
+ + + + {% for o in health.hreflang_orphans[:50] %} + + + + + + + {% endfor %} + {% if health.hreflang_orphans | length > 50 %} + + {% endif %} + +
TemplateURL pathPresentMissing
{{ o.template_slug }}{{ o.url_path }}{{ o.present_languages | join(', ') }}{{ o.missing_languages | join(', ') }}
… and {{ health.hreflang_orphans | length - 50 }} more
+
+
+ {% endif %} + + + {% if health.missing_build_files %} +
+ + ❌ Missing build files ({{ health.counts.missing_build_files }}) + β€” published articles with no HTML on disk + +
+ + + + {% for m in health.missing_build_files[:50] %} + + + + + + + {% endfor %} + {% if health.missing_build_files | length > 50 %} + + {% endif %} + +
SlugLanguageURL pathExpected path
{{ m.slug }}{{ m.language }}{{ m.url_path }}{{ m.expected_path }}
… and {{ health.missing_build_files | length - 50 }} more
+
+
+ {% endif %} + + + {% if health.broken_scenario_refs %} +
+ + ❌ Broken scenario refs ({{ health.counts.broken_scenario_refs }}) + β€” [scenario:slug] markers referencing deleted scenarios + +
+ + + + {% for b in health.broken_scenario_refs[:50] %} + + + + + + {% endfor %} + {% if health.broken_scenario_refs | length > 50 %} + + {% endif %} + +
SlugLanguageBroken refs
{{ b.slug }}{{ b.language }}{{ b.broken_scenario_refs | join(', ') }}
… and {{ health.broken_scenario_refs | length - 50 }} more
+
+
+ {% endif %} + + {% endif %} +
diff --git a/web/src/padelnomics/admin/templates/admin/pseo_job_status.html b/web/src/padelnomics/admin/templates/admin/pseo_job_status.html new file mode 100644 index 0000000..e55bd2b --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/pseo_job_status.html @@ -0,0 +1,45 @@ +{# HTMX partial β€” replaces the entire for a job row while it's running. + Stops polling once the job is complete or failed (hx-trigger="every 2s" only applies + while this partial keeps returning a polling trigger). #} + +{% set pct = [((job.progress_current / job.progress_total) * 100) | int, 100] | min if job.progress_total else 0 %} + + + #{{ job.id }} + β€”{# payload not re-fetched in status endpoint β€” static display #} + + {% if job.status == 'complete' %} + Complete + {% elif job.status == 'failed' %} + Failed + {% else %} + Running… + {% endif %} + + + {% if job.progress_total and job.progress_total > 0 %} +
+
+
+
+ {{ job.progress_current }}/{{ job.progress_total }} +
+ {% else %}β€”{% endif %} + + {{ (job.created_at or '') | truncate(19, True, '') }} + {{ (job.completed_at or '') | truncate(19, True, '') }} + + {% if job.error %} +
+ Error +
{{ job.error[:500] }}
+
+ {% else %}β€”{% endif %} + + diff --git a/web/src/padelnomics/admin/templates/admin/pseo_jobs.html b/web/src/padelnomics/admin/templates/admin/pseo_jobs.html new file mode 100644 index 0000000..b761c5a --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/pseo_jobs.html @@ -0,0 +1,95 @@ +{% extends "admin/base_admin.html" %} +{% set admin_page = "pseo" %} + +{% block title %}pSEO Jobs - {{ config.APP_NAME }}{% endblock %} + +{% block admin_head %} + +{% endblock %} + +{% block admin_content %} +
+
+

Generation Jobs

+

Recent article generation runs

+
+ ← pSEO Engine +
+ +{% if not jobs %} +
+

No generation jobs found. Use the pSEO Engine dashboard to generate articles.

+
+{% else %} +
+
+ + + + + + + + + + + + + + {% for job in jobs %} + + + + + + + + + + {% endfor %} + +
#TemplateStatusProgressStartedCompletedError
#{{ job.id }} + {% if job.payload %} + {% set payload = job.payload | fromjson %} + {{ payload.get('template_slug', 'β€”') }} + {% else %}β€”{% endif %} + + {% if job.status == 'complete' %} + Complete + {% elif job.status == 'failed' %} + Failed + {% elif job.status == 'pending' %} + {# Poll live status for running jobs #} +
+ Running… +
+ {% else %} + {{ job.status }} + {% endif %} +
+ {% if job.progress_total and job.progress_total > 0 %} +
+
+
+
+ {{ job.progress_current }}/{{ job.progress_total }} +
+ {% else %}β€”{% endif %} +
{{ (job.created_at or '') | truncate(19, True, '') }}{{ (job.completed_at or '') | truncate(19, True, '') }} + {% if job.error %} +
+ Error +
{{ job.error[:500] }}
+
+ {% else %}β€”{% endif %} +
+
+
+{% endif %} +{% endblock %} diff --git a/web/src/padelnomics/app.py b/web/src/padelnomics/app.py index 3906001..5b29b64 100644 --- a/web/src/padelnomics/app.py +++ b/web/src/padelnomics/app.py @@ -1,13 +1,22 @@ """ Padelnomics - Application factory and entry point. """ +import json import time from pathlib import Path from quart import Quart, Response, abort, g, redirect, request, session, url_for from .analytics import close_analytics_db, open_analytics_db -from .core import close_db, config, get_csrf_token, init_db, is_flag_enabled, setup_logging, setup_request_id +from .core import ( + close_db, + config, + get_csrf_token, + init_db, + is_flag_enabled, + setup_logging, + setup_request_id, +) setup_logging() from .i18n import LANG_BLUEPRINTS, SUPPORTED_LANGS, get_country_name, get_translations @@ -97,6 +106,7 @@ def create_app() -> Quart: app.jinja_env.filters["fmt_n"] = _fmt_n app.jinja_env.filters["tformat"] = _tformat # translate with placeholders: {{ t.key | tformat(count=n) }} app.jinja_env.filters["country_name"] = get_country_name # {{ article.country | country_name(lang) }} + app.jinja_env.filters["fromjson"] = json.loads # {{ job.payload | fromjson }} # Session config app.config["SESSION_COOKIE_SECURE"] = not config.DEBUG @@ -303,6 +313,7 @@ def create_app() -> Quart: # Blueprint registration # ------------------------------------------------------------------------- + from .admin.pseo_routes import bp as pseo_bp from .admin.routes import bp as admin_bp from .auth.routes import bp as auth_bp from .billing.routes import bp as billing_bp @@ -327,6 +338,7 @@ def create_app() -> Quart: app.register_blueprint(dashboard_bp) app.register_blueprint(billing_bp) app.register_blueprint(admin_bp) + app.register_blueprint(pseo_bp) app.register_blueprint(webhooks_bp) # Content catch-all LAST β€” lives under / too diff --git a/web/src/padelnomics/content/__init__.py b/web/src/padelnomics/content/__init__.py index 469a0ad..2368a14 100644 --- 
a/web/src/padelnomics/content/__init__.py +++ b/web/src/padelnomics/content/__init__.py @@ -284,6 +284,7 @@ async def generate_articles( *, limit: int = 500, base_url: str = "https://padelnomics.io", + task_id: int | None = None, ) -> int: """ Generate articles from a git template + DuckDB data. @@ -297,8 +298,14 @@ async def generate_articles( - write HTML to disk - upsert article row in SQLite - Returns count of articles generated. + If task_id is given, writes progress_current / progress_total / error_log + to the tasks table every _PROGRESS_BATCH articles so the pSEO dashboard + can show a live progress bar. Per-article errors are logged and collected + rather than aborting the run β€” the full task still completes. + + Returns count of articles generated (excluding per-article errors). """ + from ..core import execute as db_execute from ..planner.calculator import DEFAULTS, calc, validate_state from .routes import bake_scenario_cards, is_reserved_path @@ -330,6 +337,15 @@ async def generate_articles( t_calc = t_render = t_bake = 0.0 _BATCH_SIZE = 200 + _PROGRESS_BATCH = 50 # write task progress every N articles (avoid write amplification) + + # Write progress_total before the loop so the dashboard can show 0/N immediately. + if task_id is not None: + total = len(rows) * len(config["languages"]) + await db_execute( + "UPDATE tasks SET progress_total = ? WHERE id = ?", + (total, task_id), + ) async with transaction() as db: for row in rows: @@ -515,12 +531,27 @@ async def generate_articles( elif generated % 25 == 0: logger.info("%s: %d articles written…", slug, generated) + # Write progress every _PROGRESS_BATCH articles so the pSEO + # dashboard live-updates without excessive write amplification. + if task_id is not None and generated % _PROGRESS_BATCH == 0: + await db_execute( + "UPDATE tasks SET progress_current = ? 
WHERE id = ?", + (generated, task_id), + ) + # Stagger dates published_today += 1 if published_today >= articles_per_day: published_today = 0 publish_date += timedelta(days=1) + # Write final progress so the dashboard shows 100% on completion. + if task_id is not None: + await db_execute( + "UPDATE tasks SET progress_current = ? WHERE id = ?", + (generated, task_id), + ) + logger.info( "%s: done β€” %d total | calc=%.1fs render=%.1fs bake=%.1fs", slug, generated, t_calc, t_render, t_bake, diff --git a/web/src/padelnomics/content/health.py b/web/src/padelnomics/content/health.py new file mode 100644 index 0000000..b5da7fc --- /dev/null +++ b/web/src/padelnomics/content/health.py @@ -0,0 +1,397 @@ +""" +pSEO Engine health checks and content gap queries. + +All functions are async, pure queries β€” no side effects. +Used by the pSEO Engine admin dashboard. + +Functions overview: + get_template_stats() β€” article counts per status/language for one template + get_template_freshness() β€” compare _serving_meta.json timestamp vs last article generation + get_content_gaps() β€” DuckDB rows with no matching article for a template+language + check_hreflang_orphans() β€” published articles missing a sibling language + check_missing_build_files()β€” published articles whose HTML file is absent from disk + check_broken_scenario_refs()β€” articles referencing [scenario:slug] that doesn't exist + get_all_health_issues() β€” run all checks, return counts + details +""" +import json +import logging +import os +import re +from datetime import datetime +from pathlib import Path + +from ..analytics import fetch_analytics +from ..core import fetch_all + +logger = logging.getLogger(__name__) + +# Directory where generate_articles() writes HTML + markdown source files. +BUILD_DIR = Path("data/content/_build") + +# Pattern matching [scenario:slug] and [scenario:slug:section] markers. 
+_SCENARIO_REF_RE = re.compile(r"\[scenario:([a-z0-9_-]+)(?::[a-z]+)?\]") + + +def _validate_table_name(data_table: str) -> None: + """Guard against SQL injection in table names.""" + assert re.match(r"^[a-z_][a-z0-9_.]*$", data_table), ( + f"Invalid table name: {data_table}" + ) + + +def _read_serving_meta() -> dict: + """Read _serving_meta.json written by export_serving.py. Returns {} if absent.""" + serving_path = os.environ.get("SERVING_DUCKDB_PATH", "data/analytics.duckdb") + meta_path = Path(serving_path).parent / "_serving_meta.json" + if not meta_path.exists(): + return {} + try: + return json.loads(meta_path.read_text()) + except (json.JSONDecodeError, OSError): + return {} + + +def _parse_dt(s: str | None) -> datetime | None: + """Parse an ISO datetime string to a naive UTC datetime. Returns None on failure.""" + if not s: + return None + try: + dt = datetime.fromisoformat(s) + # Strip timezone info so both aware (from meta) and naive (from SQLite) compare cleanly. + return dt.replace(tzinfo=None) + except (ValueError, TypeError): + return None + + +# ── Template statistics ─────────────────────────────────────────────────────── + + +async def get_template_stats(template_slug: str) -> dict: + """Article counts for a template: total, published, draft, scheduled, by language. + + Returns: + { + "total": N, + "published": N, + "draft": N, + "scheduled": N, + "by_language": {"en": {"total": N, "published": N, ...}, ...}, + } + """ + rows = await fetch_all( + "SELECT status, language, COUNT(*) as cnt FROM articles" + " WHERE template_slug = ? 
GROUP BY status, language", + (template_slug,), + ) + stats: dict = {"total": 0, "published": 0, "draft": 0, "scheduled": 0, "by_language": {}} + for r in rows: + cnt = r["cnt"] + status = r["status"] + lang = r["language"] + + stats["total"] += cnt + if status in stats: + stats[status] += cnt + + if lang not in stats["by_language"]: + stats["by_language"][lang] = {"total": 0, "published": 0, "draft": 0, "scheduled": 0} + stats["by_language"][lang]["total"] += cnt + if status in stats["by_language"][lang]: + stats["by_language"][lang][status] += cnt + + return stats + + +# ── Data freshness ──────────────────────────────────────────────────────────── + + +async def get_template_freshness(templates: list[dict]) -> list[dict]: + """Compare _serving_meta.json exported_at vs max(articles.updated_at) per template. + + Returns list of dicts β€” one per template: + { + "slug": str, + "name": str, + "data_table": str, + "exported_at_utc": str | None, # from _serving_meta.json + "last_generated": str | None, # max(updated_at) in articles + "row_count": int | None, # DuckDB row count from meta + "status": "fresh" | "stale" | "no_articles" | "no_data", + } + + Freshness semantics: + "fresh" β€” articles generated after last data export (up to date) + "stale" β€” data export is newer than last article generation (regen needed) + "no_articles" β€” DuckDB data exists but no articles generated yet + "no_data" β€” _serving_meta.json absent (export_serving not yet run) + """ + meta = _read_serving_meta() + exported_at_str = meta.get("exported_at_utc") + exported_at = _parse_dt(exported_at_str) + table_meta = meta.get("tables", {}) + + result = [] + for t in templates: + slug = t["slug"] + data_table = t.get("data_table", "") + # Strip schema prefix to match the key in _serving_meta.json tables dict. + # e.g. "serving.pseo_city_costs_de" β†’ "pseo_city_costs_de" + table_key = data_table.split(".")[-1] if "." 
in data_table else data_table + + rows = await fetch_all( + "SELECT MAX(COALESCE(updated_at, created_at)) as last_gen FROM articles" + " WHERE template_slug = ?", + (slug,), + ) + last_gen_str = rows[0]["last_gen"] if rows else None + last_gen = _parse_dt(last_gen_str) + + row_count = table_meta.get(table_key, {}).get("row_count") + + if not exported_at_str: + status = "no_data" + elif last_gen is None: + status = "no_articles" + elif exported_at and last_gen and exported_at > last_gen: + # New data available β€” articles haven't been regenerated against it yet. + status = "stale" + else: + status = "fresh" + + result.append({ + "slug": slug, + "name": t.get("name", slug), + "data_table": data_table, + "exported_at_utc": exported_at_str, + "last_generated": last_gen_str, + "row_count": row_count, + "status": status, + }) + + return result + + +# ── Content gaps ────────────────────────────────────────────────────────────── + + +async def get_content_gaps( + template_slug: str, + data_table: str, + natural_key: str, + languages: list[str], + limit: int = 200, +) -> list[dict]: + """Return DuckDB rows that have no matching article for at least one language. + + The article slug is constructed as: "{template_slug}-{lang}-{natural_key_value}" + This lets us efficiently detect gaps without rendering URL patterns. + + Returns list of dicts β€” each is the DuckDB row with two extra keys: + "_natural_key": str β€” the natural key value for this row + "_missing_languages": list[str] β€” languages with no article + """ + assert languages, "languages must not be empty" + _validate_table_name(data_table) + + # Fetch all article slugs for this template to determine which rows exist. 
+ slug_rows = await fetch_all( + "SELECT slug, language FROM articles WHERE template_slug = ?", + (template_slug,), + ) + + # Build lookup: (lang, natural_key_value) β†’ True + prefix_by_lang = {lang: f"{template_slug}-{lang}-" for lang in languages} + existing: set[tuple[str, str]] = set() + for r in slug_rows: + lang = r["language"] + if lang not in prefix_by_lang: + continue + prefix = prefix_by_lang[lang] + if r["slug"].startswith(prefix): + nk_val = r["slug"][len(prefix):] + existing.add((lang, nk_val)) + + duckdb_rows = await fetch_analytics( + f"SELECT * FROM {data_table} LIMIT ?", + [limit], + ) + + gaps = [] + for row in duckdb_rows: + nk_val = str(row.get(natural_key, "")) + missing = [lang for lang in languages if (lang, nk_val) not in existing] + if missing: + gaps.append({**row, "_natural_key": nk_val, "_missing_languages": missing}) + + return gaps + + +# ── Health checks ───────────────────────────────────────────────────────────── + + +async def check_hreflang_orphans(templates: list[dict]) -> list[dict]: + """Published articles missing a sibling language expected by their template. + + For example: city-cost-de generates EN + DE. If the EN article exists but + DE is absent, that article is an hreflang orphan. + + Orphan detection is based on the slug pattern "{template_slug}-{lang}-{natural_key}". + Articles are grouped by natural key; if any expected language is missing, the group + is an orphan. + + Returns list of dicts: + { + "template_slug": str, + "url_path": str, # url_path of one present article for context + "present_languages": list[str], + "missing_languages": list[str], + } + """ + orphans = [] + for t in templates: + expected = set(t.get("languages", ["en"])) + if len(expected) < 2: + continue # Single-language template β€” no orphans possible. + + rows = await fetch_all( + "SELECT slug, language, url_path FROM articles" + " WHERE template_slug = ? 
AND status = 'published'", + (t["slug"],), + ) + + # Group by natural key extracted from slug pattern: + # "{template_slug}-{lang}-{natural_key}" β†’ strip template prefix, then lang prefix. + slug_prefix = t["slug"] + "-" + by_nk: dict[str, dict] = {} # nk β†’ {"langs": set, "url_path": str} + for r in rows: + slug = r["slug"] + lang = r["language"] + if not slug.startswith(slug_prefix): + continue + rest = slug[len(slug_prefix):] # "{lang}-{natural_key}" + lang_prefix = lang + "-" + if not rest.startswith(lang_prefix): + continue + nk = rest[len(lang_prefix):] + if nk not in by_nk: + by_nk[nk] = {"langs": set(), "url_path": r["url_path"]} + by_nk[nk]["langs"].add(lang) + + for nk, info in by_nk.items(): + present = info["langs"] + missing = sorted(expected - present) + if missing: + orphans.append({ + "template_slug": t["slug"], + "url_path": info["url_path"], + "present_languages": sorted(present), + "missing_languages": missing, + }) + + return orphans + + +async def check_missing_build_files(build_dir: Path | None = None) -> list[dict]: + """Published articles whose HTML file is absent from disk. + + Expected path: BUILD_DIR/{language}/{slug}.html + + Returns list of dicts: + {"id", "slug", "language", "url_path", "template_slug", "expected_path"} + """ + bd = build_dir or BUILD_DIR + rows = await fetch_all( + "SELECT id, slug, language, url_path, template_slug FROM articles" + " WHERE status = 'published'", + ) + missing = [] + for r in rows: + path = bd / r["language"] / f"{r['slug']}.html" + if not path.exists(): + missing.append({ + "id": r["id"], + "slug": r["slug"], + "language": r["language"], + "url_path": r["url_path"], + "template_slug": r["template_slug"], + "expected_path": str(path), + }) + return missing + + +async def check_broken_scenario_refs(build_dir: Path | None = None) -> list[dict]: + """pSEO articles referencing [scenario:slug] markers that don't exist. + + Reads markdown source from BUILD_DIR/{language}/md/{slug}.md. 
+ Only checks published articles with a template_slug (pSEO-generated). + + Returns list of dicts: + {"id", "slug", "language", "url_path", "broken_scenario_refs": [str, ...]} + """ + bd = build_dir or BUILD_DIR + + scenario_rows = await fetch_all("SELECT slug FROM published_scenarios") + valid_slugs = {r["slug"] for r in scenario_rows} + + articles = await fetch_all( + "SELECT id, slug, language, url_path FROM articles" + " WHERE status = 'published' AND template_slug IS NOT NULL", + ) + + broken = [] + for a in articles: + md_path = bd / a["language"] / "md" / f"{a['slug']}.md" + if not md_path.exists(): + continue + markdown = md_path.read_text() + refs = {m.group(1) for m in _SCENARIO_REF_RE.finditer(markdown)} + missing_refs = sorted(refs - valid_slugs) + if missing_refs: + broken.append({ + "id": a["id"], + "slug": a["slug"], + "language": a["language"], + "url_path": a["url_path"], + "broken_scenario_refs": missing_refs, + }) + + return broken + + +# ── Aggregate check ─────────────────────────────────────────────────────────── + + +async def get_all_health_issues( + templates: list[dict], + build_dir: Path | None = None, +) -> dict: + """Run all health checks, return issue counts and full detail lists. 
+ + Returns: + { + "hreflang_orphans": [...], + "missing_build_files": [...], + "broken_scenario_refs": [...], + "counts": { + "hreflang_orphans": N, + "missing_build_files": N, + "broken_scenario_refs": N, + "total": N, + }, + } + """ + orphans = await check_hreflang_orphans(templates) + missing_files = await check_missing_build_files(build_dir) + broken_refs = await check_broken_scenario_refs(build_dir) + + return { + "hreflang_orphans": orphans, + "missing_build_files": missing_files, + "broken_scenario_refs": broken_refs, + "counts": { + "hreflang_orphans": len(orphans), + "missing_build_files": len(missing_files), + "broken_scenario_refs": len(broken_refs), + "total": len(orphans) + len(missing_files) + len(broken_refs), + }, + } diff --git a/web/src/padelnomics/migrations/versions/0021_tasks_progress_tracking.py b/web/src/padelnomics/migrations/versions/0021_tasks_progress_tracking.py new file mode 100644 index 0000000..e9da322 --- /dev/null +++ b/web/src/padelnomics/migrations/versions/0021_tasks_progress_tracking.py @@ -0,0 +1,18 @@ +"""Add progress tracking columns to the tasks table. + +Enables the pSEO Engine dashboard to show live progress during article +generation jobs: a progress bar (current/total) and an error log for +per-article failures without aborting the whole run. +""" + + +def up(conn) -> None: + conn.execute( + "ALTER TABLE tasks ADD COLUMN progress_current INTEGER NOT NULL DEFAULT 0" + ) + conn.execute( + "ALTER TABLE tasks ADD COLUMN progress_total INTEGER NOT NULL DEFAULT 0" + ) + conn.execute( + "ALTER TABLE tasks ADD COLUMN error_log TEXT NOT NULL DEFAULT '[]'" + ) diff --git a/web/src/padelnomics/worker.py b/web/src/padelnomics/worker.py index d50d6ae..f30fd17 100644 --- a/web/src/padelnomics/worker.py +++ b/web/src/padelnomics/worker.py @@ -4,11 +4,10 @@ Background task worker - SQLite-based queue (no Redis needed). 
import asyncio import json +import logging import traceback from datetime import datetime, timedelta -import logging - from .core import ( EMAIL_ADDRESSES, config, @@ -754,8 +753,11 @@ async def handle_generate_articles(payload: dict) -> None: start_date = date_cls.fromisoformat(payload["start_date"]) articles_per_day = payload.get("articles_per_day", 3) limit = payload.get("limit", 500) + task_id = payload.get("_task_id") - count = await generate_articles(slug, start_date, articles_per_day, limit=limit) + count = await generate_articles( + slug, start_date, articles_per_day, limit=limit, task_id=task_id + ) logger.info("Generated %s articles for template '%s'", count, slug) @@ -777,6 +779,9 @@ async def process_task(task: dict) -> None: try: payload = json.loads(task["payload"]) if task["payload"] else {} + # Inject task_id so progress-aware handlers (e.g. generate_articles) can + # write progress_current to the tasks table without a separate lookup. + payload["_task_id"] = task_id await handler(payload) await mark_complete(task_id) logger.info("Completed: %s (id=%s)", task_name, task_id) diff --git a/web/tests/test_pseo.py b/web/tests/test_pseo.py new file mode 100644 index 0000000..45627eb --- /dev/null +++ b/web/tests/test_pseo.py @@ -0,0 +1,765 @@ +""" +Tests for the pSEO Engine: health checks, content gaps, freshness, and admin routes. 
+ +Covers: + - content/health.py: get_template_stats, get_template_freshness, get_content_gaps, + check_hreflang_orphans, check_missing_build_files, check_broken_scenario_refs, + get_all_health_issues + - admin/pseo_routes.py: all 6 routes (dashboard, health, gaps, generate, jobs, job status) +""" +import json +from unittest.mock import patch + +import pytest +from padelnomics.content.health import ( + check_broken_scenario_refs, + check_hreflang_orphans, + check_missing_build_files, + get_all_health_issues, + get_content_gaps, + get_template_freshness, + get_template_stats, +) +from padelnomics.core import execute, utcnow_iso + +from padelnomics import core + +# ── Fixtures ────────────────────────────────────────────────────────────────── + + +@pytest.fixture +async def admin_client(app, db): + """Authenticated admin test client.""" + now = utcnow_iso() + async with db.execute( + "INSERT INTO users (email, name, created_at) VALUES (?, ?, ?)", + ("pseo-admin@test.com", "pSEO Admin", now), + ) as cursor: + admin_id = cursor.lastrowid + await db.execute( + "INSERT INTO user_roles (user_id, role) VALUES (?, 'admin')", (admin_id,) + ) + await db.commit() + + async with app.test_client() as c: + async with c.session_transaction() as sess: + sess["user_id"] = admin_id + yield c + + +# ── DB helpers ──────────────────────────────────────────────────────────────── + + +async def _insert_article( + slug, + url_path, + status="published", + language="en", + template_slug="city-cost-de", + created_at=None, +): + """Insert a minimal article row and return its id.""" + ts = created_at or utcnow_iso() + return await execute( + """INSERT INTO articles + (url_path, slug, title, meta_description, country, region, + status, published_at, language, template_slug, created_at, updated_at) + VALUES (?, ?, ?, ?, 'DE', 'Europe', ?, ?, ?, ?, ?, ?)""", + ( + url_path, + slug, + f"Title {slug}", + f"Desc {slug}", + status, + ts if status == "published" else None, + language, + 
template_slug, + ts, + ts, + ), + ) + + +async def _insert_scenario(slug="test-scenario"): + """Insert a minimal published_scenario row.""" + from padelnomics.planner.calculator import calc, validate_state + + state = validate_state({"dblCourts": 2}) + d = calc(state) + return await execute( + """INSERT INTO published_scenarios + (slug, title, subtitle, location, country, venue_type, ownership, + court_config, state_json, calc_json) + VALUES (?, ?, '', 'TestCity', 'TC', 'indoor', 'rent', '2 double', ?, ?)""", + (slug, f"Scenario {slug}", json.dumps(state), json.dumps(d)), + ) + + +async def _insert_task(status="pending", progress_current=0, progress_total=0): + """Insert a generate_articles task row and return its id.""" + now = utcnow_iso() + async with core._db.execute( + """INSERT INTO tasks + (task_name, payload, status, run_at, progress_current, progress_total, created_at) + VALUES ('generate_articles', '{}', ?, ?, ?, ?, ?)""", + (status, now, progress_current, progress_total, now), + ) as cursor: + task_id = cursor.lastrowid + await core._db.commit() + return task_id + + +# ── DuckDB mock rows ────────────────────────────────────────────────────────── + +_DUCKDB_ROWS = [ + {"city_slug": "berlin", "city": "Berlin", "country": "DE"}, + {"city_slug": "munich", "city": "Munich", "country": "DE"}, + {"city_slug": "hamburg", "city": "Hamburg", "country": "DE"}, +] + + +async def _mock_fetch_duckdb(query, params=None): + return _DUCKDB_ROWS + + +# ════════════════════════════════════════════════════════════════════════════ +# get_template_stats() +# ════════════════════════════════════════════════════════════════════════════ + + +class TestGetTemplateStats: + async def test_empty_db_returns_zeros(self, db): + stats = await get_template_stats("city-cost-de") + assert stats["total"] == 0 + assert stats["published"] == 0 + assert stats["draft"] == 0 + assert stats["by_language"] == {} + + async def test_counts_per_status(self, db): + await 
_insert_article("city-cost-de-en-berlin", "/en/markets/germany/berlin", + status="published", language="en") + await _insert_article("city-cost-de-en-munich", "/en/markets/germany/munich", + status="draft", language="en") + await _insert_article("city-cost-de-de-berlin", "/de/markets/germany/berlin", + status="published", language="de") + + stats = await get_template_stats("city-cost-de") + + assert stats["total"] == 3 + assert stats["published"] == 2 + assert stats["draft"] == 1 + assert stats["by_language"]["en"]["total"] == 2 + assert stats["by_language"]["de"]["total"] == 1 + + async def test_ignores_other_templates(self, db): + await _insert_article("other-en-berlin", "/en/other/berlin", template_slug="other") + stats = await get_template_stats("city-cost-de") + assert stats["total"] == 0 + + +# ════════════════════════════════════════════════════════════════════════════ +# get_template_freshness() +# ════════════════════════════════════════════════════════════════════════════ + +_SAMPLE_TEMPLATES = [ + { + "slug": "city-cost-de", + "name": "City Cost DE", + "data_table": "serving.pseo_city_costs_de", + "languages": ["en", "de"], + } +] + + +class TestGetTemplateFreshness: + async def test_no_meta_file_returns_no_data(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + monkeypatch.setattr(health_mod, "_read_serving_meta", lambda: {}) + + result = await get_template_freshness(_SAMPLE_TEMPLATES) + assert len(result) == 1 + assert result[0]["status"] == "no_data" + + async def test_meta_present_no_articles_returns_no_articles(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + monkeypatch.setattr(health_mod, "_read_serving_meta", lambda: { + "exported_at_utc": "2026-01-15T10:00:00+00:00", + "tables": {"pseo_city_costs_de": {"row_count": 100}}, + }) + + result = await get_template_freshness(_SAMPLE_TEMPLATES) + assert result[0]["status"] == "no_articles" + assert result[0]["row_count"] == 100 + + async def 
test_article_older_than_export_returns_stale(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + # Article created Jan 10, data exported Jan 15 β†’ stale + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + status="published", language="en", created_at="2026-01-10T08:00:00", + ) + monkeypatch.setattr(health_mod, "_read_serving_meta", lambda: { + "exported_at_utc": "2026-01-15T10:00:00+00:00", + "tables": {"pseo_city_costs_de": {"row_count": 100}}, + }) + + result = await get_template_freshness(_SAMPLE_TEMPLATES) + assert result[0]["status"] == "stale" + + async def test_article_newer_than_export_returns_fresh(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + # Data exported Jan 10, article updated Jan 15 β†’ fresh + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + status="published", language="en", created_at="2026-01-15T12:00:00", + ) + monkeypatch.setattr(health_mod, "_read_serving_meta", lambda: { + "exported_at_utc": "2026-01-10T10:00:00+00:00", + "tables": {}, + }) + + result = await get_template_freshness(_SAMPLE_TEMPLATES) + assert result[0]["status"] == "fresh" + + +# ════════════════════════════════════════════════════════════════════════════ +# get_content_gaps() +# ════════════════════════════════════════════════════════════════════════════ + + +class TestGetContentGaps: + async def test_no_articles_returns_all_duckdb_rows(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + monkeypatch.setattr(health_mod, "fetch_analytics", _mock_fetch_duckdb) + + gaps = await get_content_gaps( + template_slug="city-cost-de", + data_table="serving.pseo_city_costs_de", + natural_key="city_slug", + languages=["en"], + ) + assert len(gaps) == len(_DUCKDB_ROWS) + assert all(g["_missing_languages"] == ["en"] for g in gaps) + + async def test_existing_article_excluded_from_gaps(self, db, monkeypatch): + import 
padelnomics.content.health as health_mod + + monkeypatch.setattr(health_mod, "fetch_analytics", _mock_fetch_duckdb) + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", language="en", + ) + + gaps = await get_content_gaps( + template_slug="city-cost-de", + data_table="serving.pseo_city_costs_de", + natural_key="city_slug", + languages=["en"], + ) + gap_keys = {g["_natural_key"] for g in gaps} + assert "berlin" not in gap_keys + assert "munich" in gap_keys + assert "hamburg" in gap_keys + + async def test_partial_language_gap_detected(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + monkeypatch.setattr(health_mod, "fetch_analytics", _mock_fetch_duckdb) + # EN exists for berlin, DE is missing β†’ berlin has a gap for "de" + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", language="en", + ) + + gaps = await get_content_gaps( + template_slug="city-cost-de", + data_table="serving.pseo_city_costs_de", + natural_key="city_slug", + languages=["en", "de"], + ) + berlin = next((g for g in gaps if g["_natural_key"] == "berlin"), None) + assert berlin is not None + assert berlin["_missing_languages"] == ["de"] + + async def test_no_gaps_when_all_articles_exist(self, db, monkeypatch): + import padelnomics.content.health as health_mod + + monkeypatch.setattr(health_mod, "fetch_analytics", _mock_fetch_duckdb) + for key in ("berlin", "munich", "hamburg"): + await _insert_article( + f"city-cost-de-en-{key}", f"/en/markets/germany/{key}", language="en", + ) + + gaps = await get_content_gaps( + template_slug="city-cost-de", + data_table="serving.pseo_city_costs_de", + natural_key="city_slug", + languages=["en"], + ) + assert gaps == [] + + +# ════════════════════════════════════════════════════════════════════════════ +# check_hreflang_orphans() +# ════════════════════════════════════════════════════════════════════════════ + + +class TestCheckHreflangOrphans: + async def 
test_single_lang_template_no_orphans(self, db): + templates = [{"slug": "city-cost-de", "name": "City Cost DE", "languages": ["en"]}] + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + orphans = await check_hreflang_orphans(templates) + assert orphans == [] + + async def test_bilingual_both_present_no_orphans(self, db): + templates = [{"slug": "city-cost-de", "name": "City Cost DE", "languages": ["en", "de"]}] + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + await _insert_article( + "city-cost-de-de-berlin", "/de/markets/germany/berlin", + language="de", status="published", + ) + orphans = await check_hreflang_orphans(templates) + assert orphans == [] + + async def test_missing_de_sibling_detected(self, db): + templates = [{"slug": "city-cost-de", "name": "City Cost DE", "languages": ["en", "de"]}] + # Only EN for berlin β€” DE is missing + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + orphans = await check_hreflang_orphans(templates) + assert len(orphans) == 1 + assert orphans[0]["template_slug"] == "city-cost-de" + assert "de" in orphans[0]["missing_languages"] + assert "en" in orphans[0]["present_languages"] + + async def test_draft_articles_not_counted(self, db): + templates = [{"slug": "city-cost-de", "name": "City Cost DE", "languages": ["en", "de"]}] + # Draft articles should be ignored + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="draft", + ) + orphans = await check_hreflang_orphans(templates) + assert orphans == [] + + +# ════════════════════════════════════════════════════════════════════════════ +# check_missing_build_files() +# ════════════════════════════════════════════════════════════════════════════ + + +class TestCheckMissingBuildFiles: + async def 
test_no_articles_returns_empty(self, db, tmp_path): + result = await check_missing_build_files(build_dir=tmp_path) + assert result == [] + + async def test_build_file_present_not_reported(self, db, tmp_path): + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + build_file = tmp_path / "en" / "city-cost-de-en-berlin.html" + build_file.parent.mkdir(parents=True) + build_file.write_text("

Berlin

") + + result = await check_missing_build_files(build_dir=tmp_path) + assert result == [] + + async def test_missing_build_file_reported(self, db, tmp_path): + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + # No build file created + result = await check_missing_build_files(build_dir=tmp_path) + assert len(result) == 1 + assert result[0]["slug"] == "city-cost-de-en-berlin" + assert result[0]["language"] == "en" + + async def test_draft_articles_ignored(self, db, tmp_path): + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="draft", + ) + result = await check_missing_build_files(build_dir=tmp_path) + assert result == [] + + +# ════════════════════════════════════════════════════════════════════════════ +# check_broken_scenario_refs() +# ════════════════════════════════════════════════════════════════════════════ + + +class TestCheckBrokenScenarioRefs: + async def test_no_markdown_files_returns_empty(self, db, tmp_path): + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + result = await check_broken_scenario_refs(build_dir=tmp_path) + assert result == [] + + async def test_valid_scenario_ref_not_reported(self, db, tmp_path): + await _insert_scenario("berlin-scenario") + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + md_dir = tmp_path / "en" / "md" + md_dir.mkdir(parents=True) + (md_dir / "city-cost-de-en-berlin.md").write_text( + "# Berlin\n\n[scenario:berlin-scenario:capex]\n" + ) + result = await check_broken_scenario_refs(build_dir=tmp_path) + assert result == [] + + async def test_missing_scenario_ref_reported(self, db, tmp_path): + # No scenario in DB, but markdown references one + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", 
status="published", + ) + md_dir = tmp_path / "en" / "md" + md_dir.mkdir(parents=True) + (md_dir / "city-cost-de-en-berlin.md").write_text( + "# Berlin\n\n[scenario:ghost-scenario:capex]\n" + ) + result = await check_broken_scenario_refs(build_dir=tmp_path) + assert len(result) == 1 + assert "ghost-scenario" in result[0]["broken_scenario_refs"] + + async def test_no_template_slug_articles_ignored(self, db, tmp_path): + # Legacy article (no template_slug) should not be checked + await execute( + """INSERT INTO articles + (url_path, slug, title, status, language, created_at) + VALUES ('/en/legacy', 'legacy', 'Legacy', 'published', 'en', ?)""", + (utcnow_iso(),), + ) + md_dir = tmp_path / "en" / "md" + md_dir.mkdir(parents=True) + (md_dir / "legacy.md").write_text("# Legacy\n\n[scenario:ghost]\n") + + result = await check_broken_scenario_refs(build_dir=tmp_path) + assert result == [] + + +# ════════════════════════════════════════════════════════════════════════════ +# get_all_health_issues() +# ════════════════════════════════════════════════════════════════════════════ + + +class TestGetAllHealthIssues: + async def test_clean_state_returns_zero_counts(self, db, tmp_path): + templates = [{"slug": "city-cost-de", "name": "City Cost DE", "languages": ["en"]}] + result = await get_all_health_issues(templates, build_dir=tmp_path) + + assert result["counts"]["total"] == 0 + assert result["counts"]["hreflang_orphans"] == 0 + assert result["counts"]["missing_build_files"] == 0 + assert result["counts"]["broken_scenario_refs"] == 0 + assert "hreflang_orphans" in result + assert "missing_build_files" in result + assert "broken_scenario_refs" in result + + async def test_orphan_counted_in_total(self, db, tmp_path): + templates = [{"slug": "city-cost-de", "name": "City Cost DE", "languages": ["en", "de"]}] + # EN article with no DE sibling β†’ orphan + await _insert_article( + "city-cost-de-en-berlin", "/en/markets/germany/berlin", + language="en", status="published", + ) + 
result = await get_all_health_issues(templates, build_dir=tmp_path) + assert result["counts"]["hreflang_orphans"] == 1 + assert result["counts"]["total"] >= 1 + + +# ════════════════════════════════════════════════════════════════════════════ +# pSEO Route tests +# ════════════════════════════════════════════════════════════════════════════ + +# Mock objects for route tests β€” avoids needing a live DuckDB +_MOCK_TEMPLATE_CFG = { + "slug": "city-cost-de", + "name": "City Cost DE", + "data_table": "serving.pseo_city_costs_de", + "natural_key": "city_slug", + "languages": ["en", "de"], + "url_pattern": "/markets/{country}/{city_slug}", +} +_MOCK_TEMPLATES = [_MOCK_TEMPLATE_CFG] + + +def _discover_mock(): + return _MOCK_TEMPLATES + + +def _load_template_mock(slug): + if slug == "city-cost-de": + return _MOCK_TEMPLATE_CFG + raise FileNotFoundError(f"Template {slug!r} not found") + + +async def _freshness_mock(templates): + return [ + { + "slug": t["slug"], + "name": t["name"], + "data_table": t["data_table"], + "status": "fresh", + "exported_at_utc": None, + "last_generated": None, + "row_count": 100, + } + for t in templates + ] + + +async def _stats_mock(slug): + return { + "total": 10, "published": 8, "draft": 2, "scheduled": 0, + "by_language": { + "en": {"total": 5, "published": 4, "draft": 1, "scheduled": 0}, + "de": {"total": 5, "published": 4, "draft": 1, "scheduled": 0}, + }, + } + + +async def _health_mock(templates, build_dir=None): + return { + "hreflang_orphans": [], + "missing_build_files": [], + "broken_scenario_refs": [], + "counts": {"hreflang_orphans": 0, "missing_build_files": 0, + "broken_scenario_refs": 0, "total": 0}, + } + + +async def _gaps_empty_mock(template_slug, data_table, natural_key, languages, limit=200): + return [] + + +async def _gaps_two_mock(template_slug, data_table, natural_key, languages, limit=200): + return [ + {"city_slug": "munich", "_natural_key": "munich", "_missing_languages": ["en"]}, + {"city_slug": "hamburg", 
"_natural_key": "hamburg", "_missing_languages": ["de"]}, + ] + + +class TestPseoRoutes: + """Tests for all pSEO Engine admin blueprint routes.""" + + # -- Access control -------------------------------------------------------- + + async def test_dashboard_requires_admin(self, client, db): + resp = await client.get("/admin/pseo/") + assert resp.status_code in (302, 403) + + async def test_health_requires_admin(self, client, db): + resp = await client.get("/admin/pseo/health") + assert resp.status_code in (302, 403) + + async def test_gaps_requires_admin(self, client, db): + resp = await client.get("/admin/pseo/gaps/city-cost-de") + assert resp.status_code in (302, 403) + + async def test_jobs_requires_admin(self, client, db): + resp = await client.get("/admin/pseo/jobs") + assert resp.status_code in (302, 403) + + # -- Dashboard ------------------------------------------------------------- + + async def test_dashboard_renders(self, admin_client, db): + with ( + patch("padelnomics.admin.pseo_routes.discover_templates", _discover_mock), + patch("padelnomics.admin.pseo_routes.get_template_freshness", _freshness_mock), + patch("padelnomics.admin.pseo_routes.get_template_stats", _stats_mock), + ): + resp = await admin_client.get("/admin/pseo/") + + assert resp.status_code == 200 + text = await resp.get_data(as_text=True) + assert "pSEO Engine" in text + + async def test_dashboard_shows_template_name(self, admin_client, db): + with ( + patch("padelnomics.admin.pseo_routes.discover_templates", _discover_mock), + patch("padelnomics.admin.pseo_routes.get_template_freshness", _freshness_mock), + patch("padelnomics.admin.pseo_routes.get_template_stats", _stats_mock), + ): + resp = await admin_client.get("/admin/pseo/") + + text = await resp.get_data(as_text=True) + assert "City Cost DE" in text + + # -- Health HTMX partial --------------------------------------------------- + + async def test_health_partial_renders(self, admin_client, db): + with ( + 
patch("padelnomics.admin.pseo_routes.discover_templates", _discover_mock), + patch("padelnomics.admin.pseo_routes.get_all_health_issues", _health_mock), + ): + resp = await admin_client.get("/admin/pseo/health") + + assert resp.status_code == 200 + + # -- Content gaps HTMX partial --------------------------------------------- + + async def test_gaps_unknown_template_returns_404(self, admin_client, db): + def _raise(slug): + raise FileNotFoundError("not found") + + with patch("padelnomics.admin.pseo_routes.load_template", _raise): + resp = await admin_client.get("/admin/pseo/gaps/no-such-template") + + assert resp.status_code == 404 + + async def test_gaps_partial_renders(self, admin_client, db): + with ( + patch("padelnomics.admin.pseo_routes.load_template", _load_template_mock), + patch("padelnomics.admin.pseo_routes.get_content_gaps", _gaps_two_mock), + ): + resp = await admin_client.get("/admin/pseo/gaps/city-cost-de") + + assert resp.status_code == 200 + text = await resp.get_data(as_text=True) + # Should show gap count or row content + assert "munich" in text or "missing" in text.lower() + + async def test_gaps_empty_shows_no_gaps_message(self, admin_client, db): + with ( + patch("padelnomics.admin.pseo_routes.load_template", _load_template_mock), + patch("padelnomics.admin.pseo_routes.get_content_gaps", _gaps_empty_mock), + ): + resp = await admin_client.get("/admin/pseo/gaps/city-cost-de") + + assert resp.status_code == 200 + text = await resp.get_data(as_text=True) + assert "No gaps" in text or "all" in text.lower() + + # -- Generate gaps POST ---------------------------------------------------- + + async def test_generate_gaps_redirects(self, admin_client, db): + async with admin_client.session_transaction() as sess: + sess["csrf_token"] = "test" + + with ( + patch("padelnomics.admin.pseo_routes.load_template", _load_template_mock), + patch("padelnomics.admin.pseo_routes.get_content_gaps", _gaps_two_mock), + ): + resp = await admin_client.post( + 
"/admin/pseo/gaps/city-cost-de/generate", + form={"csrf_token": "test"}, + ) + + assert resp.status_code == 302 + + async def test_generate_gaps_enqueues_task(self, admin_client, db): + async with admin_client.session_transaction() as sess: + sess["csrf_token"] = "test" + + with ( + patch("padelnomics.admin.pseo_routes.load_template", _load_template_mock), + patch("padelnomics.admin.pseo_routes.get_content_gaps", _gaps_two_mock), + ): + await admin_client.post( + "/admin/pseo/gaps/city-cost-de/generate", + form={"csrf_token": "test"}, + ) + + tasks = await core.fetch_all( + "SELECT task_name FROM tasks WHERE task_name = 'generate_articles'" + ) + assert len(tasks) == 1 + + async def test_generate_gaps_no_gaps_redirects_without_task(self, admin_client, db): + async with admin_client.session_transaction() as sess: + sess["csrf_token"] = "test" + + with ( + patch("padelnomics.admin.pseo_routes.load_template", _load_template_mock), + patch("padelnomics.admin.pseo_routes.get_content_gaps", _gaps_empty_mock), + ): + resp = await admin_client.post( + "/admin/pseo/gaps/city-cost-de/generate", + form={"csrf_token": "test"}, + ) + + assert resp.status_code == 302 + tasks = await core.fetch_all( + "SELECT task_name FROM tasks WHERE task_name = 'generate_articles'" + ) + assert len(tasks) == 0 + + # -- Jobs list ------------------------------------------------------------- + + async def test_jobs_renders_empty(self, admin_client, db): + resp = await admin_client.get("/admin/pseo/jobs") + assert resp.status_code == 200 + text = await resp.get_data(as_text=True) + assert "Generation Jobs" in text + + async def test_jobs_shows_task_row(self, admin_client, db): + await _insert_task(status="complete", progress_current=20, progress_total=20) + + resp = await admin_client.get("/admin/pseo/jobs") + assert resp.status_code == 200 + text = await resp.get_data(as_text=True) + assert "Complete" in text + + # -- Job status HTMX polled ------------------------------------------------ + + 
async def test_job_status_not_found_returns_404(self, admin_client, db):
    """The status endpoint answers 404 for a job id that was never created."""
    # This test inserts no task row; 9999 is presumed absent from the
    # per-test database provided by the `db` fixture.
    resp = await admin_client.get("/admin/pseo/jobs/9999/status")
    assert resp.status_code == 404


async def test_job_status_renders_pending(self, admin_client, db):
    """A pending job with partial progress renders with the "Running" label."""
    job_id = await _insert_task(
        status="pending", progress_current=5, progress_total=20
    )

    resp = await admin_client.get(f"/admin/pseo/jobs/{job_id}/status")
    assert resp.status_code == 200
    text = await resp.get_data(as_text=True)
    assert "Running" in text


async def test_job_status_renders_complete(self, admin_client, db):
    """A completed job renders with the "Complete" label."""
    job_id = await _insert_task(
        status="complete", progress_current=20, progress_total=20
    )

    resp = await admin_client.get(f"/admin/pseo/jobs/{job_id}/status")
    assert resp.status_code == 200
    text = await resp.get_data(as_text=True)
    assert "Complete" in text


async def test_job_status_complete_no_htmx_poll_trigger(self, admin_client, db):
    """A completed job should not include hx-trigger="every 2s" (stops HTMX polling)."""
    job_id = await _insert_task(
        status="complete", progress_current=20, progress_total=20
    )

    resp = await admin_client.get(f"/admin/pseo/jobs/{job_id}/status")
    # Guard the negative assertion below: without this, an error page (which
    # also lacks the poll trigger) would make the test pass vacuously.
    assert resp.status_code == 200
    text = await resp.get_data(as_text=True)
    assert "every 2s" not in text


async def test_job_status_pending_includes_htmx_poll_trigger(self, admin_client, db):
    """A pending job should include hx-trigger="every 2s" (keeps HTMX polling)."""
    job_id = await _insert_task(
        status="pending", progress_current=0, progress_total=20
    )

    resp = await admin_client.get(f"/admin/pseo/jobs/{job_id}/status")
    # Fail on the status code first so a broken route is reported as such,
    # rather than as a missing poll trigger.
    assert resp.status_code == 200
    text = await resp.get_data(as_text=True)
    assert "every 2s" in text