From ccf03db9a3dde00664c3fe46f4b8762cb0d0f4a6 Mon Sep 17 00:00:00 2001 From: Deeman Date: Mon, 23 Feb 2026 15:00:36 +0100 Subject: [PATCH] =?UTF-8?q?feat:=20SEO/GEO=20admin=20hub=20=E2=80=94=20mig?= =?UTF-8?q?ration,=20sync=20module,=20routes,=20templates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Migration 0019: seo_search_metrics, seo_analytics_metrics, seo_sync_log tables - seo/ module: GSC, Bing, Umami sync + query functions (search perf, funnel, scorecard) - Admin routes: /admin/seo hub with HTMX tabs + manual sync trigger - Admin templates: hub page, search/funnel/scorecard partials, sidebar nav entry - Worker: sync_gsc, sync_bing, sync_umami, cleanup_seo_metrics tasks + daily scheduler - Config: GSC_SERVICE_ACCOUNT_PATH, GSC_SITE_URL, BING_WEBMASTER_API_KEY, BING_SITE_URL - Deps: httpx, google-api-python-client, google-auth Co-Authored-By: Claude Opus 4.6 --- web/pyproject.toml | 3 + web/src/padelnomics/admin/routes.py | 131 ++++++ .../admin/templates/admin/base_admin.html | 6 + .../templates/admin/partials/seo_funnel.html | 96 +++++ .../admin/partials/seo_scorecard.html | 104 +++++ .../templates/admin/partials/seo_search.html | 132 ++++++ .../admin/templates/admin/seo.html | 149 +++++++ web/src/padelnomics/core.py | 8 +- .../versions/0019_add_seo_metrics.py | 84 ++++ web/src/padelnomics/seo/__init__.py | 36 ++ web/src/padelnomics/seo/_bing.py | 143 +++++++ web/src/padelnomics/seo/_gsc.py | 144 +++++++ web/src/padelnomics/seo/_queries.py | 379 ++++++++++++++++++ web/src/padelnomics/seo/_umami.py | 117 ++++++ web/src/padelnomics/worker.py | 53 ++- 15 files changed, 1583 insertions(+), 2 deletions(-) create mode 100644 web/src/padelnomics/admin/templates/admin/partials/seo_funnel.html create mode 100644 web/src/padelnomics/admin/templates/admin/partials/seo_scorecard.html create mode 100644 web/src/padelnomics/admin/templates/admin/partials/seo_search.html create mode 100644 web/src/padelnomics/admin/templates/admin/seo.html create mode 100644 web/src/padelnomics/migrations/versions/0019_add_seo_metrics.py create mode 100644 web/src/padelnomics/seo/__init__.py create mode 100644 web/src/padelnomics/seo/_bing.py create mode 100644 web/src/padelnomics/seo/_gsc.py create mode 100644 web/src/padelnomics/seo/_queries.py create mode 100644 web/src/padelnomics/seo/_umami.py diff --git a/web/pyproject.toml b/web/pyproject.toml index 59b0690..d03b01e 100644 --- a/web/pyproject.toml +++ b/web/pyproject.toml @@ -17,6 +17,9 @@ dependencies = [ "weasyprint>=68.1", "duckdb>=1.0.0", "pyyaml>=6.0", + "httpx>=0.27.0", + "google-api-python-client>=2.100.0", + "google-auth>=2.23.0", ] [build-system] diff --git a/web/src/padelnomics/admin/routes.py b/web/src/padelnomics/admin/routes.py index 8bcf7c8..d4a800c 100644 --- a/web/src/padelnomics/admin/routes.py +++ b/web/src/padelnomics/admin/routes.py @@ -1401,3 +1401,134 @@ async def _rebuild_article(article_id: int): body_html = await bake_scenario_cards(body_html, lang=lang) BUILD_DIR.mkdir(parents=True, exist_ok=True) (BUILD_DIR / f"{article['slug']}.html").write_text(body_html) + + +# ============================================================================= +# SEO Hub +# ============================================================================= + +@bp.route("/seo") +@role_required("admin") +async def seo(): + """SEO metrics hub — overview + tabs for search, funnel, scorecard.""" + from ..seo import get_search_performance, get_sync_status + + date_range_days = int(request.args.get("days", "28") or 
"28") + date_range_days = max(1, min(date_range_days, 730)) + + overview = await get_search_performance(date_range_days=date_range_days) + sync_status = await get_sync_status() + + return await render_template( + "admin/seo.html", + overview=overview, + sync_status=sync_status, + date_range_days=date_range_days, + ) + + +@bp.route("/seo/search") +@role_required("admin") +async def seo_search(): + """HTMX partial: search performance tab.""" + from ..seo import ( + get_country_breakdown, + get_device_breakdown, + get_top_pages, + get_top_queries, + ) + + days = int(request.args.get("days", "28") or "28") + days = max(1, min(days, 730)) + source = request.args.get("source", "") or None + + queries = await get_top_queries(date_range_days=days, source=source) + pages = await get_top_pages(date_range_days=days, source=source) + countries = await get_country_breakdown(date_range_days=days) + devices = await get_device_breakdown(date_range_days=days) + + return await render_template( + "admin/partials/seo_search.html", + queries=queries, + pages=pages, + countries=countries, + devices=devices, + date_range_days=days, + current_source=source, + ) + + +@bp.route("/seo/funnel") +@role_required("admin") +async def seo_funnel(): + """HTMX partial: full funnel view.""" + from ..seo import get_funnel_metrics + + days = int(request.args.get("days", "28") or "28") + days = max(1, min(days, 730)) + funnel = await get_funnel_metrics(date_range_days=days) + + return await render_template( + "admin/partials/seo_funnel.html", + funnel=funnel, + date_range_days=days, + ) + + +@bp.route("/seo/scorecard") +@role_required("admin") +async def seo_scorecard(): + """HTMX partial: article scorecard.""" + from ..seo import get_article_scorecard + + days = int(request.args.get("days", "28") or "28") + days = max(1, min(days, 730)) + template_slug = request.args.get("template_slug", "") or None + country_filter = request.args.get("country", "") or None + language = request.args.get("language", "") or None + sort_by = request.args.get("sort", "impressions") + sort_dir = request.args.get("dir", "desc") + + scorecard = await get_article_scorecard( + date_range_days=days, + template_slug=template_slug, + country=country_filter, + language=language, + sort_by=sort_by, + sort_dir=sort_dir, + ) + + return await render_template( + "admin/partials/seo_scorecard.html", + scorecard=scorecard, + date_range_days=days, + current_template=template_slug, + current_country=country_filter, + current_language=language, + current_sort=sort_by, + current_dir=sort_dir, + ) + + +@bp.route("/seo/sync", methods=["POST"]) +@role_required("admin") +@csrf_protect +async def seo_sync_now(): + """Manually trigger SEO data sync.""" + from ..worker import enqueue + + form = await request.form + source = form.get("source", "all") + + if source == "all": + await enqueue("sync_gsc") + await enqueue("sync_bing") + await enqueue("sync_umami") + await flash("All SEO syncs queued.", "success") + elif source in ("gsc", "bing", "umami"): + await enqueue(f"sync_{source}") + await flash(f"{source.upper()} sync queued.", "success") + else: + await flash("Unknown source.", "error") + + return redirect(url_for("admin.seo")) diff --git a/web/src/padelnomics/admin/templates/admin/base_admin.html b/web/src/padelnomics/admin/templates/admin/base_admin.html index d826e97..80f6730 100644 --- a/web/src/padelnomics/admin/templates/admin/base_admin.html +++ b/web/src/padelnomics/admin/templates/admin/base_admin.html @@ -86,6 +86,12 @@ Templates +
Analytics
+ + + SEO Hub + +
System
diff --git a/web/src/padelnomics/admin/templates/admin/partials/seo_funnel.html b/web/src/padelnomics/admin/templates/admin/partials/seo_funnel.html new file mode 100644 index 0000000..ece0800 --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/partials/seo_funnel.html @@ -0,0 +1,96 @@ + + + +{% set max_val = [funnel.impressions, funnel.clicks, funnel.pageviews, funnel.visitors, funnel.planner_users, funnel.leads] | max or 1 %} + +
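{# max_val presumably scales the stage bars below: width = stage / max_val * 100, so with illustrative numbers impressions=12,000 fills the track while clicks=480 renders at 4%. #}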
+ + +
+
Impressions · Search results shown
+
+
+
+
+ {{ "{:,}".format(funnel.impressions | int) }} +
+
+ +
+
Clicks · CTR: {{ "%.1f" | format(funnel.ctr * 100) }}%
+
+
+
+
+ {{ "{:,}".format(funnel.clicks | int) }} +
+
+ + + +
+
Pageviews{% if funnel.clicks %} · {{ "%.0f" | format(funnel.click_to_view * 100) }}% of clicks{% endif %}
+
+
+
+
+ {{ "{:,}".format(funnel.pageviews | int) }} +
+
+ +
+
Visitors · Unique
+
+
+
+
+ {{ "{:,}".format(funnel.visitors | int) }} +
+
+ + + +
+
Planner Users{% if funnel.visitors %} · {{ "%.1f" | format(funnel.visitor_to_planner * 100) }}% of visitors{% endif %}
+
+
+
+
+ {{ "{:,}".format(funnel.planner_users | int) }} +
+
+ +
+
Lead Requests{% if funnel.planner_users %} · {{ "%.1f" | format(funnel.planner_to_lead * 100) }}% of planners{% endif %}
+
+
+
+
+ {{ "{:,}".format(funnel.leads | int) }} +
+
+
+ +{% if not funnel.impressions and not funnel.pageviews and not funnel.planner_users %} +
+

No funnel data yet. Run a sync to populate search and analytics metrics.

+
+{% endif %} diff --git a/web/src/padelnomics/admin/templates/admin/partials/seo_scorecard.html b/web/src/padelnomics/admin/templates/admin/partials/seo_scorecard.html new file mode 100644 index 0000000..49071e8 --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/partials/seo_scorecard.html @@ -0,0 +1,104 @@ + + + +
+
+ + +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+
+
+ +{% if scorecard %} +
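{# Sort links must use the column allowlist enforced in seo/_queries.get_article_scorecard (impressions, clicks, ctr, position_avg, pageviews, title, published_at); any other value falls back to impressions. #}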
+ + + + + + + + + + + + + + + + {% for a in scorecard %} + + + + + + + + + + + + {% endfor %} + +
Title · Impressions · Clicks · CTR · Pos · Views · Bounce · Published · Flags
+ {{ a.title or a.url_path }} + {% if a.template_slug %} +
{{ a.template_slug }} + {% endif %} +
{{ "{:,}".format(a.impressions | int) }}{{ "{:,}".format(a.clicks | int) }}{{ "%.1f" | format((a.ctr or 0) * 100) }}%{{ "%.1f" | format(a.position_avg or 0) }}{{ "{:,}".format(a.pageviews | int) }} + {% if a.bounce_rate is not none %}{{ "%.0f" | format(a.bounce_rate * 100) }}%{% else %}-{% endif %} + {{ a.published_at[:10] if a.published_at else '-' }} + {% if a.flag_low_ctr %} + Low CTR + {% endif %} + {% if a.flag_no_clicks %} + No Clicks + {% endif %} +
+
+

{{ scorecard | length }} articles shown

+{% else %} +
+

No published articles match the current filters, or no search/analytics data synced yet.

+
+{% endif %} diff --git a/web/src/padelnomics/admin/templates/admin/partials/seo_search.html b/web/src/padelnomics/admin/templates/admin/partials/seo_search.html new file mode 100644 index 0000000..9499a30 --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/partials/seo_search.html @@ -0,0 +1,132 @@ + +
+ + + +
+ +
+ +
+

Top Queries

+ {% if queries %} +
+ + + + + + + + + + + + {% for q in queries[:20] %} + + + + + + + + {% endfor %} + +
Query · Impressions · Clicks · CTR · Pos
{{ q.query }}{{ "{:,}".format(q.impressions | int) }}{{ "{:,}".format(q.clicks | int) }}{{ "%.1f" | format((q.ctr or 0) * 100) }}%{{ "%.1f" | format(q.position_avg or 0) }}
+
+ {% else %} +
+

No query data yet. Run a sync to populate.

+
+ {% endif %} +
+ + +
+

Top Pages

+ {% if pages %} +
+ + + + + + + + + + + + {% for p in pages[:20] %} + + + + + + + + {% endfor %} + +
Page · Impressions · Clicks · CTR · Pos
{{ p.page_url }}{{ "{:,}".format(p.impressions | int) }}{{ "{:,}".format(p.clicks | int) }}{{ "%.1f" | format((p.ctr or 0) * 100) }}%{{ "%.1f" | format(p.position_avg or 0) }}
+
+ {% else %} +
+

No page data yet.

+
+ {% endif %} +
+
+ +
+ +
+

By Country

+ {% if countries %} +
+ + + + {% for c in countries[:15] %} + + + + + + {% endfor %} + +
Country · Impressions · Clicks
{{ c.country | upper }}{{ "{:,}".format(c.impressions | int) }}{{ "{:,}".format(c.clicks | int) }}
+
+ {% else %} +

No country data.

+ {% endif %} +
+ + +
+

By Device (GSC)

+ {% if devices %} +
+ + + + {% for d in devices %} + + + + + + {% endfor %} + +
Device · Impressions · Clicks
{{ d.device | capitalize }}{{ "{:,}".format(d.impressions | int) }}{{ "{:,}".format(d.clicks | int) }}
+
+ {% else %} +

No device data yet (device breakdown comes from GSC only).

+ {% endif %} +
+
diff --git a/web/src/padelnomics/admin/templates/admin/seo.html b/web/src/padelnomics/admin/templates/admin/seo.html new file mode 100644 index 0000000..0b0f295 --- /dev/null +++ b/web/src/padelnomics/admin/templates/admin/seo.html @@ -0,0 +1,149 @@ +{% extends "admin/base_admin.html" %} +{% set admin_page = "seo" %} +{% block title %}SEO Hub - Admin - {{ config.APP_NAME }}{% endblock %} + +{% block admin_head %} + +{% endblock %} + +{% block admin_content %} +
+
+

SEO & Analytics Hub

+

Search performance, funnel metrics, and article scorecard

+
+
+
+ + +
+ Last sync: + {% for s in sync_status %} + + {{ s.source | upper }} + {% if s.status == 'success' %} + {{ s.completed_at[:16] if s.completed_at else '' }} ({{ s.rows_synced }} rows) + {% elif s.status == 'failed' %} + failed + {% endif %} + + {% endfor %} + {% if not sync_status %} + No syncs yet + {% endif %} +
+ + +
+
+ {% for d, label in [(7, '7d'), (28, '28d'), (90, '3m'), (180, '6m'), (365, '12m')] %} + + {% endfor %} +
+
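{# days is clamped server-side to 1-730 in every /admin/seo route, so hand-edited URLs cannot request unbounded ranges. #}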
+ + +
+
+

Impressions

+

{{ "{:,}".format(overview.total_impressions | int) }}

+
+
+

Clicks

+

{{ "{:,}".format(overview.total_clicks | int) }}

+
+
+

Avg CTR

+

{{ "%.1f" | format(overview.avg_ctr * 100) }}%

+
+
+

Avg Position

+

{{ "%.1f" | format(overview.avg_position) }}

+
+
+ + +
+ + + +
+ + +
+
+

Loading...

+
+
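{# Tab buttons load the /admin/seo/search, /admin/seo/funnel and /admin/seo/scorecard partials into this container via HTMX. Illustrative, assumed hookup (the #seo-tab-content id is hypothetical): <button hx-get="/admin/seo/search?days={{ date_range_days }}" hx-target="#seo-tab-content">Search</button> #}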
+ + +{% endblock %} diff --git a/web/src/padelnomics/core.py b/web/src/padelnomics/core.py index 53aa510..b85284f 100644 --- a/web/src/padelnomics/core.py +++ b/web/src/padelnomics/core.py @@ -51,7 +51,13 @@ class Config: UMAMI_API_URL: str = os.getenv("UMAMI_API_URL", "https://umami.padelnomics.io") UMAMI_API_TOKEN: str = os.getenv("UMAMI_API_TOKEN", "") UMAMI_WEBSITE_ID: str = "4474414b-58d6-4c6e-89a1-df5ea1f49d70" - + + # SEO metrics sync + GSC_SERVICE_ACCOUNT_PATH: str = os.getenv("GSC_SERVICE_ACCOUNT_PATH", "") + GSC_SITE_URL: str = os.getenv("GSC_SITE_URL", "") + BING_WEBMASTER_API_KEY: str = os.getenv("BING_WEBMASTER_API_KEY", "") + BING_SITE_URL: str = os.getenv("BING_SITE_URL", "") + RESEND_API_KEY: str = os.getenv("RESEND_API_KEY", "") EMAIL_FROM: str = _env("EMAIL_FROM", "hello@padelnomics.io") LEADS_EMAIL: str = _env("LEADS_EMAIL", "leads@padelnomics.io") diff --git a/web/src/padelnomics/migrations/versions/0019_add_seo_metrics.py b/web/src/padelnomics/migrations/versions/0019_add_seo_metrics.py new file mode 100644 index 0000000..aea6400 --- /dev/null +++ b/web/src/padelnomics/migrations/versions/0019_add_seo_metrics.py @@ -0,0 +1,84 @@ +"""Add SEO metrics tables for GSC, Bing, and Umami data sync. + +Three tables: + - seo_search_metrics — daily search data per page+query (GSC + Bing) + - seo_analytics_metrics — daily page analytics (Umami) + - seo_sync_log — tracks sync state per source +""" + + +def up(conn): + # ── 1. Search metrics (GSC + Bing) ───────────────────────────────── + conn.execute(""" + CREATE TABLE IF NOT EXISTS seo_search_metrics ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL, + metric_date TEXT NOT NULL, + page_url TEXT NOT NULL, + query TEXT, + country TEXT, + device TEXT, + clicks INTEGER NOT NULL DEFAULT 0, + impressions INTEGER NOT NULL DEFAULT 0, + ctr REAL, + position_avg REAL, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + # COALESCE converts NULLs to '' for unique index (SQLite treats + # NULL as distinct in UNIQUE constraints, causing duplicate rows) + conn.execute(""" + CREATE UNIQUE INDEX IF NOT EXISTS idx_seo_search_dedup + ON seo_search_metrics( + source, metric_date, page_url, + COALESCE(query, ''), COALESCE(country, ''), COALESCE(device, '') + ) + """) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_seo_search_date" + " ON seo_search_metrics(metric_date)" + ) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_seo_search_page" + " ON seo_search_metrics(page_url)" + ) + + # ── 2. Analytics metrics (Umami) ─────────────────────────────────── + conn.execute(""" + CREATE TABLE IF NOT EXISTS seo_analytics_metrics ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + metric_date TEXT NOT NULL, + page_url TEXT NOT NULL, + pageviews INTEGER NOT NULL DEFAULT 0, + visitors INTEGER NOT NULL DEFAULT 0, + bounce_rate REAL, + time_avg_seconds INTEGER, + created_at TEXT NOT NULL DEFAULT (datetime('now')) + ) + """) + conn.execute(""" + CREATE UNIQUE INDEX IF NOT EXISTS idx_seo_analytics_dedup + ON seo_analytics_metrics(metric_date, page_url) + """) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_seo_analytics_date" + " ON seo_analytics_metrics(metric_date)" + ) + + # ── 3. 
Sync log ──────────────────────────────────────────────────── + conn.execute(""" + CREATE TABLE IF NOT EXISTS seo_sync_log ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + source TEXT NOT NULL, + status TEXT NOT NULL, + rows_synced INTEGER NOT NULL DEFAULT 0, + error TEXT, + started_at TEXT NOT NULL, + completed_at TEXT, + duration_ms INTEGER + ) + """) + conn.execute( + "CREATE INDEX IF NOT EXISTS idx_seo_sync_source" + " ON seo_sync_log(source, started_at)" + ) diff --git a/web/src/padelnomics/seo/__init__.py b/web/src/padelnomics/seo/__init__.py new file mode 100644 index 0000000..40a4e2b --- /dev/null +++ b/web/src/padelnomics/seo/__init__.py @@ -0,0 +1,36 @@ +""" +SEO metrics sync and query module. + +Syncs data from Google Search Console, Bing Webmaster Tools, and Umami +into SQLite tables. Query functions support the admin SEO hub views. +""" + +from ._bing import sync_bing +from ._gsc import sync_gsc +from ._queries import ( + cleanup_old_metrics, + get_article_scorecard, + get_country_breakdown, + get_device_breakdown, + get_funnel_metrics, + get_search_performance, + get_sync_status, + get_top_pages, + get_top_queries, +) +from ._umami import sync_umami + +__all__ = [ + "sync_gsc", + "sync_bing", + "sync_umami", + "get_search_performance", + "get_top_queries", + "get_top_pages", + "get_country_breakdown", + "get_device_breakdown", + "get_funnel_metrics", + "get_article_scorecard", + "get_sync_status", + "cleanup_old_metrics", +] diff --git a/web/src/padelnomics/seo/_bing.py b/web/src/padelnomics/seo/_bing.py new file mode 100644 index 0000000..8effc5a --- /dev/null +++ b/web/src/padelnomics/seo/_bing.py @@ -0,0 +1,143 @@ +"""Bing Webmaster Tools sync via REST API. + +Uses an API key for auth. Fetches query stats and page stats. +""" + +from datetime import datetime, timedelta +from urllib.parse import urlparse + +import httpx + +from ..core import config, execute + + +_TIMEOUT_SECONDS = 30 + + +def _normalize_url(full_url: str) -> str: + """Strip a full URL to just the path.""" + parsed = urlparse(full_url) + return parsed.path or "/" + + +async def sync_bing(days_back: int = 3, timeout_seconds: int = _TIMEOUT_SECONDS) -> int: + """Sync Bing Webmaster query stats into seo_search_metrics. 
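+ 
+     Bing returns dates in the legacy JSON-date format "/Date(1708905600000)/"
+     (milliseconds since the Unix epoch); the loop below parses them roughly as
+ 
+         ms = int(date_str.split("(")[1].split(")")[0])   # 1708905600000
+         datetime.utcfromtimestamp(ms / 1000)             # 2024-02-26 00:00 UTC
+ 
+     The request itself carries no date-range parameters, so entries older
+     than the days_back cutoff are filtered client-side.
+ 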
Returns rows synced.""" + assert 1 <= days_back <= 90, "days_back must be 1-90" + assert 1 <= timeout_seconds <= 120, "timeout_seconds must be 1-120" + + if not config.BING_WEBMASTER_API_KEY or not config.BING_SITE_URL: + return 0 # Bing not configured — skip silently + + started_at = datetime.utcnow() + + try: + rows_synced = 0 + async with httpx.AsyncClient(timeout=timeout_seconds) as client: + # Fetch query stats for the date range + response = await client.get( + "https://ssl.bing.com/webmaster/api.svc/json/GetQueryStats", + params={ + "apikey": config.BING_WEBMASTER_API_KEY, + "siteUrl": config.BING_SITE_URL, + }, + ) + response.raise_for_status() + data = response.json() + + # Bing returns {"d": [{"Query": ..., "Date": ..., ...}, ...]} + entries = data.get("d", []) if isinstance(data, dict) else data + if not isinstance(entries, list): + entries = [] + + cutoff = datetime.utcnow() - timedelta(days=days_back) + + for entry in entries: + # Bing date format: "/Date(1708905600000)/" (ms since epoch) + date_str = entry.get("Date", "") + if "/Date(" in date_str: + ms = int(date_str.split("(")[1].split(")")[0]) + entry_date = datetime.utcfromtimestamp(ms / 1000) + else: + continue + + if entry_date < cutoff: + continue + + metric_date = entry_date.strftime("%Y-%m-%d") + query = entry.get("Query", "") + + await execute( + """INSERT OR REPLACE INTO seo_search_metrics + (source, metric_date, page_url, query, country, device, + clicks, impressions, ctr, position_avg) + VALUES ('bing', ?, '/', ?, NULL, NULL, ?, ?, ?, ?)""", + ( + metric_date, query, + entry.get("Clicks", 0), + entry.get("Impressions", 0), + entry.get("AvgCTR", 0.0), + entry.get("AvgClickPosition", 0.0), + ), + ) + rows_synced += 1 + + # Also fetch page-level stats + page_response = await client.get( + "https://ssl.bing.com/webmaster/api.svc/json/GetPageStats", + params={ + "apikey": config.BING_WEBMASTER_API_KEY, + "siteUrl": config.BING_SITE_URL, + }, + ) + page_response.raise_for_status() + page_data = page_response.json() + + page_entries = page_data.get("d", []) if isinstance(page_data, dict) else page_data + if not isinstance(page_entries, list): + page_entries = [] + + for entry in page_entries: + date_str = entry.get("Date", "") + if "/Date(" in date_str: + ms = int(date_str.split("(")[1].split(")")[0]) + entry_date = datetime.utcfromtimestamp(ms / 1000) + else: + continue + + if entry_date < cutoff: + continue + + metric_date = entry_date.strftime("%Y-%m-%d") + page_url = _normalize_url(entry.get("Url", "/")) + + await execute( + """INSERT OR REPLACE INTO seo_search_metrics + (source, metric_date, page_url, query, country, device, + clicks, impressions, ctr, position_avg) + VALUES ('bing', ?, ?, '', NULL, NULL, ?, ?, NULL, NULL)""", + ( + metric_date, page_url, + entry.get("Clicks", 0), + entry.get("Impressions", 0), + ), + ) + rows_synced += 1 + + duration_ms = int((datetime.utcnow() - started_at).total_seconds() * 1000) + await execute( + """INSERT INTO seo_sync_log + (source, status, rows_synced, started_at, completed_at, duration_ms) + VALUES ('bing', 'success', ?, ?, ?, ?)""", + (rows_synced, started_at.isoformat(), datetime.utcnow().isoformat(), duration_ms), + ) + return rows_synced + + except Exception as exc: + duration_ms = int((datetime.utcnow() - started_at).total_seconds() * 1000) + await execute( + """INSERT INTO seo_sync_log + (source, status, rows_synced, error, started_at, completed_at, duration_ms) + VALUES ('bing', 'failed', 0, ?, ?, ?, ?)""", + (str(exc), started_at.isoformat(), 
datetime.utcnow().isoformat(), duration_ms), + ) + raise diff --git a/web/src/padelnomics/seo/_gsc.py b/web/src/padelnomics/seo/_gsc.py new file mode 100644 index 0000000..9753160 --- /dev/null +++ b/web/src/padelnomics/seo/_gsc.py @@ -0,0 +1,144 @@ +"""Google Search Console sync via Search Analytics API. + +Uses a service account JSON key file for auth. The google-api-python-client +is synchronous, so sync runs in asyncio.to_thread(). +""" + +import asyncio +import time +from datetime import datetime, timedelta +from pathlib import Path +from urllib.parse import urlparse + +from ..core import config, execute + + +# GSC returns max 25K rows per request +_ROWS_PER_PAGE = 25_000 + + +def _fetch_gsc_data( + start_date: str, + end_date: str, + max_pages: int, +) -> list[dict]: + """Synchronous GSC fetch — called via asyncio.to_thread(). + + Returns list of dicts with keys: date, page, query, country, device, + clicks, impressions, ctr, position. + """ + from google.oauth2.service_account import Credentials + from googleapiclient.discovery import build + + key_path = Path(config.GSC_SERVICE_ACCOUNT_PATH) + assert key_path.exists(), f"GSC service account key not found: {key_path}" + + credentials = Credentials.from_service_account_file( + str(key_path), + scopes=["https://www.googleapis.com/auth/webmasters.readonly"], + ) + service = build("searchconsole", "v1", credentials=credentials) + + all_rows = [] + start_row = 0 + + for _page_num in range(max_pages): + body = { + "startDate": start_date, + "endDate": end_date, + "dimensions": ["date", "page", "query", "country", "device"], + "rowLimit": _ROWS_PER_PAGE, + "startRow": start_row, + } + response = service.searchanalytics().query( + siteUrl=config.GSC_SITE_URL, + body=body, + ).execute() + + rows = response.get("rows", []) + if not rows: + break + + for row in rows: + keys = row["keys"] + all_rows.append({ + "date": keys[0], + "page": keys[1], + "query": keys[2], + "country": keys[3], + "device": keys[4], + "clicks": row.get("clicks", 0), + "impressions": row.get("impressions", 0), + "ctr": row.get("ctr", 0.0), + "position": row.get("position", 0.0), + }) + + if len(rows) < _ROWS_PER_PAGE: + break + start_row += _ROWS_PER_PAGE + + return all_rows + + +def _normalize_url(full_url: str) -> str: + """Strip a full URL to just the path (no domain). + + Example: 'https://padelnomics.io/en/markets/germany/berlin' → '/en/markets/germany/berlin' + """ + parsed = urlparse(full_url) + return parsed.path or "/" + + +async def sync_gsc(days_back: int = 3, max_pages: int = 10) -> int: + """Sync GSC search analytics into seo_search_metrics. 
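+ 
+     GSC publishes data with a ~2 day delay, so with the default days_back=3
+     the requested window is roughly
+ 
+         end_date   = today - 2 days
+         start_date = today - (days_back + 2) days
+ 
+     Rows are paged 25,000 at a time; _fetch_gsc_data advances startRow until
+     a short page signals the end (capped at max_pages requests).
+ 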
Returns rows synced.""" + assert 1 <= days_back <= 90, "days_back must be 1-90" + assert 1 <= max_pages <= 20, "max_pages must be 1-20" + + if not config.GSC_SERVICE_ACCOUNT_PATH or not config.GSC_SITE_URL: + return 0 # GSC not configured — skip silently + + started_at = datetime.utcnow() + + # GSC has ~2 day delay; fetch from days_back ago to 2 days ago + end_date = (datetime.utcnow() - timedelta(days=2)).strftime("%Y-%m-%d") + start_date = (datetime.utcnow() - timedelta(days=days_back + 2)).strftime("%Y-%m-%d") + + try: + rows = await asyncio.to_thread( + _fetch_gsc_data, start_date, end_date, max_pages, + ) + + rows_synced = 0 + for row in rows: + page_url = _normalize_url(row["page"]) + await execute( + """INSERT OR REPLACE INTO seo_search_metrics + (source, metric_date, page_url, query, country, device, + clicks, impressions, ctr, position_avg) + VALUES ('gsc', ?, ?, ?, ?, ?, ?, ?, ?, ?)""", + ( + row["date"], page_url, row["query"], row["country"], + row["device"], row["clicks"], row["impressions"], + row["ctr"], row["position"], + ), + ) + rows_synced += 1 + + duration_ms = int((datetime.utcnow() - started_at).total_seconds() * 1000) + await execute( + """INSERT INTO seo_sync_log + (source, status, rows_synced, started_at, completed_at, duration_ms) + VALUES ('gsc', 'success', ?, ?, ?, ?)""", + (rows_synced, started_at.isoformat(), datetime.utcnow().isoformat(), duration_ms), + ) + return rows_synced + + except Exception as exc: + duration_ms = int((datetime.utcnow() - started_at).total_seconds() * 1000) + await execute( + """INSERT INTO seo_sync_log + (source, status, rows_synced, error, started_at, completed_at, duration_ms) + VALUES ('gsc', 'failed', 0, ?, ?, ?, ?)""", + (str(exc), started_at.isoformat(), datetime.utcnow().isoformat(), duration_ms), + ) + raise diff --git a/web/src/padelnomics/seo/_queries.py b/web/src/padelnomics/seo/_queries.py new file mode 100644 index 0000000..94434c0 --- /dev/null +++ b/web/src/padelnomics/seo/_queries.py @@ -0,0 +1,379 @@ +"""SQL query functions for the admin SEO hub views. + +All heavy lifting happens in SQL. Functions accept filter parameters +and return plain dicts/lists. +""" + +from datetime import datetime, timedelta + +from ..core import execute, fetch_all, fetch_one + + +def _date_cutoff(date_range_days: int) -> str: + """Return ISO date string for N days ago.""" + return (datetime.utcnow() - timedelta(days=date_range_days)).strftime("%Y-%m-%d") + + +async def get_search_performance( + date_range_days: int = 28, + source: str | None = None, +) -> dict: + """Aggregate search performance: total clicks, impressions, avg CTR, avg position.""" + assert 1 <= date_range_days <= 730 + + cutoff = _date_cutoff(date_range_days) + source_filter = "AND source = ?" if source else "" + params = [cutoff] + if source: + params.append(source) + + row = await fetch_one( + f"""SELECT + COALESCE(SUM(clicks), 0) AS total_clicks, + COALESCE(SUM(impressions), 0) AS total_impressions, + CASE WHEN SUM(impressions) > 0 + THEN CAST(SUM(clicks) AS REAL) / SUM(impressions) + ELSE 0 END AS avg_ctr, + CASE WHEN SUM(impressions) > 0 + THEN SUM(position_avg * impressions) / SUM(impressions) + ELSE 0 END AS avg_position + FROM seo_search_metrics + WHERE metric_date >= ? 
{source_filter}""", + tuple(params), + ) + return dict(row) if row else { + "total_clicks": 0, "total_impressions": 0, + "avg_ctr": 0, "avg_position": 0, + } + + +async def get_top_queries( + date_range_days: int = 28, + source: str | None = None, + limit: int = 50, +) -> list[dict]: + """Top queries by impressions with clicks, CTR, avg position.""" + assert 1 <= date_range_days <= 730 + assert 1 <= limit <= 500 + + cutoff = _date_cutoff(date_range_days) + source_filter = "AND source = ?" if source else "" + params: list = [cutoff] + if source: + params.append(source) + params.append(limit) + + rows = await fetch_all( + f"""SELECT + query, + SUM(clicks) AS clicks, + SUM(impressions) AS impressions, + CASE WHEN SUM(impressions) > 0 + THEN CAST(SUM(clicks) AS REAL) / SUM(impressions) + ELSE 0 END AS ctr, + CASE WHEN SUM(impressions) > 0 + THEN SUM(position_avg * impressions) / SUM(impressions) + ELSE 0 END AS position_avg + FROM seo_search_metrics + WHERE metric_date >= ? + AND query IS NOT NULL AND query != '' + {source_filter} + GROUP BY query + ORDER BY impressions DESC + LIMIT ?""", + tuple(params), + ) + return [dict(r) for r in rows] + + +async def get_top_pages( + date_range_days: int = 28, + source: str | None = None, + limit: int = 50, +) -> list[dict]: + """Top pages by impressions with clicks, CTR, avg position.""" + assert 1 <= date_range_days <= 730 + assert 1 <= limit <= 500 + + cutoff = _date_cutoff(date_range_days) + source_filter = "AND source = ?" if source else "" + params: list = [cutoff] + if source: + params.append(source) + params.append(limit) + + rows = await fetch_all( + f"""SELECT + page_url, + SUM(clicks) AS clicks, + SUM(impressions) AS impressions, + CASE WHEN SUM(impressions) > 0 + THEN CAST(SUM(clicks) AS REAL) / SUM(impressions) + ELSE 0 END AS ctr, + CASE WHEN SUM(impressions) > 0 + THEN SUM(position_avg * impressions) / SUM(impressions) + ELSE 0 END AS position_avg + FROM seo_search_metrics + WHERE metric_date >= ? + {source_filter} + GROUP BY page_url + ORDER BY impressions DESC + LIMIT ?""", + tuple(params), + ) + return [dict(r) for r in rows] + + +async def get_country_breakdown( + date_range_days: int = 28, +) -> list[dict]: + """Clicks and impressions by country.""" + assert 1 <= date_range_days <= 730 + + cutoff = _date_cutoff(date_range_days) + rows = await fetch_all( + """SELECT + country, + SUM(clicks) AS clicks, + SUM(impressions) AS impressions + FROM seo_search_metrics + WHERE metric_date >= ? + AND country IS NOT NULL AND country != '' + GROUP BY country + ORDER BY impressions DESC + LIMIT 50""", + (cutoff,), + ) + return [dict(r) for r in rows] + + +async def get_device_breakdown( + date_range_days: int = 28, +) -> list[dict]: + """Clicks and impressions by device type (GSC only).""" + assert 1 <= date_range_days <= 730 + + cutoff = _date_cutoff(date_range_days) + rows = await fetch_all( + """SELECT + device, + SUM(clicks) AS clicks, + SUM(impressions) AS impressions + FROM seo_search_metrics + WHERE metric_date >= ? + AND source = 'gsc' + AND device IS NOT NULL AND device != '' + GROUP BY device + ORDER BY impressions DESC""", + (cutoff,), + ) + return [dict(r) for r in rows] + + +async def get_funnel_metrics( + date_range_days: int = 28, +) -> dict: + """Full funnel: search → analytics → conversions. + + Combines search metrics (GSC/Bing), analytics (Umami), and + business metrics (planner users, leads) from SQLite. 
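+ 
+     Returned shape (illustrative numbers, not real data):
+ 
+         {"impressions": 12000, "clicks": 480, "pageviews": 900,
+          "visitors": 600, "planner_users": 45, "leads": 6,
+          "ctr": 0.04, "click_to_view": 1.875, "view_to_visitor": 0.667,
+          "visitor_to_planner": 0.075, "planner_to_lead": 0.133}
+ 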
+ """ + assert 1 <= date_range_days <= 730 + + cutoff = _date_cutoff(date_range_days) + + # Search layer + search = await fetch_one( + """SELECT + COALESCE(SUM(impressions), 0) AS impressions, + COALESCE(SUM(clicks), 0) AS clicks + FROM seo_search_metrics + WHERE metric_date >= ?""", + (cutoff,), + ) + + # Analytics layer + analytics = await fetch_one( + """SELECT + COALESCE(SUM(pageviews), 0) AS pageviews, + COALESCE(SUM(visitors), 0) AS visitors + FROM seo_analytics_metrics + WHERE metric_date >= ? + AND page_url != '/'""", + (cutoff,), + ) + + # Business layer (from existing SQLite tables) + planner_users = await fetch_one( + """SELECT COUNT(DISTINCT user_id) AS cnt + FROM scenarios + WHERE deleted_at IS NULL + AND created_at >= ?""", + (cutoff,), + ) + + leads = await fetch_one( + """SELECT COUNT(*) AS cnt + FROM lead_requests + WHERE lead_type = 'quote' + AND created_at >= ?""", + (cutoff,), + ) + + imp = search["impressions"] if search else 0 + clicks = search["clicks"] if search else 0 + pvs = analytics["pageviews"] if analytics else 0 + vis = analytics["visitors"] if analytics else 0 + planners = planner_users["cnt"] if planner_users else 0 + lead_count = leads["cnt"] if leads else 0 + + return { + "impressions": imp, + "clicks": clicks, + "pageviews": pvs, + "visitors": vis, + "planner_users": planners, + "leads": lead_count, + # Conversion rates between stages + "ctr": clicks / imp if imp > 0 else 0, + "click_to_view": pvs / clicks if clicks > 0 else 0, + "view_to_visitor": vis / pvs if pvs > 0 else 0, + "visitor_to_planner": planners / vis if vis > 0 else 0, + "planner_to_lead": lead_count / planners if planners > 0 else 0, + } + + +async def get_article_scorecard( + date_range_days: int = 28, + template_slug: str | None = None, + country: str | None = None, + language: str | None = None, + sort_by: str = "impressions", + sort_dir: str = "desc", + limit: int = 100, +) -> list[dict]: + """Per-article scorecard joining articles + search + analytics metrics. + + Returns article metadata enriched with search and analytics data, + plus attention flags for articles needing action. 
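+ 
+     Flag semantics, as computed in the SQL below:
+ 
+       - flag_low_ctr:   more than 100 impressions but CTR under 2%
+       - flag_no_clicks: zero clicks despite being published 30+ days ago
+ 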
+ """ + assert 1 <= date_range_days <= 730 + assert 1 <= limit <= 500 + assert sort_dir in ("asc", "desc") + + # Allowlist sort columns to prevent SQL injection + sort_columns = { + "impressions", "clicks", "ctr", "position_avg", + "pageviews", "title", "published_at", + } + if sort_by not in sort_columns: + sort_by = "impressions" + + cutoff = _date_cutoff(date_range_days) + + wheres = ["a.status = 'published'"] + params: list = [cutoff, cutoff] + + if template_slug: + wheres.append("a.template_slug = ?") + params.append(template_slug) + if country: + wheres.append("a.country = ?") + params.append(country) + if language: + wheres.append("a.language = ?") + params.append(language) + + where_clause = " AND ".join(wheres) + params.append(limit) + + rows = await fetch_all( + f"""SELECT + a.id, + a.title, + a.url_path, + a.template_slug, + a.country, + a.language, + a.published_at, + COALESCE(s.impressions, 0) AS impressions, + COALESCE(s.clicks, 0) AS clicks, + COALESCE(s.ctr, 0) AS ctr, + COALESCE(s.position_avg, 0) AS position_avg, + COALESCE(u.pageviews, 0) AS pageviews, + COALESCE(u.visitors, 0) AS visitors, + u.bounce_rate, + u.time_avg_seconds, + -- Attention flags + CASE WHEN COALESCE(s.impressions, 0) > 100 + AND COALESCE(s.ctr, 0) < 0.02 + THEN 1 ELSE 0 END AS flag_low_ctr, + CASE WHEN COALESCE(s.clicks, 0) = 0 + AND a.published_at <= date('now', '-30 days') + THEN 1 ELSE 0 END AS flag_no_clicks + FROM articles a + LEFT JOIN ( + SELECT page_url, + SUM(impressions) AS impressions, + SUM(clicks) AS clicks, + CASE WHEN SUM(impressions) > 0 + THEN CAST(SUM(clicks) AS REAL) / SUM(impressions) + ELSE 0 END AS ctr, + CASE WHEN SUM(impressions) > 0 + THEN SUM(position_avg * impressions) / SUM(impressions) + ELSE 0 END AS position_avg + FROM seo_search_metrics + WHERE metric_date >= ? + GROUP BY page_url + ) s ON s.page_url = a.url_path + LEFT JOIN ( + SELECT page_url, + SUM(pageviews) AS pageviews, + SUM(visitors) AS visitors, + AVG(bounce_rate) AS bounce_rate, + AVG(time_avg_seconds) AS time_avg_seconds + FROM seo_analytics_metrics + WHERE metric_date >= ? + GROUP BY page_url + ) u ON u.page_url = a.url_path + WHERE {where_clause} + ORDER BY {sort_by} {sort_dir} + LIMIT ?""", + tuple(params), + ) + return [dict(r) for r in rows] + + +async def get_sync_status() -> list[dict]: + """Last sync status for each source (gsc, bing, umami).""" + rows = await fetch_all( + """SELECT source, status, rows_synced, error, + started_at, completed_at, duration_ms + FROM seo_sync_log + WHERE id IN ( + SELECT MAX(id) FROM seo_sync_log GROUP BY source + ) + ORDER BY source""" + ) + return [dict(r) for r in rows] + + +async def cleanup_old_metrics(retention_days: int = 365) -> int: + """Delete metrics older than retention_days. 
Returns rows deleted.""" + assert 30 <= retention_days <= 1095 + + cutoff = _date_cutoff(retention_days) + + deleted_search = await execute( + "DELETE FROM seo_search_metrics WHERE metric_date < ?", (cutoff,) + ) + deleted_analytics = await execute( + "DELETE FROM seo_analytics_metrics WHERE metric_date < ?", (cutoff,) + ) + # Sync log: keep 30 days + sync_cutoff = _date_cutoff(30) + deleted_sync = await execute( + "DELETE FROM seo_sync_log WHERE started_at < ?", (sync_cutoff,) + ) + + return (deleted_search or 0) + (deleted_analytics or 0) + (deleted_sync or 0) diff --git a/web/src/padelnomics/seo/_umami.py b/web/src/padelnomics/seo/_umami.py new file mode 100644 index 0000000..33a7083 --- /dev/null +++ b/web/src/padelnomics/seo/_umami.py @@ -0,0 +1,117 @@ +"""Umami analytics sync via REST API. + +Uses bearer token auth. Self-hosted instance, no rate limits. +Config already exists: UMAMI_API_URL, UMAMI_API_TOKEN, UMAMI_WEBSITE_ID. +""" + +from datetime import datetime, timedelta + +import httpx + +from ..core import config, execute + + +_TIMEOUT_SECONDS = 15 + + +async def sync_umami(days_back: int = 3, timeout_seconds: int = _TIMEOUT_SECONDS) -> int: + """Sync Umami per-URL metrics into seo_analytics_metrics. Returns rows synced.""" + assert 1 <= days_back <= 90, "days_back must be 1-90" + assert 1 <= timeout_seconds <= 120, "timeout_seconds must be 1-120" + + if not config.UMAMI_API_TOKEN or not config.UMAMI_API_URL: + return 0 # Umami not configured — skip silently + + started_at = datetime.utcnow() + + try: + rows_synced = 0 + headers = {"Authorization": f"Bearer {config.UMAMI_API_TOKEN}"} + base = config.UMAMI_API_URL.rstrip("/") + website_id = config.UMAMI_WEBSITE_ID + + async with httpx.AsyncClient(timeout=timeout_seconds, headers=headers) as client: + # Fetch per-URL metrics for each day individually + # (Umami's metrics endpoint returns totals for the period, + # so we query one day at a time for daily granularity) + for day_offset in range(days_back): + day = datetime.utcnow() - timedelta(days=day_offset + 1) + metric_date = day.strftime("%Y-%m-%d") + start_ms = int(day.replace(hour=0, minute=0, second=0).timestamp() * 1000) + end_ms = int(day.replace(hour=23, minute=59, second=59).timestamp() * 1000) + + # Get URL-level metrics + response = await client.get( + f"{base}/api/websites/{website_id}/metrics", + params={ + "startAt": start_ms, + "endAt": end_ms, + "type": "url", + "limit": 500, + }, + ) + response.raise_for_status() + url_metrics = response.json() + + if not isinstance(url_metrics, list): + continue + + for entry in url_metrics: + page_url = entry.get("x", "") + pageviews = entry.get("y", 0) + + if not page_url: + continue + + await execute( + """INSERT OR REPLACE INTO seo_analytics_metrics + (metric_date, page_url, pageviews, visitors, + bounce_rate, time_avg_seconds) + VALUES (?, ?, ?, 0, NULL, NULL)""", + (metric_date, page_url, pageviews), + ) + rows_synced += 1 + + # Try to get overall stats for bounce rate and visit duration + # (Umami doesn't provide per-URL bounce rate, only site-wide) + stats_response = await client.get( + f"{base}/api/websites/{website_id}/stats", + params={"startAt": start_ms, "endAt": end_ms}, + ) + if stats_response.status_code == 200: + stats = stats_response.json() + visitors = stats.get("visitors", {}).get("value", 0) + bounce_rate = stats.get("bounces", {}).get("value", 0) + total_time = stats.get("totaltime", {}).get("value", 0) + page_count = stats.get("pageviews", {}).get("value", 1) or 1 + + # Store site-wide stats on the root 
URL for the day + avg_time = int(total_time / max(visitors, 1)) + br = bounce_rate / max(visitors, 1) if visitors else 0 + + await execute( + """INSERT OR REPLACE INTO seo_analytics_metrics + (metric_date, page_url, pageviews, visitors, + bounce_rate, time_avg_seconds) + VALUES (?, '/', ?, ?, ?, ?)""", + (metric_date, page_count, visitors, br, avg_time), + ) + + duration_ms = int((datetime.utcnow() - started_at).total_seconds() * 1000) + await execute( + """INSERT INTO seo_sync_log + (source, status, rows_synced, started_at, completed_at, duration_ms) + VALUES ('umami', 'success', ?, ?, ?, ?)""", + (rows_synced, started_at.isoformat(), datetime.utcnow().isoformat(), duration_ms), + ) + return rows_synced + + except Exception as exc: + duration_ms = int((datetime.utcnow() - started_at).total_seconds() * 1000) + await execute( + """INSERT INTO seo_sync_log + (source, status, rows_synced, error, started_at, completed_at, duration_ms) + VALUES ('umami', 'failed', 0, ?, ?, ?, ?)""", + (str(exc), started_at.isoformat(), datetime.utcnow().isoformat(), duration_ms), + ) + raise diff --git a/web/src/padelnomics/worker.py b/web/src/padelnomics/worker.py index 6b8fa4b..a718af3 100644 --- a/web/src/padelnomics/worker.py +++ b/web/src/padelnomics/worker.py @@ -564,6 +564,45 @@ async def handle_cleanup_tasks(payload: dict) -> None: ) +# ============================================================================= +# SEO Metrics Sync +# ============================================================================= + +@task("sync_gsc") +async def handle_sync_gsc(payload: dict) -> None: + """Sync Google Search Console data.""" + from .seo import sync_gsc + days_back = payload.get("days_back", 3) + rows = await sync_gsc(days_back=days_back) + print(f"[WORKER] GSC sync complete: {rows} rows") + + +@task("sync_bing") +async def handle_sync_bing(payload: dict) -> None: + """Sync Bing Webmaster data.""" + from .seo import sync_bing + days_back = payload.get("days_back", 3) + rows = await sync_bing(days_back=days_back) + print(f"[WORKER] Bing sync complete: {rows} rows") + + +@task("sync_umami") +async def handle_sync_umami(payload: dict) -> None: + """Sync Umami analytics data.""" + from .seo import sync_umami + days_back = payload.get("days_back", 3) + rows = await sync_umami(days_back=days_back) + print(f"[WORKER] Umami sync complete: {rows} rows") + + +@task("cleanup_seo_metrics") +async def handle_cleanup_seo_metrics(payload: dict) -> None: + """Delete SEO metrics older than 12 months.""" + from .seo import cleanup_old_metrics + deleted = await cleanup_old_metrics(retention_days=365) + print(f"[WORKER] Cleaned up {deleted} old SEO metric rows") + + # ============================================================================= # Worker Loop # ============================================================================= @@ -616,6 +655,7 @@ async def run_scheduler() -> None: await init_db() last_credit_refill = None + last_seo_sync_date = None while True: try: @@ -633,8 +673,19 @@ async def run_scheduler() -> None: last_credit_refill = this_month print(f"[SCHEDULER] Queued monthly credit refill for {this_month}") + # Daily SEO metrics sync — run once per day after 6am UTC + # (GSC data has ~2 day delay, syncing at 6am ensures data is ready) + today_date = today.strftime("%Y-%m-%d") + if last_seo_sync_date != today_date and today.hour >= 6: + await enqueue("sync_gsc") + await enqueue("sync_bing") + await enqueue("sync_umami") + await enqueue("cleanup_seo_metrics") + last_seo_sync_date = today_date + 
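# NOTE: the once-per-day guard lives in process memory, so a worker
+ # restart after 06:00 UTC re-queues that day's syncs; this is harmless
+ # because all three sync tasks upsert via INSERT OR REPLACE.
+ 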
print(f"[SCHEDULER] Queued SEO metric syncs for {today_date}") + await asyncio.sleep(3600) # 1 hour - + except Exception as e: print(f"[SCHEDULER] Error: {e}") await asyncio.sleep(60)
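
A note on the dedup design in migration 0019, since it is easy to miss why the
COALESCE() wrappers are there: SQLite treats NULLs as distinct inside UNIQUE
indexes, so a plain unique index over nullable columns lets repeated syncs pile
up duplicate rows instead of replacing them. A minimal standalone sketch (table
and column names here are illustrative, not from the patch):

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE m (src TEXT, day TEXT, q TEXT, clicks INT)")

    # Naive unique index: NULL q values never collide, so re-syncs duplicate rows.
    conn.execute("CREATE UNIQUE INDEX naive ON m (src, day, q)")
    conn.execute("INSERT OR REPLACE INTO m VALUES ('gsc', '2026-02-20', NULL, 1)")
    conn.execute("INSERT OR REPLACE INTO m VALUES ('gsc', '2026-02-20', NULL, 2)")
    print(conn.execute("SELECT COUNT(*) FROM m").fetchone()[0])  # 2 -> duplicated

    # Expression index as in 0019: COALESCE maps NULL to '', so the second
    # upsert replaces the first row instead of adding a sibling.
    conn.execute("DELETE FROM m")
    conn.execute("DROP INDEX naive")
    conn.execute("CREATE UNIQUE INDEX dedup ON m (src, day, COALESCE(q, ''))")
    conn.execute("INSERT OR REPLACE INTO m VALUES ('gsc', '2026-02-20', NULL, 1)")
    conn.execute("INSERT OR REPLACE INTO m VALUES ('gsc', '2026-02-20', NULL, 2)")
    print(conn.execute("SELECT clicks FROM m").fetchone()[0])    # 2 -> replaced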