diff --git a/CHANGELOG.md b/CHANGELOG.md index bceecd3..1e98b87 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - Supervisor: added daily sleep interval between pipeline runs ### Added +- **Sitemap: hreflang alternates + caching** — extracted sitemap generation to + `sitemap.py`; each URL entry now includes `xhtml:link` hreflang alternates + (en, de, x-default) for correct international SEO signaling; supplier detail + pages now listed in both EN and DE (were EN-only); removed misleading "today" + lastmod from static pages; added 1-hour in-memory TTL cache with + `Cache-Control: public, max-age=3600` response header - **Playtomic availability extractor** (`playtomic_availability.py`) — daily next-day booking slot snapshots for occupancy rate estimation and pricing benchmarking; reads tenant IDs from latest `tenants.json.gz`, queries `/v1/availability` per venue with 2s throttle, resumable diff --git a/web/src/padelnomics/app.py b/web/src/padelnomics/app.py index e54a817..30aad72 100644 --- a/web/src/padelnomics/app.py +++ b/web/src/padelnomics/app.py @@ -251,63 +251,10 @@ def create_app() -> Quart: ) return Response(body, content_type="text/plain") - # sitemap.xml must live at root @app.route("/sitemap.xml") async def sitemap(): - from datetime import UTC, datetime - - from .core import fetch_all - base = config.BASE_URL.rstrip("/") - today = datetime.now(UTC).strftime("%Y-%m-%d") - - # Both language variants of all SEO pages - static_paths = [ - "", # landing - "/features", - "/about", - "/terms", - "/privacy", - "/imprint", - "/suppliers", - "/markets", - ] - entries: list[tuple[str, str]] = [] - for path in static_paths: - for lang in ("en", "de"): - entries.append((f"{base}/{lang}{path}", today)) - - # Planner + directory lang variants, billing (no lang) - for lang in ("en", "de"): - entries.append((f"{base}/{lang}/planner/", today)) - entries.append((f"{base}/{lang}/directory/", today)) - entries.append((f"{base}/billing/pricing", today)) - - # Published articles — both lang variants - articles = await fetch_all( - """SELECT url_path, COALESCE(updated_at, published_at) as lastmod - FROM articles - WHERE status = 'published' AND published_at <= datetime('now') - ORDER BY published_at DESC""" - ) - for article in articles: - lastmod = article["lastmod"][:10] if article["lastmod"] else today - for lang in ("en", "de"): - entries.append((f"{base}/{lang}{article['url_path']}", lastmod)) - - # Supplier detail pages (English only — canonical) - suppliers = await fetch_all( - "SELECT slug, created_at FROM suppliers ORDER BY name LIMIT 5000" - ) - for supplier in suppliers: - lastmod = supplier["created_at"][:10] if supplier["created_at"] else today - entries.append((f"{base}/en/directory/{supplier['slug']}", lastmod)) - - xml = '\n' - xml += '\n' - for loc, lastmod in entries: - xml += f" {loc}{lastmod}\n" - xml += "" - return Response(xml, content_type="application/xml") + from .sitemap import sitemap_response + return await sitemap_response(config.BASE_URL) # Health check @app.route("/health") diff --git a/web/src/padelnomics/sitemap.py b/web/src/padelnomics/sitemap.py new file mode 100644 index 0000000..d7c1568 --- /dev/null +++ b/web/src/padelnomics/sitemap.py @@ -0,0 +1,117 @@ +"""Sitemap generation with in-memory TTL cache and hreflang alternates.""" + +import time + +from quart import Response + +from .core import fetch_all + +_cache_xml: str = "" +_cache_timestamp: float = 0.0 +CACHE_TTL_SECONDS: int = 3600 # 1 hour + +LANGS = ("en", "de") +DEFAULT_LANG = "en" + +# Pages with lang prefix but no meaningful lastmod +STATIC_PATHS = [ + "", # landing + "/features", + "/about", + "/terms", + "/privacy", + "/imprint", + "/suppliers", + "/markets", + "/planner/", + "/directory/", +] + + +def _url_entry(loc: str, alternates: list[tuple[str, str]], lastmod: str | None = None) -> str: + """Build a single entry with optional hreflang alternates and lastmod.""" + parts = [f" \n {loc}"] + for hreflang, href in alternates: + parts.append( + f' ' + ) + if lastmod: + parts.append(f" {lastmod}") + parts.append(" ") + return "\n".join(parts) + + +def _lang_alternates(base: str, path: str) -> list[tuple[str, str]]: + """Build hreflang alternate list for a lang-prefixed path.""" + alternates = [] + for lang in LANGS: + alternates.append((lang, f"{base}/{lang}{path}")) + alternates.append(("x-default", f"{base}/{DEFAULT_LANG}{path}")) + return alternates + + +async def _generate_sitemap_xml(base_url: str) -> str: + """Build sitemap XML from static paths + DB content.""" + base = base_url.rstrip("/") + entries: list[str] = [] + + # Static pages — both lang variants, no lastmod (rarely changes) + for path in STATIC_PATHS: + alternates = _lang_alternates(base, path) + for lang in LANGS: + entries.append(_url_entry(f"{base}/{lang}{path}", alternates)) + + # Billing pricing — no lang prefix, no hreflang + entries.append(_url_entry(f"{base}/billing/pricing", [])) + + # Published articles — both lang variants with accurate lastmod + articles = await fetch_all( + """SELECT url_path, COALESCE(updated_at, published_at) AS lastmod + FROM articles + WHERE status = 'published' AND published_at <= datetime('now') + ORDER BY published_at DESC + LIMIT 25000""" + ) + for article in articles: + lastmod = article["lastmod"][:10] if article["lastmod"] else None + alternates = _lang_alternates(base, article["url_path"]) + for lang in LANGS: + entries.append( + _url_entry(f"{base}/{lang}{article['url_path']}", alternates, lastmod) + ) + + # Supplier detail pages — both lang variants + suppliers = await fetch_all( + "SELECT slug, created_at FROM suppliers ORDER BY name LIMIT 5000" + ) + for supplier in suppliers: + lastmod = supplier["created_at"][:10] if supplier["created_at"] else None + path = f"/directory/{supplier['slug']}" + alternates = _lang_alternates(base, path) + for lang in LANGS: + entries.append( + _url_entry(f"{base}/{lang}{path}", alternates, lastmod) + ) + + xml = '\n' + xml += ( + '\n' + ) + xml += "\n".join(entries) + xml += "\n" + return xml + + +async def sitemap_response(base_url: str) -> Response: + """Return cached sitemap XML, regenerating if stale (1-hour TTL).""" + global _cache_xml, _cache_timestamp # noqa: PLW0603 + now = time.monotonic() + if not _cache_xml or (now - _cache_timestamp) > CACHE_TTL_SECONDS: + _cache_xml = await _generate_sitemap_xml(base_url) + _cache_timestamp = now + return Response( + _cache_xml, + content_type="application/xml", + headers={"Cache-Control": f"public, max-age={CACHE_TTL_SECONDS}"}, + ) diff --git a/web/tests/conftest.py b/web/tests/conftest.py index 3fbdcc9..1d69655 100644 --- a/web/tests/conftest.py +++ b/web/tests/conftest.py @@ -12,10 +12,12 @@ from unittest.mock import AsyncMock, patch import aiosqlite import pytest -from padelnomics import core from padelnomics.app import create_app from padelnomics.migrations.migrate import migrate +from padelnomics import core +from padelnomics import sitemap as sitemap_mod + _SCHEMA_CACHE = None @@ -56,6 +58,9 @@ async def db(): original_db = core._db core._db = conn + # Clear sitemap cache so tests see fresh DB state + sitemap_mod._cache_xml = "" + sitemap_mod._cache_timestamp = 0.0 yield conn @@ -147,6 +152,7 @@ def create_subscription(db): async def scenario(db, test_user): """User scenario with valid planner state for PDF generation.""" import json + from padelnomics.planner.calculator import validate_state state = validate_state({"dblCourts": 4, "sglCourts": 2}) now = datetime.utcnow().isoformat()