feat: Python supervisor + feature flags
Supervisor (replaces supervisor.sh): - supervisor.py — cron-based pipeline orchestration, reads workflows.toml on every tick, runs due extractors in topological waves with parallel execution, then SQLMesh transform + serving export - workflows.toml — workflow registry: overpass (monthly), eurostat (monthly), playtomic_tenants (weekly), playtomic_availability (daily), playtomic_recheck (hourly 6–23) - padelnomics-supervisor.service — updated ExecStart to Python supervisor Extraction enhancements: - proxy.py — optional round-robin/sticky proxy rotation via PROXY_URLS env - playtomic_availability.py — parallel fetch (EXTRACT_WORKERS), recheck mode (main_recheck) re-queries imminent slots for accurate occupancy measurement - _shared.py — realistic browser User-Agent on all extractor sessions - stg_playtomic_availability.sql — reads morning + recheck snapshots, tags each - fct_daily_availability.sql — prefers recheck over morning for same slot Feature flags (replaces WAITLIST_MODE env var): - migration 0019 — feature_flags table, 5 initial flags: markets (on), payments/planner_export/supplier_signup/lead_unlock (off) - core.py — is_flag_enabled() + feature_gate() decorator - routes — payments, markets, planner_export, supplier_signup, lead_unlock gated - admin flags UI — /admin/flags toggle page + nav link - app.py — flag() injected as Jinja2 global Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -14,6 +14,7 @@ extract-overpass = "padelnomics_extract.overpass:main"
|
||||
extract-eurostat = "padelnomics_extract.eurostat:main"
|
||||
extract-playtomic-tenants = "padelnomics_extract.playtomic_tenants:main"
|
||||
extract-playtomic-availability = "padelnomics_extract.playtomic_availability:main"
|
||||
extract-playtomic-recheck = "padelnomics_extract.playtomic_availability:main_recheck"
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
|
||||
@@ -19,6 +19,13 @@ LANDING_DIR = Path(os.environ.get("LANDING_DIR", "data/landing"))
|
||||
HTTP_TIMEOUT_SECONDS = 30
|
||||
OVERPASS_TIMEOUT_SECONDS = 90 # Overpass can be slow on global queries
|
||||
|
||||
# Realistic browser User-Agent — avoids bot detection on all extractors
|
||||
USER_AGENT = (
|
||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
||||
"Chrome/131.0.0.0 Safari/537.36"
|
||||
)
|
||||
|
||||
|
||||
def setup_logging(name: str) -> logging.Logger:
|
||||
"""Configure and return a logger for the given extractor module."""
|
||||
@@ -50,6 +57,7 @@ def run_extractor(
|
||||
|
||||
try:
|
||||
with niquests.Session() as session:
|
||||
session.headers["User-Agent"] = USER_AGENT
|
||||
result = func(LANDING_DIR, year_month, conn, session)
|
||||
|
||||
assert isinstance(result, dict), f"extractor must return a dict, got {type(result)}"
|
||||
|
||||
@@ -5,33 +5,51 @@ unauthenticated /v1/availability endpoint for each venue's next-day slots.
|
||||
This is the highest-value source: daily snapshots enable occupancy rate
|
||||
estimation, pricing benchmarking, and demand signal detection.
|
||||
|
||||
API constraint: max 25-hour window per request (see docs/data-sources-inventory.md §2.1).
|
||||
Rate: 1 req / 2 s (conservative, unauthenticated endpoint).
|
||||
Parallel mode: set EXTRACT_WORKERS=N and PROXY_URLS=... to fetch N venues
|
||||
concurrently (one proxy per worker). Without proxies, runs single-threaded.
|
||||
|
||||
Recheck mode: re-queries venues with slots starting within the next 90 minutes.
|
||||
Writes a separate recheck file for more accurate occupancy measurement.
|
||||
|
||||
Landing: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}.json.gz
|
||||
Recheck: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}_recheck_{HH}.json.gz
|
||||
"""
|
||||
|
||||
import gzip
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import threading
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from datetime import UTC, datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import niquests
|
||||
|
||||
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging
|
||||
from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
|
||||
from .proxy import load_proxy_urls, make_round_robin_cycler
|
||||
from .utils import get_last_cursor, landing_path, write_gzip_atomic
|
||||
|
||||
logger = setup_logging("padelnomics.extract.playtomic_availability")
|
||||
|
||||
EXTRACTOR_NAME = "playtomic_availability"
|
||||
RECHECK_EXTRACTOR_NAME = "playtomic_recheck"
|
||||
AVAILABILITY_URL = "https://api.playtomic.io/v1/availability"
|
||||
|
||||
THROTTLE_SECONDS = 1
|
||||
MAX_VENUES_PER_RUN = 10_000
|
||||
MAX_VENUES_PER_RUN = 20_000
|
||||
MAX_RETRIES_PER_VENUE = 2
|
||||
MAX_WORKERS = int(os.environ.get("EXTRACT_WORKERS", "1"))
|
||||
RECHECK_WINDOW_MINUTES = int(os.environ.get("RECHECK_WINDOW_MINUTES", "90"))
|
||||
|
||||
# Thread-local storage for per-worker sessions
|
||||
_thread_local = threading.local()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tenant ID loading
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_tenant_ids(landing_dir: Path) -> list[str]:
|
||||
"""Read tenant IDs from the most recent tenants.json.gz file."""
|
||||
@@ -39,7 +57,6 @@ def _load_tenant_ids(landing_dir: Path) -> list[str]:
|
||||
if not playtomic_dir.exists():
|
||||
return []
|
||||
|
||||
# Find the most recent tenants.json.gz across all year/month dirs
|
||||
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True)
|
||||
if not tenant_files:
|
||||
return []
|
||||
@@ -65,12 +82,10 @@ def _parse_resume_cursor(cursor: str | None, target_date: str) -> int:
|
||||
"""Parse cursor_value to find resume index. Returns 0 if no valid cursor."""
|
||||
if not cursor:
|
||||
return 0
|
||||
# cursor format: "{date}:{index}"
|
||||
parts = cursor.split(":", 1)
|
||||
if len(parts) != 2:
|
||||
return 0
|
||||
cursor_date, cursor_index = parts
|
||||
# Only resume if cursor is for today's target date
|
||||
if cursor_date != target_date:
|
||||
return 0
|
||||
try:
|
||||
@@ -79,6 +94,125 @@ def _parse_resume_cursor(cursor: str | None, target_date: str) -> int:
|
||||
return 0
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-venue fetch (used by both serial and parallel modes)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _get_thread_session(proxy_url: str | None) -> niquests.Session:
    """Return this thread's cached niquests session, creating it on first use.

    The session — and its proxy binding, if *proxy_url* is given — is
    established once per thread; later calls reuse the cached session and
    ignore *proxy_url*, so a worker thread keeps the proxy it was first
    assigned for its whole lifetime.
    """
    cached = getattr(_thread_local, "session", None)
    if cached is None:
        cached = niquests.Session()
        cached.headers["User-Agent"] = USER_AGENT
        if proxy_url:
            # Route both schemes through the same proxy endpoint.
            cached.proxies = {"http": proxy_url, "https": proxy_url}
        _thread_local.session = cached
    return cached
|
||||
|
||||
|
||||
def _fetch_venue_availability(
    tenant_id: str,
    start_min_str: str,
    start_max_str: str,
    proxy_url: str | None,
) -> dict | None:
    """Fetch availability for a single venue. Returns payload dict or None on failure.

    Args:
        tenant_id: Playtomic tenant (venue) identifier to query.
        start_min_str / start_max_str: window bounds, "%Y-%m-%dT%H:%M:%S" format.
        proxy_url: optional proxy for this worker's session (bound on first
            use per thread by _get_thread_session).

    Returns:
        {"tenant_id": ..., "slots": <decoded JSON>} on success, or None when
        all MAX_RETRIES_PER_VENUE + 1 attempts were exhausted.
    """
    session = _get_thread_session(proxy_url)
    params = {
        "sport_id": "PADEL",
        "tenant_id": tenant_id,
        "start_min": start_min_str,
        "start_max": start_max_str,
    }

    for attempt in range(MAX_RETRIES_PER_VENUE + 1):
        try:
            resp = session.get(AVAILABILITY_URL, params=params, timeout=HTTP_TIMEOUT_SECONDS)

            if resp.status_code == 429:
                # Rate limited — back off progressively (wait grows with attempt)
                # before retrying. Note: 429/5xx retries consume attempts too.
                wait_seconds = THROTTLE_SECONDS * (attempt + 2)
                logger.warning("Rate limited on %s, waiting %ds", tenant_id, wait_seconds)
                time.sleep(wait_seconds)
                continue

            if resp.status_code >= 500:
                # Transient server error — short pause, then retry.
                logger.warning(
                    "Server error %d for %s (attempt %d)",
                    resp.status_code, tenant_id, attempt + 1,
                )
                time.sleep(THROTTLE_SECONDS)
                continue

            resp.raise_for_status()
            # Throttle BEFORE returning so every successful request is spaced
            # out, keeping per-worker request rate conservative.
            time.sleep(THROTTLE_SECONDS)
            return {"tenant_id": tenant_id, "slots": resp.json()}

        except niquests.exceptions.RequestException as e:
            if attempt < MAX_RETRIES_PER_VENUE:
                logger.warning("Request failed for %s (attempt %d): %s", tenant_id, attempt + 1, e)
                time.sleep(THROTTLE_SECONDS)
            else:
                logger.error(
                    "Giving up on %s after %d attempts: %s",
                    tenant_id, MAX_RETRIES_PER_VENUE + 1, e,
                )
                return None

    return None  # all retries exhausted via 429/5xx
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parallel fetch orchestrator
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _fetch_venues_parallel(
|
||||
tenant_ids: list[str],
|
||||
start_min_str: str,
|
||||
start_max_str: str,
|
||||
worker_count: int,
|
||||
proxy_cycler,
|
||||
) -> tuple[list[dict], int]:
|
||||
"""Fetch availability for multiple venues in parallel.
|
||||
|
||||
Returns (venues_data, venues_errored).
|
||||
"""
|
||||
venues_data: list[dict] = []
|
||||
venues_errored = 0
|
||||
completed_count = 0
|
||||
lock = threading.Lock()
|
||||
|
||||
def _worker(tenant_id: str) -> dict | None:
|
||||
proxy_url = proxy_cycler()
|
||||
return _fetch_venue_availability(tenant_id, start_min_str, start_max_str, proxy_url)
|
||||
|
||||
with ThreadPoolExecutor(max_workers=worker_count) as pool:
|
||||
futures = {pool.submit(_worker, tid): tid for tid in tenant_ids}
|
||||
|
||||
for future in as_completed(futures):
|
||||
result = future.result()
|
||||
with lock:
|
||||
completed_count += 1
|
||||
if result is not None:
|
||||
venues_data.append(result)
|
||||
else:
|
||||
venues_errored += 1
|
||||
|
||||
if completed_count % 500 == 0:
|
||||
logger.info(
|
||||
"Progress: %d/%d venues (%d errors, %d workers)",
|
||||
completed_count, len(tenant_ids), venues_errored, worker_count,
|
||||
)
|
||||
|
||||
logger.info(
|
||||
"Parallel fetch complete: %d/%d venues (%d errors, %d workers)",
|
||||
len(venues_data), len(tenant_ids), venues_errored, worker_count,
|
||||
)
|
||||
return venues_data, venues_errored
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Main extraction function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def extract(
|
||||
landing_dir: Path,
|
||||
year_month: str,
|
||||
@@ -91,7 +225,7 @@ def extract(
|
||||
logger.warning("No tenant IDs found — run extract-playtomic-tenants first")
|
||||
return {"files_written": 0, "files_skipped": 0, "bytes_written": 0}
|
||||
|
||||
# Query tomorrow's slots (25-hour window starting at midnight local)
|
||||
# Query tomorrow's slots
|
||||
tomorrow = datetime.now(UTC) + timedelta(days=1)
|
||||
target_date = tomorrow.strftime("%Y-%m-%d")
|
||||
start_min = tomorrow.replace(hour=0, minute=0, second=0, microsecond=0)
|
||||
@@ -101,117 +235,223 @@ def extract(
|
||||
dest_dir = landing_path(landing_dir, "playtomic", year, month)
|
||||
dest = dest_dir / f"availability_{target_date}.json.gz"
|
||||
|
||||
# Check if already completed for this date
|
||||
if dest.exists():
|
||||
logger.info("Already have %s — skipping", dest)
|
||||
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
|
||||
|
||||
# Resume from last cursor if we crashed mid-run
|
||||
# Resume from cursor if crashed mid-run
|
||||
last_cursor = get_last_cursor(conn, EXTRACTOR_NAME)
|
||||
resume_index = _parse_resume_cursor(last_cursor, target_date)
|
||||
if resume_index > 0:
|
||||
logger.info("Resuming from index %d (cursor: %s)", resume_index, last_cursor)
|
||||
|
||||
venues_data: list[dict] = []
|
||||
venues_to_process = tenant_ids[:MAX_VENUES_PER_RUN]
|
||||
venues_errored = 0
|
||||
if resume_index > 0:
|
||||
venues_to_process = venues_to_process[resume_index:]
|
||||
|
||||
for i, tenant_id in enumerate(venues_to_process):
|
||||
if i < resume_index:
|
||||
continue
|
||||
# Determine parallelism
|
||||
proxy_urls = load_proxy_urls()
|
||||
worker_count = min(MAX_WORKERS, len(proxy_urls)) if proxy_urls else 1
|
||||
proxy_cycler = make_round_robin_cycler(proxy_urls)
|
||||
|
||||
params = {
|
||||
"sport_id": "PADEL",
|
||||
"tenant_id": tenant_id,
|
||||
"start_min": start_min.strftime("%Y-%m-%dT%H:%M:%S"),
|
||||
"start_max": start_max.strftime("%Y-%m-%dT%H:%M:%S"),
|
||||
}
|
||||
start_min_str = start_min.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
start_max_str = start_max.strftime("%Y-%m-%dT%H:%M:%S")
|
||||
|
||||
for attempt in range(MAX_RETRIES_PER_VENUE + 1):
|
||||
try:
|
||||
resp = session.get(AVAILABILITY_URL, params=params, timeout=HTTP_TIMEOUT_SECONDS)
|
||||
|
||||
if resp.status_code == 429:
|
||||
# Rate limited — back off and retry
|
||||
wait_seconds = THROTTLE_SECONDS * (attempt + 2)
|
||||
logger.warning("Rate limited on %s, waiting %ds", tenant_id, wait_seconds)
|
||||
time.sleep(wait_seconds)
|
||||
continue
|
||||
|
||||
if resp.status_code >= 500:
|
||||
logger.warning(
|
||||
"Server error %d for %s (attempt %d)",
|
||||
resp.status_code,
|
||||
tenant_id,
|
||||
attempt + 1,
|
||||
)
|
||||
time.sleep(THROTTLE_SECONDS)
|
||||
continue
|
||||
|
||||
resp.raise_for_status()
|
||||
venues_data.append({"tenant_id": tenant_id, "slots": resp.json()})
|
||||
break
|
||||
|
||||
except niquests.exceptions.RequestException as e:
|
||||
if attempt < MAX_RETRIES_PER_VENUE:
|
||||
logger.warning(
|
||||
"Request failed for %s (attempt %d): %s", tenant_id, attempt + 1, e
|
||||
)
|
||||
time.sleep(THROTTLE_SECONDS)
|
||||
else:
|
||||
logger.error(
|
||||
"Giving up on %s after %d attempts: %s",
|
||||
tenant_id,
|
||||
MAX_RETRIES_PER_VENUE + 1,
|
||||
e,
|
||||
)
|
||||
venues_errored += 1
|
||||
else:
|
||||
# All retries exhausted (loop completed without break)
|
||||
venues_errored += 1
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
logger.info(
|
||||
"Progress: %d/%d venues queried, %d errors",
|
||||
i + 1,
|
||||
len(venues_to_process),
|
||||
venues_errored,
|
||||
if worker_count > 1:
|
||||
logger.info("Parallel mode: %d workers, %d proxies", worker_count, len(proxy_urls))
|
||||
venues_data, venues_errored = _fetch_venues_parallel(
|
||||
venues_to_process, start_min_str, start_max_str, worker_count, proxy_cycler,
|
||||
)
|
||||
else:
|
||||
# Serial mode — same as before but uses shared fetch function
|
||||
logger.info("Serial mode: 1 worker, %d venues", len(venues_to_process))
|
||||
venues_data = []
|
||||
venues_errored = 0
|
||||
for i, tenant_id in enumerate(venues_to_process):
|
||||
result = _fetch_venue_availability(
|
||||
tenant_id, start_min_str, start_max_str, proxy_cycler(),
|
||||
)
|
||||
if result is not None:
|
||||
venues_data.append(result)
|
||||
else:
|
||||
venues_errored += 1
|
||||
|
||||
time.sleep(THROTTLE_SECONDS)
|
||||
if (i + 1) % 100 == 0:
|
||||
logger.info(
|
||||
"Progress: %d/%d venues queried, %d errors",
|
||||
i + 1, len(venues_to_process), venues_errored,
|
||||
)
|
||||
|
||||
# Write consolidated file
|
||||
captured_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
payload = json.dumps(
|
||||
{
|
||||
"date": target_date,
|
||||
"captured_at_utc": captured_at,
|
||||
"venue_count": len(venues_data),
|
||||
"venues_errored": venues_errored,
|
||||
"venues": venues_data,
|
||||
}
|
||||
).encode()
|
||||
payload = json.dumps({
|
||||
"date": target_date,
|
||||
"captured_at_utc": captured_at,
|
||||
"venue_count": len(venues_data),
|
||||
"venues_errored": venues_errored,
|
||||
"venues": venues_data,
|
||||
}).encode()
|
||||
|
||||
bytes_written = write_gzip_atomic(dest, payload)
|
||||
logger.info(
|
||||
"%d venues scraped (%d errors) -> %s (%s bytes)",
|
||||
len(venues_data),
|
||||
venues_errored,
|
||||
dest,
|
||||
f"{bytes_written:,}",
|
||||
len(venues_data), venues_errored, dest, f"{bytes_written:,}",
|
||||
)
|
||||
|
||||
return {
|
||||
"files_written": 1,
|
||||
"files_skipped": 0,
|
||||
"bytes_written": bytes_written,
|
||||
"cursor_value": f"{target_date}:{len(venues_to_process)}",
|
||||
"cursor_value": f"{target_date}:{len(tenant_ids[:MAX_VENUES_PER_RUN])}",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Recheck mode — re-query venues with upcoming slots for accurate occupancy
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _load_morning_availability(landing_dir: Path, target_date: str) -> dict | None:
|
||||
"""Load today's morning availability file. Returns parsed JSON or None."""
|
||||
playtomic_dir = landing_dir / "playtomic"
|
||||
# Search across year/month dirs for the target date
|
||||
matches = list(playtomic_dir.glob(f"*/*/availability_{target_date}.json.gz"))
|
||||
if not matches:
|
||||
return None
|
||||
|
||||
with gzip.open(matches[0], "rb") as f:
|
||||
return json.loads(f.read())
|
||||
|
||||
|
||||
def _find_venues_with_upcoming_slots(
|
||||
morning_data: dict, window_start: datetime, window_end: datetime
|
||||
) -> list[str]:
|
||||
"""Find tenant_ids that have available slots starting within the recheck window."""
|
||||
tenant_ids = set()
|
||||
for venue in morning_data.get("venues", []):
|
||||
tid = venue.get("tenant_id")
|
||||
if not tid:
|
||||
continue
|
||||
for resource in venue.get("slots", []):
|
||||
for slot in resource.get("slots", []):
|
||||
start_time_str = slot.get("start_time")
|
||||
if not start_time_str:
|
||||
continue
|
||||
try:
|
||||
# Parse "2026-02-24T17:00:00" format
|
||||
slot_start = datetime.fromisoformat(start_time_str).replace(tzinfo=UTC)
|
||||
if window_start <= slot_start < window_end:
|
||||
tenant_ids.add(tid)
|
||||
break # found one upcoming slot, no need to check more
|
||||
except ValueError:
|
||||
continue
|
||||
if tid in tenant_ids:
|
||||
break # already found upcoming slot for this venue
|
||||
|
||||
return sorted(tenant_ids)
|
||||
|
||||
|
||||
def extract_recheck(
    landing_dir: Path,
    year_month: str,
    conn: sqlite3.Connection,
    session: niquests.Session,
) -> dict:
    """Re-query venues with slots starting soon for accurate occupancy data.

    Loads today's (or, failing that, tomorrow's) morning availability
    snapshot, finds venues with slots starting within the next
    RECHECK_WINDOW_MINUTES, re-fetches just those venues, and writes a
    separate recheck file tagged with the current UTC hour.

    Args:
        landing_dir: landing zone root.
        year_month: "YYYY/MM" used for the destination directory.
        conn: extractor-state DB connection (unused here beyond the shared
            run_extractor contract — NOTE(review): assumed, confirm).
        session: shared session from run_extractor (per-venue fetches use
            thread-local sessions instead).

    Returns:
        Stats dict: files_written / files_skipped / bytes_written, plus a
        cursor_value of the form "{date}:recheck:{hour}".
    """
    now = datetime.now(UTC)
    target_date = now.strftime("%Y-%m-%d")

    # Also check tomorrow if it's late evening
    tomorrow = (now + timedelta(days=1)).strftime("%Y-%m-%d")

    morning_data = _load_morning_availability(landing_dir, target_date)
    if morning_data is None:
        # Try tomorrow's file (morning extraction creates it for tomorrow)
        morning_data = _load_morning_availability(landing_dir, tomorrow)
        if morning_data is None:
            logger.info("No morning availability file found — skipping recheck")
            return {"files_written": 0, "files_skipped": 0, "bytes_written": 0}
        # Fell back to tomorrow's snapshot — keep the recheck file name
        # consistent with the snapshot it was derived from.
        target_date = tomorrow

    # Find venues with slots in the upcoming window
    window_start = now
    window_end = now + timedelta(minutes=RECHECK_WINDOW_MINUTES)
    venues_to_recheck = _find_venues_with_upcoming_slots(morning_data, window_start, window_end)

    if not venues_to_recheck:
        logger.info("No venues with upcoming slots in next %d min — skipping", RECHECK_WINDOW_MINUTES)
        return {"files_written": 0, "files_skipped": 0, "bytes_written": 0}

    logger.info(
        "Rechecking %d venues with slots starting in next %d min",
        len(venues_to_recheck), RECHECK_WINDOW_MINUTES,
    )

    # Fetch availability for the recheck window
    start_min_str = window_start.strftime("%Y-%m-%dT%H:%M:%S")
    start_max_str = window_end.strftime("%Y-%m-%dT%H:%M:%S")

    # Determine parallelism: capped by the number of proxies so each worker
    # gets its own; without proxies, run single-threaded.
    proxy_urls = load_proxy_urls()
    worker_count = min(MAX_WORKERS, len(proxy_urls)) if proxy_urls else 1
    proxy_cycler = make_round_robin_cycler(proxy_urls)

    # Parallel fetch only pays off beyond a handful of venues.
    if worker_count > 1 and len(venues_to_recheck) > 10:
        venues_data, venues_errored = _fetch_venues_parallel(
            venues_to_recheck, start_min_str, start_max_str, worker_count, proxy_cycler,
        )
    else:
        venues_data = []
        venues_errored = 0
        for tid in venues_to_recheck:
            result = _fetch_venue_availability(tid, start_min_str, start_max_str, proxy_cycler())
            if result is not None:
                venues_data.append(result)
            else:
                venues_errored += 1

    # Write recheck file (hour-stamped so multiple rechecks per day coexist)
    recheck_hour = now.hour
    year, month = year_month.split("/")
    dest_dir = landing_path(landing_dir, "playtomic", year, month)
    dest = dest_dir / f"availability_{target_date}_recheck_{recheck_hour:02d}.json.gz"

    captured_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
    payload = json.dumps({
        "date": target_date,
        "captured_at_utc": captured_at,
        "recheck_hour": recheck_hour,
        "recheck_window_minutes": RECHECK_WINDOW_MINUTES,
        "rechecked_tenant_ids": venues_to_recheck,
        "venue_count": len(venues_data),
        "venues_errored": venues_errored,
        "venues": venues_data,
    }).encode()

    bytes_written = write_gzip_atomic(dest, payload)
    logger.info(
        "Recheck: %d/%d venues (%d errors) -> %s (%s bytes)",
        len(venues_data), len(venues_to_recheck), venues_errored, dest, f"{bytes_written:,}",
    )

    return {
        "files_written": 1,
        "files_skipped": 0,
        "bytes_written": bytes_written,
        "cursor_value": f"{target_date}:recheck:{recheck_hour}",
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry points
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main() -> None:
    """Console entry point (extract-playtomic-availability): run the daily extraction."""
    run_extractor(EXTRACTOR_NAME, extract)
|
||||
|
||||
|
||||
def main_recheck() -> None:
    """Console entry point (extract-playtomic-recheck): re-query imminent slots."""
    run_extractor(RECHECK_EXTRACTOR_NAME, extract_recheck)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
57
extract/padelnomics_extract/src/padelnomics_extract/proxy.py
Normal file
57
extract/padelnomics_extract/src/padelnomics_extract/proxy.py
Normal file
@@ -0,0 +1,57 @@
|
||||
"""Optional proxy rotation for parallel HTTP fetching.
|
||||
|
||||
Proxies are configured via the PROXY_URLS environment variable (comma-separated).
|
||||
When unset, all functions return None/no-op — extractors fall back to direct requests.
|
||||
|
||||
Two routing modes:
|
||||
round-robin — distribute requests evenly across proxies (default)
|
||||
sticky — same key always maps to same proxy (for session-tracked sites)
|
||||
"""
|
||||
|
||||
import itertools
import os
import threading
import zlib
|
||||
|
||||
|
||||
def load_proxy_urls() -> list[str]:
    """Parse the PROXY_URLS environment variable into a list of proxy URLs.

    The variable is comma-separated; whitespace around each entry is
    stripped and empty entries are dropped. Returns [] when unset.

    Format: http://user:pass@host:port or socks5://host:port
    """
    configured = os.environ.get("PROXY_URLS", "")
    return [entry.strip() for entry in configured.split(",") if entry.strip()]
|
||||
|
||||
|
||||
def make_round_robin_cycler(proxy_urls: list[str]):
    """Build a thread-safe round-robin proxy chooser.

    Returns a zero-argument callable yielding the next proxy URL on each
    call, cycling endlessly through *proxy_urls*. When no proxies are
    configured, the returned callable always yields None.
    """
    if not proxy_urls:
        return lambda: None

    rotation = itertools.cycle(proxy_urls)
    guard = threading.Lock()

    def next_proxy() -> str:
        # Serialize advancement so concurrent workers each draw a distinct
        # next proxy from the shared cycle.
        with guard:
            return next(rotation)

    return next_proxy
|
||||
|
||||
|
||||
def make_sticky_selector(proxy_urls: list[str]):
    """Consistent-hash proxy selector — same key always maps to same proxy.

    Use when the target site tracks sessions by IP (e.g. Cloudflare).
    Returns a callable: select_proxy(key: str) -> str | None

    Uses zlib.crc32 rather than the builtin hash(): str hashing is salted
    per process (PYTHONHASHSEED), so hash()-based selection would remap
    every key on each restart, silently defeating stickiness across runs.
    crc32 is stable across processes and Python versions.
    """
    if not proxy_urls:
        return lambda key: None

    def select_proxy(key: str) -> str:
        # crc32 returns an unsigned 32-bit int — safe to take modulo directly.
        return proxy_urls[zlib.crc32(key.encode("utf-8")) % len(proxy_urls)]

    return select_proxy
|
||||
Reference in New Issue
Block a user