fix(extract): use tiered cycler in playtomic_tenants
Previously the tenants extractor flattened all proxy tiers into a single round-robin list, bypassing the circuit breaker entirely. When the free Webshare tier runs out of bandwidth (402), all 20 free proxies fail and the batch crashes — the paid datacenter/residential proxies are never tried. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -10,11 +10,11 @@ API notes (discovered 2026-02):
|
||||
- `size=100` is the maximum effective page size
|
||||
- ~14K venues globally as of Feb 2026
|
||||
|
||||
Parallel mode: when PROXY_URLS is set, fires batch_size = len(proxy_urls)
|
||||
pages concurrently. Each page gets its own fresh session + proxy. Pages beyond
|
||||
the last one return empty lists (safe — just triggers the done condition).
|
||||
Without proxies, falls back to single-threaded with THROTTLE_SECONDS between
|
||||
pages.
|
||||
Parallel mode: when proxy tiers are configured, fires BATCH_SIZE pages
|
||||
concurrently. Each page gets its own fresh session + proxy from the tiered
|
||||
cycler. On failure the cycler escalates through free → datacenter →
|
||||
residential tiers. Without proxies, falls back to single-threaded with
|
||||
THROTTLE_SECONDS between pages.
|
||||
|
||||
Rate: 1 req / 2 s per IP (see docs/data-sources-inventory.md §1.2).
|
||||
|
||||
@@ -22,6 +22,7 @@ Landing: {LANDING_DIR}/playtomic/{year}/{month}/tenants.jsonl.gz
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import time
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
@@ -31,7 +32,7 @@ from pathlib import Path
|
||||
import niquests
|
||||
|
||||
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
||||
from .proxy import load_proxy_tiers, make_round_robin_cycler
|
||||
from .proxy import load_proxy_tiers, make_tiered_cycler
|
||||
from .utils import compress_jsonl_atomic, landing_path
|
||||
|
||||
logger = setup_logging("padelnomics.extract.playtomic_tenants")
|
||||
@@ -42,6 +43,9 @@ PLAYTOMIC_TENANTS_URL = "https://api.playtomic.io/v1/tenants"
|
||||
THROTTLE_SECONDS = 2
|
||||
PAGE_SIZE = 100
|
||||
MAX_PAGES = 500 # safety bound — ~50K venues max, well above current ~14K
|
||||
BATCH_SIZE = 20 # concurrent pages per batch (fixed, independent of proxy count)
|
||||
CIRCUIT_BREAKER_THRESHOLD = int(os.environ.get("CIRCUIT_BREAKER_THRESHOLD") or "10")
|
||||
MAX_PAGE_ATTEMPTS = 5 # max retries per individual page before giving up
|
||||
|
||||
|
||||
def _fetch_one_page(proxy_url: str | None, page: int) -> tuple[int, list[dict]]:
|
||||
@@ -61,22 +65,57 @@ def _fetch_one_page(proxy_url: str | None, page: int) -> tuple[int, list[dict]]:
|
||||
return (page, tenants)
|
||||
|
||||
|
||||
def _fetch_pages_parallel(pages: list[int], next_proxy) -> list[tuple[int, list[dict]]]:
|
||||
"""Fetch multiple pages concurrently. Returns [(page_num, tenants_list), ...]."""
|
||||
def _fetch_page_via_cycler(cycler: dict, page: int) -> tuple[int, list[dict]]:
    """Fetch a single page, retrying across proxy tiers via the circuit breaker.

    Each attempt pulls the next proxy from the active tier and reports the
    outcome back to the cycler so it can escalate through tiers on repeated
    failure.

    Raises:
        RuntimeError: when every tier is exhausted, or when the per-page
            retry budget (MAX_PAGE_ATTEMPTS) runs out.
    """
    failure: Exception | None = None
    attempt = 0
    while attempt < MAX_PAGE_ATTEMPTS:
        proxy_url = cycler["next_proxy"]()
        if proxy_url is None:  # every tier has tripped its breaker
            raise RuntimeError(f"All proxy tiers exhausted fetching page {page}")
        try:
            payload = _fetch_one_page(proxy_url, page)
        except Exception as exc:
            failure = exc
            logger.warning(
                "Page %d attempt %d/%d failed (proxy=%s): %s",
                page,
                attempt + 1,
                MAX_PAGE_ATTEMPTS,
                proxy_url,
                exc,
            )
            cycler["record_failure"]()
            # If that failure tipped the last tier over, stop immediately
            # rather than burning the remaining attempts.
            if cycler["is_exhausted"]():
                raise RuntimeError(f"All proxy tiers exhausted fetching page {page}") from exc
        else:
            cycler["record_success"]()
            return payload
        attempt += 1
    raise RuntimeError(f"Page {page} failed after {MAX_PAGE_ATTEMPTS} attempts") from failure
|
||||
|
||||
|
||||
def _fetch_pages_parallel(pages: list[int], cycler: dict) -> list[tuple[int, list[dict]]]:
    """Fetch multiple pages concurrently using the tiered cycler.

    Returns [(page_num, tenants_list), ...]. Raises if any page exhausts all tiers.
    """
    # One worker per page; each worker retries independently via the cycler.
    with ThreadPoolExecutor(max_workers=len(pages)) as pool:
        pending = [pool.submit(_fetch_page_via_cycler, cycler, page_num) for page_num in pages]
        # Results come back in completion order; the caller sorts by page.
        return [future.result() for future in as_completed(pending)]
|
||||
|
||||
|
||||
def extract(
|
||||
landing_dir: Path,
|
||||
year_month: str, # noqa: ARG001 — unused; tenants uses ISO week partition instead
|
||||
year_month: str, # noqa: ARG001 — unused; tenants uses daily partition instead
|
||||
conn: sqlite3.Connection,
|
||||
session: niquests.Session,
|
||||
) -> dict:
|
||||
"""Fetch all Playtomic venues via global pagination. Returns run metrics.
|
||||
|
||||
Partitioned by ISO week (e.g. 2026/W09) so each weekly run produces a
|
||||
Partitioned by day (e.g. 2026/03/01) so each daily run produces a
|
||||
fresh file. _load_tenant_ids() in playtomic_availability globs across all
|
||||
partitions and picks the most recent one.
|
||||
"""
|
||||
@@ -89,12 +128,16 @@ def extract(
|
||||
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
|
||||
|
||||
tiers = load_proxy_tiers()
|
||||
all_proxies = [url for tier in tiers for url in tier]
|
||||
next_proxy = make_round_robin_cycler(all_proxies) if all_proxies else None
|
||||
batch_size = len(all_proxies) if all_proxies else 1
|
||||
cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD) if tiers else None
|
||||
batch_size = BATCH_SIZE if cycler else 1
|
||||
|
||||
if next_proxy:
|
||||
logger.info("Parallel mode: %d pages per batch (%d proxies across %d tier(s))", batch_size, len(all_proxies), len(tiers))
|
||||
if cycler:
|
||||
logger.info(
|
||||
"Parallel mode: %d pages/batch, %d tier(s), threshold=%d",
|
||||
batch_size,
|
||||
cycler["tier_count"](),
|
||||
CIRCUIT_BREAKER_THRESHOLD,
|
||||
)
|
||||
else:
|
||||
logger.info("Serial mode: 1 page at a time (no proxies)")
|
||||
|
||||
@@ -104,15 +147,33 @@ def extract(
|
||||
done = False
|
||||
|
||||
while not done and page < MAX_PAGES:
|
||||
if cycler and cycler["is_exhausted"]():
|
||||
logger.error(
|
||||
"All proxy tiers exhausted — stopping at page %d (%d venues collected)",
|
||||
page,
|
||||
len(all_tenants),
|
||||
)
|
||||
break
|
||||
|
||||
batch_end = min(page + batch_size, MAX_PAGES)
|
||||
pages_to_fetch = list(range(page, batch_end))
|
||||
|
||||
if next_proxy and len(pages_to_fetch) > 1:
|
||||
if cycler and len(pages_to_fetch) > 1:
|
||||
logger.info(
|
||||
"Fetching pages %d-%d in parallel (%d workers, total so far: %d)",
|
||||
page, batch_end - 1, len(pages_to_fetch), len(all_tenants),
|
||||
page,
|
||||
batch_end - 1,
|
||||
len(pages_to_fetch),
|
||||
len(all_tenants),
|
||||
)
|
||||
results = _fetch_pages_parallel(pages_to_fetch, next_proxy)
|
||||
try:
|
||||
results = _fetch_pages_parallel(pages_to_fetch, cycler)
|
||||
except RuntimeError:
|
||||
logger.error(
|
||||
"Proxy tiers exhausted mid-batch — writing partial results (%d venues)",
|
||||
len(all_tenants),
|
||||
)
|
||||
break
|
||||
else:
|
||||
# Serial: reuse the shared session, throttle between pages
|
||||
page_num = pages_to_fetch[0]
|
||||
@@ -126,7 +187,7 @@ def extract(
|
||||
)
|
||||
results = [(page_num, tenants)]
|
||||
|
||||
# Process pages in order so the done-detection on < PAGE_SIZE is deterministic
|
||||
# Process pages in order so done-detection on < PAGE_SIZE is deterministic
|
||||
for p, tenants in sorted(results):
|
||||
new_count = 0
|
||||
for tenant in tenants:
|
||||
@@ -137,7 +198,11 @@ def extract(
|
||||
new_count += 1
|
||||
|
||||
logger.info(
|
||||
"page=%d got=%d new=%d total=%d", p, len(tenants), new_count, len(all_tenants),
|
||||
"page=%d got=%d new=%d total=%d",
|
||||
p,
|
||||
len(tenants),
|
||||
new_count,
|
||||
len(all_tenants),
|
||||
)
|
||||
|
||||
# Last page — fewer than PAGE_SIZE results means we've exhausted the list
|
||||
@@ -146,7 +211,7 @@ def extract(
|
||||
break
|
||||
|
||||
page = batch_end
|
||||
if not next_proxy:
|
||||
if not cycler:
|
||||
time.sleep(THROTTLE_SECONDS)
|
||||
|
||||
# Write each tenant as a JSONL line, then compress atomically
|
||||
|
||||
Reference in New Issue
Block a user