Files
padelnomics/extract/padelnomics_extract/src/padelnomics_extract/proxy.py
Deeman adf22924f6 feat(extract): three-tier proxy system with Webshare auto-fetch
Replace two-tier proxy setup (PROXY_URLS / PROXY_URLS_FALLBACK) with
N-tier escalation: free → datacenter → residential.

- proxy.py: fetch_webshare_proxies() auto-fetches the Webshare download
  API on each run (no more stale manually-copied lists). load_proxy_tiers()
  assembles tiers from WEBSHARE_DOWNLOAD_URL, PROXY_URLS_DATACENTER,
  PROXY_URLS_RESIDENTIAL. make_tiered_cycler() generalised to list[list[str]]
  with N-level escalation; is_fallback_active() replaced by is_exhausted().
  Old load_proxy_urls() / load_fallback_proxy_urls() deleted.

- playtomic_availability.py: both extract() and extract_recheck() use
  load_proxy_tiers() + generalised cycler. _fetch_venues_parallel fallback_urls
  param removed. All is_fallback_active() checks → is_exhausted().

- playtomic_tenants.py: flattens tiers for simple round-robin.

- test_supervisor.py: TestLoadProxyUrls removed (function deleted).
  Added TestFetchWebshareProxies, TestLoadProxyTiers, TestTieredCyclerNTier
  (11 tests covering parse format, error handling, escalation, thread safety).

47 tests pass, ruff clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-02-28 16:57:07 +01:00

230 lines
7.7 KiB
Python

"""Optional proxy rotation for parallel HTTP fetching.
Proxies are configured via environment variables. When unset, all functions
return None/no-op — extractors fall back to direct requests.
Three-tier escalation: free → datacenter → residential.
Tier 1 (free): WEBSHARE_DOWNLOAD_URL — auto-fetched from Webshare API
Tier 2 (datacenter): PROXY_URLS_DATACENTER — comma-separated paid DC proxies
Tier 3 (residential): PROXY_URLS_RESIDENTIAL — comma-separated paid residential proxies
Tiered circuit breaker:
Active tier is used until consecutive failures >= threshold, then escalates
to the next tier. Once all tiers are exhausted, is_exhausted() returns True.
Escalation is permanent for the duration of the run — no auto-recovery.
"""
import itertools
import logging
import os
import threading
import urllib.error
import urllib.request
# Module-scoped logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# Default cap on how many proxies fetch_webshare_proxies() returns.
MAX_WEBSHARE_PROXIES = 20

# Timeout, in seconds, passed to urlopen() for the Webshare download request.
WEBSHARE_FETCH_TIMEOUT_SECONDS = 10

# Upper bound on bytes read from the Webshare response body (1 MiB).
WEBSHARE_MAX_RESPONSE_BYTES = 1 << 20
def fetch_webshare_proxies(download_url: str, max_proxies: int = MAX_WEBSHARE_PROXIES) -> list[str]:
    """Fetch the proxy list from the Webshare download API.

    Each non-blank response line is expected to look like
    ``ip:port:username:password`` and is converted to
    ``http://username:password@ip:port``. Username and password are
    percent-encoded so that reserved characters (``@``, ``/``, ``#``, ...)
    cannot corrupt the resulting URL; purely alphanumeric credentials come
    out unchanged.

    Args:
        download_url: Webshare download endpoint. Must be non-empty.
        max_proxies: Maximum number of proxy URLs to return. Must be > 0.

    Returns:
        Up to ``max_proxies`` proxy URLs in response order. Returns ``[]``
        when the request, read or UTF-8 decode fails for any reason (the
        failure is logged as a warning, never raised).

    Bounded: reads at most WEBSHARE_MAX_RESPONSE_BYTES from the response.
    """
    # Function-scope import: the module only imports urllib.request/error.
    from urllib.parse import quote

    assert max_proxies > 0, f"max_proxies must be positive, got {max_proxies}"
    assert download_url, "download_url must not be empty"

    try:
        req = urllib.request.Request(
            download_url,
            headers={"User-Agent": "padelnomics-extract/1.0"},
        )
        with urllib.request.urlopen(req, timeout=WEBSHARE_FETCH_TIMEOUT_SECONDS) as resp:
            raw = resp.read(WEBSHARE_MAX_RESPONSE_BYTES).decode("utf-8")
    except Exception as e:
        # Deliberate best-effort: proxies are optional, so any failure here
        # degrades to "no free tier" instead of aborting the run.
        logger.warning("Failed to fetch Webshare proxies: %s", e)
        return []

    urls: list[str] = []
    for line in raw.splitlines():
        line = line.strip()
        if not line:
            continue
        parts = line.split(":")
        if len(parts) != 4:
            logger.debug("Skipping malformed proxy line: %r", line)
            continue
        ip, port, username, password = parts
        if not ip or not port:
            # Four fields but no usable endpoint (e.g. ":::") — would
            # otherwise produce a junk URL like "http://:@:".
            logger.debug("Skipping malformed proxy line: %r", line)
            continue
        # safe="" so that "/" inside a credential is escaped as well.
        userinfo = f"{quote(username, safe='')}:{quote(password, safe='')}"
        urls.append(f"http://{userinfo}@{ip}:{port}")
        if len(urls) >= max_proxies:
            break

    logger.info("Fetched %d proxies from Webshare", len(urls))
    return urls
def load_proxy_tiers() -> list[list[str]]:
    """Build the proxy tiers from the environment, cheapest tier first.

    Order of the returned tiers:
      1. free        — fetched via fetch_webshare_proxies() when
                       WEBSHARE_DOWNLOAD_URL is set to a non-blank value
      2. datacenter  — PROXY_URLS_DATACENTER, comma-separated
      3. residential — PROXY_URLS_RESIDENTIAL, comma-separated

    Entries in the comma-separated variables are whitespace-stripped and
    blank entries are discarded. A tier that ends up empty is left out, so
    the result is ``[]`` when nothing is configured.
    """
    download_url = os.environ.get("WEBSHARE_DOWNLOAD_URL", "").strip()
    # Only hit the Webshare API when a download URL is actually configured.
    free_tier = fetch_webshare_proxies(download_url) if download_url else []

    paid_tiers = [
        [entry.strip() for entry in os.environ.get(name, "").split(",") if entry.strip()]
        for name in ("PROXY_URLS_DATACENTER", "PROXY_URLS_RESIDENTIAL")
    ]

    # Keep escalation order, drop tiers with no proxies.
    return [tier for tier in (free_tier, *paid_tiers) if tier]
def make_round_robin_cycler(proxy_urls: list[str]):
    """Build a lock-guarded round-robin picker over ``proxy_urls``.

    Returns a zero-argument callable. With at least one proxy it hands out
    the URLs in order, wrapping around indefinitely; each call advances the
    shared rotation under a lock so concurrent callers never race on it.
    With no proxies the callable always returns None.
    """
    if proxy_urls:
        rotation = itertools.cycle(proxy_urls)
        guard = threading.Lock()

        def next_proxy() -> str:
            # Advancing the shared iterator must be serialised across threads.
            with guard:
                return next(rotation)

        return next_proxy

    # Nothing configured: callers get None and fall back to direct requests.
    return lambda: None
def make_sticky_selector(proxy_urls: list[str]):
    """Build a key -> proxy selector that is stable per key.

    Returns a callable ``select_proxy(key: str)``. The key's MD5 digest,
    read as an integer, is reduced modulo the number of proxies captured at
    construction time, so a given key always lands on the same list index.
    MD5 is used purely for bucketing, not for security.
    With no proxies the callable returns None for every key.
    """
    if proxy_urls:
        pool_size = len(proxy_urls)

        def select_proxy(key: str) -> str:
            import hashlib

            digest = hashlib.md5(key.encode(), usedforsecurity=False).digest()
            # Big-endian digest bytes give the same integer as the hex digest.
            return proxy_urls[int.from_bytes(digest, "big") % pool_size]

        return select_proxy

    return lambda key: None
def make_tiered_cycler(tiers: list[list[str]], threshold: int) -> dict:
"""Thread-safe N-tier proxy cycler with circuit breaker.
Uses tiers[0] until consecutive failures >= threshold, then escalates
to tiers[1], then tiers[2], etc. Once all tiers are exhausted,
is_exhausted() returns True and next_proxy() returns None.
Failure counter resets on each escalation — the new tier gets a fresh start.
Once exhausted, further record_failure() calls are no-ops.
Returns a dict of callables:
next_proxy() -> str | None — URL from the active tier, or None
record_success() -> None — resets consecutive failure counter
record_failure() -> bool — True if just escalated to next tier
is_exhausted() -> bool — True if all tiers exhausted
active_tier_index() -> int — 0-based index of current tier
tier_count() -> int — total number of tiers
Edge cases:
Empty tiers list: next_proxy() always returns None, is_exhausted() True.
Single tier: behaves like the primary-only case, is_exhausted() after threshold.
"""
assert threshold > 0, f"threshold must be positive, got {threshold}"
assert isinstance(tiers, list), f"tiers must be a list, got {type(tiers)}"
lock = threading.Lock()
cycles = [itertools.cycle(t) for t in tiers]
state = {
"active_tier": 0,
"consecutive_failures": 0,
}
def next_proxy() -> str | None:
with lock:
idx = state["active_tier"]
if idx >= len(cycles):
return None
return next(cycles[idx])
def record_success() -> None:
with lock:
state["consecutive_failures"] = 0
def record_failure() -> bool:
"""Increment failure counter. Returns True if just escalated to next tier."""
with lock:
idx = state["active_tier"]
if idx >= len(tiers):
# Already exhausted — no-op
return False
state["consecutive_failures"] += 1
if state["consecutive_failures"] < threshold:
return False
# Threshold reached — escalate
state["consecutive_failures"] = 0
state["active_tier"] += 1
new_idx = state["active_tier"]
if new_idx < len(tiers):
logger.warning(
"Circuit open after %d consecutive failures — "
"escalating to proxy tier %d/%d",
threshold,
new_idx + 1,
len(tiers),
)
else:
logger.error(
"All %d proxy tier(s) exhausted after %d consecutive failures — "
"no more fallbacks",
len(tiers),
threshold,
)
return True
def is_exhausted() -> bool:
with lock:
return state["active_tier"] >= len(tiers)
def active_tier_index() -> int:
with lock:
return state["active_tier"]
def tier_count() -> int:
return len(tiers)
return {
"next_proxy": next_proxy,
"record_success": record_success,
"record_failure": record_failure,
"is_exhausted": is_exhausted,
"active_tier_index": active_tier_index,
"tier_count": tier_count,
}