Replace two-tier proxy setup (PROXY_URLS / PROXY_URLS_FALLBACK) with N-tier escalation: free → datacenter → residential. - proxy.py: fetch_webshare_proxies() auto-fetches the Webshare download API on each run (no more stale manually-copied lists). load_proxy_tiers() assembles tiers from WEBSHARE_DOWNLOAD_URL, PROXY_URLS_DATACENTER, PROXY_URLS_RESIDENTIAL. make_tiered_cycler() generalised to list[list[str]] with N-level escalation; is_fallback_active() replaced by is_exhausted(). Old load_proxy_urls() / load_fallback_proxy_urls() deleted. - playtomic_availability.py: both extract() and extract_recheck() use load_proxy_tiers() + generalised cycler. _fetch_venues_parallel fallback_urls param removed. All is_fallback_active() checks → is_exhausted(). - playtomic_tenants.py: flattens tiers for simple round-robin. - test_supervisor.py: TestLoadProxyUrls removed (function deleted). Added TestFetchWebshareProxies, TestLoadProxyTiers, TestTieredCyclerNTier (11 tests covering parse format, error handling, escalation, thread safety). 47 tests pass, ruff clean. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
230 lines
7.7 KiB
Python
"""Optional proxy rotation for parallel HTTP fetching.

Proxies are configured via environment variables. When unset, all functions
return None/no-op — extractors fall back to direct requests.

Three-tier escalation: free → datacenter → residential.
Tier 1 (free): WEBSHARE_DOWNLOAD_URL — auto-fetched from the Webshare API.
Tier 2 (datacenter): PROXY_URLS_DATACENTER — comma-separated paid datacenter proxies.
Tier 3 (residential): PROXY_URLS_RESIDENTIAL — comma-separated paid residential proxies.

Tiered circuit breaker:
The active tier is used until consecutive failures >= threshold, then escalation
moves to the next tier. Once all tiers are exhausted, is_exhausted() returns True.
Escalation is permanent for the duration of the run — no auto-recovery.
"""
|
|
|
|
import itertools
|
|
import logging
|
|
import os
|
|
import threading
|
|
import urllib.error
|
|
import urllib.request
|
|
|
|
# Module-level logger; handlers/levels are configured by the application.
logger = logging.getLogger(__name__)

# Cap on how many free-tier proxies we take from a Webshare download.
MAX_WEBSHARE_PROXIES = 20
# Socket timeout (seconds) for the Webshare download-API request.
WEBSHARE_FETCH_TIMEOUT_SECONDS = 10
# Upper bound on bytes read from the Webshare response body.
WEBSHARE_MAX_RESPONSE_BYTES = 1024 * 1024  # 1MB
|
|
|
|
|
|
def fetch_webshare_proxies(download_url: str, max_proxies: int = MAX_WEBSHARE_PROXIES) -> list[str]:
    """Fetch proxy list from the Webshare download API. Returns [] on any error.

    Expected line format: ip:port:username:password
    Converts to: http://username:password@ip:port

    Bounded: reads at most WEBSHARE_MAX_RESPONSE_BYTES, returns at most max_proxies.

    Args:
        download_url: Webshare "download list" endpoint URL. Must be non-empty.
        max_proxies: Upper bound on the number of proxy URLs returned. Must be > 0.

    Returns:
        List of ``http://user:pass@ip:port`` proxy URLs; [] on any fetch/decode error.

    Raises:
        ValueError: if download_url is empty or max_proxies is not positive.
    """
    # Validate with raises (not assert) so the checks survive `python -O`.
    if max_proxies <= 0:
        raise ValueError(f"max_proxies must be positive, got {max_proxies}")
    if not download_url:
        raise ValueError("download_url must not be empty")

    try:
        req = urllib.request.Request(
            download_url,
            headers={"User-Agent": "padelnomics-extract/1.0"},
        )
        with urllib.request.urlopen(req, timeout=WEBSHARE_FETCH_TIMEOUT_SECONDS) as resp:
            # Read one byte past the cap so truncation is detectable below.
            payload = resp.read(WEBSHARE_MAX_RESPONSE_BYTES + 1)
    except Exception as e:
        # Deliberate broad catch: any network/HTTP problem degrades to
        # "no free tier" rather than failing the whole run.
        logger.warning("Failed to fetch Webshare proxies: %s", e)
        return []

    truncated = len(payload) > WEBSHARE_MAX_RESPONSE_BYTES
    try:
        raw = payload[:WEBSHARE_MAX_RESPONSE_BYTES].decode("utf-8")
    except UnicodeDecodeError as e:
        logger.warning("Failed to fetch Webshare proxies: %s", e)
        return []

    lines = raw.splitlines()
    if truncated and lines:
        # The response hit the size cap, so the final line may have been cut
        # mid-proxy — drop it rather than emit a corrupt URL.
        lines.pop()

    urls = []
    for line in lines:
        line = line.strip()
        if not line:
            continue
        parts = line.split(":")
        if len(parts) != 4:
            logger.debug("Skipping malformed proxy line: %r", line)
            continue
        ip, port, username, password = parts
        urls.append(f"http://{username}:{password}@{ip}:{port}")
        if len(urls) >= max_proxies:
            break

    logger.info("Fetched %d proxies from Webshare", len(urls))
    return urls
|
|
|
|
|
|
def load_proxy_tiers() -> list[list[str]]:
    """Assemble proxy tiers in escalation order: free → datacenter → residential.

    Tier 1 (free): fetched from WEBSHARE_DOWNLOAD_URL if set.
    Tier 2 (datacenter): PROXY_URLS_DATACENTER (comma-separated).
    Tier 3 (residential): PROXY_URLS_RESIDENTIAL (comma-separated).

    Empty tiers are omitted. Returns [] if no proxies configured anywhere.
    """

    def _split_env(name: str) -> list[str]:
        # Comma-separated env var -> list of trimmed, non-empty URLs.
        return [part.strip() for part in os.environ.get(name, "").split(",") if part.strip()]

    candidate_tiers: list[list[str]] = []

    download_url = os.environ.get("WEBSHARE_DOWNLOAD_URL", "").strip()
    if download_url:
        candidate_tiers.append(fetch_webshare_proxies(download_url))

    candidate_tiers.append(_split_env("PROXY_URLS_DATACENTER"))
    candidate_tiers.append(_split_env("PROXY_URLS_RESIDENTIAL"))

    # Drop empty tiers so escalation only ever lands on usable proxy pools.
    return [tier for tier in candidate_tiers if tier]
|
|
|
|
|
|
def make_round_robin_cycler(proxy_urls: list[str]):
    """Thread-safe round-robin proxy cycler.

    Returns a callable: next_proxy() -> str | None
    Returns None-returning callable if no proxies configured.
    """
    if not proxy_urls:
        # No proxies configured — hand back a stub so callers need no
        # special-casing of the factory result.
        return lambda: None

    snapshot = tuple(proxy_urls)
    total = len(snapshot)
    position = 0
    guard = threading.Lock()

    def next_proxy() -> str:
        # Serialize the index bump so concurrent callers never skip or
        # repeat an entry.
        nonlocal position
        with guard:
            url = snapshot[position]
            position = (position + 1) % total
            return url

    return next_proxy
|
|
|
|
|
|
def make_sticky_selector(proxy_urls: list[str]):
    """Hash-based sticky proxy selector.

    Returns a callable: select_proxy(key: str) -> str | None
    The same key always maps to the same proxy (consistent hashing).
    Returns None-returning callable if no proxies configured.

    Args:
        proxy_urls: Candidate proxy URLs; may be empty.

    Returns:
        A callable mapping a string key deterministically to one proxy URL,
        or a callable that always returns None when proxy_urls is empty.
    """
    if not proxy_urls:
        return lambda key: None

    # Hoisted out of the closure: the original imported hashlib on every
    # select_proxy() call, paying a module-lookup on the hot path.
    import hashlib

    n = len(proxy_urls)

    def select_proxy(key: str) -> str:
        # MD5 is used only for fast, even key distribution — not security.
        digest = hashlib.md5(key.encode(), usedforsecurity=False).hexdigest()
        return proxy_urls[int(digest, 16) % n]

    return select_proxy
|
|
|
|
|
|
def make_tiered_cycler(tiers: list[list[str]], threshold: int) -> dict:
|
|
"""Thread-safe N-tier proxy cycler with circuit breaker.
|
|
|
|
Uses tiers[0] until consecutive failures >= threshold, then escalates
|
|
to tiers[1], then tiers[2], etc. Once all tiers are exhausted,
|
|
is_exhausted() returns True and next_proxy() returns None.
|
|
|
|
Failure counter resets on each escalation — the new tier gets a fresh start.
|
|
Once exhausted, further record_failure() calls are no-ops.
|
|
|
|
Returns a dict of callables:
|
|
next_proxy() -> str | None — URL from the active tier, or None
|
|
record_success() -> None — resets consecutive failure counter
|
|
record_failure() -> bool — True if just escalated to next tier
|
|
is_exhausted() -> bool — True if all tiers exhausted
|
|
active_tier_index() -> int — 0-based index of current tier
|
|
tier_count() -> int — total number of tiers
|
|
|
|
Edge cases:
|
|
Empty tiers list: next_proxy() always returns None, is_exhausted() True.
|
|
Single tier: behaves like the primary-only case, is_exhausted() after threshold.
|
|
"""
|
|
assert threshold > 0, f"threshold must be positive, got {threshold}"
|
|
assert isinstance(tiers, list), f"tiers must be a list, got {type(tiers)}"
|
|
|
|
lock = threading.Lock()
|
|
cycles = [itertools.cycle(t) for t in tiers]
|
|
state = {
|
|
"active_tier": 0,
|
|
"consecutive_failures": 0,
|
|
}
|
|
|
|
def next_proxy() -> str | None:
|
|
with lock:
|
|
idx = state["active_tier"]
|
|
if idx >= len(cycles):
|
|
return None
|
|
return next(cycles[idx])
|
|
|
|
def record_success() -> None:
|
|
with lock:
|
|
state["consecutive_failures"] = 0
|
|
|
|
def record_failure() -> bool:
|
|
"""Increment failure counter. Returns True if just escalated to next tier."""
|
|
with lock:
|
|
idx = state["active_tier"]
|
|
if idx >= len(tiers):
|
|
# Already exhausted — no-op
|
|
return False
|
|
state["consecutive_failures"] += 1
|
|
if state["consecutive_failures"] < threshold:
|
|
return False
|
|
# Threshold reached — escalate
|
|
state["consecutive_failures"] = 0
|
|
state["active_tier"] += 1
|
|
new_idx = state["active_tier"]
|
|
if new_idx < len(tiers):
|
|
logger.warning(
|
|
"Circuit open after %d consecutive failures — "
|
|
"escalating to proxy tier %d/%d",
|
|
threshold,
|
|
new_idx + 1,
|
|
len(tiers),
|
|
)
|
|
else:
|
|
logger.error(
|
|
"All %d proxy tier(s) exhausted after %d consecutive failures — "
|
|
"no more fallbacks",
|
|
len(tiers),
|
|
threshold,
|
|
)
|
|
return True
|
|
|
|
def is_exhausted() -> bool:
|
|
with lock:
|
|
return state["active_tier"] >= len(tiers)
|
|
|
|
def active_tier_index() -> int:
|
|
with lock:
|
|
return state["active_tier"]
|
|
|
|
def tier_count() -> int:
|
|
return len(tiers)
|
|
|
|
return {
|
|
"next_proxy": next_proxy,
|
|
"record_success": record_success,
|
|
"record_failure": record_failure,
|
|
"is_exhausted": is_exhausted,
|
|
"active_tier_index": active_tier_index,
|
|
"tier_count": tier_count,
|
|
}
|