fix(extract): default worker count to 200 when proxies configured
All checks were successful
CI / test (push) Successful in 49s
CI / tag (push) Successful in 3s

Previously, the worker count fell back to len(tiers[0]) (e.g. 10 for Webshare)
when PROXY_CONCURRENCY was unset. When proxy tiers are configured, the default
is now MAX_PROXY_CONCURRENCY=200, so single-URL rotating proxies (DC/residential)
run at full concurrency without needing an explicit env var. With no proxies
configured, the default remains 1 worker.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-28 18:06:55 +01:00
parent 710624f417
commit c1cdeec6be

View File

@@ -52,8 +52,7 @@ MAX_VENUES_PER_RUN = 20_000
MAX_RETRIES_PER_VENUE = 2 MAX_RETRIES_PER_VENUE = 2
RECHECK_WINDOW_MINUTES = int(os.environ.get("RECHECK_WINDOW_MINUTES", "30")) RECHECK_WINDOW_MINUTES = int(os.environ.get("RECHECK_WINDOW_MINUTES", "30"))
CIRCUIT_BREAKER_THRESHOLD = int(os.environ.get("CIRCUIT_BREAKER_THRESHOLD") or "10") CIRCUIT_BREAKER_THRESHOLD = int(os.environ.get("CIRCUIT_BREAKER_THRESHOLD") or "10")
# Override worker count — useful when tier 0 is a single rotating endpoint (DC/residential) # Worker count: defaults to MAX_PROXY_CONCURRENCY (200). Override via PROXY_CONCURRENCY env var.
# that supports many concurrent connections. Defaults to len(tiers[0]) when unset.
_PROXY_CONCURRENCY = os.environ.get("PROXY_CONCURRENCY", "").strip() _PROXY_CONCURRENCY = os.environ.get("PROXY_CONCURRENCY", "").strip()
MAX_PROXY_CONCURRENCY = 200 MAX_PROXY_CONCURRENCY = 200
@@ -300,8 +299,7 @@ def extract(
# Set up tiered proxy cycler with circuit breaker # Set up tiered proxy cycler with circuit breaker
tiers = load_proxy_tiers() tiers = load_proxy_tiers()
default_workers = len(tiers[0]) if tiers else 1 worker_count = min(int(_PROXY_CONCURRENCY), MAX_PROXY_CONCURRENCY) if _PROXY_CONCURRENCY else (MAX_PROXY_CONCURRENCY if tiers else 1)
worker_count = min(int(_PROXY_CONCURRENCY), MAX_PROXY_CONCURRENCY) if _PROXY_CONCURRENCY else default_workers
cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD) cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD)
start_min_str = start_min.strftime("%Y-%m-%dT%H:%M:%S") start_min_str = start_min.strftime("%Y-%m-%dT%H:%M:%S")
@@ -491,8 +489,7 @@ def extract_recheck(
# Set up tiered proxy cycler with circuit breaker # Set up tiered proxy cycler with circuit breaker
tiers = load_proxy_tiers() tiers = load_proxy_tiers()
default_workers = len(tiers[0]) if tiers else 1 worker_count = min(int(_PROXY_CONCURRENCY), MAX_PROXY_CONCURRENCY) if _PROXY_CONCURRENCY else (MAX_PROXY_CONCURRENCY if tiers else 1)
worker_count = min(int(_PROXY_CONCURRENCY), MAX_PROXY_CONCURRENCY) if _PROXY_CONCURRENCY else default_workers
cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD) cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD)
if worker_count > 1 and len(venues_to_recheck) > 10: if worker_count > 1 and len(venues_to_recheck) > 10: