feat(extract): three-tier proxy system with Webshare auto-fetch

Replace two-tier proxy setup (PROXY_URLS / PROXY_URLS_FALLBACK) with
N-tier escalation: free → datacenter → residential.

- proxy.py: fetch_webshare_proxies() fetches the proxy list from the Webshare
  download API on each run (no more stale manually-copied lists). load_proxy_tiers()
  assembles tiers from WEBSHARE_DOWNLOAD_URL, PROXY_URLS_DATACENTER,
  PROXY_URLS_RESIDENTIAL. make_tiered_cycler() generalised to list[list[str]]
  with N-level escalation; is_fallback_active() replaced by is_exhausted().
  Old load_proxy_urls() / load_fallback_proxy_urls() deleted.

- playtomic_availability.py: both extract() and extract_recheck() use
  load_proxy_tiers() + generalised cycler. _fetch_venues_parallel fallback_urls
  param removed. All is_fallback_active() checks → is_exhausted().

- playtomic_tenants.py: flattens tiers for simple round-robin.

- test_supervisor.py: TestLoadProxyUrls removed (function deleted).
  Added TestFetchWebshareProxies, TestLoadProxyTiers, TestTieredCyclerNTier
  (23 tests covering parse format, error handling, tier assembly, escalation,
  thread safety).

47 tests pass, ruff clean.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-28 16:57:07 +01:00
parent 642041b32b
commit adf22924f6
5 changed files with 413 additions and 116 deletions

View File

@@ -24,9 +24,11 @@ sup = _ilu.module_from_spec(_spec)
_spec.loader.exec_module(sup)
from padelnomics_extract.proxy import ( # noqa: E402
load_proxy_urls,
fetch_webshare_proxies,
load_proxy_tiers,
make_round_robin_cycler,
make_sticky_selector,
make_tiered_cycler,
)
# ── load_workflows ────────────────────────────────────────────────
@@ -198,28 +200,112 @@ class TestTopologicalWaves:
# ── proxy.py ─────────────────────────────────────────────────────
class TestLoadProxyUrls:
def test_returns_empty_when_unset(self, monkeypatch):
monkeypatch.delenv("PROXY_URLS", raising=False)
assert load_proxy_urls() == []
class TestFetchWebshareProxies:
def test_parses_ip_port_user_pass_format(self):
raw = "1.2.3.4:1080:user1:pass1\n5.6.7.8:1080:user2:pass2\n"
with patch("urllib.request.urlopen") as mock_open:
mock_resp = MagicMock()
mock_resp.read.return_value = raw.encode("utf-8")
mock_resp.__enter__ = lambda s: s
mock_resp.__exit__ = MagicMock(return_value=False)
mock_open.return_value = mock_resp
urls = fetch_webshare_proxies("http://example.com/proxy-list")
assert urls == [
"http://user1:pass1@1.2.3.4:1080",
"http://user2:pass2@5.6.7.8:1080",
]
def test_parses_comma_separated_urls(self, monkeypatch):
monkeypatch.setenv(
"PROXY_URLS",
"http://p1:8080,http://p2:8080,http://p3:8080",
)
urls = load_proxy_urls()
assert urls == ["http://p1:8080", "http://p2:8080", "http://p3:8080"]
def test_network_error_returns_empty(self):
import urllib.error
with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")):
result = fetch_webshare_proxies("http://example.com/proxy-list")
assert result == []
def test_strips_whitespace(self, monkeypatch):
monkeypatch.setenv("PROXY_URLS", " http://p1:8080 , http://p2:8080 ")
urls = load_proxy_urls()
assert urls == ["http://p1:8080", "http://p2:8080"]
def test_malformed_lines_are_skipped(self):
raw = "bad_line\n1.2.3.4:1080:user:pass\nonly:three:parts\n"
with patch("urllib.request.urlopen") as mock_open:
mock_resp = MagicMock()
mock_resp.read.return_value = raw.encode("utf-8")
mock_resp.__enter__ = lambda s: s
mock_resp.__exit__ = MagicMock(return_value=False)
mock_open.return_value = mock_resp
urls = fetch_webshare_proxies("http://example.com/proxy-list")
assert urls == ["http://user:pass@1.2.3.4:1080"]
def test_ignores_empty_segments(self, monkeypatch):
monkeypatch.setenv("PROXY_URLS", "http://p1:8080,,http://p2:8080,")
urls = load_proxy_urls()
assert urls == ["http://p1:8080", "http://p2:8080"]
def test_max_proxies_respected(self):
lines = "\n".join(f"10.0.0.{i}:1080:u{i}:p{i}" for i in range(10))
with patch("urllib.request.urlopen") as mock_open:
mock_resp = MagicMock()
mock_resp.read.return_value = lines.encode("utf-8")
mock_resp.__enter__ = lambda s: s
mock_resp.__exit__ = MagicMock(return_value=False)
mock_open.return_value = mock_resp
urls = fetch_webshare_proxies("http://example.com/proxy-list", max_proxies=3)
assert len(urls) == 3
def test_empty_lines_skipped(self):
raw = "\n\n1.2.3.4:1080:user:pass\n\n"
with patch("urllib.request.urlopen") as mock_open:
mock_resp = MagicMock()
mock_resp.read.return_value = raw.encode("utf-8")
mock_resp.__enter__ = lambda s: s
mock_resp.__exit__ = MagicMock(return_value=False)
mock_open.return_value = mock_resp
urls = fetch_webshare_proxies("http://example.com/proxy-list")
assert urls == ["http://user:pass@1.2.3.4:1080"]
class TestLoadProxyTiers:
    """Tests for how load_proxy_tiers() assembles tiers from environment variables.

    Each test first wipes the three proxy variables, then sets only the ones
    it cares about, so results never depend on the ambient environment.
    """

    def _clear_proxy_env(self, monkeypatch):
        """Unset all three proxy-related environment variables."""
        monkeypatch.delenv("WEBSHARE_DOWNLOAD_URL", raising=False)
        monkeypatch.delenv("PROXY_URLS_DATACENTER", raising=False)
        monkeypatch.delenv("PROXY_URLS_RESIDENTIAL", raising=False)

    def test_returns_empty_when_all_unset(self, monkeypatch):
        """With no variables set, no tiers are produced."""
        self._clear_proxy_env(monkeypatch)
        loaded = load_proxy_tiers()
        assert loaded == []

    def test_single_datacenter_tier(self, monkeypatch):
        """A comma-separated datacenter list becomes a single tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080,http://dc2:8080")
        loaded = load_proxy_tiers()
        assert len(loaded) == 1
        first_tier = loaded[0]
        assert first_tier == ["http://dc1:8080", "http://dc2:8080"]

    def test_residential_only(self, monkeypatch):
        """A residential list on its own becomes the only tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
        loaded = load_proxy_tiers()
        assert len(loaded) == 1
        first_tier = loaded[0]
        assert first_tier == ["http://res1:8080"]

    def test_empty_tiers_skipped(self, monkeypatch):
        """A variable set to the empty string contributes no tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "")
        monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
        loaded = load_proxy_tiers()
        assert len(loaded) == 1
        first_tier = loaded[0]
        assert first_tier == ["http://res1:8080"]

    def test_three_tiers_correct_order(self, monkeypatch):
        """All three sources set: tiers come back as free, datacenter, residential."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("WEBSHARE_DOWNLOAD_URL", "http://example.com/list")
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080")
        monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
        # The Webshare fetch is stubbed so the test never touches the network.
        with patch(
            "padelnomics_extract.proxy.fetch_webshare_proxies",
            return_value=["http://user:pass@1.2.3.4:1080"],
        ):
            tiers = load_proxy_tiers()
        assert len(tiers) == 3
        expected_in_order = (
            ["http://user:pass@1.2.3.4:1080"],  # free
            ["http://dc1:8080"],  # datacenter
            ["http://res1:8080"],  # residential
        )
        for position, expected_tier in enumerate(expected_in_order):
            assert tiers[position] == expected_tier

    def test_webshare_fetch_failure_skips_tier(self, monkeypatch):
        """An empty result from the Webshare fetch leaves only the datacenter tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("WEBSHARE_DOWNLOAD_URL", "http://example.com/list")
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080")
        # The stub returns an empty list, standing in for a failed fetch.
        with patch("padelnomics_extract.proxy.fetch_webshare_proxies", return_value=[]):
            tiers = load_proxy_tiers()
        assert len(tiers) == 1
        assert tiers[0] == ["http://dc1:8080"]
class TestRoundRobinCycler:
@@ -279,3 +365,138 @@ class TestStickySelectorProxy:
fn = make_sticky_selector(urls)
for i in range(20):
assert fn(f"key_{i}") in urls
class TestTieredCyclerNTier:
    """Tests for make_tiered_cycler() escalating across any number of tiers.

    The cycler is used as a mapping of callables: "next_proxy",
    "record_failure", "record_success", "active_tier_index", "is_exhausted"
    and "tier_count".
    """

    @staticmethod
    def _fail(cycler, times):
        """Call record_failure `times` times in a row; return the last result."""
        outcome = None
        for _ in range(times):
            outcome = cycler["record_failure"]()
        return outcome

    def test_starts_on_first_tier(self):
        """A fresh cycler sits on tier 0 and serves proxies from it."""
        tiers = [["http://t0a", "http://t0b"], ["http://t1a"]]
        cycler = make_tiered_cycler(tiers, threshold=3)
        assert cycler["active_tier_index"]() == 0
        assert not cycler["is_exhausted"]()
        first_proxy = cycler["next_proxy"]()
        assert first_proxy in tiers[0]

    def test_escalates_after_threshold(self):
        """The failure that reaches the threshold moves to the next tier."""
        cycler = make_tiered_cycler([["http://t0"], ["http://t1"]], threshold=3)
        # Two failures — stays on tier 0
        self._fail(cycler, 2)
        assert cycler["active_tier_index"]() == 0
        # Third failure — escalates
        escalated = self._fail(cycler, 1)
        assert escalated is True
        assert cycler["active_tier_index"]() == 1
        assert cycler["next_proxy"]() == "http://t1"

    def test_escalates_through_all_tiers(self):
        """Exhausting each tier in turn ends with no proxy available."""
        cycler = make_tiered_cycler(
            [["http://t0"], ["http://t1"], ["http://t2"]], threshold=2
        )
        # Exhaust tier 0, then tier 1, checking where we land each time.
        for landed_on in (1, 2):
            self._fail(cycler, 2)
            assert cycler["active_tier_index"]() == landed_on
        # Exhaust tier 2
        self._fail(cycler, 2)
        assert cycler["is_exhausted"]()
        assert cycler["next_proxy"]() is None

    def test_success_resets_counter(self):
        """A success clears accumulated failures on the current tier."""
        cycler = make_tiered_cycler([["http://t0"], ["http://t1"]], threshold=3)
        self._fail(cycler, 2)
        cycler["record_success"]()
        # Counter reset — need threshold more failures to escalate
        self._fail(cycler, 2)
        assert cycler["active_tier_index"]() == 0  # still on tier 0
        self._fail(cycler, 1)
        assert cycler["active_tier_index"]() == 1  # now escalated

    def test_counter_resets_on_escalation(self):
        """Escalating gives the new tier a fresh failure count."""
        cycler = make_tiered_cycler(
            [["http://t0"], ["http://t1"], ["http://t2"]], threshold=2
        )
        # Exhaust tier 0
        self._fail(cycler, 2)
        assert cycler["active_tier_index"]() == 1
        # One failure on tier 1 — should NOT escalate yet (counter reset)
        self._fail(cycler, 1)
        assert cycler["active_tier_index"]() == 1
        # Second failure on tier 1 — escalates to tier 2
        self._fail(cycler, 1)
        assert cycler["active_tier_index"]() == 2

    def test_is_exhausted_false_when_tiers_remain(self):
        """Moving to a later tier does not count as exhaustion."""
        cycler = make_tiered_cycler([["http://t0"], ["http://t1"]], threshold=1)
        assert not cycler["is_exhausted"]()
        self._fail(cycler, 1)  # escalates to tier 1
        assert not cycler["is_exhausted"]()

    def test_is_exhausted_true_after_all_tiers_fail(self):
        """When the only tier fails out, the cycler is exhausted."""
        cycler = make_tiered_cycler([["http://t0"]], threshold=1)
        assert not cycler["is_exhausted"]()
        self._fail(cycler, 1)
        assert cycler["is_exhausted"]()
        assert cycler["next_proxy"]() is None

    def test_empty_tiers_immediately_exhausted(self):
        """With no tiers at all the cycler starts out exhausted."""
        cycler = make_tiered_cycler([], threshold=3)
        assert cycler["is_exhausted"]()
        assert cycler["next_proxy"]() is None
        assert cycler["tier_count"]() == 0

    def test_single_tier_cycles_within_tier(self):
        """Proxies of one tier are served round-robin in list order."""
        tiers = [["http://p1", "http://p2", "http://p3"]]
        cycler = make_tiered_cycler(tiers, threshold=10)
        served = []
        for _ in range(6):
            served.append(cycler["next_proxy"]())
        assert served == ["http://p1", "http://p2", "http://p3"] * 2

    def test_tier_count_reflects_input(self):
        """tier_count reports how many tiers were passed in."""
        cases = (
            ([], 0),
            ([["a"]], 1),
            ([["a"], ["b"], ["c"]], 3),
        )
        for tiers, expected_count in cases:
            assert make_tiered_cycler(tiers, threshold=1)["tier_count"]() == expected_count

    def test_record_failure_noop_when_exhausted(self):
        """Failures recorded after exhaustion return False and change nothing."""
        cycler = make_tiered_cycler([["http://t0"]], threshold=1)
        self._fail(cycler, 1)  # exhausts
        assert cycler["is_exhausted"]()
        # Further failures are no-ops, not exceptions
        result = self._fail(cycler, 1)
        assert result is False
        assert cycler["is_exhausted"]()

    def test_thread_safety(self):
        """Hammering the cycler from several threads must not raise."""
        import threading

        tiers = [["http://t0a", "http://t0b"], ["http://t1a", "http://t1b"]]
        cycler = make_tiered_cycler(tiers, threshold=5)
        errors = []
        errors_guard = threading.Lock()

        def worker():
            # Any exception is collected rather than lost inside the thread.
            try:
                for _ in range(20):
                    cycler["next_proxy"]()
                    cycler["record_failure"]()
                    cycler["record_success"]()
            except Exception as e:
                with errors_guard:
                    errors.append(e)

        threads = []
        for _ in range(8):
            threads.append(threading.Thread(target=worker))
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        assert errors == [], f"Thread safety errors: {errors}"