feat(extract): three-tier proxy system with Webshare auto-fetch
Replace two-tier proxy setup (PROXY_URLS / PROXY_URLS_FALLBACK) with N-tier escalation: free → datacenter → residential. - proxy.py: fetch_webshare_proxies() auto-fetches the Webshare download API on each run (no more stale manually-copied lists). load_proxy_tiers() assembles tiers from WEBSHARE_DOWNLOAD_URL, PROXY_URLS_DATACENTER, PROXY_URLS_RESIDENTIAL. make_tiered_cycler() generalised to list[list[str]] with N-level escalation; is_fallback_active() replaced by is_exhausted(). Old load_proxy_urls() / load_fallback_proxy_urls() deleted. - playtomic_availability.py: both extract() and extract_recheck() use load_proxy_tiers() + generalised cycler. _fetch_venues_parallel fallback_urls param removed. All is_fallback_active() checks → is_exhausted(). - playtomic_tenants.py: flattens tiers for simple round-robin. - test_supervisor.py: TestLoadProxyUrls removed (function deleted). Added TestFetchWebshareProxies, TestLoadProxyTiers, TestTieredCyclerNTier (11 tests covering parse format, error handling, escalation, thread safety). 47 tests pass, ruff clean. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -24,9 +24,11 @@ sup = _ilu.module_from_spec(_spec)
|
||||
_spec.loader.exec_module(sup)
|
||||
|
||||
from padelnomics_extract.proxy import ( # noqa: E402
|
||||
load_proxy_urls,
|
||||
fetch_webshare_proxies,
|
||||
load_proxy_tiers,
|
||||
make_round_robin_cycler,
|
||||
make_sticky_selector,
|
||||
make_tiered_cycler,
|
||||
)
|
||||
|
||||
# ── load_workflows ────────────────────────────────────────────────
|
||||
@@ -198,28 +200,112 @@ class TestTopologicalWaves:
|
||||
# ── proxy.py ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
class TestLoadProxyUrls:
    """Tests for load_proxy_urls()."""

    def test_returns_empty_when_unset(self, monkeypatch):
        """An absent PROXY_URLS variable yields an empty proxy list."""
        # Ensure the variable is not inherited from the outer environment.
        monkeypatch.delenv("PROXY_URLS", raising=False)
        result = load_proxy_urls()
        assert result == []
|
||||
class TestFetchWebshareProxies:
    """Tests for fetch_webshare_proxies()."""

    def test_parses_ip_port_user_pass_format(self):
        """Webshare's ip:port:user:pass lines are rewritten as proxy URLs."""
        body = "1.2.3.4:1080:user1:pass1\n5.6.7.8:1080:user2:pass2\n"
        with patch("urllib.request.urlopen") as mock_open:
            response = MagicMock()
            response.read.return_value = body.encode("utf-8")
            # MagicMock pre-wires the context-manager protocol; make
            # ``with urlopen(...) as resp`` yield the response itself.
            response.__enter__.return_value = response
            response.__exit__.return_value = False
            mock_open.return_value = response
            result = fetch_webshare_proxies("http://example.com/proxy-list")
        expected = [
            "http://user1:pass1@1.2.3.4:1080",
            "http://user2:pass2@5.6.7.8:1080",
        ]
        assert result == expected
|
||||
|
||||
def test_parses_comma_separated_urls(self, monkeypatch):
|
||||
monkeypatch.setenv(
|
||||
"PROXY_URLS",
|
||||
"http://p1:8080,http://p2:8080,http://p3:8080",
|
||||
)
|
||||
urls = load_proxy_urls()
|
||||
assert urls == ["http://p1:8080", "http://p2:8080", "http://p3:8080"]
|
||||
def test_network_error_returns_empty(self):
|
||||
import urllib.error
|
||||
with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")):
|
||||
result = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||
assert result == []
|
||||
|
||||
def test_strips_whitespace(self, monkeypatch):
|
||||
monkeypatch.setenv("PROXY_URLS", " http://p1:8080 , http://p2:8080 ")
|
||||
urls = load_proxy_urls()
|
||||
assert urls == ["http://p1:8080", "http://p2:8080"]
|
||||
def test_malformed_lines_are_skipped(self):
|
||||
raw = "bad_line\n1.2.3.4:1080:user:pass\nonly:three:parts\n"
|
||||
with patch("urllib.request.urlopen") as mock_open:
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = raw.encode("utf-8")
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
mock_open.return_value = mock_resp
|
||||
urls = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||
assert urls == ["http://user:pass@1.2.3.4:1080"]
|
||||
|
||||
def test_ignores_empty_segments(self, monkeypatch):
|
||||
monkeypatch.setenv("PROXY_URLS", "http://p1:8080,,http://p2:8080,")
|
||||
urls = load_proxy_urls()
|
||||
assert urls == ["http://p1:8080", "http://p2:8080"]
|
||||
def test_max_proxies_respected(self):
|
||||
lines = "\n".join(f"10.0.0.{i}:1080:u{i}:p{i}" for i in range(10))
|
||||
with patch("urllib.request.urlopen") as mock_open:
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = lines.encode("utf-8")
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
mock_open.return_value = mock_resp
|
||||
urls = fetch_webshare_proxies("http://example.com/proxy-list", max_proxies=3)
|
||||
assert len(urls) == 3
|
||||
|
||||
def test_empty_lines_skipped(self):
|
||||
raw = "\n\n1.2.3.4:1080:user:pass\n\n"
|
||||
with patch("urllib.request.urlopen") as mock_open:
|
||||
mock_resp = MagicMock()
|
||||
mock_resp.read.return_value = raw.encode("utf-8")
|
||||
mock_resp.__enter__ = lambda s: s
|
||||
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||
mock_open.return_value = mock_resp
|
||||
urls = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||
assert urls == ["http://user:pass@1.2.3.4:1080"]
|
||||
|
||||
|
||||
class TestLoadProxyTiers:
    """Tests for load_proxy_tiers() tier assembly from environment variables."""

    # Every variable that can contribute a tier.
    _ENV_VARS = (
        "WEBSHARE_DOWNLOAD_URL",
        "PROXY_URLS_DATACENTER",
        "PROXY_URLS_RESIDENTIAL",
    )

    def _clear_proxy_env(self, monkeypatch):
        """Remove all proxy-related variables so each test starts clean."""
        for name in self._ENV_VARS:
            monkeypatch.delenv(name, raising=False)

    def test_returns_empty_when_all_unset(self, monkeypatch):
        """With no proxy configuration at all there are no tiers."""
        self._clear_proxy_env(monkeypatch)
        assert load_proxy_tiers() == []

    def test_single_datacenter_tier(self, monkeypatch):
        """Only PROXY_URLS_DATACENTER set produces exactly one tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080,http://dc2:8080")
        tiers = load_proxy_tiers()
        assert tiers == [["http://dc1:8080", "http://dc2:8080"]]

    def test_residential_only(self, monkeypatch):
        """Only PROXY_URLS_RESIDENTIAL set also yields a single tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
        tiers = load_proxy_tiers()
        assert tiers == [["http://res1:8080"]]

    def test_empty_tiers_skipped(self, monkeypatch):
        """A variable set to the empty string does not create an empty tier."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "")
        monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
        tiers = load_proxy_tiers()
        assert tiers == [["http://res1:8080"]]

    def test_three_tiers_correct_order(self, monkeypatch):
        """All three sources set: tiers come back free → datacenter → residential."""
        self._clear_proxy_env(monkeypatch)
        webshare = ["http://user:pass@1.2.3.4:1080"]
        monkeypatch.setenv("WEBSHARE_DOWNLOAD_URL", "http://example.com/list")
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080")
        monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
        with patch("padelnomics_extract.proxy.fetch_webshare_proxies", return_value=webshare):
            tiers = load_proxy_tiers()
        assert tiers == [
            webshare,                # free (Webshare)
            ["http://dc1:8080"],     # datacenter
            ["http://res1:8080"],    # residential
        ]

    def test_webshare_fetch_failure_skips_tier(self, monkeypatch):
        """An empty Webshare fetch drops the free tier entirely."""
        self._clear_proxy_env(monkeypatch)
        monkeypatch.setenv("WEBSHARE_DOWNLOAD_URL", "http://example.com/list")
        monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080")
        with patch("padelnomics_extract.proxy.fetch_webshare_proxies", return_value=[]):
            tiers = load_proxy_tiers()
        assert tiers == [["http://dc1:8080"]]
|
||||
|
||||
|
||||
class TestRoundRobinCycler:
|
||||
@@ -279,3 +365,138 @@ class TestStickySelectorProxy:
|
||||
fn = make_sticky_selector(urls)
|
||||
for i in range(20):
|
||||
assert fn(f"key_{i}") in urls
|
||||
|
||||
|
||||
class TestTieredCyclerNTier:
    """Behavioural tests for make_tiered_cycler()'s N-level escalation."""

    @staticmethod
    def _fail(cycler, times):
        """Record *times* consecutive failures on *cycler*."""
        for _ in range(times):
            cycler["record_failure"]()

    def test_starts_on_first_tier(self):
        """A fresh cycler serves tier 0 and is not exhausted."""
        tiers = [["http://t0a", "http://t0b"], ["http://t1a"]]
        c = make_tiered_cycler(tiers, threshold=3)
        assert c["active_tier_index"]() == 0
        assert not c["is_exhausted"]()
        assert c["next_proxy"]() in tiers[0]

    def test_escalates_after_threshold(self):
        """The cycler moves to the next tier on the threshold-th failure."""
        c = make_tiered_cycler([["http://t0"], ["http://t1"]], threshold=3)
        self._fail(c, 2)
        assert c["active_tier_index"]() == 0   # below threshold: stay put
        assert c["record_failure"]() is True   # threshold hit: escalate
        assert c["active_tier_index"]() == 1
        assert c["next_proxy"]() == "http://t1"

    def test_escalates_through_all_tiers(self):
        """Repeated failures walk tier 0 → 1 → 2 → exhausted."""
        c = make_tiered_cycler([["http://t0"], ["http://t1"], ["http://t2"]], threshold=2)
        self._fail(c, 2)
        assert c["active_tier_index"]() == 1
        self._fail(c, 2)
        assert c["active_tier_index"]() == 2
        self._fail(c, 2)
        assert c["is_exhausted"]()
        assert c["next_proxy"]() is None

    def test_success_resets_counter(self):
        """A success wipes accumulated failures on the current tier."""
        c = make_tiered_cycler([["http://t0"], ["http://t1"]], threshold=3)
        self._fail(c, 2)
        c["record_success"]()
        self._fail(c, 2)
        assert c["active_tier_index"]() == 0   # counter was reset
        c["record_failure"]()
        assert c["active_tier_index"]() == 1   # threshold reached again

    def test_counter_resets_on_escalation(self):
        """Escalation grants the new tier a zeroed failure counter."""
        c = make_tiered_cycler([["http://t0"], ["http://t1"], ["http://t2"]], threshold=2)
        self._fail(c, 2)
        assert c["active_tier_index"]() == 1
        c["record_failure"]()
        assert c["active_tier_index"]() == 1   # fresh counter: one failure is not enough
        c["record_failure"]()
        assert c["active_tier_index"]() == 2

    def test_is_exhausted_false_when_tiers_remain(self):
        """Exhaustion only applies once there is no tier left to try."""
        c = make_tiered_cycler([["http://t0"], ["http://t1"]], threshold=1)
        assert not c["is_exhausted"]()
        c["record_failure"]()   # moves to the last tier
        assert not c["is_exhausted"]()

    def test_is_exhausted_true_after_all_tiers_fail(self):
        """Failing out of the final tier marks the cycler exhausted."""
        c = make_tiered_cycler([["http://t0"]], threshold=1)
        assert not c["is_exhausted"]()
        c["record_failure"]()
        assert c["is_exhausted"]()
        assert c["next_proxy"]() is None

    def test_empty_tiers_immediately_exhausted(self):
        """No tiers at all means exhausted from the very start."""
        c = make_tiered_cycler([], threshold=3)
        assert c["is_exhausted"]()
        assert c["next_proxy"]() is None
        assert c["tier_count"]() == 0

    def test_single_tier_cycles_within_tier(self):
        """Proxies within the active tier rotate round-robin."""
        c = make_tiered_cycler([["http://p1", "http://p2", "http://p3"]], threshold=10)
        seen = [c["next_proxy"]() for _ in range(6)]
        assert seen == ["http://p1", "http://p2", "http://p3"] * 2

    def test_tier_count_reflects_input(self):
        """tier_count() mirrors the number of tiers supplied."""
        for tiers, expected in ([], 0), ([["a"]], 1), ([["a"], ["b"], ["c"]], 3):
            assert make_tiered_cycler(tiers, threshold=1)["tier_count"]() == expected

    def test_record_failure_noop_when_exhausted(self):
        """Extra failures after exhaustion neither raise nor change state."""
        c = make_tiered_cycler([["http://t0"]], threshold=1)
        c["record_failure"]()   # exhausts the only tier
        assert c["is_exhausted"]()
        assert c["record_failure"]() is False   # silent no-op, no exception
        assert c["is_exhausted"]()

    def test_thread_safety(self):
        """Concurrent next_proxy and record calls do not raise or corrupt state."""
        import threading

        c = make_tiered_cycler(
            [["http://t0a", "http://t0b"], ["http://t1a", "http://t1b"]],
            threshold=5,
        )
        failures = []
        guard = threading.Lock()

        def hammer():
            try:
                for _ in range(20):
                    c["next_proxy"]()
                    c["record_failure"]()
                    c["record_success"]()
            except Exception as exc:
                with guard:
                    failures.append(exc)

        workers = [threading.Thread(target=hammer) for _ in range(8)]
        for w in workers:
            w.start()
        for w in workers:
            w.join()

        assert failures == [], f"Thread safety errors: {failures}"
|
||||
|
||||
Reference in New Issue
Block a user