diff --git a/extract/padelnomics_extract/src/padelnomics_extract/_shared.py b/extract/padelnomics_extract/src/padelnomics_extract/_shared.py
index be4ad1b..b27974d 100644
--- a/extract/padelnomics_extract/src/padelnomics_extract/_shared.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/_shared.py
@@ -19,12 +19,41 @@ LANDING_DIR = Path(os.environ.get("LANDING_DIR", "data/landing"))
 HTTP_TIMEOUT_SECONDS = 30
 OVERPASS_TIMEOUT_SECONDS = 90  # Overpass can be slow on global queries
 
-# Realistic browser User-Agent — avoids bot detection on all extractors
-USER_AGENT = (
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) "
-    "Chrome/131.0.0.0 Safari/537.36"
-)
+# Honest bot UA for public APIs (Overpass, Eurostat, GeoNames, etc.)
+BOT_UA = "padelnomics-bot/1.0 (+https://padelnomics.com/bot)"
+
+# Realistic browser UAs for proxy-backed scraping — paired 1:1 with proxy IPs
+# so each IP presents a consistent fingerprint across runs.
+_UA_POOL = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
+    "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+]
+
+
+def ua_for_proxy(proxy_url: str | None) -> str:
+    """Pick a user-agent for a given proxy URL.
+
+    No proxy → honest bot UA (for public APIs).
+    With proxy → deterministic pick from _UA_POOL keyed on a stable digest
+    of the proxy URL, so the same proxy IP always presents the same
+    browser identity across runs and machines.
+    """
+    import hashlib  # local: the file's import block is outside this hunk
+
+    if proxy_url is None:
+        return BOT_UA
+    # Built-in hash() is salted per process (PYTHONHASHSEED), so it would
+    # pick a different UA each run — a sha256 digest is stable everywhere.
+    digest = hashlib.sha256(proxy_url.encode("utf-8")).digest()
+    return _UA_POOL[digest[0] % len(_UA_POOL)]
 
 
 def setup_logging(name: str) -> logging.Logger:
@@ -62,7 +91,7 @@ def run_extractor(
 
     try:
         with niquests.Session() as session:
-            session.headers["User-Agent"] = USER_AGENT
+            session.headers["User-Agent"] = ua_for_proxy(proxy_url)
             if proxy_url:
                 session.proxies = {"http": proxy_url, "https": proxy_url}
             result = func(LANDING_DIR, year_month, conn, session)
diff --git a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py
index 63f1d4a..ff2d10f 100644
--- a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py
@@ -32,7 +32,7 @@ from pathlib import Path
 
 import niquests
 
-from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
+from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
 from .proxy import load_fallback_proxy_urls, load_proxy_urls, make_tiered_cycler
 from .utils import (
     compress_jsonl_atomic,
@@ -121,7 +121,7 @@ def _get_thread_session(proxy_url: str | None) -> niquests.Session:
     """Get or create a thread-local niquests.Session with optional proxy."""
     if not hasattr(_thread_local, "session") or _thread_local.session is None:
         session = niquests.Session()
-        session.headers["User-Agent"] = USER_AGENT
+        session.headers["User-Agent"] = ua_for_proxy(proxy_url)
         if proxy_url:
             session.proxies = {"http": proxy_url, "https": proxy_url}
         _thread_local.session = session
diff --git a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py
index ea95eca..c1eb8a7 100644
--- a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py
@@ -29,7 +29,7 @@ from pathlib import Path
 
 import niquests
 
-from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
+from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
 from .proxy import load_proxy_urls, make_round_robin_cycler
 from .utils import compress_jsonl_atomic, landing_path
 
@@ -49,7 +49,7 @@ def _fetch_one_page(proxy_url: str | None, page: int) -> tuple[int, list[dict]]:
     Returns (page, tenants_list). Raises on HTTP error.
     """
     s = niquests.Session()
-    s.headers["User-Agent"] = USER_AGENT
+    s.headers["User-Agent"] = ua_for_proxy(proxy_url)
     if proxy_url:
         s.proxies = {"http": proxy_url, "https": proxy_url}
     params = {"sport_ids": "PADEL", "size": PAGE_SIZE, "page": page}