feat(extract): proxy-pinned UA identities + honest bot UA for public APIs

Replace single hardcoded Chrome 131 UA with:
- BOT_UA: honest padelnomics-bot UA for Overpass, Eurostat, GeoNames, etc.
- _UA_POOL + ua_for_proxy(): deterministic browser UA per proxy URL so each
  IP presents a consistent, distinct fingerprint across runs.

Public-API extractors (shared session, no proxy) now send BOT_UA.
Playtomic extractors (proxy-backed) each get a pool UA keyed on a stable
hash of their proxy URL.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Author: Deeman
Date:   2026-02-25 22:08:00 +01:00
Parent: b33dd51d76
Commit: c5b46376af
3 changed files with 34 additions and 11 deletions


@@ -19,12 +19,35 @@ LANDING_DIR = Path(os.environ.get("LANDING_DIR", "data/landing"))

 HTTP_TIMEOUT_SECONDS = 30
 OVERPASS_TIMEOUT_SECONDS = 90  # Overpass can be slow on global queries

-# Realistic browser User-Agent — avoids bot detection on all extractors
-USER_AGENT = (
-    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
-    "AppleWebKit/537.36 (KHTML, like Gecko) "
-    "Chrome/131.0.0.0 Safari/537.36"
-)
+import hashlib  # stable digest for proxy-to-UA pinning; builtin hash() is salted per process
+
+# Honest bot UA for public APIs (Overpass, Eurostat, GeoNames, etc.)
+BOT_UA = "padelnomics-bot/1.0 (+https://padelnomics.com/bot)"
+
+# Realistic browser UAs for proxy-backed scraping — paired 1:1 with proxy IPs
+# so each IP presents a consistent fingerprint across runs.
+_UA_POOL = [
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
+    "Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
+    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
+]
+
+
+def ua_for_proxy(proxy_url: str | None) -> str:
+    """Pick a user-agent for a given proxy URL.
+
+    No proxy → honest bot UA (for public APIs).
+    With proxy → deterministic pick from _UA_POOL keyed on a stable hash of
+    the proxy URL, so the same proxy IP always presents the same browser
+    identity across runs. Builtin hash() is salted per process and therefore
+    not usable here.
+    """
+    if proxy_url is None:
+        return BOT_UA
+    digest = hashlib.sha256(proxy_url.encode("utf-8")).digest()
+    return _UA_POOL[digest[0] % len(_UA_POOL)]


 def setup_logging(name: str) -> logging.Logger:
@@ -62,7 +85,7 @@ def run_extractor(
     try:
         with niquests.Session() as session:
-            session.headers["User-Agent"] = USER_AGENT
+            session.headers["User-Agent"] = ua_for_proxy(proxy_url)
             if proxy_url:
                 session.proxies = {"http": proxy_url, "https": proxy_url}
             result = func(LANDING_DIR, year_month, conn, session)
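The pinning contract is easiest to see in isolation. A minimal sketch of the behavior, with a three-entry stand-in pool instead of the real ten UAs above:

import hashlib

BOT_UA = "padelnomics-bot/1.0 (+https://padelnomics.com/bot)"
_UA_POOL = ["ua-windows-chrome", "ua-mac-safari", "ua-linux-firefox"]  # stand-in pool

def ua_for_proxy(proxy_url: str | None) -> str:
    if proxy_url is None:
        return BOT_UA
    digest = hashlib.sha256(proxy_url.encode("utf-8")).digest()
    return _UA_POOL[digest[0] % len(_UA_POOL)]

# The same proxy maps to the same UA in every process on every run,
# because sha256 is not salted the way builtin hash() is.
assert ua_for_proxy("http://203.0.113.7:8080") == ua_for_proxy("http://203.0.113.7:8080")
# No proxy means the honest bot identity:
assert ua_for_proxy(None) == BOT_UA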


@@ -32,7 +32,7 @@ from pathlib import Path

 import niquests

-from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
+from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
 from .proxy import load_fallback_proxy_urls, load_proxy_urls, make_tiered_cycler
 from .utils import (
     compress_jsonl_atomic,
@@ -121,7 +121,7 @@ def _get_thread_session(proxy_url: str | None) -> niquests.Session:
     """Get or create a thread-local niquests.Session with optional proxy."""
     if not hasattr(_thread_local, "session") or _thread_local.session is None:
         session = niquests.Session()
-        session.headers["User-Agent"] = USER_AGENT
+        session.headers["User-Agent"] = ua_for_proxy(proxy_url)
         if proxy_url:
             session.proxies = {"http": proxy_url, "https": proxy_url}
         _thread_local.session = session
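For reference, a sketch of the thread-local session pattern this hunk touches. Only _get_thread_session mirrors the diff; the fetch function and executor wiring are illustrative assumptions, and ua_for_proxy is the helper added in _shared above.

import threading
from concurrent.futures import ThreadPoolExecutor

import niquests

_thread_local = threading.local()

def _get_thread_session(proxy_url: str | None) -> niquests.Session:
    # One session per worker thread, created lazily on first use.
    if getattr(_thread_local, "session", None) is None:
        session = niquests.Session()
        session.headers["User-Agent"] = ua_for_proxy(proxy_url)  # from ._shared
        if proxy_url:
            session.proxies = {"http": proxy_url, "https": proxy_url}
        _thread_local.session = session
    return _thread_local.session

def fetch_status(job: tuple[str | None, str]) -> int:
    proxy_url, url = job
    # Each thread reuses its one session, so its proxy/UA pairing stays
    # stable for the thread's whole lifetime.
    return _get_thread_session(proxy_url).get(url, timeout=30).status_code

jobs = [("http://203.0.113.7:8080", "https://example.com/a"),
        ("http://203.0.113.7:8080", "https://example.com/b")]
with ThreadPoolExecutor(max_workers=4) as pool:
    statuses = list(pool.map(fetch_status, jobs))

Note that the session is built from the proxy_url seen on a thread's first call; a thread later handed a different proxy would keep its old session. That is harmless as long as each worker stays pinned to one proxy for the run, which is exactly what the UA pinning assumes.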


@@ -29,7 +29,7 @@ from pathlib import Path

 import niquests

-from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
+from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
 from .proxy import load_proxy_urls, make_round_robin_cycler
 from .utils import compress_jsonl_atomic, landing_path
@@ -49,7 +49,7 @@ def _fetch_one_page(proxy_url: str | None, page: int) -> tuple[int, list[dict]]:
     Returns (page, tenants_list). Raises on HTTP error.
     """
     s = niquests.Session()
-    s.headers["User-Agent"] = USER_AGENT
+    s.headers["User-Agent"] = ua_for_proxy(proxy_url)
     if proxy_url:
         s.proxies = {"http": proxy_url, "https": proxy_url}
     params = {"sport_ids": "PADEL", "size": PAGE_SIZE, "page": page}
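A hedged sketch of how _fetch_one_page might be driven. The worker count is an assumption, and itertools.cycle stands in for make_round_robin_cycler, whose exact interface is not shown in this diff; this is not the extractor's actual driver code.

import itertools
from concurrent.futures import ThreadPoolExecutor

def fetch_all_tenants(proxy_urls: list[str], n_pages: int) -> list[dict]:
    # Round-robin: page i goes out through proxy i % len(proxy_urls),
    # so load spreads evenly and each proxy keeps its pinned UA.
    cycler = itertools.cycle(proxy_urls)
    with ThreadPoolExecutor(max_workers=8) as pool:
        futures = [pool.submit(_fetch_one_page, next(cycler), page)
                   for page in range(n_pages)]
        # _fetch_one_page returns (page, tenants) and raises on HTTP error;
        # sort by page to restore a deterministic order after parallel fetch.
        results = sorted((f.result() for f in futures), key=lambda r: r[0])
    return [tenant for _, tenants in results for tenant in tenants]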