merge: proxy-pinned UA identities + honest bot UA for public APIs
# Conflicts: # extract/padelnomics_extract/src/padelnomics_extract/_shared.py
This commit is contained in:
@@ -28,12 +28,35 @@ OVERPASS_MIRRORS = [
|
|||||||
|
|
||||||
_log = logging.getLogger(__name__)
|
_log = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Realistic browser User-Agent — avoids bot detection on all extractors
|
# Honest bot UA for public APIs (Overpass, Eurostat, GeoNames, etc.)
|
||||||
USER_AGENT = (
|
BOT_UA = "padelnomics-bot/1.0 (+https://padelnomics.com/bot)"
|
||||||
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
|
|
||||||
"AppleWebKit/537.36 (KHTML, like Gecko) "
|
# Realistic browser UAs for proxy-backed scraping — each proxy URL maps to one pool entry (several proxies may share an entry)
|
||||||
"Chrome/131.0.0.0 Safari/537.36"
|
# so each IP presents a consistent fingerprint across runs.
|
||||||
)
|
_UA_POOL = [
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.2 Safari/605.1.15",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:133.0) Gecko/20100101 Firefox/133.0",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64; rv:134.0) Gecko/20100101 Firefox/134.0",
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
|
||||||
|
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def ua_for_proxy(proxy_url: str | None) -> str:
    """Pick a user-agent for a given proxy URL.

    No proxy (None or empty string) → honest bot UA (for public APIs).
    With proxy → deterministic pick from _UA_POOL keyed on a SHA-256 digest
    of the proxy URL, so the same proxy URL always presents the same browser
    identity, including across separate interpreter runs.

    The built-in hash() is deliberately avoided: str hashes are salted per
    process unless PYTHONHASHSEED is pinned, which would reshuffle the
    proxy → UA pairing on every run.

    Args:
        proxy_url: Proxy URL the session will use, or None / "" when the
            request goes out directly.

    Returns:
        The User-Agent header value to send.
    """
    # Local import keeps the block self-contained; hashlib is stdlib.
    import hashlib

    # Callers only configure a proxy when proxy_url is truthy, so an empty
    # string is a direct connection and must get the honest bot UA too.
    if not proxy_url:
        return BOT_UA

    digest = hashlib.sha256(proxy_url.encode("utf-8")).digest()
    # 8 bytes of the digest are plenty to spread URLs over the small pool.
    index = int.from_bytes(digest[:8], "big") % len(_UA_POOL)
    return _UA_POOL[index]
|
||||||
|
|
||||||
|
|
||||||
def post_overpass(
|
def post_overpass(
|
||||||
@@ -95,7 +118,7 @@ def run_extractor(
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
with niquests.Session() as session:
|
with niquests.Session() as session:
|
||||||
session.headers["User-Agent"] = USER_AGENT
|
session.headers["User-Agent"] = ua_for_proxy(proxy_url)
|
||||||
if proxy_url:
|
if proxy_url:
|
||||||
session.proxies = {"http": proxy_url, "https": proxy_url}
|
session.proxies = {"http": proxy_url, "https": proxy_url}
|
||||||
result = func(LANDING_DIR, year_month, conn, session)
|
result = func(LANDING_DIR, year_month, conn, session)
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import niquests
|
import niquests
|
||||||
|
|
||||||
from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
|
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
||||||
from .proxy import load_fallback_proxy_urls, load_proxy_urls, make_tiered_cycler
|
from .proxy import load_fallback_proxy_urls, load_proxy_urls, make_tiered_cycler
|
||||||
from .utils import (
|
from .utils import (
|
||||||
compress_jsonl_atomic,
|
compress_jsonl_atomic,
|
||||||
@@ -121,7 +121,7 @@ def _get_thread_session(proxy_url: str | None) -> niquests.Session:
|
|||||||
"""Get or create a thread-local niquests.Session with optional proxy."""
|
"""Get or create a thread-local niquests.Session with optional proxy."""
|
||||||
if not hasattr(_thread_local, "session") or _thread_local.session is None:
|
if not hasattr(_thread_local, "session") or _thread_local.session is None:
|
||||||
session = niquests.Session()
|
session = niquests.Session()
|
||||||
session.headers["User-Agent"] = USER_AGENT
|
session.headers["User-Agent"] = ua_for_proxy(proxy_url)
|
||||||
if proxy_url:
|
if proxy_url:
|
||||||
session.proxies = {"http": proxy_url, "https": proxy_url}
|
session.proxies = {"http": proxy_url, "https": proxy_url}
|
||||||
_thread_local.session = session
|
_thread_local.session = session
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ from pathlib import Path
|
|||||||
|
|
||||||
import niquests
|
import niquests
|
||||||
|
|
||||||
from ._shared import HTTP_TIMEOUT_SECONDS, USER_AGENT, run_extractor, setup_logging
|
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
||||||
from .proxy import load_proxy_urls, make_round_robin_cycler
|
from .proxy import load_proxy_urls, make_round_robin_cycler
|
||||||
from .utils import compress_jsonl_atomic, landing_path
|
from .utils import compress_jsonl_atomic, landing_path
|
||||||
|
|
||||||
@@ -49,7 +49,7 @@ def _fetch_one_page(proxy_url: str | None, page: int) -> tuple[int, list[dict]]:
|
|||||||
Returns (page, tenants_list). Raises on HTTP error.
|
Returns (page, tenants_list). Raises on HTTP error.
|
||||||
"""
|
"""
|
||||||
s = niquests.Session()
|
s = niquests.Session()
|
||||||
s.headers["User-Agent"] = USER_AGENT
|
s.headers["User-Agent"] = ua_for_proxy(proxy_url)
|
||||||
if proxy_url:
|
if proxy_url:
|
||||||
s.proxies = {"http": proxy_url, "https": proxy_url}
|
s.proxies = {"http": proxy_url, "https": proxy_url}
|
||||||
params = {"sport_ids": "PADEL", "size": PAGE_SIZE, "page": page}
|
params = {"sport_ids": "PADEL", "size": PAGE_SIZE, "page": page}
|
||||||
|
|||||||
Reference in New Issue
Block a user