diff --git a/extract/padelnomics_extract/src/padelnomics_extract/_shared.py b/extract/padelnomics_extract/src/padelnomics_extract/_shared.py
index be4ad1b..a7c2a38 100644
--- a/extract/padelnomics_extract/src/padelnomics_extract/_shared.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/_shared.py
@@ -19,6 +19,15 @@ LANDING_DIR = Path(os.environ.get("LANDING_DIR", "data/landing"))
 HTTP_TIMEOUT_SECONDS = 30
 OVERPASS_TIMEOUT_SECONDS = 90  # Overpass can be slow on global queries
 
+# Public mirrors running the same Overpass API software — tried in order on failure
+OVERPASS_MIRRORS = [
+    "https://overpass-api.de/api/interpreter",
+    "https://overpass.kumi.systems/api/interpreter",
+    "https://overpass.openstreetmap.ru/api/interpreter",
+]
+
+_log = logging.getLogger(__name__)
+
 # Realistic browser User-Agent — avoids bot detection on all extractors
 USER_AGENT = (
     "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
@@ -27,6 +36,37 @@ USER_AGENT = (
 )
 
 
+def post_overpass(
+    session: niquests.Session,
+    query: str,
+    timeout_seconds: int,
+) -> niquests.Response:
+    """POST an Overpass QL query, trying mirrors in order.
+
+    Each mirror in OVERPASS_MIRRORS gets one attempt with ``timeout_seconds``;
+    the first response that passes ``raise_for_status()`` is returned.
+
+    On all-mirrors-fail re-raises the last RequestException so callers with
+    retry logic (e.g. overpass_tennis) keep working unchanged. An empty mirror
+    list raises a RequestException as well.
+    """
+    last_exc: niquests.exceptions.RequestException | None = None
+    for url in OVERPASS_MIRRORS:
+        try:
+            resp = session.post(url, data={"data": query}, timeout=timeout_seconds)
+            resp.raise_for_status()
+            return resp
+        except niquests.exceptions.RequestException as exc:
+            _log.warning("Overpass mirror %s failed: %s — trying next", url, exc)
+            last_exc = exc
+    if last_exc is None:
+        # Only reachable with an empty mirror list. Raise explicitly instead of
+        # asserting: asserts are stripped under `python -O`, which would turn
+        # the final statement into `raise None` (a TypeError).
+        raise niquests.exceptions.RequestException("No Overpass mirrors configured")
+    raise last_exc
+
+
 def setup_logging(name: str) -> logging.Logger:
     """Configure and return a logger for the given extractor module."""
     logging.basicConfig(
diff --git a/extract/padelnomics_extract/src/padelnomics_extract/overpass.py b/extract/padelnomics_extract/src/padelnomics_extract/overpass.py
index 950dcdc..77cc749 100644
--- a/extract/padelnomics_extract/src/padelnomics_extract/overpass.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/overpass.py
@@ -11,13 +11,12 @@ from pathlib import Path
 
 import niquests
 
-from ._shared import OVERPASS_TIMEOUT_SECONDS, run_extractor, setup_logging
+from ._shared import OVERPASS_TIMEOUT_SECONDS, post_overpass, run_extractor, setup_logging
 from .utils import landing_path, write_gzip_atomic
 
 logger = setup_logging("padelnomics.extract.overpass")
 
 EXTRACTOR_NAME = "overpass"
-OVERPASS_URL = "https://overpass-api.de/api/interpreter"
 
 OVERPASS_QUERY = (
     "[out:json][timeout:60];\n"
@@ -41,13 +40,8 @@ def extract(
     dest_dir = landing_path(landing_dir, "overpass", year, month)
     dest = dest_dir / "courts.json.gz"
 
-    logger.info("POST %s", OVERPASS_URL)
-    resp = session.post(
-        OVERPASS_URL,
-        data={"data": OVERPASS_QUERY},
-        timeout=OVERPASS_TIMEOUT_SECONDS,
-    )
-    resp.raise_for_status()
+    logger.info("POST Overpass (with mirror fallback)")
+    resp = post_overpass(session, OVERPASS_QUERY, OVERPASS_TIMEOUT_SECONDS)
 
     size_bytes = len(resp.content)
     logger.info("%s bytes received", f"{size_bytes:,}")
diff --git a/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py b/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py
index e7f1c0f..cedcb2f 100644
--- a/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/overpass_tennis.py
@@ -21,13 +21,12 @@ from pathlib import Path
 
 import niquests
 
-from ._shared import run_extractor, setup_logging
+from ._shared import post_overpass, run_extractor, setup_logging
 from .utils import compress_jsonl_atomic, landing_path, load_partial_results
 
 logger = setup_logging("padelnomics.extract.overpass_tennis")
 
 EXTRACTOR_NAME = "overpass_tennis"
-OVERPASS_URL = "https://overpass-api.de/api/interpreter"
 
 # Each region is [south, west, north, east] — Overpass bbox format
 REGIONS = [
@@ -63,14 +62,9 @@ def _region_query(bbox: str) -> str:
 
 
 def _query_region(session: niquests.Session, region: dict) -> list[dict]:
-    """POST one regional Overpass query. Returns list of OSM elements."""
+    """POST one regional Overpass query, with mirror fallback. Returns OSM elements."""
     query = _region_query(region["bbox"])
-    resp = session.post(
-        OVERPASS_URL,
-        data={"data": query},
-        timeout=REGION_TIMEOUT_SECONDS,
-    )
-    resp.raise_for_status()
+    resp = post_overpass(session, query, REGION_TIMEOUT_SECONDS)
     return resp.json().get("elements", [])
 
 