Adds OVERPASS_MIRRORS list (overpass-api.de, kumi.systems, openstreetmap.ru) and a post_overpass() helper in _shared.py that tries mirrors in order, logging a warning on each failure and re-raising the last RequestException if all mirrors fail. Both overpass.py and overpass_tennis.py now call post_overpass() instead of hard-coding the primary URL. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
118 lines
3.8 KiB
Python
118 lines
3.8 KiB
Python
"""Shared configuration and helpers for all extractors.
|
|
|
|
Each source module imports from here to get LANDING_DIR, the HTTP timeout
constants, logging setup, the post_overpass() mirror-fallback helper, and
the run_extractor() wrapper that handles state tracking boilerplate.
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import sys
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
|
|
import niquests
|
|
|
|
from .utils import end_run, open_state_db, start_run
|
|
|
|
# Module-level logger used by the helpers in this file.
_log = logging.getLogger(__name__)

# Root directory for landed files; the LANDING_DIR environment variable
# overrides the relative default.
LANDING_DIR = Path(os.getenv("LANDING_DIR", "data/landing"))

# Request timeouts, in seconds.
HTTP_TIMEOUT_SECONDS = 30
OVERPASS_TIMEOUT_SECONDS = 90  # Overpass can be slow on global queries

# Public mirrors running the same Overpass API software. Order matters:
# post_overpass() walks this list front to back and stops at the first success.
OVERPASS_MIRRORS = [
    "https://overpass-api.de/api/interpreter",
    "https://overpass.kumi.systems/api/interpreter",
    "https://overpass.openstreetmap.ru/api/interpreter",
]

# Browser-like User-Agent sent by the shared session; intended to keep the
# extractors from being flagged as bots.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/131.0.0.0 Safari/537.36"
)
|
|
|
|
|
|
def post_overpass(
    session: niquests.Session,
    query: str,
    timeout_seconds: int,
) -> niquests.Response:
    """POST an Overpass QL query, trying mirrors in order.

    The query is sent as the ``data`` form field to each URL in
    OVERPASS_MIRRORS, front to back. The first response that passes
    ``raise_for_status()`` is returned immediately; each failing mirror is
    logged at WARNING level and the next one is tried.

    session: the niquests session used to issue the POST.
    query: the Overpass QL query text.
    timeout_seconds: timeout passed to every individual POST.

    Raises the last mirror's RequestException when every mirror fails, so
    callers with retry logic (e.g. overpass_tennis) keep working unchanged.
    Raises RuntimeError when OVERPASS_MIRRORS is empty.
    """
    last_exc: Exception | None = None
    for url in OVERPASS_MIRRORS:
        try:
            resp = session.post(url, data={"data": query}, timeout=timeout_seconds)
            resp.raise_for_status()
            return resp
        except niquests.exceptions.RequestException as exc:
            _log.warning("Overpass mirror %s failed: %s — trying next", url, exc)
            # `exc` is unbound when the except clause ends, so keep our own
            # reference for the re-raise after the loop.
            last_exc = exc

    if last_exc is None:
        # Only reachable when the mirror list is empty. An explicit raise
        # (rather than an assert) survives `python -O` and avoids the
        # confusing TypeError that `raise None` would produce.
        raise RuntimeError("OVERPASS_MIRRORS is empty: no Overpass endpoint to query")
    raise last_exc
|
|
|
|
|
|
def setup_logging(name: str) -> logging.Logger:
    """Set up root logging to stdout at INFO level and return the logger called *name*.

    Records are formatted as "<timestamp> <logger name> <level> <message>"
    with a "YYYY-MM-DD HH:MM:SS" timestamp.
    """
    stdout_handler = logging.StreamHandler(sys.stdout)
    logging.basicConfig(
        handlers=[stdout_handler],
        level=logging.INFO,
        datefmt="%Y-%m-%d %H:%M:%S",
        format="%(asctime)s %(name)s %(levelname)s %(message)s",
    )
    return logging.getLogger(name)
|
|
|
|
|
|
def run_extractor(
    extractor_name: str,
    func,
    proxy_url: str | None = None,
) -> None:
    """Boilerplate wrapper: open state DB, start run, call func, end run.

    func signature: func(landing_dir, year_month, conn, session) -> dict
    The dict should contain: files_written, files_skipped, bytes_written
    (each is recorded as 0 when absent). Optional: cursor_value.

    year_month is the current UTC date formatted as "YYYY/MM". session is a
    niquests.Session with the shared USER_AGENT header already set.

    proxy_url: if set, configure the session proxy before calling func.
    Extractors that manage their own proxy logic (e.g. playtomic_availability)
    ignore the shared session and are unaffected.

    Any Exception raised by func, or while recording the successful result,
    marks the run as "failed" (message truncated to 500 characters) and is
    then re-raised. The state DB connection is closed in every case,
    including when start_run() itself raises.
    """
    LANDING_DIR.mkdir(parents=True, exist_ok=True)
    conn = open_state_db(LANDING_DIR)
    try:
        # start_run lives under the finally so a failure while registering
        # the run no longer leaks the open connection.
        run_id = start_run(conn, extractor_name)

        today = datetime.now(UTC)
        year_month = today.strftime("%Y/%m")

        try:
            with niquests.Session() as session:
                session.headers["User-Agent"] = USER_AGENT
                if proxy_url:
                    session.proxies = {"http": proxy_url, "https": proxy_url}
                result = func(LANDING_DIR, year_month, conn, session)

            # NOTE: stripped under `python -O`; a non-dict result then fails
            # on .get() below instead and is still recorded as a failed run.
            assert isinstance(result, dict), f"extractor must return a dict, got {type(result)}"
            end_run(
                conn,
                run_id,
                status="success",
                files_written=result.get("files_written", 0),
                files_skipped=result.get("files_skipped", 0),
                bytes_written=result.get("bytes_written", 0),
                cursor_value=result.get("cursor_value"),
            )
        except Exception as e:
            # Record the failure before propagating so the state DB reflects it.
            end_run(conn, run_id, status="failed", error_message=str(e)[:500])
            raise
    finally:
        conn.close()
|