merge: standardise recheck availability to JSONL + update docs

This commit is contained in:
Deeman
2026-02-25 15:45:23 +01:00
6 changed files with 63 additions and 31 deletions

View File

@@ -37,7 +37,7 @@ src/padelnomics_extract/
```python
from ._shared import run_extractor, setup_logging
from .utils import landing_path, write_gzip_atomic
from .utils import compress_jsonl_atomic, landing_path
logger = setup_logging("padelnomics.extract.my_source")
EXTRACTOR_NAME = "my_source"
@@ -108,18 +108,23 @@ sqlite3 data/landing/.state.sqlite \
```
data/landing/
├── .state.sqlite
├── overpass/{year}/{month}/courts.json.gz
├── overpass/{year}/{month}/courts.{jsonl,json}.gz
├── overpass_tennis/{year}/{month}/courts.{jsonl,json}.gz
├── eurostat/{year}/{month}/urb_cpop1.json.gz
├── eurostat/{year}/{month}/ilc_di03.json.gz
├── playtomic/{year}/{month}/tenants.json.gz
└── playtomic/{year}/{month}/availability_{date}.json.gz
├── geonames/{year}/{month}/cities_global.{jsonl,json}.gz
├── playtomic/{year}/{month}/tenants.{jsonl,json}.gz
├── playtomic/{year}/{month}/availability_{date}.{jsonl,json}.gz
└── playtomic/{year}/{month}/availability_{date}_recheck_{HH}.{jsonl,json}.gz
```
## Data sources
| Source | Module | Schedule | Notes |
|--------|--------|----------|-------|
| Overpass API | `overpass.py` | Daily | OSM padel courts, ~5K nodes |
| Overpass API (padel) | `overpass.py` | Daily | OSM padel courts, ~5K nodes; JSONL output |
| Overpass API (tennis) | `overpass_tennis.py` | Daily | OSM tennis courts, ~150K+ nodes; regional splits; JSONL output |
| Eurostat | `eurostat.py` | Daily (304 most runs) | urb_cpop1, ilc_di03 — etag dedup |
| Playtomic tenants | `playtomic_tenants.py` | Daily | ~8K venues, bounded pagination |
| Playtomic availability | `playtomic_availability.py` | Daily | Next-day slots, ~4.5h runtime |
| GeoNames | `geonames.py` | Daily | ~140K locations (pop ≥1K); JSONL output |
| Playtomic tenants | `playtomic_tenants.py` | Daily | ~14K venues, bounded pagination; JSONL output |
| Playtomic availability | `playtomic_availability.py` | Daily + recheck | Morning: next-day slots; recheck: near-real-time fill; JSONL output |

View File

@@ -17,7 +17,7 @@ Recheck mode: re-queries venues with slots starting within the next 90 minutes.
Writes a separate recheck file for more accurate occupancy measurement.
Landing: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}.jsonl.gz
Recheck: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}_recheck_{HH}.json.gz
Recheck: {LANDING_DIR}/playtomic/{year}/{month}/availability_{date}_recheck_{HH}.jsonl.gz
"""
import gzip
@@ -39,7 +39,6 @@ from .utils import (
flush_partial_batch,
landing_path,
load_partial_results,
write_gzip_atomic,
)
logger = setup_logging("padelnomics.extract.playtomic_availability")
@@ -510,25 +509,22 @@ def extract_recheck(
logger.error("Circuit open with no fallback — writing partial recheck results")
break
# Write recheck file
# Write recheck file as JSONL — one venue per line with metadata injected
recheck_hour = now.hour
year, month = year_month.split("/")
dest_dir = landing_path(landing_dir, "playtomic", year, month)
dest = dest_dir / f"availability_{target_date}_recheck_{recheck_hour:02d}.json.gz"
dest = dest_dir / f"availability_{target_date}_recheck_{recheck_hour:02d}.jsonl.gz"
captured_at = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ")
payload = json.dumps({
"date": target_date,
"captured_at_utc": captured_at,
"recheck_hour": recheck_hour,
"recheck_window_minutes": RECHECK_WINDOW_MINUTES,
"rechecked_tenant_ids": venues_to_recheck,
"venue_count": len(venues_data),
"venues_errored": venues_errored,
"venues": venues_data,
}).encode()
working_path = dest.with_suffix("").with_suffix(".working.jsonl")
with open(working_path, "w") as f:
for venue in venues_data:
venue["date"] = target_date
venue["captured_at_utc"] = captured_at
venue["recheck_hour"] = recheck_hour
f.write(json.dumps(venue, separators=(",", ":")) + "\n")
bytes_written = compress_jsonl_atomic(working_path, dest)
bytes_written = write_gzip_atomic(dest, payload)
logger.info(
"Recheck: %d/%d venues (%d errors) -> %s (%s bytes)",
len(venues_data), len(venues_to_recheck), venues_errored, dest, f"{bytes_written:,}",