feat(extract): daily tenant snapshots with date-based partition

- playtomic_tenants: partition by YYYY/MM/DD instead of ISO week;
  schedule changed from weekly to daily in workflows.toml
- playtomic_availability: _load_tenant_ids now tries the 3-level glob
  (*/*/*/tenants.jsonl.gz) first to pick up daily files, then falls
  back to the 2-level glob for old monthly/weekly data

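Merging both layouts into a single sorted list would not work: an
alphabetical sort ranks old monthly files above daily ones (e.g.
2026/02/tenants.jsonl.gz sorts after 2026/02/28/tenants.jsonl.gz
because 't' > '2' in ASCII), so the explicit fallback chain is required.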

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-28 17:27:16 +01:00
parent 9116625884
commit beb4195f16
3 changed files with 9 additions and 8 deletions

View File

@@ -76,7 +76,9 @@ def _load_tenant_ids(landing_dir: Path) -> list[str]:
if not playtomic_dir.exists(): if not playtomic_dir.exists():
return [] return []
# Prefer JSONL (new format), fall back to blob (old format) # Prefer daily partition (YYYY/MM/DD), fall back to older monthly/weekly partitions
tenant_files = sorted(playtomic_dir.glob("*/*/*/tenants.jsonl.gz"), reverse=True)
if not tenant_files:
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True) tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True)
if not tenant_files: if not tenant_files:
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True) tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True)

View File

@@ -81,12 +81,11 @@ def extract(
partitions and picks the most recent one. partitions and picks the most recent one.
""" """
today = datetime.now(UTC) today = datetime.now(UTC)
year = today.strftime("%G") # ISO year (matches ISO week, differs from calendar year on week boundaries) year, month, day = today.strftime("%Y"), today.strftime("%m"), today.strftime("%d")
week = today.strftime("W%V") # ISO week: W01 … W53 dest_dir = landing_path(landing_dir, "playtomic", year, month, day)
dest_dir = landing_path(landing_dir, "playtomic", year, week)
dest = dest_dir / "tenants.jsonl.gz" dest = dest_dir / "tenants.jsonl.gz"
if dest.exists(): if dest.exists():
logger.info("Already have tenants for %s/%s — skipping", year, week) logger.info("Already have tenants for %s/%s/%s — skipping", year, month, day)
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0} return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
tiers = load_proxy_tiers() tiers = load_proxy_tiers()
@@ -162,7 +161,7 @@ def extract(
"files_written": 1, "files_written": 1,
"files_skipped": 0, "files_skipped": 0,
"bytes_written": bytes_written, "bytes_written": bytes_written,
"cursor_value": f"{year}/{week}", "cursor_value": f"{year}/{month}/{day}",
} }

View File

@@ -23,7 +23,7 @@ schedule = "monthly"
[playtomic_tenants] [playtomic_tenants]
module = "padelnomics_extract.playtomic_tenants" module = "padelnomics_extract.playtomic_tenants"
schedule = "weekly" schedule = "daily"
[playtomic_availability] [playtomic_availability]
module = "padelnomics_extract.playtomic_availability" module = "padelnomics_extract.playtomic_availability"