From beb4195f16d089bf7f4dd56f0933749b6e325cc6 Mon Sep 17 00:00:00 2001 From: Deeman Date: Sat, 28 Feb 2026 17:27:16 +0100 Subject: [PATCH] feat(extract): daily tenant snapshots with date-based partition - playtomic_tenants: partition by YYYY/MM/DD instead of ISO week; schedule changed from weekly to daily in workflows.toml - playtomic_availability: _load_tenant_ids now tries 3-level glob (*/*/*/tenants.jsonl.gz) first for daily files, falls back to 2-level for old monthly/weekly data Alphabetical sort would rank old monthly files above daily ones ('t' > '2' in ASCII), so the explicit fallback chain is required. Co-Authored-By: Claude Sonnet 4.6 --- .../src/padelnomics_extract/playtomic_availability.py | 6 ++++-- .../src/padelnomics_extract/playtomic_tenants.py | 9 ++++----- infra/supervisor/workflows.toml | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py index d708a61..5274ed1 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_availability.py @@ -76,8 +76,10 @@ def _load_tenant_ids(landing_dir: Path) -> list[str]: if not playtomic_dir.exists(): return [] - # Prefer JSONL (new format), fall back to blob (old format) - tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True) + # Prefer daily partition (YYYY/MM/DD), fall back to older monthly/weekly partitions + tenant_files = sorted(playtomic_dir.glob("*/*/*/tenants.jsonl.gz"), reverse=True) + if not tenant_files: + tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True) if not tenant_files: tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True) if not tenant_files: diff --git a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py index c4c0d06..e09102b 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py @@ -81,12 +81,11 @@ def extract( partitions and picks the most recent one. """ today = datetime.now(UTC) - year = today.strftime("%G") # ISO year (matches ISO week, differs from calendar year on week boundaries) - week = today.strftime("W%V") # ISO week: W01 … W53 - dest_dir = landing_path(landing_dir, "playtomic", year, week) + year, month, day = today.strftime("%Y"), today.strftime("%m"), today.strftime("%d") + dest_dir = landing_path(landing_dir, "playtomic", year, month, day) dest = dest_dir / "tenants.jsonl.gz" if dest.exists(): - logger.info("Already have tenants for %s/%s — skipping", year, week) + logger.info("Already have tenants for %s/%s/%s — skipping", year, month, day) return {"files_written": 0, "files_skipped": 1, "bytes_written": 0} tiers = load_proxy_tiers() @@ -162,7 +161,7 @@ def extract( "files_written": 1, "files_skipped": 0, "bytes_written": bytes_written, - "cursor_value": f"{year}/{week}", + "cursor_value": f"{year}/{month}/{day}", } diff --git a/infra/supervisor/workflows.toml b/infra/supervisor/workflows.toml index 5f5c43d..3dcf055 100644 --- a/infra/supervisor/workflows.toml +++ b/infra/supervisor/workflows.toml @@ -23,7 +23,7 @@ schedule = "monthly" [playtomic_tenants] module = "padelnomics_extract.playtomic_tenants" -schedule = "weekly" +schedule = "daily" [playtomic_availability] module = "padelnomics_extract.playtomic_availability"