feat(extract): daily tenant snapshots with date-based partition
- playtomic_tenants: partition by YYYY/MM/DD instead of ISO week;
schedule changed from weekly to daily in workflows.toml
- playtomic_availability: _load_tenant_ids now tries 3-level glob
(*/*/*/tenants.jsonl.gz) first for daily files, falls back to
2-level for old monthly/weekly data
A single combined glob with a reverse alphabetical sort would rank old monthly files above daily ones
(the 't' of "tenants.jsonl.gz" > a day digit such as '2' in ASCII), so the explicit fallback chain is required.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -76,7 +76,9 @@ def _load_tenant_ids(landing_dir: Path) -> list[str]:
|
|||||||
if not playtomic_dir.exists():
|
if not playtomic_dir.exists():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Prefer JSONL (new format), fall back to blob (old format)
|
# Prefer daily partition (YYYY/MM/DD), fall back to older monthly/weekly partitions
|
||||||
|
tenant_files = sorted(playtomic_dir.glob("*/*/*/tenants.jsonl.gz"), reverse=True)
|
||||||
|
if not tenant_files:
|
||||||
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True)
|
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True)
|
||||||
if not tenant_files:
|
if not tenant_files:
|
||||||
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True)
|
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True)
|
||||||
|
|||||||
@@ -81,12 +81,11 @@ def extract(
|
|||||||
partitions and picks the most recent one.
|
partitions and picks the most recent one.
|
||||||
"""
|
"""
|
||||||
today = datetime.now(UTC)
|
today = datetime.now(UTC)
|
||||||
year = today.strftime("%G") # ISO year (matches ISO week, differs from calendar year on week boundaries)
|
year, month, day = today.strftime("%Y"), today.strftime("%m"), today.strftime("%d")
|
||||||
week = today.strftime("W%V") # ISO week: W01 … W53
|
dest_dir = landing_path(landing_dir, "playtomic", year, month, day)
|
||||||
dest_dir = landing_path(landing_dir, "playtomic", year, week)
|
|
||||||
dest = dest_dir / "tenants.jsonl.gz"
|
dest = dest_dir / "tenants.jsonl.gz"
|
||||||
if dest.exists():
|
if dest.exists():
|
||||||
logger.info("Already have tenants for %s/%s — skipping", year, week)
|
logger.info("Already have tenants for %s/%s/%s — skipping", year, month, day)
|
||||||
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
|
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
|
||||||
|
|
||||||
tiers = load_proxy_tiers()
|
tiers = load_proxy_tiers()
|
||||||
@@ -162,7 +161,7 @@ def extract(
|
|||||||
"files_written": 1,
|
"files_written": 1,
|
||||||
"files_skipped": 0,
|
"files_skipped": 0,
|
||||||
"bytes_written": bytes_written,
|
"bytes_written": bytes_written,
|
||||||
"cursor_value": f"{year}/{week}",
|
"cursor_value": f"{year}/{month}/{day}",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ schedule = "monthly"
|
|||||||
|
|
||||||
[playtomic_tenants]
|
[playtomic_tenants]
|
||||||
module = "padelnomics_extract.playtomic_tenants"
|
module = "padelnomics_extract.playtomic_tenants"
|
||||||
schedule = "weekly"
|
schedule = "daily"
|
||||||
|
|
||||||
[playtomic_availability]
|
[playtomic_availability]
|
||||||
module = "padelnomics_extract.playtomic_availability"
|
module = "padelnomics_extract.playtomic_availability"
|
||||||
|
|||||||
Reference in New Issue
Block a user