diff --git a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py index ea95eca..f275990 100644 --- a/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py +++ b/extract/padelnomics_extract/src/padelnomics_extract/playtomic_tenants.py @@ -77,6 +77,10 @@ def extract( year, month = year_month.split("/") dest_dir = landing_path(landing_dir, "playtomic", year, month) dest = dest_dir / "tenants.jsonl.gz" + old_blob = dest_dir / "tenants.json.gz" + if dest.exists() or old_blob.exists(): + logger.info("Already have tenants for %s/%s — skipping", year, month) + return {"files_written": 0, "files_skipped": 1, "bytes_written": 0} proxy_urls = load_proxy_urls() next_proxy = make_round_robin_cycler(proxy_urls) if proxy_urls else None diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_opening_hours.sql b/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_opening_hours.sql index 42e7bf9..b6d6bcc 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_opening_hours.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_opening_hours.sql @@ -104,3 +104,6 @@ SELECT FROM unpivoted WHERE opening_time IS NOT NULL AND closing_time IS NOT NULL +-- Enforce grain: if both old blob and new JSONL exist for the same month, +-- the UNION ALL produces duplicate (tenant_id, day_of_week) pairs — deduplicate. +QUALIFY ROW_NUMBER() OVER (PARTITION BY tenant_id, day_of_week ORDER BY tenant_id) = 1 diff --git a/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_resources.sql b/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_resources.sql index b6f6353..6c9484f 100644 --- a/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_resources.sql +++ b/transform/sqlmesh_padelnomics/models/staging/stg_playtomic_resources.sql @@ -68,3 +68,6 @@ SELECT FROM unnested WHERE (resource_json ->> 'resource_id') IS NOT NULL AND (resource_json ->> 'sport_id') = 'PADEL' +-- Enforce grain: if both old blob and new JSONL exist for the same month, +-- the UNION ALL produces duplicate (tenant_id, resource_id) pairs — deduplicate. +QUALIFY ROW_NUMBER() OVER (PARTITION BY tenant_id, resource_json ->> 'resource_id' ORDER BY tenant_id) = 1