Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
51d9aab4a0 | ||
|
|
85b6aa0d0a | ||
|
|
e62aad148b |
@@ -279,12 +279,18 @@ def web_code_changed() -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def current_deployed_tag() -> str | None:
|
def current_deployed_tag() -> str | None:
|
||||||
"""Return the tag currently checked out, or None if not on a tag."""
|
"""Return the highest-version tag pointing at HEAD, or None.
|
||||||
|
|
||||||
|
Uses the same sort order as latest_remote_tag() so that when multiple
|
||||||
|
tags point to the same commit (e.g. a date-based tag and a CI integer
|
||||||
|
tag), we always compare apples-to-apples.
|
||||||
|
"""
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
["git", "describe", "--tags", "--exact-match", "HEAD"],
|
["git", "tag", "--list", "--sort=-version:refname", "--points-at", "HEAD", "v*"],
|
||||||
capture_output=True, text=True, timeout=10,
|
capture_output=True, text=True, timeout=10,
|
||||||
)
|
)
|
||||||
return result.stdout.strip() or None
|
tags = result.stdout.strip().splitlines()
|
||||||
|
return tags[0] if tags else None
|
||||||
|
|
||||||
|
|
||||||
def latest_remote_tag() -> str | None:
|
def latest_remote_tag() -> str | None:
|
||||||
|
|||||||
@@ -3,11 +3,7 @@
|
|||||||
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
|
-- Broad coverage (140K+ locations) enables Gemeinde-level market intelligence.
|
||||||
-- One row per geoname_id (GeoNames stable numeric identifier).
|
-- One row per geoname_id (GeoNames stable numeric identifier).
|
||||||
--
|
--
|
||||||
-- Supports two landing formats (UNION ALL during migration):
|
-- Source: data/landing/geonames/{year}/{month}/cities_global.jsonl.gz
|
||||||
-- New: cities_global.jsonl.gz — one city per line, columns directly accessible
|
|
||||||
-- Old: cities_global.json.gz — {"rows": [...]} blob (UNNEST required)
|
|
||||||
--
|
|
||||||
-- Source: data/landing/geonames/{year}/{month}/cities_global.{jsonl,json}.gz
|
|
||||||
|
|
||||||
MODEL (
|
MODEL (
|
||||||
name staging.stg_population_geonames,
|
name staging.stg_population_geonames,
|
||||||
@@ -16,74 +12,29 @@ MODEL (
|
|||||||
grain geoname_id
|
grain geoname_id
|
||||||
);
|
);
|
||||||
|
|
||||||
WITH
|
|
||||||
-- New format: one city per JSONL line
|
|
||||||
jsonl_rows AS (
|
|
||||||
SELECT
|
|
||||||
TRY_CAST(geoname_id AS INTEGER) AS geoname_id,
|
|
||||||
city_name,
|
|
||||||
country_code,
|
|
||||||
TRY_CAST(lat AS DOUBLE) AS lat,
|
|
||||||
TRY_CAST(lon AS DOUBLE) AS lon,
|
|
||||||
admin1_code,
|
|
||||||
admin2_code,
|
|
||||||
TRY_CAST(population AS BIGINT) AS population,
|
|
||||||
TRY_CAST(ref_year AS INTEGER) AS ref_year,
|
|
||||||
CURRENT_DATE AS extracted_date
|
|
||||||
FROM read_json(
|
|
||||||
@LANDING_DIR || '/geonames/*/*/cities_global.jsonl.gz',
|
|
||||||
format = 'newline_delimited',
|
|
||||||
columns = {
|
|
||||||
geoname_id: 'INTEGER', city_name: 'VARCHAR', country_code: 'VARCHAR',
|
|
||||||
lat: 'DOUBLE', lon: 'DOUBLE', admin1_code: 'VARCHAR', admin2_code: 'VARCHAR',
|
|
||||||
population: 'BIGINT', ref_year: 'INTEGER'
|
|
||||||
}
|
|
||||||
)
|
|
||||||
WHERE geoname_id IS NOT NULL
|
|
||||||
),
|
|
||||||
-- Old format: {"rows": [...]} blob — kept for transition
|
|
||||||
blob_rows AS (
|
|
||||||
SELECT
|
|
||||||
TRY_CAST(row ->> 'geoname_id' AS INTEGER) AS geoname_id,
|
|
||||||
row ->> 'city_name' AS city_name,
|
|
||||||
row ->> 'country_code' AS country_code,
|
|
||||||
TRY_CAST(row ->> 'lat' AS DOUBLE) AS lat,
|
|
||||||
TRY_CAST(row ->> 'lon' AS DOUBLE) AS lon,
|
|
||||||
row ->> 'admin1_code' AS admin1_code,
|
|
||||||
row ->> 'admin2_code' AS admin2_code,
|
|
||||||
TRY_CAST(row ->> 'population' AS BIGINT) AS population,
|
|
||||||
TRY_CAST(row ->> 'ref_year' AS INTEGER) AS ref_year,
|
|
||||||
CURRENT_DATE AS extracted_date
|
|
||||||
FROM (
|
|
||||||
SELECT UNNEST(rows) AS row
|
|
||||||
FROM read_json(
|
|
||||||
@LANDING_DIR || '/geonames/*/*/cities_global.json.gz',
|
|
||||||
auto_detect = true,
|
|
||||||
maximum_object_size = 40000000
|
|
||||||
)
|
|
||||||
)
|
|
||||||
WHERE (row ->> 'geoname_id') IS NOT NULL
|
|
||||||
),
|
|
||||||
all_rows AS (
|
|
||||||
SELECT * FROM jsonl_rows
|
|
||||||
UNION ALL
|
|
||||||
SELECT * FROM blob_rows
|
|
||||||
)
|
|
||||||
SELECT
|
SELECT
|
||||||
geoname_id,
|
TRY_CAST(geoname_id AS INTEGER) AS geoname_id,
|
||||||
TRIM(city_name) AS city_name,
|
TRIM(city_name) AS city_name,
|
||||||
UPPER(country_code) AS country_code,
|
UPPER(country_code) AS country_code,
|
||||||
lat,
|
TRY_CAST(lat AS DOUBLE) AS lat,
|
||||||
lon,
|
TRY_CAST(lon AS DOUBLE) AS lon,
|
||||||
NULLIF(TRIM(admin1_code), '') AS admin1_code,
|
NULLIF(TRIM(admin1_code), '') AS admin1_code,
|
||||||
NULLIF(TRIM(admin2_code), '') AS admin2_code,
|
NULLIF(TRIM(admin2_code), '') AS admin2_code,
|
||||||
population,
|
TRY_CAST(population AS BIGINT) AS population,
|
||||||
ref_year,
|
TRY_CAST(ref_year AS INTEGER) AS ref_year,
|
||||||
extracted_date
|
CURRENT_DATE AS extracted_date
|
||||||
FROM all_rows
|
FROM read_json(
|
||||||
WHERE population IS NOT NULL
|
@LANDING_DIR || '/geonames/*/*/cities_global.jsonl.gz',
|
||||||
|
format = 'newline_delimited',
|
||||||
|
columns = {
|
||||||
|
geoname_id: 'INTEGER', city_name: 'VARCHAR', country_code: 'VARCHAR',
|
||||||
|
lat: 'DOUBLE', lon: 'DOUBLE', admin1_code: 'VARCHAR', admin2_code: 'VARCHAR',
|
||||||
|
population: 'BIGINT', ref_year: 'INTEGER'
|
||||||
|
}
|
||||||
|
)
|
||||||
|
WHERE geoname_id IS NOT NULL
|
||||||
|
AND population IS NOT NULL
|
||||||
AND population > 0
|
AND population > 0
|
||||||
AND geoname_id IS NOT NULL
|
|
||||||
AND city_name IS NOT NULL
|
AND city_name IS NOT NULL
|
||||||
AND lat IS NOT NULL
|
AND lat IS NOT NULL
|
||||||
AND lon IS NOT NULL
|
AND lon IS NOT NULL
|
||||||
|
|||||||
@@ -1,22 +1,19 @@
|
|||||||
"""Create minimal seed files for SQLMesh staging models that require landing data."""
|
"""Create minimal seed files for SQLMesh staging models that require landing data.
|
||||||
|
|
||||||
|
Seeds are empty JSONL gzip files — they satisfy DuckDB's file-not-found check
|
||||||
|
while contributing zero rows to the staging models.
|
||||||
|
"""
|
||||||
import gzip
|
import gzip
|
||||||
import json
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
seed = {
|
# stg_playtomic_availability requires at least one morning and one recheck file
|
||||||
"date": "1970-01-01",
|
morning = Path("data/landing/playtomic/1970/01/availability_1970-01-01.jsonl.gz")
|
||||||
"captured_at_utc": "1970-01-01T00:00:00Z",
|
recheck = Path("data/landing/playtomic/1970/01/availability_1970-01-01_recheck_00.jsonl.gz")
|
||||||
"venue_count": 0,
|
|
||||||
"venues_errored": 0,
|
|
||||||
"venues": [],
|
|
||||||
}
|
|
||||||
morning = Path("data/landing/playtomic/1970/01/availability_1970-01-01.json.gz")
|
|
||||||
recheck = Path("data/landing/playtomic/1970/01/availability_1970-01-01_recheck_00.json.gz")
|
|
||||||
morning.parent.mkdir(parents=True, exist_ok=True)
|
morning.parent.mkdir(parents=True, exist_ok=True)
|
||||||
for p in [morning, recheck]:
|
for p in [morning, recheck]:
|
||||||
if not p.exists():
|
if not p.exists():
|
||||||
with gzip.open(p, "wt") as f:
|
with gzip.open(p, "wb") as f:
|
||||||
json.dump(seed, f)
|
pass # empty JSONL — 0 rows, no error
|
||||||
print("created", p)
|
print("created", p)
|
||||||
else:
|
else:
|
||||||
print("exists ", p)
|
print("exists ", p)
|
||||||
|
|||||||
Reference in New Issue
Block a user