docs(inventory): document population pipeline implementation findings

- Add section 9 to data-sources-inventory.md covering live API quirks:
  Eurostat SDMX city-labels response shape, ONS CSV download path (the
  observations API returns 404), US Census ACS place endpoint, and the
  GeoNames cities15000 bulk format
- Add population coverage summary table and DuckDB glob limitation note
- fix(extract): census_usa and geonames now write an empty placeholder file
  when credentials are absent, so SQLMesh staging models don't fail with
  "no files found"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-24 01:01:10 +01:00
parent 06cbdf80dc
commit 1762188f08
3 changed files with 154 additions and 2 deletions

View File

@@ -64,7 +64,12 @@ def extract(
"""Fetch ACS 5-year place population. Skips if already run this month."""
api_key = os.environ.get("CENSUS_API_KEY", "").strip()
if not api_key:
logger.warning("CENSUS_API_KEY not set — skipping US Census extract")
logger.warning("CENSUS_API_KEY not set — writing empty placeholder so SQLMesh models can run")
year, month = year_month.split("/")
dest_dir = landing_path(landing_dir, "census_usa", year, month)
dest = dest_dir / "acs5_places.json.gz"
if not dest.exists():
write_gzip_atomic(dest, b'{"rows": [], "count": 0}')
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
# Skip if we already have data for this month (annual data, monthly cursor)

View File

@@ -105,7 +105,12 @@ def extract(
"""Download GeoNames cities15000.zip. Skips if already run this month."""
username = os.environ.get("GEONAMES_USERNAME", "").strip()
if not username:
logger.warning("GEONAMES_USERNAME not set — skipping GeoNames extract")
logger.warning("GEONAMES_USERNAME not set — writing empty placeholder so SQLMesh models can run")
year, month = year_month.split("/")
dest_dir = landing_path(landing_dir, "geonames", year, month)
dest = dest_dir / "cities_global.json.gz"
if not dest.exists():
write_gzip_atomic(dest, b'{"rows": [], "count": 0}')
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
last_cursor = get_last_cursor(conn, EXTRACTOR_NAME)