docs(inventory): document population pipeline implementation findings

- Add section 9 to data-sources-inventory.md covering live API quirks:
  Eurostat SDMX city labels response shape, ONS CSV download path
  (observations API 404s), US Census ACS place endpoint, GeoNames
  cities15000 bulk format
- Add population coverage summary table and DuckDB glob limitation note
- fix(extract): census_usa + geonames write an empty placeholder when
  credentials are absent so SQLMesh staging models don't fail with
  "no files found"

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 01:01:10 +01:00
parent 06cbdf80dc
commit 1762188f08
3 changed files with 154 additions and 2 deletions
--- a/extract/padelnomics_extract/src/padelnomics_extract/census_usa.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/census_usa.py
@@ -64,7 +64,12 @@ def extract(
    """Fetch ACS 5-year place population. Skips if already run this month."""
    api_key = os.environ.get("CENSUS_API_KEY", "").strip()
    if not api_key:
-        logger.warning("CENSUS_API_KEY not set — skipping US Census extract")
+        logger.warning("CENSUS_API_KEY not set — writing empty placeholder so SQLMesh models can run")
+        year, month = year_month.split("/")
+        dest_dir = landing_path(landing_dir, "census_usa", year, month)
+        dest = dest_dir / "acs5_places.json.gz"
+        if not dest.exists():
+            write_gzip_atomic(dest, b'{"rows": [], "count": 0}')
        return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}

    # Skip if we already have data for this month (annual data, monthly cursor)
--- a/extract/padelnomics_extract/src/padelnomics_extract/geonames.py
+++ b/extract/padelnomics_extract/src/padelnomics_extract/geonames.py
@@ -105,7 +105,12 @@ def extract(
    """Download GeoNames cities15000.zip. Skips if already run this month."""
    username = os.environ.get("GEONAMES_USERNAME", "").strip()
    if not username:
-        logger.warning("GEONAMES_USERNAME not set — skipping GeoNames extract")
+        logger.warning("GEONAMES_USERNAME not set — writing empty placeholder so SQLMesh models can run")
+        year, month = year_month.split("/")
+        dest_dir = landing_path(landing_dir, "geonames", year, month)
+        dest = dest_dir / "cities_global.json.gz"
+        if not dest.exists():
+            write_gzip_atomic(dest, b'{"rows": [], "count": 0}')
        return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}

    last_cursor = get_last_cursor(conn, EXTRACTOR_NAME)