Compare commits

...

4 Commits

Author SHA1 Message Date
Deeman
c5d872ec55 chore(secrets): add R2_LANDING_* vars for
All checks were successful
CI / test (push) Successful in 49s
CI / tag (push) Successful in 2s
landing-zone backup bucket…
2026-03-01 17:32:51 +01:00
Deeman
75305935bd merge: GISCO extractor + wire all extractors + load_dotenv fix
All checks were successful
CI / test (push) Successful in 50s
CI / tag (push) Successful in 3s
2026-03-01 17:08:42 +01:00
Deeman
a15c32d398 fix(extract): load .env automatically via load_dotenv()
PROXY_URLS_* and other secrets were defined in .env but never loaded,
causing availability to run in slow serial mode (1 req/s) instead of
parallel mode with proxies.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-01 16:41:59 +01:00
Deeman
97c5846d51 feat(extract): GISCO extractor + wire all unscheduled extractors
- New gisco.py: proper extractor module replacing scripts/download_gisco_nuts.py.
  Writes uncompressed .geojson (ST_Read can't handle .gz). Fixed partition path
  gisco/2024/01/nuts2_boundaries.geojson; cursor tracking skips re-download monthly.
- all.py: import + register gisco in EXTRACTORS (9 independent, 1 dep)
- pyproject.toml: add extract-gisco entry point
- workflows.toml: add census_usa, census_usa_income, eurostat_city_labels,
  ons_uk, gisco — all monthly, no dependencies
- Delete scripts/download_gisco_nuts.py (superseded)

Unblocks: stg_nuts2_boundaries, stg_regional_income, stg_income_usa,
and 4 downstream models (dim_locations, pseo_city_costs_de,
location_opportunity_profile, pseo_country_overview).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-01 15:49:39 +01:00
8 changed files with 133 additions and 86 deletions

View File

@@ -3,6 +3,8 @@ APP_NAME=ENC[AES256_GCM,data:Vic/MJYoxZo8JAI=,iv:n1SEGQaGeZtYMtLmDRFiljDBbNKFvCz
SECRET_KEY=ENC[AES256_GCM,data:a3Bhj3gSQaE3llRWBYzpjoFDhhhSsNee67jXJs7+qn4=,iv:yvrx78X5Ut4DBSlmBnIn09ESVc/tuDiwiV4njmjcvko=,tag:cbFUTAEpX+isQD9FCVllsw==,type:str] SECRET_KEY=ENC[AES256_GCM,data:a3Bhj3gSQaE3llRWBYzpjoFDhhhSsNee67jXJs7+qn4=,iv:yvrx78X5Ut4DBSlmBnIn09ESVc/tuDiwiV4njmjcvko=,tag:cbFUTAEpX+isQD9FCVllsw==,type:str]
BASE_URL=ENC[AES256_GCM,data:LcbPDZf9Pwcuv7RxN9xhNfa9Tufi,iv:cOdjW9nNe+BuDXh+dL4b5LFQL2mKBiKV0FaEsDGMAQc=,tag:3uAn3AIwsztIfGpkQLD5Fg==,type:str] BASE_URL=ENC[AES256_GCM,data:LcbPDZf9Pwcuv7RxN9xhNfa9Tufi,iv:cOdjW9nNe+BuDXh+dL4b5LFQL2mKBiKV0FaEsDGMAQc=,tag:3uAn3AIwsztIfGpkQLD5Fg==,type:str]
DEBUG=ENC[AES256_GCM,data:qrEGkA==,iv:bCyEDWiEzolHo4vabiyYTsqM0eUaBmNbXYYu4wCsaeE=,tag:80gnDNbdZHRWVEYtuA1M2Q==,type:str] DEBUG=ENC[AES256_GCM,data:qrEGkA==,iv:bCyEDWiEzolHo4vabiyYTsqM0eUaBmNbXYYu4wCsaeE=,tag:80gnDNbdZHRWVEYtuA1M2Q==,type:str]
#ENC[AES256_GCM,data:YB5h,iv:2HFpvHNebAB9M/44rtPk/QpFV9hNKOlV/099OSjPnOA=,tag:BVj8vGy6K3LW/wb1vcZ+Ug==,type:comment]
GITEA_TOKEN=ENC[AES256_GCM,data:aIM7vQXxFbz7FDdXEdwtelvmXAdLgJfWNCSPeK//NlveQrU5cLDt8w==,iv:9qhjk52ZAs+y5WwP5WebMUwHhu6JNdHzAsEOpznrwBw=,tag:WnCDA4hAccMFs6vXVVKqxw==,type:str]
#ENC[AES256_GCM,data:YmlGAWpXxRCqam3oTWtGxHDXC+svEXI4HyUxrm/8OcKTuJsYPcL1WcnYqrP5Mf5lU5qPezEXUrrgZy8vjVW6qAbb0IA2PMM4Kg==,iv:dx6Dn99dJgjwyvUp8NAygXjRQ50yKYFeC73Oqt9WvmY=,tag:6JLF2ixSAv39VkKt6+cecQ==,type:comment] #ENC[AES256_GCM,data:YmlGAWpXxRCqam3oTWtGxHDXC+svEXI4HyUxrm/8OcKTuJsYPcL1WcnYqrP5Mf5lU5qPezEXUrrgZy8vjVW6qAbb0IA2PMM4Kg==,iv:dx6Dn99dJgjwyvUp8NAygXjRQ50yKYFeC73Oqt9WvmY=,tag:6JLF2ixSAv39VkKt6+cecQ==,type:comment]
ADMIN_EMAILS=ENC[AES256_GCM,data:hlG8b32WlD4ems3VKQ==,iv:wWO08dmX4oLhHulXg4HUG0PjRnFiX19RUTkTvjqIw5I=,tag:KMjXsBt7aE/KqlCfV+fdMg==,type:str] ADMIN_EMAILS=ENC[AES256_GCM,data:hlG8b32WlD4ems3VKQ==,iv:wWO08dmX4oLhHulXg4HUG0PjRnFiX19RUTkTvjqIw5I=,tag:KMjXsBt7aE/KqlCfV+fdMg==,type:str]
#ENC[AES256_GCM,data:b2wQxnL8Q2Bp,iv:q8ep3yUPzCumpZpljoVL2jbcPdsI5c2piiZ0x5k10Mw=,tag:IbjkT0Mjgu9n+6FGiPVihg==,type:comment] #ENC[AES256_GCM,data:b2wQxnL8Q2Bp,iv:q8ep3yUPzCumpZpljoVL2jbcPdsI5c2piiZ0x5k10Mw=,tag:IbjkT0Mjgu9n+6FGiPVihg==,type:comment]
@@ -71,7 +73,7 @@ GEONAMES_USERNAME=ENC[AES256_GCM,data:aSkVdLNrhiF6tlg=,iv:eemFGwDIv3EG/P3lVHGZj9
CENSUS_API_KEY=ENC[AES256_GCM,data:qqG971573aGq9MiHI2xLlanKKFwjfcNNoMXtm8LNbyh0rMbQN2XukQ==,iv:az2i0ldH75nHGah4DeOxaXmDbVYqmC1c77ptZqFA9BI=,tag:zoDdKj9bR7fgIDo1/dEU2g==,type:str] CENSUS_API_KEY=ENC[AES256_GCM,data:qqG971573aGq9MiHI2xLlanKKFwjfcNNoMXtm8LNbyh0rMbQN2XukQ==,iv:az2i0ldH75nHGah4DeOxaXmDbVYqmC1c77ptZqFA9BI=,tag:zoDdKj9bR7fgIDo1/dEU2g==,type:str]
sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBxNWNmUzVNUGdWRnE0ZFpF\nM0JQZWZ3UDdEVzlwTmIxakxOZXBkT2x2ZlNrClRtV2M3S2daSGxUZmFDSWQ2Nmh4\neU51QndFcUxlSE00RFovOVJTcDZmUUUKLS0tIDcvL3hRMDRoMWZZSXljNzA3WG5o\nMWFic21MV0krMzlIaldBTVU0ZDdlTE0K7euGQtA+9lHNws+x7TMCArZamm9att96\nL8cXoUDWe5fNI5+M1bXReqVfNwPTwZsV6j/+ZtYKybklIzWz02Ex4A==\n-----END AGE ENCRYPTED FILE-----\n sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBxNWNmUzVNUGdWRnE0ZFpF\nM0JQZWZ3UDdEVzlwTmIxakxOZXBkT2x2ZlNrClRtV2M3S2daSGxUZmFDSWQ2Nmh4\neU51QndFcUxlSE00RFovOVJTcDZmUUUKLS0tIDcvL3hRMDRoMWZZSXljNzA3WG5o\nMWFic21MV0krMzlIaldBTVU0ZDdlTE0K7euGQtA+9lHNws+x7TMCArZamm9att96\nL8cXoUDWe5fNI5+M1bXReqVfNwPTwZsV6j/+ZtYKybklIzWz02Ex4A==\n-----END AGE ENCRYPTED FILE-----\n
sops_age__list_0__map_recipient=age1f5002gj4s78jju45jd28kuejtcfhn5cdujz885fl7z2p9ym68pnsgky87a sops_age__list_0__map_recipient=age1f5002gj4s78jju45jd28kuejtcfhn5cdujz885fl7z2p9ym68pnsgky87a
sops_lastmodified=2026-03-01T13:26:08Z sops_lastmodified=2026-03-01T13:34:16Z
sops_mac=ENC[AES256_GCM,data:WmbT6tCUEoCDyKu673NQoJNzmCiilpG8yDVGl6ObxTOYleWt+1DVdPS+XUV+0Wd4bfkEhGTEfXAyy+wfoCVfYnenMuDGjXUUdsvqrOX6nnNCJ8nIntL46LfbRsbVrU6eeYGu/TaTyfouWjkk6pqlxffNSS6rrEFNZE4Q+v58+EI=,iv:TuCEmK6YJXsYISbN4mbuVbS6OvUNuhPRLstjjNkkrPk=,tag:hWLS036q7H5lMNpR6gZBVA==,type:str] sops_mac=ENC[AES256_GCM,data:JLfGLbNTEcI6M/sUA5Zez6cfEUObgnUBmX52560PzBmeLZt0F5Y5QpeojIBqEDMuNB0hp1nnPI59WClLJtQ12VlHo9TkL3x9uCNUG+KneQrn1bTmJpA3cwNkWTzIm4l+TGbJbd4FpKJ9H0v1w+sqoKOgG8DqbtOeVdUfsVspAso=,iv:UqYxooXkEtx+y7fYzl+GFncpkjz8dcP7o9fp+kFf6w4=,tag:/maSb1aZGo+Ia8eGpB7PYw==,type:str]
sops_unencrypted_suffix=_unencrypted sops_unencrypted_suffix=_unencrypted
sops_version=3.12.1 sops_version=3.12.1

View File

@@ -52,13 +52,17 @@ BING_SITE_URL=ENC[AES256_GCM,data:M33VI97DyxH8gRR3ZUXoXg4QrEv5og==,iv:GxZtwfbBVi
#ENC[AES256_GCM,data:OTUMKNkRW0zrupNppXthwE1oieILhNjM+cjx5hFn69g=,iv:48ID2qtSe9ggD2X+G/iUqp3v2uwEc7fZw8lxHIvVXmk=,tag:okBn0Npk1K9dDOFWA/AB1A==,type:comment] #ENC[AES256_GCM,data:OTUMKNkRW0zrupNppXthwE1oieILhNjM+cjx5hFn69g=,iv:48ID2qtSe9ggD2X+G/iUqp3v2uwEc7fZw8lxHIvVXmk=,tag:okBn0Npk1K9dDOFWA/AB1A==,type:comment]
GEONAMES_USERNAME=ENC[AES256_GCM,data:UXd/S2TzXPiGmLY=,iv:OMURM5E6SFEsaqroUlH76DEnr7C/ujNk9UQnbWT0hK4=,tag:VsjjS12QDbudiEhdAQ/OCQ==,type:str] GEONAMES_USERNAME=ENC[AES256_GCM,data:UXd/S2TzXPiGmLY=,iv:OMURM5E6SFEsaqroUlH76DEnr7C/ujNk9UQnbWT0hK4=,tag:VsjjS12QDbudiEhdAQ/OCQ==,type:str]
CENSUS_API_KEY=ENC[AES256_GCM,data:9RbKlxSD17LqIuuNXaOKSgZ8LnFh9Wbze3XHgpctfV/1TqBMZTIedQ==,iv:WwsmR3HLUEcgUpLliGRaUPhGM9vFNPMGXSAQQ6+9UVc=,tag:R4EMNy5MxxvK0UTaCL0umA==,type:str] CENSUS_API_KEY=ENC[AES256_GCM,data:9RbKlxSD17LqIuuNXaOKSgZ8LnFh9Wbze3XHgpctfV/1TqBMZTIedQ==,iv:WwsmR3HLUEcgUpLliGRaUPhGM9vFNPMGXSAQQ6+9UVc=,tag:R4EMNy5MxxvK0UTaCL0umA==,type:str]
#ENC[AES256_GCM,data:SL402gYB8ngjqkrG03FmaA==,iv:I326cYnOWdFnaUwnSfP+s2p9oCDCnqDzUJuPOzSFJc0=,tag:MBW5AqAaq4hTMmNXq1tXKw==,type:comment]
R2_LANDING_BUCKET=ENC[AES256_GCM,data:yZXLNQb8yN9nQPdxqmqv61fLWbRYCjjOqQ==,iv:fAwBLC/EuU0lgYOxZSkTagWyeQCdEadjssapxpCEGjA=,tag:VUmuVw76WZAaukp71Desag==,type:str]
R2_LANDING_ACCESS_KEY_ID=ENC[AES256_GCM,data:Y6y+U1ayhpFDcoaDjl7hyMVjU3gVvtORAH5gbd+HXbM=,iv:ra9kuch1DT+2tfz140bvxQRIXypsdiUrX1QYQ59gNRI=,tag:Wt85qliUMFvgbvoUrOXT7A==,type:str]
R2_LANDING_SECRET_ACCESS_KEY=ENC[AES256_GCM,data:99wB9aKSq2GihW9FOwBSMgHYzNKBHlol2Mf2kg4Ma6Fr4Cr21t/blzPxNQ7YRdeKk6ypFgViXlS4BJz9nC+v0g==,iv:/AmbXtj/uSGcMp+NBhN5tiVb2U56tvO5e1UpG2/ijPo=,tag:Qg2Tt11DUJPyeYcq9iSVnQ==,type:str]
sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBaUVk0UEVqdmtsM3VzQnpZ\nZjJDZ1lsM0VqWFpVVXUvNzdQcCtHbVJLNjFnCmhna01vTkVBaFQ5ZVlXeGhYNXdH\ncWJ5Qi9PdkxLaHBhQnR3cmtoblkxdEUKLS0tIDhHamY4NXhxOG9YN1NpbTN1aVRh\nOHVKcEN1d0QwQldVTDlBWUU4SDVDWlUKRJU+CTfTzIx6LLKin9sTXAHPVAfiUerZ\nCqYVFncsCJE3TbMI424urQj7kragPoGl1z4++yqAXNTRxfZIY4KTkg==\n-----END AGE ENCRYPTED FILE-----\n sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBaUVk0UEVqdmtsM3VzQnpZ\nZjJDZ1lsM0VqWFpVVXUvNzdQcCtHbVJLNjFnCmhna01vTkVBaFQ5ZVlXeGhYNXdH\ncWJ5Qi9PdkxLaHBhQnR3cmtoblkxdEUKLS0tIDhHamY4NXhxOG9YN1NpbTN1aVRh\nOHVKcEN1d0QwQldVTDlBWUU4SDVDWlUKRJU+CTfTzIx6LLKin9sTXAHPVAfiUerZ\nCqYVFncsCJE3TbMI424urQj7kragPoGl1z4++yqAXNTRxfZIY4KTkg==\n-----END AGE ENCRYPTED FILE-----\n
sops_age__list_0__map_recipient=age1f5002gj4s78jju45jd28kuejtcfhn5cdujz885fl7z2p9ym68pnsgky87a sops_age__list_0__map_recipient=age1f5002gj4s78jju45jd28kuejtcfhn5cdujz885fl7z2p9ym68pnsgky87a
sops_age__list_1__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBmVEticFRVemlzZnlzek4x\nbWJ0d0h5ejJVUk5remo1VkdxNjVpdllqbFhFClc1UXlNd09xVVA5MnltMlN5MWRy\nYUlNRmNybHh1RGdPVC9yWlYrVmRTdkkKLS0tIHBUbU9qSDMrVGVHZDZGSFdpWlBh\nT3NXTGl0SmszaU9hRmU5bXI0cDRoRW8KLvbNYsBEwz+ITKvn7Yn+iNHiRzyyjtQt\no9/HupykJ3WjSdleGz7ZN6UiPGelHp0D/rzSASTYaI1+0i0xZ4PUoQ==\n-----END AGE ENCRYPTED FILE-----\n sops_age__list_1__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBmVEticFRVemlzZnlzek4x\nbWJ0d0h5ejJVUk5remo1VkdxNjVpdllqbFhFClc1UXlNd09xVVA5MnltMlN5MWRy\nYUlNRmNybHh1RGdPVC9yWlYrVmRTdkkKLS0tIHBUbU9qSDMrVGVHZDZGSFdpWlBh\nT3NXTGl0SmszaU9hRmU5bXI0cDRoRW8KLvbNYsBEwz+ITKvn7Yn+iNHiRzyyjtQt\no9/HupykJ3WjSdleGz7ZN6UiPGelHp0D/rzSASTYaI1+0i0xZ4PUoQ==\n-----END AGE ENCRYPTED FILE-----\n
sops_age__list_1__map_recipient=age1wjepykv3glvsrtegu25tevg7vyn3ngpl607u3yjc9ucay04s045s796msw sops_age__list_1__map_recipient=age1wjepykv3glvsrtegu25tevg7vyn3ngpl607u3yjc9ucay04s045s796msw
sops_age__list_2__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBFeHhaOURNZnRVMEwxNThu\nUjF4Q0kwUXhTUE1QSzZJbmpubnh3RnpQTmdvCjRmWWxpNkxFUmVGb3NRbnlydW5O\nWEg3ZXJQTU4vcndzS2pUQXY3Q0ttYjAKLS0tIE9IRFJ1c2ZxbGVHa2xTL0swbGN1\nTzgwMThPUDRFTWhuZHJjZUYxOTZrU00KY62qrNBCUQYxwcLMXFEnLkwncxq3BPJB\nKm4NzeHBU87XmPWVrgrKuf+PH1mxJlBsl7Hev8xBTy7l6feiZjLIvQ==\n-----END AGE ENCRYPTED FILE-----\n sops_age__list_2__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBFeHhaOURNZnRVMEwxNThu\nUjF4Q0kwUXhTUE1QSzZJbmpubnh3RnpQTmdvCjRmWWxpNkxFUmVGb3NRbnlydW5O\nWEg3ZXJQTU4vcndzS2pUQXY3Q0ttYjAKLS0tIE9IRFJ1c2ZxbGVHa2xTL0swbGN1\nTzgwMThPUDRFTWhuZHJjZUYxOTZrU00KY62qrNBCUQYxwcLMXFEnLkwncxq3BPJB\nKm4NzeHBU87XmPWVrgrKuf+PH1mxJlBsl7Hev8xBTy7l6feiZjLIvQ==\n-----END AGE ENCRYPTED FILE-----\n
sops_age__list_2__map_recipient=age1c783ym2q5x9tv7py5d28uc4k44aguudjn03g97l9nzs00dd9tsrqum8h4d sops_age__list_2__map_recipient=age1c783ym2q5x9tv7py5d28uc4k44aguudjn03g97l9nzs00dd9tsrqum8h4d
sops_lastmodified=2026-03-01T13:25:41Z sops_lastmodified=2026-03-01T16:31:40Z
sops_mac=ENC[AES256_GCM,data:EL9Bgo0pWWECeHaaM1bHtkvwBgBmS3P2cX+6oahHKmLEJLI7P7fiomP7G8SdrfUyNpZaP9d4LlfwZSuCPqH6rP8jzF67oNkfXfd/xK4OW2U2TqSvouCMzlhqVQgS4HHl5EgvOI488WEIZko7KK2A1rxnpkm8C29WG9d9G64LKvw=,iv:XzsNm3CXnlC6SIef63BdddALjGustp8czHQCWOtjXBQ=,tag:zll0db6K1+M4brOpfVWnhg==,type:str] sops_mac=ENC[AES256_GCM,data:+9Sk7wVRPMDeDf6FkuNAOyUT6/OD8Rk6jtJuy5CGQXdxxCYY12F6dAGF6V5fE0toqfYxhVTJbSqH32qTZM2Tc28n36zCtXNnaTdv9rS4XFfPq+MrhuIIv5bJYwDXDgW4F5TCeCBB09jgUKDRaQVGBn2hO3+k8auaPdqWp2cd+es=,iv:wtN61uo7vixY1/EQteyTMzG73C6Gz8AFu1qodR9JvQw=,tag:Z1izDo6EAS03OhA1bj0ArA==,type:str]
sops_unencrypted_suffix=_unencrypted sops_unencrypted_suffix=_unencrypted
sops_version=3.12.1 sops_version=3.12.1

View File

@@ -21,6 +21,7 @@ extract-census-usa = "padelnomics_extract.census_usa:main"
extract-census-usa-income = "padelnomics_extract.census_usa_income:main" extract-census-usa-income = "padelnomics_extract.census_usa_income:main"
extract-ons-uk = "padelnomics_extract.ons_uk:main" extract-ons-uk = "padelnomics_extract.ons_uk:main"
extract-geonames = "padelnomics_extract.geonames:main" extract-geonames = "padelnomics_extract.geonames:main"
extract-gisco = "padelnomics_extract.gisco:main"
[build-system] [build-system]
requires = ["hatchling"] requires = ["hatchling"]

View File

@@ -11,9 +11,12 @@ from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
import niquests import niquests
from dotenv import load_dotenv
from .utils import end_run, open_state_db, start_run from .utils import end_run, open_state_db, start_run
load_dotenv()
LANDING_DIR = Path(os.environ.get("LANDING_DIR", "data/landing")) LANDING_DIR = Path(os.environ.get("LANDING_DIR", "data/landing"))
HTTP_TIMEOUT_SECONDS = 30 HTTP_TIMEOUT_SECONDS = 30

View File

@@ -7,7 +7,7 @@ A graphlib.TopologicalSorter schedules them: tasks with no unmet dependencies
run immediately in parallel; each completion may unlock new tasks. run immediately in parallel; each completion may unlock new tasks.
Current dependency graph: Current dependency graph:
- All 8 non-availability extractors have no dependencies (run in parallel) - All 9 non-availability extractors have no dependencies (run in parallel)
- playtomic_availability depends on playtomic_tenants (starts as soon as - playtomic_availability depends on playtomic_tenants (starts as soon as
tenants finishes, even if other extractors are still running) tenants finishes, even if other extractors are still running)
""" """
@@ -26,6 +26,8 @@ from .eurostat_city_labels import EXTRACTOR_NAME as EUROSTAT_CITY_LABELS_NAME
from .eurostat_city_labels import extract as extract_eurostat_city_labels from .eurostat_city_labels import extract as extract_eurostat_city_labels
from .geonames import EXTRACTOR_NAME as GEONAMES_NAME from .geonames import EXTRACTOR_NAME as GEONAMES_NAME
from .geonames import extract as extract_geonames from .geonames import extract as extract_geonames
from .gisco import EXTRACTOR_NAME as GISCO_NAME
from .gisco import extract as extract_gisco
from .ons_uk import EXTRACTOR_NAME as ONS_UK_NAME from .ons_uk import EXTRACTOR_NAME as ONS_UK_NAME
from .ons_uk import extract as extract_ons_uk from .ons_uk import extract as extract_ons_uk
from .overpass import EXTRACTOR_NAME as OVERPASS_NAME from .overpass import EXTRACTOR_NAME as OVERPASS_NAME
@@ -50,6 +52,7 @@ EXTRACTORS: dict[str, tuple] = {
CENSUS_USA_INCOME_NAME: (extract_census_usa_income, []), CENSUS_USA_INCOME_NAME: (extract_census_usa_income, []),
ONS_UK_NAME: (extract_ons_uk, []), ONS_UK_NAME: (extract_ons_uk, []),
GEONAMES_NAME: (extract_geonames, []), GEONAMES_NAME: (extract_geonames, []),
GISCO_NAME: (extract_gisco, []),
TENANTS_NAME: (extract_tenants, []), TENANTS_NAME: (extract_tenants, []),
AVAILABILITY_NAME: (extract_availability, [TENANTS_NAME]), AVAILABILITY_NAME: (extract_availability, [TENANTS_NAME]),
} }

View File

@@ -0,0 +1,95 @@
"""GISCO NUTS-2 boundary GeoJSON extractor.
Downloads NUTS-2 boundary polygons from Eurostat GISCO. The file is stored
uncompressed because DuckDB's ST_Read cannot read gzipped files.
NUTS classification revises approximately every 7 years (current: 2021).
The partition path is fixed to the revision year, not the run date, making
the source version explicit. Cursor tracking still uses year_month to avoid
re-downloading on every monthly run.
Landing: {LANDING_DIR}/gisco/2024/01/nuts2_boundaries.geojson (~5 MB, uncompressed)
"""
import sqlite3
from pathlib import Path
import niquests
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging
from .utils import get_last_cursor
logger = setup_logging("padelnomics.extract.gisco")
EXTRACTOR_NAME = "gisco"
# NUTS 2021 revision, 20M scale (1:20,000,000), WGS84 (EPSG:4326), LEVL_2 only.
# 20M resolution gives simplified polygons that are fast for point-in-polygon
# matching without sacrificing accuracy at the NUTS-2 boundary level.
GISCO_URL = (
"https://gisco-services.ec.europa.eu/distribution/v2/nuts/geojson/"
"NUTS_RG_20M_2021_4326_LEVL_2.geojson"
)
# Fixed partition: NUTS boundaries are a static reference file, not time-series data.
# The 2024/01 partition reflects when this NUTS 2021 dataset was first ingested.
DEST_REL = Path("gisco/2024/01/nuts2_boundaries.geojson")
_GISCO_TIMEOUT_SECONDS = HTTP_TIMEOUT_SECONDS * 4 # ~5 MB; generous for slow upstreams
def extract(
landing_dir: Path,
year_month: str,
conn: sqlite3.Connection,
session: niquests.Session,
) -> dict:
"""Download NUTS-2 GeoJSON. Skips if already run this month or file exists."""
last_cursor = get_last_cursor(conn, EXTRACTOR_NAME)
if last_cursor == year_month:
logger.info("already ran for %s — skipping", year_month)
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
dest = landing_dir / DEST_REL
if dest.exists():
logger.info("file already exists (skipping download): %s", dest)
return {
"files_written": 0,
"files_skipped": 1,
"bytes_written": 0,
"cursor_value": year_month,
}
dest.parent.mkdir(parents=True, exist_ok=True)
logger.info("GET %s", GISCO_URL)
resp = session.get(GISCO_URL, timeout=_GISCO_TIMEOUT_SECONDS)
resp.raise_for_status()
content = resp.content
assert len(content) > 100_000, (
f"GeoJSON too small ({len(content)} bytes) — download may have failed"
)
assert b'"FeatureCollection"' in content, "Response does not look like GeoJSON"
# Write uncompressed — ST_Read requires a plain file, not .gz
tmp = dest.with_suffix(".geojson.tmp")
tmp.write_bytes(content)
tmp.rename(dest)
size_mb = len(content) / 1_000_000
logger.info("written %s (%.1f MB)", dest, size_mb)
return {
"files_written": 1,
"files_skipped": 0,
"bytes_written": len(content),
"cursor_value": year_month,
}
def main() -> None:
run_extractor(EXTRACTOR_NAME, extract)
if __name__ == "__main__":
main()

View File

@@ -39,3 +39,23 @@ module = "padelnomics_extract.playtomic_availability"
entry = "main_recheck" entry = "main_recheck"
schedule = "0,30 6-23 * * *" schedule = "0,30 6-23 * * *"
depends_on = ["playtomic_availability"] depends_on = ["playtomic_availability"]
[census_usa]
module = "padelnomics_extract.census_usa"
schedule = "monthly"
[census_usa_income]
module = "padelnomics_extract.census_usa_income"
schedule = "monthly"
[eurostat_city_labels]
module = "padelnomics_extract.eurostat_city_labels"
schedule = "monthly"
[ons_uk]
module = "padelnomics_extract.ons_uk"
schedule = "monthly"
[gisco]
module = "padelnomics_extract.gisco"
schedule = "monthly"

View File

@@ -1,81 +0,0 @@
"""Download NUTS-2 boundary GeoJSON from Eurostat GISCO.
One-time (or on NUTS revision) download of NUTS-2 boundary polygons used for
spatial income resolution in dim_locations. Stored uncompressed because DuckDB's
ST_Read function cannot read gzipped files.
NUTS classification changes approximately every 7 years. Current revision: 2021.
Output: {LANDING_DIR}/gisco/2024/01/nuts2_boundaries.geojson (~5MB, uncompressed)
Usage:
uv run python scripts/download_gisco_nuts.py [--landing-dir data/landing]
Idempotent: skips download if the file already exists.
"""
import argparse
import sys
from pathlib import Path
import niquests
# NUTS 2021 revision, 20M scale (1:20,000,000), WGS84 (EPSG:4326), LEVL_2 only.
# 20M resolution gives simplified polygons that are fast for point-in-polygon
# matching without sacrificing accuracy at the NUTS-2 boundary level.
GISCO_URL = (
"https://gisco-services.ec.europa.eu/distribution/v2/nuts/geojson/"
"NUTS_RG_20M_2021_4326_LEVL_2.geojson"
)
# Fixed partition: NUTS boundaries are a static reference file, not time-series data.
# Use the NUTS revision year as the partition to make the source version explicit.
DEST_REL_PATH = "gisco/2024/01/nuts2_boundaries.geojson"
HTTP_TIMEOUT_SECONDS = 120
def download_nuts_boundaries(landing_dir: Path) -> None:
dest = landing_dir / DEST_REL_PATH
if dest.exists():
print(f"Already exists (skipping): {dest}")
return
dest.parent.mkdir(parents=True, exist_ok=True)
print(f"Downloading NUTS-2 boundaries from GISCO...")
print(f" URL: {GISCO_URL}")
with niquests.Session() as session:
resp = session.get(GISCO_URL, timeout=HTTP_TIMEOUT_SECONDS)
resp.raise_for_status()
content = resp.content
assert len(content) > 100_000, (
f"GeoJSON too small ({len(content)} bytes) — download may have failed"
)
assert b'"FeatureCollection"' in content, "Response does not look like GeoJSON"
# Write uncompressed — ST_Read requires a plain file
tmp = dest.with_suffix(".geojson.tmp")
tmp.write_bytes(content)
tmp.rename(dest)
size_mb = len(content) / 1_000_000
print(f" Written: {dest} ({size_mb:.1f} MB)")
print("Done. Run SQLMesh plan to rebuild stg_nuts2_boundaries.")
def main() -> None:
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--landing-dir", default="data/landing", type=Path)
args = parser.parse_args()
if not args.landing_dir.is_dir():
print(f"Error: landing dir does not exist: {args.landing_dir}", file=sys.stderr)
sys.exit(1)
download_nuts_boundaries(args.landing_dir)
if __name__ == "__main__":
main()