Compare commits
14 Commits
v4
...
v202602281
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c1cdeec6be | ||
|
|
710624f417 | ||
|
|
6cf98f44d4 | ||
|
|
60659a5ec5 | ||
|
|
beb4195f16 | ||
|
|
88cc857f3a | ||
|
|
9116625884 | ||
|
|
1af65bb46f | ||
|
|
9b0bfc478d | ||
|
|
adf22924f6 | ||
|
|
09665b7786 | ||
|
|
93349923bd | ||
|
|
642041b32b | ||
|
|
bb70a5372b |
@@ -56,9 +56,10 @@ WORKFLOWS_PATH=ENC[AES256_GCM,data:PehxEUMb1K3F1557BY3IqKD7sbJcoaIjnQvboBRJ1g==,
|
|||||||
ALERT_WEBHOOK_URL=
|
ALERT_WEBHOOK_URL=
|
||||||
NTFY_TOKEN=
|
NTFY_TOKEN=
|
||||||
#ENC[AES256_GCM,data:BCyQYjRnTx8yW9A=,iv:4OPCP+xzRLUJrpoFewVnbZRKnZH4sAbV76SM//2k5wU=,tag:HxwEp7VFVZUN/VjPiL/+Vw==,type:comment]
|
#ENC[AES256_GCM,data:BCyQYjRnTx8yW9A=,iv:4OPCP+xzRLUJrpoFewVnbZRKnZH4sAbV76SM//2k5wU=,tag:HxwEp7VFVZUN/VjPiL/+Vw==,type:comment]
|
||||||
PROXY_URLS=ENC[AES256_GCM,data:CzRaK0piUQfvuYYsdz0i2MEQIphKi0BhNvHw9alo46aTH+kqEKvoS7dKEKzyU9VJ4TyNweInlVMxB962DsvRoBtnHwo/pUmYtVeEr2881clNgEiZVYRDFRdEbpULcLPDJa3ey1leqAAHlmiL0RQ6Qa57gPCOVBzVG6npGLKO+K8XVIb+BZMs9kEUOlw7iuqTJW5xPN/t4X/jHidEqfTSAl9b4vU4bsYVuY3yQrL+/V5QpTbyXlf+cMq3flpA3zE2Fxhalzg+c/wHMTrCksFwrCkrInW0kY9yPkA7usUWr1xwwaV3wIDoNQsLXpMd/3RztipNvKtOMRhRJOmjzP7BKhCJvvvKTV5p+mBCulFijbMQgArg3BqcFanfw3YZ4wPd4hp8q/vOhE/U9Wu0yrMmyWYFHYGQnFVARlBH7pwn/ez8W4KqRFveEAuev9CE7K7s5RqzPLelSkoa9UuiiULJ+t0LFgKlgxuLtQ8GdFdgsmBCxY/4U/xzvNdC82hD549z5nMWWlaUJm4onPWirT/RYm7j3v6z4mmNImI2W6rCNbvEvsXwWsciquVaBIgReA47p6/GTzZ9VZMyGr4PdzB87BJGAgX1W57WNdPAsRIF49XP2BU72RtRFxsUG8Ha2dc=,iv:a10Vpk7Zv8QqORuEcMlpcvtHO/zjBLaFphWPYBXwysc=,tag:8N66/R+CLqEZ45wj+tCt6w==,type:str]
|
|
||||||
RECHECK_WINDOW_MINUTES=ENC[AES256_GCM,data:YWM=,iv:iY5+uMazLAFdwyLT7Gr7MaF1QHBIgHuoi6nF2VbSsOA=,tag:dc6AmuJdTQ55gVe16uzs6A==,type:str]
|
RECHECK_WINDOW_MINUTES=ENC[AES256_GCM,data:YWM=,iv:iY5+uMazLAFdwyLT7Gr7MaF1QHBIgHuoi6nF2VbSsOA=,tag:dc6AmuJdTQ55gVe16uzs6A==,type:str]
|
||||||
PROXY_URLS_FALLBACK=ENC[AES256_GCM,data:95rwI7kKUj1YxLpjChtrM4f2EFUDzQdAg1e1MOHnLwQ9ZY54UNH7v4JcqTsvDk9D+0N/BIdwFSDi7pnCSd6BWFV+cQ==,iv:rm9HdBsibSne7JR6vWl+ao/GHb1rbuVdZZDUWhVbTnE=,tag:NJ2STxmFZPvFayfTrEEYbg==,type:str]
|
PROXY_URLS_RESIDENTIAL=ENC[AES256_GCM,data:lfmlsjXFtL+zo40SNFLiFKaZiYvE7CNH+zRwjMK5pqPfCs0TlMX+Y9e1KmzAS+y/cI69TP5sgMPRBzER0Jn7RvH0KA==,iv:jBN/4/K5L5886G4rSzxt8V8u/57tAuj3R76haltzqeU=,tag:Xe6o9eg2PodfktDqmLgVNA==,type:str]
|
||||||
|
PROXY_URLS_DATACENTER=ENC[AES256_GCM,data:X6xpxz5u8Xh3OXjkIz3UwqH847qLvY9cVWVktW5B+lqhmXAKTzoTzHds8vlRGJf5Up9Yx44XcigbvuK33ZJDSq9ovkAIbY55OK4=,iv:3hHyFD+H9HMzQ/27bPjGr59+7yWmEneUdN9XPQasCig=,tag:oBXsSuV5idB7HqNrNOruwg==,type:str]
|
||||||
|
WEBSHARE_DOWNLOAD_URL=ENC[AES256_GCM,data:1D9VRZ3MCXPQWfiMH8+CLcrxeYnVVcQgZDvt5kltvbSTuSHQ2hHDmZpBkTOMIBJnw4JLZ2JQKHgG4OaYDtsM2VltFPnfwaRgVI9G5PSenR3o4PeQmYO1AqWOmjn19jPxNXRhEXdupP9UT+xQNXoBJsl6RR20XOpMA5AipUHmSjD0UIKXoZLU,iv:uWUkAydac//qrOTPUThuOLKAKXK4xcZmK9qBVFwpqt4=,tag:1vYhukBW9kEuSXCLAiZZmQ==,type:str]
|
||||||
CIRCUIT_BREAKER_THRESHOLD=
|
CIRCUIT_BREAKER_THRESHOLD=
|
||||||
#ENC[AES256_GCM,data:ZcX/OEbrMfKizIQYq3CYGnvzeTEX7KsmQaz2+Jj1rG5tbTy2aljQBIEkjtiwuo8NsNAD+FhIGRGVfBmKe1CAKME1MuiCbgSG,iv:4BSkeD3jZFawP09qECcqyuiWcDnCNSgbIjBATYhazq4=,tag:Ep1d2Uk700MOlWcLWaQ/ig==,type:comment]
|
#ENC[AES256_GCM,data:ZcX/OEbrMfKizIQYq3CYGnvzeTEX7KsmQaz2+Jj1rG5tbTy2aljQBIEkjtiwuo8NsNAD+FhIGRGVfBmKe1CAKME1MuiCbgSG,iv:4BSkeD3jZFawP09qECcqyuiWcDnCNSgbIjBATYhazq4=,tag:Ep1d2Uk700MOlWcLWaQ/ig==,type:comment]
|
||||||
GSC_SERVICE_ACCOUNT_PATH=
|
GSC_SERVICE_ACCOUNT_PATH=
|
||||||
@@ -70,7 +71,7 @@ GEONAMES_USERNAME=ENC[AES256_GCM,data:aSkVdLNrhiF6tlg=,iv:eemFGwDIv3EG/P3lVHGZj9
|
|||||||
CENSUS_API_KEY=ENC[AES256_GCM,data:qqG971573aGq9MiHI2xLlanKKFwjfcNNoMXtm8LNbyh0rMbQN2XukQ==,iv:az2i0ldH75nHGah4DeOxaXmDbVYqmC1c77ptZqFA9BI=,tag:zoDdKj9bR7fgIDo1/dEU2g==,type:str]
|
CENSUS_API_KEY=ENC[AES256_GCM,data:qqG971573aGq9MiHI2xLlanKKFwjfcNNoMXtm8LNbyh0rMbQN2XukQ==,iv:az2i0ldH75nHGah4DeOxaXmDbVYqmC1c77ptZqFA9BI=,tag:zoDdKj9bR7fgIDo1/dEU2g==,type:str]
|
||||||
sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBxNWNmUzVNUGdWRnE0ZFpF\nM0JQZWZ3UDdEVzlwTmIxakxOZXBkT2x2ZlNrClRtV2M3S2daSGxUZmFDSWQ2Nmh4\neU51QndFcUxlSE00RFovOVJTcDZmUUUKLS0tIDcvL3hRMDRoMWZZSXljNzA3WG5o\nMWFic21MV0krMzlIaldBTVU0ZDdlTE0K7euGQtA+9lHNws+x7TMCArZamm9att96\nL8cXoUDWe5fNI5+M1bXReqVfNwPTwZsV6j/+ZtYKybklIzWz02Ex4A==\n-----END AGE ENCRYPTED FILE-----\n
|
sops_age__list_0__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBxNWNmUzVNUGdWRnE0ZFpF\nM0JQZWZ3UDdEVzlwTmIxakxOZXBkT2x2ZlNrClRtV2M3S2daSGxUZmFDSWQ2Nmh4\neU51QndFcUxlSE00RFovOVJTcDZmUUUKLS0tIDcvL3hRMDRoMWZZSXljNzA3WG5o\nMWFic21MV0krMzlIaldBTVU0ZDdlTE0K7euGQtA+9lHNws+x7TMCArZamm9att96\nL8cXoUDWe5fNI5+M1bXReqVfNwPTwZsV6j/+ZtYKybklIzWz02Ex4A==\n-----END AGE ENCRYPTED FILE-----\n
|
||||||
sops_age__list_0__map_recipient=age1f5002gj4s78jju45jd28kuejtcfhn5cdujz885fl7z2p9ym68pnsgky87a
|
sops_age__list_0__map_recipient=age1f5002gj4s78jju45jd28kuejtcfhn5cdujz885fl7z2p9ym68pnsgky87a
|
||||||
sops_lastmodified=2026-02-26T14:31:14Z
|
sops_lastmodified=2026-02-28T15:50:46Z
|
||||||
sops_mac=ENC[AES256_GCM,data:iqFuTexTS9U/Nv8xoTpHljTNQTGX9ITcJ3AjhDEtxrh0Z9/lngfBvGtjiKmpwFGlobQw/x+/YLM+u3MhciwXF7qNwFfJ/StN2Y66uF71SxWotbL70Dxl4oWSVL3sU+2NYbw5yP0p+xCbE+rEd5SqAe6K5yyq5X25hz8fIapxlYA=,iv:foqoWQVMipuOAQ0Kp799PaIhCIrxV8T5cC811wIzxR8=,tag:yNfxSV3R21XEXksjmdsKBw==,type:str]
|
sops_mac=ENC[AES256_GCM,data:HiLZTLa+p3mqa4hw+tKOK27F/bsJOy4jmDi8MHToi6S7tRfBA/TzcEzXvXUIkkwAixN73NQHvBVeRnbcEsApVpkaxH1OqnjvvyT+B3YFkTEtxczaKGWlCvbqFZNmXYsFvGR9njaWYWsTQPkRIjrroXrSrhr7uxC8F40v7ByxJKo=,iv:qj2IpzWRIh/mM1HtjjkNbyFuhtORKXslVnf/vdEC9Uw=,tag:fr9CZsL74HxRJLXn9eS0xQ==,type:str]
|
||||||
sops_unencrypted_suffix=_unencrypted
|
sops_unencrypted_suffix=_unencrypted
|
||||||
sops_version=3.12.1
|
sops_version=3.12.1
|
||||||
|
|||||||
@@ -43,7 +43,9 @@ ALERT_WEBHOOK_URL=ENC[AES256_GCM,data:4sXQk8zklruC525J279TUUatdDJQ43qweuoPhtpI82
|
|||||||
NTFY_TOKEN=ENC[AES256_GCM,data:YlOxhsRJ8P1y4kk6ugWm41iyRCsM6oAWjvbU9lGcD0A=,iv:JZXOvi3wTOPV9A46c7fMiqbszNCvXkOgh9i/H1hob24=,tag:8xnPimgy7sesOAnxhaXmpg==,type:str]
|
NTFY_TOKEN=ENC[AES256_GCM,data:YlOxhsRJ8P1y4kk6ugWm41iyRCsM6oAWjvbU9lGcD0A=,iv:JZXOvi3wTOPV9A46c7fMiqbszNCvXkOgh9i/H1hob24=,tag:8xnPimgy7sesOAnxhaXmpg==,type:str]
|
||||||
SUPERVISOR_GIT_PULL=ENC[AES256_GCM,data:mg==,iv:KgqMVYj12FjOzWxtA1T0r0pqCDJ6MtHzMjE+4W/W+s4=,tag:czFaOqhHG8nqrQ8AZ8QiGw==,type:str]
|
SUPERVISOR_GIT_PULL=ENC[AES256_GCM,data:mg==,iv:KgqMVYj12FjOzWxtA1T0r0pqCDJ6MtHzMjE+4W/W+s4=,tag:czFaOqhHG8nqrQ8AZ8QiGw==,type:str]
|
||||||
#ENC[AES256_GCM,data:hzAZvCWc4RTk290=,iv:RsSI4OpAOQGcFVpfXDZ6t705yWmlO0JEWwWF5uQu9As=,tag:UPqFtA2tXiSa0vzJAv8qXg==,type:comment]
|
#ENC[AES256_GCM,data:hzAZvCWc4RTk290=,iv:RsSI4OpAOQGcFVpfXDZ6t705yWmlO0JEWwWF5uQu9As=,tag:UPqFtA2tXiSa0vzJAv8qXg==,type:comment]
|
||||||
PROXY_URLS=ENC[AES256_GCM,data:nm4B++SkZZgN3p2xru3WrpVA0X6O8yvb45tH/ovF4006zBy28xqVxbsd44Mz6b5FMinjOXRmGwoI/GDWmdJLzBYdpryQ/FhpbzSUpr1ZOjOz+7P0vn2jfBGAB8ksU3i5kuYglud3EyQGFL+v+uooxwrIUCjfzmmB4vCmf7phssKDsK1CqzmdZ1c54ehSu4bRRdmGp9d0+r+j1SpXb/JbZ8LTqUIhLlZXrHFqkCfN1czhFK9IwMVgR00Q4v2YkjaRBME4lVqwk1NwwatbS9Fq8LlzwuT1uKk+T6ZDkFKC8ZoPW1YRqF13X7hFGFXCNRqABRDZ45lqxYQbBoRrWmH2tfMiAmTrIuRsdPM8bZ/Ol5mXSDhs0HyWX2urX+LD65rIOO0zN/lwjXSwh5mwwBdB61akdzsWRyLZsdafuQUmgGul8y0eGMEbFWaty3bdrtAmqtsvHwxD/Dp/gQWScESXvPd1arn55zaXmefOy+ZLwcmx+FAJPpTMXRaq6Y/Z+D1PZZ+Uhu2D6tsAR4VvqqwlUgpsrAFXk6chJzOry8rmmxoMuIj9mXfjG+BqPFhV2oQsKSuIqFQqd/ZidJLO8ZSxA7L+h1eH4cQjcUd2nfzroG8nnKZ+cA8hQMfLuFiMY1I=,iv:nTaNQlC3px/lnodLphnILWbPVnelaUKKOZAFAaHi8MU=,tag:TYkIX1nrc+PKbvvnWYcvbg==,type:str]
|
PROXY_URLS_RESIDENTIAL=ENC[AES256_GCM,data:O+eoFK/Z4hUgVxDqK58qVICa8wjo2o6/Es7VHZ3wyfjuDed38ekybjiUy5W7BF6X38HP73VSeRv/5cgbdbPMWjvwtw==,iv:MILS/EvbY+D2i28B1i5PgAAxlkRuK1fAKmUuuAtuCXk=,tag:o4eQkkga/RjQGlqYnXwufQ==,type:str]
|
||||||
|
PROXY_URLS_DATACENTER=ENC[AES256_GCM,data:VQ8SU8xOoVMAQfJVit6HQfOjLlq3u41iHMfTYUZ908ouCcsKkB/mRBbhlODiu7tJXdjuGD47iTgXlso54bPIbjcLwLcg2GNOiSI=,iv:g/RM1XoCw78OmtGUh2Dyfd1N8tNQRlcfRrtj6uJYvds=,tag:lbPdM4JJxTysr1qG1A4+Fw==,type:str]
|
||||||
|
WEBSHARE_DOWNLOAD_URL=ENC[AES256_GCM,data:/N77CFf6tJWCk7HrnBOm2Q1ynx7XoblzfbzJySeCjrxqiu4r+CB90aDkaPahlQKI00DUZih3pcy7WhnjdAwI30G5kJZ3P8H8/R0tP7OBK1wPVbsJq8prQJPFOAWewsS4KWNtSURZPYSCxslcBb7DHLX6ZAjv6A5KFOjRK2N8usR9sIabrCWh,iv:G3Ropu/JGytZK/zKsNGFjjSu3Wt6fvHaAqI9RpUHvlI=,tag:fv6xuS94OR+4xfiyKrYELA==,type:str]
|
||||||
RECHECK_WINDOW_MINUTES=ENC[AES256_GCM,data:L2s=,iv:fV3mCKmK5fxUmIWRePELBDAPTb8JZqasVIhnAl55kYw=,tag:XL+PO6sblz/7WqHC3dtk1w==,type:str]
|
RECHECK_WINDOW_MINUTES=ENC[AES256_GCM,data:L2s=,iv:fV3mCKmK5fxUmIWRePELBDAPTb8JZqasVIhnAl55kYw=,tag:XL+PO6sblz/7WqHC3dtk1w==,type:str]
|
||||||
#ENC[AES256_GCM,data:RC+t2vqLwLjapdAUql8rQls=,iv:Kkiz3ND0g0MRAgcPJysIYMzSQS96Rq+3YP5yO7yWfIY=,tag:Y6TbZd81ihIwn+U515qd1g==,type:comment]
|
#ENC[AES256_GCM,data:RC+t2vqLwLjapdAUql8rQls=,iv:Kkiz3ND0g0MRAgcPJysIYMzSQS96Rq+3YP5yO7yWfIY=,tag:Y6TbZd81ihIwn+U515qd1g==,type:comment]
|
||||||
GSC_SERVICE_ACCOUNT_PATH=ENC[AES256_GCM,data:Vki6yHk+gd4n,iv:rxzKvwrGnAkLcpS41EZ097E87NrIpNZGFfl4iXFvr40=,tag:EZkBJpCq5rSpKYVC4H3JHQ==,type:str]
|
GSC_SERVICE_ACCOUNT_PATH=ENC[AES256_GCM,data:Vki6yHk+gd4n,iv:rxzKvwrGnAkLcpS41EZ097E87NrIpNZGFfl4iXFvr40=,tag:EZkBJpCq5rSpKYVC4H3JHQ==,type:str]
|
||||||
@@ -59,7 +61,7 @@ sops_age__list_1__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb2
|
|||||||
sops_age__list_1__map_recipient=age1wjepykv3glvsrtegu25tevg7vyn3ngpl607u3yjc9ucay04s045s796msw
|
sops_age__list_1__map_recipient=age1wjepykv3glvsrtegu25tevg7vyn3ngpl607u3yjc9ucay04s045s796msw
|
||||||
sops_age__list_2__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBFeHhaOURNZnRVMEwxNThu\nUjF4Q0kwUXhTUE1QSzZJbmpubnh3RnpQTmdvCjRmWWxpNkxFUmVGb3NRbnlydW5O\nWEg3ZXJQTU4vcndzS2pUQXY3Q0ttYjAKLS0tIE9IRFJ1c2ZxbGVHa2xTL0swbGN1\nTzgwMThPUDRFTWhuZHJjZUYxOTZrU00KY62qrNBCUQYxwcLMXFEnLkwncxq3BPJB\nKm4NzeHBU87XmPWVrgrKuf+PH1mxJlBsl7Hev8xBTy7l6feiZjLIvQ==\n-----END AGE ENCRYPTED FILE-----\n
|
sops_age__list_2__map_enc=-----BEGIN AGE ENCRYPTED FILE-----\nYWdlLWVuY3J5cHRpb24ub3JnL3YxCi0+IFgyNTUxOSBFeHhaOURNZnRVMEwxNThu\nUjF4Q0kwUXhTUE1QSzZJbmpubnh3RnpQTmdvCjRmWWxpNkxFUmVGb3NRbnlydW5O\nWEg3ZXJQTU4vcndzS2pUQXY3Q0ttYjAKLS0tIE9IRFJ1c2ZxbGVHa2xTL0swbGN1\nTzgwMThPUDRFTWhuZHJjZUYxOTZrU00KY62qrNBCUQYxwcLMXFEnLkwncxq3BPJB\nKm4NzeHBU87XmPWVrgrKuf+PH1mxJlBsl7Hev8xBTy7l6feiZjLIvQ==\n-----END AGE ENCRYPTED FILE-----\n
|
||||||
sops_age__list_2__map_recipient=age1c783ym2q5x9tv7py5d28uc4k44aguudjn03g97l9nzs00dd9tsrqum8h4d
|
sops_age__list_2__map_recipient=age1c783ym2q5x9tv7py5d28uc4k44aguudjn03g97l9nzs00dd9tsrqum8h4d
|
||||||
sops_lastmodified=2026-02-26T14:32:28Z
|
sops_lastmodified=2026-02-28T15:51:24Z
|
||||||
sops_mac=ENC[AES256_GCM,data:pyHQHwTtjh7OLiMqbqhUjfrmetEtYS7yB342C/TWfDCwEotWLVwnGWlC4+HIl53pw9+3AgoBVRnW0t86e4kG9O8KyHnk68S9qBcpUsybW3lyGPNXmBydv1W9gQHuK8f/4WGIbkhNxyIToKg9ZAmYWFxNhRKSoYKm5P9Uh7B7CF4=,iv:syrX8VdL3JsDsawvFWbX04Ygcr18hjSSHfEwHkyKETk=,tag:qrhWkh/e+21OKGU2+rCeyg==,type:str]
|
sops_mac=ENC[AES256_GCM,data:FV34u95nhrUKbzVvcZ44V0pPVlhxJafs0TpkHlcpcuc+1TQWAysW5QbyKEVAGiLobKd3wjT7ThYGLEko7ZWjgXvpS0Bx0Qn+8KN+wFFZ84/cgCu6BEI3K5Ua3ssLyEYDcqIWy13K7DM6ymfp3bBNg96p3dgcf0wbJHHr+ef5048=,iv:JDhmirwvMhudCBFeQqlXvmqKaMIi++16UJvLOoMFios=,tag:p+LumDAe+CNozGxx5kZObw==,type:str]
|
||||||
sops_unencrypted_suffix=_unencrypted
|
sops_unencrypted_suffix=_unencrypted
|
||||||
sops_version=3.12.1
|
sops_version=3.12.1
|
||||||
|
|||||||
@@ -6,6 +6,15 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|||||||
|
|
||||||
## [Unreleased]
|
## [Unreleased]
|
||||||
|
|
||||||
|
### Added
|
||||||
|
- **Three-tier proxy system** for extraction pipeline: free (Webshare auto-fetched) → datacenter (`PROXY_URLS_DATACENTER`) → residential (`PROXY_URLS_RESIDENTIAL`). Webshare free proxies are now auto-fetched from their download API on each run — no more manually copying stale proxy lists.
|
||||||
|
- `proxy.py`: added `fetch_webshare_proxies()` (stdlib urllib, bounded read + timeout), `load_proxy_tiers()` (assembles N tiers from env), generalised `make_tiered_cycler()` to accept `list[list[str]]` with N-level escalation. Exposes `is_exhausted()`, `active_tier_index()`, `tier_count()`.
|
||||||
|
- `playtomic_availability.py`: both `extract()` and `extract_recheck()` now use `load_proxy_tiers()` + N-tier cycler. `_fetch_venues_parallel` `fallback_urls` param removed. `is_fallback_active()` replaced by `is_exhausted()`.
|
||||||
|
- `playtomic_tenants.py`: uses `load_proxy_tiers()` flattened for simple round-robin.
|
||||||
|
|
||||||
|
### Changed
|
||||||
|
- **Env vars replaced** (breaking): `PROXY_URLS` and `PROXY_URLS_FALLBACK` are removed. New vars: `WEBSHARE_DOWNLOAD_URL`, `PROXY_URLS_DATACENTER`, `PROXY_URLS_RESIDENTIAL`.
|
||||||
|
|
||||||
### Added
|
### Added
|
||||||
- **Phase 2a — NUTS-1 regional income differentiation** (`opportunity_score`): Munich and Berlin no longer share the same income figure as Chemnitz.
|
- **Phase 2a — NUTS-1 regional income differentiation** (`opportunity_score`): Munich and Berlin no longer share the same income figure as Chemnitz.
|
||||||
- `eurostat.py`: added `nama_10r_2hhinc` dataset config (NUTS-2 cube with NUTS-1 entries); filter params now appended to API URL so the server pre-filters the large cube before download (also makes `ilc_di03` requests smaller).
|
- `eurostat.py`: added `nama_10r_2hhinc` dataset config (NUTS-2 cube with NUTS-1 entries); filter params now appended to API URL so the server pre-filters the large cube before download (also makes `ilc_di03` requests smaller).
|
||||||
|
|||||||
27
README.md
27
README.md
@@ -396,18 +396,19 @@ docker compose logs -f app # tail logs
|
|||||||
|
|
||||||
## CI/CD
|
## CI/CD
|
||||||
|
|
||||||
Go to GitLab → padelnomics → Settings → CI/CD → Variables and add:
|
Pull-based deployment via Gitea Actions — no SSH keys or deploy credentials in CI.
|
||||||
|
|
||||||
| Variable | Value | Notes |
|
1. Push to master → Gitea Actions runs tests (`.gitea/workflows/ci.yaml`)
|
||||||
|----------|-------|-------|
|
2. On success, CI creates tag `v<run_number>` using the built-in `github.token`
|
||||||
| SSH_PRIVATE_KEY | Your ed25519 private key | Mask it, type "Variable" |
|
3. On-server supervisor polls for new tags every 60s and deploys automatically
|
||||||
| DEPLOY_HOST | Your Hetzner server IP | e.g. 1.2.3.4 |
|
|
||||||
| DEPLOY_USER | SSH username on the server | e.g. deploy or root |
|
|
||||||
| SSH_KNOWN_HOSTS | Server host key | Run `ssh-keyscan $YOUR_SERVER_IP` |
|
|
||||||
|
|
||||||
Server-side one-time setup:
|
**Server-side one-time setup:**
|
||||||
1. Add the matching public key to `~/.ssh/authorized_keys` for the deploy user
|
```bash
|
||||||
2. Clone the repo to `/opt/padelnomics`
|
bash infra/setup_server.sh # creates padelnomics_service user, keys, dirs
|
||||||
3. Create `.env` from `padelnomics/.env.example` with production values
|
ssh root@<server> 'bash -s' < infra/bootstrap_supervisor.sh
|
||||||
4. `chmod +x deploy.sh && ./deploy.sh` for the first deploy
|
```
|
||||||
5. Point reverse proxy to port 5000
|
|
||||||
|
1. `setup_server.sh` generates an ed25519 SSH deploy key — add the printed public key to Gitea:
|
||||||
|
`git.padelnomics.io → padelnomics → Settings → Deploy Keys → Add key (read-only)`
|
||||||
|
2. Add the printed age public key to `.sops.yaml`, re-encrypt, commit + push
|
||||||
|
3. Run `bootstrap_supervisor.sh` — clones from `git.padelnomics.io:2222`, decrypts secrets, starts systemd supervisor
|
||||||
|
|||||||
@@ -33,7 +33,7 @@ from pathlib import Path
|
|||||||
import niquests
|
import niquests
|
||||||
|
|
||||||
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
||||||
from .proxy import load_fallback_proxy_urls, load_proxy_urls, make_tiered_cycler
|
from .proxy import load_proxy_tiers, make_tiered_cycler
|
||||||
from .utils import (
|
from .utils import (
|
||||||
compress_jsonl_atomic,
|
compress_jsonl_atomic,
|
||||||
flush_partial_batch,
|
flush_partial_batch,
|
||||||
@@ -52,6 +52,9 @@ MAX_VENUES_PER_RUN = 20_000
|
|||||||
MAX_RETRIES_PER_VENUE = 2
|
MAX_RETRIES_PER_VENUE = 2
|
||||||
RECHECK_WINDOW_MINUTES = int(os.environ.get("RECHECK_WINDOW_MINUTES", "30"))
|
RECHECK_WINDOW_MINUTES = int(os.environ.get("RECHECK_WINDOW_MINUTES", "30"))
|
||||||
CIRCUIT_BREAKER_THRESHOLD = int(os.environ.get("CIRCUIT_BREAKER_THRESHOLD") or "10")
|
CIRCUIT_BREAKER_THRESHOLD = int(os.environ.get("CIRCUIT_BREAKER_THRESHOLD") or "10")
|
||||||
|
# Worker count: defaults to MAX_PROXY_CONCURRENCY (200) when any proxy tier is configured, else 1. Override via PROXY_CONCURRENCY env var (capped at MAX_PROXY_CONCURRENCY).
|
||||||
|
_PROXY_CONCURRENCY = os.environ.get("PROXY_CONCURRENCY", "").strip()
|
||||||
|
MAX_PROXY_CONCURRENCY = 200
|
||||||
|
|
||||||
# Parallel mode submits futures in batches so the circuit breaker can stop
|
# Parallel mode submits futures in batches so the circuit breaker can stop
|
||||||
# new submissions after it opens. Already-inflight futures in the current
|
# new submissions after it opens. Already-inflight futures in the current
|
||||||
@@ -76,7 +79,9 @@ def _load_tenant_ids(landing_dir: Path) -> list[str]:
|
|||||||
if not playtomic_dir.exists():
|
if not playtomic_dir.exists():
|
||||||
return []
|
return []
|
||||||
|
|
||||||
# Prefer JSONL (new format), fall back to blob (old format)
|
# Prefer daily partition (YYYY/MM/DD), fall back to older monthly/weekly partitions
|
||||||
|
tenant_files = sorted(playtomic_dir.glob("*/*/*/tenants.jsonl.gz"), reverse=True)
|
||||||
|
if not tenant_files:
|
||||||
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True)
|
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.jsonl.gz"), reverse=True)
|
||||||
if not tenant_files:
|
if not tenant_files:
|
||||||
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True)
|
tenant_files = sorted(playtomic_dir.glob("*/*/tenants.json.gz"), reverse=True)
|
||||||
@@ -190,14 +195,13 @@ def _fetch_venues_parallel(
|
|||||||
start_max_str: str,
|
start_max_str: str,
|
||||||
worker_count: int,
|
worker_count: int,
|
||||||
cycler: dict,
|
cycler: dict,
|
||||||
fallback_urls: list[str],
|
|
||||||
on_result=None,
|
on_result=None,
|
||||||
) -> tuple[list[dict], int]:
|
) -> tuple[list[dict], int]:
|
||||||
"""Fetch availability for multiple venues in parallel.
|
"""Fetch availability for multiple venues in parallel.
|
||||||
|
|
||||||
Submits futures in batches of PARALLEL_BATCH_SIZE. After each batch
|
Submits futures in batches of PARALLEL_BATCH_SIZE. After each batch
|
||||||
completes, checks the circuit breaker: if it opened and there is no
|
completes, checks the circuit breaker: if all proxy tiers are exhausted,
|
||||||
fallback configured, stops submitting further batches.
|
stops submitting further batches.
|
||||||
|
|
||||||
on_result: optional callable(result: dict) invoked inside the lock for
|
on_result: optional callable(result: dict) invoked inside the lock for
|
||||||
each successful result — used for incremental partial-file flushing.
|
each successful result — used for incremental partial-file flushing.
|
||||||
@@ -215,10 +219,10 @@ def _fetch_venues_parallel(
|
|||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=worker_count) as pool:
|
with ThreadPoolExecutor(max_workers=worker_count) as pool:
|
||||||
for batch_start in range(0, len(tenant_ids), PARALLEL_BATCH_SIZE):
|
for batch_start in range(0, len(tenant_ids), PARALLEL_BATCH_SIZE):
|
||||||
# Stop submitting new work if circuit is open with no fallback
|
# Stop submitting new work if all proxy tiers are exhausted
|
||||||
if cycler["is_fallback_active"]() and not fallback_urls:
|
if cycler["is_exhausted"]():
|
||||||
logger.error(
|
logger.error(
|
||||||
"Circuit open with no fallback — stopping after %d/%d venues",
|
"All proxy tiers exhausted — stopping after %d/%d venues",
|
||||||
completed_count, len(tenant_ids),
|
completed_count, len(tenant_ids),
|
||||||
)
|
)
|
||||||
break
|
break
|
||||||
@@ -294,10 +298,9 @@ def extract(
|
|||||||
venues_to_process = [tid for tid in all_venues_to_process if tid not in already_done]
|
venues_to_process = [tid for tid in all_venues_to_process if tid not in already_done]
|
||||||
|
|
||||||
# Set up tiered proxy cycler with circuit breaker
|
# Set up tiered proxy cycler with circuit breaker
|
||||||
proxy_urls = load_proxy_urls()
|
tiers = load_proxy_tiers()
|
||||||
fallback_urls = load_fallback_proxy_urls()
|
worker_count = min(int(_PROXY_CONCURRENCY), MAX_PROXY_CONCURRENCY) if _PROXY_CONCURRENCY else (MAX_PROXY_CONCURRENCY if tiers else 1)
|
||||||
worker_count = len(proxy_urls) if proxy_urls else 1
|
cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD)
|
||||||
cycler = make_tiered_cycler(proxy_urls, fallback_urls, CIRCUIT_BREAKER_THRESHOLD)
|
|
||||||
|
|
||||||
start_min_str = start_min.strftime("%Y-%m-%dT%H:%M:%S")
|
start_min_str = start_min.strftime("%Y-%m-%dT%H:%M:%S")
|
||||||
start_max_str = start_max.strftime("%Y-%m-%dT%H:%M:%S")
|
start_max_str = start_max.strftime("%Y-%m-%dT%H:%M:%S")
|
||||||
@@ -325,9 +328,9 @@ def extract(
|
|||||||
venues_errored = 0
|
venues_errored = 0
|
||||||
|
|
||||||
if worker_count > 1:
|
if worker_count > 1:
|
||||||
logger.info("Parallel mode: %d workers, %d proxies", worker_count, len(proxy_urls))
|
logger.info("Parallel mode: %d workers, %d tier(s)", worker_count, len(tiers))
|
||||||
new_venues_data, venues_errored = _fetch_venues_parallel(
|
new_venues_data, venues_errored = _fetch_venues_parallel(
|
||||||
venues_to_process, start_min_str, start_max_str, worker_count, cycler, fallback_urls,
|
venues_to_process, start_min_str, start_max_str, worker_count, cycler,
|
||||||
on_result=_on_result,
|
on_result=_on_result,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
@@ -342,9 +345,9 @@ def extract(
|
|||||||
_on_result(result)
|
_on_result(result)
|
||||||
else:
|
else:
|
||||||
venues_errored += 1
|
venues_errored += 1
|
||||||
circuit_opened = cycler["record_failure"]()
|
cycler["record_failure"]()
|
||||||
if circuit_opened and not fallback_urls:
|
if cycler["is_exhausted"]():
|
||||||
logger.error("Circuit open with no fallback — writing partial results")
|
logger.error("All proxy tiers exhausted — writing partial results")
|
||||||
break
|
break
|
||||||
|
|
||||||
if (i + 1) % 100 == 0:
|
if (i + 1) % 100 == 0:
|
||||||
@@ -485,14 +488,13 @@ def extract_recheck(
|
|||||||
start_max_str = window_end.strftime("%Y-%m-%dT%H:%M:%S")
|
start_max_str = window_end.strftime("%Y-%m-%dT%H:%M:%S")
|
||||||
|
|
||||||
# Set up tiered proxy cycler with circuit breaker
|
# Set up tiered proxy cycler with circuit breaker
|
||||||
proxy_urls = load_proxy_urls()
|
tiers = load_proxy_tiers()
|
||||||
fallback_urls = load_fallback_proxy_urls()
|
worker_count = min(int(_PROXY_CONCURRENCY), MAX_PROXY_CONCURRENCY) if _PROXY_CONCURRENCY else (MAX_PROXY_CONCURRENCY if tiers else 1)
|
||||||
worker_count = len(proxy_urls) if proxy_urls else 1
|
cycler = make_tiered_cycler(tiers, CIRCUIT_BREAKER_THRESHOLD)
|
||||||
cycler = make_tiered_cycler(proxy_urls, fallback_urls, CIRCUIT_BREAKER_THRESHOLD)
|
|
||||||
|
|
||||||
if worker_count > 1 and len(venues_to_recheck) > 10:
|
if worker_count > 1 and len(venues_to_recheck) > 10:
|
||||||
venues_data, venues_errored = _fetch_venues_parallel(
|
venues_data, venues_errored = _fetch_venues_parallel(
|
||||||
venues_to_recheck, start_min_str, start_max_str, worker_count, cycler, fallback_urls,
|
venues_to_recheck, start_min_str, start_max_str, worker_count, cycler,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
venues_data = []
|
venues_data = []
|
||||||
@@ -504,9 +506,9 @@ def extract_recheck(
|
|||||||
cycler["record_success"]()
|
cycler["record_success"]()
|
||||||
else:
|
else:
|
||||||
venues_errored += 1
|
venues_errored += 1
|
||||||
circuit_opened = cycler["record_failure"]()
|
cycler["record_failure"]()
|
||||||
if circuit_opened and not fallback_urls:
|
if cycler["is_exhausted"]():
|
||||||
logger.error("Circuit open with no fallback — writing partial recheck results")
|
logger.error("All proxy tiers exhausted — writing partial recheck results")
|
||||||
break
|
break
|
||||||
|
|
||||||
# Write recheck file as JSONL — one venue per line with metadata injected
|
# Write recheck file as JSONL — one venue per line with metadata injected
|
||||||
|
|||||||
@@ -25,12 +25,13 @@ import json
|
|||||||
import sqlite3
|
import sqlite3
|
||||||
import time
|
import time
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
from datetime import UTC, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import niquests
|
import niquests
|
||||||
|
|
||||||
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
from ._shared import HTTP_TIMEOUT_SECONDS, run_extractor, setup_logging, ua_for_proxy
|
||||||
from .proxy import load_proxy_urls, make_round_robin_cycler
|
from .proxy import load_proxy_tiers, make_round_robin_cycler
|
||||||
from .utils import compress_jsonl_atomic, landing_path
|
from .utils import compress_jsonl_atomic, landing_path
|
||||||
|
|
||||||
logger = setup_logging("padelnomics.extract.playtomic_tenants")
|
logger = setup_logging("padelnomics.extract.playtomic_tenants")
|
||||||
@@ -69,25 +70,31 @@ def _fetch_pages_parallel(pages: list[int], next_proxy) -> list[tuple[int, list[
|
|||||||
|
|
||||||
def extract(
|
def extract(
|
||||||
landing_dir: Path,
|
landing_dir: Path,
|
||||||
year_month: str,
|
year_month: str, # noqa: ARG001 — unused; tenants uses a daily (YYYY/MM/DD) partition instead
|
||||||
conn: sqlite3.Connection,
|
conn: sqlite3.Connection,
|
||||||
session: niquests.Session,
|
session: niquests.Session,
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""Fetch all Playtomic venues via global pagination. Returns run metrics."""
|
"""Fetch all Playtomic venues via global pagination. Returns run metrics.
|
||||||
year, month = year_month.split("/")
|
|
||||||
dest_dir = landing_path(landing_dir, "playtomic", year, month)
|
Partitioned by UTC date (e.g. 2026/02/28) so the first run on each day produces a
|
||||||
|
fresh file. _load_tenant_ids() in playtomic_availability globs across all
|
||||||
|
partitions and picks the most recent one.
|
||||||
|
"""
|
||||||
|
today = datetime.now(UTC)
|
||||||
|
year, month, day = today.strftime("%Y"), today.strftime("%m"), today.strftime("%d")
|
||||||
|
dest_dir = landing_path(landing_dir, "playtomic", year, month, day)
|
||||||
dest = dest_dir / "tenants.jsonl.gz"
|
dest = dest_dir / "tenants.jsonl.gz"
|
||||||
old_blob = dest_dir / "tenants.json.gz"
|
if dest.exists():
|
||||||
if dest.exists() or old_blob.exists():
|
logger.info("Already have tenants for %s/%s/%s — skipping", year, month, day)
|
||||||
logger.info("Already have tenants for %s/%s — skipping", year, month)
|
|
||||||
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
|
return {"files_written": 0, "files_skipped": 1, "bytes_written": 0}
|
||||||
|
|
||||||
proxy_urls = load_proxy_urls()
|
tiers = load_proxy_tiers()
|
||||||
next_proxy = make_round_robin_cycler(proxy_urls) if proxy_urls else None
|
all_proxies = [url for tier in tiers for url in tier]
|
||||||
batch_size = len(proxy_urls) if proxy_urls else 1
|
next_proxy = make_round_robin_cycler(all_proxies) if all_proxies else None
|
||||||
|
batch_size = len(all_proxies) if all_proxies else 1
|
||||||
|
|
||||||
if next_proxy:
|
if next_proxy:
|
||||||
logger.info("Parallel mode: %d pages per batch (%d proxies)", batch_size, len(proxy_urls))
|
logger.info("Parallel mode: %d pages per batch (%d proxies across %d tier(s))", batch_size, len(all_proxies), len(tiers))
|
||||||
else:
|
else:
|
||||||
logger.info("Serial mode: 1 page at a time (no proxies)")
|
logger.info("Serial mode: 1 page at a time (no proxies)")
|
||||||
|
|
||||||
@@ -154,7 +161,7 @@ def extract(
|
|||||||
"files_written": 1,
|
"files_written": 1,
|
||||||
"files_skipped": 0,
|
"files_skipped": 0,
|
||||||
"bytes_written": bytes_written,
|
"bytes_written": bytes_written,
|
||||||
"cursor_value": year_month,
|
"cursor_value": f"{year}/{month}/{day}",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,41 +1,97 @@
|
|||||||
"""Optional proxy rotation for parallel HTTP fetching.
|
"""Optional proxy rotation for parallel HTTP fetching.
|
||||||
|
|
||||||
Proxies are configured via the PROXY_URLS environment variable (comma-separated).
|
Proxies are configured via environment variables. When unset, all functions
|
||||||
When unset, all functions return None/no-op — extractors fall back to direct requests.
|
return None/no-op — extractors fall back to direct requests.
|
||||||
|
|
||||||
Tiered proxy with circuit breaker:
|
Three-tier escalation: free → datacenter → residential.
|
||||||
Primary tier (PROXY_URLS) is used by default — typically cheap datacenter proxies.
|
Tier 1 (free): WEBSHARE_DOWNLOAD_URL — auto-fetched from Webshare API
|
||||||
Fallback tier (PROXY_URLS_FALLBACK) activates once consecutive failures >= threshold.
|
Tier 2 (datacenter): PROXY_URLS_DATACENTER — comma-separated paid DC proxies
|
||||||
Once the circuit opens it stays open for the duration of the run (no auto-recovery).
|
Tier 3 (residential): PROXY_URLS_RESIDENTIAL — comma-separated paid residential proxies
|
||||||
|
|
||||||
|
Tiered circuit breaker:
|
||||||
|
Active tier is used until consecutive failures >= threshold, then escalates
|
||||||
|
to the next tier. Once all tiers are exhausted, is_exhausted() returns True.
|
||||||
|
Escalation is permanent for the duration of the run — no auto-recovery.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import itertools
|
import itertools
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import threading
|
import threading
|
||||||
|
import urllib.error
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
MAX_WEBSHARE_PROXIES = 20
|
||||||
|
WEBSHARE_FETCH_TIMEOUT_SECONDS = 10
|
||||||
|
WEBSHARE_MAX_RESPONSE_BYTES = 1024 * 1024 # 1MB
|
||||||
|
|
||||||
def load_proxy_urls() -> list[str]:
|
|
||||||
"""Read PROXY_URLS env var (comma-separated). Returns [] if unset.
|
|
||||||
|
|
||||||
Format: http://user:pass@host:port or socks5://host:port
|
def fetch_webshare_proxies(download_url: str, max_proxies: int = MAX_WEBSHARE_PROXIES) -> list[str]:
|
||||||
|
"""Fetch proxy list from the Webshare download API. Returns [] on any error.
|
||||||
|
|
||||||
|
Expected line format: ip:port:username:password
|
||||||
|
Converts to: http://username:password@ip:port
|
||||||
|
|
||||||
|
Bounded: reads at most WEBSHARE_MAX_RESPONSE_BYTES, returns at most max_proxies.
|
||||||
"""
|
"""
|
||||||
raw = os.environ.get("PROXY_URLS", "")
|
assert max_proxies > 0, f"max_proxies must be positive, got {max_proxies}"
|
||||||
urls = [u.strip() for u in raw.split(",") if u.strip()]
|
assert download_url, "download_url must not be empty"
|
||||||
|
|
||||||
|
try:
|
||||||
|
req = urllib.request.Request(
|
||||||
|
download_url,
|
||||||
|
headers={"User-Agent": "padelnomics-extract/1.0"},
|
||||||
|
)
|
||||||
|
with urllib.request.urlopen(req, timeout=WEBSHARE_FETCH_TIMEOUT_SECONDS) as resp:
|
||||||
|
raw = resp.read(WEBSHARE_MAX_RESPONSE_BYTES).decode("utf-8")
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Failed to fetch Webshare proxies: %s", e)
|
||||||
|
return []
|
||||||
|
|
||||||
|
urls = []
|
||||||
|
for line in raw.splitlines():
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
parts = line.split(":")
|
||||||
|
if len(parts) != 4:
|
||||||
|
logger.debug("Skipping malformed proxy line: %r", line)
|
||||||
|
continue
|
||||||
|
ip, port, username, password = parts
|
||||||
|
urls.append(f"http://{username}:{password}@{ip}:{port}")
|
||||||
|
if len(urls) >= max_proxies:
|
||||||
|
break
|
||||||
|
|
||||||
|
logger.info("Fetched %d proxies from Webshare", len(urls))
|
||||||
return urls
|
return urls
|
||||||
|
|
||||||
|
|
||||||
def load_fallback_proxy_urls() -> list[str]:
|
def load_proxy_tiers() -> list[list[str]]:
|
||||||
"""Read PROXY_URLS_FALLBACK env var (comma-separated). Returns [] if unset.
|
"""Assemble proxy tiers in escalation order: free → datacenter → residential.
|
||||||
|
|
||||||
Used as the residential/reliable fallback tier when the primary tier fails.
|
Tier 1 (free): fetched from WEBSHARE_DOWNLOAD_URL if set.
|
||||||
Format: http://user:pass@host:port or socks5://host:port
|
Tier 2 (datacenter): PROXY_URLS_DATACENTER (comma-separated).
|
||||||
|
Tier 3 (residential): PROXY_URLS_RESIDENTIAL (comma-separated).
|
||||||
|
|
||||||
|
Empty tiers are omitted. Returns [] if no proxies configured anywhere.
|
||||||
"""
|
"""
|
||||||
raw = os.environ.get("PROXY_URLS_FALLBACK", "")
|
tiers: list[list[str]] = []
|
||||||
|
|
||||||
|
webshare_url = os.environ.get("WEBSHARE_DOWNLOAD_URL", "").strip()
|
||||||
|
if webshare_url:
|
||||||
|
free_proxies = fetch_webshare_proxies(webshare_url)
|
||||||
|
if free_proxies:
|
||||||
|
tiers.append(free_proxies)
|
||||||
|
|
||||||
|
for var in ("PROXY_URLS_DATACENTER", "PROXY_URLS_RESIDENTIAL"):
|
||||||
|
raw = os.environ.get(var, "")
|
||||||
urls = [u.strip() for u in raw.split(",") if u.strip()]
|
urls = [u.strip() for u in raw.split(",") if u.strip()]
|
||||||
return urls
|
if urls:
|
||||||
|
tiers.append(urls)
|
||||||
|
|
||||||
|
return tiers
|
||||||
|
|
||||||
|
|
||||||
def make_round_robin_cycler(proxy_urls: list[str]):
|
def make_round_robin_cycler(proxy_urls: list[str]):
|
||||||
@@ -78,83 +134,96 @@ def make_sticky_selector(proxy_urls: list[str]):
|
|||||||
return select_proxy
|
return select_proxy
|
||||||
|
|
||||||
|
|
||||||
def make_tiered_cycler(
|
def make_tiered_cycler(tiers: list[list[str]], threshold: int) -> dict:
|
||||||
primary_urls: list[str],
|
"""Thread-safe N-tier proxy cycler with circuit breaker.
|
||||||
fallback_urls: list[str],
|
|
||||||
threshold: int,
|
|
||||||
) -> dict:
|
|
||||||
"""Thread-safe tiered proxy cycler with circuit breaker.
|
|
||||||
|
|
||||||
Uses primary_urls until consecutive failures >= threshold, then switches
|
Uses tiers[0] until consecutive failures >= threshold, then escalates
|
||||||
permanently to fallback_urls for the rest of the run. No auto-recovery —
|
to tiers[1], then tiers[2], etc. Once all tiers are exhausted,
|
||||||
once the circuit opens it stays open to avoid flapping.
|
is_exhausted() returns True and next_proxy() returns None.
|
||||||
|
|
||||||
|
Failure counter resets on each escalation — the new tier gets a fresh start.
|
||||||
|
Once exhausted, further record_failure() calls are no-ops.
|
||||||
|
|
||||||
Returns a dict of callables:
|
Returns a dict of callables:
|
||||||
next_proxy() -> str | None — returns URL from the active tier
|
next_proxy() -> str | None — URL from the active tier, or None
|
||||||
record_success() — resets consecutive failure counter
|
record_success() -> None — resets consecutive failure counter
|
||||||
record_failure() -> bool — increments counter; True if circuit just opened
|
record_failure() -> bool — True if just escalated to next tier
|
||||||
is_fallback_active() -> bool — whether fallback tier is currently active
|
is_exhausted() -> bool — True if all tiers exhausted
|
||||||
|
active_tier_index() -> int — 0-based index of current tier
|
||||||
|
tier_count() -> int — total number of tiers
|
||||||
|
|
||||||
If primary_urls is empty: always returns from fallback_urls (no circuit breaker needed).
|
Edge cases:
|
||||||
If both are empty: next_proxy() always returns None.
|
Empty tiers list: next_proxy() always returns None, is_exhausted() True.
|
||||||
|
Single tier: behaves like the primary-only case, is_exhausted() after threshold.
|
||||||
"""
|
"""
|
||||||
assert threshold > 0, f"threshold must be positive, got {threshold}"
|
assert threshold > 0, f"threshold must be positive, got {threshold}"
|
||||||
|
assert isinstance(tiers, list), f"tiers must be a list, got {type(tiers)}"
|
||||||
|
|
||||||
lock = threading.Lock()
|
lock = threading.Lock()
|
||||||
|
cycles = [itertools.cycle(t) for t in tiers]
|
||||||
state = {
|
state = {
|
||||||
|
"active_tier": 0,
|
||||||
"consecutive_failures": 0,
|
"consecutive_failures": 0,
|
||||||
"fallback_active": False,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
primary_cycle = itertools.cycle(primary_urls) if primary_urls else None
|
|
||||||
fallback_cycle = itertools.cycle(fallback_urls) if fallback_urls else None
|
|
||||||
|
|
||||||
# No primary proxies — skip circuit breaker, use fallback directly
|
|
||||||
if not primary_urls:
|
|
||||||
state["fallback_active"] = True
|
|
||||||
|
|
||||||
def next_proxy() -> str | None:
|
def next_proxy() -> str | None:
|
||||||
with lock:
|
with lock:
|
||||||
if state["fallback_active"]:
|
idx = state["active_tier"]
|
||||||
return next(fallback_cycle) if fallback_cycle else None
|
if idx >= len(cycles):
|
||||||
return next(primary_cycle) if primary_cycle else None
|
return None
|
||||||
|
return next(cycles[idx])
|
||||||
|
|
||||||
def record_success() -> None:
|
def record_success() -> None:
|
||||||
with lock:
|
with lock:
|
||||||
state["consecutive_failures"] = 0
|
state["consecutive_failures"] = 0
|
||||||
|
|
||||||
def record_failure() -> bool:
|
def record_failure() -> bool:
|
||||||
"""Increment failure counter. Returns True if circuit just opened."""
|
"""Increment failure counter. Returns True if just escalated to next tier."""
|
||||||
with lock:
|
with lock:
|
||||||
if state["fallback_active"]:
|
idx = state["active_tier"]
|
||||||
# Already on fallback — don't trip the circuit again
|
if idx >= len(tiers):
|
||||||
|
# Already exhausted — no-op
|
||||||
return False
|
return False
|
||||||
state["consecutive_failures"] += 1
|
state["consecutive_failures"] += 1
|
||||||
if state["consecutive_failures"] >= threshold:
|
if state["consecutive_failures"] < threshold:
|
||||||
state["fallback_active"] = True
|
return False
|
||||||
if fallback_urls:
|
# Threshold reached — escalate
|
||||||
|
state["consecutive_failures"] = 0
|
||||||
|
state["active_tier"] += 1
|
||||||
|
new_idx = state["active_tier"]
|
||||||
|
if new_idx < len(tiers):
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"Circuit open after %d consecutive failures — "
|
"Circuit open after %d consecutive failures — "
|
||||||
"switching to fallback residential proxies",
|
"escalating to proxy tier %d/%d",
|
||||||
state["consecutive_failures"],
|
threshold,
|
||||||
|
new_idx + 1,
|
||||||
|
len(tiers),
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
logger.error(
|
logger.error(
|
||||||
"Circuit open after %d consecutive failures — "
|
"All %d proxy tier(s) exhausted after %d consecutive failures — "
|
||||||
"no fallback configured, aborting run",
|
"no more fallbacks",
|
||||||
state["consecutive_failures"],
|
len(tiers),
|
||||||
|
threshold,
|
||||||
)
|
)
|
||||||
return True
|
return True
|
||||||
return False
|
|
||||||
|
|
||||||
def is_fallback_active() -> bool:
|
def is_exhausted() -> bool:
|
||||||
with lock:
|
with lock:
|
||||||
return state["fallback_active"]
|
return state["active_tier"] >= len(tiers)
|
||||||
|
|
||||||
|
def active_tier_index() -> int:
|
||||||
|
with lock:
|
||||||
|
return state["active_tier"]
|
||||||
|
|
||||||
|
def tier_count() -> int:
|
||||||
|
return len(tiers)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"next_proxy": next_proxy,
|
"next_proxy": next_proxy,
|
||||||
"record_success": record_success,
|
"record_success": record_success,
|
||||||
"record_failure": record_failure,
|
"record_failure": record_failure,
|
||||||
"is_fallback_active": is_fallback_active,
|
"is_exhausted": is_exhausted,
|
||||||
|
"active_tier_index": active_tier_index,
|
||||||
|
"tier_count": tier_count,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ schedule = "monthly"
|
|||||||
|
|
||||||
[playtomic_tenants]
|
[playtomic_tenants]
|
||||||
module = "padelnomics_extract.playtomic_tenants"
|
module = "padelnomics_extract.playtomic_tenants"
|
||||||
schedule = "weekly"
|
schedule = "daily"
|
||||||
|
|
||||||
[playtomic_availability]
|
[playtomic_availability]
|
||||||
module = "padelnomics_extract.playtomic_availability"
|
module = "padelnomics_extract.playtomic_availability"
|
||||||
|
|||||||
@@ -192,9 +192,9 @@ def run_workflow(conn, workflow: dict) -> None:
|
|||||||
entry_fn = getattr(module, entry_name)
|
entry_fn = getattr(module, entry_name)
|
||||||
entry_fn()
|
entry_fn()
|
||||||
logger.info("Workflow %s completed successfully", workflow["name"])
|
logger.info("Workflow %s completed successfully", workflow["name"])
|
||||||
except Exception:
|
except Exception as exc:
|
||||||
logger.exception("Workflow %s failed", workflow["name"])
|
logger.exception("Workflow %s failed", workflow["name"])
|
||||||
send_alert(f"Workflow '{workflow['name']}' failed")
|
send_alert(f"[extract] {type(exc).__name__}: {str(exc)[:100]}")
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|
||||||
@@ -233,8 +233,8 @@ def run_due_workflows(conn, workflows: list[dict]) -> bool:
|
|||||||
# Transform + Export + Deploy
|
# Transform + Export + Deploy
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> bool:
|
def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> tuple[bool, str]:
|
||||||
"""Run a shell command. Returns True on success."""
|
"""Run a shell command. Returns (success, error_snippet)."""
|
||||||
logger.info("Shell: %s", cmd)
|
logger.info("Shell: %s", cmd)
|
||||||
result = subprocess.run(
|
result = subprocess.run(
|
||||||
cmd, shell=True, capture_output=True, text=True, timeout=timeout_seconds
|
cmd, shell=True, capture_output=True, text=True, timeout=timeout_seconds
|
||||||
@@ -242,29 +242,31 @@ def run_shell(cmd: str, timeout_seconds: int = SUBPROCESS_TIMEOUT_SECONDS) -> bo
|
|||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
logger.error("Shell failed (rc=%d): %s\nstdout: %s\nstderr: %s",
|
logger.error("Shell failed (rc=%d): %s\nstdout: %s\nstderr: %s",
|
||||||
result.returncode, cmd, result.stdout[-500:], result.stderr[-500:])
|
result.returncode, cmd, result.stdout[-500:], result.stderr[-500:])
|
||||||
return False
|
raw = (result.stderr or result.stdout).strip()
|
||||||
return True
|
snippet = next((ln.strip() for ln in raw.splitlines() if ln.strip()), raw)[:120]
|
||||||
|
return False, snippet
|
||||||
|
return True, ""
|
||||||
|
|
||||||
|
|
||||||
def run_transform() -> None:
|
def run_transform() -> None:
|
||||||
"""Run SQLMesh — it evaluates model staleness internally."""
|
"""Run SQLMesh — it evaluates model staleness internally."""
|
||||||
logger.info("Running SQLMesh transform")
|
logger.info("Running SQLMesh transform")
|
||||||
ok = run_shell(
|
ok, err = run_shell(
|
||||||
"uv run sqlmesh -p transform/sqlmesh_padelnomics plan prod --auto-apply",
|
"uv run sqlmesh -p transform/sqlmesh_padelnomics plan prod --auto-apply",
|
||||||
)
|
)
|
||||||
if not ok:
|
if not ok:
|
||||||
send_alert("SQLMesh transform failed")
|
send_alert(f"[transform] {err}")
|
||||||
|
|
||||||
|
|
||||||
def run_export() -> None:
|
def run_export() -> None:
|
||||||
"""Export serving tables to analytics.duckdb."""
|
"""Export serving tables to analytics.duckdb."""
|
||||||
logger.info("Exporting serving tables")
|
logger.info("Exporting serving tables")
|
||||||
ok = run_shell(
|
ok, err = run_shell(
|
||||||
f"DUCKDB_PATH={DUCKDB_PATH} SERVING_DUCKDB_PATH={SERVING_DUCKDB_PATH} "
|
f"DUCKDB_PATH={DUCKDB_PATH} SERVING_DUCKDB_PATH={SERVING_DUCKDB_PATH} "
|
||||||
f"uv run python src/padelnomics/export_serving.py"
|
f"uv run python src/padelnomics/export_serving.py"
|
||||||
)
|
)
|
||||||
if not ok:
|
if not ok:
|
||||||
send_alert("Serving export failed")
|
send_alert(f"[export] {err}")
|
||||||
|
|
||||||
|
|
||||||
def web_code_changed() -> bool:
|
def web_code_changed() -> bool:
|
||||||
@@ -317,6 +319,7 @@ def git_pull_and_sync() -> None:
|
|||||||
|
|
||||||
logger.info("New tag %s available (current: %s) — deploying", latest, current)
|
logger.info("New tag %s available (current: %s) — deploying", latest, current)
|
||||||
run_shell(f"git checkout --detach {latest}")
|
run_shell(f"git checkout --detach {latest}")
|
||||||
|
run_shell("sops --input-type dotenv --output-type dotenv -d .env.prod.sops > .env")
|
||||||
run_shell("uv sync --all-packages")
|
run_shell("uv sync --all-packages")
|
||||||
|
|
||||||
|
|
||||||
@@ -365,11 +368,11 @@ def tick() -> None:
|
|||||||
# Deploy web app if code changed
|
# Deploy web app if code changed
|
||||||
if os.getenv("SUPERVISOR_GIT_PULL") and web_code_changed():
|
if os.getenv("SUPERVISOR_GIT_PULL") and web_code_changed():
|
||||||
logger.info("Web code changed — deploying")
|
logger.info("Web code changed — deploying")
|
||||||
ok = run_shell("./deploy.sh")
|
ok, err = run_shell("./deploy.sh")
|
||||||
if ok:
|
if ok:
|
||||||
send_alert("Deploy succeeded")
|
send_alert("[deploy] ok")
|
||||||
else:
|
else:
|
||||||
send_alert("Deploy FAILED — check journalctl -u padelnomics-supervisor")
|
send_alert(f"[deploy] failed: {err}")
|
||||||
finally:
|
finally:
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
@@ -386,9 +389,9 @@ def supervisor_loop() -> None:
|
|||||||
except KeyboardInterrupt:
|
except KeyboardInterrupt:
|
||||||
logger.info("Supervisor stopped (KeyboardInterrupt)")
|
logger.info("Supervisor stopped (KeyboardInterrupt)")
|
||||||
break
|
break
|
||||||
except Exception:
|
except Exception as exc:
|
||||||
logger.exception("Supervisor tick failed — backing off %ds", BACKOFF_SECONDS)
|
logger.exception("Supervisor tick failed — backing off %ds", BACKOFF_SECONDS)
|
||||||
send_alert("Supervisor tick failed")
|
send_alert(f"[supervisor] {type(exc).__name__}: {str(exc)[:100]}")
|
||||||
time.sleep(BACKOFF_SECONDS)
|
time.sleep(BACKOFF_SECONDS)
|
||||||
else:
|
else:
|
||||||
time.sleep(TICK_INTERVAL_SECONDS)
|
time.sleep(TICK_INTERVAL_SECONDS)
|
||||||
|
|||||||
@@ -2,12 +2,9 @@
|
|||||||
-- Used as a "racket sport culture" signal in the opportunity score:
|
-- Used as a "racket sport culture" signal in the opportunity score:
|
||||||
-- areas with high tennis court density are prime padel adoption markets.
|
-- areas with high tennis court density are prime padel adoption markets.
|
||||||
--
|
--
|
||||||
-- Supports two landing formats (UNION ALL during migration):
|
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.jsonl.gz
|
||||||
-- New: courts.jsonl.gz — one OSM element per line; nodes have lat/lon directly,
|
-- Format: one OSM element per line; nodes have lat/lon directly,
|
||||||
-- ways/relations have center.lat/center.lon (Overpass out center)
|
-- ways/relations have center.lat/center.lon (Overpass out center)
|
||||||
-- Old: courts.json.gz — {"elements": [...]} blob (UNNEST required)
|
|
||||||
--
|
|
||||||
-- Source: data/landing/overpass_tennis/{year}/{month}/courts.{jsonl,json}.gz
|
|
||||||
|
|
||||||
MODEL (
|
MODEL (
|
||||||
name staging.stg_tennis_courts,
|
name staging.stg_tennis_courts,
|
||||||
@@ -17,8 +14,7 @@ MODEL (
|
|||||||
);
|
);
|
||||||
|
|
||||||
WITH
|
WITH
|
||||||
-- New format: one OSM element per JSONL line
|
parsed AS (
|
||||||
jsonl_elements AS (
|
|
||||||
SELECT
|
SELECT
|
||||||
type AS osm_type,
|
type AS osm_type,
|
||||||
TRY_CAST(id AS BIGINT) AS osm_id,
|
TRY_CAST(id AS BIGINT) AS osm_id,
|
||||||
@@ -47,33 +43,6 @@ jsonl_elements AS (
|
|||||||
)
|
)
|
||||||
WHERE type IS NOT NULL
|
WHERE type IS NOT NULL
|
||||||
),
|
),
|
||||||
-- Old format: {"elements": [...]} blob — kept for transition
|
|
||||||
blob_elements AS (
|
|
||||||
SELECT
|
|
||||||
elem ->> 'type' AS osm_type,
|
|
||||||
(elem ->> 'id')::BIGINT AS osm_id,
|
|
||||||
TRY_CAST(elem ->> 'lat' AS DOUBLE) AS lat,
|
|
||||||
TRY_CAST(elem ->> 'lon' AS DOUBLE) AS lon,
|
|
||||||
elem -> 'tags' ->> 'name' AS name,
|
|
||||||
elem -> 'tags' ->> 'addr:country' AS country_code,
|
|
||||||
elem -> 'tags' ->> 'addr:city' AS city_tag,
|
|
||||||
filename AS source_file,
|
|
||||||
CURRENT_DATE AS extracted_date
|
|
||||||
FROM (
|
|
||||||
SELECT UNNEST(elements) AS elem, filename
|
|
||||||
FROM read_json(
|
|
||||||
@LANDING_DIR || '/overpass_tennis/*/*/courts.json.gz',
|
|
||||||
format = 'auto',
|
|
||||||
filename = true
|
|
||||||
)
|
|
||||||
)
|
|
||||||
WHERE (elem ->> 'type') IS NOT NULL
|
|
||||||
),
|
|
||||||
parsed AS (
|
|
||||||
SELECT * FROM jsonl_elements
|
|
||||||
UNION ALL
|
|
||||||
SELECT * FROM blob_elements
|
|
||||||
),
|
|
||||||
deduped AS (
|
deduped AS (
|
||||||
SELECT *,
|
SELECT *,
|
||||||
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
|
ROW_NUMBER() OVER (PARTITION BY osm_id ORDER BY extracted_date DESC) AS rn
|
||||||
|
|||||||
@@ -24,9 +24,11 @@ sup = _ilu.module_from_spec(_spec)
|
|||||||
_spec.loader.exec_module(sup)
|
_spec.loader.exec_module(sup)
|
||||||
|
|
||||||
from padelnomics_extract.proxy import ( # noqa: E402
|
from padelnomics_extract.proxy import ( # noqa: E402
|
||||||
load_proxy_urls,
|
fetch_webshare_proxies,
|
||||||
|
load_proxy_tiers,
|
||||||
make_round_robin_cycler,
|
make_round_robin_cycler,
|
||||||
make_sticky_selector,
|
make_sticky_selector,
|
||||||
|
make_tiered_cycler,
|
||||||
)
|
)
|
||||||
|
|
||||||
# ── load_workflows ────────────────────────────────────────────────
|
# ── load_workflows ────────────────────────────────────────────────
|
||||||
@@ -198,28 +200,112 @@ class TestTopologicalWaves:
|
|||||||
# ── proxy.py ─────────────────────────────────────────────────────
|
# ── proxy.py ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
class TestLoadProxyUrls:
|
class TestFetchWebshareProxies:
|
||||||
def test_returns_empty_when_unset(self, monkeypatch):
|
def test_parses_ip_port_user_pass_format(self):
|
||||||
monkeypatch.delenv("PROXY_URLS", raising=False)
|
raw = "1.2.3.4:1080:user1:pass1\n5.6.7.8:1080:user2:pass2\n"
|
||||||
assert load_proxy_urls() == []
|
with patch("urllib.request.urlopen") as mock_open:
|
||||||
|
mock_resp = MagicMock()
|
||||||
|
mock_resp.read.return_value = raw.encode("utf-8")
|
||||||
|
mock_resp.__enter__ = lambda s: s
|
||||||
|
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||||
|
mock_open.return_value = mock_resp
|
||||||
|
urls = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||||
|
assert urls == [
|
||||||
|
"http://user1:pass1@1.2.3.4:1080",
|
||||||
|
"http://user2:pass2@5.6.7.8:1080",
|
||||||
|
]
|
||||||
|
|
||||||
def test_parses_comma_separated_urls(self, monkeypatch):
|
def test_network_error_returns_empty(self):
|
||||||
monkeypatch.setenv(
|
import urllib.error
|
||||||
"PROXY_URLS",
|
with patch("urllib.request.urlopen", side_effect=urllib.error.URLError("timeout")):
|
||||||
"http://p1:8080,http://p2:8080,http://p3:8080",
|
result = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||||
)
|
assert result == []
|
||||||
urls = load_proxy_urls()
|
|
||||||
assert urls == ["http://p1:8080", "http://p2:8080", "http://p3:8080"]
|
|
||||||
|
|
||||||
def test_strips_whitespace(self, monkeypatch):
|
def test_malformed_lines_are_skipped(self):
|
||||||
monkeypatch.setenv("PROXY_URLS", " http://p1:8080 , http://p2:8080 ")
|
raw = "bad_line\n1.2.3.4:1080:user:pass\nonly:three:parts\n"
|
||||||
urls = load_proxy_urls()
|
with patch("urllib.request.urlopen") as mock_open:
|
||||||
assert urls == ["http://p1:8080", "http://p2:8080"]
|
mock_resp = MagicMock()
|
||||||
|
mock_resp.read.return_value = raw.encode("utf-8")
|
||||||
|
mock_resp.__enter__ = lambda s: s
|
||||||
|
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||||
|
mock_open.return_value = mock_resp
|
||||||
|
urls = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||||
|
assert urls == ["http://user:pass@1.2.3.4:1080"]
|
||||||
|
|
||||||
def test_ignores_empty_segments(self, monkeypatch):
|
def test_max_proxies_respected(self):
|
||||||
monkeypatch.setenv("PROXY_URLS", "http://p1:8080,,http://p2:8080,")
|
lines = "\n".join(f"10.0.0.{i}:1080:u{i}:p{i}" for i in range(10))
|
||||||
urls = load_proxy_urls()
|
with patch("urllib.request.urlopen") as mock_open:
|
||||||
assert urls == ["http://p1:8080", "http://p2:8080"]
|
mock_resp = MagicMock()
|
||||||
|
mock_resp.read.return_value = lines.encode("utf-8")
|
||||||
|
mock_resp.__enter__ = lambda s: s
|
||||||
|
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||||
|
mock_open.return_value = mock_resp
|
||||||
|
urls = fetch_webshare_proxies("http://example.com/proxy-list", max_proxies=3)
|
||||||
|
assert len(urls) == 3
|
||||||
|
|
||||||
|
def test_empty_lines_skipped(self):
|
||||||
|
raw = "\n\n1.2.3.4:1080:user:pass\n\n"
|
||||||
|
with patch("urllib.request.urlopen") as mock_open:
|
||||||
|
mock_resp = MagicMock()
|
||||||
|
mock_resp.read.return_value = raw.encode("utf-8")
|
||||||
|
mock_resp.__enter__ = lambda s: s
|
||||||
|
mock_resp.__exit__ = MagicMock(return_value=False)
|
||||||
|
mock_open.return_value = mock_resp
|
||||||
|
urls = fetch_webshare_proxies("http://example.com/proxy-list")
|
||||||
|
assert urls == ["http://user:pass@1.2.3.4:1080"]
|
||||||
|
|
||||||
|
|
||||||
|
class TestLoadProxyTiers:
|
||||||
|
def _clear_proxy_env(self, monkeypatch):
|
||||||
|
for var in ("WEBSHARE_DOWNLOAD_URL", "PROXY_URLS_DATACENTER", "PROXY_URLS_RESIDENTIAL"):
|
||||||
|
monkeypatch.delenv(var, raising=False)
|
||||||
|
|
||||||
|
def test_returns_empty_when_all_unset(self, monkeypatch):
|
||||||
|
self._clear_proxy_env(monkeypatch)
|
||||||
|
assert load_proxy_tiers() == []
|
||||||
|
|
||||||
|
def test_single_datacenter_tier(self, monkeypatch):
|
||||||
|
self._clear_proxy_env(monkeypatch)
|
||||||
|
monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080,http://dc2:8080")
|
||||||
|
tiers = load_proxy_tiers()
|
||||||
|
assert len(tiers) == 1
|
||||||
|
assert tiers[0] == ["http://dc1:8080", "http://dc2:8080"]
|
||||||
|
|
||||||
|
def test_residential_only(self, monkeypatch):
|
||||||
|
self._clear_proxy_env(monkeypatch)
|
||||||
|
monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
|
||||||
|
tiers = load_proxy_tiers()
|
||||||
|
assert len(tiers) == 1
|
||||||
|
assert tiers[0] == ["http://res1:8080"]
|
||||||
|
|
||||||
|
def test_empty_tiers_skipped(self, monkeypatch):
|
||||||
|
self._clear_proxy_env(monkeypatch)
|
||||||
|
monkeypatch.setenv("PROXY_URLS_DATACENTER", "")
|
||||||
|
monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
|
||||||
|
tiers = load_proxy_tiers()
|
||||||
|
assert len(tiers) == 1
|
||||||
|
assert tiers[0] == ["http://res1:8080"]
|
||||||
|
|
||||||
|
def test_three_tiers_correct_order(self, monkeypatch):
|
||||||
|
self._clear_proxy_env(monkeypatch)
|
||||||
|
with patch("padelnomics_extract.proxy.fetch_webshare_proxies", return_value=["http://user:pass@1.2.3.4:1080"]):
|
||||||
|
monkeypatch.setenv("WEBSHARE_DOWNLOAD_URL", "http://example.com/list")
|
||||||
|
monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080")
|
||||||
|
monkeypatch.setenv("PROXY_URLS_RESIDENTIAL", "http://res1:8080")
|
||||||
|
tiers = load_proxy_tiers()
|
||||||
|
assert len(tiers) == 3
|
||||||
|
assert tiers[0] == ["http://user:pass@1.2.3.4:1080"] # free
|
||||||
|
assert tiers[1] == ["http://dc1:8080"] # datacenter
|
||||||
|
assert tiers[2] == ["http://res1:8080"] # residential
|
||||||
|
|
||||||
|
def test_webshare_fetch_failure_skips_tier(self, monkeypatch):
|
||||||
|
self._clear_proxy_env(monkeypatch)
|
||||||
|
with patch("padelnomics_extract.proxy.fetch_webshare_proxies", return_value=[]):
|
||||||
|
monkeypatch.setenv("WEBSHARE_DOWNLOAD_URL", "http://example.com/list")
|
||||||
|
monkeypatch.setenv("PROXY_URLS_DATACENTER", "http://dc1:8080")
|
||||||
|
tiers = load_proxy_tiers()
|
||||||
|
assert len(tiers) == 1
|
||||||
|
assert tiers[0] == ["http://dc1:8080"]
|
||||||
|
|
||||||
|
|
||||||
class TestRoundRobinCycler:
|
class TestRoundRobinCycler:
|
||||||
@@ -279,3 +365,138 @@ class TestStickySelectorProxy:
|
|||||||
fn = make_sticky_selector(urls)
|
fn = make_sticky_selector(urls)
|
||||||
for i in range(20):
|
for i in range(20):
|
||||||
assert fn(f"key_{i}") in urls
|
assert fn(f"key_{i}") in urls
|
||||||
|
|
||||||
|
|
||||||
|
class TestTieredCyclerNTier:
|
||||||
|
def test_starts_on_first_tier(self):
|
||||||
|
tiers = [["http://t0a", "http://t0b"], ["http://t1a"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=3)
|
||||||
|
assert cycler["active_tier_index"]() == 0
|
||||||
|
assert not cycler["is_exhausted"]()
|
||||||
|
assert cycler["next_proxy"]() in tiers[0]
|
||||||
|
|
||||||
|
def test_escalates_after_threshold(self):
|
||||||
|
tiers = [["http://t0"], ["http://t1"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=3)
|
||||||
|
# Two failures — stays on tier 0
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 0
|
||||||
|
# Third failure — escalates
|
||||||
|
escalated = cycler["record_failure"]()
|
||||||
|
assert escalated is True
|
||||||
|
assert cycler["active_tier_index"]() == 1
|
||||||
|
assert cycler["next_proxy"]() == "http://t1"
|
||||||
|
|
||||||
|
def test_escalates_through_all_tiers(self):
|
||||||
|
tiers = [["http://t0"], ["http://t1"], ["http://t2"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=2)
|
||||||
|
# Exhaust tier 0
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 1
|
||||||
|
# Exhaust tier 1
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 2
|
||||||
|
# Exhaust tier 2
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["is_exhausted"]()
|
||||||
|
assert cycler["next_proxy"]() is None
|
||||||
|
|
||||||
|
def test_success_resets_counter(self):
|
||||||
|
tiers = [["http://t0"], ["http://t1"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=3)
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_success"]()
|
||||||
|
# Counter reset — need threshold more failures to escalate
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 0 # still on tier 0
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 1 # now escalated
|
||||||
|
|
||||||
|
def test_counter_resets_on_escalation(self):
|
||||||
|
"""After escalating, failure counter resets so new tier gets a fresh start."""
|
||||||
|
tiers = [["http://t0"], ["http://t1"], ["http://t2"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=2)
|
||||||
|
# Exhaust tier 0
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 1
|
||||||
|
# One failure on tier 1 — should NOT escalate yet (counter reset)
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 1
|
||||||
|
# Second failure on tier 1 — escalates to tier 2
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["active_tier_index"]() == 2
|
||||||
|
|
||||||
|
def test_is_exhausted_false_when_tiers_remain(self):
|
||||||
|
tiers = [["http://t0"], ["http://t1"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=1)
|
||||||
|
assert not cycler["is_exhausted"]()
|
||||||
|
cycler["record_failure"]() # escalates to tier 1
|
||||||
|
assert not cycler["is_exhausted"]()
|
||||||
|
|
||||||
|
def test_is_exhausted_true_after_all_tiers_fail(self):
|
||||||
|
tiers = [["http://t0"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=1)
|
||||||
|
assert not cycler["is_exhausted"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
assert cycler["is_exhausted"]()
|
||||||
|
assert cycler["next_proxy"]() is None
|
||||||
|
|
||||||
|
def test_empty_tiers_immediately_exhausted(self):
|
||||||
|
cycler = make_tiered_cycler([], threshold=3)
|
||||||
|
assert cycler["is_exhausted"]()
|
||||||
|
assert cycler["next_proxy"]() is None
|
||||||
|
assert cycler["tier_count"]() == 0
|
||||||
|
|
||||||
|
def test_single_tier_cycles_within_tier(self):
|
||||||
|
tiers = [["http://p1", "http://p2", "http://p3"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=10)
|
||||||
|
results = [cycler["next_proxy"]() for _ in range(6)]
|
||||||
|
assert results == ["http://p1", "http://p2", "http://p3"] * 2
|
||||||
|
|
||||||
|
def test_tier_count_reflects_input(self):
|
||||||
|
assert make_tiered_cycler([], threshold=1)["tier_count"]() == 0
|
||||||
|
assert make_tiered_cycler([["a"]], threshold=1)["tier_count"]() == 1
|
||||||
|
assert make_tiered_cycler([["a"], ["b"], ["c"]], threshold=1)["tier_count"]() == 3
|
||||||
|
|
||||||
|
def test_record_failure_noop_when_exhausted(self):
|
||||||
|
tiers = [["http://t0"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=1)
|
||||||
|
cycler["record_failure"]() # exhausts
|
||||||
|
assert cycler["is_exhausted"]()
|
||||||
|
# Further failures are no-ops, not exceptions
|
||||||
|
result = cycler["record_failure"]()
|
||||||
|
assert result is False
|
||||||
|
assert cycler["is_exhausted"]()
|
||||||
|
|
||||||
|
def test_thread_safety(self):
|
||||||
|
"""Concurrent next_proxy and record calls do not raise or corrupt state."""
|
||||||
|
import threading
|
||||||
|
tiers = [["http://t0a", "http://t0b"], ["http://t1a", "http://t1b"]]
|
||||||
|
cycler = make_tiered_cycler(tiers, threshold=5)
|
||||||
|
errors = []
|
||||||
|
lock = threading.Lock()
|
||||||
|
|
||||||
|
def worker():
|
||||||
|
try:
|
||||||
|
for _ in range(20):
|
||||||
|
cycler["next_proxy"]()
|
||||||
|
cycler["record_failure"]()
|
||||||
|
cycler["record_success"]()
|
||||||
|
except Exception as e:
|
||||||
|
with lock:
|
||||||
|
errors.append(e)
|
||||||
|
|
||||||
|
threads = [threading.Thread(target=worker) for _ in range(8)]
|
||||||
|
for t in threads:
|
||||||
|
t.start()
|
||||||
|
for t in threads:
|
||||||
|
t.join()
|
||||||
|
|
||||||
|
assert errors == [], f"Thread safety errors: {errors}"
|
||||||
|
|||||||
Reference in New Issue
Block a user