"""
Run a read-only SQL query against the production DuckDB (analytics or lakehouse).

Usage:
    uv run python scripts/prod_query.py "SELECT COUNT(*) FROM serving.location_profiles"
    uv run python scripts/prod_query.py --db lakehouse "SELECT * FROM foundation.dim_countries LIMIT 5"
    echo "SELECT 1" | uv run python scripts/prod_query.py --stdin

The script SSHes to the prod server, runs the query via Python/DuckDB, and prints
tab-separated results with a header row. Read-only: DuckDB is opened with read_only=True.

For lakehouse queries, automatically aliases the catalog as "local" so SQLMesh views work.

Designed for Claude Code to call without nested shell escaping nightmares.
"""

import argparse
import base64
import subprocess
import sys

# SSH alias for the prod box and the service account queries run under there.
SSH_HOST = "hetzner_root"
SSH_USER = "padelnomics_service"

# Production DuckDB files selectable via --db; keys double as the CLI choices.
DB_PATHS = {
    "analytics": "/opt/padelnomics/data/analytics.duckdb",
    "lakehouse": "/opt/padelnomics/data/lakehouse.duckdb",
}

# Default row cap for results and the remote execution time budget (seconds).
MAX_ROWS = 500
TIMEOUT_SECONDS = 30

# Mutation keywords blocked (defense in depth — DB is read_only anyway)
BLOCKED_KEYWORDS = {
    "CREATE",
    "DROP",
    "ALTER",
    "INSERT",
    "UPDATE",
    "DELETE",
    "ATTACH",
    "COPY",
    "EXPORT",
    "INSTALL",
    "LOAD",
}

# Remote Python script template. Receives SQL as base64 via {b64_sql}.
# Uses ATTACH + USE to alias the lakehouse catalog as "local" for SQLMesh view compat.
+REMOTE_SCRIPT = """\ +import duckdb, json, sys, base64 +db_path = "{db_path}" +sql = base64.b64decode("{b64_sql}").decode() +max_rows = {max_rows} +output_json = {output_json} +try: + if "lakehouse" in db_path: + con = duckdb.connect(":memory:") + con.execute(f"ATTACH '{db_path}' AS local (READ_ONLY)") + con.execute("USE local") + else: + con = duckdb.connect(db_path, read_only=True) + result = con.execute(sql) + cols = [d[0] for d in result.description] + rows = result.fetchmany(max_rows) + if output_json: + print(json.dumps({{"columns": cols, "rows": [list(r) for r in rows], "count": len(rows)}}, default=str)) + else: + print("\\t".join(cols)) + for row in rows: + print("\\t".join(str(v) if v is not None else "NULL" for v in row)) + if len(rows) == max_rows: + print(f"... truncated at {{max_rows}} rows", file=sys.stderr) +except Exception as e: + print(f"ERROR: {{e}}", file=sys.stderr) + sys.exit(1) +""" + + +def main(): + parser = argparse.ArgumentParser(description="Query prod DuckDB over SSH") + parser.add_argument("sql", nargs="?", help="SQL query to run") + parser.add_argument("--stdin", action="store_true", help="Read SQL from stdin") + parser.add_argument( + "--db", + choices=list(DB_PATHS.keys()), + default="analytics", + help="Which database (default: analytics)", + ) + parser.add_argument( + "--max-rows", type=int, default=MAX_ROWS, help=f"Max rows (default: {MAX_ROWS})" + ) + parser.add_argument("--json", action="store_true", help="Output JSON instead of TSV") + args = parser.parse_args() + + sql = args.sql + if args.stdin or sql is None: + sql = sys.stdin.read().strip() + if not sql: + print("ERROR: No SQL provided", file=sys.stderr) + sys.exit(1) + + sql_upper = sql.upper() + for kw in BLOCKED_KEYWORDS: + if kw in sql_upper: + print(f"ERROR: Blocked keyword '{kw}' in query", file=sys.stderr) + sys.exit(1) + + b64_sql = base64.b64encode(sql.encode()).decode() + remote_script = REMOTE_SCRIPT.format( + db_path=DB_PATHS[args.db], + b64_sql=b64_sql, + 
max_rows=args.max_rows, + output_json=args.json, + ) + + cmd = [ + "ssh", SSH_HOST, + f"sudo -u {SSH_USER} bash -lc 'cd /opt/padelnomics && uv run python3 -'", + ] + + result = subprocess.run( + cmd, + input=remote_script, + capture_output=True, + text=True, + timeout=TIMEOUT_SECONDS + 10, + ) + + if result.stdout: + print(result.stdout, end="") + if result.stderr: + print(result.stderr, end="", file=sys.stderr) + sys.exit(result.returncode) + + +if __name__ == "__main__": + main()