feat: copier update v0.9.0 — extraction docs, state tracking, architecture guides

Sync template from 29ac25b → v0.9.0 (29 template commits). Due to
template's _subdirectory migration, new files were manually rendered
rather than auto-merged by copier.

New files:
- .claude/CLAUDE.md + coding_philosophy.md (agent instructions)
- extract/utils.py: SQLite state tracking for extraction runs
- extract/transform READMEs: architecture & pattern documentation
- infra/supervisor: systemd service + orchestration script
- Per-layer model READMEs (raw, staging, foundation, serving)

Also fixes copier-answers.yml (adds 4 feature toggles, removes stale
payment_provider key) and scopes CLAUDE.md gitignore to root only.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 15:44:48 +01:00
parent b76e87a0b6
commit 18ee24818b
14 changed files with 1084 additions and 2 deletions

View File

@@ -0,0 +1,129 @@
"""Extraction utilities: SQLite state tracking, file I/O helpers.
These are inline equivalents of the extract_core library used in larger
multi-extractor pipelines. For a single-package project they live here;
if you add multiple data sources, extract them to a shared workspace package.
"""
import gzip
import hashlib
import sqlite3
from pathlib import Path
# ---------------------------------------------------------------------------
# State tracking (SQLite — transactional, stdlib, no extra dependency)
# ---------------------------------------------------------------------------
_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS extraction_runs (
run_id INTEGER PRIMARY KEY AUTOINCREMENT,
extractor TEXT NOT NULL,
started_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
files_written INTEGER DEFAULT 0,
files_skipped INTEGER DEFAULT 0,
bytes_written INTEGER DEFAULT 0,
cursor_value TEXT,
error_message TEXT
)
"""
def open_state_db(landing_dir: str | Path) -> sqlite3.Connection:
"""Open (or create) .state.sqlite inside landing_dir.
WAL mode allows concurrent reads while a run is in progress.
Caller is responsible for conn.close().
"""
db_path = Path(landing_dir) / ".state.sqlite"
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute(_CREATE_TABLE_SQL)
conn.commit()
return conn
def start_run(conn: sqlite3.Connection, extractor: str) -> int:
    """Record the start of an extraction run.

    Inserts a row with status ``'running'`` for *extractor* and returns
    the autoincremented run_id so the caller can finalize it later.
    """
    sql = "INSERT INTO extraction_runs (extractor, status) VALUES (?, 'running')"
    inserted = conn.execute(sql, (extractor,))
    conn.commit()
    return inserted.lastrowid
def end_run(
conn: sqlite3.Connection,
run_id: int,
*,
status: str,
files_written: int = 0,
files_skipped: int = 0,
bytes_written: int = 0,
cursor_value: str | None = None,
error_message: str | None = None,
) -> None:
"""Update the run row to its final state."""
assert status in ("success", "failed")
conn.execute(
"""
UPDATE extraction_runs
SET finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
status = ?,
files_written = ?,
files_skipped = ?,
bytes_written = ?,
cursor_value = ?,
error_message = ?
WHERE run_id = ?
""",
(status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
)
conn.commit()
def get_last_cursor(conn: sqlite3.Connection, extractor: str) -> str | None:
"""Return the cursor_value from the most recent successful run, or None."""
row = conn.execute(
"""
SELECT cursor_value FROM extraction_runs
WHERE extractor = ? AND status = 'success' AND cursor_value IS NOT NULL
ORDER BY run_id DESC LIMIT 1
""",
(extractor,),
).fetchone()
return row["cursor_value"] if row else None
# ---------------------------------------------------------------------------
# File I/O helpers
# ---------------------------------------------------------------------------
def landing_path(landing_dir: str | Path, *parts: str) -> Path:
"""Return path to a subdirectory of landing_dir, creating it if absent."""
path = Path(landing_dir).joinpath(*parts)
path.mkdir(parents=True, exist_ok=True)
return path
def content_hash(data: bytes, prefix_bytes: int = 8) -> str:
    """SHA256 content fingerprint — used as idempotency key in filenames.

    Args:
        data: Raw payload to fingerprint; must be non-empty.
        prefix_bytes: Length of the returned prefix. NOTE(review): this
            counts hex *characters* of the digest, not bytes — the name is
            kept for backward compatibility.

    Raises:
        ValueError: If *data* is empty (an empty payload would collide on
            a single constant fingerprint).
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not data:
        raise ValueError("data must not be empty")
    return hashlib.sha256(data).hexdigest()[:prefix_bytes]
def write_gzip_atomic(path: Path, data: bytes) -> int:
    """Gzip-compress *data* and write it to *path* atomically.

    The payload is written to a ``.tmp`` sibling first and then moved over
    *path* in one rename, so concurrent readers never observe a partial
    file.

    Args:
        path: Final destination path.
        data: Uncompressed payload; must be non-empty.

    Returns:
        Number of compressed bytes written.

    Raises:
        ValueError: If *data* is empty.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not data:
        raise ValueError("data must not be empty")
    compressed = gzip.compress(data)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        tmp.write_bytes(compressed)
        # Path.replace() overwrites an existing target atomically on POSIX
        # and, unlike rename(), also succeeds on Windows when path exists.
        tmp.replace(path)
    except BaseException:
        # Don't leak a half-written .tmp sibling on failure.
        tmp.unlink(missing_ok=True)
        raise
    return len(compressed)