feat: copier update v0.9.0 — extraction docs, state tracking, architecture guides

Sync template from 29ac25b → v0.9.0 (29 template commits). Due to
template's _subdirectory migration, new files were manually rendered
rather than auto-merged by copier.

New files:
- .claude/CLAUDE.md + coding_philosophy.md (agent instructions)
- extract/utils.py: SQLite state tracking for extraction runs
- extract/transform READMEs: architecture & pattern documentation
- infra/supervisor: systemd service + orchestration script
- Per-layer model READMEs (raw, staging, foundation, serving)

Also fixes copier-answers.yml (adds 4 feature toggles, removes stale
payment_provider key) and scopes CLAUDE.md gitignore to root only.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Deeman
2026-02-22 15:44:48 +01:00
parent b76e87a0b6
commit 18ee24818b
14 changed files with 1084 additions and 2 deletions

View File

@@ -0,0 +1,129 @@
"""Extraction utilities: SQLite state tracking, file I/O helpers.
These are inline equivalents of the extract_core library used in larger
multi-extractor pipelines. For a single-package project they live here;
if you add multiple data sources, extract them to a shared workspace package.
"""
import gzip
import hashlib
import sqlite3
from pathlib import Path
# ---------------------------------------------------------------------------
# State tracking (SQLite — transactional, stdlib, no extra dependency)
# ---------------------------------------------------------------------------
_CREATE_TABLE_SQL = """
CREATE TABLE IF NOT EXISTS extraction_runs (
run_id INTEGER PRIMARY KEY AUTOINCREMENT,
extractor TEXT NOT NULL,
started_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now')),
finished_at TEXT,
status TEXT NOT NULL DEFAULT 'running',
files_written INTEGER DEFAULT 0,
files_skipped INTEGER DEFAULT 0,
bytes_written INTEGER DEFAULT 0,
cursor_value TEXT,
error_message TEXT
)
"""
def open_state_db(landing_dir: str | Path) -> sqlite3.Connection:
"""Open (or create) .state.sqlite inside landing_dir.
WAL mode allows concurrent reads while a run is in progress.
Caller is responsible for conn.close().
"""
db_path = Path(landing_dir) / ".state.sqlite"
db_path.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(db_path))
conn.row_factory = sqlite3.Row
conn.execute("PRAGMA journal_mode=WAL")
conn.execute(_CREATE_TABLE_SQL)
conn.commit()
return conn
def start_run(conn: sqlite3.Connection, extractor: str) -> int:
    """Record the start of an extraction run.

    Inserts a row with status ``'running'`` for *extractor* and returns
    the autoincremented run_id so the caller can finalize it later.
    """
    sql = "INSERT INTO extraction_runs (extractor, status) VALUES (?, 'running')"
    inserted = conn.execute(sql, (extractor,))
    conn.commit()
    return inserted.lastrowid
def end_run(
conn: sqlite3.Connection,
run_id: int,
*,
status: str,
files_written: int = 0,
files_skipped: int = 0,
bytes_written: int = 0,
cursor_value: str | None = None,
error_message: str | None = None,
) -> None:
"""Update the run row to its final state."""
assert status in ("success", "failed")
conn.execute(
"""
UPDATE extraction_runs
SET finished_at = strftime('%Y-%m-%dT%H:%M:%SZ', 'now'),
status = ?,
files_written = ?,
files_skipped = ?,
bytes_written = ?,
cursor_value = ?,
error_message = ?
WHERE run_id = ?
""",
(status, files_written, files_skipped, bytes_written, cursor_value, error_message, run_id),
)
conn.commit()
def get_last_cursor(conn: sqlite3.Connection, extractor: str) -> str | None:
"""Return the cursor_value from the most recent successful run, or None."""
row = conn.execute(
"""
SELECT cursor_value FROM extraction_runs
WHERE extractor = ? AND status = 'success' AND cursor_value IS NOT NULL
ORDER BY run_id DESC LIMIT 1
""",
(extractor,),
).fetchone()
return row["cursor_value"] if row else None
# ---------------------------------------------------------------------------
# File I/O helpers
# ---------------------------------------------------------------------------
def landing_path(landing_dir: str | Path, *parts: str) -> Path:
"""Return path to a subdirectory of landing_dir, creating it if absent."""
path = Path(landing_dir).joinpath(*parts)
path.mkdir(parents=True, exist_ok=True)
return path
def content_hash(data: bytes, prefix_bytes: int = 8) -> str:
    """SHA256 content fingerprint — used as idempotency key in filenames.

    Args:
        data: Raw payload to fingerprint; must be non-empty.
        prefix_bytes: Length of the returned prefix. NOTE(review): this
            counts hex *characters* of the digest, not bytes — the name is
            kept for backward compatibility.

    Raises:
        ValueError: If *data* is empty (an empty payload would collide on
            a single constant fingerprint).
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not data:
        raise ValueError("data must not be empty")
    return hashlib.sha256(data).hexdigest()[:prefix_bytes]
def write_gzip_atomic(path: Path, data: bytes) -> int:
    """Gzip-compress *data* and write it to *path* atomically.

    The payload is written to a ``.tmp`` sibling first and then moved over
    *path* in one rename, so concurrent readers never observe a partial
    file.

    Args:
        path: Final destination path.
        data: Uncompressed payload; must be non-empty.

    Returns:
        Number of compressed bytes written.

    Raises:
        ValueError: If *data* is empty.
    """
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not data:
        raise ValueError("data must not be empty")
    compressed = gzip.compress(data)
    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        tmp.write_bytes(compressed)
        # Path.replace() overwrites an existing target atomically on POSIX
        # and, unlike rename(), also succeeds on Windows when path exists.
        tmp.replace(path)
    except BaseException:
        # Don't leak a half-written .tmp sibling on failure.
        tmp.unlink(missing_ok=True)
        raise
    return len(compressed)