beanflows/web/scripts/seed_cms_coffee.py

#!/usr/bin/env python3
"""
seed_cms_coffee.py — Seed coffee commodity CMS article templates.

Creates:
  1. Article templates (Jinja2 body_template + URL/title patterns)
  2. Template data rows (one per country for the country overview,
     one per market year for the global summary), pulled from the
     DuckDB serving layer when available; placeholder rows otherwise.

Usage (from web/ directory):
    uv run python scripts/seed_cms_coffee.py [--db data/app.db] [--dry-run]

After running this, go to /admin/cms to bulk-generate the articles.
"""

import argparse
import datetime
import json
import os
import re
import sqlite3
import sys
import unicodedata
from pathlib import Path

# ── Config ────────────────────────────────────────────────────────────────────

# SQLite database used when --db is not given on the command line.
DB_DEFAULT = "data/app.db"
# Path to the DuckDB serving database; empty string when the variable is unset.
SERVING_DB = os.environ.get("SERVING_DUCKDB_PATH", "")


# ── Article templates ─────────────────────────────────────────────────────────

# Each entry describes one article template. All "*_pattern" values and
# "body_template" are Jinja2 source rendered against a template-data row:
#   - coffee-country-overview reads country_name, country_slug, rank,
#     latest_year, production_bags, exports_bags, domestic_consumption_bags,
#     ending_stocks_bags, production_trend, production_yoy_pct, data_vintage.
#   - coffee-global-market-year reads market_year, world_production_bags,
#     beginning_stocks_m, production_m, total_supply_m, consumption_m,
#     ending_stocks_m, stu_pct, balance, data_vintage.
# The "{%-" form on else/elif/endif strips the preceding newline and indent so
# the rendered sentence does not end with a stray space before the period.
TEMPLATES = [
    {
        "name": "coffee-country-overview",
        "url_pattern": "/coffee/{{ country_slug }}",
        "title_pattern": "{{ country_name }} Coffee Production & Trade — BeanFlows",
        "meta_description_pattern": (
            "USDA PSD supply/demand data for {{ country_name }} coffee: "
            "production, exports, imports, ending stocks, and market trends."
        ),
        "body_template": """\
<h2>{{ country_name }} Coffee Overview</h2>

<p>
  {{ country_name }} is {% if rank <= 5 %}one of the world's top coffee-producing nations
  {%- else %}a notable player in the global coffee market{% endif %}.
  In {{ latest_year }}, total production reached
  <strong>{{ "{:,}".format(production_bags|int) }} 60-kg bags</strong>.
</p>

<h3>Supply & Demand Snapshot ({{ latest_year }})</h3>
<table>
  <thead>
    <tr><th>Metric</th><th>Value (60-kg bags)</th></tr>
  </thead>
  <tbody>
    <tr><td>Production</td><td>{{ "{:,}".format(production_bags|int) }}</td></tr>
    <tr><td>Exports</td><td>{{ "{:,}".format(exports_bags|int) }}</td></tr>
    <tr><td>Domestic Consumption</td><td>{{ "{:,}".format(domestic_consumption_bags|int) }}</td></tr>
    <tr><td>Ending Stocks</td><td>{{ "{:,}".format(ending_stocks_bags|int) }}</td></tr>
  </tbody>
</table>

<h3>Production Trend</h3>
<p>
  Over the past decade, {{ country_name }}'s coffee output has shown
  {% if production_trend == 'up' %}a <strong>rising trend</strong>
  {%- elif production_trend == 'down' %}a <strong>declining trend</strong>
  {%- else %}relatively <strong>stable</strong> production
  {%- endif %}.
  Year-on-year change in {{ latest_year }}: <strong>{{ production_yoy_pct }}%</strong>.
</p>

<h3>Key Export Markets</h3>
<p>
  {{ country_name }} primarily exports to international commodity markets,
  with volumes settled against the ICE Coffee C futures contract (KC=F).
  Track live price data and warehouse stocks on the
  <a href="/dashboard/positioning">BeanFlows positioning dashboard</a>.
</p>

<p><em>Data source: USDA PSD Online, updated {{ data_vintage }}.</em></p>
""",
    },
    {
        "name": "coffee-global-market-year",
        "url_pattern": "/coffee/market/{{ market_year }}",
        "title_pattern": "Global Coffee Market {{ market_year }} — Supply, Demand & Stocks",
        "meta_description_pattern": (
            "Global coffee supply and demand balance for {{ market_year }}: "
            "USDA PSD production, consumption, trade, and ending stocks data."
        ),
        "body_template": """\
<h2>Global Coffee Market {{ market_year }}</h2>

<p>
  The <strong>{{ market_year }}</strong> global coffee marketing year ran from
  October {{ market_year|int - 1 }} through September {{ market_year }}.
  World production totalled
  <strong>{{ "{:,}".format(world_production_bags|int) }} 60-kg bags</strong>.
</p>

<h3>World Supply & Demand Balance</h3>
<table>
  <thead>
    <tr><th>Metric</th><th>Million 60-kg Bags</th></tr>
  </thead>
  <tbody>
    <tr><td>Opening Stocks</td><td>{{ "%.1f"|format(beginning_stocks_m|float) }}</td></tr>
    <tr><td>Production</td><td>{{ "%.1f"|format(production_m|float) }}</td></tr>
    <tr><td>Total Supply</td><td>{{ "%.1f"|format(total_supply_m|float) }}</td></tr>
    <tr><td>Consumption</td><td>{{ "%.1f"|format(consumption_m|float) }}</td></tr>
    <tr><td>Ending Stocks</td><td>{{ "%.1f"|format(ending_stocks_m|float) }}</td></tr>
    <tr><td>Stock-to-Use Ratio</td><td>{{ "%.1f"|format(stu_pct|float) }}%</td></tr>
  </tbody>
</table>

<h3>Supply/Demand Balance</h3>
<p>
  The {{ market_year }} marketing year ended with a
  {% if balance >= 0 %}<strong>surplus</strong> of {{ "%.1f"|format(balance|float) }}M bags
  {%- else %}<strong>deficit</strong> of {{ "%.1f"|format((balance|float)|abs) }}M bags
  {%- endif %}.
  The stock-to-use ratio of <strong>{{ "%.1f"|format(stu_pct|float) }}%</strong> indicates
  {% if stu_pct|float > 25 %}comfortable{% elif stu_pct|float > 18 %}adequate{% else %}tight{% endif %}
  global supply conditions.
</p>

<p>
  Explore live supply & demand charts and price data on
  <a href="/dashboard/supply">BeanFlows Supply Dashboard</a>.
</p>

<p><em>Data source: USDA PSD Online, updated {{ data_vintage }}.</em></p>
""",
    },
]


# ── Data generation ───────────────────────────────────────────────────────────

def fetch_country_data_from_duckdb() -> list[dict]:
    """Pull the top coffee-producing countries from the DuckDB serving layer.

    Reads ``serving.commodity_metrics`` for commodity code 711100, restricted
    to the most recent market year that has country-level rows, and ranks the
    countries by production (highest first).

    Returns:
        Up to 30 dicts ordered by rank, with keys ``country_name``,
        ``country_code``, ``market_year``, ``production_bags``,
        ``exports_bags``, ``domestic_consumption_bags``,
        ``ending_stocks_bags``, ``production_yoy_pct`` and ``rank``.
        An empty list when ``SERVING_DB`` is unset, the file does not exist,
        or the query fails, so the caller can fall back to placeholder data.
    """
    if not SERVING_DB or not Path(SERVING_DB).exists():
        print(f"  Serving DB not found at {SERVING_DB!r} — using placeholder countries")
        return []

    # Must stay in the same order as the SELECT list of the `ranked` CTE.
    columns = [
        "country_name", "country_code", "market_year", "production_bags",
        "exports_bags", "domestic_consumption_bags", "ending_stocks_bags",
        "production_yoy_pct", "rank",
    ]

    try:
        # Imported lazily so the script still runs (with placeholders) when
        # duckdb is not installed; the ImportError is handled below.
        import duckdb

        conn = duckdb.connect(SERVING_DB, read_only=True)
        try:
            # NOTE(review): the "* 1000" presumably converts thousand-bag
            # figures to bags — confirm against the serving model.
            # ORDER BY rank makes LIMIT 30 deterministic: without it the
            # engine may return any 30 of the ranked rows.
            rows = conn.execute("""
                WITH latest AS (
                    SELECT MAX(market_year) AS max_year
                    FROM serving.commodity_metrics
                    WHERE commodity_code = 711100 AND country_code IS NOT NULL
                ),
                ranked AS (
                    SELECT country_name, country_code, market_year,
                           production * 1000 AS production_bags,
                           exports * 1000 AS exports_bags,
                           domestic_consumption * 1000 AS domestic_consumption_bags,
                           ending_stocks * 1000 AS ending_stocks_bags,
                           production_yoy_pct,
                           ROW_NUMBER() OVER (ORDER BY production DESC) AS rank
                    FROM serving.commodity_metrics, latest
                    WHERE commodity_code = 711100
                      AND country_code IS NOT NULL
                      AND market_year = latest.max_year
                      AND production > 0
                )
                SELECT * FROM ranked ORDER BY rank LIMIT 30
            """).fetchall()
        finally:
            conn.close()
    except Exception as e:
        # Deliberate best-effort: any failure degrades to placeholder data.
        print(f"  DuckDB error: {e} — using placeholder countries")
        return []

    return [dict(zip(columns, row)) for row in rows]


def fetch_global_year_data_from_duckdb() -> list[dict]:
    """Pull the global supply/demand summary per market year.

    Reads the rows of ``serving.commodity_metrics`` whose ``country_name`` is
    ``'Global'`` for commodity code 711100, newest market year first.

    Returns:
        Up to 10 dicts with keys ``market_year``, ``beginning_stocks_bags``,
        ``world_production_bags``, ``total_supply_bags``, ``consumption_bags``,
        ``ending_stocks_bags``, ``beginning_stocks_m``, ``production_m``,
        ``total_supply_m``, ``consumption_m``, ``ending_stocks_m``,
        ``balance`` and ``stu_pct``. An empty list when ``SERVING_DB`` is
        unset, the file does not exist, or the query fails.
    """
    if not SERVING_DB or not Path(SERVING_DB).exists():
        return []

    # Must stay in the same order as the SELECT list below.
    cols = [
        "market_year", "beginning_stocks_bags", "world_production_bags",
        "total_supply_bags", "consumption_bags", "ending_stocks_bags",
        "beginning_stocks_m", "production_m", "total_supply_m", "consumption_m",
        "ending_stocks_m", "balance", "stu_pct",
    ]

    try:
        # Imported lazily so a missing duckdb package degrades to "no data".
        import duckdb

        conn = duckdb.connect(SERVING_DB, read_only=True)
        try:
            # beginning_stocks_m is taken from beginning_stocks like its
            # sibling *_m columns; it was previously derived from
            # production / total_distribution, which is not a stock figure.
            # NOTE(review): the *_m columns pass the stored values through
            # unscaled while the *_bags columns multiply by 1000 — confirm the
            # stored unit matches the "Million 60-kg Bags" label they feed.
            rows = conn.execute("""
                SELECT market_year,
                       beginning_stocks * 1000 AS beginning_stocks_bags,
                       production * 1000 AS world_production_bags,
                       total_supply * 1000 AS total_supply_bags,
                       domestic_consumption * 1000 AS consumption_bags,
                       ending_stocks * 1000 AS ending_stocks_bags,
                       beginning_stocks AS beginning_stocks_m,
                       production AS production_m,
                       total_supply AS total_supply_m,
                       domestic_consumption AS consumption_m,
                       ending_stocks AS ending_stocks_m,
                       supply_demand_balance AS balance,
                       stock_to_use_ratio_pct AS stu_pct
                FROM serving.commodity_metrics
                WHERE commodity_code = 711100 AND country_name = 'Global'
                ORDER BY market_year DESC
                LIMIT 10
            """).fetchall()
        finally:
            conn.close()
    except Exception as e:
        # Deliberate best-effort: the caller substitutes placeholder years.
        print(f"  DuckDB error (global): {e}")
        return []

    return [dict(zip(cols, row)) for row in rows]


# Fallback country list used when the serving database yields no rows.
# Rank is the 1-based position in the tuple below.
PLACEHOLDER_COUNTRIES = [
    {"country_name": name, "country_code": code, "rank": position}
    for position, (name, code) in enumerate(
        (
            ("Brazil", "BR"),
            ("Vietnam", "VN"),
            ("Colombia", "CO"),
            ("Indonesia", "ID"),
            ("Ethiopia", "ET"),
            ("Honduras", "HN"),
            ("India", "IN"),
            ("Uganda", "UG"),
            ("Mexico", "MX"),
            ("Peru", "PE"),
        ),
        start=1,
    )
]


def slug(name: str) -> str:
    """Return a URL-safe, lowercase, hyphen-separated slug for *name*.

    Commas and apostrophes are dropped (``"Korea, South"`` -> ``"korea-south"``,
    ``"Cote d'Ivoire"`` -> ``"cote-divoire"``). Accented letters are reduced
    to their ASCII base letter, and every other run of non-alphanumeric
    characters (spaces, parentheses, slashes, ...) collapses to one hyphen,
    so the result contains only ``[a-z0-9-]`` with no leading or trailing hyphen.
    """
    # NFKD splits accented letters into base letter + combining mark; the
    # ASCII round-trip then discards the marks and any other non-ASCII char.
    ascii_name = unicodedata.normalize("NFKD", name).encode("ascii", "ignore").decode("ascii")
    # Remove (rather than hyphenate) commas and apostrophes so names without
    # other punctuation keep the slugs they have always had.
    cleaned = ascii_name.lower().replace(",", "").replace("'", "")
    return re.sub(r"[^a-z0-9]+", "-", cleaned).strip("-")


# ── Main ──────────────────────────────────────────────────────────────────────

def _country_row_data(c: dict, data_vintage: str) -> dict:
    """Build the template_data payload for one country-overview article."""
    # A NULL from the serving layer and a missing key both count as 0 %.
    yoy_pct = c.get("production_yoy_pct") or 0
    if yoy_pct > 2:
        trend = "up"
    elif yoy_pct < -2:
        trend = "down"
    else:
        trend = "stable"
    return {
        "country_name": c["country_name"],
        "country_code": c.get("country_code", ""),
        "country_slug": slug(c["country_name"]),
        "latest_year": c.get("market_year", 2024),
        "production_bags": c.get("production_bags", 0),
        "exports_bags": c.get("exports_bags", 0),
        "domestic_consumption_bags": c.get("domestic_consumption_bags", 0),
        "ending_stocks_bags": c.get("ending_stocks_bags", 0),
        "production_yoy_pct": round(yoy_pct, 1),
        "production_trend": trend,
        "rank": c.get("rank", 99),
        "data_vintage": data_vintage,
    }


def _global_year_row_data(y: dict, data_vintage: str) -> dict:
    """Build the template_data payload for one global market-year article."""
    return {
        "market_year": y["market_year"],
        "world_production_bags": y.get("world_production_bags", 0),
        "beginning_stocks_m": round(y.get("beginning_stocks_m") or 0, 1),
        "production_m": round(y.get("production_m") or 0, 1),
        "total_supply_m": round(y.get("total_supply_m") or 0, 1),
        "consumption_m": round(y.get("consumption_m") or 0, 1),
        "ending_stocks_m": round(y.get("ending_stocks_m") or 0, 1),
        "balance": round(y.get("balance") or 0, 2),
        "stu_pct": round(y.get("stu_pct") or 0, 1),
        "data_vintage": data_vintage,
    }


def _seed_data_row(conn, tmpl_id, key, key_value, data, label, now, dry_run) -> bool:
    """Insert one template_data row unless a row with the same key exists.

    A row counts as existing when it belongs to *tmpl_id* and the JSON field
    *key* of its ``data_json`` equals *key_value*. In dry-run mode nothing is
    written and a "would insert" line is printed only for rows that are
    actually missing. Returns True when a row was written.
    """
    already_there = conn.execute(
        "SELECT id FROM template_data WHERE template_id = ? "
        "AND json_extract(data_json, ?) = ?",
        (tmpl_id, f"$.{key}", key_value),
    ).fetchone()
    if already_there:
        return False
    if dry_run:
        print(f"    [dry-run] Would insert data row: {label}")
        return False
    conn.execute(
        "INSERT INTO template_data (template_id, data_json, created_at) VALUES (?, ?, ?)",
        (tmpl_id, json.dumps(data), now),
    )
    return True


def run(db_path: str, dry_run: bool = False) -> None:
    """Seed the article templates and their data rows into the SQLite DB.

    For every entry in ``TEMPLATES`` the template is inserted into
    ``article_templates`` unless one with the same name exists, then one
    ``template_data`` row is added per country (country overview) or per
    market year (global summary). Rows come from the DuckDB fetchers, or from
    placeholders when those return nothing. Existing rows are left untouched,
    so the script can be re-run safely.

    Args:
        db_path: Path of the SQLite database to write to.
        dry_run: When True, only print what would be inserted; nothing is
            written or committed.
    """
    # Naive UTC timestamp, same text format the deprecated utcnow() produced.
    now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
    data_vintage = datetime.date.today().strftime("%B %Y")

    inserted_templates = 0
    inserted_data_rows = 0

    conn = sqlite3.connect(db_path)
    conn.row_factory = sqlite3.Row
    try:
        for tmpl in TEMPLATES:
            existing = conn.execute(
                "SELECT id FROM article_templates WHERE name = ?", (tmpl["name"],)
            ).fetchone()

            if existing:
                tmpl_id = existing["id"]
                print(f"  Template '{tmpl['name']}' already exists (id={tmpl_id})")
            elif dry_run:
                print(f"  [dry-run] Would insert template: {tmpl['name']}")
                # Sentinel id: matches no stored rows, so every data row
                # below is reported as "would insert".
                tmpl_id = -1
            else:
                cursor = conn.execute(
                    """INSERT INTO article_templates
                       (name, slug, url_pattern, title_pattern, meta_description_pattern,
                        body_template, created_at)
                       VALUES (?, ?, ?, ?, ?, ?, ?)""",
                    (tmpl["name"], tmpl["name"], tmpl["url_pattern"], tmpl["title_pattern"],
                     tmpl["meta_description_pattern"], tmpl["body_template"], now),
                )
                tmpl_id = cursor.lastrowid
                inserted_templates += 1
                print(f"  Inserted template: {tmpl['name']} (id={tmpl_id})")

            # Each pending row: (dedupe key, key value, payload, dry-run label).
            if tmpl["name"] == "coffee-country-overview":
                # The placeholder dicts carry no metrics; the payload builder
                # fills every missing figure with its zero/"stable" default.
                countries = fetch_country_data_from_duckdb() or PLACEHOLDER_COUNTRIES
                pending = [
                    ("country_code", c.get("country_code", ""),
                     _country_row_data(c, data_vintage), c["country_name"])
                    for c in countries
                ]
            elif tmpl["name"] == "coffee-global-market-year":
                years_data = fetch_global_year_data_from_duckdb() or [
                    {"market_year": y} for y in range(2020, 2025)
                ]
                pending = [
                    ("market_year", y["market_year"],
                     _global_year_row_data(y, data_vintage),
                     f"market_year={y['market_year']}")
                    for y in years_data
                ]
            else:
                pending = []

            for key, key_value, data, label in pending:
                if _seed_data_row(conn, tmpl_id, key, key_value, data, label, now, dry_run):
                    inserted_data_rows += 1

        if not dry_run:
            conn.commit()
    finally:
        # Always release the connection, even when a query above raises.
        conn.close()

    print(f"\nDone — inserted {inserted_templates} templates, {inserted_data_rows} data rows.")
    if not dry_run:
        print("Next: go to /admin/cms → pSEO Templates → Bulk Generate")


if __name__ == "__main__":
    # Command-line entry point: parse the flags, check the target database
    # exists, then hand over to run().
    cli = argparse.ArgumentParser(description="Seed coffee CMS templates")
    cli.add_argument(
        "--db",
        default=DB_DEFAULT,
        help=f"SQLite DB path (default: {DB_DEFAULT})",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print what would be inserted, don't write",
    )
    options = cli.parse_args()

    db_file = Path(options.db)
    if db_file.exists():
        print(f"Seeding coffee CMS content into {db_file}...")
        run(str(db_file), dry_run=options.dry_run)
    else:
        # The tables are created by the migrations, so there is nothing to seed yet.
        print(f"DB not found at {db_file}. Run migrations first: uv run python -m beanflows.migrations.migrate")
        sys.exit(1)