"""Backfill-Migration 2026-05-03: HTML-Tags aus articles.content_original / content_de strippen.
Ursache: rss_parser.py hat bis 2026-05-03 die `summary` aus dem RSS-Feed
ungefiltert in content_original/content_de gespeichert. Bei vielen Quellen
(Guardian, AP, SZ, Golem, ...) ist das HTML.
Diese Migration sucht alle articles, deren content_original oder content_de
HTML-Tags enthalten, und ersetzt den Inhalt durch die plain-Text-Variante
(via html_to_text aus feeds/transcript_extractors/_common.py).
Idempotent: wiederholter Lauf findet nichts mehr.
"""
import argparse
import shutil
import sqlite3
import sys
from datetime import datetime
# Import html_to_text from the monitor repo; if that checkout is not
# importable, fall back to the staging checkout of the same package.
sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor/src")
try:
    from feeds.transcript_extractors._common import html_to_text
except ImportError:
    # Fallback: staging repo checkout
    sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor-staging/src")
    from feeds.transcript_extractors._common import html_to_text
def _clean(s):
    """Strip HTML from *s* and cap the result at 1000 characters.

    Mirrors the parser's behavior of truncating stored content to 1000
    characters. Returns None when the input is falsy or when stripping
    leaves nothing but whitespace.
    """
    if not s:
        return None
    text = html_to_text(s)
    # Treat an empty or whitespace-only result as "no content".
    return text[:1000] if text and text.strip() else None
def main():
    """Run the backfill: strip HTML from affected article rows.

    Steps: optional backup of the SQLite file, select all rows whose
    content columns look like they contain HTML tags, rewrite both
    columns through _clean(), commit (unless --dry-run), then verify
    that no tagged content remains.

    Returns a process exit code: 0 on success / nothing to do,
    1 when HTML remnants survive the migration.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="Pfad zur SQLite-DB")
    ap.add_argument("--no-backup", action="store_true", help="Backup ueberspringen")
    ap.add_argument("--dry-run", action="store_true", help="Nur zeigen, nichts schreiben")
    args = ap.parse_args()

    # Copy the DB file aside before mutating it (skipped for dry runs,
    # which never write).
    if not args.no_backup and not args.dry_run:
        ts = datetime.now().strftime("%Y-%m-%d-%H%M")
        backup_path = f"{args.db}.html-strip-bak-{ts}"
        shutil.copy2(args.db, backup_path)
        print(f"Backup angelegt: {backup_path}")

    db = sqlite3.connect(args.db)
    db.row_factory = sqlite3.Row
    # BUGFIX: the connection is now closed on EVERY exit path (the old
    # "remaining != 0" early return leaked it); hence the try/finally.
    try:
        # Select everything with a tag-like pattern in either column.
        # NOTE(review): LIKE '%<%>%' also matches plain text such as
        # "a < b > c" — acceptable here since _clean is near-idempotent
        # on plain text, but worth knowing.
        cur = db.execute(
            """SELECT id, content_original, content_de
            FROM articles
            WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"""
        )
        rows = cur.fetchall()
        print(f"Gefundene Artikel mit HTML: {len(rows)}")
        if not rows:
            print("Nichts zu tun.")
            return 0

        updated = 0
        set_null_orig = 0
        set_null_de = 0
        sample_before = []
        sample_after = []
        for r in rows:
            rid = r["id"]
            old_orig = r["content_original"]
            old_de = r["content_de"]
            new_orig = _clean(old_orig)
            new_de = _clean(old_de)
            # Track rows where stripping reduced a column to nothing.
            if new_orig is None and old_orig is not None:
                set_null_orig += 1
            if new_de is None and old_de is not None:
                set_null_de += 1
            # Keep the first three changed rows as a before/after sample.
            if len(sample_before) < 3 and old_orig and old_orig != new_orig:
                sample_before.append((rid, old_orig[:120]))
                sample_after.append((rid, (new_orig or "")[:120]))
            if not args.dry_run:
                db.execute(
                    "UPDATE articles SET content_original=?, content_de=? WHERE id=?",
                    (new_orig, new_de, rid),
                )
            updated += 1

        if not args.dry_run:
            db.commit()
            print(f"Updates committed: {updated}")
        else:
            print(f"DRY-RUN: {updated} Updates wuerden ausgefuehrt")
        print(f"  davon content_original auf NULL: {set_null_orig}")
        print(f"  davon content_de auf NULL: {set_null_de}")
        print()
        print("=== Sample (vorher -> nachher) ===")
        for (rid_b, before), (rid_a, after) in zip(sample_before, sample_after):
            print(f"  [{rid_b}] BEFORE: {before!r}")
            print(f"  [{rid_a}] AFTER:  {after!r}")
        print()

        # Post-migration verification: nothing tag-like may remain.
        if not args.dry_run:
            remaining = db.execute(
                "SELECT COUNT(*) FROM articles WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"
            ).fetchone()[0]
            print(f"Nach Migration verbleibend mit HTML: {remaining}")
            if remaining != 0:
                print("WARNUNG: Es sind noch HTML-Reste vorhanden!")
                return 1
        print("Fertig.")
        return 0
    finally:
        db.close()
if __name__ == "__main__":
sys.exit(main())