From 939a7e9476703da217e7149675c588a28a35ebb8 Mon Sep 17 00:00:00 2001
From: claude-dev <claude-dev@aegis-sight.de>
Date: Sat, 2 May 2026 23:13:41 +0000
Subject: [PATCH] Backfill-Migration: HTML aus
 articles.content_original/content_de strippen

Idempotente Migration mit --db Parameter (Live + Staging benutzbar),
Backup vor Lauf, Verifikations-Check nach Lauf. Selektiert alle Artikel
mit HTML-Tag-Pattern, strippt via html_to_text-Helper aus dem Monitor-Repo,
respektiert 1000-Zeichen-Cap.
---
 migrations/migrate_html_strip_2026-05-03.py | 133 ++++++++++++++++++++
 1 file changed, 133 insertions(+)
 create mode 100644 migrations/migrate_html_strip_2026-05-03.py
diff --git a/migrations/migrate_html_strip_2026-05-03.py b/migrations/migrate_html_strip_2026-05-03.py
new file mode 100644
index 0000000..28d0dc6
--- /dev/null
+++ b/migrations/migrate_html_strip_2026-05-03.py
@@ -0,0 +1,133 @@
+"""Backfill-Migration 2026-05-03: HTML-Tags aus articles.content_original / content_de strippen.
+
+Ursache: rss_parser.py hat bis 2026-05-03 die `summary` aus dem RSS-Feed
+ungefiltert in content_original/content_de gespeichert. Bei vielen Quellen
+(Guardian, AP, SZ, Golem, ...) ist das HTML.
+
+Diese Migration sucht alle articles, deren content_original oder content_de
+HTML-Tags enthalten, und ersetzt den Inhalt durch die plain-Text-Variante
+(via html_to_text aus feeds/transcript_extractors/_common.py).
+
+Idempotent: wiederholter Lauf findet nichts mehr.
+"""
+import argparse
+import shutil
+import sqlite3
+import sys
+from datetime import datetime
+
+# html_to_text aus dem Monitor-Repo importieren
+sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor/src")
+try:
+    from feeds.transcript_extractors._common import html_to_text
+except ImportError:
+    # Fallback: aus Staging-Repo
+    sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor-staging/src")
+    from feeds.transcript_extractors._common import html_to_text
+
+
+def _clean(s):
+    """html_to_text + Cap auf 1000 Zeichen (passend zum Parser-Verhalten).
+
+    Liefert None wenn Eingabe None oder Ergebnis leer.
+    """
+    if not s:
+        return None
+    cleaned = html_to_text(s)
+    if not cleaned or not cleaned.strip():
+        return None
+    return cleaned[:1000]
+
+
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--db", required=True, help="Pfad zur SQLite-DB")
+    ap.add_argument("--no-backup", action="store_true", help="Backup ueberspringen")
+    ap.add_argument("--dry-run", action="store_true", help="Nur zeigen, nichts schreiben")
+    args = ap.parse_args()
+
+    if not args.no_backup and not args.dry_run:
+        ts = datetime.now().strftime("%Y-%m-%d-%H%M")
+        backup_path = f"{args.db}.html-strip-bak-{ts}"
+        shutil.copy2(args.db, backup_path)
+        print(f"Backup angelegt: {backup_path}")
+
+    db = sqlite3.connect(args.db)
+    db.row_factory = sqlite3.Row
+
+    # Selektieren: alles mit Tag-Pattern in content_original ODER content_de
+    cur = db.execute(
+        """SELECT id, content_original, content_de
+           FROM articles
+           WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"""
+    )
+    rows = cur.fetchall()
+    print(f"Gefundene Artikel mit HTML: {len(rows)}")
+
+    if not rows:
+        print("Nichts zu tun.")
+        db.close()
+        return 0
+
+    updated = 0
+    set_null_orig = 0
+    set_null_de = 0
+    sample_before = []
+    sample_after = []
+
+    for r in rows:
+        rid = r["id"]
+        old_orig = r["content_original"]
+        old_de = r["content_de"]
+        new_orig = _clean(old_orig)
+        new_de = _clean(old_de)
+
+        if new_orig is None and old_orig is not None:
+            set_null_orig += 1
+        if new_de is None and old_de is not None:
+            set_null_de += 1
+
+        # Sample fuer Verifikation (erste 3)
+        if len(sample_before) < 3 and old_orig and old_orig != new_orig:
+            sample_before.append((rid, old_orig[:120]))
+            sample_after.append((rid, (new_orig or "<NULL>")[:120]))
+
+        if not args.dry_run:
+            db.execute(
+                "UPDATE articles SET content_original=?, content_de=? WHERE id=?",
+                (new_orig, new_de, rid),
+            )
+        updated += 1
+
+    if not args.dry_run:
+        db.commit()
+        print(f"Updates committed: {updated}")
+    else:
+        print(f"DRY-RUN: {updated} Updates wuerden ausgefuehrt")
+    print(f"  davon content_original auf NULL: {set_null_orig}")
+    print(f"  davon content_de auf NULL: {set_null_de}")
+
+    print()
+    print("=== Sample (vorher -> nachher) ===")
+    for (rid_b, before), (rid_a, after) in zip(sample_before, sample_after):
+        print(f"  [{rid_b}] BEFORE: {before!r}")
+        print(f"  [{rid_a}] AFTER:  {after!r}")
+        print()
+
+    # Verifikation
+    if not args.dry_run:
+        remaining = db.execute(
+            "SELECT COUNT(*) FROM articles WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"
+        ).fetchone()[0]
+        print(f"Nach Migration verbleibend mit HTML: {remaining}")
+        if remaining != 0:
+            print("WARNUNG: Es sind noch HTML-Reste vorhanden!")
+            return 1
+
+    db.close()
+    print("Fertig.")
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())