"""Backfill-Migration 2026-05-03: HTML-Tags aus articles.content_original / content_de strippen. Ursache: rss_parser.py hat bis 2026-05-03 die `summary` aus dem RSS-Feed ungefiltert in content_original/content_de gespeichert. Bei vielen Quellen (Guardian, AP, SZ, Golem, ...) ist das HTML. Diese Migration sucht alle articles, deren content_original oder content_de HTML-Tags enthalten, und ersetzt den Inhalt durch die plain-Text-Variante (via html_to_text aus feeds/transcript_extractors/_common.py). Idempotent: wiederholter Lauf findet nichts mehr. """ import argparse import shutil import sqlite3 import sys from datetime import datetime # html_to_text aus dem Monitor-Repo importieren sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor/src") try: from feeds.transcript_extractors._common import html_to_text except ImportError: # Fallback: aus Staging-Repo sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor-staging/src") from feeds.transcript_extractors._common import html_to_text def _clean(s): """html_to_text + Cap auf 1000 Zeichen (passend zum Parser-Verhalten). Liefert None wenn Eingabe None oder Ergebnis leer. """ if not s: return None cleaned = html_to_text(s) if not cleaned or not cleaned.strip(): return None return cleaned[:1000] def main(): ap = argparse.ArgumentParser() ap.add_argument("--db", required=True, help="Pfad zur SQLite-DB") ap.add_argument("--no-backup", action="store_true", help="Backup ueberspringen") ap.add_argument("--dry-run", action="store_true", help="Nur zeigen, nichts schreiben") args = ap.parse_args() if not args.no_backup and not args.dry_run: ts = datetime.now().strftime("%Y-%m-%d-%H%M") backup_path = f"{args.db}.html-strip-bak-{ts}" shutil.copy2(args.db, backup_path) print(f"Backup angelegt: {backup_path}") db = sqlite3.connect(args.db) db.row_factory = sqlite3.Row # Selektieren: alles mit Tag-Pattern in content_original ODER content_de cur = db.execute( """SELECT id, content_original, content_de FROM articles WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'""" ) rows = cur.fetchall() print(f"Gefundene Artikel mit HTML: {len(rows)}") if not rows: print("Nichts zu tun.") db.close() return 0 updated = 0 set_null_orig = 0 set_null_de = 0 sample_before = [] sample_after = [] for r in rows: rid = r["id"] old_orig = r["content_original"] old_de = r["content_de"] new_orig = _clean(old_orig) new_de = _clean(old_de) if new_orig is None and old_orig is not None: set_null_orig += 1 if new_de is None and old_de is not None: set_null_de += 1 # Sample fuer Verifikation (erste 3) if len(sample_before) < 3 and old_orig and old_orig != new_orig: sample_before.append((rid, old_orig[:120])) sample_after.append((rid, (new_orig or "")[:120])) if not args.dry_run: db.execute( "UPDATE articles SET content_original=?, content_de=? WHERE id=?", (new_orig, new_de, rid), ) updated += 1 if not args.dry_run: db.commit() print(f"Updates committed: {updated}") else: print(f"DRY-RUN: {updated} Updates wuerden ausgefuehrt") print(f" davon content_original auf NULL: {set_null_orig}") print(f" davon content_de auf NULL: {set_null_de}") print() print("=== Sample (vorher -> nachher) ===") for (rid_b, before), (rid_a, after) in zip(sample_before, sample_after): print(f" [{rid_b}] BEFORE: {before!r}") print(f" [{rid_a}] AFTER: {after!r}") print() # Verifikation if not args.dry_run: remaining = db.execute( "SELECT COUNT(*) FROM articles WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'" ).fetchone()[0] print(f"Nach Migration verbleibend mit HTML: {remaining}") if remaining != 0: print("WARNUNG: Es sind noch HTML-Reste vorhanden!") return 1 db.close() print("Fertig.") return 0 if __name__ == "__main__": sys.exit(main())