"""Backfill-Migration 2026-05-03: HTML-Tags aus articles.content_original / content_de strippen.
Ursache: rss_parser.py hat bis 2026-05-03 die `summary` aus dem RSS-Feed
ungefiltert in content_original/content_de gespeichert. Bei vielen Quellen
(Guardian, AP, SZ, Golem, ...) ist das HTML.
Diese Migration sucht alle articles, deren content_original oder content_de
HTML-Tags enthalten, und ersetzt den Inhalt durch die plain-Text-Variante
(via html_to_text aus feeds/transcript_extractors/_common.py).
Idempotent: wiederholter Lauf findet nichts mehr.
"""
import argparse
import shutil
import sqlite3
import sys
from datetime import datetime
# Import html_to_text from the monitor repo; if that checkout is not
# importable, fall back to the staging checkout of the same package.
sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor/src")
try:
    from feeds.transcript_extractors._common import html_to_text
except ImportError:
    # Fallback: staging repo checkout
    sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor-staging/src")
    from feeds.transcript_extractors._common import html_to_text
def _clean(s):
    """Strip HTML from *s* and cap the result at 1000 characters.

    Mirrors the parser's behavior of truncating stored content to 1000
    characters. Returns None when the input is falsy or when stripping
    leaves nothing but whitespace.
    """
    if not s:
        return None
    text = html_to_text(s)
    # Treat an empty or whitespace-only result as "no content".
    return text[:1000] if text and text.strip() else None
def main():
    """Run the backfill: strip HTML from affected article rows.

    Steps: optional backup of the SQLite file, select all rows whose
    content columns look like they contain HTML tags, rewrite both
    columns through _clean(), commit (unless --dry-run), then verify
    that no tagged content remains.

    Returns a process exit code: 0 on success / nothing to do,
    1 when HTML remnants survive the migration.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="Pfad zur SQLite-DB")
    ap.add_argument("--no-backup", action="store_true", help="Backup ueberspringen")
    ap.add_argument("--dry-run", action="store_true", help="Nur zeigen, nichts schreiben")
    args = ap.parse_args()

    # Copy the DB file aside before mutating it (skipped for dry runs,
    # which never write).
    if not args.no_backup and not args.dry_run:
        ts = datetime.now().strftime("%Y-%m-%d-%H%M")
        backup_path = f"{args.db}.html-strip-bak-{ts}"
        shutil.copy2(args.db, backup_path)
        print(f"Backup angelegt: {backup_path}")

    db = sqlite3.connect(args.db)
    db.row_factory = sqlite3.Row
    # BUGFIX: the connection is now closed on EVERY exit path (the old
    # "remaining != 0" early return leaked it); hence the try/finally.
    try:
        # Select everything with a tag-like pattern in either column.
        # NOTE(review): LIKE '%<%>%' also matches plain text such as
        # "a < b > c" — acceptable here since _clean is near-idempotent
        # on plain text, but worth knowing.
        cur = db.execute(
            """SELECT id, content_original, content_de
            FROM articles
            WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"""
        )
        rows = cur.fetchall()
        print(f"Gefundene Artikel mit HTML: {len(rows)}")
        if not rows:
            print("Nichts zu tun.")
            return 0

        updated = 0
        set_null_orig = 0
        set_null_de = 0
        sample_before = []
        sample_after = []
        for r in rows:
            rid = r["id"]
            old_orig = r["content_original"]
            old_de = r["content_de"]
            new_orig = _clean(old_orig)
            new_de = _clean(old_de)
            # Track rows where stripping reduced a column to nothing.
            if new_orig is None and old_orig is not None:
                set_null_orig += 1
            if new_de is None and old_de is not None:
                set_null_de += 1
            # Keep the first three changed rows as a before/after sample.
            if len(sample_before) < 3 and old_orig and old_orig != new_orig:
                sample_before.append((rid, old_orig[:120]))
                sample_after.append((rid, (new_orig or "")[:120]))
            if not args.dry_run:
                db.execute(
                    "UPDATE articles SET content_original=?, content_de=? WHERE id=?",
                    (new_orig, new_de, rid),
                )
            updated += 1

        if not args.dry_run:
            db.commit()
            print(f"Updates committed: {updated}")
        else:
            print(f"DRY-RUN: {updated} Updates wuerden ausgefuehrt")
        print(f"  davon content_original auf NULL: {set_null_orig}")
        print(f"  davon content_de auf NULL: {set_null_de}")
        print()
        print("=== Sample (vorher -> nachher) ===")
        for (rid_b, before), (rid_a, after) in zip(sample_before, sample_after):
            print(f"  [{rid_b}] BEFORE: {before!r}")
            print(f"  [{rid_a}] AFTER:  {after!r}")
        print()

        # Post-migration verification: nothing tag-like may remain.
        if not args.dry_run:
            remaining = db.execute(
                "SELECT COUNT(*) FROM articles WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"
            ).fetchone()[0]
            print(f"Nach Migration verbleibend mit HTML: {remaining}")
            if remaining != 0:
                print("WARNUNG: Es sind noch HTML-Reste vorhanden!")
                return 1
        print("Fertig.")
        return 0
    finally:
        db.close()
if __name__ == "__main__":
sys.exit(main())