# Extracted from patch 939a7e9476703da217e7149675c588a28a35ebb8
# (author: claude-dev, dated Sat, 2 May 2026 23:13:41 +0000), which adds
# migrations/migrate_html_strip_2026-05-03.py.
"""Backfill migration 2026-05-03: strip HTML from articles.content_original / content_de.

Cause: until 2026-05-03, rss_parser.py stored the RSS feed's `summary`
unfiltered in content_original/content_de. For many sources (Guardian, AP,
SZ, Golem, ...) that summary is HTML.

This migration selects every article whose content_original or content_de
matches a tag-like pattern (`<` followed somewhere later by `>`) and replaces
the content with its plain-text form (via html_to_text from
feeds/transcript_extractors/_common.py), capped at 1000 characters.

Re-running is safe: rows whose cleaned content equals the stored content are
left untouched.

NOTE(review): the LIKE pattern also matches plain text that merely contains
`<` ... `>` (e.g. "a < b > c"). Whether html_to_text removes such text cannot
be told from this file, so the post-run verification may report leftovers
that are not actually HTML.
"""
import argparse
import os
import shutil
import sqlite3
import sys
from contextlib import closing
from datetime import datetime

# Import html_to_text from the Monitor repo.
sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor/src")
try:
    from feeds.transcript_extractors._common import html_to_text
except ImportError:
    # Fallback: take it from the staging repo instead.
    sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor-staging/src")
    from feeds.transcript_extractors._common import html_to_text

# Maximum stored length of a content column (matches the parser behaviour
# according to the original docstring of _clean).
MAX_CONTENT_LEN = 1000

# Shared by the selection query and the post-run verification so the two
# cannot drift apart.
_HTML_WHERE = "content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"


def _clean(s, max_len=MAX_CONTENT_LEN):
    """Convert *s* to plain text via html_to_text and cap it at *max_len* characters.

    Args:
        s: Stored column value; may be None or empty.
        max_len: Maximum number of characters to keep (default 1000).

    Returns:
        The (possibly truncated) plain text, or None when the input is
        None/empty or the converted text is empty or whitespace-only.
    """
    if not s:
        return None
    cleaned = html_to_text(s)
    if not cleaned or not cleaned.strip():
        return None
    return cleaned[:max_len]


def _backup_database(db_path):
    """Write a consistent snapshot of the database next to *db_path*.

    Uses SQLite's online backup API rather than a raw file copy: a file-level
    copy of a database that is open elsewhere (or that uses a WAL journal)
    is not guaranteed to be a consistent snapshot.

    Returns:
        The path of the backup file.
    """
    ts = datetime.now().strftime("%Y-%m-%d-%H%M")
    backup_path = f"{db_path}.html-strip-bak-{ts}"
    with closing(sqlite3.connect(db_path)) as src, \
            closing(sqlite3.connect(backup_path)) as dst:
        src.backup(dst)
    # Carry over permission bits and timestamps so the backup is not more
    # widely readable than the source file.
    shutil.copystat(db_path, backup_path)
    return backup_path


def _run_migration(db, dry_run):
    """Clean all matching rows on the open connection *db*.

    Args:
        db: Open sqlite3 connection with row_factory set to sqlite3.Row.
        dry_run: When true, nothing is written and no verification is run.

    Returns:
        0 on success (or when nothing matched); 1 when rows matching the
        tag pattern remain after the updates were committed.
    """
    # Select everything with a tag-like pattern in content_original OR content_de.
    rows = db.execute(
        "SELECT id, content_original, content_de FROM articles WHERE " + _HTML_WHERE
    ).fetchall()
    print(f"Gefundene Artikel mit HTML: {len(rows)}")

    if not rows:
        print("Nichts zu tun.")
        return 0

    updated = 0
    set_null_orig = 0
    set_null_de = 0
    samples = []  # (id, before, after) for the first few changed rows

    for r in rows:
        rid = r["id"]
        old_orig = r["content_original"]
        old_de = r["content_de"]
        new_orig = _clean(old_orig)
        new_de = _clean(old_de)

        # Rows that matched the LIKE pattern but are unchanged by cleaning
        # need no write; skipping them keeps repeated runs from touching data.
        if new_orig == old_orig and new_de == old_de:
            continue

        if new_orig is None and old_orig is not None:
            set_null_orig += 1
        if new_de is None and old_de is not None:
            set_null_de += 1

        # Keep the first 3 changed content_original values for eyeballing.
        if len(samples) < 3 and old_orig and old_orig != new_orig:
            samples.append((rid, old_orig[:120], (new_orig or "")[:120]))

        if not dry_run:
            db.execute(
                "UPDATE articles SET content_original=?, content_de=? WHERE id=?",
                (new_orig, new_de, rid),
            )
        updated += 1

    if not dry_run:
        db.commit()
        print(f"Updates committed: {updated}")
    else:
        print(f"DRY-RUN: {updated} Updates wuerden ausgefuehrt")
    print(f" davon content_original auf NULL: {set_null_orig}")
    print(f" davon content_de auf NULL: {set_null_de}")

    print()
    print("=== Sample (vorher -> nachher) ===")
    for rid, before, after in samples:
        print(f" [{rid}] BEFORE: {before!r}")
        print(f" [{rid}] AFTER: {after!r}")
    print()

    # Verification: count rows still matching the selection pattern.
    if not dry_run:
        remaining = db.execute(
            "SELECT COUNT(*) FROM articles WHERE " + _HTML_WHERE
        ).fetchone()[0]
        print(f"Nach Migration verbleibend mit HTML: {remaining}")
        if remaining != 0:
            print("WARNUNG: Es sind noch HTML-Reste vorhanden!")
            return 1

    print("Fertig.")
    return 0


def main(argv=None):
    """Command-line entry point of the migration.

    Args:
        argv: Argument list to parse; None (the default) uses sys.argv[1:].

    Returns:
        Process exit code: 0 on success, 1 when rows matching the tag
        pattern remain after the migration.

    Raises:
        SystemExit: With code 2 on invalid arguments or when the --db path
            is not an existing file.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="Pfad zur SQLite-DB")
    ap.add_argument("--no-backup", action="store_true", help="Backup ueberspringen")
    ap.add_argument("--dry-run", action="store_true", help="Nur zeigen, nichts schreiben")
    args = ap.parse_args(argv)

    # sqlite3.connect silently creates a missing file, so a typo in --db would
    # leave a stray empty database behind; reject it up front instead.
    if not os.path.isfile(args.db):
        ap.error(f"DB-Datei nicht gefunden: {args.db}")

    if not args.no_backup and not args.dry_run:
        backup_path = _backup_database(args.db)
        print(f"Backup angelegt: {backup_path}")

    # closing() guarantees the connection is released on every exit path,
    # including the failed-verification return and exceptions; uncommitted
    # changes are discarded when the connection closes.
    with closing(sqlite3.connect(args.db)) as db:
        db.row_factory = sqlite3.Row
        return _run_migration(db, dry_run=args.dry_run)


if __name__ == "__main__":
    sys.exit(main())