Backfill-Migration: HTML aus articles.content_original/content_de strippen
Idempotente Migration mit --db Parameter (Live + Staging benutzbar), Backup vor Lauf, Verifikations-Check nach Lauf. Selektiert alle Artikel mit HTML-Tag-Pattern, strippt via html_to_text-Helper aus dem Monitor-Repo, respektiert 1000-Zeichen-Cap.
Dieser Commit ist enthalten in:
133
migrations/migrate_html_strip_2026-05-03.py
Normale Datei
133
migrations/migrate_html_strip_2026-05-03.py
Normale Datei
@@ -0,0 +1,133 @@
|
|||||||
|
"""Backfill migration 2026-05-03: strip HTML tags from articles.content_original / content_de.

Cause: until 2026-05-03, rss_parser.py stored the `summary` from the RSS feed
unfiltered in content_original/content_de. For many sources
(Guardian, AP, SZ, Golem, ...) that summary is HTML.

This migration finds all articles whose content_original or content_de
contain HTML tags and replaces the content with the plain-text variant
(via html_to_text from feeds/transcript_extractors/_common.py).

Idempotent: a repeated run finds nothing left to do.
"""
import argparse
import shutil
import sqlite3
import sys
from datetime import datetime

# Import html_to_text from the monitor repo.
# NOTE(review): hard-coded absolute paths — assumes this runs on the
# claude-dev deployment host; confirm before running elsewhere.
sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor/src")
try:
    from feeds.transcript_extractors._common import html_to_text
except ImportError:
    # Fallback: import from the staging repo checkout instead.
    sys.path.insert(0, "/home/claude-dev/AegisSight-Monitor-staging/src")
    from feeds.transcript_extractors._common import html_to_text
|
def _clean(s):
    """Apply html_to_text and cap the result at 1000 characters (matches the parser's cap).

    Returns None when the input is falsy or the cleaned result is empty/whitespace.
    """
    if s:
        text = html_to_text(s)
        if text and text.strip():
            # Same 1000-char cap the live parser applies on ingest.
            return text[:1000]
    return None
|
||||||
|
|
||||||
|
def main():
    """Run the HTML-strip backfill against the SQLite DB given via --db.

    Steps: optional file-level backup, select rows matching a tag-like
    pattern, strip them via _clean, update, commit, then verify.

    Returns:
        0 on success or when there is nothing to do,
        1 when tag-like content remains after a real run (verification failed).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--db", required=True, help="Pfad zur SQLite-DB")
    ap.add_argument("--no-backup", action="store_true", help="Backup ueberspringen")
    ap.add_argument("--dry-run", action="store_true", help="Nur zeigen, nichts schreiben")
    args = ap.parse_args()

    # Copy the DB file aside before touching it (skipped for dry runs).
    if not args.no_backup and not args.dry_run:
        ts = datetime.now().strftime("%Y-%m-%d-%H%M")
        backup_path = f"{args.db}.html-strip-bak-{ts}"
        shutil.copy2(args.db, backup_path)
        print(f"Backup angelegt: {backup_path}")

    db = sqlite3.connect(args.db)
    db.row_factory = sqlite3.Row
    try:
        # Select everything with a tag-like pattern in content_original OR content_de.
        # NOTE(review): LIKE '%<%>%' also matches plain text such as "a < b > c";
        # html_to_text may leave those unchanged, so they can re-appear in the
        # verification count below without being actual HTML.
        cur = db.execute(
            """SELECT id, content_original, content_de
               FROM articles
               WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"""
        )
        rows = cur.fetchall()
        print(f"Gefundene Artikel mit HTML: {len(rows)}")

        if not rows:
            print("Nichts zu tun.")
            return 0

        updated = 0
        set_null_orig = 0
        set_null_de = 0
        sample_before = []
        sample_after = []

        for r in rows:
            rid = r["id"]
            old_orig = r["content_original"]
            old_de = r["content_de"]
            new_orig = _clean(old_orig)
            new_de = _clean(old_de)

            # Count non-NULL values that collapse to NULL after stripping
            # (i.e. the stored content was pure markup).
            if new_orig is None and old_orig is not None:
                set_null_orig += 1
            if new_de is None and old_de is not None:
                set_null_de += 1

            # Keep a small before/after sample (first 3 changed rows) for
            # eyeball verification in the output.
            if len(sample_before) < 3 and old_orig and old_orig != new_orig:
                sample_before.append((rid, old_orig[:120]))
                sample_after.append((rid, (new_orig or "<NULL>")[:120]))

            if not args.dry_run:
                db.execute(
                    "UPDATE articles SET content_original=?, content_de=? WHERE id=?",
                    (new_orig, new_de, rid),
                )
            updated += 1

        if not args.dry_run:
            db.commit()
            print(f"Updates committed: {updated}")
        else:
            print(f"DRY-RUN: {updated} Updates wuerden ausgefuehrt")
        print(f"  davon content_original auf NULL: {set_null_orig}")
        print(f"  davon content_de auf NULL: {set_null_de}")

        print()
        print("=== Sample (vorher -> nachher) ===")
        for (rid_b, before), (rid_a, after) in zip(sample_before, sample_after):
            print(f"  [{rid_b}] BEFORE: {before!r}")
            print(f"  [{rid_a}] AFTER: {after!r}")
            print()

        # Verification: after a real run no tag-like pattern should remain.
        if not args.dry_run:
            remaining = db.execute(
                "SELECT COUNT(*) FROM articles WHERE content_original LIKE '%<%>%' OR content_de LIKE '%<%>%'"
            ).fetchone()[0]
            print(f"Nach Migration verbleibend mit HTML: {remaining}")
            if remaining != 0:
                print("WARNUNG: Es sind noch HTML-Reste vorhanden!")
                return 1

        print("Fertig.")
        return 0
    finally:
        # Close on every exit path — the original leaked the connection on the
        # `remaining != 0` return.
        db.close()
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit code.
    raise SystemExit(main())
|
||||||
In neuem Issue referenzieren
Einen Benutzer sperren