- migrations/2026-05-09e_fetch_strategy.py NEU: ALTER TABLE sources ADD COLUMN fetch_strategy. Pre-flagging fuer FT/WSJ/NZZ etc. (paywall) und Rheinische Post/Verfassungsschutz (googlebot). - shared/services/source_health.py: gesynct vom Monitor (Phase-18-Code mit Retry-Logik + Strategien default/googlebot/paywall/skip). - routers/sources.py: GlobalSourceCreate/Update um fetch_strategy (Pattern-Validation), SOURCE_UPDATE_COLUMNS + INSERT erweitert. - dashboard.html: Edit-Modal hat jetzt Dropdown sourceFetchStrategy. - sources.js: laedt + sendet fetch_strategy mit. Cache-Buster 20260509c -> 20260509d.
78 Zeilen
2.6 KiB
Python
78 Zeilen
2.6 KiB
Python
"""Migration 2026-05-09e: sources.fetch_strategy.

New column that controls how the health check / RSS parser should fetch a source:
  default:   normal User-Agent (AegisSight-HealthCheck); on 403/429 retry with Googlebot.
  googlebot: use the Googlebot UA right away (for sites that are SEO-friendly).
  paywall:   on 403, send a second request via removepaywalls.com (for Spiegel+/SZ+/FT etc.).
  skip:      skip the health check (known unreachable sources).

Usage:
  DB_PATH=/home/claude-dev/osint-data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py
  DB_PATH=/home/claude-dev/AegisSight-Monitor-staging/data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py
"""
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
|
|
|
|
def main(db_path: str) -> int:
    """Add the ``sources.fetch_strategy`` column and pre-flag known domains.

    The migration is safe to run repeatedly: the column is only added when
    ``PRAGMA table_info(sources)`` does not list it yet, and the pre-flagging
    updates only touch rows whose strategy is still NULL or ``'default'``, so
    values that were changed by hand are preserved.

    Args:
        db_path: Filesystem path of the SQLite database to migrate.

    Returns:
        0 on success, 1 if ``db_path`` does not exist.

    Raises:
        sqlite3.Error: Propagated unchanged if a statement fails (for example
            when the ``sources`` table is missing). The connection is closed
            before the exception leaves this function.
    """
    if not os.path.exists(db_path):
        print(f"FEHLER: DB nicht gefunden: {db_path}", file=sys.stderr)
        return 1

    conn = sqlite3.connect(db_path, timeout=60)
    try:
        # Wait up to 60 s for competing writers instead of failing right away.
        conn.execute("PRAGMA busy_timeout = 60000")
        conn.execute("PRAGMA journal_mode = WAL")

        print(f"Migration auf {db_path}")

        # Column 1 of each table_info row is the column name.
        cols = [c[1] for c in conn.execute("PRAGMA table_info(sources)")]
        if "fetch_strategy" in cols:
            print(" = sources.fetch_strategy war bereits da")
        else:
            conn.execute(
                "ALTER TABLE sources ADD COLUMN fetch_strategy TEXT DEFAULT 'default'"
            )
            print(" + sources.fetch_strategy hinzugefügt (Default 'default')")

        # Pre-flag known paywall domains (exact, case-insensitive match).
        paywall_domains = (
            "ft.com",
            "wsj.com",
            "nzz.ch",
            "handelsblatt.com",
            "wiwo.de",
        )
        conn.executemany(
            "UPDATE sources SET fetch_strategy = 'paywall' "
            "WHERE LOWER(domain) = ? AND COALESCE(fetch_strategy, 'default') = 'default'",
            [(dom,) for dom in paywall_domains],
        )
        print(" ~ paywall-Strategie für bekannte Domains gesetzt (FT, WSJ, NZZ, Handelsblatt, WiWo)")

        # Known bot-blocking domains: try Googlebot. The LIKE pattern is a
        # suffix match, so host variants such as "www.rp-online.de" are included.
        # NOTE(review): it also matches any other domain that merely ends in
        # one of these strings - confirm that this is acceptable.
        bot_block_domains = (
            "rheinische-post.de",
            "rp-online.de",
            "verfassungsschutz.de",
        )
        conn.executemany(
            "UPDATE sources SET fetch_strategy = 'googlebot' "
            "WHERE LOWER(domain) LIKE ? AND COALESCE(fetch_strategy, 'default') = 'default'",
            [(f"%{dom}",) for dom in bot_block_domains],
        )
        print(" ~ googlebot-Strategie für bekannte Bot-Block-Domains")

        conn.commit()
    finally:
        # Always release the connection, also when a statement above raised;
        # closing without a commit discards any pending UPDATEs.
        conn.close()

    print("Migration abgeschlossen.")
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: take the database location from the DB_PATH
    # environment variable, falling back to the path below when it is unset,
    # and use the return value of main() as the process exit status.
    target_db = os.getenv("DB_PATH", "/home/claude-dev/osint-data/osint.db")
    raise SystemExit(main(target_db))
|