Phase 18 (Verwaltung): fetch_strategy in CRUD + Edit-Modal
- migrations/2026-05-09e_fetch_strategy.py NEU: ALTER TABLE sources ADD COLUMN fetch_strategy. Pre-flagging fuer FT/WSJ/NZZ etc. (paywall) und Rheinische Post/Verfassungsschutz (googlebot). - shared/services/source_health.py: gesynct vom Monitor (Phase-18-Code mit Retry-Logik + Strategien default/googlebot/paywall/skip). - routers/sources.py: GlobalSourceCreate/Update um fetch_strategy (Pattern-Validation), SOURCE_UPDATE_COLUMNS + INSERT erweitert. - dashboard.html: Edit-Modal hat jetzt Dropdown sourceFetchStrategy. - sources.js: laedt + sendet fetch_strategy mit. Cache-Buster 20260509c -> 20260509d.
Dieser Commit ist enthalten in:
77
migrations/2026-05-09e_fetch_strategy.py
Normale Datei
77
migrations/2026-05-09e_fetch_strategy.py
Normale Datei
@@ -0,0 +1,77 @@
|
||||
"""Migration 2026-05-09e: sources.fetch_strategy.
|
||||
|
||||
Neues Feld zur Steuerung, wie der Health-Check / RSS-Parser eine Quelle abrufen soll:

  default:   normaler User-Agent (AegisSight-HealthCheck), bei 403/429 Retry mit Googlebot.
  googlebot: direkt mit Googlebot-UA (fuer Sites die SEO-freundlich sind).
  paywall:   bei 403 zweite Anfrage via removepaywalls.com (fuer Spiegel+/SZ+/FT etc.).
  skip:      Health-Check ueberspringen (bekannte unerreichbare Quellen).

Ausfuehrung:
  DB_PATH=/home/claude-dev/osint-data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py
  DB_PATH=/home/claude-dev/AegisSight-Monitor-staging/data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py
|
||||
"""
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
|
||||
def main(db_path: str) -> int:
    """Add ``sources.fetch_strategy`` and pre-flag known domains.

    The migration is safe to re-run: the column is only added when it is
    missing, and the pre-flagging only touches rows whose strategy is still
    NULL or ``'default'``, so manually chosen strategies are never overwritten.

    Args:
        db_path: Path to an existing SQLite database file.

    Returns:
        0 on success, 1 if the database file does not exist.

    Raises:
        sqlite3.Error: If a statement fails (for example when the ``sources``
            table is missing). The connection is closed before the error
            propagates; updates that were not committed are discarded.
    """
    # sqlite3.connect() would silently create an empty database for a wrong
    # path, so refuse to continue when the file is missing.
    if not os.path.exists(db_path):
        print(f"FEHLER: DB nicht gefunden: {db_path}", file=sys.stderr)
        return 1

    conn = sqlite3.connect(db_path, timeout=60)
    try:
        conn.execute("PRAGMA busy_timeout = 60000")
        conn.execute("PRAGMA journal_mode = WAL")

        print(f"Migration auf {db_path}")

        # Index 1 of each PRAGMA table_info row is the column name.
        cols = [c[1] for c in conn.execute("PRAGMA table_info(sources)")]
        if "fetch_strategy" in cols:
            print("  = sources.fetch_strategy war bereits da")
        else:
            conn.execute(
                "ALTER TABLE sources ADD COLUMN fetch_strategy TEXT DEFAULT 'default'"
            )
            print("  + sources.fetch_strategy hinzugefügt (Default 'default')")

        # Pre-flag known paywall domains.
        # NOTE(review): this is an exact match on the lower-cased domain, so
        # variants such as 'www.ft.com' are not flagged, whereas the bot-block
        # pass below uses a suffix LIKE -- confirm the asymmetry is intended.
        paywall_domains = (
            "ft.com",
            "wsj.com",
            "nzz.ch",
            "handelsblatt.com",
            "wiwo.de",
        )
        conn.executemany(
            "UPDATE sources SET fetch_strategy = 'paywall' "
            "WHERE LOWER(domain) = ? AND COALESCE(fetch_strategy, 'default') = 'default'",
            [(dom,) for dom in paywall_domains],
        )
        print("  ~ paywall-Strategie für bekannte Domains gesetzt (FT, WSJ, NZZ, Handelsblatt, WiWo)")

        # Known bot-blocking domains: try the Googlebot strategy. The leading
        # '%' makes this a suffix match, so subdomains are included.
        bot_block_domains = (
            "rheinische-post.de",
            "rp-online.de",
            "verfassungsschutz.de",
        )
        conn.executemany(
            "UPDATE sources SET fetch_strategy = 'googlebot' "
            "WHERE LOWER(domain) LIKE ? AND COALESCE(fetch_strategy, 'default') = 'default'",
            [(f"%{dom}",) for dom in bot_block_domains],
        )
        print("  ~ googlebot-Strategie für bekannte Bot-Block-Domains")

        conn.commit()
    finally:
        # Always release the connection, including when a statement raised.
        conn.close()

    print("Migration abgeschlossen.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: the database path is read from the DB_PATH
    # environment variable, falling back to the path given below. The value
    # returned by main() becomes the process exit status.
    sys.exit(main(os.environ.get("DB_PATH", "/home/claude-dev/osint-data/osint.db")))
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren