From 7f729443cb18b487c1972ce469cc381cfb8b3490 Mon Sep 17 00:00:00 2001 From: claude-dev Date: Sat, 9 May 2026 04:57:01 +0000 Subject: [PATCH] Phase 18 (Verwaltung): fetch_strategy in CRUD + Edit-Modal - migrations/2026-05-09e_fetch_strategy.py NEU: ALTER TABLE sources ADD COLUMN fetch_strategy. Pre-flagging fuer FT/WSJ/NZZ etc. (paywall) und Rheinische Post/Verfassungsschutz (googlebot). - shared/services/source_health.py: gesynct vom Monitor (Phase-18-Code mit Retry-Logik + Strategien default/googlebot/paywall/skip). - routers/sources.py: GlobalSourceCreate/Update um fetch_strategy (Pattern-Validation), SOURCE_UPDATE_COLUMNS + INSERT erweitert. - dashboard.html: Edit-Modal hat jetzt Dropdown sourceFetchStrategy. - sources.js: laedt + sendet fetch_strategy mit. Cache-Buster 20260509c -> 20260509d. --- migrations/2026-05-09e_fetch_strategy.py | 77 ++++++++++++++++++++++++ src/routers/sources.py | 9 +-- src/shared/services/source_health.py | 58 ++++++++++++++++-- src/static/dashboard.html | 19 ++++-- src/static/index.html | 2 +- src/static/js/sources.js | 2 + 6 files changed, 153 insertions(+), 14 deletions(-) create mode 100644 migrations/2026-05-09e_fetch_strategy.py diff --git a/migrations/2026-05-09e_fetch_strategy.py b/migrations/2026-05-09e_fetch_strategy.py new file mode 100644 index 0000000..f20b98b --- /dev/null +++ b/migrations/2026-05-09e_fetch_strategy.py @@ -0,0 +1,77 @@ +"""Migration 2026-05-09e: sources.fetch_strategy. + +Neues Feld zur Steuerung wie der Health-Check / RSS-Parser eine Quelle abrufen soll: + + default: normaler User-Agent (AegisSight-HealthCheck), bei 403/429 Retry mit Googlebot. + googlebot: direkt mit Googlebot-UA (fuer Sites die SEO-freundlich sind). + paywall: bei 403 zweite Anfrage via removepaywalls.com (fuer Spiegel+/SZ+/FT etc.). + skip: Health-Check ueberspringen (bekannte unerreichbare Quellen). 
+ +Ausfuehrung: + DB_PATH=/home/claude-dev/osint-data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py + DB_PATH=/home/claude-dev/AegisSight-Monitor-staging/data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py +""" +import os +import sqlite3 +import sys + + +def main(db_path: str) -> int: + if not os.path.exists(db_path): + print(f"FEHLER: DB nicht gefunden: {db_path}", file=sys.stderr) + return 1 + + conn = sqlite3.connect(db_path, timeout=60) + conn.execute("PRAGMA busy_timeout = 60000") + conn.execute("PRAGMA journal_mode = WAL") + + print(f"Migration auf {db_path}") + + cols = [c[1] for c in conn.execute("PRAGMA table_info(sources)")] + if "fetch_strategy" in cols: + print(" = sources.fetch_strategy war bereits da") + else: + conn.execute( + "ALTER TABLE sources ADD COLUMN fetch_strategy TEXT DEFAULT 'default'" + ) + print(" + sources.fetch_strategy hinzugefügt (Default 'default')") + + # Pre-flag known paywall domains (only rows still on the 'default' strategy) + paywall_domains = ( + "ft.com", + "wsj.com", + "nzz.ch", + "handelsblatt.com", + "wiwo.de", + ) + for dom in paywall_domains: + conn.execute( + "UPDATE sources SET fetch_strategy = 'paywall' " + "WHERE LOWER(domain) = ? AND COALESCE(fetch_strategy, 'default') = 'default'", + (dom,), + ) + print(" ~ paywall-Strategie für bekannte Domains gesetzt (FT, WSJ, NZZ, Handelsblatt, WiWo)") + + # Known bot-blocking domains: use Googlebot. Match the domain itself or a real subdomain only; a bare '%suffix' would also hit e.g. sharp-online.de + bot_block_domains = ( + "rheinische-post.de", + "rp-online.de", + "verfassungsschutz.de", + ) + for dom in bot_block_domains: + conn.execute( + "UPDATE sources SET fetch_strategy = 'googlebot' " + "WHERE (LOWER(domain) = ? OR LOWER(domain) LIKE ?) 
AND COALESCE(fetch_strategy, 'default') = 'default'", + (dom, f"%.{dom}"), + ) + print(" ~ googlebot-Strategie für bekannte Bot-Block-Domains") + + conn.commit() + conn.close() + print("Migration abgeschlossen.") + return 0 + + +if __name__ == "__main__": + db_path = os.environ.get("DB_PATH", "/home/claude-dev/osint-data/osint.db") + sys.exit(main(db_path)) diff --git a/src/routers/sources.py b/src/routers/sources.py index 5a46c6b..dd96608 100644 --- a/src/routers/sources.py +++ b/src/routers/sources.py @@ -24,7 +24,7 @@ logger = logging.getLogger("verwaltung.sources") router = APIRouter(prefix="/api/sources", tags=["sources"]) -SOURCE_UPDATE_COLUMNS = {"name", "url", "domain", "source_type", "category", "status", "notes", "language", "bias"} +SOURCE_UPDATE_COLUMNS = {"name", "url", "domain", "source_type", "category", "status", "notes", "language", "bias", "fetch_strategy"} @router.get("/meta") @@ -48,6 +48,7 @@ class GlobalSourceCreate(BaseModel): notes: Optional[str] = None language: Optional[str] = Field(default=None, max_length=100) bias: Optional[str] = Field(default=None, max_length=500) + fetch_strategy: Optional[str] = Field(default="default", pattern="^(default|googlebot|paywall|skip)$") class GlobalSourceUpdate(BaseModel): @@ -143,10 +144,10 @@ async def create_global_source( ) cursor = await db.execute( - """INSERT INTO sources (name, url, domain, source_type, category, status, notes, language, bias, added_by, tenant_id) - VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'system', NULL)""", + """INSERT INTO sources (name, url, domain, source_type, category, status, notes, language, bias, fetch_strategy, added_by, tenant_id) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'system', NULL)""", (data.name, data.url, data.domain, data.source_type, data.category, data.status, data.notes, - data.language, data.bias), + data.language, data.bias, data.fetch_strategy or "default"), ) src_id = cursor.lastrowid await db.commit() diff --git a/src/shared/services/source_health.py 
b/src/shared/services/source_health.py index 9837cda..b07b5a0 100644 --- a/src/shared/services/source_health.py +++ b/src/shared/services/source_health.py @@ -15,6 +15,17 @@ except ImportError: HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)" HEALTH_CHECK_TIMEOUT_S = 15.0 +# Phase 18: alternative user agents for getting past bot blocks +USER_AGENT_GOOGLEBOT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)" +USER_AGENT_BROWSER = ( + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/120.0 Safari/537.36" +) +REMOVEPAYWALLS_PREFIX = "https://www.removepaywalls.com/search?url=" + +# HTTP status codes that justify a retry with a different user agent +RETRY_ON_STATUS = {403, 406, 429} + logger = logging.getLogger("osint.source_health") @@ -24,7 +35,8 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict: # Alle aktiven Quellen laden (global UND Tenant-spezifisch) cursor = await db.execute( - "SELECT id, name, url, domain, source_type, article_count, last_seen_at " + "SELECT id, name, url, domain, source_type, article_count, last_seen_at, " + "COALESCE(fetch_strategy, 'default') AS fetch_strategy " "FROM sources WHERE status = 'active' " ) sources = [dict(row) for row in await cursor.fetchall()] @@ -108,16 +120,54 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict: async def _check_source_reachability( client: httpx.AsyncClient, source: dict, ) -> list[dict]: - """Prüft Erreichbarkeit und Feed-Validität einer Quelle.""" + """Check reachability and feed validity of a single source. + + Phase 18: each source carries a fetch_strategy ('default' | 'googlebot' | 'paywall' | 'skip'). + 'default': on HTTP 403/406/429 the request is retried once with the Googlebot UA. + 'paywall': the request goes through removepaywalls.com with a browser UA. + 'skip': no request is made; a single 'ok' check entry is returned. 
+ """ checks = [] url = source["url"] + strategy = source.get("fetch_strategy") or "default" - # URL-Schema sicherstellen: t.me-Kanaele und andere Domains koennen ohne https:// vorkommen + # 'skip' -> no check at all (known unreachable sources, e.g. login-only) + if strategy == "skip": + checks.append({ + "type": "reachability", "status": "ok", + "message": "Health-Check uebersprungen (fetch_strategy=skip)", + }) + return checks + + # Ensure the URL has a scheme if url and not url.startswith(("http://", "https://")): url = "https://" + url.lstrip("/") + # Pick the initial request: googlebot UA directly; paywall via removepaywalls; default UA otherwise + initial_ua = HEALTH_CHECK_USER_AGENT + initial_url = url + if strategy == "googlebot": + initial_ua = USER_AGENT_GOOGLEBOT + elif strategy == "paywall": + initial_url = REMOVEPAYWALLS_PREFIX + url + initial_ua = USER_AGENT_BROWSER + try: - resp = await client.get(url) + resp = await client.get(initial_url, headers={"User-Agent": initial_ua}) + + # Bot-block retry only for strategy='default' + if ( + strategy == "default" + and resp.status_code in RETRY_ON_STATUS + ): + retry = await client.get(url, headers={"User-Agent": USER_AGENT_GOOGLEBOT}) + if retry.status_code < 400: + checks.append({ + "type": "reachability", "status": "warning", + "message": f"Erreichbar nur mit Googlebot-UA (Standard-UA bekam HTTP {resp.status_code})", + }) + resp = retry # reassign only after the warning: resp must still be the blocked first response when the message is built + # Informational entry only; the main check below now evaluates the retry response if resp.status_code >= 400: checks.append({ diff --git a/src/static/dashboard.html b/src/static/dashboard.html index 6bab51b..61b4d41 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -6,7 +6,7 @@ AegisSight Monitor-Verwaltung - +