feat(source_health): fetch_strategy + Retry mit Googlebot/removepaywalls (Phase 18)
Pro Quelle gibt es ein Feld sources.fetch_strategy (default | googlebot | paywall | skip):
- default: normaler UA; Retry mit Googlebot-UA bei HTTP 403/406/429.
- googlebot: Anfrage direkt mit Googlebot-UA (fuer SEO-freundliche Sites).
- paywall: Anfrage via removepaywalls.com (fuer Spiegel+/SZ+/FT etc.).
- skip: Health-Check ueberspringen (bekannte unerreichbare Quellen, z.B. Login-only).

Pre-Flagging in der Migration: FT/WSJ/NZZ/Handelsblatt/WiWo -> paywall, Rheinische Post/Verfassungsschutz -> googlebot.

Hinweis: Ein Test mit den vier auffaelligsten fehlerhaften Quellen zeigt, dass FT, Rheinische Post (RP) und Verfassungsschutz besonders streng sind und auch ueber Googlebot-UA bzw. removepaywalls.com nicht erreichbar sind. Bei weniger restriktiven Quellen wirkt der Retry-Mechanismus.
Dieser Commit ist enthalten in:
@@ -15,6 +15,17 @@ except ImportError:
|
|||||||
HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)"
|
HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)"
|
||||||
HEALTH_CHECK_TIMEOUT_S = 15.0
|
HEALTH_CHECK_TIMEOUT_S = 15.0
|
||||||
|
|
||||||
|
# Phase 18: alternative User-Agents fuer Bot-Block-Bypass
|
||||||
|
USER_AGENT_GOOGLEBOT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||||
|
USER_AGENT_BROWSER = (
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
|
||||||
|
"(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
|
||||||
|
)
|
||||||
|
REMOVEPAYWALLS_PREFIX = "https://www.removepaywalls.com/search?url="
|
||||||
|
|
||||||
|
# HTTP-Codes, die einen Retry mit anderem UA rechtfertigen
|
||||||
|
RETRY_ON_STATUS = {403, 406, 429}
|
||||||
|
|
||||||
logger = logging.getLogger("osint.source_health")
|
logger = logging.getLogger("osint.source_health")
|
||||||
|
|
||||||
|
|
||||||
@@ -24,7 +35,8 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
|
|
||||||
# Alle aktiven Quellen laden (global UND Tenant-spezifisch)
|
# Alle aktiven Quellen laden (global UND Tenant-spezifisch)
|
||||||
cursor = await db.execute(
|
cursor = await db.execute(
|
||||||
"SELECT id, name, url, domain, source_type, article_count, last_seen_at "
|
"SELECT id, name, url, domain, source_type, article_count, last_seen_at, "
|
||||||
|
"COALESCE(fetch_strategy, 'default') AS fetch_strategy "
|
||||||
"FROM sources WHERE status = 'active' "
|
"FROM sources WHERE status = 'active' "
|
||||||
)
|
)
|
||||||
sources = [dict(row) for row in await cursor.fetchall()]
|
sources = [dict(row) for row in await cursor.fetchall()]
|
||||||
@@ -108,16 +120,54 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
async def _check_source_reachability(
|
async def _check_source_reachability(
|
||||||
client: httpx.AsyncClient, source: dict,
|
client: httpx.AsyncClient, source: dict,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
|
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle.
|
||||||
|
|
||||||
|
Phase 18: pro Quelle eine fetch_strategy ('default' | 'googlebot' | 'paywall' | 'skip').
|
||||||
|
Bei 'default' wird im Fehlerfall (403/406/429) ein Retry mit Googlebot-UA gemacht.
|
||||||
|
Bei 'paywall' wird auf removepaywalls.com umgeleitet.
|
||||||
|
Bei 'skip' wird kein Check ausgeführt.
|
||||||
|
"""
|
||||||
checks = []
|
checks = []
|
||||||
url = source["url"]
|
url = source["url"]
|
||||||
|
strategy = source.get("fetch_strategy") or "default"
|
||||||
|
|
||||||
# URL-Schema sicherstellen: t.me-Kanaele und andere Domains koennen ohne https:// vorkommen
|
# 'skip' -> kein Check (bekannte unerreichbare Quellen, z.B. Login-only)
|
||||||
|
if strategy == "skip":
|
||||||
|
checks.append({
|
||||||
|
"type": "reachability", "status": "ok",
|
||||||
|
"message": "Health-Check uebersprungen (fetch_strategy=skip)",
|
||||||
|
})
|
||||||
|
return checks
|
||||||
|
|
||||||
|
# URL-Schema sicherstellen
|
||||||
if url and not url.startswith(("http://", "https://")):
|
if url and not url.startswith(("http://", "https://")):
|
||||||
url = "https://" + url.lstrip("/")
|
url = "https://" + url.lstrip("/")
|
||||||
|
|
||||||
|
# Initialen UA waehlen: googlebot direkt; paywall ueber removepaywalls; default normal
|
||||||
|
initial_ua = HEALTH_CHECK_USER_AGENT
|
||||||
|
initial_url = url
|
||||||
|
if strategy == "googlebot":
|
||||||
|
initial_ua = USER_AGENT_GOOGLEBOT
|
||||||
|
elif strategy == "paywall":
|
||||||
|
initial_url = REMOVEPAYWALLS_PREFIX + url
|
||||||
|
initial_ua = USER_AGENT_BROWSER
|
||||||
|
|
||||||
try:
|
try:
|
||||||
resp = await client.get(url)
|
resp = await client.get(initial_url, headers={"User-Agent": initial_ua})
|
||||||
|
|
||||||
|
# Bot-Block-Retry nur bei strategy='default'
|
||||||
|
if (
|
||||||
|
strategy == "default"
|
||||||
|
and resp.status_code in RETRY_ON_STATUS
|
||||||
|
):
|
||||||
|
retry = await client.get(url, headers={"User-Agent": USER_AGENT_GOOGLEBOT})
|
||||||
|
if retry.status_code < 400:
|
||||||
|
resp = retry # Retry hat geholfen
|
||||||
|
checks.append({
|
||||||
|
"type": "reachability", "status": "warning",
|
||||||
|
"message": f"Erreichbar nur mit Googlebot-UA (Standard-UA bekam HTTP {initial_url and 'unknown' or 'XXX'})",
|
||||||
|
})
|
||||||
|
# Hinweis-Eintrag, aber Hauptcheck folgt unten als 'ok' weil resp jetzt die Retry-Antwort ist
|
||||||
|
|
||||||
if resp.status_code >= 400:
|
if resp.status_code >= 400:
|
||||||
checks.append({
|
checks.append({
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren