Phase 18 (Verwaltung): fetch_strategy in CRUD + Edit-Modal
- migrations/2026-05-09e_fetch_strategy.py NEU: ALTER TABLE sources ADD COLUMN fetch_strategy. Pre-flagging fuer FT/WSJ/NZZ etc. (paywall) und Rheinische Post/Verfassungsschutz (googlebot). - shared/services/source_health.py: gesynct vom Monitor (Phase-18-Code mit Retry-Logik + Strategien default/googlebot/paywall/skip). - routers/sources.py: GlobalSourceCreate/Update um fetch_strategy (Pattern-Validation), SOURCE_UPDATE_COLUMNS + INSERT erweitert. - dashboard.html: Edit-Modal hat jetzt Dropdown sourceFetchStrategy. - sources.js: laedt + sendet fetch_strategy mit. Cache-Buster 20260509c -> 20260509d.
Dieser Commit ist enthalten in:
77
migrations/2026-05-09e_fetch_strategy.py
Normale Datei
77
migrations/2026-05-09e_fetch_strategy.py
Normale Datei
@@ -0,0 +1,77 @@
|
||||
"""Migration 2026-05-09e: sources.fetch_strategy.
|
||||
|
||||
Neues Feld zur Steuerung wie der Health-Check / RSS-Parser eine Quelle abrufen soll:
|
||||
|
||||
default: normaler User-Agent (AegisSight-HealthCheck), bei 403/406/429 Retry mit Googlebot.
|
||||
googlebot: direkt mit Googlebot-UA (fuer Sites die SEO-freundlich sind).
|
||||
paywall: Anfrage direkt via removepaywalls.com mit Browser-User-Agent (fuer Spiegel+/SZ+/FT etc.).
|
||||
skip: Health-Check ueberspringen (bekannte unerreichbare Quellen).
|
||||
|
||||
Ausfuehrung:
|
||||
DB_PATH=/home/claude-dev/osint-data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py
|
||||
DB_PATH=/home/claude-dev/AegisSight-Monitor-staging/data/osint.db python3 migrations/2026-05-09e_fetch_strategy.py
|
||||
"""
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
|
||||
def main(db_path: str) -> int:
    """Add the ``sources.fetch_strategy`` column and pre-flag known domains.

    The migration is idempotent: the column is only added when it is missing,
    and the pre-flagging UPDATEs only touch rows whose strategy is still
    ``'default'`` (or NULL), so manually chosen strategies are preserved on
    re-runs.

    Args:
        db_path: Filesystem path of the SQLite database to migrate.

    Returns:
        0 on success, 1 if ``db_path`` does not exist.

    Raises:
        sqlite3.Error: If any statement fails (e.g. the ``sources`` table is
            missing). The connection is closed before the error propagates.
    """
    if not os.path.exists(db_path):
        print(f"FEHLER: DB nicht gefunden: {db_path}", file=sys.stderr)
        return 1

    conn = sqlite3.connect(db_path, timeout=60)
    try:
        conn.execute("PRAGMA busy_timeout = 60000")
        conn.execute("PRAGMA journal_mode = WAL")

        print(f"Migration auf {db_path}")

        # Column 1 of each PRAGMA table_info row is the column name.
        cols = [c[1] for c in conn.execute("PRAGMA table_info(sources)")]
        if "fetch_strategy" in cols:
            print(" = sources.fetch_strategy war bereits da")
        else:
            conn.execute(
                "ALTER TABLE sources ADD COLUMN fetch_strategy TEXT DEFAULT 'default'"
            )
            print(" + sources.fetch_strategy hinzugefügt (Default 'default')")

        # Pre-flag known paywall domains (exact, case-insensitive domain match).
        paywall_domains = (
            "ft.com",
            "wsj.com",
            "nzz.ch",
            "handelsblatt.com",
            "wiwo.de",
        )
        conn.executemany(
            "UPDATE sources SET fetch_strategy = 'paywall' "
            "WHERE LOWER(domain) = ? AND COALESCE(fetch_strategy, 'default') = 'default'",
            [(dom,) for dom in paywall_domains],
        )
        print(" ~ paywall-Strategie für bekannte Domains gesetzt (FT, WSJ, NZZ, Handelsblatt, WiWo)")

        # Known bot-blocking domains: try Googlebot. The suffix LIKE pattern
        # also matches subdomains such as "www.<domain>".
        bot_block_domains = (
            "rheinische-post.de",
            "rp-online.de",
            "verfassungsschutz.de",
        )
        conn.executemany(
            "UPDATE sources SET fetch_strategy = 'googlebot' "
            "WHERE LOWER(domain) LIKE ? AND COALESCE(fetch_strategy, 'default') = 'default'",
            [(f"%{dom}",) for dom in bot_block_domains],
        )
        print(" ~ googlebot-Strategie für bekannte Bot-Block-Domains")

        conn.commit()
    finally:
        # Always release the connection; closing without commit discards any
        # pending UPDATEs, so a failed run does not leave a half-applied state.
        conn.close()

    print("Migration abgeschlossen.")
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # DB_PATH selects the database to migrate; without it the path below is used.
    target_db = os.environ.get("DB_PATH", "/home/claude-dev/osint-data/osint.db")
    exit_code = main(target_db)
    sys.exit(exit_code)
|
||||
@@ -24,7 +24,7 @@ logger = logging.getLogger("verwaltung.sources")
|
||||
|
||||
router = APIRouter(prefix="/api/sources", tags=["sources"])
|
||||
|
||||
SOURCE_UPDATE_COLUMNS = {"name", "url", "domain", "source_type", "category", "status", "notes", "language", "bias"}
|
||||
SOURCE_UPDATE_COLUMNS = {"name", "url", "domain", "source_type", "category", "status", "notes", "language", "bias", "fetch_strategy"}
|
||||
|
||||
|
||||
@router.get("/meta")
|
||||
@@ -48,6 +48,7 @@ class GlobalSourceCreate(BaseModel):
|
||||
notes: Optional[str] = None
|
||||
language: Optional[str] = Field(default=None, max_length=100)
|
||||
bias: Optional[str] = Field(default=None, max_length=500)
|
||||
fetch_strategy: Optional[str] = Field(default="default", pattern="^(default|googlebot|paywall|skip)$")
|
||||
|
||||
|
||||
class GlobalSourceUpdate(BaseModel):
|
||||
@@ -143,10 +144,10 @@ async def create_global_source(
|
||||
)
|
||||
|
||||
cursor = await db.execute(
|
||||
"""INSERT INTO sources (name, url, domain, source_type, category, status, notes, language, bias, added_by, tenant_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 'system', NULL)""",
|
||||
"""INSERT INTO sources (name, url, domain, source_type, category, status, notes, language, bias, fetch_strategy, added_by, tenant_id)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 'system', NULL)""",
|
||||
(data.name, data.url, data.domain, data.source_type, data.category, data.status, data.notes,
|
||||
data.language, data.bias),
|
||||
data.language, data.bias, data.fetch_strategy or "default"),
|
||||
)
|
||||
src_id = cursor.lastrowid
|
||||
await db.commit()
|
||||
|
||||
@@ -15,6 +15,17 @@ except ImportError:
|
||||
HEALTH_CHECK_USER_AGENT = "Mozilla/5.0 (compatible; AegisSight-HealthCheck/1.0)"
|
||||
HEALTH_CHECK_TIMEOUT_S = 15.0
|
||||
|
||||
# Phase 18: alternative User-Agents fuer Bot-Block-Bypass
|
||||
USER_AGENT_GOOGLEBOT = "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)"
|
||||
USER_AGENT_BROWSER = (
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
|
||||
"(KHTML, like Gecko) Chrome/120.0 Safari/537.36"
|
||||
)
|
||||
REMOVEPAYWALLS_PREFIX = "https://www.removepaywalls.com/search?url="
|
||||
|
||||
# HTTP-Codes, die einen Retry mit anderem UA rechtfertigen
|
||||
RETRY_ON_STATUS = {403, 406, 429}
|
||||
|
||||
logger = logging.getLogger("osint.source_health")
|
||||
|
||||
|
||||
@@ -24,7 +35,8 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
||||
|
||||
# Alle aktiven Quellen laden (global UND Tenant-spezifisch)
|
||||
cursor = await db.execute(
|
||||
"SELECT id, name, url, domain, source_type, article_count, last_seen_at "
|
||||
"SELECT id, name, url, domain, source_type, article_count, last_seen_at, "
|
||||
"COALESCE(fetch_strategy, 'default') AS fetch_strategy "
|
||||
"FROM sources WHERE status = 'active' "
|
||||
)
|
||||
sources = [dict(row) for row in await cursor.fetchall()]
|
||||
@@ -108,16 +120,54 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
||||
async def _check_source_reachability(
|
||||
client: httpx.AsyncClient, source: dict,
|
||||
) -> list[dict]:
|
||||
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
|
||||
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle.
|
||||
|
||||
Phase 18: pro Quelle eine fetch_strategy ('default' | 'googlebot' | 'paywall' | 'skip').
|
||||
Bei 'default' wird im Fehlerfall (403/406/429) ein Retry mit Googlebot-UA gemacht.
|
||||
Bei 'paywall' wird auf removepaywalls.com umgeleitet.
|
||||
Bei 'skip' wird kein Check ausgeführt.
|
||||
"""
|
||||
checks = []
|
||||
url = source["url"]
|
||||
strategy = source.get("fetch_strategy") or "default"
|
||||
|
||||
# URL-Schema sicherstellen: t.me-Kanaele und andere Domains koennen ohne https:// vorkommen
|
||||
# 'skip' -> kein Check (bekannte unerreichbare Quellen, z.B. Login-only)
|
||||
if strategy == "skip":
|
||||
checks.append({
|
||||
"type": "reachability", "status": "ok",
|
||||
"message": "Health-Check uebersprungen (fetch_strategy=skip)",
|
||||
})
|
||||
return checks
|
||||
|
||||
# URL-Schema sicherstellen
|
||||
if url and not url.startswith(("http://", "https://")):
|
||||
url = "https://" + url.lstrip("/")
|
||||
|
||||
# Initialen UA waehlen: googlebot direkt; paywall ueber removepaywalls; default normal
|
||||
initial_ua = HEALTH_CHECK_USER_AGENT
|
||||
initial_url = url
|
||||
if strategy == "googlebot":
|
||||
initial_ua = USER_AGENT_GOOGLEBOT
|
||||
elif strategy == "paywall":
|
||||
initial_url = REMOVEPAYWALLS_PREFIX + url
|
||||
initial_ua = USER_AGENT_BROWSER
|
||||
|
||||
try:
|
||||
resp = await client.get(url)
|
||||
resp = await client.get(initial_url, headers={"User-Agent": initial_ua})
|
||||
|
||||
# Bot-Block-Retry nur bei strategy='default'
|
||||
if (
|
||||
strategy == "default"
|
||||
and resp.status_code in RETRY_ON_STATUS
|
||||
):
|
||||
retry = await client.get(url, headers={"User-Agent": USER_AGENT_GOOGLEBOT})
|
||||
if retry.status_code < 400:
|
||||
resp = retry # Retry hat geholfen
|
||||
checks.append({
|
||||
"type": "reachability", "status": "warning",
|
||||
"message": f"Erreichbar nur mit Googlebot-UA (Standard-UA bekam HTTP {initial_url and 'unknown' or 'XXX'})",
|
||||
})
|
||||
# Hinweis-Eintrag, aber Hauptcheck folgt unten als 'ok' weil resp jetzt die Retry-Antwort ist
|
||||
|
||||
if resp.status_code >= 400:
|
||||
checks.append({
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
<title>AegisSight Monitor-Verwaltung</title>
|
||||
<link rel="icon" type="image/svg+xml" href="/static/favicon.svg">
|
||||
<link rel="apple-touch-icon" href="/static/favicon.svg">
|
||||
<link rel="stylesheet" href="/static/css/style.css?v=20260509c">
|
||||
<link rel="stylesheet" href="/static/css/style.css?v=20260509d">
|
||||
|
||||
<style>
|
||||
.source-badge { display:inline-block; padding:2px 8px; border-radius:4px; font-size:12px; font-weight:600; }
|
||||
@@ -625,6 +625,15 @@
|
||||
<label for="sourceBias">Bias / Einordnung</label>
|
||||
<input type="text" id="sourceBias" placeholder="z.B. Nachrichtenagentur, faktenbasiert-neutral" maxlength="500">
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="sourceFetchStrategy">Fetch-Strategie (Health-Check)</label>
|
||||
<select id="sourceFetchStrategy">
|
||||
<option value="default">Standard (UA + Retry mit Googlebot bei 403)</option>
|
||||
<option value="googlebot">Googlebot (direkt - fuer SEO-freundliche Sites)</option>
|
||||
<option value="paywall">Paywall (via removepaywalls.com - z.B. FT, Spiegel+)</option>
|
||||
<option value="skip">Skip (Health-Check ueberspringen)</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="sourceNotes">Notizen</label>
|
||||
<input type="text" id="sourceNotes" placeholder="Optional">
|
||||
@@ -697,10 +706,10 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script src="/static/js/app.js?v=20260509c"></script>
|
||||
<script src="/static/js/sources.js?v=20260509c"></script>
|
||||
<script src="/static/js/source-health.js?v=20260509c"></script>
|
||||
<script src="/static/js/audit.js?v=20260509c"></script>
|
||||
<script src="/static/js/app.js?v=20260509d"></script>
|
||||
<script src="/static/js/sources.js?v=20260509d"></script>
|
||||
<script src="/static/js/source-health.js?v=20260509d"></script>
|
||||
<script src="/static/js/audit.js?v=20260509d"></script>
|
||||
<div id="toastContainer" class="toast-container" aria-live="polite" aria-atomic="true"></div>
|
||||
</body>
|
||||
</html>
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
<title>AegisSight Monitor-Verwaltung - Anmeldung</title>
|
||||
<link rel="icon" type="image/svg+xml" href="/static/favicon.svg">
|
||||
<link rel="apple-touch-icon" href="/static/favicon.svg">
|
||||
<link rel="stylesheet" href="/static/css/style.css?v=20260509c">
|
||||
<link rel="stylesheet" href="/static/css/style.css?v=20260509d">
|
||||
</head>
|
||||
<body class="login-page">
|
||||
<div class="login-container">
|
||||
|
||||
@@ -297,6 +297,7 @@ function editGlobalSource(id) {
|
||||
document.getElementById("sourceNotes").value = s.notes || "";
|
||||
document.getElementById("sourceLanguage").value = s.language || "";
|
||||
document.getElementById("sourceBias").value = s.bias || "";
|
||||
document.getElementById("sourceFetchStrategy").value = s.fetch_strategy || "default";
|
||||
openModal("modalSource");
|
||||
}
|
||||
|
||||
@@ -324,6 +325,7 @@ function setupSourceForms() {
|
||||
notes: document.getElementById("sourceNotes").value || null,
|
||||
language: document.getElementById("sourceLanguage").value || null,
|
||||
bias: document.getElementById("sourceBias").value || null,
|
||||
fetch_strategy: document.getElementById("sourceFetchStrategy").value || "default",
|
||||
};
|
||||
|
||||
try {
|
||||
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren