Initial commit: AegisSight-Monitor (OSINT-Monitoringsystem)
Dieser Commit ist enthalten in:
169
src/agents/analyzer.py
Normale Datei
169
src/agents/analyzer.py
Normale Datei
@@ -0,0 +1,169 @@
|
||||
"""Analyzer-Agent: Analysiert, übersetzt und fasst Meldungen zusammen."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime
|
||||
from config import TIMEZONE
|
||||
from agents.claude_client import call_claude, ClaudeUsage
|
||||
|
||||
logger = logging.getLogger("osint.analyzer")
|
||||
|
||||
ANALYSIS_PROMPT_TEMPLATE = """Du bist ein OSINT-Analyse-Agent für ein Lagemonitoring-System.
|
||||
HEUTIGES DATUM: {today}
|
||||
AUSGABESPRACHE: {output_language}
|
||||
|
||||
VORFALL: {title}
|
||||
KONTEXT: {description}
|
||||
|
||||
VORHANDENE MELDUNGEN:
|
||||
{articles_text}
|
||||
|
||||
AUFTRAG:
|
||||
1. Erstelle eine neutrale, faktenbasierte Zusammenfassung auf {output_language} (max. 500 Wörter)
|
||||
2. Verwende Inline-Quellenverweise [1], [2], [3] etc. im Zusammenfassungstext
|
||||
3. Liste die bestätigten Kernfakten auf
|
||||
4. Übersetze fremdsprachige Überschriften und Inhalte in die Ausgabesprache
|
||||
|
||||
STRUKTUR:
|
||||
- Wenn die Meldungen thematisch klar einen einzelnen Strang behandeln: Fließtext ohne Überschriften
|
||||
- Wenn verschiedene Aspekte oder Themenfelder aufkommen (z.B. Ereignis + Reaktionen + Hintergrund): Gliedere mit kurzen Markdown-Zwischenüberschriften (##)
|
||||
- Die Entscheidung liegt bei dir — Überschriften nur wenn sie dem Leser helfen, verschiedene Themenstränge auseinanderzuhalten
|
||||
|
||||
REGELN:
|
||||
- Neutral und sachlich - keine Wertungen oder Spekulationen
|
||||
- Nur gesicherte Informationen in die Zusammenfassung
|
||||
- Bei widersprüchlichen Angaben beide Seiten erwähnen
|
||||
- Quellen immer mit [Nr] referenzieren
|
||||
- Jede verwendete Quelle MUSS im sources-Array aufgelistet sein
|
||||
- Nummeriere die Quellen fortlaufend ab [1]
|
||||
- Ältere Quellen zeitlich einordnen (z.B. "laut einem Bericht vom Januar", "Anfang Februar berichtete...")
|
||||
|
||||
Antworte AUSSCHLIESSLICH als JSON-Objekt mit diesen Feldern:
|
||||
- "summary": Zusammenfassung auf {output_language} mit Quellenverweisen [1], [2] etc. im Text (Markdown-Überschriften ## erlaubt wenn sinnvoll)
|
||||
- "sources": Array von Quellenobjekten, je: {{"nr": 1, "name": "Quellenname", "url": "https://..."}}
|
||||
- "key_facts": Array von bestätigten Kernfakten (Strings, in Ausgabesprache)
|
||||
- "translations": Array von Objekten mit "article_id", "headline_de", "content_de" (nur für fremdsprachige Artikel)
|
||||
|
||||
Antworte NUR mit dem JSON-Objekt. Keine Einleitung, keine Erklärung."""
|
||||
|
||||
BRIEFING_PROMPT_TEMPLATE = """Du bist ein OSINT-Analyse-Agent für ein Lagemonitoring-System.
|
||||
Du erstellst ein strukturiertes Briefing für eine Hintergrundrecherche.
|
||||
HEUTIGES DATUM: {today}
|
||||
AUSGABESPRACHE: {output_language}
|
||||
|
||||
THEMA: {title}
|
||||
KONTEXT: {description}
|
||||
|
||||
VORLIEGENDE QUELLEN:
|
||||
{articles_text}
|
||||
|
||||
AUFTRAG:
|
||||
Erstelle ein strukturiertes Briefing (max. 800 Wörter) auf {output_language} mit folgenden Abschnitten.
|
||||
Verwende durchgehend Inline-Quellenverweise [1], [2], [3] etc. im Text.
|
||||
|
||||
## ÜBERBLICK
|
||||
Kurze Einordnung des Themas (2-3 Sätze)
|
||||
|
||||
## HINTERGRUND
|
||||
Historischer Kontext, relevante Vorgeschichte
|
||||
|
||||
## AKTEURE
|
||||
Beteiligte Personen, Organisationen, Institutionen und ihre Rollen
|
||||
|
||||
## AKTUELLE LAGE
|
||||
Was ist der aktuelle Stand? Welche Entwicklungen gibt es?
|
||||
|
||||
## EINSCHÄTZUNG
|
||||
Sachliche Bewertung der Situation, mögliche Entwicklungen
|
||||
|
||||
## QUELLENQUALITÄT
|
||||
Kurze Bewertung der Informationslage: Wie belastbar sind die vorliegenden Quellen?
|
||||
|
||||
REGELN:
|
||||
- Neutral und sachlich - keine Wertungen oder Spekulationen
|
||||
- Nur gesicherte Informationen verwenden
|
||||
- Bei widersprüchlichen Angaben beide Seiten erwähnen
|
||||
- Quellen immer mit [Nr] referenzieren
|
||||
- Jede verwendete Quelle MUSS im sources-Array aufgelistet sein
|
||||
- Nummeriere die Quellen fortlaufend ab [1]
|
||||
- Ältere Quellen zeitlich einordnen (z.B. "laut einem Bericht vom Januar", "Anfang Februar berichtete...")
|
||||
- Markdown-Überschriften (##) für die Abschnitte verwenden
|
||||
- Fettdruck (**) für Schlüsselbegriffe erlaubt
|
||||
|
||||
Antworte AUSSCHLIESSLICH als JSON-Objekt mit diesen Feldern:
|
||||
- "summary": Das strukturierte Briefing als Markdown-Text mit Quellenverweisen [1], [2] etc.
|
||||
- "sources": Array von Quellenobjekten, je: {{"nr": 1, "name": "Quellenname", "url": "https://..."}}
|
||||
- "key_facts": Array von gesicherten Kernfakten (Strings, in Ausgabesprache)
|
||||
- "translations": Array von Objekten mit "article_id", "headline_de", "content_de" (nur für fremdsprachige Artikel)
|
||||
|
||||
Antworte NUR mit dem JSON-Objekt. Keine Einleitung, keine Erklärung."""
|
||||
|
||||
|
||||
class AnalyzerAgent:
|
||||
"""Analysiert und übersetzt Meldungen über Claude CLI."""
|
||||
|
||||
async def analyze(self, title: str, description: str, articles: list[dict], incident_type: str = "adhoc") -> tuple[dict | None, ClaudeUsage | None]:
|
||||
"""Analysiert alle Meldungen zu einem Vorfall."""
|
||||
if not articles:
|
||||
return None, None
|
||||
|
||||
# Artikel-Text für Prompt aufbereiten
|
||||
articles_text = ""
|
||||
for i, article in enumerate(articles[:30]): # Max 30 Artikel um Prompt-Länge zu begrenzen
|
||||
articles_text += f"\n--- Meldung {i+1} (ID: {article.get('id', 'neu')}) ---\n"
|
||||
articles_text += f"Quelle: {article.get('source', 'Unbekannt')}\n"
|
||||
url = article.get('source_url', '')
|
||||
if url:
|
||||
articles_text += f"URL: {url}\n"
|
||||
articles_text += f"Sprache: {article.get('language', 'de')}\n"
|
||||
published = article.get('published_at', '')
|
||||
if published:
|
||||
articles_text += f"Veröffentlicht: {published}\n"
|
||||
headline = article.get('headline_de') or article.get('headline', '')
|
||||
articles_text += f"Überschrift: {headline}\n"
|
||||
content = article.get('content_de') or article.get('content_original', '')
|
||||
if content:
|
||||
articles_text += f"Inhalt: {content[:500]}\n"
|
||||
|
||||
from config import OUTPUT_LANGUAGE
|
||||
today = datetime.now(TIMEZONE).strftime("%d.%m.%Y")
|
||||
template = BRIEFING_PROMPT_TEMPLATE if incident_type == "research" else ANALYSIS_PROMPT_TEMPLATE
|
||||
prompt = template.format(
|
||||
title=title,
|
||||
description=description or "Keine weiteren Details",
|
||||
articles_text=articles_text,
|
||||
today=today,
|
||||
output_language=OUTPUT_LANGUAGE,
|
||||
)
|
||||
|
||||
try:
|
||||
result, usage = await call_claude(prompt)
|
||||
analysis = self._parse_response(result)
|
||||
if analysis:
|
||||
logger.info(f"Analyse abgeschlossen: {len(analysis.get('sources', []))} Quellen referenziert")
|
||||
return analysis, usage
|
||||
except Exception as e:
|
||||
logger.error(f"Analyse-Fehler: {e}")
|
||||
return None, None
|
||||
|
||||
def _parse_response(self, response: str) -> dict | None:
|
||||
"""Parst die Claude-Antwort als JSON-Objekt."""
|
||||
try:
|
||||
data = json.loads(response)
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
match = re.search(r'\{.*\}', response, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
if isinstance(data, dict):
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
logger.warning("Konnte Analyse-Antwort nicht als JSON parsen")
|
||||
return None
|
||||
88
src/agents/claude_client.py
Normale Datei
88
src/agents/claude_client.py
Normale Datei
@@ -0,0 +1,88 @@
|
||||
"""Shared Claude CLI Client mit Usage-Tracking."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
from config import CLAUDE_PATH, CLAUDE_TIMEOUT
|
||||
|
||||
logger = logging.getLogger("osint.claude_client")
|
||||
|
||||
|
||||
@dataclass
class ClaudeUsage:
    """Token consumption of a single Claude CLI invocation."""
    # Prompt tokens consumed by this call.
    input_tokens: int = 0
    # Completion tokens produced by this call.
    output_tokens: int = 0
    # Tokens written into the prompt cache (CLI field: cache_creation_input_tokens).
    cache_creation_tokens: int = 0
    # Tokens served from the prompt cache (CLI field: cache_read_input_tokens).
    cache_read_tokens: int = 0
    # Total cost in USD as reported by the CLI (total_cost_usd).
    cost_usd: float = 0.0
    # Wall-clock duration of the CLI call as reported (duration_ms).
    duration_ms: int = 0
|
||||
|
||||
|
||||
@dataclass
class UsageAccumulator:
    """Accumulates usage across the multiple Claude calls of one refresh."""
    # Running token totals across all calls added so far.
    input_tokens: int = 0
    output_tokens: int = 0
    cache_creation_tokens: int = 0
    cache_read_tokens: int = 0
    # Sum of per-call cost_usd values.
    total_cost_usd: float = 0.0
    # Number of calls folded in via add().
    call_count: int = 0

    def add(self, usage: ClaudeUsage) -> None:
        """Fold a single call's usage into the running totals."""
        self.input_tokens += usage.input_tokens
        self.output_tokens += usage.output_tokens
        self.cache_creation_tokens += usage.cache_creation_tokens
        self.cache_read_tokens += usage.cache_read_tokens
        self.total_cost_usd += usage.cost_usd
        self.call_count += 1
|
||||
|
||||
|
||||
async def call_claude(prompt: str, tools: str | None = "WebSearch,WebFetch") -> tuple[str, ClaudeUsage]:
    """Invoke the Claude CLI and return (result_text, usage).

    Args:
        prompt: Prompt text passed via ``-p``.
        tools: Comma-separated tool allow-list, or None/"" for a tool-less
            single-turn call.

    Returns:
        Tuple of the model's result text and the parsed ClaudeUsage (all
        zeros when the CLI output is not valid JSON).

    Raises:
        TimeoutError: The CLI did not finish within CLAUDE_TIMEOUT seconds.
        RuntimeError: The CLI exited with a non-zero status.
    """
    cmd = [CLAUDE_PATH, "-p", prompt, "--output-format", "json"]
    if tools:
        cmd.extend(["--allowedTools", tools])
    else:
        # Without tools, force a single turn so the CLI cannot loop.
        cmd.extend(["--max-turns", "1"])

    process = await asyncio.create_subprocess_exec(
        *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE,
        # Minimal fixed environment: deterministic PATH; HOME so the CLI
        # finds its config (deployment-specific path — confirm on new hosts).
        env={"PATH": "/usr/local/bin:/usr/bin:/bin", "HOME": "/home/claude-dev"},
    )
    try:
        stdout, stderr = await asyncio.wait_for(process.communicate(), timeout=CLAUDE_TIMEOUT)
    except asyncio.TimeoutError:
        process.kill()
        # Fix: reap the killed child — without wait() it lingers as a
        # zombie and asyncio logs a "process is still running" warning.
        await process.wait()
        raise TimeoutError(f"Claude CLI Timeout nach {CLAUDE_TIMEOUT}s") from None

    if process.returncode != 0:
        error_msg = stderr.decode("utf-8", errors="replace").strip()
        logger.error(f"Claude CLI Fehler (Exit {process.returncode}): {error_msg}")
        raise RuntimeError(f"Claude CLI Fehler: {error_msg}")

    raw = stdout.decode("utf-8", errors="replace").strip()
    usage = ClaudeUsage()
    result_text = raw

    try:
        data = json.loads(raw)
        # Fix: guard the type — a top-level JSON array/scalar previously
        # raised AttributeError on .get() instead of falling back to raw.
        if isinstance(data, dict):
            result_text = data.get("result", raw)
            u = data.get("usage", {})
            usage = ClaudeUsage(
                input_tokens=u.get("input_tokens", 0),
                output_tokens=u.get("output_tokens", 0),
                cache_creation_tokens=u.get("cache_creation_input_tokens", 0),
                cache_read_tokens=u.get("cache_read_input_tokens", 0),
                cost_usd=data.get("total_cost_usd", 0.0),
                duration_ms=data.get("duration_ms", 0),
            )
            logger.info(
                f"Claude: {usage.input_tokens} in / {usage.output_tokens} out / "
                f"cache {usage.cache_creation_tokens}+{usage.cache_read_tokens} / "
                f"${usage.cost_usd:.4f} / {usage.duration_ms}ms"
            )
    except json.JSONDecodeError:
        logger.warning("Claude CLI Antwort kein gültiges JSON, nutze raw output")

    return result_text, usage
|
||||
143
src/agents/factchecker.py
Normale Datei
143
src/agents/factchecker.py
Normale Datei
@@ -0,0 +1,143 @@
|
||||
"""Factchecker-Agent: Prüft Fakten gegen mehrere unabhängige Quellen."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from agents.claude_client import call_claude, ClaudeUsage
|
||||
|
||||
logger = logging.getLogger("osint.factchecker")
|
||||
|
||||
FACTCHECK_PROMPT_TEMPLATE = """Du bist ein Faktencheck-Agent für ein OSINT-Lagemonitoring-System.
|
||||
AUSGABESPRACHE: {output_language}
|
||||
|
||||
VORFALL: {title}
|
||||
|
||||
VORLIEGENDE MELDUNGEN:
|
||||
{articles_text}
|
||||
|
||||
STRENGE REGELN - KEINE HALLUZINATIONEN:
|
||||
- Du darfst NUR Fakten bewerten, die direkt aus den oben übergebenen Meldungen stammen
|
||||
- KEINE Fakten aus deinem Trainingskorpus - NUR aus den übergebenen Meldungen + WebSearch
|
||||
- Nutze WebSearch um jeden Claim gegen mindestens 1 weitere unabhängige Quelle zu prüfen
|
||||
- Rufe die gefundenen URLs per WebFetch ab um den Inhalt zu verifizieren
|
||||
- Nur wenn du den Claim in der tatsächlich abgerufenen Quelle findest, darfst du ihn als bestätigt markieren
|
||||
- Jeder Claim MUSS eine konkrete Quellen-URL als Beleg enthalten
|
||||
- "confirmed" erst bei 2+ unabhängigen Quellen mit überprüfbarer URL
|
||||
- Lieber "unconfirmed" als falsch bestätigt
|
||||
|
||||
AUFTRAG:
|
||||
1. Identifiziere die 5-10 wichtigsten Faktenaussagen aus den Meldungen
|
||||
2. Prüfe jeden Claim aktiv per WebSearch gegen mindestens eine weitere unabhängige Quelle
|
||||
3. Kategorisiere jede Aussage:
|
||||
- "confirmed": Durch 2+ unabhängige seriöse Quellen mit überprüfbarer URL bestätigt
|
||||
- "unconfirmed": Nur 1 Quelle oder nicht unabhängig verifizierbar
|
||||
- "contradicted": Widersprüchliche Informationen aus verschiedenen Quellen
|
||||
- "developing": Situation noch unklar, entwickelt sich
|
||||
4. Markiere WICHTIGE NEUE Entwicklungen mit is_notification: true
|
||||
|
||||
Antworte AUSSCHLIESSLICH als JSON-Array. Jedes Element hat:
|
||||
- "claim": Die Faktenaussage auf {output_language}
|
||||
- "status": "confirmed" | "unconfirmed" | "contradicted" | "developing"
|
||||
- "sources_count": Anzahl unabhängiger Quellen mit überprüfbarer URL
|
||||
- "evidence": Begründung MIT konkreten Quellen-URLs als Beleg (z.B. "Bestätigt durch: tagesschau.de (URL), Reuters (URL)")
|
||||
- "is_notification": true/false (nur bei wichtigen Entwicklungen true)
|
||||
|
||||
Antworte NUR mit dem JSON-Array. Keine Einleitung, keine Erklärung."""
|
||||
|
||||
RESEARCH_FACTCHECK_PROMPT_TEMPLATE = """Du bist ein Faktencheck-Agent für eine Hintergrundrecherche in einem OSINT-Lagemonitoring-System.
|
||||
AUSGABESPRACHE: {output_language}
|
||||
|
||||
THEMA: {title}
|
||||
|
||||
VORLIEGENDE QUELLEN:
|
||||
{articles_text}
|
||||
|
||||
STRENGE REGELN - KEINE HALLUZINATIONEN:
|
||||
- Du darfst NUR Fakten bewerten, die direkt aus den oben übergebenen Quellen stammen
|
||||
- KEINE Fakten aus deinem Trainingskorpus - NUR aus den übergebenen Quellen + WebSearch
|
||||
- Nutze WebSearch um jeden Claim gegen mindestens 1 weitere unabhängige Quelle zu prüfen
|
||||
- Rufe die gefundenen URLs per WebFetch ab um den Inhalt zu verifizieren
|
||||
- Nur wenn du den Claim in der tatsächlich abgerufenen Quelle findest, darfst du ihn als gesichert markieren
|
||||
- Jeder Claim MUSS eine konkrete Quellen-URL als Beleg enthalten
|
||||
- Lieber "unverified" als falsch bestätigt
|
||||
|
||||
AUFTRAG:
|
||||
Fokus: "Was sind die gesicherten Fakten zu diesem Thema?"
|
||||
|
||||
1. Identifiziere die 5-10 wichtigsten Faktenaussagen aus den Quellen
|
||||
2. Prüfe jeden Claim aktiv per WebSearch gegen weitere unabhängige Quellen
|
||||
3. Kategorisiere jede Aussage:
|
||||
- "established": Breit dokumentierter, gesicherter Fakt (3+ unabhängige Quellen mit URL)
|
||||
- "disputed": Umstrittener Sachverhalt, verschiedene Positionen dokumentiert
|
||||
- "unverified": Einzelbehauptung, nicht unabhängig verifizierbar
|
||||
- "developing": Aktuelle Entwicklung, Faktenlage noch im Fluss
|
||||
4. Markiere WICHTIGE Erkenntnisse mit is_notification: true
|
||||
|
||||
Antworte AUSSCHLIESSLICH als JSON-Array. Jedes Element hat:
|
||||
- "claim": Die Faktenaussage auf {output_language}
|
||||
- "status": "established" | "disputed" | "unverified" | "developing"
|
||||
- "sources_count": Anzahl unabhängiger Quellen mit überprüfbarer URL
|
||||
- "evidence": Begründung MIT konkreten Quellen-URLs als Beleg
|
||||
- "is_notification": true/false
|
||||
|
||||
Antworte NUR mit dem JSON-Array. Keine Einleitung, keine Erklärung."""
|
||||
|
||||
|
||||
class FactCheckerAgent:
|
||||
"""Prüft Fakten über Claude CLI gegen unabhängige Quellen."""
|
||||
|
||||
async def check(self, title: str, articles: list[dict], incident_type: str = "adhoc") -> tuple[list[dict], ClaudeUsage | None]:
|
||||
"""Führt Faktencheck für eine Lage durch."""
|
||||
if not articles:
|
||||
return [], None
|
||||
|
||||
articles_text = ""
|
||||
for i, article in enumerate(articles[:20]):
|
||||
articles_text += f"\n--- Meldung {i+1} ---\n"
|
||||
articles_text += f"Quelle: {article.get('source', 'Unbekannt')}\n"
|
||||
source_url = article.get('source_url', '')
|
||||
if source_url:
|
||||
articles_text += f"URL: {source_url}\n"
|
||||
headline = article.get('headline_de') or article.get('headline', '')
|
||||
articles_text += f"Überschrift: {headline}\n"
|
||||
content = article.get('content_de') or article.get('content_original', '')
|
||||
if content:
|
||||
articles_text += f"Inhalt: {content[:300]}\n"
|
||||
|
||||
from config import OUTPUT_LANGUAGE
|
||||
template = RESEARCH_FACTCHECK_PROMPT_TEMPLATE if incident_type == "research" else FACTCHECK_PROMPT_TEMPLATE
|
||||
prompt = template.format(
|
||||
title=title,
|
||||
articles_text=articles_text,
|
||||
output_language=OUTPUT_LANGUAGE,
|
||||
)
|
||||
|
||||
try:
|
||||
result, usage = await call_claude(prompt)
|
||||
facts = self._parse_response(result)
|
||||
logger.info(f"Faktencheck: {len(facts)} Fakten geprüft")
|
||||
return facts, usage
|
||||
except Exception as e:
|
||||
logger.error(f"Faktencheck-Fehler: {e}")
|
||||
return [], None
|
||||
|
||||
def _parse_response(self, response: str) -> list[dict]:
|
||||
"""Parst die Claude-Antwort als JSON-Array."""
|
||||
try:
|
||||
data = json.loads(response)
|
||||
if isinstance(data, list):
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
match = re.search(r'\[.*\]', response, re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
data = json.loads(match.group())
|
||||
if isinstance(data, list):
|
||||
return data
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
logger.warning("Konnte Faktencheck-Antwort nicht als JSON parsen")
|
||||
return []
|
||||
893
src/agents/orchestrator.py
Normale Datei
893
src/agents/orchestrator.py
Normale Datei
@@ -0,0 +1,893 @@
|
||||
"""Agenten-Orchestrierung: Queue und Steuerung der Claude-Agenten."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from datetime import datetime, timezone
|
||||
from config import TIMEZONE
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
from agents.claude_client import UsageAccumulator
|
||||
from source_rules import (
|
||||
DOMAIN_CATEGORY_MAP,
|
||||
_detect_category,
|
||||
_extract_domain,
|
||||
discover_source,
|
||||
domain_to_display_name,
|
||||
)
|
||||
|
||||
logger = logging.getLogger("osint.orchestrator")
|
||||
|
||||
|
||||
def _normalize_url(url: str) -> str:
|
||||
"""URL normalisieren für Duplikat-Erkennung."""
|
||||
if not url:
|
||||
return ""
|
||||
url = url.strip()
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
# Scheme normalisieren
|
||||
scheme = parsed.scheme.lower() or "https"
|
||||
# Host normalisieren (www entfernen, lowercase)
|
||||
netloc = parsed.netloc.lower()
|
||||
if netloc.startswith("www."):
|
||||
netloc = netloc[4:]
|
||||
# Pfad normalisieren (trailing slash entfernen)
|
||||
path = parsed.path.rstrip("/")
|
||||
# Query-Parameter und Fragment entfernen (Tracking-Params etc.)
|
||||
return urlunparse((scheme, netloc, path, "", "", ""))
|
||||
except Exception:
|
||||
return url.lower().strip().rstrip("/")
|
||||
|
||||
|
||||
def _normalize_headline(headline: str) -> str:
|
||||
"""Überschrift normalisieren für Ähnlichkeitsvergleich."""
|
||||
if not headline:
|
||||
return ""
|
||||
h = headline.lower().strip()
|
||||
# Umlaute normalisieren
|
||||
h = h.replace("ä", "ae").replace("ö", "oe").replace("ü", "ue").replace("ß", "ss")
|
||||
# Sonderzeichen entfernen
|
||||
h = re.sub(r"[^\w\s]", "", h)
|
||||
h = re.sub(r"\s+", " ", h).strip()
|
||||
return h
|
||||
|
||||
|
||||
def _is_duplicate(article: dict, seen_urls: set, seen_headlines: set) -> bool:
    """Return True if *article* duplicates an already-seen URL or headline.

    Side effect: when the article is new, its normalized URL and (if long
    enough) normalized headline are registered in the given seen-sets.
    """
    url = article.get("source_url", "")
    headline = article.get("headline", "")

    # First pass: compare by normalized URL.
    if url:
        url_key = _normalize_url(url)
        if url_key in seen_urls:
            return True
        seen_urls.add(url_key)

    # Second pass: compare by headline, but only for headlines long
    # enough (> 20 chars) to be distinctive.
    if headline and len(headline) > 20:
        headline_key = _normalize_headline(headline)
        if headline_key:
            if headline_key in seen_headlines:
                return True
            seen_headlines.add(headline_key)

    return False
|
||||
|
||||
|
||||
async def _background_discover_sources(articles: list[dict]):
    """Background task: register reputable, previously unknown sources found in research results.

    For each unique domain among the article URLs that maps to a known
    reputable category, the sources table is checked; new domains get an
    RSS-discovery attempt and are inserted as auto-discovered sources.
    Strictly best-effort: every failure is logged and swallowed.
    """
    from database import get_db

    db = await get_db()
    try:
        # 1. Extract unique domains from the article URLs.
        seen_domains: set[str] = set()
        domains_to_check: list[tuple[str, str, str]] = []
        for article in articles:
            url = article.get("source_url")
            if not url:
                continue
            domain = _extract_domain(url)
            if not domain or domain in seen_domains:
                continue
            seen_domains.add(domain)

            # 2. Only reputable domains (in DOMAIN_CATEGORY_MAP, not "sonstige").
            category = _detect_category(domain)
            if category == "sonstige":
                continue
            domains_to_check.append((domain, url, category))

        if not domains_to_check:
            return

        # 3. Check against the DB — which domains already exist?
        new_count = 0
        for domain, url, category in domains_to_check:
            cursor = await db.execute(
                "SELECT id FROM sources WHERE LOWER(domain) = ?",
                (domain.lower(),),
            )
            if await cursor.fetchone():
                continue  # domain already known

            # 4. RSS feed detection; falls back to a plain web source.
            try:
                result = await discover_source(url)
                name = domain_to_display_name(domain)
                source_type = result["source_type"]  # "rss_feed" or "web_source"
                feed_url = result.get("rss_url")

                await db.execute(
                    """INSERT INTO sources (name, url, domain, source_type, category, status, notes, added_by)
                    VALUES (?, ?, ?, ?, ?, 'active', 'Auto-entdeckt via Recherche', 'system')""",
                    (name, feed_url or f"https://{domain}", domain, source_type, category),
                )
                new_count += 1
                logger.info(f"Neue Quelle registriert: {name} ({domain}) als {source_type}")
            except Exception as e:
                # Per-domain failures are expected (unreachable hosts etc.).
                logger.debug(f"Discovery fehlgeschlagen für {domain}: {e}")

        # Single commit for all inserts of this run.
        if new_count > 0:
            await db.commit()
            logger.info(f"Background-Discovery: {new_count} neue Quellen registriert")
    except Exception as e:
        logger.warning(f"Background-Discovery Fehler: {e}")
    finally:
        await db.close()
|
||||
|
||||
|
||||
async def _create_notifications_for_incident(
    db, incident_id: int, visibility: str, created_by: int, tenant_id: int, notifications: list[dict]
):
    """Create DB notifications for all affected users of the organization.

    Audience:
    - public incidents -> every active org user who has logged in at least once
    - private incidents -> creator only

    Each notification dict needs "title" and "text"; "type" defaults to
    "refresh_summary" and "icon" to "info". Commits once after all inserts.
    """
    if not notifications:
        return

    # One shared UTC timestamp for all rows of this batch.
    now = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    if visibility == "public" and tenant_id:
        cursor = await db.execute(
            "SELECT id FROM users WHERE organization_id = ? AND is_active = 1 AND last_login_at IS NOT NULL",
            (tenant_id,),
        )
        user_ids = [row["id"] for row in await cursor.fetchall()]
    else:
        user_ids = [created_by]

    # Cartesian product: every notification for every recipient.
    for user_id in user_ids:
        for notif in notifications:
            await db.execute(
                """INSERT INTO notifications (user_id, incident_id, type, title, text, icon, tenant_id, created_at)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    user_id,
                    incident_id,
                    notif.get("type", "refresh_summary"),
                    notif["title"],
                    notif["text"],
                    notif.get("icon", "info"),
                    tenant_id,
                    now,
                ),
            )

    await db.commit()
    logger.info(f"Notifications erstellt: {len(notifications)} x {len(user_ids)} Nutzer fuer Lage {incident_id}")
|
||||
|
||||
|
||||
async def _send_email_notifications_for_incident(
    db, incident_id: int, incident_title: str, visibility: str,
    created_by: int, tenant_id: int, notifications: list[dict]
):
    """Send e-mail notifications based on individual per-user subscriptions.

    Every user has their own e-mail preferences per incident
    (incident_subscriptions). Only active users who have logged in at
    least once (last_login_at IS NOT NULL) are considered. Each user
    receives only the notification types they opted into; send failures
    are logged per recipient and never abort the loop.
    """
    if not notifications:
        return

    from email_utils.sender import send_email
    from email_utils.templates import incident_notification_email
    from config import MAGIC_LINK_BASE_URL

    # Load all users with at least one active e-mail preference for this incident.
    cursor = await db.execute(
        """SELECT s.notify_email_summary, s.notify_email_new_articles,
        s.notify_email_status_change, u.id, u.email, u.username
        FROM incident_subscriptions s
        JOIN users u ON u.id = s.user_id
        WHERE s.incident_id = ?
        AND u.is_active = 1
        AND u.last_login_at IS NOT NULL
        AND (s.notify_email_summary = 1
        OR s.notify_email_new_articles = 1
        OR s.notify_email_status_change = 1)""",
        (incident_id,),
    )
    subscribers = await cursor.fetchall()
    if not subscribers:
        return

    dashboard_url = f"{MAGIC_LINK_BASE_URL}/dashboard"

    for sub in subscribers:
        prefs = dict(sub)

        # Keep only the notifications matching this user's preferences.
        filtered_notifications = []
        for n in notifications:
            ntype = n.get("type", "refresh_summary")

            if ntype == "refresh_summary" and prefs.get("notify_email_summary"):
                filtered_notifications.append(n)
            elif ntype == "new_articles" and prefs.get("notify_email_new_articles"):
                filtered_notifications.append(n)
            elif ntype == "status_change" and prefs.get("notify_email_status_change"):
                filtered_notifications.append(n)

        if not filtered_notifications:
            continue

        subject, html = incident_notification_email(
            username=prefs["username"],
            incident_title=incident_title,
            notifications=filtered_notifications,
            dashboard_url=dashboard_url,
        )
        try:
            await send_email(prefs["email"], subject, html)
            logger.info(f"E-Mail-Benachrichtigung gesendet an {prefs['email']} fuer Lage {incident_id} ({len(filtered_notifications)} Items)")
        except Exception as e:
            logger.error(f"E-Mail-Benachrichtigung fehlgeschlagen fuer {prefs['email']}: {e}")
|
||||
|
||||
|
||||
class AgentOrchestrator:
|
||||
"""Verwaltet die Claude-Agenten-Queue und koordiniert Recherche-Zyklen."""
|
||||
|
||||
    def __init__(self):
        # FIFO of (incident_id, trigger_type) refresh jobs, processed sequentially.
        self._queue: asyncio.Queue = asyncio.Queue()
        # Worker loop runs while True; set by start(), cleared by stop().
        self._running = False
        # incident_id currently being refreshed, or None when idle.
        self._current_task: Optional[int] = None
        # WebSocket manager for real-time status broadcasts (injected via set_ws_manager).
        self._ws_manager = None
        # incident_ids currently waiting in the queue (de-dup in enqueue_refresh).
        self._queued_ids: set[int] = set()
        # incident_ids whose running refresh should be aborted (see cancel_refresh).
        self._cancel_requested: set[int] = set()
|
||||
|
||||
    def set_ws_manager(self, ws_manager):
        """Inject the WebSocket manager used for real-time status broadcasts."""
        self._ws_manager = ws_manager
|
||||
|
||||
async def start(self):
|
||||
"""Queue-Worker starten."""
|
||||
self._running = True
|
||||
asyncio.create_task(self._worker())
|
||||
logger.info("Agenten-Orchestrator gestartet")
|
||||
|
||||
    async def stop(self):
        """Stop the queue worker.

        Only clears the running flag; the worker notices it within its
        5 s queue-poll timeout and exits. An in-flight refresh is not
        interrupted.
        """
        self._running = False
        logger.info("Agenten-Orchestrator gestoppt")
|
||||
|
||||
    async def enqueue_refresh(self, incident_id: int, trigger_type: str = "manual") -> bool:
        """Queue a refresh job for an incident.

        Returns False when the incident is already queued or currently
        being refreshed; otherwise enqueues it, broadcasts a "queued"
        status over WebSocket (to the incident's audience) and returns True.
        """
        if incident_id in self._queued_ids or self._current_task == incident_id:
            logger.info(f"Refresh fuer Lage {incident_id} uebersprungen: bereits aktiv/in Queue")
            return False

        # Audience scoping for the WebSocket broadcast below.
        visibility, created_by, tenant_id = await self._get_incident_visibility(incident_id)

        self._queued_ids.add(incident_id)
        await self._queue.put((incident_id, trigger_type))
        queue_size = self._queue.qsize()
        logger.info(f"Refresh fuer Lage {incident_id} eingereiht (Queue: {queue_size}, Trigger: {trigger_type})")

        if self._ws_manager:
            await self._ws_manager.broadcast_for_incident({
                "type": "status_update",
                "incident_id": incident_id,
                "data": {"status": "queued", "queue_position": queue_size},
            }, visibility, created_by, tenant_id)

        return True
|
||||
|
||||
    async def cancel_refresh(self, incident_id: int) -> bool:
        """Request cancellation of the currently running refresh.

        Returns False when the given incident is not the one being
        refreshed right now. Otherwise flags the cancel request (consumed
        by _check_cancelled between pipeline steps), broadcasts a
        "cancelling" status and returns True.
        """
        if self._current_task != incident_id:
            return False
        self._cancel_requested.add(incident_id)
        logger.info(f"Cancel angefordert fuer Lage {incident_id}")
        if self._ws_manager:
            try:
                vis, cb, tid = await self._get_incident_visibility(incident_id)
            except Exception:
                # Lookup failure: fall back to the broadest audience.
                vis, cb, tid = "public", None, None
            await self._ws_manager.broadcast_for_incident({
                "type": "status_update",
                "incident_id": incident_id,
                "data": {"status": "cancelling", "detail": "Wird abgebrochen..."},
            }, vis, cb, tid)
        return True
|
||||
|
||||
def _check_cancelled(self, incident_id: int):
|
||||
"""Prüft ob Abbruch angefordert wurde und wirft CancelledError."""
|
||||
if incident_id in self._cancel_requested:
|
||||
self._cancel_requested.discard(incident_id)
|
||||
raise asyncio.CancelledError("Vom Nutzer abgebrochen")
|
||||
|
||||
    async def _worker(self):
        """Process refresh jobs sequentially.

        Polls the queue with a 5 s timeout so stop() (which clears
        self._running) is noticed promptly. Each job is attempted up to
        3 times for transient errors with escalating delays; user
        cancellation and permanent errors end the job immediately.
        """
        while self._running:
            try:
                item = await asyncio.wait_for(self._queue.get(), timeout=5.0)
            except asyncio.TimeoutError:
                # No job within 5 s — re-check the running flag.
                continue

            incident_id, trigger_type = item
            self._queued_ids.discard(incident_id)
            self._current_task = incident_id
            logger.info(f"Starte Refresh für Lage {incident_id} (Trigger: {trigger_type})")

            RETRY_DELAYS = [0, 120, 300]  # seconds: immediately, 2 min, 5 min
            TRANSIENT_ERRORS = (asyncio.TimeoutError, ConnectionError, OSError)
            last_error = None

            try:
                for attempt in range(3):
                    try:
                        await self._run_refresh(incident_id, trigger_type=trigger_type, retry_count=attempt)
                        last_error = None
                        break  # success
                    except asyncio.CancelledError:
                        # User-requested abort (raised via _check_cancelled).
                        logger.info(f"Refresh fuer Lage {incident_id} abgebrochen")
                        await self._mark_refresh_cancelled(incident_id)
                        try:
                            _vis, _cb, _tid = await self._get_incident_visibility(incident_id)
                        except Exception:
                            _vis, _cb, _tid = "public", None, None
                        if self._ws_manager:
                            await self._ws_manager.broadcast_for_incident({
                                "type": "refresh_cancelled",
                                "incident_id": incident_id,
                                "data": {"status": "cancelled"},
                            }, _vis, _cb, _tid)
                        last_error = None
                        break
                    except TRANSIENT_ERRORS as e:
                        last_error = e
                        logger.warning(f"Transienter Fehler bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}")
                        if attempt < 2:
                            await self._mark_refresh_failed(incident_id, str(e))
                            delay = RETRY_DELAYS[attempt + 1]
                            logger.info(f"Retry in {delay}s für Lage {incident_id}")
                            # Broadcast the retry status via WebSocket.
                            if self._ws_manager:
                                try:
                                    _vis, _cb, _tid = await self._get_incident_visibility(incident_id)
                                except Exception:
                                    _vis, _cb, _tid = "public", None, None
                                await self._ws_manager.broadcast_for_incident({
                                    "type": "status_update",
                                    "incident_id": incident_id,
                                    "data": {"status": "retrying", "attempt": attempt + 1, "delay": delay},
                                }, _vis, _cb, _tid)
                            await asyncio.sleep(delay)
                        else:
                            await self._mark_refresh_failed(incident_id, f"Endgültig fehlgeschlagen nach 3 Versuchen: {e}")
                    except Exception as e:
                        last_error = e
                        logger.error(f"Permanenter Fehler bei Refresh für Lage {incident_id}: {e}")
                        await self._mark_refresh_failed(incident_id, str(e))
                        break  # permanent error, no retry

                # Report a final failure (transient-exhausted or permanent).
                if last_error and self._ws_manager:
                    try:
                        _vis, _cb, _tid = await self._get_incident_visibility(incident_id)
                    except Exception:
                        _vis, _cb, _tid = "public", None, None
                    await self._ws_manager.broadcast_for_incident({
                        "type": "refresh_error",
                        "incident_id": incident_id,
                        "data": {"error": str(last_error)},
                    }, _vis, _cb, _tid)
            finally:
                self._current_task = None
                self._queue.task_done()
|
||||
|
||||
async def _mark_refresh_cancelled(self, incident_id: int):
    """Mark the currently running refresh-log entry for this incident as cancelled.

    Best-effort: any database error is logged and swallowed so a cancel
    request can never crash the worker.
    """
    from database import get_db

    conn = await get_db()
    try:
        stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        await conn.execute(
            """UPDATE refresh_log SET status = 'cancelled', error_message = 'Vom Nutzer abgebrochen',
            completed_at = ? WHERE incident_id = ? AND status = 'running'""",
            (stamp, incident_id),
        )
        await conn.commit()
    except Exception as e:
        logger.warning(f"Konnte Refresh-Log nicht als abgebrochen markieren: {e}")
    finally:
        await conn.close()
|
||||
|
||||
async def _mark_refresh_failed(self, incident_id: int, error: str):
    """Mark the currently running refresh-log entry for this incident as errored.

    The error text is truncated to 500 characters before it is stored.
    Best-effort: database errors are logged, never raised.
    """
    from database import get_db

    conn = await get_db()
    try:
        stamp = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        await conn.execute(
            """UPDATE refresh_log SET status = 'error', error_message = ?,
            completed_at = ? WHERE incident_id = ? AND status = 'running'""",
            (error[:500], stamp, incident_id),
        )
        await conn.commit()
    except Exception as e:
        logger.warning(f"Konnte Refresh-Log nicht als fehlgeschlagen markieren: {e}")
    finally:
        await conn.close()
|
||||
|
||||
async def _get_incident_visibility(self, incident_id: int) -> tuple[str, Optional[int], Optional[int]]:
    """Load visibility, creator id and tenant id for an incident.

    Returns ("public", None, None) when the incident does not exist; a
    NULL visibility column also falls back to "public".
    """
    from database import get_db

    conn = await get_db()
    try:
        cur = await conn.execute(
            "SELECT visibility, created_by, tenant_id FROM incidents WHERE id = ?", (incident_id,)
        )
        record = await cur.fetchone()
    finally:
        await conn.close()

    if record is None:
        return "public", None, None
    return record["visibility"] or "public", record["created_by"], record["tenant_id"]
|
||||
|
||||
async def _run_refresh(self, incident_id: int, trigger_type: str = "manual", retry_count: int = 0) -> None:
    """Run one complete refresh cycle for an incident.

    Pipeline: load the incident, open a refresh_log row, run RSS search and
    Claude web research in parallel, dedupe and persist new articles,
    regenerate the summary (with snapshots), fact-check claims, send
    notifications, and finally close the refresh_log row with token stats.

    Args:
        incident_id: Primary key of the incident to refresh.
        trigger_type: Stored in refresh_log (e.g. "manual"); semantics of
            other values defined by callers — TODO confirm.
        retry_count: >0 means a retry; the previous 'running' log row is
            then marked as error before a new one is started.

    NOTE(review): relies on module-level `timezone`, `UsageAccumulator`,
    `_is_duplicate`, `_normalize_url`, `_normalize_headline`,
    `_background_discover_sources`, `_create_notifications_for_incident`
    and `_send_email_notifications_for_incident` defined elsewhere in this
    file — confirm they are in scope.
    """
    # NOTE(review): aiosqlite appears unused in this body — confirm before removing.
    import aiosqlite
    from database import get_db
    from agents.researcher import ResearcherAgent
    from agents.analyzer import AnalyzerAgent
    from agents.factchecker import FactCheckerAgent
    from feeds.rss_parser import RSSParser

    db = await get_db()
    try:
        # Load the incident row; abort silently if it was deleted meanwhile.
        cursor = await db.execute("SELECT * FROM incidents WHERE id = ?", (incident_id,))
        incident = await cursor.fetchone()
        if not incident:
            logger.warning(f"Lage {incident_id} nicht gefunden")
            return

        title = incident["title"]
        description = incident["description"] or ""
        incident_type = incident["type"] or "adhoc"
        # Columns added by later migrations may be missing on old DBs, hence
        # the "in keys()" guards with defaults.
        international = bool(incident["international_sources"]) if "international_sources" in incident.keys() else True
        visibility = incident["visibility"] if "visibility" in incident.keys() else "public"
        created_by = incident["created_by"] if "created_by" in incident.keys() else None
        tenant_id = incident["tenant_id"] if "tenant_id" in incident.keys() else None

        # On retry: close out the previous 'running' log entry as error.
        if retry_count > 0:
            await db.execute(
                """UPDATE refresh_log SET status = 'error', error_message = 'Retry gestartet',
                completed_at = ? WHERE incident_id = ? AND status = 'running'""",
                (datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), incident_id),
            )
            await db.commit()

        # Start a new refresh_log row in state 'running'.
        now = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        cursor = await db.execute(
            "INSERT INTO refresh_log (incident_id, started_at, status, trigger_type, retry_count, tenant_id) VALUES (?, ?, 'running', ?, ?, ?)",
            (incident_id, now, trigger_type, retry_count, tenant_id),
        )
        await db.commit()
        log_id = cursor.lastrowid
        # Accumulates token usage/cost across all Claude calls of this cycle.
        usage_acc = UsageAccumulator()

        research_status = "deep_researching" if incident_type == "research" else "researching"
        research_detail = "Hintergrundrecherche im Web läuft..." if incident_type == "research" else "RSS-Feeds und Web werden durchsucht..."
        if self._ws_manager:
            await self._ws_manager.broadcast_for_incident({
                "type": "status_update",
                "incident_id": incident_id,
                "data": {"status": research_status, "detail": research_detail, "started_at": now},
            }, visibility, created_by, tenant_id)

        # Steps 1+2: run RSS feed search and Claude web research in parallel.
        async def _rss_pipeline():
            """RSS feed search (feed selection + parsing). Returns (articles, usage)."""
            # Research-type incidents skip RSS entirely.
            if incident_type != "adhoc":
                logger.info("Recherche-Modus: RSS-Feeds übersprungen")
                return [], None

            rss_researcher = ResearcherAgent()
            rss_parser = RSSParser()

            from source_rules import get_feeds_with_metadata
            all_feeds = await get_feeds_with_metadata(tenant_id=tenant_id)

            feed_usage = None
            # With many feeds, let Claude pre-select the relevant subset.
            if len(all_feeds) > 20:
                selected_feeds, feed_usage = await rss_researcher.select_relevant_feeds(
                    title, description, international, all_feeds
                )
                logger.info(f"Feed-Selektion: {len(selected_feeds)} von {len(all_feeds)} Feeds ausgewählt")
                articles = await rss_parser.search_feeds_selective(title, selected_feeds)
            else:
                articles = await rss_parser.search_feeds(title, international=international, tenant_id=tenant_id)

            logger.info(f"RSS: {len(articles)} relevante Artikel gefunden (international={international})")
            return articles, feed_usage

        async def _web_search_pipeline():
            """Claude WebSearch research. Returns (results, usage)."""
            researcher = ResearcherAgent()
            results, usage = await researcher.search(title, description, incident_type, international=international)
            logger.info(f"Claude-Recherche: {len(results)} Ergebnisse")
            return results, usage

        # Run both pipelines concurrently.
        (rss_articles, rss_feed_usage), (search_results, search_usage) = await asyncio.gather(
            _rss_pipeline(),
            _web_search_pipeline(),
        )

        if rss_feed_usage:
            usage_acc.add(rss_feed_usage)
        if search_usage:
            usage_acc.add(search_usage)

        # Checkpoint 1: check for cancellation after RSS/WebSearch.
        # NOTE(review): presumably raises to abort the cycle — confirm.
        self._check_cancelled(incident_id)

        # Merge both result sets.
        all_results = rss_articles + search_results

        # In-memory dedupe (normalized URL + headline similarity).
        seen_urls = set()
        seen_headlines = set()
        unique_results = []
        for article in all_results:
            if not _is_duplicate(article, seen_urls, seen_headlines):
                unique_results.append(article)

        dupes_removed = len(all_results) - len(unique_results)
        if dupes_removed > 0:
            logger.info(f"Deduplizierung: {dupes_removed} Duplikate entfernt, {len(unique_results)} verbleibend")

        source_count = len(set(a.get("source", "") for a in unique_results))
        if self._ws_manager:
            await self._ws_manager.broadcast_for_incident({
                "type": "status_update",
                "incident_id": incident_id,
                "data": {
                    "status": "analyzing",
                    "detail": f"Analysiert {len(unique_results)} Meldungen aus {source_count} Quellen...",
                    "started_at": now,
                },
            }, visibility, created_by, tenant_id)

        # Persist new articles — also dedupe against rows already in the DB.
        new_count = 0
        for article in unique_results:
            # Skip if a normalized-URL match already exists for this incident.
            if article.get("source_url"):
                norm_url = _normalize_url(article["source_url"])
                cursor = await db.execute(
                    "SELECT id, source_url FROM articles WHERE incident_id = ?",
                    (incident_id,),
                )
                existing_articles = await cursor.fetchall()
                already_exists = False
                for existing in existing_articles:
                    if existing["source_url"] and _normalize_url(existing["source_url"]) == norm_url:
                        already_exists = True
                        break
                if already_exists:
                    continue

            # Skip if a normalized-headline match already exists (only for
            # headlines long enough to be distinctive).
            headline = article.get("headline", "")
            if headline and len(headline) > 20:
                norm_h = _normalize_headline(headline)
                cursor = await db.execute(
                    "SELECT id, headline FROM articles WHERE incident_id = ?",
                    (incident_id,),
                )
                existing_articles = await cursor.fetchall()
                headline_exists = False
                for existing in existing_articles:
                    if _normalize_headline(existing["headline"]) == norm_h:
                        headline_exists = True
                        break
                if headline_exists:
                    continue

            await db.execute(
                """INSERT INTO articles (incident_id, headline, headline_de, source,
                source_url, content_original, content_de, language, published_at, tenant_id)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
                (
                    incident_id,
                    article.get("headline", ""),
                    article.get("headline_de"),
                    article.get("source", "Unbekannt"),
                    article.get("source_url"),
                    article.get("content_original"),
                    article.get("content_de"),
                    article.get("language", "de"),
                    article.get("published_at"),
                    tenant_id,
                ),
            )
            new_count += 1

        await db.commit()

        # Refresh per-source statistics (best-effort).
        if new_count > 0:
            try:
                from database import refresh_source_counts
                await refresh_source_counts(db)
            except Exception as e:
                logger.warning(f"Quellen-Statistiken konnten nicht aktualisiert werden: {e}")

        # Step 3: analysis and summary — only when there is something new or
        # no summary exists yet.
        if new_count > 0 or not incident["summary"]:
            cursor = await db.execute(
                "SELECT * FROM articles WHERE incident_id = ? ORDER BY collected_at DESC",
                (incident_id,),
            )
            all_articles = [dict(row) for row in await cursor.fetchall()]

            analyzer = AnalyzerAgent()
            analysis, analysis_usage = await analyzer.analyze(title, description, all_articles, incident_type)
            if analysis_usage:
                usage_acc.add(analysis_usage)

            if analysis:
                is_first_summary = not incident["summary"]

                # Snapshot the previous summary (only if one already exists).
                if incident["summary"]:
                    cursor = await db.execute(
                        "SELECT COUNT(*) as cnt FROM articles WHERE incident_id = ?",
                        (incident_id,),
                    )
                    snap_articles = (await cursor.fetchone())["cnt"]
                    cursor = await db.execute(
                        "SELECT COUNT(*) as cnt FROM fact_checks WHERE incident_id = ?",
                        (incident_id,),
                    )
                    snap_fcs = (await cursor.fetchone())["cnt"]
                    await db.execute(
                        """INSERT INTO incident_snapshots
                        (incident_id, summary, sources_json,
                        article_count, fact_check_count, refresh_log_id, created_at, tenant_id)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                        (incident_id, incident["summary"], incident["sources_json"],
                         snap_articles, snap_fcs, log_id, now, tenant_id),
                    )

                # Extract and store the analysis' source list.
                sources = analysis.get("sources", [])
                sources_json = json.dumps(sources, ensure_ascii=False) if sources else None

                new_summary = analysis.get("summary", "")

                await db.execute(
                    "UPDATE incidents SET summary = ?, sources_json = ?, updated_at = ? WHERE id = ?",
                    (new_summary, sources_json, now, incident_id),
                )

                # On the first refresh: snapshot the freshly created summary.
                if is_first_summary and new_summary:
                    cursor = await db.execute(
                        "SELECT COUNT(*) as cnt FROM articles WHERE incident_id = ?",
                        (incident_id,),
                    )
                    snap_articles = (await cursor.fetchone())["cnt"]
                    cursor = await db.execute(
                        "SELECT COUNT(*) as cnt FROM fact_checks WHERE incident_id = ?",
                        (incident_id,),
                    )
                    snap_fcs = (await cursor.fetchone())["cnt"]
                    await db.execute(
                        """INSERT INTO incident_snapshots
                        (incident_id, summary, sources_json,
                        article_count, fact_check_count, refresh_log_id, created_at, tenant_id)
                        VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
                        (incident_id, new_summary, sources_json,
                         snap_articles, snap_fcs, log_id, now, tenant_id),
                    )

                # Apply translations produced by the analyzer.
                for translation in analysis.get("translations", []):
                    article_id = translation.get("article_id")
                    if article_id:
                        await db.execute(
                            "UPDATE articles SET headline_de = ?, content_de = ? WHERE id = ?",
                            (translation.get("headline_de"), translation.get("content_de"), article_id),
                        )

                await db.commit()

            # Checkpoint 2: check for cancellation after analysis.
            self._check_cancelled(incident_id)

            if self._ws_manager:
                await self._ws_manager.broadcast_for_incident({
                    "type": "status_update",
                    "incident_id": incident_id,
                    "data": {"status": "factchecking", "detail": "Prüft Fakten gegen unabhängige Quellen...", "started_at": now},
                }, visibility, created_by, tenant_id)

            # Step 4: fact-checking.
            factchecker = FactCheckerAgent()
            fact_checks, fc_usage = await factchecker.check(title, all_articles, incident_type)
            if fc_usage:
                usage_acc.add(fc_usage)

            # Checkpoint 3: check for cancellation after fact-checking.
            self._check_cancelled(incident_id)

            # First refresh = no fact checks stored yet (checked before the
            # upsert loop below inserts any).
            cursor = await db.execute(
                "SELECT COUNT(*) as cnt FROM fact_checks WHERE incident_id = ?",
                (incident_id,),
            )
            row = await cursor.fetchone()
            is_first_refresh = row["cnt"] == 0

            # Collect stats for the bundled notification.
            confirmed_count = 0
            contradicted_count = 0
            status_changes = []

            for fc in fact_checks:
                # Upsert by (incident_id, claim), remembering the old status.
                cursor = await db.execute(
                    "SELECT id, status FROM fact_checks WHERE incident_id = ? AND claim = ?",
                    (incident_id, fc.get("claim", "")),
                )
                existing = await cursor.fetchone()
                old_status = existing["status"] if existing else None
                new_status = fc.get("status", "developing")

                if existing:
                    await db.execute(
                        "UPDATE fact_checks SET status = ?, sources_count = ?, evidence = ?, is_notification = ?, checked_at = ? WHERE id = ?",
                        (new_status, fc.get("sources_count", 0), fc.get("evidence"), fc.get("is_notification", 0), now, existing["id"]),
                    )
                else:
                    await db.execute(
                        """INSERT INTO fact_checks (incident_id, claim, status, sources_count, evidence, is_notification, tenant_id)
                        VALUES (?, ?, ?, ?, ?, ?, ?)""",
                        (incident_id, fc.get("claim", ""), new_status, fc.get("sources_count", 0), fc.get("evidence"), fc.get("is_notification", 0), tenant_id),
                    )

                # Status statistics.
                if new_status == "confirmed" or new_status == "established":
                    confirmed_count += 1
                elif new_status == "contradicted" or new_status == "disputed":
                    contradicted_count += 1

                # Track real status transitions (not on the first refresh).
                if not is_first_refresh and old_status and old_status != new_status:
                    status_changes.append({
                        "claim": fc.get("claim", ""),
                        "old_status": old_status,
                        "new_status": new_status,
                    })

            await db.commit()

            # Send the bundled notification (not on the first refresh).
            if not is_first_refresh:
                if self._ws_manager:
                    await self._ws_manager.broadcast_for_incident({
                        "type": "refresh_summary",
                        "incident_id": incident_id,
                        "data": {
                            "new_articles": new_count,
                            "confirmed_count": confirmed_count,
                            "contradicted_count": contradicted_count,
                            "status_changes": status_changes,
                            "is_first_refresh": False,
                            "incident_title": title,
                        },
                    }, visibility, created_by, tenant_id)

                # Build DB notification payloads.
                parts = []
                if new_count > 0:
                    parts.append(f"{new_count} neue Meldung{'en' if new_count != 1 else ''}")
                if confirmed_count > 0:
                    parts.append(f"{confirmed_count} bestätigt")
                if contradicted_count > 0:
                    parts.append(f"{contradicted_count} widersprochen")
                summary_text = ", ".join(parts) if parts else "Keine neuen Entwicklungen"

                db_notifications = [{
                    "type": "refresh_summary",
                    "title": title,
                    "text": f"Recherche: {summary_text}",
                    "icon": "warning" if contradicted_count > 0 else "success",
                }]
                if new_count > 0:
                    db_notifications.append({
                        "type": "new_articles",
                        "title": title,
                        "text": f"{new_count} neue Meldung{'en' if new_count != 1 else ''} gefunden",
                        "icon": "info",
                    })
                for sc in status_changes:
                    db_notifications.append({
                        "type": "status_change",
                        "title": title,
                        "text": f"{sc['claim']}: {sc['old_status']} \u2192 {sc['new_status']}",
                        "icon": "error" if sc["new_status"] in ("contradicted", "disputed") else "success",
                    })

                if created_by:
                    await _create_notifications_for_incident(
                        db, incident_id, visibility, created_by, tenant_id, db_notifications
                    )
                # Send e-mail notifications.
                # NOTE(review): placed outside the created_by guard — the
                # helper takes created_by itself; confirm intended nesting.
                await _send_email_notifications_for_incident(
                    db, incident_id, title, visibility, created_by, tenant_id, db_notifications
                )

        # Close the refresh log (with token statistics).
        await db.execute(
            """UPDATE refresh_log SET
            completed_at = ?, articles_found = ?, status = 'completed',
            input_tokens = ?, output_tokens = ?,
            cache_creation_tokens = ?, cache_read_tokens = ?,
            total_cost_usd = ?, api_calls = ?
            WHERE id = ?""",
            (datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'), new_count,
             usage_acc.input_tokens, usage_acc.output_tokens,
             usage_acc.cache_creation_tokens, usage_acc.cache_read_tokens,
             round(usage_acc.total_cost_usd, 7), usage_acc.call_count, log_id),
        )
        await db.commit()
        logger.info(
            f"Token: {usage_acc.input_tokens} in / {usage_acc.output_tokens} out / "
            f"${usage_acc.total_cost_usd:.4f} ({usage_acc.call_count} Calls)"
        )

        # Kick off background source discovery (fire-and-forget task).
        if unique_results:
            asyncio.create_task(_background_discover_sources(unique_results))

        if self._ws_manager:
            await self._ws_manager.broadcast_for_incident({
                "type": "refresh_complete",
                "incident_id": incident_id,
                "data": {"new_articles": new_count, "status": "idle"},
            }, visibility, created_by, tenant_id)

        logger.info(f"Refresh für Lage {incident_id} abgeschlossen: {new_count} neue Artikel")

    finally:
        await db.close()
|
||||
|
||||
|
||||
# Module-level singleton: the single orchestrator instance shared by the app.
orchestrator = AgentOrchestrator()
|
||||
236
src/agents/researcher.py
Normale Datei
236
src/agents/researcher.py
Normale Datei
@@ -0,0 +1,236 @@
|
||||
"""Researcher-Agent: Sucht nach Informationen via Claude WebSearch."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from agents.claude_client import call_claude, ClaudeUsage
|
||||
|
||||
logger = logging.getLogger("osint.researcher")
|
||||
|
||||
# Prompt for the standard (ad-hoc) news search. Placeholders: {output_language},
# {title}, {description}, {language_instruction}. The model must answer with a
# bare JSON array of article objects.
RESEARCH_PROMPT_TEMPLATE = """Du bist ein OSINT-Recherche-Agent für ein Lagemonitoring-System.
AUSGABESPRACHE: {output_language}

AUFTRAG: Suche nach aktuellen Informationen zu folgendem Vorfall:
Titel: {title}
Kontext: {description}

REGELN:
- Suche nur bei seriösen Nachrichtenquellen (Nachrichtenagenturen, Qualitätszeitungen, öffentlich-rechtliche Medien, Behörden)
- KEIN Social Media (Twitter/X, Facebook, Instagram, TikTok, Reddit)
- KEINE Boulevardmedien (Bild, Sun, Daily Mail etc.)
{language_instruction}
- Faktenbasiert und neutral - keine Spekulationen
- Nutze removepaywalls.com für Paywall-geschützte Artikel (z.B. Spiegel+, Zeit+, SZ+): https://www.removepaywalls.com/search?url=ARTIKEL_URL

Gib die Ergebnisse AUSSCHLIESSLICH als JSON-Array zurück, ohne Erklärungen davor oder danach.
Jedes Element hat diese Felder:
- "headline": Originale Überschrift
- "headline_de": Übersetzung in Ausgabesprache (falls Originalsprache abweicht)
- "source": Name der Quelle (z.B. "Reuters", "tagesschau")
- "source_url": URL des Artikels
- "content_summary": Zusammenfassung des Inhalts (2-3 Sätze, in Ausgabesprache)
- "language": Sprache des Originals (z.B. "de", "en", "fr")
- "published_at": Veröffentlichungsdatum falls bekannt (ISO-Format)

Antworte NUR mit dem JSON-Array. Keine Einleitung, keine Erklärung."""

# Prompt for the deep background research mode ("research" incidents):
# broader source types, 8-15 high-quality sources, longer summaries.
# Same placeholders and same JSON-array output contract as above.
DEEP_RESEARCH_PROMPT_TEMPLATE = """Du bist ein OSINT-Tiefenrecherche-Agent für ein Lagemonitoring-System.
AUSGABESPRACHE: {output_language}

AUFTRAG: Führe eine umfassende Hintergrundrecherche durch zu:
Titel: {title}
Kontext: {description}

RECHERCHE-STRATEGIE:
- Breite Suche: Hintergrundberichte, Analysen, Expertenmeinungen, Think-Tank-Publikationen
- Suche nach: Akteuren, Zusammenhängen, historischem Kontext, rechtlichen Rahmenbedingungen
- Akademische und Fachquellen zusätzlich zu Nachrichtenquellen
- Nutze removepaywalls.com für Paywall-geschützte Artikel (z.B. https://www.removepaywalls.com/search?url=ARTIKEL_URL)
{language_instruction}
- Ziel: 8-15 hochwertige Quellen

QUELLENTYPEN (priorisiert):
1. Fachzeitschriften und Branchenmedien
2. Qualitätszeitungen (Hintergrundberichte, Dossiers)
3. Think Tanks und Forschungsinstitute
4. Offizielle Dokumente und Pressemitteilungen
5. Nachrichtenagenturen (für Faktengrundlage)

AUSSCHLUSS:
- KEIN Social Media (Twitter/X, Facebook, Instagram, TikTok, Reddit)
- KEINE Boulevardmedien
- KEINE Meinungsblogs ohne Quellenbelege

Gib die Ergebnisse AUSSCHLIESSLICH als JSON-Array zurück, ohne Erklärungen davor oder danach.
Jedes Element hat diese Felder:
- "headline": Originale Überschrift
- "headline_de": Übersetzung in Ausgabesprache (falls Originalsprache abweicht)
- "source": Name der Quelle (z.B. "netzpolitik.org", "Handelsblatt")
- "source_url": URL des Artikels
- "content_summary": Ausführliche Zusammenfassung des Inhalts (3-5 Sätze, in Ausgabesprache)
- "language": Sprache des Originals (z.B. "de", "en", "fr")
- "published_at": Veröffentlichungsdatum falls bekannt (ISO-Format)

Antworte NUR mit dem JSON-Array. Keine Einleitung, keine Erklärung."""

# Language-instruction snippets substituted into {language_instruction}:
# *_INTERNATIONAL allow multi-language search, *_GERMAN_ONLY restrict to
# German-language sources (DE/AT/CH).
LANG_INTERNATIONAL = "- Suche in Deutsch UND Englisch für internationale Abdeckung"
LANG_GERMAN_ONLY = "- Suche NUR auf Deutsch bei deutschsprachigen Quellen (Deutschland, Österreich, Schweiz)\n- KEINE englischsprachigen oder anderssprachigen Quellen"

LANG_DEEP_INTERNATIONAL = "- Suche in Deutsch, Englisch und weiteren relevanten Sprachen"
LANG_DEEP_GERMAN_ONLY = "- Suche NUR auf Deutsch bei deutschsprachigen Quellen (Deutschland, Österreich, Schweiz)\n- KEINE englischsprachigen oder anderssprachigen Quellen"


# Prompt that asks the model to pick relevant feeds from a numbered list;
# the expected answer is a bare JSON array of 1-based feed numbers.
FEED_SELECTION_PROMPT_TEMPLATE = """Du bist ein OSINT-Analyst. Wähle aus dieser Feed-Liste die Feeds aus, die für die Lage relevant sein könnten.

LAGE: {title}
KONTEXT: {description}
INTERNATIONALE QUELLEN: {international}

FEEDS:
{feed_list}

REGELN:
- Wähle alle Feeds die thematisch oder regional relevant sein könnten
- Lieber einen Feed zu viel als zu wenig auswählen
- Bei "Internationale Quellen: Nein": Keine internationalen Feeds auswählen
- Allgemeine Nachrichtenfeeds (tagesschau, Spiegel etc.) sind fast immer relevant
- Antworte NUR mit einem JSON-Array der Nummern, z.B. [1, 2, 5, 12]"""
|
||||
|
||||
|
||||
class ResearcherAgent:
    """Performs OSINT research via the Claude CLI WebSearch tooling."""

    async def select_relevant_feeds(
        self,
        title: str,
        description: str,
        international: bool,
        feeds_metadata: list[dict],
    ) -> "tuple[list[dict], ClaudeUsage | None]":
        """Let Claude pre-select the feeds relevant to an incident.

        Args:
            title: Incident title used in the selection prompt.
            description: Free-text context (may be empty).
            international: Whether international feeds may be selected.
            feeds_metadata: Feed dicts with at least 'name', 'domain', 'category'.

        Returns:
            (selected feeds, usage) — on any failure or an unusable model
            answer: (all feeds, usage-or-None), i.e. the caller always gets
            a workable feed list.
        """
        # Present the feeds as a 1-based numbered list for the prompt.
        feed_lines = []
        for i, feed in enumerate(feeds_metadata, 1):
            feed_lines.append(
                f"{i}. {feed['name']} ({feed['domain']}) [{feed['category']}]"
            )

        prompt = FEED_SELECTION_PROMPT_TEMPLATE.format(
            title=title,
            description=description or "Keine weitere Beschreibung",
            international="Ja" if international else "Nein",
            feed_list="\n".join(feed_lines),
        )

        try:
            result, usage = await call_claude(prompt, tools=None)

            # Extract the JSON array of indices from the answer.
            match = re.search(r'\[[\d\s,]+\]', result)
            if not match:
                logger.warning("Feed-Selektion: Kein JSON-Array in Antwort, nutze alle Feeds")
                return feeds_metadata, usage

            indices = json.loads(match.group())
            # Keep only valid 1-based indices; map them back to feed dicts.
            selected = []
            for idx in indices:
                if isinstance(idx, int) and 1 <= idx <= len(feeds_metadata):
                    selected.append(feeds_metadata[idx - 1])

            if not selected:
                logger.warning("Feed-Selektion: Keine gültigen Indizes, nutze alle Feeds")
                return feeds_metadata, usage

            logger.info(
                f"Feed-Selektion: {len(selected)} von {len(feeds_metadata)} Feeds ausgewählt"
            )
            return selected, usage

        except Exception as e:
            # Fail open: selection is an optimization, never a hard dependency.
            logger.warning(f"Feed-Selektion fehlgeschlagen ({e}), nutze alle Feeds")
            return feeds_metadata, None

    async def search(self, title: str, description: str = "", incident_type: str = "adhoc", international: bool = True) -> "tuple[list[dict], ClaudeUsage | None]":
        """Search for information on an incident.

        Uses the deep-research prompt for incident_type == "research",
        otherwise the standard news-search prompt. Results are filtered
        against excluded sources and, when international is False, against
        non-German articles.

        Returns:
            (filtered article dicts, usage) — ([], None) on any error.
        """
        from config import OUTPUT_LANGUAGE
        if incident_type == "research":
            lang_instruction = LANG_DEEP_INTERNATIONAL if international else LANG_DEEP_GERMAN_ONLY
            prompt = DEEP_RESEARCH_PROMPT_TEMPLATE.format(
                title=title, description=description, language_instruction=lang_instruction,
                output_language=OUTPUT_LANGUAGE,
            )
        else:
            lang_instruction = LANG_INTERNATIONAL if international else LANG_GERMAN_ONLY
            prompt = RESEARCH_PROMPT_TEMPLATE.format(
                title=title, description=description, language_instruction=lang_instruction,
                output_language=OUTPUT_LANGUAGE,
            )

        try:
            result, usage = await call_claude(prompt)
            articles = self._parse_response(result)

            # Load excluded sources dynamically from the DB (config fallback).
            excluded_sources = await self._get_excluded_sources()

            # Drop articles from excluded sources.
            filtered = []
            for article in articles:
                # BUGFIX: dict.get's default only applies when the key is
                # MISSING — a present-but-null value ("source": null in the
                # model's JSON) returned None and .lower() raised
                # AttributeError. `or ""` also covers that case.
                source = (article.get("source") or "").lower()
                source_url = (article.get("source_url") or "").lower()
                excluded = False
                for excl in excluded_sources:
                    if excl in source or excl in source_url:
                        excluded = True
                        break
                if not excluded:
                    # German-only mode: post-filter non-German results.
                    if not international and article.get("language", "de") != "de":
                        continue
                    filtered.append(article)

            logger.info(f"Recherche ergab {len(filtered)} Artikel (von {len(articles)} gefundenen, international={international})")
            return filtered, usage

        except Exception as e:
            logger.error(f"Recherche-Fehler: {e}")
            return [], None

    async def _get_excluded_sources(self) -> list[str]:
        """Load excluded source domains from the database.

        Falls back to the static EXCLUDED_SOURCES list in config.py when the
        DB-backed rules are unavailable.
        """
        try:
            from source_rules import get_source_rules
            rules = await get_source_rules()
            return rules.get("excluded_domains", [])
        except Exception as e:
            logger.warning(f"Fallback auf config.py für Excluded Sources: {e}")
            from config import EXCLUDED_SOURCES
            return list(EXCLUDED_SOURCES)

    def _parse_response(self, response: str) -> list[dict]:
        """Parse Claude's answer as a JSON array.

        Tries the raw response first, then the outermost [...] span; returns
        [] when neither yields a JSON list.
        """
        # Attempt 1: the whole response is the JSON array.
        try:
            data = json.loads(response)
            if isinstance(data, list):
                return data
        except json.JSONDecodeError:
            pass

        # Attempt 2: extract the array embedded in surrounding prose.
        match = re.search(r'\[.*\]', response, re.DOTALL)
        if match:
            try:
                data = json.loads(match.group())
                if isinstance(data, list):
                    return data
            except json.JSONDecodeError:
                pass

        logger.warning("Konnte Claude-Antwort nicht als JSON parsen")
        return []
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren