From 0019d74aeadaff8ce054efe7d03e2e437c73e3ca Mon Sep 17 00:00:00 2001 From: Claude Dev Date: Sun, 15 Mar 2026 18:01:32 +0100 Subject: [PATCH] feat: Intelligente Telegram-Kanal-Selektion und verbesserte Quellenzuordnung MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Researcher: Claude-basierte Vorauswahl relevanter Telegram-Kanäle per Haiku - FactChecker: Verbesserte Quellen-Zuordnung mit Relevanz-Scoring (Top 5) - FactChecker: URLs werden nicht mehr doppelt zugeordnet, sources_count wird aktualisiert - TelegramParser: Kanal-Filterung per channel_ids statt categories - TelegramParser: Lockereres Keyword-Matching (1 Match reicht, da vorselektiert) - Models: telegram_categories Feld entfernt (durch KI-Selektion ersetzt) - Main: Chat-Router eingebunden unter /api/chat Co-Authored-By: Claude Opus 4.6 (1M context) --- src/agents/factchecker.py | 71 ++++++++++++++++++++++----------- src/agents/researcher.py | 77 ++++++++++++++++++++++++++++++++++++ src/feeds/telegram_parser.py | 25 ++++++------ src/main.py | 2 + src/models.py | 3 -- 5 files changed, 140 insertions(+), 38 deletions(-) diff --git a/src/agents/factchecker.py b/src/agents/factchecker.py index 08f2c90..8140822 100644 --- a/src/agents/factchecker.py +++ b/src/agents/factchecker.py @@ -706,56 +706,83 @@ class FactCheckerAgent: return None def _validate_facts(self, facts: list[dict], articles: list[dict] = None) -> list[dict]: - """Validiert Fakten: Bei fehlender URL werden Ursprungsquellen aus den Artikeln ergaenzt.""" + """Validiert Fakten und ordnet Quellen-URLs aus den Artikeln zu. + + Stellt sicher, dass jeder confirmed/established Fakt URLs in der + evidence hat, damit das Frontend die Quellen korrekt anzeigen kann. + """ url_pattern = re.compile(r'https?://') - # Verfuegbare Artikel-URLs sammeln + # Verfuegbare Artikel-URLs sammeln (dedupliziert nach URL) article_sources = [] + seen_urls = set() if articles: for a in articles: url = a.get("source_url", "") source = a.get("source", "") headline = a.get("headline_de") or a.get("headline", "") - if url: + if url and url not in seen_urls: + seen_urls.add(url) article_sources.append({"url": url, "source": source, "headline": headline}) for fact in facts: status = fact.get("status", "") evidence = fact.get("evidence") or "" - if status in ("confirmed", "established") and not url_pattern.search(evidence): - # Passende Ursprungsquellen finden (Keyword-Match auf Claim) + + # Fuer alle Fakten: Quellen zuordnen + if status not in ("retracted",): + # Bereits vorhandene URLs in der evidence zaehlen + existing_urls = set(url_pattern.findall(evidence)) + + # Passende Quellen per Keyword-Match finden claim_lower = (fact.get("claim") or "").lower() - claim_words = [w for w in claim_lower.split() if len(w) >= 4][:8] - matched_sources = [] + evidence_lower = evidence.lower() + claim_words = [w for w in claim_lower.split() if len(w) >= 4][:10] + + scored_sources = [] for src in article_sources: + if src["url"] in existing_urls: + continue # Bereits in evidence src_text = (src["headline"] + " " + src["source"]).lower() matches = sum(1 for w in claim_words if w in src_text) - if matches >= max(1, len(claim_words) // 4): - matched_sources.append(src) - if len(matched_sources) >= 3: - break + if matches >= max(1, len(claim_words) // 5): + scored_sources.append((matches, src)) + + # Nach Relevanz sortieren, Top 5 nehmen + scored_sources.sort(key=lambda x: x[0], reverse=True) + matched_sources = [s for _, s in scored_sources[:5]] if matched_sources: - # Ursprungsquellen anhaengen statt herabstufen source_refs = "; ".join( f"{s['source']} ({s['url']})" for s in matched_sources ) - fact["evidence"] = ( - evidence.rstrip(". ") + - ". [Ursprungsquellen: " + source_refs + - " — Quellenlinks zum Zeitpunkt der Recherche moeglicherweise nicht mehr verfuegbar]" - ) + if existing_urls: + # Bereits URLs vorhanden, weitere ergaenzen + fact["evidence"] = ( + evidence.rstrip(". ") + + ". [Weitere Quellen: " + source_refs + "]" + ) + else: + # Keine URLs vorhanden, Quellen anhaengen + fact["evidence"] = ( + evidence.rstrip(". ") + + ". [Quellen: " + source_refs + "]" + ) + + # sources_count aktualisieren + all_urls = url_pattern.findall(fact["evidence"]) + fact["sources_count"] = len(set(all_urls)) + logger.info( f"Fakt '{fact.get('claim', '')[:50]}...' ergaenzt mit " - f"{len(matched_sources)} Ursprungsquelle(n)" + f"{len(matched_sources)} Quelle(n), gesamt: {fact['sources_count']}" ) - else: - # Keine passende Quelle gefunden -> herabstufen + elif not existing_urls: + # Weder bestehende URLs noch passende Quellen old_status = status fact["status"] = "unconfirmed" if status == "confirmed" else "unverified" logger.warning( f"Fakt herabgestuft ({old_status} -> {fact['status']}): " - f"keine URL in Evidenz und keine passende Ursprungsquelle: " - f"'{fact.get('claim', '')[:60]}...'" + f"keine Quellen zuordnebar: '{fact.get('claim', '')[:60]}...'" ) return facts diff --git a/src/agents/researcher.py b/src/agents/researcher.py index 7bb63d9..93ca331 100644 --- a/src/agents/researcher.py +++ b/src/agents/researcher.py @@ -136,6 +136,25 @@ Antwort NUR als JSON-Array: [{{"de": "iran", "en": "iran"}}, {{"de": "israel", "en": "israel"}}, {{"de": "teheran", "en": "tehran"}}, {{"de": "luftangriff", "en": "airstrike"}}, {{"de": "trump", "en": "trump"}}]""" +TELEGRAM_CHANNEL_SELECTION_PROMPT = """Du bist ein OSINT-Analyst. Waehle aus dieser Liste von Telegram-Kanaelen diejenigen aus, die fuer die Lage relevant sein koennten. + +LAGE: {title} +KONTEXT: {description} + +TELEGRAM-KANAELE: +{channel_list} + +REGELN: +- Waehle alle Kanaele die thematisch relevant sein koennten +- Lieber einen Kanal zu viel als zu wenig auswaehlen +- Beachte die Kategorie und Beschreibung jedes Kanals +- Allgemeine OSINT-Kanaele sind oft relevant +- Bei Cybercrime-Themen: Cybercrime + Leaks Kanaele waehlen +- Bei geopolitischen Themen: Relevante Laender-/Regionskanaele waehlen + +Antworte NUR mit einem JSON-Array der Kanal-Nummern, z.B.: [1, 3, 5, 12]""" + + class ResearcherAgent: """Führt OSINT-Recherchen über Claude CLI WebSearch durch.""" @@ -388,3 +407,61 @@ class ResearcherAgent: logger.warning(f"Konnte Claude-Antwort nicht als JSON parsen (Laenge: {len(response)})") return [] + + async def select_relevant_telegram_channels( + self, + title: str, + description: str, + channels_metadata: list[dict], + ) -> tuple[list[dict], ClaudeUsage | None]: + """Laesst Claude die relevanten Telegram-Kanaele fuer eine Lage vorauswaehlen. + + Nutzt Haiku (CLAUDE_MODEL_FAST) fuer diese einfache Aufgabe. + + Returns: + (ausgewaehlte Kanaele, usage) -- Bei Fehler: (alle Kanaele, None) + """ + if len(channels_metadata) <= 10: + logger.info("Telegram-Selektion: Nur %d Kanaele, nutze alle", len(channels_metadata)) + return channels_metadata, None + + channel_lines = [] + for i, ch in enumerate(channels_metadata, 1): + cat = ch.get("category", "sonstige") + notes = (ch.get("notes") or "")[:100] + channel_lines.append(f"{i}. {ch['name']} [{cat}] - {notes}") + + prompt = TELEGRAM_CHANNEL_SELECTION_PROMPT.format( + title=title, + description=description or "Keine weitere Beschreibung", + channel_list="\n".join(channel_lines), + ) + + try: + result, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST) + + arr_match = re.search(r'\[[\d\s,]+\]', result) + if not arr_match: + logger.warning("Telegram-Selektion: Kein JSON in Antwort, nutze alle Kanaele") + return channels_metadata, usage + + indices = json.loads(arr_match.group()) + selected = [] + for idx in indices: + if isinstance(idx, int) and 1 <= idx <= len(channels_metadata): + selected.append(channels_metadata[idx - 1]) + + if not selected: + logger.warning("Telegram-Selektion: Keine gueltigen Indizes, nutze alle Kanaele") + return channels_metadata, usage + + logger.info( + "Telegram-Selektion: %d von %d Kanaelen ausgewaehlt", + len(selected), len(channels_metadata) + ) + return selected, usage + + except Exception as e: + logger.warning("Telegram-Selektion fehlgeschlagen (%s), nutze alle Kanaele", e) + return channels_metadata, None + diff --git a/src/feeds/telegram_parser.py b/src/feeds/telegram_parser.py index 248655a..bf51817 100644 --- a/src/feeds/telegram_parser.py +++ b/src/feeds/telegram_parser.py @@ -61,7 +61,7 @@ class TelegramParser: return None async def search_channels(self, search_term: str, tenant_id: int = None, - keywords: list[str] = None, categories: list[str] = None) -> list[dict]: + keywords: list[str] = None, channel_ids: list[int] = None) -> list[dict]: """Liest Nachrichten aus konfigurierten Telegram-Kanaelen. Gibt Artikel-Dicts zurueck (kompatibel mit RSS-Parser-Format). @@ -72,7 +72,7 @@ class TelegramParser: return [] # Telegram-Kanaele aus DB laden - channels = await self._get_telegram_channels(tenant_id, categories=categories) + channels = await self._get_telegram_channels(tenant_id, channel_ids=channel_ids) if not channels: logger.info("Keine Telegram-Kanaele konfiguriert") return [] @@ -106,25 +106,24 @@ class TelegramParser: logger.info("Telegram: %d relevante Nachrichten aus %d Kanaelen", len(all_articles), len(channels)) return all_articles - async def _get_telegram_channels(self, tenant_id: int = None, categories: list[str] = None) -> list[dict]: + async def _get_telegram_channels(self, tenant_id: int = None, channel_ids: list[int] = None) -> list[dict]: """Laedt Telegram-Kanaele aus der sources-Tabelle.""" try: from database import get_db db = await get_db() try: - if categories and len(categories) > 0: - placeholders = ",".join("?" for _ in categories) + if channel_ids and len(channel_ids) > 0: + placeholders = ",".join("?" for _ in channel_ids) cursor = await db.execute( - f"""SELECT id, name, url FROM sources + f"""SELECT id, name, url, category, notes FROM sources WHERE source_type = 'telegram_channel' AND status = 'active' - AND (tenant_id IS NULL OR tenant_id = ?) - AND category IN ({placeholders})""", - (tenant_id, *categories), + AND id IN ({placeholders})""", + tuple(channel_ids), ) else: cursor = await db.execute( - """SELECT id, name, url FROM sources + """SELECT id, name, url, category, notes FROM sources WHERE source_type = 'telegram_channel' AND status = 'active' AND (tenant_id IS NULL OR tenant_id = ?)""", @@ -171,11 +170,11 @@ class TelegramParser: text = msg.text text_lower = text.lower() - # Keyword-Matching (gleiche Logik wie RSS-Parser) - min_matches = min(2, max(1, (len(search_words) + 1) // 2)) + # Keyword-Matching (lockerer als RSS: 1 Match reicht, + # da Kanaele bereits thematisch vorselektiert sind) match_count = sum(1 for word in search_words if word in text_lower) - if match_count < min_matches: + if match_count < 1: continue # Erste Zeile als Headline, Rest als Content diff --git a/src/main.py b/src/main.py index cca30a7..e21aa9a 100644 --- a/src/main.py +++ b/src/main.py @@ -331,6 +331,7 @@ from routers.sources import router as sources_router from routers.notifications import router as notifications_router from routers.feedback import router as feedback_router from routers.public_api import router as public_api_router +from routers.chat import router as chat_router app.include_router(auth_router) app.include_router(incidents_router) @@ -338,6 +339,7 @@ app.include_router(sources_router) app.include_router(notifications_router) app.include_router(feedback_router) app.include_router(public_api_router) +app.include_router(chat_router, prefix="/api/chat") @app.websocket("/api/ws") diff --git a/src/models.py b/src/models.py index 5358998..c31ccb4 100644 --- a/src/models.py +++ b/src/models.py @@ -53,7 +53,6 @@ class IncidentCreate(BaseModel): retention_days: int = Field(default=0, ge=0, le=999) international_sources: bool = True include_telegram: bool = False - telegram_categories: Optional[list[str]] = None visibility: str = Field(default="public", pattern="^(public|private)$") @@ -67,7 +66,6 @@ class IncidentUpdate(BaseModel): retention_days: Optional[int] = Field(default=None, ge=0, le=999) international_sources: Optional[bool] = None include_telegram: Optional[bool] = None - telegram_categories: Optional[list[str]] = None visibility: Optional[str] = Field(default=None, pattern="^(public|private)$") @@ -85,7 +83,6 @@ class IncidentResponse(BaseModel): sources_json: Optional[str] = None international_sources: bool = True include_telegram: bool = False - telegram_categories: Optional[list[str]] = None created_by: int created_by_username: str = "" created_at: str