feat: Intelligente Telegram-Kanal-Selektion und verbesserte Quellenzuordnung
- Researcher: Claude-basierte Vorauswahl relevanter Telegram-Kanäle per Haiku
- FactChecker: Verbesserte Quellen-Zuordnung mit Relevanz-Scoring (Top 5)
- FactChecker: URLs werden nicht mehr doppelt zugeordnet, sources_count wird aktualisiert
- TelegramParser: Kanal-Filterung per channel_ids statt categories
- TelegramParser: Lockereres Keyword-Matching (1 Match reicht, da vorselektiert)
- Models: telegram_categories-Feld entfernt (durch KI-Selektion ersetzt)
- Main: Chat-Router eingebunden unter /api/chat

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -706,56 +706,83 @@ class FactCheckerAgent:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
def _validate_facts(self, facts: list[dict], articles: list[dict] = None) -> list[dict]:
|
def _validate_facts(self, facts: list[dict], articles: list[dict] = None) -> list[dict]:
|
||||||
"""Validiert Fakten: Bei fehlender URL werden Ursprungsquellen aus den Artikeln ergaenzt."""
|
"""Validiert Fakten und ordnet Quellen-URLs aus den Artikeln zu.
|
||||||
|
|
||||||
|
Stellt sicher, dass jeder confirmed/established Fakt URLs in der
|
||||||
|
evidence hat, damit das Frontend die Quellen korrekt anzeigen kann.
|
||||||
|
"""
|
||||||
url_pattern = re.compile(r'https?://')
|
url_pattern = re.compile(r'https?://')
|
||||||
# Verfuegbare Artikel-URLs sammeln
|
# Verfuegbare Artikel-URLs sammeln (dedupliziert nach URL)
|
||||||
article_sources = []
|
article_sources = []
|
||||||
|
seen_urls = set()
|
||||||
if articles:
|
if articles:
|
||||||
for a in articles:
|
for a in articles:
|
||||||
url = a.get("source_url", "")
|
url = a.get("source_url", "")
|
||||||
source = a.get("source", "")
|
source = a.get("source", "")
|
||||||
headline = a.get("headline_de") or a.get("headline", "")
|
headline = a.get("headline_de") or a.get("headline", "")
|
||||||
if url:
|
if url and url not in seen_urls:
|
||||||
|
seen_urls.add(url)
|
||||||
article_sources.append({"url": url, "source": source, "headline": headline})
|
article_sources.append({"url": url, "source": source, "headline": headline})
|
||||||
|
|
||||||
for fact in facts:
|
for fact in facts:
|
||||||
status = fact.get("status", "")
|
status = fact.get("status", "")
|
||||||
evidence = fact.get("evidence") or ""
|
evidence = fact.get("evidence") or ""
|
||||||
if status in ("confirmed", "established") and not url_pattern.search(evidence):
|
|
||||||
# Passende Ursprungsquellen finden (Keyword-Match auf Claim)
|
# Fuer alle Fakten: Quellen zuordnen
|
||||||
|
if status not in ("retracted",):
|
||||||
|
# Bereits vorhandene URLs in der evidence zaehlen
|
||||||
|
existing_urls = set(url_pattern.findall(evidence))
|
||||||
|
|
||||||
|
# Passende Quellen per Keyword-Match finden
|
||||||
claim_lower = (fact.get("claim") or "").lower()
|
claim_lower = (fact.get("claim") or "").lower()
|
||||||
claim_words = [w for w in claim_lower.split() if len(w) >= 4][:8]
|
evidence_lower = evidence.lower()
|
||||||
matched_sources = []
|
claim_words = [w for w in claim_lower.split() if len(w) >= 4][:10]
|
||||||
|
|
||||||
|
scored_sources = []
|
||||||
for src in article_sources:
|
for src in article_sources:
|
||||||
|
if src["url"] in existing_urls:
|
||||||
|
continue # Bereits in evidence
|
||||||
src_text = (src["headline"] + " " + src["source"]).lower()
|
src_text = (src["headline"] + " " + src["source"]).lower()
|
||||||
matches = sum(1 for w in claim_words if w in src_text)
|
matches = sum(1 for w in claim_words if w in src_text)
|
||||||
if matches >= max(1, len(claim_words) // 4):
|
if matches >= max(1, len(claim_words) // 5):
|
||||||
matched_sources.append(src)
|
scored_sources.append((matches, src))
|
||||||
if len(matched_sources) >= 3:
|
|
||||||
break
|
# Nach Relevanz sortieren, Top 5 nehmen
|
||||||
|
scored_sources.sort(key=lambda x: x[0], reverse=True)
|
||||||
|
matched_sources = [s for _, s in scored_sources[:5]]
|
||||||
|
|
||||||
if matched_sources:
|
if matched_sources:
|
||||||
# Ursprungsquellen anhaengen statt herabstufen
|
|
||||||
source_refs = "; ".join(
|
source_refs = "; ".join(
|
||||||
f"{s['source']} ({s['url']})" for s in matched_sources
|
f"{s['source']} ({s['url']})" for s in matched_sources
|
||||||
)
|
)
|
||||||
fact["evidence"] = (
|
if existing_urls:
|
||||||
evidence.rstrip(". ") +
|
# Bereits URLs vorhanden, weitere ergaenzen
|
||||||
". [Ursprungsquellen: " + source_refs +
|
fact["evidence"] = (
|
||||||
" — Quellenlinks zum Zeitpunkt der Recherche moeglicherweise nicht mehr verfuegbar]"
|
evidence.rstrip(". ") +
|
||||||
)
|
". [Weitere Quellen: " + source_refs + "]"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Keine URLs vorhanden, Quellen anhaengen
|
||||||
|
fact["evidence"] = (
|
||||||
|
evidence.rstrip(". ") +
|
||||||
|
". [Quellen: " + source_refs + "]"
|
||||||
|
)
|
||||||
|
|
||||||
|
# sources_count aktualisieren
|
||||||
|
all_urls = url_pattern.findall(fact["evidence"])
|
||||||
|
fact["sources_count"] = len(set(all_urls))
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Fakt '{fact.get('claim', '')[:50]}...' ergaenzt mit "
|
f"Fakt '{fact.get('claim', '')[:50]}...' ergaenzt mit "
|
||||||
f"{len(matched_sources)} Ursprungsquelle(n)"
|
f"{len(matched_sources)} Quelle(n), gesamt: {fact['sources_count']}"
|
||||||
)
|
)
|
||||||
else:
|
elif not existing_urls:
|
||||||
# Keine passende Quelle gefunden -> herabstufen
|
# Weder bestehende URLs noch passende Quellen
|
||||||
old_status = status
|
old_status = status
|
||||||
fact["status"] = "unconfirmed" if status == "confirmed" else "unverified"
|
fact["status"] = "unconfirmed" if status == "confirmed" else "unverified"
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Fakt herabgestuft ({old_status} -> {fact['status']}): "
|
f"Fakt herabgestuft ({old_status} -> {fact['status']}): "
|
||||||
f"keine URL in Evidenz und keine passende Ursprungsquelle: "
|
f"keine Quellen zuordnebar: '{fact.get('claim', '')[:60]}...'"
|
||||||
f"'{fact.get('claim', '')[:60]}...'"
|
|
||||||
)
|
)
|
||||||
return facts
|
return facts
|
||||||
|
|
||||||
|
|||||||
@@ -136,6 +136,25 @@ Antwort NUR als JSON-Array:
|
|||||||
[{{"de": "iran", "en": "iran"}}, {{"de": "israel", "en": "israel"}}, {{"de": "teheran", "en": "tehran"}}, {{"de": "luftangriff", "en": "airstrike"}}, {{"de": "trump", "en": "trump"}}]"""
|
[{{"de": "iran", "en": "iran"}}, {{"de": "israel", "en": "israel"}}, {{"de": "teheran", "en": "tehran"}}, {{"de": "luftangriff", "en": "airstrike"}}, {{"de": "trump", "en": "trump"}}]"""
|
||||||
|
|
||||||
|
|
||||||
|
TELEGRAM_CHANNEL_SELECTION_PROMPT = """Du bist ein OSINT-Analyst. Waehle aus dieser Liste von Telegram-Kanaelen diejenigen aus, die fuer die Lage relevant sein koennten.
|
||||||
|
|
||||||
|
LAGE: {title}
|
||||||
|
KONTEXT: {description}
|
||||||
|
|
||||||
|
TELEGRAM-KANAELE:
|
||||||
|
{channel_list}
|
||||||
|
|
||||||
|
REGELN:
|
||||||
|
- Waehle alle Kanaele die thematisch relevant sein koennten
|
||||||
|
- Lieber einen Kanal zu viel als zu wenig auswaehlen
|
||||||
|
- Beachte die Kategorie und Beschreibung jedes Kanals
|
||||||
|
- Allgemeine OSINT-Kanaele sind oft relevant
|
||||||
|
- Bei Cybercrime-Themen: Cybercrime + Leaks Kanaele waehlen
|
||||||
|
- Bei geopolitischen Themen: Relevante Laender-/Regionskanaele waehlen
|
||||||
|
|
||||||
|
Antworte NUR mit einem JSON-Array der Kanal-Nummern, z.B.: [1, 3, 5, 12]"""
|
||||||
|
|
||||||
|
|
||||||
class ResearcherAgent:
|
class ResearcherAgent:
|
||||||
"""Führt OSINT-Recherchen über Claude CLI WebSearch durch."""
|
"""Führt OSINT-Recherchen über Claude CLI WebSearch durch."""
|
||||||
|
|
||||||
@@ -388,3 +407,61 @@ class ResearcherAgent:
|
|||||||
|
|
||||||
logger.warning(f"Konnte Claude-Antwort nicht als JSON parsen (Laenge: {len(response)})")
|
logger.warning(f"Konnte Claude-Antwort nicht als JSON parsen (Laenge: {len(response)})")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
async def select_relevant_telegram_channels(
|
||||||
|
self,
|
||||||
|
title: str,
|
||||||
|
description: str,
|
||||||
|
channels_metadata: list[dict],
|
||||||
|
) -> tuple[list[dict], ClaudeUsage | None]:
|
||||||
|
"""Laesst Claude die relevanten Telegram-Kanaele fuer eine Lage vorauswaehlen.
|
||||||
|
|
||||||
|
Nutzt Haiku (CLAUDE_MODEL_FAST) fuer diese einfache Aufgabe.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
(ausgewaehlte Kanaele, usage) -- Bei Fehler: (alle Kanaele, None)
|
||||||
|
"""
|
||||||
|
if len(channels_metadata) <= 10:
|
||||||
|
logger.info("Telegram-Selektion: Nur %d Kanaele, nutze alle", len(channels_metadata))
|
||||||
|
return channels_metadata, None
|
||||||
|
|
||||||
|
channel_lines = []
|
||||||
|
for i, ch in enumerate(channels_metadata, 1):
|
||||||
|
cat = ch.get("category", "sonstige")
|
||||||
|
notes = (ch.get("notes") or "")[:100]
|
||||||
|
channel_lines.append(f"{i}. {ch['name']} [{cat}] - {notes}")
|
||||||
|
|
||||||
|
prompt = TELEGRAM_CHANNEL_SELECTION_PROMPT.format(
|
||||||
|
title=title,
|
||||||
|
description=description or "Keine weitere Beschreibung",
|
||||||
|
channel_list="\n".join(channel_lines),
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
result, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
|
||||||
|
|
||||||
|
arr_match = re.search(r'\[[\d\s,]+\]', result)
|
||||||
|
if not arr_match:
|
||||||
|
logger.warning("Telegram-Selektion: Kein JSON in Antwort, nutze alle Kanaele")
|
||||||
|
return channels_metadata, usage
|
||||||
|
|
||||||
|
indices = json.loads(arr_match.group())
|
||||||
|
selected = []
|
||||||
|
for idx in indices:
|
||||||
|
if isinstance(idx, int) and 1 <= idx <= len(channels_metadata):
|
||||||
|
selected.append(channels_metadata[idx - 1])
|
||||||
|
|
||||||
|
if not selected:
|
||||||
|
logger.warning("Telegram-Selektion: Keine gueltigen Indizes, nutze alle Kanaele")
|
||||||
|
return channels_metadata, usage
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Telegram-Selektion: %d von %d Kanaelen ausgewaehlt",
|
||||||
|
len(selected), len(channels_metadata)
|
||||||
|
)
|
||||||
|
return selected, usage
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("Telegram-Selektion fehlgeschlagen (%s), nutze alle Kanaele", e)
|
||||||
|
return channels_metadata, None
|
||||||
|
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ class TelegramParser:
|
|||||||
return None
|
return None
|
||||||
|
|
||||||
async def search_channels(self, search_term: str, tenant_id: int = None,
|
async def search_channels(self, search_term: str, tenant_id: int = None,
|
||||||
keywords: list[str] = None, categories: list[str] = None) -> list[dict]:
|
keywords: list[str] = None, channel_ids: list[int] = None) -> list[dict]:
|
||||||
"""Liest Nachrichten aus konfigurierten Telegram-Kanaelen.
|
"""Liest Nachrichten aus konfigurierten Telegram-Kanaelen.
|
||||||
|
|
||||||
Gibt Artikel-Dicts zurueck (kompatibel mit RSS-Parser-Format).
|
Gibt Artikel-Dicts zurueck (kompatibel mit RSS-Parser-Format).
|
||||||
@@ -72,7 +72,7 @@ class TelegramParser:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
# Telegram-Kanaele aus DB laden
|
# Telegram-Kanaele aus DB laden
|
||||||
channels = await self._get_telegram_channels(tenant_id, categories=categories)
|
channels = await self._get_telegram_channels(tenant_id, channel_ids=channel_ids)
|
||||||
if not channels:
|
if not channels:
|
||||||
logger.info("Keine Telegram-Kanaele konfiguriert")
|
logger.info("Keine Telegram-Kanaele konfiguriert")
|
||||||
return []
|
return []
|
||||||
@@ -106,25 +106,24 @@ class TelegramParser:
|
|||||||
logger.info("Telegram: %d relevante Nachrichten aus %d Kanaelen", len(all_articles), len(channels))
|
logger.info("Telegram: %d relevante Nachrichten aus %d Kanaelen", len(all_articles), len(channels))
|
||||||
return all_articles
|
return all_articles
|
||||||
|
|
||||||
async def _get_telegram_channels(self, tenant_id: int = None, categories: list[str] = None) -> list[dict]:
|
async def _get_telegram_channels(self, tenant_id: int = None, channel_ids: list[int] = None) -> list[dict]:
|
||||||
"""Laedt Telegram-Kanaele aus der sources-Tabelle."""
|
"""Laedt Telegram-Kanaele aus der sources-Tabelle."""
|
||||||
try:
|
try:
|
||||||
from database import get_db
|
from database import get_db
|
||||||
db = await get_db()
|
db = await get_db()
|
||||||
try:
|
try:
|
||||||
if categories and len(categories) > 0:
|
if channel_ids and len(channel_ids) > 0:
|
||||||
placeholders = ",".join("?" for _ in categories)
|
placeholders = ",".join("?" for _ in channel_ids)
|
||||||
cursor = await db.execute(
|
cursor = await db.execute(
|
||||||
f"""SELECT id, name, url FROM sources
|
f"""SELECT id, name, url, category, notes FROM sources
|
||||||
WHERE source_type = 'telegram_channel'
|
WHERE source_type = 'telegram_channel'
|
||||||
AND status = 'active'
|
AND status = 'active'
|
||||||
AND (tenant_id IS NULL OR tenant_id = ?)
|
AND id IN ({placeholders})""",
|
||||||
AND category IN ({placeholders})""",
|
tuple(channel_ids),
|
||||||
(tenant_id, *categories),
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
cursor = await db.execute(
|
cursor = await db.execute(
|
||||||
"""SELECT id, name, url FROM sources
|
"""SELECT id, name, url, category, notes FROM sources
|
||||||
WHERE source_type = 'telegram_channel'
|
WHERE source_type = 'telegram_channel'
|
||||||
AND status = 'active'
|
AND status = 'active'
|
||||||
AND (tenant_id IS NULL OR tenant_id = ?)""",
|
AND (tenant_id IS NULL OR tenant_id = ?)""",
|
||||||
@@ -171,11 +170,11 @@ class TelegramParser:
|
|||||||
text = msg.text
|
text = msg.text
|
||||||
text_lower = text.lower()
|
text_lower = text.lower()
|
||||||
|
|
||||||
# Keyword-Matching (gleiche Logik wie RSS-Parser)
|
# Keyword-Matching (lockerer als RSS: 1 Match reicht,
|
||||||
min_matches = min(2, max(1, (len(search_words) + 1) // 2))
|
# da Kanaele bereits thematisch vorselektiert sind)
|
||||||
match_count = sum(1 for word in search_words if word in text_lower)
|
match_count = sum(1 for word in search_words if word in text_lower)
|
||||||
|
|
||||||
if match_count < min_matches:
|
if match_count < 1:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Erste Zeile als Headline, Rest als Content
|
# Erste Zeile als Headline, Rest als Content
|
||||||
|
|||||||
@@ -331,6 +331,7 @@ from routers.sources import router as sources_router
|
|||||||
from routers.notifications import router as notifications_router
|
from routers.notifications import router as notifications_router
|
||||||
from routers.feedback import router as feedback_router
|
from routers.feedback import router as feedback_router
|
||||||
from routers.public_api import router as public_api_router
|
from routers.public_api import router as public_api_router
|
||||||
|
from routers.chat import router as chat_router
|
||||||
|
|
||||||
app.include_router(auth_router)
|
app.include_router(auth_router)
|
||||||
app.include_router(incidents_router)
|
app.include_router(incidents_router)
|
||||||
@@ -338,6 +339,7 @@ app.include_router(sources_router)
|
|||||||
app.include_router(notifications_router)
|
app.include_router(notifications_router)
|
||||||
app.include_router(feedback_router)
|
app.include_router(feedback_router)
|
||||||
app.include_router(public_api_router)
|
app.include_router(public_api_router)
|
||||||
|
app.include_router(chat_router, prefix="/api/chat")
|
||||||
|
|
||||||
|
|
||||||
@app.websocket("/api/ws")
|
@app.websocket("/api/ws")
|
||||||
|
|||||||
@@ -53,7 +53,6 @@ class IncidentCreate(BaseModel):
|
|||||||
retention_days: int = Field(default=0, ge=0, le=999)
|
retention_days: int = Field(default=0, ge=0, le=999)
|
||||||
international_sources: bool = True
|
international_sources: bool = True
|
||||||
include_telegram: bool = False
|
include_telegram: bool = False
|
||||||
telegram_categories: Optional[list[str]] = None
|
|
||||||
visibility: str = Field(default="public", pattern="^(public|private)$")
|
visibility: str = Field(default="public", pattern="^(public|private)$")
|
||||||
|
|
||||||
|
|
||||||
@@ -67,7 +66,6 @@ class IncidentUpdate(BaseModel):
|
|||||||
retention_days: Optional[int] = Field(default=None, ge=0, le=999)
|
retention_days: Optional[int] = Field(default=None, ge=0, le=999)
|
||||||
international_sources: Optional[bool] = None
|
international_sources: Optional[bool] = None
|
||||||
include_telegram: Optional[bool] = None
|
include_telegram: Optional[bool] = None
|
||||||
telegram_categories: Optional[list[str]] = None
|
|
||||||
visibility: Optional[str] = Field(default=None, pattern="^(public|private)$")
|
visibility: Optional[str] = Field(default=None, pattern="^(public|private)$")
|
||||||
|
|
||||||
|
|
||||||
@@ -85,7 +83,6 @@ class IncidentResponse(BaseModel):
|
|||||||
sources_json: Optional[str] = None
|
sources_json: Optional[str] = None
|
||||||
international_sources: bool = True
|
international_sources: bool = True
|
||||||
include_telegram: bool = False
|
include_telegram: bool = False
|
||||||
telegram_categories: Optional[list[str]] = None
|
|
||||||
created_by: int
|
created_by: int
|
||||||
created_by_username: str = ""
|
created_by_username: str = ""
|
||||||
created_at: str
|
created_at: str
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren