Telegram-Kanaele als Quelle: Parser, Pipeline, UI-Checkbox, Validate-Endpoint

- Neuer source_type telegram_channel in models.py (Source + Incident) - DB-Migration: include_telegram Spalte fuer incidents - feeds/telegram_parser.py: Telethon-basierter Parser (analog RSS) - Orchestrator: Telegram-Pipeline parallel zu RSS + WebSearch - sources.py: POST /api/sources/telegram/validate Endpoint - incidents.py: include_telegram in Create/Update/Response - dashboard.html: Telegram-Checkbox + Filter-Option - app.js: FormData, EditModal, SourceStats, TypeLabels - config.py: TELEGRAM_API_ID, API_HASH, SESSION_PATH - requirements.txt: telethon hinzugefuegt
2026-03-13 13:10:24 +01:00
Commit 01cad9dac5
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ websockets
 python-multipart
 aiosmtplib
 geonamescache>=2.0
+telethon
--- a/src/agents/orchestrator.py
+++ b/src/agents/orchestrator.py
@@ -535,6 +535,7 @@ class AgentOrchestrator:
            description = incident["description"] or ""
            incident_type = incident["type"] or "adhoc"
            international = bool(incident["international_sources"]) if "international_sources" in incident.keys() else True
+            include_telegram = bool(incident["include_telegram"]) if "include_telegram" in incident.keys() else False
            visibility = incident["visibility"] if "visibility" in incident.keys() else "public"
            created_by = incident["created_by"] if "created_by" in incident.keys() else None
            tenant_id = incident["tenant_id"] if "tenant_id" in incident.keys() else None
@@ -620,11 +621,24 @@ class AgentOrchestrator:
                logger.info(f"Claude-Recherche: {len(results)} Ergebnisse")
                return results, usage

-            # Beide Pipelines parallel starten
-            (rss_articles, rss_feed_usage), (search_results, search_usage) = await asyncio.gather(
-                _rss_pipeline(),
-                _web_search_pipeline(),
-            )
+            async def _telegram_pipeline():
+                """Telegram-Kanal-Suche."""
+                from feeds.telegram_parser import TelegramParser
+                tg_parser = TelegramParser()
+                articles = await tg_parser.search_channels(title, tenant_id=tenant_id, keywords=None)
+                logger.info(f"Telegram-Pipeline: {len(articles)} Nachrichten")
+                return articles, None
+
+            # Pipelines parallel starten (RSS + WebSearch + optional Telegram)
+            pipelines = [_rss_pipeline(), _web_search_pipeline()]
+            if include_telegram:
+                pipelines.append(_telegram_pipeline())
+
+            pipeline_results = await asyncio.gather(*pipelines)
+
+            (rss_articles, rss_feed_usage) = pipeline_results[0]
+            (search_results, search_usage) = pipeline_results[1]
+            telegram_articles = pipeline_results[2][0] if include_telegram else []

            if rss_feed_usage:
                usage_acc.add(rss_feed_usage)
@@ -635,7 +649,7 @@ class AgentOrchestrator:
            self._check_cancelled(incident_id)

            # Alle Ergebnisse zusammenführen
-            all_results = rss_articles + search_results
+            all_results = rss_articles + search_results + telegram_articles

            # Duplikate entfernen (normalisierte URL + Headline-Ähnlichkeit)
            seen_urls = set()
--- a/src/config.py
+++ b/src/config.py
@@ -76,3 +76,9 @@ MAX_ARTICLES_PER_DOMAIN_RSS = 10  # Max. Artikel pro Domain nach RSS-Fetch
 # Magic Link
 MAGIC_LINK_EXPIRE_MINUTES = 10
 MAGIC_LINK_BASE_URL = os.environ.get("MAGIC_LINK_BASE_URL", "https://monitor.aegis-sight.de")
+
+# Telegram (Telethon)
+TELEGRAM_API_ID = int(os.environ.get("TELEGRAM_API_ID", "2040"))
+TELEGRAM_API_HASH = os.environ.get("TELEGRAM_API_HASH", "b18441a1ff607e10a989891a5462e627")
+TELEGRAM_SESSION_PATH = os.environ.get("TELEGRAM_SESSION_PATH", "/home/claude-dev/.telegram/telegram_session")
+
--- a/src/database.py
+++ b/src/database.py
@@ -259,6 +259,12 @@ async def init_db():
            await db.execute("ALTER TABLE incidents ADD COLUMN visibility TEXT DEFAULT 'public'")
            await db.commit()

+        if "include_telegram" not in columns:
+            await db.execute("ALTER TABLE incidents ADD COLUMN include_telegram INTEGER DEFAULT 0")
+            await db.commit()
+            logger.info("Migration: include_telegram zu incidents hinzugefuegt")
+
+
        if "tenant_id" not in columns:
            await db.execute("ALTER TABLE incidents ADD COLUMN tenant_id INTEGER REFERENCES organizations(id)")
            await db.commit()
--- a/src/feeds/telegram_parser.py
+++ b/src/feeds/telegram_parser.py
@@ -0,0 +1,251 @@
+"""Telegram-Kanal Parser: Liest Nachrichten aus konfigurierten Telegram-Kanaelen."""
+import asyncio
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Optional
+
+from config import TIMEZONE, TELEGRAM_API_ID, TELEGRAM_API_HASH, TELEGRAM_SESSION_PATH
+
+logger = logging.getLogger("osint.telegram")
+
+# Stoppwoerter (gleich wie RSS-Parser)
+STOP_WORDS = {
+    "und", "oder", "der", "die", "das", "ein", "eine", "in", "im", "am", "an",
+    "auf", "fuer", "mit", "von", "zu", "zum", "zur", "bei", "nach", "vor",
+    "ueber", "unter", "ist", "sind", "hat", "the", "and", "for", "with", "from",
+}
+
+
+class TelegramParser:
+    """Durchsucht Telegram-Kanaele nach relevanten Nachrichten."""
+
+    _client = None
+    _lock = asyncio.Lock()
+
+    async def _get_client(self):
+        """Telethon-Client erstellen oder wiederverwenden."""
+        if TelegramParser._client is not None:
+            if TelegramParser._client.is_connected():
+                return TelegramParser._client
+
+        async with TelegramParser._lock:
+            # Double-check nach Lock
+            if TelegramParser._client is not None and TelegramParser._client.is_connected():
+                return TelegramParser._client
+
+            try:
+                from telethon import TelegramClient
+                session_path = TELEGRAM_SESSION_PATH
+                if not os.path.exists(session_path + ".session") and not os.path.exists(session_path):
+                    logger.error("Telegram-Session nicht gefunden: %s", session_path)
+                    return None
+
+                client = TelegramClient(session_path, TELEGRAM_API_ID, TELEGRAM_API_HASH)
+                await client.connect()
+
+                if not await client.is_user_authorized():
+                    logger.error("Telegram-Session nicht autorisiert. Bitte neu einloggen.")
+                    await client.disconnect()
+                    return None
+
+                TelegramParser._client = client
+                me = await client.get_me()
+                logger.info("Telegram verbunden als: %s (%s)", me.first_name, me.phone)
+                return client
+            except ImportError:
+                logger.error("telethon nicht installiert: pip install telethon")
+                return None
+            except Exception as e:
+                logger.error("Telegram-Verbindung fehlgeschlagen: %s", e)
+                return None
+
+    async def search_channels(self, search_term: str, tenant_id: int = None,
+                              keywords: list[str] = None) -> list[dict]:
+        """Liest Nachrichten aus konfigurierten Telegram-Kanaelen.
+
+        Gibt Artikel-Dicts zurueck (kompatibel mit RSS-Parser-Format).
+        """
+        client = await self._get_client()
+        if not client:
+            logger.warning("Telegram-Client nicht verfuegbar, ueberspringe Telegram-Pipeline")
+            return []
+
+        # Telegram-Kanaele aus DB laden
+        channels = await self._get_telegram_channels(tenant_id)
+        if not channels:
+            logger.info("Keine Telegram-Kanaele konfiguriert")
+            return []
+
+        # Suchwoerter vorbereiten
+        if keywords:
+            search_words = [w.lower().strip() for w in keywords if w.strip()]
+        else:
+            search_words = [
+                w for w in search_term.lower().split()
+                if w not in STOP_WORDS and len(w) >= 3
+            ]
+            if not search_words:
+                search_words = search_term.lower().split()[:2]
+
+        # Kanaele parallel abrufen
+        tasks = []
+        for ch in channels:
+            channel_id = ch["url"] or ch["name"]
+            tasks.append(self._fetch_channel(client, channel_id, search_words))
+
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        all_articles = []
+        for i, result in enumerate(results):
+            if isinstance(result, Exception):
+                logger.warning("Telegram-Kanal %s: %s", channels[i]["name"], result)
+                continue
+            all_articles.extend(result)
+
+        logger.info("Telegram: %d relevante Nachrichten aus %d Kanaelen", len(all_articles), len(channels))
+        return all_articles
+
+    async def _get_telegram_channels(self, tenant_id: int = None) -> list[dict]:
+        """Laedt Telegram-Kanaele aus der sources-Tabelle."""
+        try:
+            from database import get_db
+            db = await get_db()
+            try:
+                cursor = await db.execute(
+                    """SELECT id, name, url FROM sources
+                       WHERE source_type = 'telegram_channel'
+                       AND status = 'active'
+                       AND (tenant_id IS NULL OR tenant_id = ?)""",
+                    (tenant_id,),
+                )
+                rows = await cursor.fetchall()
+                return [dict(row) for row in rows]
+            finally:
+                await db.close()
+        except Exception as e:
+            logger.error("Fehler beim Laden der Telegram-Kanaele: %s", e)
+            return []
+
+    async def _fetch_channel(self, client, channel_id: str, search_words: list[str],
+                             limit: int = 50) -> list[dict]:
+        """Letzte N Nachrichten eines Kanals abrufen und nach Keywords filtern."""
+        articles = []
+        try:
+            # Kanal-Identifier normalisieren
+            identifier = channel_id.strip()
+            if identifier.startswith("https://t.me/"):
+                identifier = identifier.replace("https://t.me/", "")
+            if identifier.startswith("t.me/"):
+                identifier = identifier.replace("t.me/", "")
+
+            # Privater Invite-Link
+            if identifier.startswith("+") or identifier.startswith("joinchat/"):
+                entity = await client.get_entity(channel_id)
+            else:
+                # Oeffentlicher Kanal
+                if not identifier.startswith("@"):
+                    identifier = "@" + identifier
+                entity = await client.get_entity(identifier)
+
+            messages = await client.get_messages(entity, limit=limit)
+
+            channel_title = getattr(entity, "title", identifier)
+            channel_username = getattr(entity, "username", identifier.replace("@", ""))
+
+            for msg in messages:
+                if not msg.text:
+                    continue
+
+                text = msg.text
+                text_lower = text.lower()
+
+                # Keyword-Matching (gleiche Logik wie RSS-Parser)
+                min_matches = min(2, max(1, (len(search_words) + 1) // 2))
+                match_count = sum(1 for word in search_words if word in text_lower)
+
+                if match_count < min_matches:
+                    continue
+
+                # Erste Zeile als Headline, Rest als Content
+                lines = text.strip().split("\n")
+                headline = lines[0][:200] if lines else text[:200]
+                content = text
+
+                # Datum
+                published = None
+                if msg.date:
+                    try:
+                        published = msg.date.astimezone(TIMEZONE).isoformat()
+                    except Exception:
+                        published = msg.date.isoformat()
+
+                # Source-URL: t.me/channel/msg_id
+                if channel_username:
+                    source_url = "https://t.me/%s/%s" % (channel_username, msg.id)
+                else:
+                    source_url = "https://t.me/c/%s/%s" % (entity.id, msg.id)
+
+                relevance_score = match_count / len(search_words) if search_words else 0.0
+
+                articles.append({
+                    "headline": headline,
+                    "headline_de": headline if self._is_german(headline) else None,
+                    "source": "Telegram: %s" % channel_title,
+                    "source_url": source_url,
+                    "content_original": content[:2000],
+                    "content_de": content[:2000] if self._is_german(content) else None,
+                    "language": "de" if self._is_german(content) else "en",
+                    "published_at": published,
+                    "relevance_score": relevance_score,
+                })
+
+        except Exception as e:
+            logger.warning("Telegram-Kanal %s: %s", channel_id, e)
+
+        return articles
+
+    async def validate_channel(self, channel_id: str) -> Optional[dict]:
+        """Prueft ob ein Telegram-Kanal erreichbar ist und gibt Info zurueck."""
+        client = await self._get_client()
+        if not client:
+            return None
+
+        try:
+            identifier = channel_id.strip()
+            if identifier.startswith("https://t.me/"):
+                identifier = identifier.replace("https://t.me/", "")
+            if identifier.startswith("t.me/"):
+                identifier = identifier.replace("t.me/", "")
+
+            if identifier.startswith("+") or identifier.startswith("joinchat/"):
+                return {"valid": True, "name": "Privater Kanal", "description": "Privater Einladungslink", "subscribers": None}
+
+            if not identifier.startswith("@"):
+                identifier = "@" + identifier
+
+            entity = await client.get_entity(identifier)
+
+            from telethon.tl.functions.channels import GetFullChannelRequest
+            full = await client(GetFullChannelRequest(entity))
+
+            return {
+                "valid": True,
+                "name": getattr(entity, "title", identifier),
+                "description": getattr(full.full_chat, "about", "") or "",
+                "subscribers": getattr(full.full_chat, "participants_count", None),
+                "username": getattr(entity, "username", ""),
+            }
+        except Exception as e:
+            logger.warning("Telegram-Kanal-Validierung fehlgeschlagen fuer %s: %s", channel_id, e)
+            return None
+
+    def _is_german(self, text: str) -> bool:
+        """Einfache Heuristik ob ein Text deutsch ist."""
+        german_words = {"der", "die", "das", "und", "ist", "von", "mit", "fuer", "auf", "ein",
+                        "eine", "den", "dem", "des", "sich", "wird", "nach", "bei", "auch",
+                        "ueber", "wie", "aus", "hat", "zum", "zur", "als", "noch", "mehr",
+                        "nicht", "aber", "oder", "sind", "vor", "einem", "einer", "wurde"}
+        words = set(text.lower().split())
+        matches = words & german_words
+        return len(matches) >= 2
--- a/src/models.py
+++ b/src/models.py
@@ -52,6 +52,7 @@ class IncidentCreate(BaseModel):
    refresh_interval: int = Field(default=15, ge=10, le=10080)
    retention_days: int = Field(default=0, ge=0, le=999)
    international_sources: bool = True
+    include_telegram: bool = False
    visibility: str = Field(default="public", pattern="^(public|private)$")


@@ -64,6 +65,7 @@ class IncidentUpdate(BaseModel):
    refresh_interval: Optional[int] = Field(default=None, ge=10, le=10080)
    retention_days: Optional[int] = Field(default=None, ge=0, le=999)
    international_sources: Optional[bool] = None
+    include_telegram: Optional[bool] = None
    visibility: Optional[str] = Field(default=None, pattern="^(public|private)$")


@@ -80,6 +82,7 @@ class IncidentResponse(BaseModel):
    summary: Optional[str]
    sources_json: Optional[str] = None
    international_sources: bool = True
+    include_telegram: bool = False
    created_by: int
    created_by_username: str = ""
    created_at: str
@@ -95,7 +98,7 @@ class SourceCreate(BaseModel):
    name: str = Field(min_length=1, max_length=200)
    url: Optional[str] = None
    domain: Optional[str] = None
-    source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded)$")
+    source_type: str = Field(default="rss_feed", pattern="^(rss_feed|web_source|excluded|telegram_channel)$")
    category: str = Field(default="sonstige", pattern="^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$")
    status: str = Field(default="active", pattern="^(active|inactive)$")
    notes: Optional[str] = None
@@ -105,7 +108,7 @@ class SourceUpdate(BaseModel):
    name: Optional[str] = Field(default=None, max_length=200)
    url: Optional[str] = None
    domain: Optional[str] = None
-    source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded)$")
+    source_type: Optional[str] = Field(default=None, pattern="^(rss_feed|web_source|excluded|telegram_channel)$")
    category: Optional[str] = Field(default=None, pattern="^(nachrichtenagentur|oeffentlich-rechtlich|qualitaetszeitung|behoerde|fachmedien|think-tank|international|regional|boulevard|sonstige)$")
    status: Optional[str] = Field(default=None, pattern="^(active|inactive)$")
    notes: Optional[str] = None
--- a/src/routers/incidents.py
+++ b/src/routers/incidents.py
@@ -20,7 +20,7 @@ router = APIRouter(prefix="/api/incidents", tags=["incidents"])

 INCIDENT_UPDATE_COLUMNS = {
    "title", "description", "type", "status", "refresh_mode",
-    "refresh_interval", "retention_days", "international_sources", "visibility",
+    "refresh_interval", "retention_days", "international_sources", "include_telegram", "visibility",
 }


@@ -105,9 +105,9 @@ async def create_incident(
    now = datetime.now(TIMEZONE).strftime('%Y-%m-%d %H:%M:%S')
    cursor = await db.execute(
        """INSERT INTO incidents (title, description, type, refresh_mode, refresh_interval,
-           retention_days, international_sources, visibility,
+           retention_days, international_sources, include_telegram, visibility,
           tenant_id, created_by, created_at, updated_at)
-           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
+           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
        (
            data.title,
            data.description,
@@ -116,6 +116,7 @@ async def create_incident(
            data.refresh_interval,
            data.retention_days,
            1 if data.international_sources else 0,
+            1 if data.include_telegram else 0,
            data.visibility,
            tenant_id,
            current_user["id"],
@@ -721,6 +722,7 @@ def _build_json_export(
            "updated_at": incident.get("updated_at"),
            "summary": incident.get("summary"),
            "international_sources": bool(incident.get("international_sources")),
+            "include_telegram": bool(incident.get("include_telegram")),
        },
        "sources": sources,
        "fact_checks": [
--- a/src/routers/sources.py
+++ b/src/routers/sources.py
@@ -87,6 +87,7 @@ async def get_source_stats(
    stats = {
        "rss_feed": {"count": 0, "articles": 0},
        "web_source": {"count": 0, "articles": 0},
+        "telegram_channel": {"count": 0, "articles": 0},
        "excluded": {"count": 0, "articles": 0},
    }
    for row in rows:
@@ -516,6 +517,32 @@ async def delete_source(
    await db.commit()


+
+
+@router.post("/telegram/validate")
+async def validate_telegram_channel(
+    data: dict,
+    current_user: dict = Depends(get_current_user),
+):
+    """Prueft ob ein Telegram-Kanal erreichbar ist und gibt Kanalinfo zurueck."""
+    channel_id = data.get("channel_id", "").strip()
+    if not channel_id:
+        raise HTTPException(status_code=400, detail="channel_id ist erforderlich")
+
+    try:
+        from feeds.telegram_parser import TelegramParser
+        parser = TelegramParser()
+        result = await parser.validate_channel(channel_id)
+        if result:
+            return result
+        raise HTTPException(status_code=404, detail="Kanal nicht erreichbar oder nicht gefunden")
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error("Telegram-Validierung fehlgeschlagen: %s", e, exc_info=True)
+        raise HTTPException(status_code=500, detail="Telegram-Validierung fehlgeschlagen")
+
+
@router.post("/refresh-counts")
 async def trigger_refresh_counts(
    current_user: dict = Depends(get_current_user),
--- a/src/static/dashboard.html
+++ b/src/static/dashboard.html
@@ -339,6 +339,14 @@
                            </label>
                            <div class="form-hint" id="sources-hint">DE + internationale Feeds (Reuters, BBC, Al Jazeera etc.)</div>
                        </div>
+                        <div class="toggle-group" style="margin-top: 8px;">
+                            <label class="toggle-label">
+                                <input type="checkbox" id="inc-telegram">
+                                <span class="toggle-switch"></span>
+                                <span class="toggle-text">Telegram-Kanäle einbeziehen</span>
+                            </label>
+                            <div class="form-hint" id="telegram-hint">Nachrichten aus konfigurierten Telegram-Kanälen berücksichtigen</div>
+                        </div>
                    </div>
                    <div class="form-group">
                        <label>Sichtbarkeit</label>
@@ -427,6 +435,7 @@
                            <option value="">Alle Typen</option>
                            <option value="rss_feed">RSS-Feed</option>
                            <option value="web_source">Web-Quelle</option>
+                            <option value="telegram_channel">Telegram</option>
                            <option value="excluded">Von mir ausgeschlossen</option>
                        </select>
                        <label for="sources-filter-category" class="sr-only">Kategorie filtern</label>
--- a/src/static/js/app.js
+++ b/src/static/js/app.js