Podcast integration, phase 1: feed tag + broadcaster pages
Podcasts are treated like regular RSS sources (source_type=podcast_feed).
No external paid service, no local transcription: the monitor relies
exclusively on transcripts that already exist.
Cascade for obtaining a transcript (a dispatcher sketch follows the list):
1. Podcasting 2.0 tag <podcast:transcript> in the feed (SRT/VTT/HTML/JSON)
2. Editorial manuscript on the episode page
   (adapters: Dlf, SZ, Spiegel, NDR)
3. YouTube captions (phase 2, optional via yt-dlp)
No stage hits -> the episode is dropped (gracefully, no error raised).
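
How the dispatcher might chain the stages, as a minimal sketch; the function
and parameter names here are illustrative, not the actual API of __init__.py:

    # Hypothetical cascade dispatcher; names and signatures are illustrative.
    from typing import Awaitable, Callable, Optional, Protocol

    class Adapter(Protocol):
        def can_handle(self, url: str) -> bool: ...
        async def extract(self, url: str) -> Optional[str]: ...

    async def resolve_transcript(
        entry: dict,
        native_stage: Callable[[dict], Awaitable[Optional[str]]],
        adapters: list[Adapter],
    ) -> Optional[str]:
        # Stage 1: Podcasting 2.0 <podcast:transcript> tag (rss_native.py)
        if text := await native_stage(entry):
            return text
        # Stage 2: editorial manuscript on the episode page
        url = entry.get("link") or ""
        for adapter in adapters:
            if adapter.can_handle(url) and (text := await adapter.extract(url)):
                return text
        # Stage 3 (YouTube captions) comes in phase 2; no hit -> None,
        # the caller drops the episode without raising.
        return None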
New:
- src/feeds/podcast_parser.py (dedicated parser; the RSS hot path is untouched)
- src/feeds/transcript_extractors/ (plugin pattern; see the adapter sketch
  after this list):
  __init__.py         dispatcher, cache lookup against podcast_transcripts
  _common.py          HTML extraction, domain matching, httpx helpers
  rss_native.py       stage 1: feed-tag parser (SRT/VTT/JSON/HTML)
  website_dlf.py      stage 2: deutschlandfunk.de + sister domains
  website_sz.py       stage 2: sz.de / sueddeutsche.de
  website_spiegel.py  stage 2: spiegel.de / manager-magazin.de
  website_ndr.py      stage 2: ndr.de
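
A stage-2 adapter plugs in through two functions; a sketch built on the
_common.py helpers shown further below (the domain list and the container
pattern here are placeholders, not the real Dlf selectors):

    # Hypothetical adapter skeleton; real domains and patterns differ.
    from typing import Optional

    from . import _common

    DOMAINS = ("deutschlandfunk.de",)  # plus sister domains in the real module
    CONTAINER_PATTERNS = [r"<article\b[^>]*>"]  # placeholder selector

    def can_handle(url: str) -> bool:
        return _common.matches_domain(url, DOMAINS)

    async def extract(url: str) -> Optional[str]:
        html = await _common.fetch_html(url)
        if not html:
            return None
        # specific container first, longest <article>/<main> block as fallback
        return (_common.extract_text_by_container(html, CONTAINER_PATTERNS)
                or _common.extract_longest_article_block(html))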
Changed:
- src/database.py: idempotent migration; new table podcast_transcripts as a
  URL cache against repeated scraping across situation runs (sketch after
  this list)
- src/models.py: the source_type Pydantic pattern now also accepts podcast_feed
- src/source_rules.py: get_feeds_with_metadata() takes a source_type
  parameter, default rss_feed (RSS path unchanged)
- src/agents/orchestrator.py: new _podcast_pipeline() alongside RSS,
  WebSearch and Telegram; ad-hoc situation runs only; dormant without
  podcast sources
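
A minimal sketch of the idempotent migration, assuming SQLite and an assumed
column set (the real schema in src/database.py may differ):

    import sqlite3

    def migrate_podcast_transcripts(conn: sqlite3.Connection) -> None:
        # IF NOT EXISTS keeps the migration safe to re-run on every startup
        conn.execute(
            """
            CREATE TABLE IF NOT EXISTS podcast_transcripts (
                url        TEXT PRIMARY KEY,  -- episode URL, the cache key
                transcript TEXT NOT NULL,
                fetched_at TEXT DEFAULT CURRENT_TIMESTAMP
            )
            """
        )
        conn.commit()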
Verification:
- Migration succeeded on the live DB (log: table podcast_transcripts created)
- Import/instantiation test passed for all modules
- can_handle tests per broadcaster adapter: positive and negative cases OK
- Live scrape against Dlf: 22,710 characters; against SZ: 24,918 characters
- Dormant test: 0 podcast sources -> none of the new code runs during refresh
Revertability: purely additive, RSS path untouched; rollback in three steps
(disable the sources, git revert, DROP TABLE podcast_transcripts).
src/feeds/transcript_extractors/_common.py (new file, 170 lines):
"""Shared helpers for the website scrape adapters.

HTML extraction without external dependencies (BeautifulSoup is not in
requirements.txt). Uses regexes for robust plaintext extraction from
typical article containers.
"""
from __future__ import annotations

import logging
import re
from html import unescape
from typing import Optional
from urllib.parse import urlparse

import httpx

logger = logging.getLogger("osint.podcast.extractors.common")


HTTP_TIMEOUT = 20.0
MIN_TRANSCRIPT_LEN = 500  # below 500 characters it is show notes, not a manuscript

DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; OSINT-Monitor/1.0; +https://monitor.aegis-sight.de)",
    "Accept": "text/html,application/xhtml+xml",
    "Accept-Language": "de-DE,de;q=0.9,en;q=0.8",
}


def matches_domain(url: str, domains: tuple[str, ...]) -> bool:
    """Checks whether the URL belongs to one of the known broadcaster domains."""
    if not url:
        return False
    try:
        host = urlparse(url).hostname or ""
        # removeprefix, not lstrip: lstrip("www.") strips characters, so it
        # would also mangle hosts such as welt.de -> elt.de
        host = host.lower().removeprefix("www.")
        return any(host == d or host.endswith("." + d) for d in domains)
    except Exception:
        return False


def episode_url(feed_entry) -> Optional[str]:
    """Returns the episode web page (usually entry.link).

    Accepts a plain dict as well as a feedparser entry object.
    """
    if isinstance(feed_entry, dict):
        return feed_entry.get("link") or feed_entry.get("guid")
    return getattr(feed_entry, "link", None) or getattr(feed_entry, "guid", None)


async def fetch_html(url: str) -> Optional[str]:
    """Fetches a page as text; returns None on any HTTP or network error."""
    async with httpx.AsyncClient(timeout=HTTP_TIMEOUT, follow_redirects=True, headers=DEFAULT_HEADERS) as client:
        try:
            resp = await client.get(url)
            resp.raise_for_status()
            return resp.text
        except Exception as e:
            logger.debug(f"HTML fetch failed ({url}): {e}")
            return None


# --- HTML extraction --------------------------------------------------------

_SCRIPT_STYLE_RE = re.compile(r"<(script|style|noscript|iframe)[^>]*>.*?</\1>", re.DOTALL | re.IGNORECASE)
_COMMENT_RE = re.compile(r"<!--.*?-->", re.DOTALL)
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")


def extract_text_by_container(html: str, container_patterns: list[str]) -> Optional[str]:
    """Extracts text from the first container that matches.

    container_patterns: list of regex patterns matching the opening container
    tag (e.g. r'<article[^>]*class="[^"]*article-body[^"]*"[^>]*>').
    The corresponding closing tag is found internally via tag balancing.
    """
    html_clean = _COMMENT_RE.sub("", _SCRIPT_STYLE_RE.sub("", html))

    for pattern in container_patterns:
        m = re.search(pattern, html_clean, re.IGNORECASE)
        if not m:
            continue
        start = m.start()
        # extract the tag name from the pattern match
        tag_match = re.match(r"<(\w+)", m.group(0))
        if not tag_match:
            continue
        tag_name = tag_match.group(1).lower()
        end = _find_matching_close(html_clean, start, tag_name)
        if end < 0:
            continue
        block = html_clean[start:end]
        text = html_to_text(block)
        if len(text) >= MIN_TRANSCRIPT_LEN:
            return text
    return None


def extract_longest_article_block(html: str) -> Optional[str]:
    """Fallback: return the text of the longest <article> or <main> block.

    Useful when the specific container selectors fail; the whole <body>
    serves as a last resort.
    """
    html_clean = _COMMENT_RE.sub("", _SCRIPT_STYLE_RE.sub("", html))

    # collect all <article> and <main> blocks
    candidates = []
    for tag in ("article", "main"):
        for m in re.finditer(rf"<{tag}\b[^>]*>", html_clean, re.IGNORECASE):
            end = _find_matching_close(html_clean, m.start(), tag)
            if end > m.start():
                candidates.append(html_clean[m.start():end])

    if not candidates:
        # last resort: the whole body
        body_m = re.search(r"<body\b[^>]*>", html_clean, re.IGNORECASE)
        if body_m:
            candidates.append(html_clean[body_m.start():])

    best_text = ""
    for block in candidates:
        text = html_to_text(block)
        if len(text) > len(best_text):
            best_text = text
    return best_text if len(best_text) >= MIN_TRANSCRIPT_LEN else None


def html_to_text(html: str) -> str:
    """Simple HTML -> plaintext conversion."""
    no_tags = _COMMENT_RE.sub("", _SCRIPT_STYLE_RE.sub("", html))
    no_tags = _TAG_RE.sub(" ", no_tags)
    # decode HTML entities (&nbsp;, &amp;, &quot;, umlaut entities, ...) via
    # the stdlib instead of the original hand-written replace() chain
    no_tags = unescape(no_tags)
    return _WHITESPACE_RE.sub(" ", no_tags).strip()


def _find_matching_close(html: str, start: int, tag_name: str) -> int:
    """Finds the closing tag that belongs to the opening tag at `start`.

    Simple counter approach: every further <tag> increments the depth, every
    </tag> decrements it. Returns the index AFTER the closing tag, -1 if
    none is found.
    """
    open_re = re.compile(rf"<{tag_name}\b[^>]*>", re.IGNORECASE)
    close_re = re.compile(rf"</{tag_name}>", re.IGNORECASE)
    depth = 1
    pos = start + 1  # past the '<' of the opening tag so it is not recounted
    while pos < len(html) and depth > 0:
        next_open = open_re.search(html, pos)
        next_close = close_re.search(html, pos)
        if not next_close:
            return -1
        if next_open and next_open.start() < next_close.start():
            depth += 1
            pos = next_open.end()
        else:
            depth -= 1
            pos = next_close.end()
    return pos if depth == 0 else -1
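
A short usage sketch for the helpers above (URL and selector are placeholders):

    import asyncio

    async def demo() -> None:
        html = await fetch_html("https://www.deutschlandfunk.de/some-episode")  # placeholder URL
        if html:
            text = (extract_text_by_container(html, [r"<article\b[^>]*>"])
                    or extract_longest_article_block(html))
            print(len(text or ""), "characters extracted")

    asyncio.run(demo())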