"""Deutschlandfunk: Manuskripte auf den Sender-Websites. Domains: - deutschlandfunk.de - deutschlandfunkkultur.de - deutschlandfunknova.de Dlf-Artikel-HTML enthaelt den Manuskript-Text typischerweise in
...
mit vielen

-Absaetzen oder als

. Als Fallback greift der generische Longest-Article-Block-Extraktor. """ from __future__ import annotations import logging from typing import Optional from . import TranscriptResult from ._common import ( episode_url, extract_longest_article_block, extract_text_by_container, fetch_html, matches_domain, ) logger = logging.getLogger("osint.podcast.extractors.dlf") _DOMAINS = ( "deutschlandfunk.de", "deutschlandfunkkultur.de", "deutschlandfunknova.de", ) _CONTAINER_PATTERNS = [ r']*class="[^"]*b-article[^"]*"[^>]*>', r']*class="[^"]*b-text[^"]*"[^>]*>', r']*>', r']*>', ] def can_handle(feed_entry: dict, feed_url: str) -> bool: url = episode_url(feed_entry) or feed_url return matches_domain(url, _DOMAINS) or matches_domain(feed_url, _DOMAINS) async def fetch(feed_entry: dict, feed_url: str) -> Optional[TranscriptResult]: url = episode_url(feed_entry) if not url: return None html = await fetch_html(url) if not html: return None text = extract_text_by_container(html, _CONTAINER_PATTERNS) if not text: text = extract_longest_article_block(html) if not text: return None return TranscriptResult(text=text, source="website_scrape")