"""Deutschlandfunk: Manuskripte auf den Sender-Websites.
Domains:
- deutschlandfunk.de
- deutschlandfunkkultur.de
- deutschlandfunknova.de
Dlf-Artikel-HTML enthaelt den Manuskript-Text typischerweise in
... mit vielen
. Als Fallback greift der generische
Longest-Article-Block-Extraktor.
"""
from __future__ import annotations
import logging
from typing import Optional
from . import TranscriptResult
from ._common import (
episode_url,
extract_longest_article_block,
extract_text_by_container,
fetch_html,
matches_domain,
)
logger = logging.getLogger("osint.podcast.extractors.dlf")
_DOMAINS = (
"deutschlandfunk.de",
"deutschlandfunkkultur.de",
"deutschlandfunknova.de",
)
_CONTAINER_PATTERNS = [
r'
]*class="[^"]*b-article[^"]*"[^>]*>',
r']*class="[^"]*b-text[^"]*"[^>]*>',
r'
]*>',
r']*>',
]
def can_handle(feed_entry: dict, feed_url: str) -> bool:
    """Return True if the episode URL or the feed URL is on a Dlf domain."""
    candidate = episode_url(feed_entry) or feed_url
    # Check the episode-level URL first, then the feed URL itself.
    return any(matches_domain(u, _DOMAINS) for u in (candidate, feed_url))
async def fetch(feed_entry: dict, feed_url: str) -> Optional[TranscriptResult]:
    """Download a Dlf episode page and extract the manuscript text.

    Returns None when the entry has no URL, the page cannot be fetched,
    or no text can be extracted; otherwise a TranscriptResult tagged as
    a website scrape.
    """
    page_url = episode_url(feed_entry)
    if not page_url:
        return None

    page = await fetch_html(page_url)
    if not page:
        return None

    # Try the Dlf-specific container patterns first; fall back to the
    # generic longest-article-block heuristic.
    body = (
        extract_text_by_container(page, _CONTAINER_PATTERNS)
        or extract_longest_article_block(page)
    )
    if not body:
        return None
    return TranscriptResult(text=body, source="website_scrape")