"""Deutschlandfunk: Manuskripte auf den Sender-Websites.
Domains:
- deutschlandfunk.de
- deutschlandfunkkultur.de
- deutschlandfunknova.de
Dlf-Artikel-HTML enthaelt den Manuskript-Text typischerweise in
... mit vielen
. Als Fallback greift der generische
Longest-Article-Block-Extraktor.
"""
from __future__ import annotations
import logging
from typing import Optional
from . import TranscriptResult
from ._common import (
episode_url,
extract_longest_article_block,
extract_text_by_container,
fetch_html,
matches_domain,
)
logger = logging.getLogger("osint.podcast.extractors.dlf")
_DOMAINS = (
"deutschlandfunk.de",
"deutschlandfunkkultur.de",
"deutschlandfunknova.de",
)
_CONTAINER_PATTERNS = [
r'
]*class="[^"]*b-article[^"]*"[^>]*>',
r']*class="[^"]*b-text[^"]*"[^>]*>',
r'
]*>',
r']*>',
]
def can_handle(feed_entry: dict, feed_url: str) -> bool:
    """Return True if the episode URL or the feed URL is on a Dlf domain."""
    candidate = episode_url(feed_entry) or feed_url
    # Check the episode-level URL first, then the feed URL itself.
    return any(matches_domain(u, _DOMAINS) for u in (candidate, feed_url))
async def fetch(feed_entry: dict, feed_url: str) -> Optional[TranscriptResult]:
    """Download a Dlf episode page and extract the manuscript text.

    Returns None when the entry has no URL, the page cannot be fetched,
    or no text can be extracted; otherwise a TranscriptResult tagged as
    a website scrape.
    """
    page_url = episode_url(feed_entry)
    if not page_url:
        return None

    page = await fetch_html(page_url)
    if not page:
        return None

    # Try the Dlf-specific container patterns first; fall back to the
    # generic longest-article-block heuristic.
    body = (
        extract_text_by_container(page, _CONTAINER_PATTERNS)
        or extract_longest_article_block(page)
    )
    if not body:
        return None
    return TranscriptResult(text=body, source="website_scrape")