feat: Blog-Pipeline (Curator + Writer + Push)
Dieser Commit ist enthalten in:
0
src/agents/blog/__init__.py
Normale Datei
0
src/agents/blog/__init__.py
Normale Datei
125
src/agents/blog/blog_curator.py
Normale Datei
125
src/agents/blog/blog_curator.py
Normale Datei
@@ -0,0 +1,125 @@
|
||||
"""BlogCurator -- Wählt tägliche Blog-Themen aus der Monitor-DB."""
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
logger = logging.getLogger("blog.curator")
|
||||
|
||||
DB_PATH = "/mnt/gitea/osint-data/osint.db"
|
||||
|
||||
|
||||
def get_recent_data(hours: int = 24) -> dict:
    """Fetch recent articles, active incidents and fact checks from the monitor DB.

    Args:
        hours: Look-back window; only rows newer than now-minus-``hours`` are
            returned for articles and fact checks.

    Returns:
        Dict with keys ``articles``, ``incidents`` and ``fact_checks``, each a
        list of plain dicts (one per DB row).
    """
    cutoff = (datetime.now(timezone.utc) - timedelta(hours=hours)).isoformat()
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    try:
        # Recent articles, joined with their incident ("Lage") when assigned.
        cursor = conn.execute(
            """SELECT a.headline_de, a.headline, a.source, a.source_url,
                      a.content_de, a.published_at, a.collected_at,
                      i.title as incident_title, i.id as incident_id
               FROM articles a
               LEFT JOIN incidents i ON a.incident_id = i.id
               WHERE a.collected_at > ? OR a.published_at > ?
               ORDER BY a.collected_at DESC LIMIT 100""",
            (cutoff, cutoff),
        )
        articles = [dict(r) for r in cursor.fetchall()]

        # Currently active incidents.
        cursor = conn.execute(
            "SELECT id, title, summary, type, status FROM incidents WHERE status = 'active' ORDER BY updated_at DESC LIMIT 10"
        )
        incidents = [dict(r) for r in cursor.fetchall()]

        # Recent fact checks.
        cursor = conn.execute(
            """SELECT claim, status, evidence, checked_at, incident_id
               FROM fact_checks WHERE checked_at > ? ORDER BY checked_at DESC LIMIT 30""",
            (cutoff,),
        )
        fact_checks = [dict(r) for r in cursor.fetchall()]
    finally:
        # Always release the connection, even when a query raises
        # (the original leaked it on exceptions).
        conn.close()

    return {"articles": articles, "incidents": incidents, "fact_checks": fact_checks}
|
||||
|
||||
|
||||
async def curate_topics(call_claude_fn) -> list[dict]:
    """Select 2-4 blog-worthy topics from recent monitor data.

    Args:
        call_claude_fn: Async callable ``(prompt, tools=..., model=...)``
            returning ``(result_text, usage)``.

    Returns:
        List of topic dicts (``topic``, ``category``, ``angle``, ...), or an
        empty list when there are no fresh articles or the model response
        cannot be parsed as JSON.
    """
    data = get_recent_data()

    if not data["articles"]:
        logger.warning("Keine neuen Artikel in den letzten 24h")
        return []

    # Compact one-line summaries for the prompt.
    article_summary = []
    for a in data["articles"][:50]:
        title = a["headline_de"] or a["headline"] or ""
        source = a["source"] or ""
        incident = a["incident_title"] or ""
        article_summary.append(f"- [{source}] {title} (Lage: {incident})")

    fc_summary = []
    for fc in data["fact_checks"][:15]:
        fc_summary.append(f"- [{fc['status']}] {fc['claim'][:150]}")

    incident_summary = []
    for inc in data["incidents"]:
        summary_short = (inc["summary"] or "")[:200]
        incident_summary.append(f"- Lage #{inc['id']}: {inc['title']} -- {summary_short}")

    prompt = f"""Du bist der Redaktionsleiter des OSINT-Blogs "AegisSight Mosaic".
Wähle aus den folgenden aktuellen OSINT-Daten 2-4 Themen aus, die sich für fundierte Blog-Artikel eignen.

KATEGORIEN (wähle passend):
- OSINT: Nachrichtenanalyse, Quellenauswertung, Faktencheck
- GEOINT: Karten, Satellitenbilder, räumliche Analyse
- CYBINT: Cyber-Bedrohungen, digitale Infrastruktur
- SOCMINT: Desinformation, Narrative, Social-Media-Trends

AKTUELLE LAGEN:
{chr(10).join(incident_summary)}

AKTUELLE ARTIKEL (letzte 24h):
{chr(10).join(article_summary[:30])}

AKTUELLE FAKTENCHECKS:
{chr(10).join(fc_summary)}

REGELN:
- Wähle Themen die für ein breites Publikum interessant sind
- Keine rein technischen oder internen Themen
- Bevorzuge Themen mit mehreren Quellen und Faktenchecks
- Jedes Thema muss einer Kategorie zugeordnet werden
- Gib relevante Artikel-IDs und Incident-IDs als Kontext mit

Antworte als JSON-Array:
[
  {{
    "topic": "Kurzer Thementitel",
    "category": "OSINT|GEOINT|CYBINT|SOCMINT",
    "angle": "Welcher Blickwinkel/welche Story?",
    "key_sources": ["Quellenname 1", "Quellenname 2"],
    "incident_ids": [6, 18],
    "relevance": "Warum ist das jetzt relevant?"
  }}
]"""

    result, usage = await call_claude_fn(prompt, tools=None, model="claude-haiku-4-5-20251001")

    try:
        # Extract the JSON payload; tolerate prose before/after a ```json
        # fence (the original only handled a fence at position 0 of the
        # response and failed whenever the model prefixed any text).
        text = result.strip()
        if "```" in text:
            text = text.split("```")[1]
            if text.lower().startswith("json"):
                text = text[4:]
            text = text.strip()
        topics = json.loads(text)
        logger.info(f"Curator: {len(topics)} Themen ausgewählt (${usage.cost_usd:.4f})")
        return topics
    except (json.JSONDecodeError, IndexError) as e:
        logger.error(f"Curator JSON-Parse-Fehler: {e}\nRaw: {result[:500]}")
        return []
|
||||
103
src/agents/blog/blog_pipeline.py
Normale Datei
103
src/agents/blog/blog_pipeline.py
Normale Datei
@@ -0,0 +1,103 @@
|
||||
"""Blog-Pipeline: Curator -> Writer -> Push zum Blog."""
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import ssl
|
||||
import sys
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
|
||||
# Projekt-Root zum Path hinzufügen
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent.parent))
|
||||
|
||||
from src.agents.claude_client import call_claude
|
||||
from src.agents.blog.blog_curator import curate_topics
|
||||
from src.agents.blog.blog_writer import write_article
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
|
||||
handlers=[
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
logger = logging.getLogger("blog.pipeline")
|
||||
|
||||
BLOG_API_URL = "https://blog.aegis-sight.de/api/ingest/drafts"
|
||||
API_KEY_FILE = "/home/claude-dev/.blog-api-key"
|
||||
|
||||
|
||||
def read_api_key() -> str:
|
||||
try:
|
||||
return open(API_KEY_FILE).read().strip()
|
||||
except FileNotFoundError:
|
||||
logger.error(f"API-Key-Datei nicht gefunden: {API_KEY_FILE}")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def push_to_blog(articles: list[dict], api_key: str) -> dict:
    """POST article drafts to the blog ingest API and return the parsed reply.

    Args:
        articles: Draft dicts as produced by the writer.
        api_key: Value for the ``X-API-Key`` header.

    Returns:
        The JSON-decoded API response.
    """
    payload = json.dumps({"articles": articles}).encode("utf-8")
    headers = {
        "Content-Type": "application/json",
        "X-API-Key": api_key,
    }
    request = urllib.request.Request(
        BLOG_API_URL,
        data=payload,
        headers=headers,
        method="POST",
    )
    tls_context = ssl.create_default_context()
    with urllib.request.urlopen(request, timeout=30, context=tls_context) as response:
        raw = response.read()
    return json.loads(raw.decode("utf-8"))
|
||||
|
||||
|
||||
async def run_pipeline():
    """Run the full blog pipeline: curate topics, write articles, push drafts.

    Each stage short-circuits the pipeline when it produces nothing; a failed
    push is logged but does not raise.
    """
    logger.info("=== Blog-Pipeline gestartet ===")

    # 1. Select topics.
    logger.info("Schritt 1: Themen auswählen...")
    topics = await curate_topics(call_claude)
    if not topics:
        logger.warning("Keine Themen ausgewählt -- Pipeline beendet")
        return

    logger.info(f"{len(topics)} Themen ausgewählt: {[t['topic'] for t in topics]}")

    # 2. Write one article per topic; skip (but log) individual failures.
    logger.info("Schritt 2: Artikel schreiben...")
    articles = []
    for topic in topics:
        logger.info(f"Schreibe: {topic['topic']} ({topic['category']})")
        article = await write_article(topic, call_claude)
        if article:
            articles.append(article)
        else:
            logger.warning(f"Artikel fehlgeschlagen: {topic['topic']}")

    if not articles:
        logger.warning("Keine Artikel geschrieben -- Pipeline beendet")
        return

    logger.info(f"{len(articles)} Artikel geschrieben")

    # 3. Push to the blog.
    logger.info("Schritt 3: An Blog pushen...")
    api_key = read_api_key()
    try:
        result = push_to_blog(articles, api_key)
        # .get() on both counters: a missing field in the API response should
        # not be reported as a failed push (the original raised KeyError on
        # 'accepted' while already using .get for 'rejected').
        logger.info(f"Push-Ergebnis: {result.get('accepted', 0)} akzeptiert, {result.get('rejected', 0)} abgelehnt")
    except Exception as e:
        logger.error(f"Push fehlgeschlagen: {e}")
        return

    logger.info("=== Blog-Pipeline abgeschlossen ===")
|
||||
|
||||
|
||||
def main():
    """Synchronous entry point: drive the async pipeline to completion."""
    asyncio.run(run_pipeline())


# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|
||||
136
src/agents/blog/blog_writer.py
Normale Datei
136
src/agents/blog/blog_writer.py
Normale Datei
@@ -0,0 +1,136 @@
|
||||
"""BlogWriter -- Schreibt Blog-Artikel aus Curator-Themen."""
|
||||
import json
|
||||
import logging
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
logger = logging.getLogger("blog.writer")
|
||||
|
||||
DB_PATH = "/mnt/gitea/osint-data/osint.db"
|
||||
|
||||
|
||||
def get_context_for_topic(topic: dict) -> str:
    """Collect DB context (incidents, articles, fact checks) for one topic.

    Args:
        topic: Curator topic dict; an optional ``incident_ids`` list scopes
            the article and fact-check queries.

    Returns:
        Markdown-formatted context string (may be empty when nothing matches).
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row

    context_parts = []
    try:
        # Incident summaries.
        for inc_id in topic.get("incident_ids", []):
            cursor = conn.execute(
                "SELECT title, summary, updated_at FROM incidents WHERE id = ?", (inc_id,)
            )
            inc = cursor.fetchone()
            if inc:
                # sqlite3.Row supports direct indexing; the original's
                # dict(inc).get(...) round-trip was redundant.
                summary = (inc["summary"] or "")[:2000]
                context_parts.append(f"## Lage: {inc['title']}\n{summary}")

        # Relevant articles from the last 48h, scoped to the topic's
        # incidents when given.
        cutoff = (datetime.now(timezone.utc) - timedelta(hours=48)).isoformat()
        incident_ids = topic.get("incident_ids", [])
        if incident_ids:
            placeholders = ",".join("?" for _ in incident_ids)
            cursor = conn.execute(
                f"""SELECT headline_de, headline, source, source_url, content_de, published_at
                    FROM articles WHERE incident_id IN ({placeholders}) AND (collected_at > ? OR published_at > ?)
                    ORDER BY collected_at DESC LIMIT 20""",
                (*incident_ids, cutoff, cutoff),
            )
        else:
            cursor = conn.execute(
                """SELECT headline_de, headline, source, source_url, content_de, published_at
                   FROM articles WHERE collected_at > ? OR published_at > ?
                   ORDER BY collected_at DESC LIMIT 20""",
                (cutoff, cutoff),
            )
        articles = [dict(r) for r in cursor.fetchall()]

        if articles:
            art_text = []
            for a in articles:
                title = a["headline_de"] or a["headline"] or ""
                content = (a["content_de"] or "")[:500]
                source = a["source"] or ""
                url = a["source_url"] or ""
                art_text.append(f"### {title}\nQuelle: {source} ({url})\n{content}")
            context_parts.append("## Aktuelle Berichte\n" + "\n\n".join(art_text[:10]))

        # Relevant fact checks (only meaningful when incidents are known).
        if incident_ids:
            placeholders = ",".join("?" for _ in incident_ids)
            cursor = conn.execute(
                f"""SELECT claim, status, evidence FROM fact_checks
                    WHERE incident_id IN ({placeholders}) ORDER BY checked_at DESC LIMIT 10""",
                incident_ids,
            )
            fcs = [dict(r) for r in cursor.fetchall()]
            if fcs:
                fc_text = []
                for fc in fcs:
                    evidence = (fc["evidence"] or "")[:300]
                    fc_text.append(f"- [{fc['status']}] {fc['claim']}\n  Evidenz: {evidence}")
                context_parts.append("## Faktenchecks\n" + "\n".join(fc_text))
    finally:
        # Close even when a query raises (the original leaked the connection).
        conn.close()

    return "\n\n".join(context_parts)
|
||||
|
||||
|
||||
async def write_article(topic: dict, call_claude_fn) -> dict | None:
    """Write one blog article for a curated topic.

    Args:
        topic: Curator topic dict with at least ``topic`` and ``category``.
        call_claude_fn: Async callable ``(prompt, tools=..., model=...)``
            returning ``(result_text, usage)``.

    Returns:
        Article dict (``title``, ``content_markdown``, ``meta_description``,
        ``sources``, ``geo_data``, plus ``category`` and
        ``monitor_event_ids``) or ``None`` when the response cannot be parsed.
    """
    context = get_context_for_topic(topic)
    today = datetime.now(timezone.utc).strftime('%Y-%m-%d')

    prompt = f"""Du bist Journalist beim OSINT-Blog "AegisSight Mosaic". Schreibe einen fundierten Artikel auf Deutsch.

THEMA: {topic["topic"]}
KATEGORIE: {topic["category"]}
BLICKWINKEL: {topic.get("angle", "")}

KONTEXT AUS DER OSINT-ANALYSE:
{context}

REGELN FÜR DEN ARTIKEL:
1. Schreibe wie ein professioneller Journalist, KEIN Lagebericht-Stil
2. Erzählerischer Fließtext mit Kontext und Einordnung
3. Verwende echte Umlaute (ü, ä, ö, ß)
4. Markdown-Format: ## für Zwischenüberschriften, **fett** für Betonung
5. 800-1500 Wörter, gut strukturiert
6. Nenne und verlinke Quellen im Text wo möglich
7. Meta-Description: 1 Satz, max 155 Zeichen, für Suchmaschinen
8. Am Ende: Einordnung/Ausblick (was bedeutet das?)

Antworte als JSON:
{{
  "title": "Aussagekräftiger Titel (kein Clickbait)",
  "content_markdown": "## Kompletter Artikel in Markdown...",
  "meta_description": "Kurze Beschreibung für SEO (max 155 Zeichen)",
  "sources": [
    {{"title": "Quellenname", "url": "https://...", "accessed_at": "{today}"}}
  ],
  "geo_data": null
}}

Falls das Thema einen geographischen Bezug hat, fülle geo_data:
{{
  "center": [lat, lng],
  "zoom": 5,
  "markers": [{{"lat": 0, "lng": 0, "label": "Ort", "popup": "Beschreibung"}}]
}}"""

    result, usage = await call_claude_fn(prompt, tools="WebSearch,WebFetch", model=None)

    try:
        # Extract the JSON payload; tolerate prose before/after a ```json
        # fence (the original only handled a fence at the very start of the
        # response and failed whenever the model prefixed any text).
        text = result.strip()
        if "```" in text:
            text = text.split("```")[1]
            if text.lower().startswith("json"):
                text = text[4:]
            text = text.strip()
        article = json.loads(text)
        article["category"] = topic["category"]
        article["monitor_event_ids"] = topic.get("incident_ids", [])
        logger.info(f"Writer: Artikel '{article['title']}' geschrieben (${usage.cost_usd:.4f})")
        return article
    except (json.JSONDecodeError, IndexError, KeyError) as e:
        logger.error(f"Writer JSON-Parse-Fehler: {e}\nRaw: {result[:500]}")
        return None
|
||||
In neuem Issue referenzieren
Einen Benutzer sperren