AegisSight-Monitor/src/agents/entity_extractor.py
Claude Dev e64447ab7f Removed GEOINT mode from the monitor
Will be rebuilt as a standalone application on a separate subdomain.
All GEOINT files removed; dashboard.html/components.js/main.py
reset to their pre-GEOINT state.
2026-03-24 11:06:19 +01:00


"""Netzwerkanalyse: Entity-Extraktion (Sonnet) + Beziehungsanalyse (Batched) mit Artikel-Deduplizierung."""
import asyncio
import hashlib
import json
import logging
import re
from collections import defaultdict
from datetime import datetime
from typing import Optional
from agents.claude_client import call_claude, ClaudeUsage, UsageAccumulator
from config import CLAUDE_MODEL_FAST, CLAUDE_MODEL_MEDIUM, TIMEZONE
logger = logging.getLogger("osint.entity_extractor")
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
TYPE_PRIORITY = {"location": 5, "organisation": 4, "military": 3, "event": 2, "person": 1}
_STOP_WORDS = frozenset({
"the", "of", "and", "for", "in", "on", "at", "to", "by",
"von", "der", "die", "das", "und", "für", "des", "den", "dem",
"ein", "eine", "zur", "zum", "bei", "mit", "aus", "nach",
})
# ---------------------------------------------------------------------------
# Prompts
# ---------------------------------------------------------------------------
ENTITY_EXTRACTION_PROMPT = """You are an OSINT analyst for a situation-monitoring system.
TASK: Extract ALL relevant entities from the following news articles.
ARTICLES:
{articles_text}
RULES:
- Extract EVERY mentioned person, organisation, location, event, and military unit
- Normalise names: "Wladimir Putin", "Putin", "V. Putin" -> one entity
- Capture aliases: record all name variants of an entity in aliases[]
- mention_count: how often the entity is mentioned across all articles in total
- description: a brief classification of who/what the entity is (1 sentence)
- NO duplicates: merge identical entities
ENTITY TYPES:
- "person": individual people (politicians, military officers, journalists, etc.)
- "organisation": organisations, parties, government agencies, companies, NGOs
- "location": countries, cities, regions, territories
- "event": concrete events (elections, attacks, conferences, etc.)
- "military": military units, weapon systems, operations
OUTPUT FORMAT: respond ONLY with this JSON:
{{
"entities": [
{{
"name": "Full name",
"name_normalized": "full name",
"type": "person|organisation|location|event|military",
"description": "Brief classification (1 sentence)",
"aliases": ["Alias1", "Alias2"],
"mention_count": 5
}}
]
}}"""
RELATIONSHIP_BATCH_PROMPT = """You are a senior OSINT analyst for a situation-monitoring system.
TASK: Analyse the relationships between the entities based on the articles.
KNOWN ENTITIES (from the full dataset):
{entities_json}
ARTICLES:
{articles_text}
TASK:
Identify ALL relationships between the entities listed above that emerge from the articles.
- Only report relationships that are evidenced in the article text
- source and target: use the names exactly as they appear in the entity list
- If an entity appears in an article under a variant name, still use the name as it appears in the list
RELATIONSHIP CATEGORIES:
- "alliance": alliance, cooperation, support, partnership
- "conflict": conflict, war, hostility, sanctions, opposition
- "diplomacy": diplomatic relations, negotiations, agreements
- "economic": economic relations, trade, investments
- "legal": legal relations, lawsuits, convictions
- "neutral": other relationship, mention, connection
RULES:
- weight: 1 (weak/indirect) to 5 (strong/direct)
- evidence[]: 1-2 short bullet points from the article text as supporting evidence
- status: "active" (current), "historical" (past), "emerging" (developing)
OUTPUT FORMAT: respond ONLY with this JSON:
{{
"relations": [
{{
"source": "Entity A",
"target": "Entity B",
"category": "alliance|conflict|diplomacy|economic|legal|neutral",
"label": "Short label (2-4 words)",
"description": "Description (1-2 sentences)",
"weight": 3,
"status": "active|historical|emerging",
"evidence": ["Evidence 1", "Evidence 2"]
}}
]
}}"""
SEMANTIC_DEDUP_PROMPT = """You are an OSINT analyst. Check these entities for duplicates.
ENTITIES:
{entity_list}
TASK: Which entities denote the SAME real-world object?
Typical duplicates:
- Abbreviation vs. full name: "IRGC" = "Iranian Revolutionary Guard Corps"
- Language variants: "European Union" = "Europäische Union"
- Spelling variants: "Strait of Hormuz" = "Straße von Hormus"
RULES:
- Merge ONLY true duplicates (the same real-world entity)
- Do NOT merge: sub-organisations (IRGC ≠ IRGC Navy), different people
- "keep": number of the main entity (prefer more mentions and the fuller name)
- "merge": numbers of the duplicates to merge into the main entity
OUTPUT FORMAT: respond ONLY with this JSON:
{{
"merges": [
{{"keep": 1, "merge": [3, 5]}},
{{"keep": 2, "merge": [4]}}
]
}}
If there are NO duplicates: {{"merges": []}}"""
# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------
async def _broadcast(ws_manager, msg_type: str, data: dict):
"""Sendet eine WebSocket-Nachricht, falls ws_manager vorhanden."""
if ws_manager:
try:
await ws_manager.broadcast({"type": msg_type, **data})
except Exception:
pass
def _parse_json_response(text: str) -> Optional[dict]:
"""Parst JSON aus Claude-Antwort. Handhabt Markdown-Fences."""
if not text:
return None
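    # Stage 1: the whole response is already valid JSON.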
try:
return json.loads(text)
except json.JSONDecodeError:
pass
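    # Stage 2: JSON inside a Markdown code fence.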
fence_match = re.search(r'```(?:json)?\s*\n?(.*?)\n?\s*```', text, re.DOTALL)
if fence_match:
try:
return json.loads(fence_match.group(1))
except json.JSONDecodeError:
pass
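    # Stage 3: fall back to the first brace-delimited span.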
obj_match = re.search(r'\{.*\}', text, re.DOTALL)
if obj_match:
try:
return json.loads(obj_match.group())
except json.JSONDecodeError:
pass
logger.warning("JSON-Parse fehlgeschlagen")
return None
async def _check_analysis_exists(db, analysis_id: int) -> bool:
"""Prüft ob die Analyse noch existiert."""
cursor = await db.execute(
"SELECT id FROM network_analyses WHERE id = ?", (analysis_id,)
)
return await cursor.fetchone() is not None
def _compute_data_hash(article_ids, factcheck_ids, article_ts, factcheck_ts) -> str:
"""SHA256-Hash über sortierte IDs und Timestamps."""
parts = []
for aid, ats in sorted(zip(article_ids, article_ts)):
parts.append(f"a:{aid}:{ats}")
for fid, fts in sorted(zip(factcheck_ids, factcheck_ts)):
parts.append(f"f:{fid}:{fts}")
return hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()
# ---------------------------------------------------------------------------
# Article deduplication
# ---------------------------------------------------------------------------
def _normalize_headline(headline: str) -> str:
"""Normalisiert eine Headline fuer Vergleiche."""
h = headline.lower().strip()
h = re.sub(r"[^a-z0-9\s]", "", h)
h = re.sub(r"\s+", " ", h).strip()
return h
def _headline_tokens(headline: str) -> set[str]:
"""Extrahiert bedeutungstragende Tokens aus einer Headline."""
tokens = set()
for word in _normalize_headline(headline).split():
if len(word) >= 3 and word not in _STOP_WORDS:
tokens.add(word)
return tokens
def _jaccard_similarity(set_a: set, set_b: set) -> float:
"""Jaccard-Aehnlichkeit zweier Mengen."""
if not set_a or not set_b:
return 0.0
intersection = set_a & set_b
union = set_a | set_b
return len(intersection) / len(union) if union else 0.0
def _content_fingerprint(text: str) -> str:
"""Kurzer Hash des Textinhalts fuer Near-Duplicate-Erkennung."""
normalized = re.sub(r"\s+", " ", text.lower().strip())[:500]
return hashlib.md5(normalized.encode("utf-8")).hexdigest()
def _deduplicate_articles(articles: list[dict], factchecks: list[dict]) -> tuple[list[dict], list[dict]]:
"""Entfernt redundante Artikel basierend auf Headline-Similarity und Content-Hash.
Behaelt pro Duplikat-Gruppe den Artikel mit dem laengsten Content.
Faktenchecks werden nicht dedupliziert (sind bereits einzigartig).
Returns:
Tuple von (deduplizierte_artikel, factchecks_unveraendert)
"""
if len(articles) <= 50:
return articles, factchecks
logger.info(f"Artikel-Dedup: {len(articles)} Artikel pruefen")
# Phase A: Exakte Content-Fingerprint-Dedup
seen_fingerprints: dict[str, int] = {}
for i, art in enumerate(articles):
content = art.get("content_de") or art.get("content_original") or ""
headline = art.get("headline_de") or art.get("headline") or ""
if not content and not headline:
continue
fp = _content_fingerprint(headline + " " + content)
if fp in seen_fingerprints:
existing_idx = seen_fingerprints[fp]
existing_content = articles[existing_idx].get("content_de") or articles[existing_idx].get("content_original") or ""
if len(content) > len(existing_content):
seen_fingerprints[fp] = i
else:
seen_fingerprints[fp] = i
after_fp = list(seen_fingerprints.values())
fp_removed = len(articles) - len(after_fp)
    # Phase B: headline-similarity dedup (Jaccard >= 0.7)
remaining = [articles[i] for i in sorted(after_fp)]
token_sets = []
for art in remaining:
headline = art.get("headline_de") or art.get("headline") or ""
token_sets.append(_headline_tokens(headline))
keep_mask = [True] * len(remaining)
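    # Pairwise O(n^2) comparison; the fingerprint pass above keeps n manageable.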
for i in range(len(remaining)):
if not keep_mask[i]:
continue
for j in range(i + 1, len(remaining)):
if not keep_mask[j]:
continue
if _jaccard_similarity(token_sets[i], token_sets[j]) >= 0.7:
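                # Near-duplicate pair: keep whichever article has the longer content.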
content_i = remaining[i].get("content_de") or remaining[i].get("content_original") or ""
content_j = remaining[j].get("content_de") or remaining[j].get("content_original") or ""
if len(content_j) > len(content_i):
keep_mask[i] = False
break
else:
keep_mask[j] = False
deduped = [art for art, keep in zip(remaining, keep_mask) if keep]
headline_removed = len(remaining) - len(deduped)
logger.info(
f"Artikel-Dedup abgeschlossen: {len(articles)} -> {len(deduped)} "
f"({fp_removed} Content-Duplikate, {headline_removed} Headline-Duplikate entfernt)"
)
return deduped, factchecks
# ---------------------------------------------------------------------------
# Entity-Merge Helper
# ---------------------------------------------------------------------------
async def _merge_entity_in_db(
db, analysis_id: int, keep_ent: dict, merge_ent: dict, entities: list[dict],
):
"""Führt merge_ent in keep_ent zusammen (DB + In-Memory)."""
keep_id = keep_ent["db_id"]
merge_id = merge_ent["db_id"]
    # Union the alias sets
aliases = set(keep_ent.get("aliases", []))
aliases.add(merge_ent["name"])
for a in merge_ent.get("aliases", []):
if a and a.strip():
aliases.add(a.strip())
aliases.discard(keep_ent["name"])
keep_ent["aliases"] = list(aliases)
    # Add the mention counts
keep_ent["mention_count"] = keep_ent.get("mention_count", 0) + merge_ent.get("mention_count", 0)
    # Keep the longer description
if len(merge_ent.get("description", "")) > len(keep_ent.get("description", "")):
keep_ent["description"] = merge_ent["description"]
    # Re-point entity mentions
await db.execute(
"UPDATE network_entity_mentions SET entity_id = ? WHERE entity_id = ?",
(keep_id, merge_id),
)
    # Re-point relations
await db.execute(
"UPDATE network_relations SET source_entity_id = ? WHERE source_entity_id = ? AND network_analysis_id = ?",
(keep_id, merge_id, analysis_id),
)
await db.execute(
"UPDATE network_relations SET target_entity_id = ? WHERE target_entity_id = ? AND network_analysis_id = ?",
(keep_id, merge_id, analysis_id),
)
    # Remove self-loops that the merge may have created
await db.execute(
"DELETE FROM network_relations WHERE source_entity_id = ? AND target_entity_id = ? AND network_analysis_id = ?",
(keep_id, keep_id, analysis_id),
)
    # Update the kept entity in the DB
await db.execute(
"""UPDATE network_entities
SET aliases = ?, mention_count = ?, description = ?
WHERE id = ?""",
(json.dumps(keep_ent["aliases"], ensure_ascii=False),
keep_ent["mention_count"], keep_ent.get("description", ""), keep_id),
)
    # Delete the merged-away entity
await db.execute("DELETE FROM network_entities WHERE id = ?", (merge_id,))
    # Remove it from the entities list
try:
entities.remove(merge_ent)
except ValueError:
pass
# ---------------------------------------------------------------------------
# Phase 1: entity extraction (Sonnet)
# ---------------------------------------------------------------------------
async def _phase1_extract_entities(
db, analysis_id: int, tenant_id: int,
articles: list[dict], factchecks: list[dict],
usage_acc: UsageAccumulator, ws_manager=None,
) -> list[dict]:
"""Extrahiert Entitäten aus Artikeln via Haiku in Batches."""
logger.info(f"Phase 1: {len(articles)} Artikel, {len(factchecks)} Faktenchecks")
all_texts = []
for art in articles:
headline = art.get("headline_de") or art.get("headline") or ""
content = art.get("content_de") or art.get("content_original") or ""
source = art.get("source") or ""
if len(content) > 800:
content = content[:800] + "..."
all_texts.append(f"[{source}] {headline}\n{content}")
for fc in factchecks:
claim = fc.get("claim") or ""
evidence = fc.get("evidence") or ""
status = fc.get("status") or ""
all_texts.append(f"[Faktencheck] {claim} (Status: {status})\n{evidence}")
if not all_texts:
logger.warning(f"Analyse {analysis_id}: Keine Texte vorhanden")
return []
batch_size = 50
batches = [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]
logger.info(f"{len(all_texts)} Texte in {len(batches)} Batches")
entity_map: dict[str, dict] = {}
for batch_idx, batch in enumerate(batches):
articles_text = "\n\n---\n\n".join(batch)
prompt = ENTITY_EXTRACTION_PROMPT.format(articles_text=articles_text)
try:
result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_MEDIUM)
usage_acc.add(usage)
except Exception as e:
logger.error(f"Sonnet Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
continue
parsed = _parse_json_response(result_text)
if not parsed or "entities" not in parsed:
logger.warning(f"Batch {batch_idx + 1}: Kein gültiges JSON")
continue
entities = parsed["entities"]
if not isinstance(entities, list):
continue
for ent in entities:
if not isinstance(ent, dict):
continue
name = (ent.get("name") or "").strip()
if not name:
continue
name_normalized = (ent.get("name_normalized") or name.lower()).strip().lower()
entity_type = (ent.get("type") or "organisation").lower().strip()
valid_types = {"person", "organisation", "location", "event", "military"}
if entity_type not in valid_types:
entity_type = "organisation"
key = name_normalized
if key in entity_map:
existing = entity_map[key]
aliases = set(existing.get("aliases", []))
for alias in ent.get("aliases", []):
if alias and alias.strip():
aliases.add(alias.strip())
if name != existing["name"]:
aliases.add(name)
existing["aliases"] = list(aliases)
existing["mention_count"] = existing.get("mention_count", 1) + ent.get("mention_count", 1)
new_desc = ent.get("description", "")
if len(new_desc) > len(existing.get("description", "")):
existing["description"] = new_desc
                # Type priority: keep the higher-ranked type
if TYPE_PRIORITY.get(entity_type, 0) > TYPE_PRIORITY.get(existing["type"], 0):
existing["type"] = entity_type
else:
entity_map[key] = {
"name": name,
"name_normalized": name_normalized,
"type": entity_type,
"description": ent.get("description", ""),
"aliases": [a.strip() for a in ent.get("aliases", []) if a and a.strip()],
"mention_count": ent.get("mention_count", 1),
}
logger.info(f"Batch {batch_idx + 1}/{len(batches)}: {len(entity_map)} Entitäten gesamt")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "entity_extraction",
"progress": int((batch_idx + 1) / len(batches) * 100),
})
all_entities = list(entity_map.values())
for ent in all_entities:
try:
cursor = await db.execute(
"""INSERT OR IGNORE INTO network_entities
(network_analysis_id, name, name_normalized, entity_type,
description, aliases, mention_count, tenant_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?)""",
(
analysis_id, ent["name"], ent["name_normalized"], ent["type"],
ent.get("description", ""),
json.dumps(ent.get("aliases", []), ensure_ascii=False),
ent.get("mention_count", 1),
tenant_id,
),
)
ent["db_id"] = cursor.lastrowid
except Exception as e:
logger.warning(f"Entity speichern fehlgeschlagen '{ent['name']}': {e}")
await db.commit()
logger.info(f"Phase 1 abgeschlossen: {len(all_entities)} Entitäten gespeichert")
return all_entities
# ---------------------------------------------------------------------------
# Phase 2a: entity deduplication by name_normalized
# ---------------------------------------------------------------------------
async def _phase2a_deduplicate_entities(
db, analysis_id: int, entities: list[dict], ws_manager=None,
) -> None:
"""Dedupliziert Entities mit gleichem name_normalized (unabhängig vom Typ)."""
logger.info(f"Phase 2a: Prüfe {len(entities)} Entitäten auf name_normalized-Duplikate")
groups = defaultdict(list)
for ent in entities:
if ent.get("db_id"):
groups[ent["name_normalized"]].append(ent)
merge_count = 0
merged_ids = set()
for nn, group in groups.items():
if len(group) < 2:
continue
        # Sort by highest type priority, then by most mentions
group.sort(
key=lambda e: (TYPE_PRIORITY.get(e["type"], 0), e.get("mention_count", 0)),
reverse=True,
)
keep = group[0]
for merge in group[1:]:
if merge["db_id"] in merged_ids:
continue
await _merge_entity_in_db(db, analysis_id, keep, merge, entities)
merged_ids.add(merge["db_id"])
merge_count += 1
if merge_count > 0:
await db.commit()
logger.info(
f"Phase 2a abgeschlossen: {merge_count} Duplikate zusammengeführt, "
f"{len(entities)} Entitäten verbleiben"
)
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "entity_dedup",
"progress": 100,
})
# ---------------------------------------------------------------------------
# Phase 2: relationship analysis (batched, per article batch)
# ---------------------------------------------------------------------------
def _build_entity_name_map(entities: list[dict]) -> dict[str, int]:
"""Mapping: normalisierter Name/Alias -> DB-ID."""
name_to_id: dict[str, int] = {}
for ent in entities:
db_id = ent.get("db_id")
if not db_id:
continue
name_to_id[ent["name"].lower()] = db_id
name_to_id[ent["name_normalized"]] = db_id
for alias in ent.get("aliases", []):
if alias and alias.strip():
name_to_id[alias.strip().lower()] = db_id
return name_to_id
def _find_relevant_entities(batch_texts: list[str], entities: list[dict]) -> list[dict]:
"""Findet Entitäten, die in den Batch-Texten vorkommen."""
combined_text = " ".join(batch_texts).lower()
relevant = []
for ent in entities:
if ent["name"].lower() in combined_text or ent["name_normalized"] in combined_text:
relevant.append(ent)
continue
for alias in ent.get("aliases", []):
if alias and alias.strip().lower() in combined_text:
relevant.append(ent)
break
return relevant
async def _phase2_analyze_relationships(
db, analysis_id: int, tenant_id: int,
entities: list[dict], articles: list[dict], factchecks: list[dict],
usage_acc: UsageAccumulator, ws_manager=None,
) -> list[dict]:
"""Analysiert Beziehungen batch-weise und merged die Ergebnisse."""
if not entities:
return []
logger.info(f"Phase 2: {len(entities)} Entitäten, batched Beziehungsanalyse")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "relationship_extraction",
"progress": 0,
})
    # --- Prepare the texts (same logic as Phase 1) ---
all_texts = []
for art in articles:
headline = art.get("headline_de") or art.get("headline") or ""
content = art.get("content_de") or art.get("content_original") or ""
source = art.get("source") or ""
if len(content) > 800:
content = content[:800] + "..."
all_texts.append(f"[{source}] {headline}\n{content}")
for fc in factchecks:
claim = fc.get("claim") or ""
evidence = fc.get("evidence") or ""
status = fc.get("status") or ""
all_texts.append(f"[Faktencheck] {claim} (Status: {status})\n{evidence}")
if not all_texts:
return []
    # --- Stage A: per-batch relationship extraction ---
batch_size = 50
batches = [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]
logger.info(f"Stufe A: {len(batches)} Batches für Beziehungsextraktion")
all_raw_relations: list[dict] = []
name_to_id = _build_entity_name_map(entities)
for batch_idx, batch in enumerate(batches):
relevant = _find_relevant_entities(batch, entities)
if len(relevant) < 2:
logger.debug(f"Batch {batch_idx + 1}: Weniger als 2 Entitäten, überspringe")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "relationship_extraction",
"progress": int((batch_idx + 1) / len(batches) * 70),
})
continue
entities_for_prompt = [
{"name": e["name"], "type": e["type"]}
for e in relevant
]
entities_json = json.dumps(entities_for_prompt, ensure_ascii=False)
articles_text = "\n\n---\n\n".join(batch)
prompt = RELATIONSHIP_BATCH_PROMPT.format(
entities_json=entities_json,
articles_text=articles_text,
)
try:
result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_MEDIUM)
usage_acc.add(usage)
except Exception as e:
logger.error(f"Relationship Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
continue
parsed = _parse_json_response(result_text)
if not parsed:
logger.warning(f"Batch {batch_idx + 1}: Kein gültiges JSON")
continue
relations = parsed.get("relations", [])
if not isinstance(relations, list):
continue
batch_count = 0
for rel in relations:
if not isinstance(rel, dict):
continue
source_name = (rel.get("source") or "").strip()
target_name = (rel.get("target") or "").strip()
if not source_name or not target_name:
continue
rel["_batch"] = batch_idx
all_raw_relations.append(rel)
batch_count += 1
logger.info(f"Batch {batch_idx + 1}/{len(batches)}: {batch_count} Beziehungen, {len(relevant)} Entitäten")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "relationship_extraction",
"progress": int((batch_idx + 1) / len(batches) * 70),
})
logger.info(f"Stufe A abgeschlossen: {len(all_raw_relations)} rohe Beziehungen aus {len(batches)} Batches")
    # --- Stage B: merge + deduplication ---
    logger.info("Stage B: merge and deduplication")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "relationship_extraction",
"progress": 75,
})
valid_categories = {"alliance", "conflict", "diplomacy", "economic", "legal", "neutral"}
merged: dict[tuple[int, int, str], dict] = {}
for rel in all_raw_relations:
source_name = (rel.get("source") or "").strip()
target_name = (rel.get("target") or "").strip()
source_id = name_to_id.get(source_name.lower())
target_id = name_to_id.get(target_name.lower())
if not source_id or not target_id or source_id == target_id:
continue
        # Normalise direction so that A->B and B->A merge
if source_id > target_id:
source_id, target_id = target_id, source_id
source_name, target_name = target_name, source_name
category = (rel.get("category") or "neutral").lower().strip()
if category not in valid_categories:
category = "neutral"
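        # One merged relation per undirected entity pair and category.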
key = (source_id, target_id, category)
weight = rel.get("weight", 3)
try:
weight = max(1, min(5, int(weight)))
except (ValueError, TypeError):
weight = 3
status = (rel.get("status") or "active").lower().strip()
if status not in {"active", "historical", "emerging"}:
status = "active"
evidence = rel.get("evidence", [])
if not isinstance(evidence, list):
evidence = []
if key in merged:
existing = merged[key]
existing["weight"] = max(existing["weight"], weight)
existing_evidence = set(existing["evidence"])
for ev in evidence:
if isinstance(ev, str) and ev.strip() and ev.strip() not in existing_evidence:
existing["evidence"].append(ev.strip())
existing_evidence.add(ev.strip())
if len(existing["evidence"]) > 10:
existing["evidence"] = existing["evidence"][:10]
status_priority = {"active": 3, "emerging": 2, "historical": 1}
if status_priority.get(status, 0) > status_priority.get(existing["status"], 0):
existing["status"] = status
new_desc = rel.get("description", "")
if len(new_desc) > len(existing.get("description", "")):
existing["description"] = new_desc
existing["label"] = rel.get("label", existing["label"])
existing["_count"] = existing.get("_count", 1) + 1
else:
merged[key] = {
"source_id": source_id,
"target_id": target_id,
"source_name": source_name,
"target_name": target_name,
"category": category,
"label": rel.get("label", ""),
"description": rel.get("description", ""),
"weight": weight,
"status": status,
"evidence": [ev.strip() for ev in evidence if isinstance(ev, str) and ev.strip()][:10],
"_count": 1,
}
logger.info(f"Stufe B abgeschlossen: {len(all_raw_relations)} roh -> {len(merged)} gemerged")
# Gewichts-Boost für mehrfach belegte Beziehungen
for m in merged.values():
if m["_count"] >= 3 and m["weight"] < 5:
m["weight"] = min(5, m["weight"] + 1)
    # --- Save to the DB ---
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "relationship_extraction",
"progress": 85,
})
saved_relations = []
for m in merged.values():
try:
cursor = await db.execute(
"""INSERT INTO network_relations
(network_analysis_id, source_entity_id, target_entity_id,
category, label, description, weight, status, evidence, tenant_id)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(
analysis_id, m["source_id"], m["target_id"], m["category"],
m["label"], m["description"], m["weight"], m["status"],
json.dumps(m["evidence"], ensure_ascii=False),
tenant_id,
),
)
saved_relations.append({"id": cursor.lastrowid})
except Exception as e:
logger.warning(f"Beziehung speichern fehlgeschlagen: {e}")
await db.commit()
logger.info(f"Phase 2 abgeschlossen: {len(saved_relations)} Beziehungen gespeichert")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "relationship_extraction",
"progress": 100,
})
return saved_relations
# ---------------------------------------------------------------------------
# Phase 2c: semantic deduplication (Opus)
# ---------------------------------------------------------------------------
async def _phase2c_semantic_dedup(
db, analysis_id: int, tenant_id: int,
entities: list[dict], usage_acc: UsageAccumulator, ws_manager=None,
) -> None:
"""Semantische Deduplizierung via Opus — erkennt Synonyme, Abkürzungen, Sprachvarianten."""
if len(entities) < 10:
return
logger.info(f"Phase 2c: Semantische Dedup für {len(entities)} Entitäten")
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "semantic_dedup",
"progress": 0,
})
    # --- Clustering: token-based + abbreviation matching ---
token_to_ids = defaultdict(set)
db_id_map = {e["db_id"]: e for e in entities if e.get("db_id")}
for ent in entities:
db_id = ent.get("db_id")
if not db_id:
continue
all_names = [ent["name_normalized"]]
for a in ent.get("aliases", []):
if a:
all_names.append(a.lower())
        # Token-based clustering
for name in all_names:
for word in re.split(r'[\s\-_/(),.;:]+', name):
word = word.strip()
if len(word) >= 3 and word not in _STOP_WORDS:
token_to_ids[word].add(db_id)
        # Abbreviation matching
all_display_names = [ent["name"]] + [a for a in ent.get("aliases", []) if a]
for name in all_display_names:
words = [w for w in re.split(r'[\s\-_/(),.;:]+', name) if w and len(w) >= 2]
if len(words) >= 2:
abbr = "".join(w[0] for w in words).lower()
if len(abbr) >= 2:
token_to_ids[f"_abbr_{abbr}"].add(db_id)
            # Short names as potential abbreviations
name_clean = re.sub(r'[^a-zA-Z]', '', name).lower()
if 2 <= len(name_clean) <= 6:
token_to_ids[f"_abbr_{name_clean}"].add(db_id)
    # Keep only distinct clusters (≥ 2 and ≤ 40 entities)
seen_clusters = set()
candidate_clusters = []
for token, db_ids in sorted(token_to_ids.items(), key=lambda x: len(x[1])):
if len(db_ids) < 2 or len(db_ids) > 40:
continue
key = frozenset(db_ids)
if key in seen_clusters:
continue
seen_clusters.add(key)
candidate_clusters.append(list(db_ids))
logger.info(f"Phase 2c: {len(candidate_clusters)} Kandidaten-Cluster gefunden")
# --- Opus-Calls für jeden Cluster ---
merged_away = set()
total_merges = 0
opus_calls = 0
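    # Hard cap on LLM calls in this phase to bound cost.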
max_calls = 50
for ci, cluster_ids in enumerate(candidate_clusters):
if opus_calls >= max_calls:
logger.warning(f"Phase 2c: Max {max_calls} Opus-Calls erreicht, stoppe")
break
        # Skip entities that were already merged away
active_ids = [did for did in cluster_ids if did not in merged_away and did in db_id_map]
if len(active_ids) < 2:
continue
active_ents = [db_id_map[did] for did in active_ids]
        # Build the prompt
lines = []
for i, ent in enumerate(active_ents, 1):
aliases_str = ", ".join((ent.get("aliases") or [])[:5])
line = f"{i}. {ent['name']} ({ent['type']}, {ent.get('mention_count', 0)} Erwähnungen)"
if aliases_str:
line += f" [Aliases: {aliases_str}]"
lines.append(line)
prompt = SEMANTIC_DEDUP_PROMPT.format(entity_list="\n".join(lines))
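        # model=None: call_claude falls back to its default model (Opus, going
        # by this phase's header and log labels).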
try:
result_text, usage = await call_claude(prompt, tools=None, model=None)
usage_acc.add(usage)
opus_calls += 1
except Exception as e:
logger.error(f"Phase 2c Opus-Call {opus_calls + 1} fehlgeschlagen: {e}")
continue
parsed = _parse_json_response(result_text)
if not parsed:
continue
for merge_group in parsed.get("merges", []):
keep_idx = merge_group.get("keep")
merge_indices = merge_group.get("merge", [])
if keep_idx is None or not merge_indices:
continue
if not isinstance(merge_indices, list):
merge_indices = [merge_indices]
keep_idx -= 1 # 1-indexed → 0-indexed
if keep_idx < 0 or keep_idx >= len(active_ents):
continue
keep_ent = active_ents[keep_idx]
if keep_ent["db_id"] in merged_away:
continue
for mi in merge_indices:
mi -= 1
if mi < 0 or mi >= len(active_ents) or mi == keep_idx:
continue
merge_ent = active_ents[mi]
if merge_ent["db_id"] in merged_away:
continue
logger.info(f"Semantic Merge: '{merge_ent['name']}''{keep_ent['name']}'")
await _merge_entity_in_db(db, analysis_id, keep_ent, merge_ent, entities)
merged_away.add(merge_ent["db_id"])
db_id_map.pop(merge_ent["db_id"], None)
total_merges += 1
# Progress
progress = int((ci + 1) / len(candidate_clusters) * 100) if candidate_clusters else 100
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "semantic_dedup",
"progress": min(progress, 100),
})
if total_merges > 0:
await db.commit()
logger.info(
f"Phase 2c abgeschlossen: {total_merges} Merges, {opus_calls} Opus-Calls, "
f"{len(entities)} Entitäten verbleiben"
)
# ---------------------------------------------------------------------------
# Phase 2d: Cleanup
# ---------------------------------------------------------------------------
async def _phase2d_cleanup(
db, analysis_id: int, entities: list[dict], ws_manager=None,
) -> None:
"""Cleanup: Self-Loops, Richtungsnormalisierung, Duplikat-Relations, verwaiste Entities."""
logger.info("Phase 2d: Cleanup")
    # 1. Remove self-loops
cursor = await db.execute(
"DELETE FROM network_relations WHERE network_analysis_id = ? AND source_entity_id = target_entity_id",
(analysis_id,),
)
self_loops = cursor.rowcount
    # 2. Direction normalisation (directions can become inconsistent after merges)
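    # In SQLite, the right-hand side of SET reads the pre-update column values,
    # so this single UPDATE swaps source and target in place.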
await db.execute(
"""UPDATE network_relations
SET source_entity_id = target_entity_id, target_entity_id = source_entity_id
WHERE source_entity_id > target_entity_id AND network_analysis_id = ?""",
(analysis_id,),
)
    # 3. Remove duplicate relations (same source+target+category after normalisation)
cursor = await db.execute(
"""DELETE FROM network_relations
WHERE network_analysis_id = ? AND id NOT IN (
SELECT MIN(id) FROM network_relations
WHERE network_analysis_id = ?
GROUP BY source_entity_id, target_entity_id, category
)""",
(analysis_id, analysis_id),
)
dup_relations = cursor.rowcount
    # 4. Remove orphaned entities (no connections)
cursor = await db.execute(
"""DELETE FROM network_entities
WHERE network_analysis_id = ? AND id NOT IN (
SELECT source_entity_id FROM network_relations WHERE network_analysis_id = ?
UNION
SELECT target_entity_id FROM network_relations WHERE network_analysis_id = ?
)""",
(analysis_id, analysis_id, analysis_id),
)
orphans = cursor.rowcount
    # Refresh the in-memory entities list
remaining_ids = set()
cursor = await db.execute(
"SELECT id FROM network_entities WHERE network_analysis_id = ?", (analysis_id,)
)
for row in await cursor.fetchall():
remaining_ids.add(row["id"])
entities[:] = [e for e in entities if e.get("db_id") in remaining_ids]
await db.commit()
logger.info(
f"Phase 2d abgeschlossen: {self_loops} Self-Loops, {dup_relations} Duplikat-Relations, "
f"{orphans} verwaiste Entities entfernt, {len(entities)} Entitäten verbleiben"
)
await _broadcast(ws_manager, "network_status", {
"analysis_id": analysis_id,
"phase": "cleanup",
"progress": 100,
})
# ---------------------------------------------------------------------------
# Phase 3: finalisation
# ---------------------------------------------------------------------------
async def _phase3_finalize(
db, analysis_id, tenant_id, entity_count, relation_count,
article_ids, factcheck_ids, article_ts, factcheck_ts,
usage_acc, ws_manager=None,
):
"""Finalisiert: Zähler, Hash, Log, Status."""
data_hash = _compute_data_hash(article_ids, factcheck_ids, article_ts, factcheck_ts)
now = datetime.now(TIMEZONE).strftime("%Y-%m-%d %H:%M:%S")
await db.execute(
"""UPDATE network_analyses
SET entity_count = ?, relation_count = ?, status = 'ready',
last_generated_at = ?, data_hash = ?
WHERE id = ?""",
(entity_count, relation_count, now, data_hash, analysis_id),
)
await db.execute(
"""INSERT INTO network_generation_log
(network_analysis_id, completed_at, status, input_tokens, output_tokens,
cache_creation_tokens, cache_read_tokens, total_cost_usd, api_calls,
entity_count, relation_count, tenant_id)
VALUES (?, ?, 'completed', ?, ?, ?, ?, ?, ?, ?, ?, ?)""",
(analysis_id, now, usage_acc.input_tokens, usage_acc.output_tokens,
usage_acc.cache_creation_tokens, usage_acc.cache_read_tokens,
usage_acc.total_cost_usd, usage_acc.call_count,
entity_count, relation_count, tenant_id),
)
await db.commit()
logger.info(f"Analyse {analysis_id} finalisiert: {entity_count} Entitäten, "
f"{relation_count} Beziehungen, ${usage_acc.total_cost_usd:.4f}")
await _broadcast(ws_manager, "network_complete", {
"analysis_id": analysis_id,
"entity_count": entity_count,
"relation_count": relation_count,
"cost_usd": round(usage_acc.total_cost_usd, 4),
})
# ---------------------------------------------------------------------------
# Main entry point
# ---------------------------------------------------------------------------
async def extract_and_relate_entities(analysis_id: int, tenant_id: int, ws_manager=None):
"""Hauptfunktion: Entity-Extraktion + Beziehungsanalyse.
Phase 1: Haiku extrahiert Entitäten aus Artikeln (in Batches)
Phase 2a: Entity-Dedup nach name_normalized (Code, kein API)
Phase 2: Haiku extrahiert Beziehungen pro Batch, dann Merge + Deduplizierung
Phase 2c: Semantische Dedup via Opus (Cluster-weise)
Phase 2d: Cleanup (Self-Loops, Verwaiste, Duplikat-Relations)
Phase 3: Finalisierung (Zähler, Hash, Log)
"""
from database import get_db
db = await get_db()
usage_acc = UsageAccumulator()
try:
if not await _check_analysis_exists(db, analysis_id):
logger.warning(f"Analyse {analysis_id} existiert nicht")
return
await db.execute(
"UPDATE network_analyses SET status = 'generating' WHERE id = ?",
(analysis_id,),
)
await db.commit()
        # Load the incident IDs
cursor = await db.execute(
"SELECT incident_id FROM network_analysis_incidents WHERE network_analysis_id = ?",
(analysis_id,),
)
incident_ids = [row["incident_id"] for row in await cursor.fetchall()]
if not incident_ids:
logger.warning(f"Analyse {analysis_id}: Keine Lagen verknüpft")
await db.execute("UPDATE network_analyses SET status = 'error' WHERE id = ?", (analysis_id,))
await db.commit()
await _broadcast(ws_manager, "network_error", {
"analysis_id": analysis_id, "error": "Keine Lagen verknüpft",
})
return
        # Load the articles
placeholders = ",".join("?" * len(incident_ids))
cursor = await db.execute(
f"""SELECT id, incident_id, headline, headline_de, source, source_url,
content_original, content_de, collected_at
FROM articles WHERE incident_id IN ({placeholders})""",
incident_ids,
)
article_rows = await cursor.fetchall()
articles = []
article_ids = []
article_ts = []
for r in article_rows:
articles.append({
"id": r["id"], "incident_id": r["incident_id"],
"headline": r["headline"], "headline_de": r["headline_de"],
"source": r["source"], "source_url": r["source_url"],
"content_original": r["content_original"], "content_de": r["content_de"],
})
article_ids.append(r["id"])
article_ts.append(r["collected_at"] or "")
        # Load the fact checks
cursor = await db.execute(
f"""SELECT id, incident_id, claim, status, evidence, checked_at
FROM fact_checks WHERE incident_id IN ({placeholders})""",
incident_ids,
)
fc_rows = await cursor.fetchall()
factchecks = []
factcheck_ids = []
factcheck_ts = []
for r in fc_rows:
factchecks.append({
"id": r["id"], "incident_id": r["incident_id"],
"claim": r["claim"], "status": r["status"], "evidence": r["evidence"],
})
factcheck_ids.append(r["id"])
factcheck_ts.append(r["checked_at"] or "")
logger.info(f"Analyse {analysis_id}: {len(articles)} Artikel, "
f"{len(factchecks)} Faktenchecks aus {len(incident_ids)} Lagen")
        # Article deduplication before the AI pipeline
articles, factchecks = _deduplicate_articles(articles, factchecks)
        # Phase 1: entity extraction
if not await _check_analysis_exists(db, analysis_id):
return
entities = await _phase1_extract_entities(
db, analysis_id, tenant_id, articles, factchecks, usage_acc, ws_manager,
)
        # Phase 2a: entity deduplication
if not await _check_analysis_exists(db, analysis_id):
return
await _phase2a_deduplicate_entities(db, analysis_id, entities, ws_manager)
        # Phase 2: relationship extraction
if not await _check_analysis_exists(db, analysis_id):
return
relations = await _phase2_analyze_relationships(
db, analysis_id, tenant_id, entities, articles, factchecks, usage_acc, ws_manager,
)
        # Phase 2c: semantic deduplication
if not await _check_analysis_exists(db, analysis_id):
return
await _phase2c_semantic_dedup(
db, analysis_id, tenant_id, entities, usage_acc, ws_manager,
)
# Phase 2d: Cleanup
if not await _check_analysis_exists(db, analysis_id):
return
await _phase2d_cleanup(db, analysis_id, entities, ws_manager)
        # Final counts from the DB (after all cleanup passes)
cursor = await db.execute(
"SELECT COUNT(*) as cnt FROM network_entities WHERE network_analysis_id = ?",
(analysis_id,),
)
row = await cursor.fetchone()
final_entity_count = row["cnt"] if row else len(entities)
cursor = await db.execute(
"SELECT COUNT(*) as cnt FROM network_relations WHERE network_analysis_id = ?",
(analysis_id,),
)
row = await cursor.fetchone()
final_relation_count = row["cnt"] if row else len(relations)
        # Phase 3: finalisation
if not await _check_analysis_exists(db, analysis_id):
return
await _phase3_finalize(
db, analysis_id, tenant_id,
entity_count=final_entity_count, relation_count=final_relation_count,
article_ids=article_ids, factcheck_ids=factcheck_ids,
article_ts=article_ts, factcheck_ts=factcheck_ts,
usage_acc=usage_acc, ws_manager=ws_manager,
)
except Exception as e:
logger.error(f"Entity-Extraktion fehlgeschlagen (Analyse {analysis_id}): {e}", exc_info=True)
try:
await db.execute("UPDATE network_analyses SET status = 'error' WHERE id = ?", (analysis_id,))
await db.commit()
except Exception:
pass
await _broadcast(ws_manager, "network_error", {
"analysis_id": analysis_id, "error": str(e),
})
finally:
await db.close()
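

if __name__ == "__main__":  # pragma: no cover
    # Minimal manual-run sketch, not part of the pipeline: the analysis_id and
    # tenant_id values below are placeholders and assume a matching row already
    # exists in network_analyses.
    asyncio.run(extract_and_relate_entities(analysis_id=1, tenant_id=1))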