GEOINT mode removed from Monitor

Will be rebuilt as a standalone application on a separate subdomain.
All GEOINT files removed; dashboard.html/components.js/main.py
reset to their pre-GEOINT state.
This commit is contained in:
Claude Dev
2026-03-24 11:06:19 +01:00
Parent 8212617276
Commit e64447ab7f
14 changed files with 4112 additions and 1999 deletions


@@ -1,4 +1,4 @@
"""Netzwerkanalyse: Entity-Extraktion (Haiku) + Beziehungsanalyse (Batched)."""
"""Netzwerkanalyse: Entity-Extraktion (Sonnet) + Beziehungsanalyse (Batched) mit Artikel-Deduplizierung."""
import asyncio
import hashlib
import json
@@ -9,7 +9,7 @@ from datetime import datetime
from typing import Optional
from agents.claude_client import call_claude, ClaudeUsage, UsageAccumulator
-from config import CLAUDE_MODEL_FAST, TIMEZONE
+from config import CLAUDE_MODEL_FAST, CLAUDE_MODEL_MEDIUM, TIMEZONE
logger = logging.getLogger("osint.entity_extractor")
@@ -194,6 +194,114 @@ def _compute_data_hash(article_ids, factcheck_ids, article_ts, factcheck_ts) ->
    return hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()


# ---------------------------------------------------------------------------
# Artikel-Deduplizierung
# ---------------------------------------------------------------------------

def _normalize_headline(headline: str) -> str:
    """Normalisiert eine Headline fuer Vergleiche."""
    h = headline.lower().strip()
    h = re.sub(r"[^a-z0-9\s]", "", h)
    h = re.sub(r"\s+", " ", h).strip()
    return h


def _headline_tokens(headline: str) -> set[str]:
    """Extrahiert bedeutungstragende Tokens aus einer Headline."""
    tokens = set()
    for word in _normalize_headline(headline).split():
        if len(word) >= 3 and word not in _STOP_WORDS:
            tokens.add(word)
    return tokens


def _jaccard_similarity(set_a: set, set_b: set) -> float:
    """Jaccard-Aehnlichkeit zweier Mengen."""
    if not set_a or not set_b:
        return 0.0
    intersection = set_a & set_b
    union = set_a | set_b
    return len(intersection) / len(union) if union else 0.0


def _content_fingerprint(text: str) -> str:
    """Kurzer Hash des Textinhalts fuer Near-Duplicate-Erkennung."""
    normalized = re.sub(r"\s+", " ", text.lower().strip())[:500]
    return hashlib.md5(normalized.encode("utf-8")).hexdigest()


def _deduplicate_articles(articles: list[dict], factchecks: list[dict]) -> tuple[list[dict], list[dict]]:
    """Entfernt redundante Artikel basierend auf Headline-Similarity und Content-Hash.

    Behaelt pro Duplikat-Gruppe den Artikel mit dem laengsten Content.
    Faktenchecks werden nicht dedupliziert (sind bereits einzigartig).

    Returns:
        Tuple von (deduplizierte_artikel, factchecks_unveraendert)
    """
    if len(articles) <= 50:
        return articles, factchecks

    logger.info(f"Artikel-Dedup: {len(articles)} Artikel pruefen")

    # Phase A: Exakte Content-Fingerprint-Dedup
    seen_fingerprints: dict[str, int] = {}
    for i, art in enumerate(articles):
        content = art.get("content_de") or art.get("content_original") or ""
        headline = art.get("headline_de") or art.get("headline") or ""
        if not content and not headline:
            continue
        fp = _content_fingerprint(headline + " " + content)
        if fp in seen_fingerprints:
            existing_idx = seen_fingerprints[fp]
            existing_content = articles[existing_idx].get("content_de") or articles[existing_idx].get("content_original") or ""
            if len(content) > len(existing_content):
                seen_fingerprints[fp] = i
        else:
            seen_fingerprints[fp] = i

    after_fp = list(seen_fingerprints.values())
    fp_removed = len(articles) - len(after_fp)

    # Phase B: Headline-Similarity-Dedup (Jaccard >= 0.7)
    remaining = [articles[i] for i in sorted(after_fp)]
    token_sets = []
    for art in remaining:
        headline = art.get("headline_de") or art.get("headline") or ""
        token_sets.append(_headline_tokens(headline))

    keep_mask = [True] * len(remaining)
    for i in range(len(remaining)):
        if not keep_mask[i]:
            continue
        for j in range(i + 1, len(remaining)):
            if not keep_mask[j]:
                continue
            if _jaccard_similarity(token_sets[i], token_sets[j]) >= 0.7:
                content_i = remaining[i].get("content_de") or remaining[i].get("content_original") or ""
                content_j = remaining[j].get("content_de") or remaining[j].get("content_original") or ""
                if len(content_j) > len(content_i):
                    keep_mask[i] = False
                    break
                else:
                    keep_mask[j] = False

    deduped = [art for art, keep in zip(remaining, keep_mask) if keep]
    headline_removed = len(remaining) - len(deduped)

    logger.info(
        f"Artikel-Dedup abgeschlossen: {len(articles)} -> {len(deduped)} "
        f"({fp_removed} Content-Duplikate, {headline_removed} Headline-Duplikate entfernt)"
    )
    return deduped, factchecks


# ---------------------------------------------------------------------------
# Entity-Merge Helper
# ---------------------------------------------------------------------------
@@ -279,8 +387,8 @@ async def _phase1_extract_entities(
        headline = art.get("headline_de") or art.get("headline") or ""
        content = art.get("content_de") or art.get("content_original") or ""
        source = art.get("source") or ""
-        if len(content) > 2000:
-            content = content[:2000] + "..."
+        if len(content) > 800:
+            content = content[:800] + "..."
        all_texts.append(f"[{source}] {headline}\n{content}")

    for fc in factchecks:
@@ -293,7 +401,7 @@ async def _phase1_extract_entities(
logger.warning(f"Analyse {analysis_id}: Keine Texte vorhanden")
return []
batch_size = 30
batch_size = 50
batches = [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]
logger.info(f"{len(all_texts)} Texte in {len(batches)} Batches")
@@ -304,10 +412,10 @@ async def _phase1_extract_entities(
        prompt = ENTITY_EXTRACTION_PROMPT.format(articles_text=articles_text)

        try:
-            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
+            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_MEDIUM)
            usage_acc.add(usage)
        except Exception as e:
-            logger.error(f"Haiku Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
+            logger.error(f"Sonnet Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
            continue

        parsed = _parse_json_response(result_text)
@@ -500,8 +608,8 @@ async def _phase2_analyze_relationships(
        headline = art.get("headline_de") or art.get("headline") or ""
        content = art.get("content_de") or art.get("content_original") or ""
        source = art.get("source") or ""
-        if len(content) > 2000:
-            content = content[:2000] + "..."
+        if len(content) > 800:
+            content = content[:800] + "..."
        all_texts.append(f"[{source}] {headline}\n{content}")

    for fc in factchecks:
@@ -514,7 +622,7 @@ async def _phase2_analyze_relationships(
        return []

    # --- Stufe A: Per-Batch Beziehungsextraktion ---
-    batch_size = 30
+    batch_size = 50
    batches = [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]
    logger.info(f"Stufe A: {len(batches)} Batches für Beziehungsextraktion")
@@ -545,7 +653,7 @@ async def _phase2_analyze_relationships(
        )

        try:
-            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
+            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_MEDIUM)
            usage_acc.add(usage)
        except Exception as e:
            logger.error(f"Relationship Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
@@ -1067,6 +1175,9 @@ async def extract_and_relate_entities(analysis_id: int, tenant_id: int, ws_manag
logger.info(f"Analyse {analysis_id}: {len(articles)} Artikel, "
f"{len(factchecks)} Faktenchecks aus {len(incident_ids)} Lagen")
# Artikel-Deduplizierung vor KI-Pipeline
articles, factchecks = _deduplicate_articles(articles, factchecks)
# Phase 1: Entity-Extraktion
if not await _check_analysis_exists(db, analysis_id):
return
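
For reference, a self-contained sketch of the two deduplication heuristics introduced above: Phase A hashes the first 500 normalized characters (MD5 content fingerprint), Phase B compares headline token sets against a Jaccard threshold of 0.7. The stop-word set and the sample texts below are placeholders for illustration and are not taken from the repository.

# Illustrative sketch only; mirrors the helpers added in this commit.
import hashlib
import re

_STOP_WORDS = {"der", "die", "das", "und", "the", "and"}  # placeholder, not the repo's list

def _normalize_headline(headline: str) -> str:
    h = headline.lower().strip()
    h = re.sub(r"[^a-z0-9\s]", "", h)
    return re.sub(r"\s+", " ", h).strip()

def _headline_tokens(headline: str) -> set[str]:
    return {w for w in _normalize_headline(headline).split()
            if len(w) >= 3 and w not in _STOP_WORDS}

def _jaccard_similarity(set_a: set, set_b: set) -> float:
    if not set_a or not set_b:
        return 0.0
    return len(set_a & set_b) / len(set_a | set_b)

def _content_fingerprint(text: str) -> str:
    # Only the first 500 normalized characters feed the hash, so articles that
    # differ only near the end collapse onto the same fingerprint.
    normalized = re.sub(r"\s+", " ", text.lower().strip())[:500]
    return hashlib.md5(normalized.encode("utf-8")).hexdigest()

# Phase A: identical 500-character prefix -> same fingerprint; the longer copy is kept.
base = "Bericht über den Vorfall im Hafen. " * 20
variant = base + "Zusätzlicher Absatz, der nur in einer Quelle steht."
print(_content_fingerprint(base) == _content_fingerprint(variant))  # True

# Phase B: reordered wording still shares all tokens -> Jaccard 1.0 >= 0.7, one copy dropped.
h1 = "Explosion im Hafen von Beirut meldet Behörde"
h2 = "Behörde meldet Explosion im Hafen von Beirut"
h3 = "Neue Sanktionen gegen Rüstungsexporte beschlossen"
print(_jaccard_similarity(_headline_tokens(h1), _headline_tokens(h2)))  # 1.0
print(_jaccard_similarity(_headline_tokens(h1), _headline_tokens(h3)))  # 0.0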

File diff suppressed because it is too large
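
The prompt-size tuning in the hunks above (article content capped at 800 instead of 2000 characters, 50 instead of 30 texts per batch) can be sketched in isolation as follows; the helper name build_batches and the sample articles are invented for illustration and do not exist in the repository.

# Illustrative sketch of the tightened prompt assembly; not the repository's code.
def build_batches(articles: list[dict], batch_size: int = 50) -> list[list[str]]:
    all_texts = []
    for art in articles:
        headline = art.get("headline_de") or art.get("headline") or ""
        content = art.get("content_de") or art.get("content_original") or ""
        source = art.get("source") or ""
        if len(content) > 800:
            content = content[:800] + "..."  # cap per-article context (was 2000)
        all_texts.append(f"[{source}] {headline}\n{content}")
    # one model call per chunk of `batch_size` texts (was 30)
    return [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]

if __name__ == "__main__":
    sample = [{"source": "demo", "headline": f"Artikel {n}", "content_original": "x" * 1200}
              for n in range(120)]
    batches = build_batches(sample)
    print(len(batches), len(batches[0]))  # 3 batches, 50 texts in the first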