GEOINT mode removed from Monitor
It will be rebuilt as a standalone application on a separate subdomain. All GEOINT files have been removed, and dashboard.html/components.js/main.py have been reset to their pre-GEOINT state.
src/agents/entity_extractor.py
@@ -1,4 +1,4 @@
-"""Netzwerkanalyse: Entity-Extraktion (Haiku) + Beziehungsanalyse (Batched)."""
+"""Netzwerkanalyse: Entity-Extraktion (Sonnet) + Beziehungsanalyse (Batched) mit Artikel-Deduplizierung."""
 import asyncio
 import hashlib
 import json
@@ -9,7 +9,7 @@ from datetime import datetime
 from typing import Optional

 from agents.claude_client import call_claude, ClaudeUsage, UsageAccumulator
-from config import CLAUDE_MODEL_FAST, TIMEZONE
+from config import CLAUDE_MODEL_FAST, CLAUDE_MODEL_MEDIUM, TIMEZONE

 logger = logging.getLogger("osint.entity_extractor")

@@ -194,6 +194,114 @@ def _compute_data_hash(article_ids, factcheck_ids, article_ts, factcheck_ts) ->
     return hashlib.sha256("|".join(parts).encode("utf-8")).hexdigest()


+# ---------------------------------------------------------------------------
+# Artikel-Deduplizierung
+# ---------------------------------------------------------------------------
+
+def _normalize_headline(headline: str) -> str:
+    """Normalisiert eine Headline fuer Vergleiche."""
+    h = headline.lower().strip()
+    h = re.sub(r"[^a-z0-9\s]", "", h)
+    h = re.sub(r"\s+", " ", h).strip()
+    return h
+
+
+def _headline_tokens(headline: str) -> set[str]:
+    """Extrahiert bedeutungstragende Tokens aus einer Headline."""
+    tokens = set()
+    for word in _normalize_headline(headline).split():
+        if len(word) >= 3 and word not in _STOP_WORDS:
+            tokens.add(word)
+    return tokens
+
+
+def _jaccard_similarity(set_a: set, set_b: set) -> float:
+    """Jaccard-Aehnlichkeit zweier Mengen."""
+    if not set_a or not set_b:
+        return 0.0
+    intersection = set_a & set_b
+    union = set_a | set_b
+    return len(intersection) / len(union) if union else 0.0
+
+
+def _content_fingerprint(text: str) -> str:
+    """Kurzer Hash des Textinhalts fuer Near-Duplicate-Erkennung."""
+    normalized = re.sub(r"\s+", " ", text.lower().strip())[:500]
+    return hashlib.md5(normalized.encode("utf-8")).hexdigest()
+
+
+def _deduplicate_articles(articles: list[dict], factchecks: list[dict]) -> tuple[list[dict], list[dict]]:
+    """Entfernt redundante Artikel basierend auf Headline-Similarity und Content-Hash.
+
+    Behaelt pro Duplikat-Gruppe den Artikel mit dem laengsten Content.
+    Faktenchecks werden nicht dedupliziert (sind bereits einzigartig).
+
+    Returns:
+        Tuple von (deduplizierte_artikel, factchecks_unveraendert)
+    """
+    if len(articles) <= 50:
+        return articles, factchecks
+
+    logger.info(f"Artikel-Dedup: {len(articles)} Artikel pruefen")
+
+    # Phase A: Exakte Content-Fingerprint-Dedup
+    seen_fingerprints: dict[str, int] = {}
+
+    for i, art in enumerate(articles):
+        content = art.get("content_de") or art.get("content_original") or ""
+        headline = art.get("headline_de") or art.get("headline") or ""
+
+        if not content and not headline:
+            continue
+
+        fp = _content_fingerprint(headline + " " + content)
+
+        if fp in seen_fingerprints:
+            existing_idx = seen_fingerprints[fp]
+            existing_content = articles[existing_idx].get("content_de") or articles[existing_idx].get("content_original") or ""
+            if len(content) > len(existing_content):
+                seen_fingerprints[fp] = i
+        else:
+            seen_fingerprints[fp] = i
+
+    after_fp = list(seen_fingerprints.values())
+    fp_removed = len(articles) - len(after_fp)
+
+    # Phase B: Headline-Similarity-Dedup (Jaccard >= 0.7)
+    remaining = [articles[i] for i in sorted(after_fp)]
+
+    token_sets = []
+    for art in remaining:
+        headline = art.get("headline_de") or art.get("headline") or ""
+        token_sets.append(_headline_tokens(headline))
+
+    keep_mask = [True] * len(remaining)
+
+    for i in range(len(remaining)):
+        if not keep_mask[i]:
+            continue
+        for j in range(i + 1, len(remaining)):
+            if not keep_mask[j]:
+                continue
+            if _jaccard_similarity(token_sets[i], token_sets[j]) >= 0.7:
+                content_i = remaining[i].get("content_de") or remaining[i].get("content_original") or ""
+                content_j = remaining[j].get("content_de") or remaining[j].get("content_original") or ""
+                if len(content_j) > len(content_i):
+                    keep_mask[i] = False
+                    break
+                else:
+                    keep_mask[j] = False
+
+    deduped = [art for art, keep in zip(remaining, keep_mask) if keep]
+    headline_removed = len(remaining) - len(deduped)
+
+    logger.info(
+        f"Artikel-Dedup abgeschlossen: {len(articles)} -> {len(deduped)} "
+        f"({fp_removed} Content-Duplikate, {headline_removed} Headline-Duplikate entfernt)"
+    )
+
+    return deduped, factchecks
+
 # ---------------------------------------------------------------------------
 # Entity-Merge Helper
 # ---------------------------------------------------------------------------
@@ -279,8 +387,8 @@ async def _phase1_extract_entities(
         headline = art.get("headline_de") or art.get("headline") or ""
         content = art.get("content_de") or art.get("content_original") or ""
         source = art.get("source") or ""
-        if len(content) > 2000:
-            content = content[:2000] + "..."
+        if len(content) > 800:
+            content = content[:800] + "..."
         all_texts.append(f"[{source}] {headline}\n{content}")

     for fc in factchecks:
@@ -293,7 +401,7 @@ async def _phase1_extract_entities(
         logger.warning(f"Analyse {analysis_id}: Keine Texte vorhanden")
         return []

-    batch_size = 30
+    batch_size = 50
     batches = [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]
     logger.info(f"{len(all_texts)} Texte in {len(batches)} Batches")

@@ -304,10 +412,10 @@ async def _phase1_extract_entities(
         prompt = ENTITY_EXTRACTION_PROMPT.format(articles_text=articles_text)

         try:
-            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
+            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_MEDIUM)
             usage_acc.add(usage)
         except Exception as e:
-            logger.error(f"Haiku Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
+            logger.error(f"Sonnet Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
             continue

         parsed = _parse_json_response(result_text)
@@ -500,8 +608,8 @@ async def _phase2_analyze_relationships(
         headline = art.get("headline_de") or art.get("headline") or ""
         content = art.get("content_de") or art.get("content_original") or ""
         source = art.get("source") or ""
-        if len(content) > 2000:
-            content = content[:2000] + "..."
+        if len(content) > 800:
+            content = content[:800] + "..."
         all_texts.append(f"[{source}] {headline}\n{content}")

     for fc in factchecks:
@@ -514,7 +622,7 @@ async def _phase2_analyze_relationships(
         return []

     # --- Stufe A: Per-Batch Beziehungsextraktion ---
-    batch_size = 30
+    batch_size = 50
     batches = [all_texts[i:i + batch_size] for i in range(0, len(all_texts), batch_size)]
     logger.info(f"Stufe A: {len(batches)} Batches für Beziehungsextraktion")

@@ -545,7 +653,7 @@ async def _phase2_analyze_relationships(
         )

         try:
-            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST)
+            result_text, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_MEDIUM)
             usage_acc.add(usage)
         except Exception as e:
             logger.error(f"Relationship Batch {batch_idx + 1}/{len(batches)} fehlgeschlagen: {e}")
@@ -1067,6 +1175,9 @@ async def extract_and_relate_entities(analysis_id: int, tenant_id: int, ws_manag
     logger.info(f"Analyse {analysis_id}: {len(articles)} Artikel, "
                 f"{len(factchecks)} Faktenchecks aus {len(incident_ids)} Lagen")

+    # Artikel-Deduplizierung vor KI-Pipeline
+    articles, factchecks = _deduplicate_articles(articles, factchecks)
+
     # Phase 1: Entity-Extraktion
     if not await _check_analysis_exists(db, analysis_id):
         return
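For orientation, below is a minimal standalone sketch of the two-phase deduplication this diff introduces: an MD5 content fingerprint pass followed by a Jaccard comparison (>= 0.7) of headline tokens, keeping the longest content per duplicate group. The helper names, the sample data, and the simplified token filter are illustrative assumptions only, not the module's API; the _STOP_WORDS set and the "only dedupe above 50 articles" guard from the diff are omitted for brevity.

# Standalone sketch of the dedup idea, not the production helper.
import hashlib
import re

def fingerprint(text: str) -> str:
    # Normalize whitespace/case, hash the first 500 chars (as in the diff).
    normalized = re.sub(r"\s+", " ", text.lower().strip())[:500]
    return hashlib.md5(normalized.encode("utf-8")).hexdigest()

def tokens(headline: str) -> set[str]:
    # Lowercase, strip punctuation, keep words with at least 3 characters.
    h = re.sub(r"[^a-z0-9\s]", "", headline.lower())
    return {w for w in h.split() if len(w) >= 3}

def jaccard(a: set[str], b: set[str]) -> float:
    return len(a & b) / len(a | b) if (a and b) else 0.0

def dedupe(articles: list[dict]) -> list[dict]:
    # Phase A: collapse exact near-duplicates via content fingerprint,
    # keeping the article with the longest content per fingerprint.
    best: dict[str, dict] = {}
    for art in articles:
        fp = fingerprint(art["headline"] + " " + art["content"])
        if fp not in best or len(art["content"]) > len(best[fp]["content"]):
            best[fp] = art
    remaining = list(best.values())

    # Phase B: drop articles whose headline token sets overlap strongly.
    keep: list[dict] = []
    for art in remaining:
        t = tokens(art["headline"])
        dup = next((k for k in keep if jaccard(tokens(k["headline"]), t) >= 0.7), None)
        if dup is None:
            keep.append(art)
        elif len(art["content"]) > len(dup["content"]):
            keep[keep.index(dup)] = art
    return keep

if __name__ == "__main__":
    sample = [
        {"headline": "Stromausfall in Berlin-Mitte", "content": "Kurzmeldung."},
        {"headline": "Stromausfall in Berlin-Mitte!", "content": "Laengerer Bericht mit Details."},
        {"headline": "Neues Rechenzentrum eroeffnet", "content": "Unabhaengige Meldung."},
    ]
    print([a["headline"] for a in dedupe(sample)])
    # -> ['Stromausfall in Berlin-Mitte!', 'Neues Rechenzentrum eroeffnet']

The sketch keeps the longer of the two near-identical outage reports and leaves the unrelated article untouched, which is the behavior the per-group "keep longest content" rule in _deduplicate_articles aims for.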
src/agents/entity_extractor.py.bak — 1144 lines changed, normal file
File diff suppressed because it is too large.