Fix: Duplikat-Vorschläge + Stale-Check nur für RSS-Feeds
- Duplikat-Check basiert auf source_id+type statt exaktem Titel
- add_source ohne source_id prüft per Domain-Match
- Stale-Check überspringt web_sources (nur RSS-Feeds prüfen)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Dieser Commit ist enthalten in:
@@ -1,4 +1,4 @@
|
|||||||
"""Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate."""
|
"""Quellen-Health-Check Engine - prüft Erreichbarkeit, Feed-Validität, Duplikate."""
|
||||||
import asyncio
|
import asyncio
|
||||||
import logging
|
import logging
|
||||||
import json
|
import json
|
||||||
@@ -12,7 +12,7 @@ logger = logging.getLogger("osint.source_health")
|
|||||||
|
|
||||||
|
|
||||||
async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
||||||
"""Führt alle Health-Checks für aktive Grundquellen durch."""
|
"""Führt alle Health-Checks für aktive Grundquellen durch."""
|
||||||
logger.info("Starte Quellen-Health-Check...")
|
logger.info("Starte Quellen-Health-Check...")
|
||||||
|
|
||||||
# Alle aktiven Grundquellen laden
|
# Alle aktiven Grundquellen laden
|
||||||
@@ -22,14 +22,14 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
)
|
)
|
||||||
sources = [dict(row) for row in await cursor.fetchall()]
|
sources = [dict(row) for row in await cursor.fetchall()]
|
||||||
|
|
||||||
# Aktuelle Health-Check-Ergebnisse löschen (werden neu geschrieben)
|
# Aktuelle Health-Check-Ergebnisse löschen (werden neu geschrieben)
|
||||||
await db.execute("DELETE FROM source_health_checks")
|
await db.execute("DELETE FROM source_health_checks")
|
||||||
await db.commit()
|
await db.commit()
|
||||||
|
|
||||||
checks_done = 0
|
checks_done = 0
|
||||||
issues_found = 0
|
issues_found = 0
|
||||||
|
|
||||||
# 1. Erreichbarkeit + Feed-Validität (nur Quellen mit URL)
|
# 1. Erreichbarkeit + Feed-Validität (nur Quellen mit URL)
|
||||||
sources_with_url = [s for s in sources if s["url"]]
|
sources_with_url = [s for s in sources if s["url"]]
|
||||||
|
|
||||||
async with httpx.AsyncClient(
|
async with httpx.AsyncClient(
|
||||||
@@ -46,7 +46,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
if isinstance(result, Exception):
|
if isinstance(result, Exception):
|
||||||
await _save_check(
|
await _save_check(
|
||||||
db, source["id"], "reachability", "error",
|
db, source["id"], "reachability", "error",
|
||||||
f"Prüfung fehlgeschlagen: {result}",
|
f"Prüfung fehlgeschlagen: {result}",
|
||||||
)
|
)
|
||||||
issues_found += 1
|
issues_found += 1
|
||||||
else:
|
else:
|
||||||
@@ -61,7 +61,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
|
|
||||||
# 2. Veraltete Quellen (kein Artikel seit >30 Tagen)
|
# 2. Veraltete Quellen (kein Artikel seit >30 Tagen)
|
||||||
for source in sources:
|
for source in sources:
|
||||||
if source["source_type"] == "excluded":
|
if source["source_type"] in ("excluded", "web_source"):
|
||||||
continue
|
continue
|
||||||
stale_check = _check_stale(source)
|
stale_check = _check_stale(source)
|
||||||
if stale_check:
|
if stale_check:
|
||||||
@@ -83,7 +83,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Health-Check abgeschlossen: {checks_done} Quellen geprüft, "
|
f"Health-Check abgeschlossen: {checks_done} Quellen geprüft, "
|
||||||
f"{issues_found} Probleme gefunden"
|
f"{issues_found} Probleme gefunden"
|
||||||
)
|
)
|
||||||
return {"checked": checks_done, "issues": issues_found}
|
return {"checked": checks_done, "issues": issues_found}
|
||||||
@@ -92,7 +92,7 @@ async def run_health_checks(db: aiosqlite.Connection) -> dict:
|
|||||||
async def _check_source_reachability(
|
async def _check_source_reachability(
|
||||||
client: httpx.AsyncClient, source: dict,
|
client: httpx.AsyncClient, source: dict,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
|
"""Prüft Erreichbarkeit und Feed-Validität einer Quelle."""
|
||||||
checks = []
|
checks = []
|
||||||
url = source["url"]
|
url = source["url"]
|
||||||
|
|
||||||
@@ -125,14 +125,14 @@ async def _check_source_reachability(
|
|||||||
"message": "Erreichbar",
|
"message": "Erreichbar",
|
||||||
})
|
})
|
||||||
|
|
||||||
# Feed-Validität nur für RSS-Feeds
|
# Feed-Validität nur für RSS-Feeds
|
||||||
if source["source_type"] == "rss_feed":
|
if source["source_type"] == "rss_feed":
|
||||||
text = resp.text[:20000]
|
text = resp.text[:20000]
|
||||||
if "<rss" not in text and "<feed" not in text and "<channel" not in text:
|
if "<rss" not in text and "<feed" not in text and "<channel" not in text:
|
||||||
checks.append({
|
checks.append({
|
||||||
"type": "feed_validity",
|
"type": "feed_validity",
|
||||||
"status": "error",
|
"status": "error",
|
||||||
"message": "Kein gültiger RSS/Atom-Feed",
|
"message": "Kein gültiger RSS/Atom-Feed",
|
||||||
})
|
})
|
||||||
else:
|
else:
|
||||||
feed = await asyncio.to_thread(feedparser.parse, text)
|
feed = await asyncio.to_thread(feedparser.parse, text)
|
||||||
@@ -155,7 +155,7 @@ async def _check_source_reachability(
|
|||||||
checks.append({
|
checks.append({
|
||||||
"type": "feed_validity",
|
"type": "feed_validity",
|
||||||
"status": "ok",
|
"status": "ok",
|
||||||
"message": f"Feed gültig ({len(feed.entries)} Einträge)",
|
"message": f"Feed gültig ({len(feed.entries)} Einträge)",
|
||||||
})
|
})
|
||||||
|
|
||||||
except httpx.TimeoutException:
|
except httpx.TimeoutException:
|
||||||
@@ -181,7 +181,7 @@ async def _check_source_reachability(
|
|||||||
|
|
||||||
|
|
||||||
def _check_stale(source: dict) -> dict | None:
|
def _check_stale(source: dict) -> dict | None:
|
||||||
"""Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen)."""
|
"""Prüft ob eine Quelle veraltet ist (keine Artikel seit >30 Tagen)."""
|
||||||
if source["source_type"] == "excluded":
|
if source["source_type"] == "excluded":
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -249,7 +249,7 @@ async def _save_check(
|
|||||||
|
|
||||||
|
|
||||||
async def get_health_summary(db: aiosqlite.Connection) -> dict:
|
async def get_health_summary(db: aiosqlite.Connection) -> dict:
|
||||||
"""Gibt eine Zusammenfassung der letzten Health-Check-Ergebnisse zurück."""
|
"""Gibt eine Zusammenfassung der letzten Health-Check-Ergebnisse zurück."""
|
||||||
cursor = await db.execute("""
|
cursor = await db.execute("""
|
||||||
SELECT
|
SELECT
|
||||||
h.id, h.source_id, s.name, s.domain, s.url, s.source_type,
|
h.id, h.source_id, s.name, s.domain, s.url, s.source_type,
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
"""KI-gestützte Quellen-Vorschläge via Haiku."""
|
"""KI-gestützte Quellen-Vorschläge via Haiku."""
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
@@ -12,8 +12,8 @@ logger = logging.getLogger("osint.source_suggester")
|
|||||||
|
|
||||||
|
|
||||||
async def generate_suggestions(db: aiosqlite.Connection) -> int:
|
async def generate_suggestions(db: aiosqlite.Connection) -> int:
|
||||||
"""Generiert Quellen-Vorschläge basierend auf Health-Checks und Lückenanalyse."""
|
"""Generiert Quellen-Vorschläge basierend auf Health-Checks und Lückenanalyse."""
|
||||||
logger.info("Starte Quellen-Vorschläge via Haiku...")
|
logger.info("Starte Quellen-Vorschläge via Haiku...")
|
||||||
|
|
||||||
# 1. Aktuelle Quellen laden
|
# 1. Aktuelle Quellen laden
|
||||||
cursor = await db.execute(
|
cursor = await db.execute(
|
||||||
@@ -33,13 +33,13 @@ async def generate_suggestions(db: aiosqlite.Connection) -> int:
|
|||||||
""")
|
""")
|
||||||
issues = [dict(row) for row in await cursor.fetchall()]
|
issues = [dict(row) for row in await cursor.fetchall()]
|
||||||
|
|
||||||
# 3. Alte pending-Vorschläge entfernen (älter als 30 Tage)
|
# 3. Alte pending-Vorschläge entfernen (älter als 30 Tage)
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"DELETE FROM source_suggestions "
|
"DELETE FROM source_suggestions "
|
||||||
"WHERE status = 'pending' AND created_at < datetime('now', '-30 days')"
|
"WHERE status = 'pending' AND created_at < datetime('now', '-30 days')"
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. Quellen-Zusammenfassung für Haiku
|
# 4. Quellen-Zusammenfassung für Haiku
|
||||||
categories = {}
|
categories = {}
|
||||||
for s in sources:
|
for s in sources:
|
||||||
cat = s["category"]
|
cat = s["category"]
|
||||||
@@ -67,7 +67,7 @@ async def generate_suggestions(db: aiosqlite.Connection) -> int:
|
|||||||
f"{issue['check_type']} = {issue['status']} - {issue['message']}\n"
|
f"{issue['check_type']} = {issue['status']} - {issue['message']}\n"
|
||||||
)
|
)
|
||||||
|
|
||||||
prompt = f"""Du bist ein OSINT-Analyst und verwaltest die Quellensammlung eines Lagebildmonitors für Sicherheitsbehörden.
|
prompt = f"""Du bist ein OSINT-Analyst und verwaltest die Quellensammlung eines Lagebildmonitors für Sicherheitsbehörden.
|
||||||
|
|
||||||
Aktuelle Quellensammlung:{source_summary}{issues_summary}
|
Aktuelle Quellensammlung:{source_summary}{issues_summary}
|
||||||
|
|
||||||
@@ -78,13 +78,13 @@ Beachte:
|
|||||||
2. Fehlende wichtige OSINT-Quellen: Schlage "add_source" mit konkreter RSS-Feed-URL vor
|
2. Fehlende wichtige OSINT-Quellen: Schlage "add_source" mit konkreter RSS-Feed-URL vor
|
||||||
3. Fokus auf deutschsprachige + wichtige internationale Nachrichtenquellen
|
3. Fokus auf deutschsprachige + wichtige internationale Nachrichtenquellen
|
||||||
4. Nur Quellen vorschlagen, die NICHT bereits vorhanden sind
|
4. Nur Quellen vorschlagen, die NICHT bereits vorhanden sind
|
||||||
5. Maximal 5 Vorschläge
|
5. Maximal 5 Vorschläge
|
||||||
|
|
||||||
Antworte NUR mit einem JSON-Array. Jedes Element:
|
Antworte NUR mit einem JSON-Array. Jedes Element:
|
||||||
{{
|
{{
|
||||||
"type": "add_source|deactivate_source|fix_url|remove_source",
|
"type": "add_source|deactivate_source|fix_url|remove_source",
|
||||||
"title": "Kurzer Titel",
|
"title": "Kurzer Titel",
|
||||||
"description": "Begründung",
|
"description": "Begründung",
|
||||||
"priority": "low|medium|high",
|
"priority": "low|medium|high",
|
||||||
"source_id": null,
|
"source_id": null,
|
||||||
"data": {{
|
"data": {{
|
||||||
@@ -104,7 +104,7 @@ Nur das JSON-Array, kein anderer Text."""
|
|||||||
|
|
||||||
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
json_match = re.search(r'\[.*\]', response, re.DOTALL)
|
||||||
if not json_match:
|
if not json_match:
|
||||||
logger.warning("Keine Vorschläge von Haiku erhalten (kein JSON)")
|
logger.warning("Keine Vorschläge von Haiku erhalten (kein JSON)")
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
suggestions = json.loads(json_match.group(0))
|
suggestions = json.loads(json_match.group(0))
|
||||||
@@ -128,12 +128,28 @@ Nur das JSON-Array, kein anderer Text."""
|
|||||||
if not await cursor.fetchone():
|
if not await cursor.fetchone():
|
||||||
source_id = None
|
source_id = None
|
||||||
|
|
||||||
# Duplikat-Check
|
# Duplikat-Check: gleicher Typ + gleiche source_id oder gleiche Domain pending?
|
||||||
cursor = await db.execute(
|
if source_id is not None:
|
||||||
"SELECT id FROM source_suggestions "
|
cursor = await db.execute(
|
||||||
"WHERE title = ? AND status = 'pending'",
|
"SELECT id FROM source_suggestions "
|
||||||
(title,),
|
"WHERE suggestion_type = ? AND source_id = ? AND status = 'pending'",
|
||||||
)
|
(stype, source_id),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Bei add_source ohne source_id: Domain aus suggested_data prüfen
|
||||||
|
check_domain = suggestion.get('data', {}).get('domain', '')
|
||||||
|
if check_domain:
|
||||||
|
cursor = await db.execute(
|
||||||
|
"SELECT id FROM source_suggestions "
|
||||||
|
"WHERE suggestion_type = ? AND suggested_data LIKE ? AND status = 'pending'",
|
||||||
|
(stype, f'%{check_domain}%'),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
cursor = await db.execute(
|
||||||
|
"SELECT id FROM source_suggestions "
|
||||||
|
"WHERE title = ? AND status = 'pending'",
|
||||||
|
(title,),
|
||||||
|
)
|
||||||
if await cursor.fetchone():
|
if await cursor.fetchone():
|
||||||
continue
|
continue
|
||||||
|
|
||||||
@@ -148,14 +164,14 @@ Nur das JSON-Array, kein anderer Text."""
|
|||||||
|
|
||||||
await db.commit()
|
await db.commit()
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Quellen-Vorschläge: {count} neue Vorschläge generiert "
|
f"Quellen-Vorschläge: {count} neue Vorschläge generiert "
|
||||||
f"(Haiku: {usage.input_tokens} in / {usage.output_tokens} out / "
|
f"(Haiku: {usage.input_tokens} in / {usage.output_tokens} out / "
|
||||||
f"${usage.cost_usd:.4f})"
|
f"${usage.cost_usd:.4f})"
|
||||||
)
|
)
|
||||||
return count
|
return count
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Fehler bei Quellen-Vorschlägen: {e}", exc_info=True)
|
logger.error(f"Fehler bei Quellen-Vorschlägen: {e}", exc_info=True)
|
||||||
return 0
|
return 0
|
||||||
|
|
||||||
|
|
||||||
@@ -202,7 +218,7 @@ async def apply_suggestion(
|
|||||||
(url,),
|
(url,),
|
||||||
)
|
)
|
||||||
if await cursor.fetchone():
|
if await cursor.fetchone():
|
||||||
result["action"] = "übersprungen (URL bereits vorhanden)"
|
result["action"] = "übersprungen (URL bereits vorhanden)"
|
||||||
new_status = "rejected"
|
new_status = "rejected"
|
||||||
else:
|
else:
|
||||||
await db.execute(
|
await db.execute(
|
||||||
@@ -214,7 +230,7 @@ async def apply_suggestion(
|
|||||||
)
|
)
|
||||||
result["action"] = f"Quelle '{name}' angelegt"
|
result["action"] = f"Quelle '{name}' angelegt"
|
||||||
else:
|
else:
|
||||||
result["action"] = "übersprungen (keine URL)"
|
result["action"] = "übersprungen (keine URL)"
|
||||||
new_status = "rejected"
|
new_status = "rejected"
|
||||||
|
|
||||||
elif stype == "deactivate_source":
|
elif stype == "deactivate_source":
|
||||||
@@ -226,7 +242,7 @@ async def apply_suggestion(
|
|||||||
)
|
)
|
||||||
result["action"] = "Quelle deaktiviert"
|
result["action"] = "Quelle deaktiviert"
|
||||||
else:
|
else:
|
||||||
result["action"] = "übersprungen (keine source_id)"
|
result["action"] = "übersprungen (keine source_id)"
|
||||||
|
|
||||||
elif stype == "remove_source":
|
elif stype == "remove_source":
|
||||||
source_id = suggestion["source_id"]
|
source_id = suggestion["source_id"]
|
||||||
@@ -234,9 +250,9 @@ async def apply_suggestion(
|
|||||||
await db.execute(
|
await db.execute(
|
||||||
"DELETE FROM sources WHERE id = ?", (source_id,),
|
"DELETE FROM sources WHERE id = ?", (source_id,),
|
||||||
)
|
)
|
||||||
result["action"] = "Quelle gelöscht"
|
result["action"] = "Quelle gelöscht"
|
||||||
else:
|
else:
|
||||||
result["action"] = "übersprungen (keine source_id)"
|
result["action"] = "übersprungen (keine source_id)"
|
||||||
|
|
||||||
elif stype == "fix_url":
|
elif stype == "fix_url":
|
||||||
source_id = suggestion["source_id"]
|
source_id = suggestion["source_id"]
|
||||||
@@ -248,7 +264,7 @@ async def apply_suggestion(
|
|||||||
)
|
)
|
||||||
result["action"] = f"URL aktualisiert auf {new_url}"
|
result["action"] = f"URL aktualisiert auf {new_url}"
|
||||||
else:
|
else:
|
||||||
result["action"] = "übersprungen (keine source_id oder URL)"
|
result["action"] = "übersprungen (keine source_id oder URL)"
|
||||||
|
|
||||||
await db.execute(
|
await db.execute(
|
||||||
"UPDATE source_suggestions SET status = ?, reviewed_at = CURRENT_TIMESTAMP "
|
"UPDATE source_suggestions SET status = ?, reviewed_at = CURRENT_TIMESTAMP "
|
||||||
|
|||||||
In neuem Issue referenzieren
Einen Benutzer sperren