diff --git a/src/agents/claude_client.py b/src/agents/claude_client.py index e624c90..a79d72f 100644 --- a/src/agents/claude_client.py +++ b/src/agents/claude_client.py @@ -13,6 +13,35 @@ _cancel_event_var: contextvars.ContextVar[asyncio.Event | None] = contextvars.Co logger = logging.getLogger("osint.claude_client") +class ClaudeCliError(RuntimeError): + """Strukturierter Fehler aus dem Claude CLI mit Kategorie. + + error_type: + - "rate_limit": Anthropic Rate-Limit oder Overload (transient, retry-tauglich) + - "auth_error": Account-Problem (Organisation hat keinen Claude-Zugang, + Token abgelaufen/ungueltig) - kein Retry sinnvoll, Admin-Aktion noetig + - "timeout": Claude CLI Timeout (transient) + - "cli_error": Sonstiger CLI-Fehler (unspezifisch, Default) + """ + + def __init__(self, error_type: str, message: str): + self.error_type = error_type + self.message = message + super().__init__(f"Claude CLI [{error_type}]: {message}") + + +def _classify_cli_error(combined_output: str) -> str: + """Ordnet einer Fehler-Ausgabe eine error_type-Kategorie zu.""" + txt = combined_output.lower() + rate_limit_keywords = ["hit your limit", "rate limit", "resets", "rate_limit", "overloaded"] + auth_error_keywords = ["does not have access", "login again", "contact your administrator"] + if any(kw in txt for kw in rate_limit_keywords): + return "rate_limit" + if any(kw in txt for kw in auth_error_keywords): + return "auth_error" + return "cli_error" + + @dataclass class ClaudeUsage: """Token-Verbrauch eines einzelnen Claude CLI Aufrufs.""" @@ -121,19 +150,20 @@ async def call_claude(prompt: str, tools: str | None = "WebSearch,WebFetch", mod error_msg = stderr.decode("utf-8", errors="replace").strip() stdout_msg = stdout.decode("utf-8", errors="replace").strip() - # Rate-Limit-Fehler kommen als JSON auf stdout, nicht auf stderr - error_type = "cli_error" - rate_limit_keywords = ["hit your limit", "rate limit", "resets", "rate_limit", "overloaded"] - combined_output = f"{error_msg} 
{stdout_msg}".lower() - if any(kw in combined_output for kw in rate_limit_keywords): - error_type = "rate_limit" + # Rate-Limit/Auth-Fehler kommen teils als JSON auf stdout, nicht auf stderr + combined_output = f"{error_msg} {stdout_msg}" + error_type = _classify_cli_error(combined_output) + + if error_type == "rate_limit": logger.warning(f"Claude CLI Rate-Limit (Exit {process.returncode}): {stdout_msg or error_msg}") + elif error_type == "auth_error": + logger.error(f"Claude CLI Auth-Fehler (Exit {process.returncode}): {stdout_msg or error_msg}") else: logger.error(f"Claude CLI Fehler (Exit {process.returncode}): {error_msg}") if stdout_msg: logger.error(f"Claude CLI stdout bei Fehler: {stdout_msg[:500]}") - raise RuntimeError(f"Claude CLI Fehler [{error_type}]: {stdout_msg or error_msg}") + raise ClaudeCliError(error_type, stdout_msg or error_msg) raw = stdout.decode("utf-8", errors="replace").strip() usage = ClaudeUsage() @@ -141,6 +171,19 @@ async def call_claude(prompt: str, tools: str | None = "WebSearch,WebFetch", mod try: data = json.loads(raw) + # CLI kann returncode=0 liefern und trotzdem is_error=true setzen + # (z.B. 
"Your organization does not have access to Claude") + if data.get("is_error"): + error_text = str(data.get("result", "")) + error_type = _classify_cli_error(error_text) + if error_type == "rate_limit": + logger.warning(f"Claude CLI Rate-Limit (is_error): {error_text}") + elif error_type == "auth_error": + logger.error(f"Claude CLI Auth-Fehler (is_error): {error_text}") + else: + logger.error(f"Claude CLI Fehler (is_error): {error_text}") + raise ClaudeCliError(error_type, error_text) + result_text = data.get("result", raw) u = data.get("usage", {}) usage = ClaudeUsage( diff --git a/src/agents/orchestrator.py b/src/agents/orchestrator.py index 23c5247..997bfeb 100644 --- a/src/agents/orchestrator.py +++ b/src/agents/orchestrator.py @@ -527,8 +527,12 @@ class AgentOrchestrator: RETRY_DELAYS = [0, 120, 300] # Sekunden: sofort, 2min, 5min TRANSIENT_ERRORS = (asyncio.TimeoutError, TimeoutError, ConnectionError, OSError) + from agents.claude_client import ClaudeCliError last_error = None + def _is_transient_cli(err: Exception) -> bool: + return isinstance(err, ClaudeCliError) and err.error_type in ("rate_limit", "timeout") + try: # Research-Lagen: Automatisch 3 Durchläufe nur beim ersten Refresh incident_type, has_summary = await self._get_incident_info(incident_id) @@ -557,32 +561,44 @@ class AgentOrchestrator: }, _vis, _cb, _tid) last_error = None break - except TRANSIENT_ERRORS as e: - last_error = e - logger.warning(f"Transienter Fehler bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}") - if attempt < 2: - await self._mark_refresh_failed(incident_id, str(e)) - delay = RETRY_DELAYS[attempt + 1] - logger.info(f"Retry in {delay}s für Lage {incident_id}") - # Retry-Status per WebSocket senden - if self._ws_manager: - try: - _vis, _cb, _tid = await self._get_incident_visibility(incident_id) - except Exception: - _vis, _cb, _tid = "public", None, None - await self._ws_manager.broadcast_for_incident({ - "type": "status_update", - "incident_id": incident_id, - "data": 
{"status": "retrying", "attempt": attempt + 1, "delay": delay}, - }, _vis, _cb, _tid) - await asyncio.sleep(delay) - else: - await self._mark_refresh_failed(incident_id, f"Endgültig fehlgeschlagen nach 3 Versuchen: {e}") except Exception as e: + # Auth/CLI-Fehler: sofort abbrechen, kein Retry sinnvoll + if isinstance(e, ClaudeCliError) and e.error_type in ("auth_error", "cli_error"): + last_error = e + logger.error(f"Permanenter Claude-Fehler [{e.error_type}] bei Lage {incident_id}: {e}") + await self._mark_refresh_failed(incident_id, str(e)) + break + + # Transiente Fehler: Retry bis 3x + if isinstance(e, TRANSIENT_ERRORS) or _is_transient_cli(e): + last_error = e + kind = e.error_type if isinstance(e, ClaudeCliError) else type(e).__name__ + logger.warning(f"Transienter Fehler [{kind}] bei Lage {incident_id} (Versuch {attempt + 1}/3): {e}") + if attempt < 2: + await self._mark_refresh_failed(incident_id, str(e)) + delay = RETRY_DELAYS[attempt + 1] + logger.info(f"Retry in {delay}s für Lage {incident_id}") + if self._ws_manager: + try: + _vis, _cb, _tid = await self._get_incident_visibility(incident_id) + except Exception: + _vis, _cb, _tid = "public", None, None + await self._ws_manager.broadcast_for_incident({ + "type": "status_update", + "incident_id": incident_id, + "data": {"status": "retrying", "attempt": attempt + 1, "delay": delay}, + }, _vis, _cb, _tid) + await asyncio.sleep(delay) + continue + else: + await self._mark_refresh_failed(incident_id, f"Endgültig fehlgeschlagen nach 3 Versuchen: {e}") + break + + # Alles andere: permanent last_error = e logger.error(f"Permanenter Fehler bei Refresh für Lage {incident_id}: {e}") await self._mark_refresh_failed(incident_id, str(e)) - break # Permanenter Fehler, kein Retry + break if last_error and self._ws_manager: try: diff --git a/src/routers/chat.py b/src/routers/chat.py index 93fc0a1..30c1277 100644 --- a/src/routers/chat.py +++ b/src/routers/chat.py @@ -15,7 +15,7 @@ from config import CLAUDE_PATH, 
CLAUDE_MODEL_FAST from database import db_dependency from middleware.license_check import require_writable_license from services.license_service import charge_usage_to_tenant -from agents.claude_client import ClaudeUsage +from agents.claude_client import ClaudeUsage, ClaudeCliError, _classify_cli_error import aiosqlite logger = logging.getLogger("osint.chat") @@ -59,10 +59,11 @@ async def _call_claude_chat(prompt: str) -> tuple[str, int, ClaudeUsage]: if process.returncode != 0: err_msg = stderr.decode("utf-8", errors="replace").strip() - logger.error(f"Chat Claude CLI Fehler (rc={process.returncode}): {err_msg[:500]}") - if "rate_limit" in err_msg.lower() or "overloaded" in err_msg.lower(): - raise RuntimeError("rate_limit") - raise RuntimeError(f"Claude CLI Fehler: {err_msg[:200]}") + stdout_msg = stdout.decode("utf-8", errors="replace").strip() + combined = f"{err_msg} {stdout_msg}" + error_type = _classify_cli_error(combined) + logger.error(f"Chat Claude CLI Fehler [{error_type}] (rc={process.returncode}): {(stdout_msg or err_msg)[:500]}") + raise ClaudeCliError(error_type, stdout_msg or err_msg) raw = stdout.decode("utf-8", errors="replace").strip() duration_ms = 0 @@ -71,6 +72,12 @@ async def _call_claude_chat(prompt: str) -> tuple[str, int, ClaudeUsage]: try: data = _json.loads(raw) + if data.get("is_error"): + error_text = str(data.get("result", "")) + error_type = _classify_cli_error(error_text) + logger.error(f"Chat Claude CLI Fehler [{error_type}] (is_error): {error_text[:500]}") + raise ClaudeCliError(error_type, error_text) + result_text = data.get("result", raw) duration_ms = data.get("duration_ms", 0) u = data.get("usage", {}) @@ -437,11 +444,15 @@ async def chat( result, duration_ms, usage = await _call_claude_chat(prompt) except TimeoutError: raise HTTPException(status_code=504, detail="Der Assistent antwortet gerade nicht. 
Bitte versuche es erneut.") - except RuntimeError as e: - error_str = str(e) - if "rate_limit" in error_str: + except ClaudeCliError as e: + if e.error_type == "rate_limit": raise HTTPException(status_code=429, detail="Der Assistent ist gerade ausgelastet. Bitte versuche es in einer Minute erneut.") - logger.error(f"Chat Claude-Fehler: {e}") + if e.error_type == "auth_error": + raise HTTPException(status_code=503, detail="KI-Zugang aktuell nicht verfuegbar. Bitte Administrator kontaktieren.") + logger.error(f"Chat Claude-Fehler [{e.error_type}]: {e}") + raise HTTPException(status_code=502, detail="Der Assistent ist voruebergehend nicht erreichbar.") + except RuntimeError as e: + logger.error(f"Chat Claude-Fehler (unspezifisch): {e}") raise HTTPException(status_code=502, detail="Der Assistent ist voruebergehend nicht erreichbar.") # Credits buchen diff --git a/src/routers/incidents.py b/src/routers/incidents.py index fd02e8c..42276ea 100644 --- a/src/routers/incidents.py +++ b/src/routers/incidents.py @@ -245,7 +245,7 @@ async def enhance_description( db: aiosqlite.Connection = Depends(db_dependency), ): """Generiert eine strukturierte Beschreibung per KI aus dem Titel.""" - from agents.claude_client import call_claude + from agents.claude_client import call_claude, ClaudeCliError from config import CLAUDE_MODEL_FAST from services.license_service import charge_usage_to_tenant @@ -255,17 +255,30 @@ async def enhance_description( try: result, usage = await call_claude(prompt, tools=None, model=CLAUDE_MODEL_FAST, raw_text=True) - _enhance_logger.info( - f"Beschreibung generiert fuer \"{data.title[:50]}\": " - f"{usage.input_tokens}in/{usage.output_tokens}out" - ) - await charge_usage_to_tenant(db, current_user.get("tenant_id"), usage, source="enhance") - await db.commit() - return {"description": result.strip()} + except ClaudeCliError as e: + _enhance_logger.error(f"Beschreibung generieren: ClaudeCliError [{e.error_type}]: {e.message}") + if e.error_type == 
"auth_error": + raise HTTPException(status_code=503, detail="KI-Zugang aktuell nicht verfuegbar. Bitte Administrator kontaktieren.") + if e.error_type == "rate_limit": + raise HTTPException(status_code=429, detail="KI ist gerade ausgelastet. Bitte in einer Minute erneut versuchen.") + raise HTTPException(status_code=500, detail="Beschreibung konnte nicht generiert werden") + except TimeoutError: + _enhance_logger.error("Beschreibung generieren: Timeout") + raise HTTPException(status_code=504, detail="Die KI antwortet gerade nicht. Bitte erneut versuchen.") + except HTTPException: + raise except Exception as e: _enhance_logger.error(f"Beschreibung generieren fehlgeschlagen: {e}") raise HTTPException(status_code=500, detail="Beschreibung konnte nicht generiert werden") + _enhance_logger.info( + f"Beschreibung generiert fuer \"{data.title[:50]}\": " + f"{usage.input_tokens}in/{usage.output_tokens}out" + ) + await charge_usage_to_tenant(db, current_user.get("tenant_id"), usage, source="enhance") + await db.commit() + return {"description": result.strip()} + @router.get("/{incident_id}", response_model=IncidentResponse) async def get_incident( diff --git a/src/static/dashboard.html b/src/static/dashboard.html index 8e2af02..6f5defe 100644 --- a/src/static/dashboard.html +++ b/src/static/dashboard.html @@ -624,11 +624,11 @@ - + - + diff --git a/src/static/js/api.js b/src/static/js/api.js index eb8e1a1..e841a05 100644 --- a/src/static/js/api.js +++ b/src/static/js/api.js @@ -1,6 +1,16 @@ /** * API-Client für den OSINT Lagemonitor. 
/**
 * Error raised by the API client for an unsuccessful HTTP response.
 *
 * Carries the HTTP status next to the server-provided detail so callers can
 * branch on `err.status` instead of parsing the message text.
 *
 * @property {number} status HTTP status code of the failed response.
 * @property {*} detail Detail payload exactly as passed in (unchanged).
 */
class ApiError extends Error {
    /**
     * @param {number} status HTTP status code of the failed response.
     * @param {*} [detail] Server-provided detail; when missing or empty the
     *     message falls back to `Fehler <status>`.
     */
    constructor(status, detail) {
        super(ApiError.messageFor(status, detail));
        this.name = 'ApiError';
        this.status = status;
        this.detail = detail;
    }

    /**
     * Build a readable message for the given status/detail pair.
     *
     * Structured details (objects, arrays) are serialised as JSON so the
     * message never degrades to "[object Object]".
     *
     * @param {number} status HTTP status code.
     * @param {*} detail Detail payload, may be missing.
     * @returns {string} Message text for the error.
     */
    static messageFor(status, detail) {
        const fallback = `Fehler ${status}`;
        if (!detail) {
            return fallback;
        }
        if (typeof detail === 'object') {
            try {
                return JSON.stringify(detail) || fallback;
            } catch (err) {
                // Not serialisable (e.g. circular structure): use the generic text.
                return fallback;
            }
        }
        return String(detail);
    }
}