commit d9c99421974c13719a368fa98bc759be5988de99 Author: Claude Project Manager Date: Sat Jul 12 22:29:41 2025 +0200 Initial commit diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..f35455e --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,12 @@ +{ + "permissions": { + "allow": [ + "Bash(mkdir:*)", + "Bash(cp:*)", + "Bash(python:*)", + "Bash(find:*)", + "Bash(rm:*)" + ], + "deny": [] + } +} \ No newline at end of file diff --git a/CLAUDE_PROJECT_README.md b/CLAUDE_PROJECT_README.md new file mode 100644 index 0000000..d6005d1 --- /dev/null +++ b/CLAUDE_PROJECT_README.md @@ -0,0 +1,78 @@ +# Toolbox-Webseiten-Crawler + +*This README was automatically generated by Claude Project Manager* + +## Project Overview + +- **Path**: `C:/Users/hendr/Desktop/IntelSight/Projektablage/Toolbox-Webseiten-Crawler` +- **Files**: 23 files +- **Size**: 98.9 KB +- **Last Modified**: 2025-07-12 20:11 + +## Technology Stack + +### Languages +- Batch +- Python + +## Project Structure + +``` +CLAUDE_PROJECT_README.md +install_dependencies.bat +main.py +requirements.txt +start.bat +src/ + ├── __init__.py + ├── core/ + │ ├── web_crawler.py + │ └── __init__.py + ├── resources/ + │ ├── icons/ + │ │ ├── check.svg + │ │ ├── download.svg + │ │ ├── folder.svg + │ │ ├── gear.svg + │ │ ├── globe.svg + │ │ ├── moon.svg + │ │ └── sun.svg + │ └── styles/ + │ ├── dark_theme.py + │ └── light_theme.py + ├── ui/ + │ ├── custom_widgets.py + │ ├── main_window.py + │ └── __init__.py + └── utils/ + ├── local_server.py + ├── pdf_report.py + └── __init__.py +``` + +## Key Files + +- `requirements.txt` + +## Claude Integration + +This project is managed with Claude Project Manager. To work with this project: + +1. Open Claude Project Manager +2. Click on this project's tile +3. 
Claude will open in the project directory + +## Notes + +*Add your project-specific notes here* + +--- + +## Development Log + +- README generated on 2025-07-11 21:27:29 +- README updated on 2025-07-11 21:27:36 +- README updated on 2025-07-12 12:18:47 +- README updated on 2025-07-12 20:10:48 +- README updated on 2025-07-12 20:11:07 +- README updated on 2025-07-12 20:11:21 diff --git a/install_dependencies.bat b/install_dependencies.bat new file mode 100644 index 0000000..0e9dcdd --- /dev/null +++ b/install_dependencies.bat @@ -0,0 +1,6 @@ +@echo off +echo Installing dependencies for IntelSight Webseiten-Crawler... +pip install -r requirements.txt +echo. +echo Installation complete! +pause \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..5d6e235 --- /dev/null +++ b/main.py @@ -0,0 +1,37 @@ +import sys +import os +from PyQt6.QtWidgets import QApplication +from PyQt6.QtGui import QIcon, QFontDatabase +from PyQt6.QtCore import Qt + +# Füge src zum Python-Pfad hinzu +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +from ui.main_window import WebsiteCrawlerWindow + + +def main(): + # High DPI Support + if hasattr(Qt.HighDpiScaleFactorRoundingPolicy, 'PassThrough'): + QApplication.setHighDpiScaleFactorRoundingPolicy( + Qt.HighDpiScaleFactorRoundingPolicy.PassThrough + ) + + app = QApplication(sys.argv) + app.setApplicationName("IntelSight Webseiten-Crawler") + app.setOrganizationName("IntelSight") + + # Setze App-Icon wenn vorhanden + icon_path = os.path.join(os.path.dirname(__file__), 'src', 'resources', 'icons', 'globe.svg') + if os.path.exists(icon_path): + app.setWindowIcon(QIcon(icon_path)) + + # Hauptfenster erstellen und anzeigen + window = WebsiteCrawlerWindow() + window.show() + + sys.exit(app.exec()) + + +if __name__ == '__main__': + main() \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..491470f --- /dev/null +++ b/requirements.txt 
@@ -0,0 +1,12 @@ +PyQt6==6.6.1 +PyQt6-Qt6==6.6.1 +PyQt6-sip==13.6.0 +beautifulsoup4==4.12.3 +requests==2.31.0 +lxml==5.1.0 +selenium==4.18.1 +wget==3.2 +urllib3==2.2.1 +pywebcopy==7.0.2 +reportlab==4.0.9 +chardet==5.2.0 \ No newline at end of file diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/core/__init__.py b/src/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/core/__pycache__/__init__.cpython-310.pyc b/src/core/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..967027e Binary files /dev/null and b/src/core/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/core/__pycache__/web_crawler.cpython-310.pyc b/src/core/__pycache__/web_crawler.cpython-310.pyc new file mode 100644 index 0000000..7544fa5 Binary files /dev/null and b/src/core/__pycache__/web_crawler.cpython-310.pyc differ diff --git a/src/core/web_crawler.py b/src/core/web_crawler.py new file mode 100644 index 0000000..3ae7249 --- /dev/null +++ b/src/core/web_crawler.py @@ -0,0 +1,1027 @@ +import os +import requests +from bs4 import BeautifulSoup +from urllib.parse import urljoin, urlparse, urlunparse +import shutil +from pathlib import Path +import re +import time +import random +from typing import Optional, Callable, Set, Dict, Tuple +import hashlib +from datetime import datetime + + +class WebCrawler: + def __init__(self, human_behavior: bool = True): + self.session = requests.Session() + self.human_behavior = human_behavior + # Verschiedene User Agents für zufällige Auswahl + self.user_agents = [ + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 
(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36' + ] + self._set_random_user_agent() + self.progress_callback: Optional[Callable[[int], None]] = None + self.status_callback: Optional[Callable[[str], None]] = None + self.visited_urls: Set[str] = set() + self.downloaded_resources: Dict[str, str] = {} + self.skipped_urls: Dict[str, str] = {} # URL -> Grund + self.start_time: Optional[datetime] = None + self.end_time: Optional[datetime] = None + self.current_base_url: Optional[str] = None + self.url_mapping: Dict[str, str] = {} # Original URL -> lokaler Pfad für Links + + def _set_random_user_agent(self): + """Setzt einen zufälligen User Agent""" + user_agent = random.choice(self.user_agents) + self.session.headers.update({ + 'User-Agent': user_agent, + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'de-DE,de;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + 'DNT': '1', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1' + }) + + def _emit_status(self, message: str): + if self.status_callback: + self.status_callback(message) + + def _emit_progress(self, value: int): + if self.progress_callback: + self.progress_callback(value) + + def _human_delay(self, base_delay: float = 1.0, variation: float = 0.5): + """Simuliert menschliche Verzögerungen mit Zufälligkeit""" + if not self.human_behavior: + # Minimale Verzögerung wenn menschliches Verhalten deaktiviert + time.sleep(0.1) + return + + # Basis-Verzögerung mit zufälliger Variation + delay = base_delay + random.uniform(-variation, variation) + + # Gelegentlich längere Pausen (simuliert Lesen/Nachdenken) + if random.random() < 0.1: # 10% Chance auf längere Pause + delay += random.uniform(2, 5) + # Keine Status-Meldung mehr für künstliche Pausen + + # Sehr selten extra lange Pausen (simuliert Ablenkung) + if random.random() < 0.02: 
# 2% Chance + delay += random.uniform(5, 10) + # Keine Status-Meldung mehr für künstliche Pausen + + time.sleep(max(0.5, delay)) # Mindestens 0.5 Sekunden + + def _get_extension_from_content_type(self, content_type: str) -> str: + """Bestimmt die Dateiendung basierend auf dem Content-Type""" + return { + 'image/jpeg': '.jpg', + 'image/png': '.png', + 'image/gif': '.gif', + 'image/svg+xml': '.svg', + 'image/webp': '.webp', + 'image/x-icon': '.ico', + 'image/vnd.microsoft.icon': '.ico', + 'text/css': '.css', + 'application/javascript': '.js', + 'text/javascript': '.js', + 'application/x-javascript': '.js', + 'video/mp4': '.mp4', + 'video/webm': '.webm', + 'font/woff': '.woff', + 'font/woff2': '.woff2', + 'font/ttf': '.ttf', + 'font/otf': '.otf', + 'application/font-woff': '.woff', + 'application/font-woff2': '.woff2', + 'application/x-font-ttf': '.ttf', + 'application/x-font-otf': '.otf', + 'application/x-font-woff': '.woff', + 'application/vnd.ms-fontobject': '.eot' + }.get(content_type, '.dat') + + def _process_css_file(self, css_path: str, css_url: str, base_path: str): + """Verarbeitet CSS-Dateien und lädt referenzierte Ressourcen herunter""" + css_content = None + used_encoding = None + + # Versuche verschiedene Encodings + encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'windows-1252', 'cp1252'] + + for encoding in encodings: + try: + with open(css_path, 'r', encoding=encoding) as f: + css_content = f.read() + used_encoding = encoding + break + except UnicodeDecodeError: + continue + + # Falls kein Encoding funktioniert, lies als Binärdatei + if css_content is None: + try: + with open(css_path, 'rb') as f: + # Versuche automatische Erkennung oder nutze latin-1 als Fallback + raw_content = f.read() + try: + # Versuche chardet wenn verfügbar + import chardet + detected = chardet.detect(raw_content) + if detected['encoding']: + css_content = raw_content.decode(detected['encoding']) + used_encoding = detected['encoding'] + except: + # Fallback: latin-1 kann alles 
decodieren + css_content = raw_content.decode('latin-1', errors='replace') + used_encoding = 'latin-1' + except Exception as e: + self._emit_status(f"Fehler beim Lesen von CSS {css_path}: {str(e)}") + return + + if not css_content: + return + + try: + # Finde alle URLs in der CSS-Datei + url_pattern = r'url\(["\']?([^"\'()]+)["\']?\)' + urls = re.findall(url_pattern, css_content) + + for url in urls: + # Skip data URLs + if url.startswith('data:'): + continue + + abs_url = urljoin(css_url, url) + + # Bestimme Ressourcentyp basierend auf Erweiterung + ext = os.path.splitext(urlparse(abs_url).path)[1].lower() + if ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot']: + resource_type = 'fonts' + elif ext in ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico']: + resource_type = 'images' + else: + resource_type = 'css' # Für andere CSS-Dateien (@import) + + local_path = self._download_resource(abs_url, base_path, resource_type) + if local_path: + # Ersetze URL in CSS + rel_path = os.path.relpath(local_path, os.path.dirname(css_path)).replace('\\', '/') + css_content = css_content.replace(url, rel_path) + + # Schreibe aktualisierte CSS zurück mit gleichem Encoding + with open(css_path, 'w', encoding=used_encoding or 'utf-8') as f: + f.write(css_content) + + except Exception as e: + self._emit_status(f"Fehler beim Verarbeiten von CSS {css_path}: {str(e)}") + + def _simulate_mouse_movement(self): + """Simuliert Mausbewegungen durch zufällige kurze Pausen""" + if self.human_behavior and random.random() < 0.3: # 30% Chance + time.sleep(random.uniform(0.1, 0.3)) + + def _sanitize_filename(self, url: str) -> str: + parsed = urlparse(url) + path = parsed.path.strip('/') + + if not path: + path = 'index' + + # Ersetze ungültige Zeichen + path = re.sub(r'[<>:"|?*]', '_', path) + + # Füge .html hinzu wenn keine Erweiterung vorhanden + if not os.path.splitext(path)[1]: + path += '.html' + + return path + + def _download_resource(self, url: str, base_path: str, resource_type: 
str = 'page') -> Optional[str]: + if url in self.downloaded_resources: + return self.downloaded_resources[url] + + try: + # Menschenähnliche Verzögerung vor dem Request + if resource_type == 'page': + self._human_delay(base_delay=1.5, variation=0.8) + else: + # Ressourcen werden schneller geladen, aber trotzdem mit Variation + self._human_delay(base_delay=0.3, variation=0.2) + + # Gelegentlich User Agent wechseln + if self.human_behavior and random.random() < 0.05: # 5% Chance + self._set_random_user_agent() + + # Simuliere gelegentliches Neuladen der Seite + if self.human_behavior and resource_type == 'page' and random.random() < 0.03: # 3% Chance + self._emit_status("Seite wird neu geladen...") + time.sleep(random.uniform(0.5, 1.5)) + + response = self.session.get(url, timeout=30) + + # Bei 403/404 Fehlern überspringen statt abbrechen + if response.status_code in [403, 404]: + self._emit_status(f"Überspringe {url} (HTTP {response.status_code})") + self.skipped_urls[url] = f"HTTP {response.status_code}" + return None + + response.raise_for_status() + + # Simuliere Scroll-Verhalten nach dem Laden + if resource_type == 'page': + self._simulate_mouse_movement() + + # Bestimme den lokalen Pfad + parsed = urlparse(url) + + # Prüfe ob es eine externe Ressource ist + is_external = parsed.netloc and parsed.netloc != urlparse(self.current_base_url).netloc + + if is_external: + # Für externe Ressourcen: Erstelle Ordnerstruktur nach Domain + domain = parsed.netloc.replace(':', '_').replace('.', '_') + if resource_type == 'page': + resource_dir = os.path.join(base_path, 'external', domain) + else: + resource_dir = os.path.join(base_path, 'resources', 'external', domain, resource_type) + os.makedirs(resource_dir, exist_ok=True) + + # Hash für eindeutige Dateinamen bei externen Ressourcen + url_hash = hashlib.md5(url.encode()).hexdigest()[:8] + filename = self._sanitize_filename(url) + ext = os.path.splitext(filename)[1] + + # Bestimme Erweiterung aus Content-Type wenn nötig + 
if not ext and resource_type != 'page': + content_type = response.headers.get('Content-Type', '').split(';')[0] + ext = self._get_extension_from_content_type(content_type) + + filename = f"{url_hash}{ext}" + local_path = os.path.join(resource_dir, filename) + else: + # Interne Ressourcen + filename = self._sanitize_filename(url) + if resource_type != 'page': + # Für Ressourcen einen separaten Ordner verwenden + resource_dir = os.path.join(base_path, 'resources', resource_type) + os.makedirs(resource_dir, exist_ok=True) + + # Hash für eindeutige Dateinamen + url_hash = hashlib.md5(url.encode()).hexdigest()[:8] + ext = os.path.splitext(filename)[1] or '.dat' + filename = f"{url_hash}{ext}" + local_path = os.path.join(resource_dir, filename) + else: + local_path = os.path.join(base_path, filename) + + # Schreibe Datei + os.makedirs(os.path.dirname(local_path), exist_ok=True) + + if resource_type == 'page': + # HTML verarbeiten + content = response.text + with open(local_path, 'w', encoding='utf-8') as f: + f.write(content) + elif resource_type == 'css': + # CSS-Dateien mit korrektem Encoding speichern + # Versuche das Encoding aus der Response zu bekommen + encoding = response.encoding + if not encoding or encoding == 'ISO-8859-1': + # Oft wird ISO-8859-1 als Standard zurückgegeben, auch wenn es nicht stimmt + # Versuche das Encoding aus dem Content-Type Header zu bekommen + content_type = response.headers.get('Content-Type', '') + if 'charset=' in content_type: + encoding = content_type.split('charset=')[-1].strip() + else: + # Fallback: Versuche zu erkennen + try: + import chardet + detected = chardet.detect(response.content) + if detected['encoding']: + encoding = detected['encoding'] + except: + encoding = 'utf-8' + + # Speichere CSS als Text mit erkanntem Encoding + try: + content = response.content.decode(encoding) + with open(local_path, 'w', encoding='utf-8') as f: + f.write(content) + except: + # Fallback: Speichere als Binärdatei + with open(local_path, 
'wb') as f: + f.write(response.content) + else: + # Andere Ressourcen als Binärdaten + with open(local_path, 'wb') as f: + f.write(response.content) + + self.downloaded_resources[url] = local_path + self._emit_status(f"Heruntergeladen: {url}") + + # CSS-Dateien parsen für weitere Ressourcen + if resource_type == 'css' and local_path: + self._process_css_file(local_path, url, base_path) + + # Aktualisiere Fortschritt beim erfolgreichen Download + if local_path and hasattr(self, 'processed_resources'): + self.processed_resources += 1 + progress = min(95, int((self.processed_resources / self.total_resources_estimate) * 90) + 5) + self._emit_progress(progress) + + return local_path + + except requests.exceptions.HTTPError as e: + if e.response.status_code in [403, 404, 401, 429, 503]: + self._emit_status(f"Überspringe {url} (HTTP {e.response.status_code})") + else: + self._emit_status(f"HTTP-Fehler bei {url}: {str(e)}") + return None + except requests.exceptions.ConnectionError: + self._emit_status(f"Verbindungsfehler bei {url} - Überspringe...") + return None + except requests.exceptions.Timeout: + self._emit_status(f"Zeitüberschreitung bei {url} - Überspringe...") + return None + except Exception as e: + self._emit_status(f"Fehler beim Download von {url}: {str(e)}") + return None + + def _process_html(self, html_content: str, base_url: str, base_path: str, + download_images: bool, download_css: bool, download_js: bool, + download_videos: bool = True) -> str: + soup = BeautifulSoup(html_content, 'html.parser') + + # Favicon herunterladen + for link in soup.find_all('link', rel=lambda x: x and ('icon' in str(x) or 'shortcut' in str(x))): + if link.get('href'): + href_url = link['href'] + if href_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{href_url}" + else: + abs_url = urljoin(base_url, href_url) + local_path = self._download_resource(abs_url, base_path, 'images') + if local_path: + link['href'] = 
os.path.relpath(local_path, base_path).replace('\\', '/') + + # CSS verarbeiten + if download_css: + for link in soup.find_all('link', {'rel': 'stylesheet'}): + if link.get('href'): + href_url = link['href'] + if href_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{href_url}" + else: + abs_url = urljoin(base_url, href_url) + local_path = self._download_resource(abs_url, base_path, 'css') + if local_path: + # Stelle sicher, dass der Pfad korrekt relativ ist + rel_path = os.path.relpath(local_path, base_path) + # Konvertiere Windows-Pfade zu Web-Pfaden + link['href'] = rel_path.replace('\\', '/') + + # Inline CSS mit @import Regeln und background-images + for style in soup.find_all('style'): + if style.string: + css_content = style.string + + # Verarbeite @import Regeln + imports = re.findall(r'@import\s+url\(["\']?([^"\'()]+)["\']?\)', css_content) + for imp_url in imports: + abs_url = urljoin(base_url, imp_url) + local_path = self._download_resource(abs_url, base_path, 'css') + if local_path: + rel_path = os.path.relpath(local_path, base_path).replace('\\', '/') + css_content = css_content.replace(imp_url, rel_path) + + # Verarbeite alle URLs in Inline-CSS (inkl. 
background-images) + urls = re.findall(r'url\(["\']?([^"\'()]+)["\']?\)', css_content) + for url in urls: + if url.startswith('data:') or url in imports: + continue + + abs_url = urljoin(base_url, url) + # Bestimme Ressourcentyp + ext = os.path.splitext(urlparse(abs_url).path)[1].lower() + if ext in ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico']: + resource_type = 'images' + elif ext in ['.woff', '.woff2', '.ttf', '.otf', '.eot']: + resource_type = 'fonts' + else: + resource_type = 'css' + + local_path = self._download_resource(abs_url, base_path, resource_type) + if local_path: + rel_path = os.path.relpath(local_path, base_path).replace('\\', '/') + css_content = css_content.replace(url, rel_path) + + style.string = css_content + + # JavaScript verarbeiten + if download_js: + for script in soup.find_all('script', {'src': True}): + src_url = script['src'] + if src_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{src_url}" + else: + abs_url = urljoin(base_url, src_url) + local_path = self._download_resource(abs_url, base_path, 'js') + if local_path: + script['src'] = os.path.relpath(local_path, base_path).replace('\\', '/') + + # Bilder verarbeiten + if download_images: + for img in soup.find_all('img'): + if img.get('src'): + src_url = img['src'] + # Behandle relative und absolute Pfade + if src_url.startswith('/'): + # Absoluter Pfad zur Domain - füge Domain hinzu + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{src_url}" + else: + # Relativer Pfad oder vollständige URL + abs_url = urljoin(base_url, src_url) + + local_path = self._download_resource(abs_url, base_path, 'images') + if local_path: + img['src'] = os.path.relpath(local_path, base_path).replace('\\', '/') + + # srcset für responsive Bilder + if img.get('srcset'): + new_srcset = [] + for src_desc in img['srcset'].split(','): + parts = src_desc.strip().split(' ') + if parts: + src_url = 
parts[0] + # Behandle relative und absolute Pfade + if src_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{src_url}" + else: + abs_url = urljoin(base_url, src_url) + + local_path = self._download_resource(abs_url, base_path, 'images') + if local_path: + rel_path = os.path.relpath(local_path, base_path).replace('\\', '/') + parts[0] = rel_path + new_srcset.append(' '.join(parts)) + else: + new_srcset.append(src_desc) + img['srcset'] = ', '.join(new_srcset) + + # Picture source tags verarbeiten + for picture in soup.find_all('picture'): + for source in picture.find_all('source'): + if source.get('srcset'): + new_srcset = [] + for src_desc in source['srcset'].split(','): + parts = src_desc.strip().split(' ') + if parts: + src_url = parts[0] + # Behandle relative und absolute Pfade + if src_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{src_url}" + else: + abs_url = urljoin(base_url, src_url) + + local_path = self._download_resource(abs_url, base_path, 'images') + if local_path: + rel_path = os.path.relpath(local_path, base_path).replace('\\', '/') + parts[0] = rel_path + new_srcset.append(' '.join(parts)) + else: + new_srcset.append(src_desc) + source['srcset'] = ', '.join(new_srcset) + + # Auch Background-Images in style-Attributen + for element in soup.find_all(style=True): + style = element['style'] + urls = re.findall(r'url\(["\']?([^"\'()]+)["\']?\)', style) + for url in urls: + # Behandle relative und absolute Pfade + if url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{url}" + else: + abs_url = urljoin(base_url, url) + + local_path = self._download_resource(abs_url, base_path, 'images') + if local_path: + rel_path = os.path.relpath(local_path, base_path).replace('\\', '/') + style = style.replace(url, rel_path) + element['style'] = style + + # Videos verarbeiten + if 
download_videos: + # HTML5 video tags + for video in soup.find_all('video'): + # Video source tags + for source in video.find_all('source'): + if source.get('src'): + src_url = source['src'] + if src_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{src_url}" + else: + abs_url = urljoin(base_url, src_url) + local_path = self._download_resource(abs_url, base_path, 'video') + if local_path: + source['src'] = os.path.relpath(local_path, base_path).replace('\\', '/') + # Direct video src + if video.get('src'): + src_url = video['src'] + if src_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{src_url}" + else: + abs_url = urljoin(base_url, src_url) + local_path = self._download_resource(abs_url, base_path, 'video') + if local_path: + video['src'] = os.path.relpath(local_path, base_path).replace('\\', '/') + # Video poster + if video.get('poster') and download_images: + poster_url = video['poster'] + if poster_url.startswith('/'): + parsed_base = urlparse(base_url) + abs_url = f"{parsed_base.scheme}://{parsed_base.netloc}{poster_url}" + else: + abs_url = urljoin(base_url, poster_url) + local_path = self._download_resource(abs_url, base_path, 'images') + if local_path: + video['poster'] = os.path.relpath(local_path, base_path).replace('\\', '/') + + # iframe embeds (YouTube, Vimeo, etc.) 
+ for iframe in soup.find_all('iframe'): + src = iframe.get('src') + if src and any(domain in src for domain in ['youtube.com', 'vimeo.com', 'dailymotion.com']): + # Für eingebettete Videos erstellen wir einen Platzhalter + placeholder = soup.new_tag('div', style='background-color: #232D53; color: #00D4FF; padding: 20px; text-align: center; border-radius: 8px;') + placeholder.string = f'[Eingebettetes Video: {src}]' + iframe.replace_with(placeholder) + + # Links zu anderen Seiten konvertieren + for a in soup.find_all('a', href=True): + href = a['href'] + + # Spezialbehandlung für JavaScript-basierte Links (z.B. Galerie) + if href.startswith('javascript:') or href == '#': + # Prüfe ob es ein onclick-Attribut gibt + onclick = a.get('onclick', '') + if onclick: + # Füge ein data-Attribut hinzu um den ursprünglichen onclick zu behalten + a['data-original-onclick'] = onclick + # Entferne onclick für bessere Offline-Kompatibilität + del a['onclick'] + + # Wenn href nur "#" ist und es einen data-href oder ähnliches gibt + if href == '#': + # Prüfe alternative href-Attribute + for attr in ['data-href', 'data-url', 'data-link']: + if a.get(attr): + a['href'] = a[attr] + break + continue + + # Skip andere spezielle Links + if href.startswith(('mailto:', 'tel:', 'data:')): + continue + + abs_url = urljoin(base_url, href) + parsed = urlparse(abs_url) + + # Wenn es eine interne Seite ist, konvertiere zu lokalem Pfad + if parsed.netloc == urlparse(base_url).netloc or not parsed.netloc: + # Erstelle lokalen Dateinamen für die verlinkte Seite + if abs_url in self.url_mapping: + local_path = self.url_mapping[abs_url] + a['href'] = os.path.relpath(local_path, base_path).replace('\\', '/') + else: + # Für noch nicht heruntergeladene Seiten + filename = self._sanitize_filename(abs_url) + if parsed.path.endswith('/') or not parsed.path: + filename = 'index.html' + elif not os.path.splitext(filename)[1]: + filename += '.html' + a['href'] = os.path.relpath(os.path.join(base_path, 
filename), base_path).replace('\\', '/') + + # Meta-Tags für bessere Offline-Darstellung + if not soup.find('meta', {'http-equiv': 'Content-Type'}): + meta = soup.new_tag('meta') + meta['http-equiv'] = 'Content-Type' + meta['content'] = 'text/html; charset=utf-8' + if soup.head: + soup.head.insert(0, meta) + else: + head = soup.new_tag('head') + soup.insert(0, head) + head.insert(0, meta) + + # Füge JavaScript für Offline-Galerie-Navigation hinzu + gallery_script = soup.new_tag('script') + gallery_script.string = ''' + // Offline Gallery Navigation Fix + document.addEventListener('DOMContentLoaded', function() { + // Finde alle Links mit gespeicherten onclick-Events + var links = document.querySelectorAll('a[data-original-onclick]'); + links.forEach(function(link) { + link.style.cursor = 'pointer'; + link.addEventListener('click', function(e) { + e.preventDefault(); + // Versuche die ursprüngliche onclick-Funktion auszuführen + try { + eval(this.getAttribute('data-original-onclick')); + } catch(err) { + console.log('Gallery navigation not available in offline mode'); + } + }); + }); + + // Behandle Hash-Links für Galerie-Navigation + var hashLinks = document.querySelectorAll('a[href^="#"]'); + hashLinks.forEach(function(link) { + link.addEventListener('click', function(e) { + var targetId = this.getAttribute('href').substring(1); + if (targetId) { + var target = document.getElementById(targetId); + if (target) { + e.preventDefault(); + target.scrollIntoView({behavior: 'smooth'}); + } + } + }); + }); + }); + ''' + if soup.body: + soup.body.append(gallery_script) + + return str(soup) + + def _extract_links(self, html_content: str, base_url: str) -> Set[str]: + soup = BeautifulSoup(html_content, 'html.parser') + links = set() + + for a in soup.find_all('a', href=True): + abs_url = urljoin(base_url, a['href']) + parsed = urlparse(abs_url) + + # Nur Links zur gleichen Domain + if parsed.netloc == urlparse(base_url).netloc: + # Entferne Fragment + clean_url = 
urlunparse(parsed._replace(fragment='')) + links.add(clean_url) + + return links + + def _create_navigation_index(self, save_path: str, original_url: str): + """Erstellt eine Index-Datei mit allen heruntergeladenen Seiten für einfache Navigation""" + try: + # Sammle alle heruntergeladenen HTML-Dateien + html_files = [] + for root, dirs, files in os.walk(save_path): + for file in files: + if file.endswith('.html') and file != '_navigation_index.html': + rel_path = os.path.relpath(os.path.join(root, file), save_path) + html_files.append(rel_path.replace('\\', '/')) + + # Erstelle HTML für Navigation + nav_html = f''' + + + + + Navigation - {urlparse(original_url).netloc} + + + +
+

Webseiten-Navigation

+
Gesicherte Webseite: {original_url}
+ +
+ Seiten: {len(html_files)} + Datum: {datetime.now().strftime("%d.%m.%Y %H:%M")} +
+ + → Zur Hauptseite + +

Alle gesicherten Seiten

+ +
+ +''' + + # Speichere Navigation Index + nav_path = os.path.join(save_path, '_navigation_index.html') + with open(nav_path, 'w', encoding='utf-8') as f: + f.write(nav_html) + + self._emit_status(f"Navigations-Index erstellt: {nav_path}") + + except Exception as e: + self._emit_status(f"Fehler beim Erstellen des Navigations-Index: {str(e)}") + + def _pre_scan_website(self, url: str, follow_links: bool = False, max_depth: int = 1) -> int: + """Führt eine Vorab-Prüfung durch um die Anzahl der Ressourcen zu schätzen""" + try: + self._emit_status("Analysiere Webseite...") + response = self.session.get(url, timeout=10) + response.raise_for_status() + + soup = BeautifulSoup(response.text, 'html.parser') + resource_count = 1 # Die HTML-Seite selbst + + # Zähle Bilder + resource_count += len(soup.find_all('img')) + + # Zähle CSS-Dateien + resource_count += len(soup.find_all('link', {'rel': 'stylesheet'})) + + # Zähle JS-Dateien + resource_count += len(soup.find_all('script', {'src': True})) + + # Zähle Videos + resource_count += len(soup.find_all('video')) + resource_count += len(soup.find_all('source')) + + # Schätze inline CSS Ressourcen + for style in soup.find_all('style'): + if style.string: + resource_count += len(re.findall(r'url\(["\']?[^"\'()]+["\']?\)', style.string)) + + # Schätze style-Attribut Ressourcen + for element in soup.find_all(style=True): + resource_count += len(re.findall(r'url\(["\']?[^"\'()]+["\']?\)', element['style'])) + + if follow_links: + # Füge geschätzte Anzahl von verlinkten Seiten hinzu + links = soup.find_all('a', href=True) + internal_links = [l for l in links if not l['href'].startswith(('http://', 'https://', '#', 'mailto:', 'tel:'))] + resource_count += min(len(internal_links), 20) # Maximal 20 zusätzliche Seiten schätzen + + return max(resource_count, 10) # Mindestens 10 für realistische Fortschrittsanzeige + + except Exception as e: + self._emit_status(f"Vorab-Prüfung fehlgeschlagen: {str(e)}") + return 50 # Fallback-Schätzung + + 
def download_website(self, url: str, save_path: str, download_images: bool = True,
                     download_css: bool = True, download_js: bool = True,
                     download_videos: bool = True, follow_links: bool = False,
                     max_depth: int = 1, **kwargs) -> bool:
    """Download ``url`` (and optionally the pages it links to) into ``save_path``.

    The start page is always written as ``index.html``; every other page is
    written under the name returned by ``_sanitize_filename``. Pages are
    fetched breadth-first. A page answering with HTTP 401/403/404/429/503,
    or raising a ``requests`` exception, is recorded in ``self.skipped_urls``
    and the crawl continues with the next page.

    Args:
        url: Start page of the crawl.
        save_path: Target directory; created if it does not exist.
        download_images, download_css, download_js, download_videos:
            Forwarded unchanged to ``_process_html``.
        follow_links: When true, links returned by ``_extract_links`` are
            queued as well; at most 100 pages are processed in that mode.
        max_depth: Links are only extracted from pages whose depth is
            smaller than this value (the start page has depth 0).
        **kwargs: Accepted for call compatibility and ignored.

    Returns:
        True when the crawl loop ran to completion, False when an
        unexpected exception aborted it (the message is emitted as status).
    """
    # Function-scope import so this method does not depend on an edit to the
    # module's import block.
    from collections import deque

    skip_status_codes = (401, 403, 404, 429, 503)
    max_pages = 100  # safety limit, only enforced when following links

    try:
        self.start_time = datetime.now()
        self.visited_urls.clear()
        self.downloaded_resources.clear()
        self.skipped_urls.clear()
        self.url_mapping.clear()
        self.current_base_url = url

        # Create the target directory.
        os.makedirs(save_path, exist_ok=True)

        # Pre-scan to get a better basis for the progress estimate.
        self.total_resources_estimate = self._pre_scan_website(url, follow_links, max_depth)
        self.processed_resources = 0

        self._emit_status(f"Starte Download von {url}")
        self._emit_progress(5)  # 5% once the pre-scan is done

        # Breadth-first queue of (URL, depth). ``queued`` keeps a link from
        # being enqueued more than once; the first occurrence is the one that
        # would have been processed anyway, so crawl order is unchanged.
        to_download = deque([(url, 0)])
        queued = {url}
        processed = 0

        while to_download and (not follow_links or processed < max_pages):
            current_url, depth = to_download.popleft()

            if current_url in self.visited_urls:
                continue

            self.visited_urls.add(current_url)

            try:
                # Human-like delay before the request.
                if self.human_behavior:
                    self._human_delay(base_delay=1.5, variation=0.8)

                # Fetch the HTML.
                response = self.session.get(current_url, timeout=30)

                # Skip the page on these HTTP errors instead of aborting.
                if response.status_code in skip_status_codes:
                    self._emit_status(f"Überspringe {current_url} (HTTP {response.status_code})")
                    self.skipped_urls[current_url] = f"HTTP {response.status_code}"
                    processed += 1
                    continue

                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                self._emit_status(f"Fehler bei {current_url}: {str(e)} - Überspringe...")
                self.skipped_urls[current_url] = str(e)
                processed += 1
                continue

            # Process the HTML.
            processed_html = self._process_html(
                response.text, current_url, save_path,
                download_images, download_css, download_js, download_videos
            )

            # Store the HTML; the start page always becomes index.html.
            filename = self._sanitize_filename(current_url)
            if current_url == url:
                filename = 'index.html'

            html_path = os.path.join(save_path, filename)
            os.makedirs(os.path.dirname(html_path), exist_ok=True)

            # Remember where the page was stored (used for link conversion).
            self.url_mapping[current_url] = html_path

            with open(html_path, 'w', encoding='utf-8') as f:
                f.write(processed_html)

            # Queue linked pages if requested.
            if follow_links and depth < max_depth:
                for link in self._extract_links(response.text, current_url):
                    if link not in self.visited_urls and link not in queued:
                        queued.add(link)
                        to_download.append((link, depth + 1))

            processed += 1
            # Progress is derived from the pre-scan estimate; 5 is the assumed
            # resource count of one HTML page.
            self.processed_resources += 5
            progress = min(95, int((self.processed_resources / self.total_resources_estimate) * 90) + 5)
            self._emit_progress(progress)

            # Human-like browsing pattern.
            if self.human_behavior:
                if processed % 5 == 0:
                    # Silent "coffee break" after every fifth page.
                    time.sleep(random.uniform(10, 20))

                if processed % 20 == 0:
                    # Silently replace the session but keep its cookies.
                    old_cookies = self.session.cookies
                    self.session.close()
                    self.session = requests.Session()
                    self.session.cookies = old_cookies
                    self._set_random_user_agent()

        self._emit_progress(100)

        # Summary
        if self.skipped_urls:
            self._emit_status("\n=== Zusammenfassung ===")
            self._emit_status(f"Erfolgreich: {len(self.downloaded_resources)} Dateien")
            self._emit_status(f"Übersprungen: {len(self.skipped_urls)} URLs")
            self._emit_status("\nÜbersprungene URLs:")
            # The loop variable must not be called ``url``: that would rebind
            # the parameter and hand a skipped URL to the navigation index.
            for skipped_url, reason in list(self.skipped_urls.items())[:10]:  # show at most 10
                self._emit_status(f" - {skipped_url}: {reason}")
            if len(self.skipped_urls) > 10:
                self._emit_status(f" ... und {len(self.skipped_urls) - 10} weitere")
        else:
            self._emit_status("\n=== Zusammenfassung ===")
            self._emit_status(f"Alle {len(self.downloaded_resources)} Dateien erfolgreich heruntergeladen!")

        self._emit_status("\nDownload abgeschlossen!")
        self.end_time = datetime.now()

        # Build the navigation index for the start URL.
        self._create_navigation_index(save_path, url)

        return True

    except Exception as e:
        self._emit_status(f"Fehler: {str(e)}")
        self.end_time = datetime.now()
        return False
100644 index 0000000..1c0898f --- /dev/null +++ b/src/resources/icons/sun.svg @@ -0,0 +1,7 @@ + + + + + + + \ No newline at end of file diff --git a/src/resources/logo/intelsight-full-dark.svg b/src/resources/logo/intelsight-full-dark.svg new file mode 100644 index 0000000..0e225d0 --- /dev/null +++ b/src/resources/logo/intelsight-full-dark.svg @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IntelSight + SICHERHEIT MADE IN GERMANY + + \ No newline at end of file diff --git a/src/resources/logo/intelsight-full-light.svg b/src/resources/logo/intelsight-full-light.svg new file mode 100644 index 0000000..353b693 --- /dev/null +++ b/src/resources/logo/intelsight-full-light.svg @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IntelSight + SICHERHEIT MADE IN GERMANY + + \ No newline at end of file diff --git a/src/resources/logo/intelsight-icon-transparent-dark.svg b/src/resources/logo/intelsight-icon-transparent-dark.svg new file mode 100644 index 0000000..fcbed9f --- /dev/null +++ b/src/resources/logo/intelsight-icon-transparent-dark.svg @@ -0,0 +1,40 @@ + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/src/resources/logo/intelsight-name-dark.svg b/src/resources/logo/intelsight-name-dark.svg new file mode 100644 index 0000000..5efe807 --- /dev/null +++ b/src/resources/logo/intelsight-name-dark.svg @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IntelSight + + \ No newline at end of file diff --git a/src/resources/logo/intelsight-name-light.svg b/src/resources/logo/intelsight-name-light.svg new file mode 100644 index 0000000..7e5c2dd --- /dev/null +++ b/src/resources/logo/intelsight-name-light.svg @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IntelSight + + \ No newline at end of file diff --git a/src/resources/logo/intelsight-name-transparent-dark.svg 
b/src/resources/logo/intelsight-name-transparent-dark.svg new file mode 100644 index 0000000..5efe807 --- /dev/null +++ b/src/resources/logo/intelsight-name-transparent-dark.svg @@ -0,0 +1,53 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + IntelSight + + \ No newline at end of file diff --git a/src/resources/styles/__pycache__/dark_theme.cpython-310.pyc b/src/resources/styles/__pycache__/dark_theme.cpython-310.pyc new file mode 100644 index 0000000..02d6ea4 Binary files /dev/null and b/src/resources/styles/__pycache__/dark_theme.cpython-310.pyc differ diff --git a/src/resources/styles/__pycache__/light_theme.cpython-310.pyc b/src/resources/styles/__pycache__/light_theme.cpython-310.pyc new file mode 100644 index 0000000..5d781b9 Binary files /dev/null and b/src/resources/styles/__pycache__/light_theme.cpython-310.pyc differ diff --git a/src/resources/styles/dark_theme.py b/src/resources/styles/dark_theme.py new file mode 100644 index 0000000..e535335 --- /dev/null +++ b/src/resources/styles/dark_theme.py @@ -0,0 +1,337 @@ +DARK_THEME = """ +/* Globale Variablen und Basis-Styles */ +* { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif; +} + +/* Hauptfenster */ +QMainWindow { + background-color: #0a0a0a; + color: #FFFFFF; +} + +/* Widget Hintergründe */ +QWidget { + background-color: transparent; + color: #FFFFFF; +} + +/* App Header */ +QWidget#appHeader { + background-color: transparent; + padding-bottom: 20px; +} + +/* Überschriften */ +QLabel#heading { + font-family: 'Poppins', sans-serif; + font-size: 32px; + font-weight: 700; + color: #FFFFFF; + letter-spacing: -0.5px; +} + +QLabel#subheading { + font-size: 16px; + color: rgba(255, 255, 255, 0.6); + margin-top: -5px; +} + +QLabel#sectionTitle { + font-size: 18px; + font-weight: 600; + color: #FFFFFF; + margin-bottom: 10px; +} + +QLabel#inputLabel { + font-size: 14px; + font-weight: 600; + color: rgba(255, 255, 255, 0.8); + margin-bottom: 4px; +} + +/* 
Content Card - Hauptcontainer */ +QWidget#contentCard { + background-color: #1a1a1a; + border-radius: 12px; + padding: 32px; +} + +/* Tabellen-Style */ +QTableWidget#dataTable { + background-color: transparent; + border: none; + outline: none; +} + +QTableWidget#dataTable::item { + background-color: transparent; + color: rgba(255, 255, 255, 0.7); + font-weight: 600; + padding: 12px 16px; + border-bottom: 1px solid rgba(255, 255, 255, 0.1); +} + +QTableWidget#dataTable::item:selected { + background-color: transparent; +} + +/* Eingabefelder */ +QLineEdit { + background-color: #232D53; + border: none; + border-radius: 8px; + padding: 12px 16px; + color: #FFFFFF; + font-size: 14px; + min-height: 24px; +} + +QLineEdit:focus { + background-color: #2A3560; + outline: none; +} + +QLineEdit::placeholder { + color: rgba(255, 255, 255, 0.4); +} + +/* Buttons */ +QPushButton { + background-color: transparent; + color: #FFFFFF; + border: 1px solid #232D53; + border-radius: 24px; + padding: 0 24px; + min-height: 40px; + font-size: 14px; + font-weight: 600; +} + +QPushButton:hover { + background-color: #232D53; +} + +QPushButton:pressed { + background-color: #1A1F3A; +} +QPushButton:disabled { + background-color: #1a1a1a; + color: rgba(255, 255, 255, 0.3); + border: 1px solid rgba(255, 255, 255, 0.1); +} + +/* Primary Button */ +QPushButton#primaryButton { + background-color: #00D4FF; + color: #1A1F3A; + border: none; +} + +QPushButton#primaryButton:hover { + background-color: #00B8E6; + color: #FFFFFF; +} + +QPushButton#primaryButton:pressed { + background-color: #0099CC; +} + +QPushButton#primaryButton:disabled { + background-color: #2a2a2a; + color: rgba(255, 255, 255, 0.3); +} + +/* Mode Toggle Button */ +QPushButton#modeToggle { + background-color: rgba(255, 255, 255, 0.1); + border: none; + border-radius: 20px; + padding: 8px; + min-width: 40px; + min-height: 40px; + max-width: 40px; + max-height: 40px; +} + +QPushButton#modeToggle:hover { + background-color: rgba(255, 
255, 255, 0.2); +} + +/* CheckBox */ +QCheckBox { + spacing: 10px; + color: #FFFFFF; + font-size: 14px; + padding: 4px 0; +} + +QCheckBox::indicator { + width: 20px; + height: 20px; + border-radius: 4px; + border: 2px solid rgba(255, 255, 255, 0.3); + background-color: rgba(255, 255, 255, 0.05); +} + +QCheckBox::indicator:hover { + border: 2px solid rgba(255, 255, 255, 0.5); + background-color: rgba(255, 255, 255, 0.1); +} + +QCheckBox::indicator:checked { + background-color: #00D4FF; + border: 2px solid #00D4FF; + image: url(src/resources/icons/check.svg); + padding: 2px; +} + +/* SpinBox */ +QSpinBox { + background-color: #232D53; + border: none; + border-radius: 8px; + padding: 8px 12px; + color: #FFFFFF; + font-size: 14px; + min-width: 60px; +} + +QSpinBox:focus { + background-color: #2A3560; +} + +QSpinBox::up-button, QSpinBox::down-button { + background-color: transparent; + border: none; + width: 20px; +} + +QSpinBox::up-button:hover, QSpinBox::down-button:hover { + background-color: rgba(0, 212, 255, 0.1); +} + +QSpinBox::up-arrow, QSpinBox::down-arrow { + image: none; + width: 0; + height: 0; +} + +/* Progress Bar */ +QProgressBar { + background-color: rgba(255, 255, 255, 0.1); + border: none; + border-radius: 6px; + height: 12px; + text-align: center; + font-size: 12px; +} + +QProgressBar::chunk { + background: linear-gradient(90deg, #00D4FF 0%, #00B8E6 100%); + border-radius: 6px; +} + +/* Text Edit für Status */ +QTextEdit#statusLog { + background-color: rgba(0, 0, 0, 0.3); + border: 1px solid rgba(255, 255, 255, 0.1); + border-radius: 8px; + padding: 12px; + color: rgba(255, 255, 255, 0.8); + font-family: 'SF Mono', Monaco, 'Cascadia Code', monospace; + font-size: 13px; +} + +/* Scrollbar */ +QScrollBar:vertical { + background-color: transparent; + width: 10px; + border-radius: 5px; +} + +QScrollBar::handle:vertical { + background-color: rgba(255, 255, 255, 0.2); + border-radius: 5px; + min-height: 30px; +} + +QScrollBar::handle:vertical:hover { + 
background-color: rgba(255, 255, 255, 0.3); +} + +QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical { + height: 0; +} + +/* Frame Separator */ +QFrame#separator { + background-color: rgba(255, 255, 255, 0.1); + max-height: 1px; + margin: 20px 0; +} + +/* Status Bar */ +QStatusBar { + background-color: #0a0a0a; + color: rgba(255, 255, 255, 0.6); + border-top: 1px solid rgba(255, 255, 255, 0.1); + font-size: 13px; + padding: 4px; +} + +/* Message Box */ +QMessageBox { + background-color: #1a1a1a; + color: #FFFFFF; +} + +QMessageBox QPushButton { + min-width: 80px; +} + +/* ComboBox */ +QComboBox { + background-color: #232D53; + border: none; + border-radius: 8px; + padding: 10px 16px; + color: #FFFFFF; + font-size: 14px; +} + +QComboBox:hover { + background-color: #2A3560; +} + +QComboBox::drop-down { + border: none; + width: 24px; +} + +QComboBox::down-arrow { + width: 0; + height: 0; + border-style: solid; + border-width: 6px 4px 0 4px; + border-color: #00D4FF transparent transparent transparent; +} + +QComboBox QAbstractItemView { + background-color: #232D53; + border: none; + selection-background-color: #2A3560; + color: #FFFFFF; +} + +/* Tool Tips */ +QToolTip { + background-color: #232D53; + color: #FFFFFF; + border: 1px solid rgba(0, 212, 255, 0.2); + border-radius: 6px; + padding: 8px 12px; + font-size: 13px; +} +""" \ No newline at end of file diff --git a/src/resources/styles/light_theme.py b/src/resources/styles/light_theme.py new file mode 100644 index 0000000..bd2e7c2 --- /dev/null +++ b/src/resources/styles/light_theme.py @@ -0,0 +1,347 @@ +LIGHT_THEME = """ +/* Globale Variablen und Basis-Styles */ +* { + font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Arial, sans-serif; +} + +/* Hauptfenster */ +QMainWindow { + background-color: #f8f9fa; + color: #212529; +} + +QMainWindow > QWidget { + background-color: #f8f9fa; +} + +/* Widget Hintergründe */ +QWidget { + background-color: transparent; + color: #212529; +} + +/* App 
Header */ +QWidget#appHeader { + background-color: transparent; + padding-bottom: 20px; +} + +/* Überschriften */ +QLabel#heading { + font-family: 'Poppins', sans-serif; + font-size: 32px; + font-weight: 700; + color: #212529 !important; + letter-spacing: -0.5px; + background-color: transparent; +} + +QLabel#subheading { + font-size: 16px; + color: #6c757d; + margin-top: -5px; +} + +QLabel#sectionTitle { + font-size: 18px; + font-weight: 600; + color: #212529; + margin-bottom: 10px; +} + +QLabel#inputLabel { + font-size: 14px; + font-weight: 600; + color: #495057; + margin-bottom: 4px; +} + +/* Content Card - Hauptcontainer */ +QWidget#contentCard { + background-color: #ffffff; + border: 1px solid #e9ecef; + border-radius: 12px; + padding: 32px; +} + +/* Tabellen-Style */ +QTableWidget#dataTable { + background-color: transparent; + border: none; + outline: none; +} + +QTableWidget#dataTable::item { + background-color: transparent; + color: #495057; + font-weight: 600; + padding: 12px 16px; + border-bottom: 1px solid #e9ecef; +} + +QTableWidget#dataTable::item:selected { + background-color: transparent; +} + +/* Eingabefelder */ +QLineEdit { + background-color: #f8f9fa; + border: 1px solid #ced4da; + border-radius: 8px; + padding: 12px 16px; + color: #212529; + font-size: 14px; + min-height: 24px; +} + +QLineEdit:focus { + border-color: #00D4FF; + background-color: #ffffff; + outline: none; +} + +QLineEdit::placeholder { + color: #adb5bd; +} + +/* Buttons */ +QPushButton { + background-color: #ffffff; + color: #212529; + border: 1px solid #ced4da; + border-radius: 24px; + padding: 0 24px; + min-height: 40px; + font-size: 14px; + font-weight: 600; +} + +QPushButton:hover { + background-color: #f8f9fa; + border-color: #00D4FF; +} + +QPushButton:pressed { + background-color: #e9ecef; +} +QPushButton:disabled { + background-color: #f8f9fa; + color: rgba(0, 0, 0, 0.3); + border: 1px solid rgba(0, 0, 0, 0.1); +} + +/* Primary Button */ +QPushButton#primaryButton { + 
background-color: #00D4FF; + color: #212529; + border: none; +} + +QPushButton#primaryButton:hover { + background-color: #00B8E6; + color: #ffffff; +} + +QPushButton#primaryButton:pressed { + background-color: #0099CC; +} + +QPushButton#primaryButton:disabled { + background-color: #e9ecef; + color: #adb5bd; +} + +/* Mode Toggle Button */ +QPushButton#modeToggle { + background-color: rgba(0, 0, 0, 0.05); + border: none; + border-radius: 20px; + padding: 8px; + min-width: 40px; + min-height: 40px; + max-width: 40px; + max-height: 40px; +} + +QPushButton#modeToggle:hover { + background-color: rgba(0, 0, 0, 0.1); +} + +/* CheckBox */ +QCheckBox { + spacing: 10px; + color: #212529; + font-size: 14px; + padding: 4px 0; +} + +QCheckBox::indicator { + width: 20px; + height: 20px; + border-radius: 4px; + border: 2px solid #adb5bd; + background-color: #ffffff; +} + +QCheckBox::indicator:hover { + border: 2px solid #6c757d; + background-color: #f8f9fa; +} + +QCheckBox::indicator:checked { + background-color: #00D4FF; + border: 2px solid #00D4FF; + image: url(src/resources/icons/check.svg); + padding: 2px; +} + +/* SpinBox */ +QSpinBox { + background-color: #f8f9fa; + border: 1px solid #ced4da; + border-radius: 8px; + padding: 8px 12px; + color: #212529; + font-size: 14px; + min-width: 60px; +} + +QSpinBox:focus { + border-color: #00D4FF; + background-color: #ffffff; +} + +QSpinBox::up-button, QSpinBox::down-button { + background-color: transparent; + border: none; + width: 20px; +} + +QSpinBox::up-button:hover, QSpinBox::down-button:hover { + background-color: rgba(0, 212, 255, 0.1); +} + +QSpinBox::up-arrow, QSpinBox::down-arrow { + image: none; + width: 0; + height: 0; +} + +/* Progress Bar */ +QProgressBar { + background-color: #e9ecef; + border: none; + border-radius: 6px; + height: 12px; + text-align: center; + font-size: 12px; + color: #212529; +} + +QProgressBar::chunk { + background: linear-gradient(90deg, #00D4FF 0%, #00B8E6 100%); + border-radius: 6px; +} + +/* Text 
Edit für Status */ +QTextEdit#statusLog { + background-color: #f8f9fa; + border: 1px solid #e9ecef; + border-radius: 8px; + padding: 12px; + color: #495057; + font-family: 'SF Mono', Monaco, 'Cascadia Code', monospace; + font-size: 13px; +} + +/* Scrollbar */ +QScrollBar:vertical { + background-color: transparent; + width: 10px; + border-radius: 5px; +} + +QScrollBar::handle:vertical { + background-color: #ced4da; + border-radius: 5px; + min-height: 30px; +} + +QScrollBar::handle:vertical:hover { + background-color: #adb5bd; +} + +QScrollBar::add-line:vertical, QScrollBar::sub-line:vertical { + height: 0; +} + +/* Frame Separator */ +QFrame#separator { + background-color: #e9ecef; + max-height: 1px; + margin: 20px 0; +} + +/* Status Bar */ +QStatusBar { + background-color: #f8f9fa; + color: #6c757d; + border-top: 1px solid #e9ecef; + font-size: 13px; + padding: 4px; +} + +/* Message Box */ +QMessageBox { + background-color: #ffffff; + color: #212529; +} + +QMessageBox QPushButton { + min-width: 80px; +} + +/* ComboBox */ +QComboBox { + background-color: #f8f9fa; + border: 1px solid #ced4da; + border-radius: 8px; + padding: 10px 16px; + color: #212529; + font-size: 14px; +} + +QComboBox:hover { + border-color: #00D4FF; +} + +QComboBox::drop-down { + border: none; + width: 24px; +} + +QComboBox::down-arrow { + width: 0; + height: 0; + border-style: solid; + border-width: 6px 4px 0 4px; + border-color: #00D4FF transparent transparent transparent; +} + +QComboBox QAbstractItemView { + background-color: #ffffff; + border: 1px solid #ced4da; + selection-background-color: #e3f2fd; + color: #212529; +} + +/* Tool Tips */ +QToolTip { + background-color: #212529; + color: #ffffff; + border: none; + border-radius: 6px; + padding: 8px 12px; + font-size: 13px; +} +""" \ No newline at end of file diff --git a/src/ui/__init__.py b/src/ui/__init__.py new file mode 100644 index 0000000..ed45bbc --- /dev/null +++ b/src/ui/__init__.py @@ -0,0 +1 @@ +# UI Module \ No newline at end of 
file diff --git a/src/ui/__pycache__/__init__.cpython-310.pyc b/src/ui/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..76328b7 Binary files /dev/null and b/src/ui/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/ui/__pycache__/custom_widgets.cpython-310.pyc b/src/ui/__pycache__/custom_widgets.cpython-310.pyc new file mode 100644 index 0000000..759a229 Binary files /dev/null and b/src/ui/__pycache__/custom_widgets.cpython-310.pyc differ diff --git a/src/ui/__pycache__/main_window.cpython-310.pyc b/src/ui/__pycache__/main_window.cpython-310.pyc new file mode 100644 index 0000000..14c7143 Binary files /dev/null and b/src/ui/__pycache__/main_window.cpython-310.pyc differ diff --git a/src/ui/custom_widgets.py b/src/ui/custom_widgets.py new file mode 100644 index 0000000..f164e0d --- /dev/null +++ b/src/ui/custom_widgets.py @@ -0,0 +1,54 @@ +from PyQt6.QtWidgets import QSpinBox, QStyleOptionSpinBox, QStyle +from PyQt6.QtCore import Qt, QRect +from PyQt6.QtGui import QPainter, QFont, QPen, QColor + + +class CustomSpinBox(QSpinBox): + def __init__(self, parent=None): + super().__init__(parent) + + def paintEvent(self, event): + # Standard paint event + super().paintEvent(event) + + # Zeichne custom + und - Zeichen + painter = QPainter(self) + painter.setRenderHint(QPainter.RenderHint.Antialiasing) + + # Button-Bereiche berechnen + option = QStyleOptionSpinBox() + self.initStyleOption(option) + + up_rect = self.style().subControlRect( + QStyle.ComplexControl.CC_SpinBox, + option, + QStyle.SubControl.SC_SpinBoxUp, + self + ) + + down_rect = self.style().subControlRect( + QStyle.ComplexControl.CC_SpinBox, + option, + QStyle.SubControl.SC_SpinBoxDown, + self + ) + + # Font für Symbole + font = QFont() + font.setPixelSize(14) + font.setBold(True) + painter.setFont(font) + + # Farbe + if self.isEnabled(): + painter.setPen(QPen(QColor("#00D4FF"), 2)) + else: + painter.setPen(QPen(QColor("#666666"), 2)) + + # + Zeichen zeichnen + 
painter.drawText(up_rect, Qt.AlignmentFlag.AlignCenter, "+") + + # - Zeichen zeichnen + painter.drawText(down_rect, Qt.AlignmentFlag.AlignCenter, "−") # Unicode minus + + painter.end() \ No newline at end of file diff --git a/src/ui/main_window.py b/src/ui/main_window.py new file mode 100644 index 0000000..d239a81 --- /dev/null +++ b/src/ui/main_window.py @@ -0,0 +1,604 @@ +from PyQt6.QtWidgets import (QMainWindow, QWidget, QVBoxLayout, QHBoxLayout, + QLabel, QLineEdit, QPushButton, QGroupBox, + QComboBox, QTextEdit, QProgressBar, QCheckBox, + QFileDialog, QMessageBox, QStatusBar, QSpinBox, + QTabWidget, QListWidget, QListWidgetItem, + QTableWidget, QTableWidgetItem, QHeaderView, + QFrame, QSplitter, QScrollArea) +from PyQt6.QtCore import Qt, QThread, pyqtSignal, QTimer, QSettings +from PyQt6.QtGui import QIcon, QPixmap, QPalette, QColor, QDesktopServices, QPainter +from PyQt6.QtCore import QUrl +import os +import sys +from datetime import datetime + +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +from resources.styles.dark_theme import DARK_THEME +from resources.styles.light_theme import LIGHT_THEME +from core.web_crawler import WebCrawler +from utils.pdf_report import PDFReport +from ui.custom_widgets import CustomSpinBox +from utils.local_server import LocalWebServer + + +class CrawlerThread(QThread): + progress = pyqtSignal(int) + status = pyqtSignal(str) + finished = pyqtSignal(bool) + report_ready = pyqtSignal(str) # Signal für PDF-Pfad + + def __init__(self, url, save_path, options): + super().__init__() + self.url = url + self.save_path = save_path + self.options = options + self.crawler = None + + def run(self): + try: + # Human behavior ist immer aktiviert + self.crawler = WebCrawler(human_behavior=True) + self.crawler.progress_callback = self.progress.emit + self.crawler.status_callback = self.status.emit + + success = self.crawler.download_website( + self.url, + self.save_path, + **self.options + ) + + # Erstelle 
PDF-Bericht + try: + pdf_report = PDFReport() + report_path = pdf_report.generate_report( + url=self.url, + save_path=self.save_path, + start_time=self.crawler.start_time, + end_time=self.crawler.end_time, + downloaded_resources=self.crawler.downloaded_resources, + skipped_urls=self.crawler.skipped_urls, + options=self.options, + success=success + ) + self.status.emit(f"PDF-Bericht erstellt: {report_path}") + self.report_ready.emit(report_path) + except Exception as e: + self.status.emit(f"Fehler beim Erstellen des PDF-Berichts: {str(e)}") + + self.finished.emit(success) + except Exception as e: + self.status.emit(f"Fehler: {str(e)}") + + # Versuche trotzdem einen Fehlerbericht zu erstellen + try: + if self.crawler and self.crawler.start_time: + pdf_report = PDFReport() + report_path = pdf_report.generate_report( + url=self.url, + save_path=self.save_path, + start_time=self.crawler.start_time, + end_time=self.crawler.end_time or datetime.now(), + downloaded_resources=self.crawler.downloaded_resources, + skipped_urls=self.crawler.skipped_urls, + options=self.options, + success=False + ) + self.status.emit(f"Fehler-PDF-Bericht erstellt: {report_path}") + self.report_ready.emit(report_path) + except: + pass + + self.finished.emit(False) + + +class WebsiteCrawlerWindow(QMainWindow): + def __init__(self): + super().__init__() + self.crawler_thread = None + self.last_report_path = None + self.last_save_path = None + self.local_server = None + self.settings = QSettings('IntelSight', 'WebsiteCrawler') + self.dark_mode = self.settings.value('dark_mode', True, type=bool) + self.init_ui() + + def init_ui(self): + self.setWindowTitle("IntelSight Webseiten-Crawler") + self.setGeometry(100, 100, 1200, 800) + + # Theme anwenden + self.apply_theme() + + # Scroll Area als zentrales Widget + scroll_area = QScrollArea() + scroll_area.setWidgetResizable(True) + scroll_area.setHorizontalScrollBarPolicy(Qt.ScrollBarPolicy.ScrollBarAsNeeded) + 
scroll_area.setVerticalScrollBarPolicy(Qt.ScrollBarPolicy.ScrollBarAsNeeded) + self.setCentralWidget(scroll_area) + + # Container Widget für den Inhalt + central_widget = QWidget() + scroll_area.setWidget(central_widget) + + # Hauptlayout + main_layout = QVBoxLayout(central_widget) + main_layout.setContentsMargins(32, 32, 32, 32) + main_layout.setSpacing(20) + + # Header mit Logo, Titel und Mode Toggle + header_widget = QWidget() + header_widget.setObjectName("appHeader") + header_layout = QHBoxLayout(header_widget) + header_layout.setContentsMargins(0, 0, 0, 20) + + # IntelSight Logo - Neu implementiert + self.logo_label = QLabel() + self.logo_label.setFixedSize(300, 70) # Feste Größe für konsistente Darstellung + self.logo_label.setAlignment(Qt.AlignmentFlag.AlignLeft | Qt.AlignmentFlag.AlignVCenter) + self.refresh_logo() # Neue Methode verwenden + header_layout.addWidget(self.logo_label) + + # Titel (ohne IntelSight, da es im Logo ist) + title_label = QLabel("Webseiten-Crawler") + title_label.setObjectName("heading") + header_layout.addWidget(title_label) + + header_layout.addStretch() + + # Mode Toggle Button + self.mode_toggle = QPushButton() + self.mode_toggle.setObjectName("modeToggle") + self.mode_toggle.clicked.connect(self.toggle_theme) + self.update_mode_icon() + header_layout.addWidget(self.mode_toggle) + + main_layout.addWidget(header_widget) + + # Beschreibung + desc_label = QLabel("Sichere Webseiten lokal für Offline-Zugriff") + desc_label.setObjectName("subheading") + main_layout.addWidget(desc_label) + + # Container für Formular + content_widget = QWidget() + content_widget.setObjectName("contentCard") + content_layout = QVBoxLayout(content_widget) + content_layout.setSpacing(20) + + # URL Eingabe + url_widget = QWidget() + url_layout = QVBoxLayout(url_widget) + url_layout.setSpacing(8) + url_label = QLabel("URL:") + url_label.setObjectName("inputLabel") + self.url_input = QLineEdit() + self.url_input.setPlaceholderText("https://example.com") + 
url_layout.addWidget(url_label) + url_layout.addWidget(self.url_input) + content_layout.addWidget(url_widget) + + # Speicherort + save_widget = QWidget() + save_layout = QVBoxLayout(save_widget) + save_layout.setSpacing(8) + save_label = QLabel("Speicherort:") + save_label.setObjectName("inputLabel") + save_input_layout = QHBoxLayout() + save_input_layout.setContentsMargins(0, 0, 0, 0) + self.save_path_input = QLineEdit() + self.save_path_input.setPlaceholderText("C:\\Downloads\\Webseite") + self.browse_button = QPushButton("Durchsuchen") + self.browse_button.clicked.connect(self.browse_folder) + save_input_layout.addWidget(self.save_path_input) + save_input_layout.addWidget(self.browse_button) + save_layout.addWidget(save_label) + save_layout.addLayout(save_input_layout) + content_layout.addWidget(save_widget) + + # Sicherungsart + backup_type_widget = QWidget() + backup_type_layout = QVBoxLayout(backup_type_widget) + backup_type_layout.setSpacing(8) + backup_type_label = QLabel("Sicherungsart:") + backup_type_label.setObjectName("inputLabel") + backup_type_layout.addWidget(backup_type_label) + + self.snapshot_mode = QCheckBox("Webseiten-Snapshot erstellen (nur die aktuelle Seite mit Bildern und Formatierung)") + self.snapshot_mode.setChecked(True) + self.full_backup_mode = QCheckBox("Gesamte Webseite sichern (alle verlinkten Unterseiten und Medien)") + backup_type_layout.addWidget(self.snapshot_mode) + backup_type_layout.addWidget(self.full_backup_mode) + content_layout.addWidget(backup_type_widget) + + # Checkbox-Verhalten: nur eine Option auswählbar + self.snapshot_mode.toggled.connect(lambda checked: self.full_backup_mode.setChecked(not checked) if checked else None) + self.full_backup_mode.toggled.connect(lambda checked: self.snapshot_mode.setChecked(not checked) if checked else None) + self.full_backup_mode.toggled.connect(self.toggle_resource_options) + + # Ressourcen-Optionen (nur sichtbar bei "Gesamte Webseite sichern") + self.resources_widget = QWidget() 
+ resources_layout = QVBoxLayout(self.resources_widget) + resources_layout.setSpacing(8) + resources_label = QLabel("Ressourcen:") + resources_label.setObjectName("inputLabel") + resources_layout.addWidget(resources_label) + + self.download_images = QCheckBox("Bilder herunterladen") + self.download_images.setChecked(True) + self.download_videos = QCheckBox("Videos herunterladen") + self.download_videos.setChecked(True) + self.download_css = QCheckBox("CSS-Dateien herunterladen") + self.download_css.setChecked(True) + self.download_js = QCheckBox("JavaScript-Dateien herunterladen") + self.download_js.setChecked(True) + resources_layout.addWidget(self.download_images) + resources_layout.addWidget(self.download_videos) + resources_layout.addWidget(self.download_css) + resources_layout.addWidget(self.download_js) + + + content_layout.addWidget(self.resources_widget) + self.resources_widget.setVisible(False) # Standardmäßig ausgeblendet + + # Separator + separator = QFrame() + separator.setFrameShape(QFrame.Shape.HLine) + separator.setObjectName("separator") + content_layout.addWidget(separator) + + # Progress Section + progress_widget = QWidget() + progress_layout = QVBoxLayout(progress_widget) + + progress_label = QLabel("Download-Fortschritt") + progress_label.setObjectName("sectionTitle") + progress_layout.addWidget(progress_label) + + self.progress_bar = QProgressBar() + self.progress_bar.setTextVisible(True) + progress_layout.addWidget(self.progress_bar) + + # Status Log + self.status_text = QTextEdit() + self.status_text.setReadOnly(True) + self.status_text.setMaximumHeight(120) + self.status_text.setObjectName("statusLog") + progress_layout.addWidget(self.status_text) + + content_layout.addWidget(progress_widget) + + # Buttons + button_widget = QWidget() + button_layout = QHBoxLayout(button_widget) + button_layout.setContentsMargins(0, 20, 0, 0) + + button_layout.addStretch() + + self.view_button = QPushButton("Webseite lokal anzeigen") + 
# Methods of the main window class. The class header and the beginning of
# init_ui lie outside this chunk; every function below takes the window
# instance as ``self``.


def toggle_resource_options(self, checked):
    """Show the resource check boxes only while full-site backup is selected."""
    self.resources_widget.setVisible(checked)


def browse_folder(self):
    """Let the user pick the target folder and copy it into the path field."""
    folder = QFileDialog.getExistingDirectory(self, "Speicherort wählen")
    if folder:
        self.save_path_input.setText(folder)


def _set_inputs_enabled(self, enabled):
    """Lock or unlock every input control that must not change mid-download."""
    locked_widgets = (
        self.url_input,
        self.save_path_input,
        self.browse_button,
        self.snapshot_mode,
        self.full_backup_mode,
        self.download_images,
        self.download_videos,
        self.download_css,
        self.download_js,
        self.mode_toggle,
    )
    for widget in locked_widgets:
        widget.setEnabled(enabled)


def start_download(self):
    """Validate the form, derive the target directory and start the crawler.

    The backup is written to ``<save path>/<YYMMDD>_<host>_<kind>`` where
    ``kind`` is ``complete`` or ``snapshot``. All inputs stay locked until
    ``download_finished`` runs.
    """
    from datetime import datetime
    from urllib.parse import urlparse

    url = self.url_input.text().strip()
    save_path = self.save_path_input.text().strip()

    if not url:
        QMessageBox.warning(self, "Warnung", "Bitte geben Sie eine URL ein.")
        return

    if not save_path:
        QMessageBox.warning(self, "Warnung", "Bitte wählen Sie einen Speicherort.")
        return

    full_backup = self.full_backup_mode.isChecked()
    if full_backup:
        options = {
            'download_images': self.download_images.isChecked(),
            'download_videos': self.download_videos.isChecked(),
            'download_css': self.download_css.isChecked(),
            'download_js': self.download_js.isChecked(),
            'follow_links': True,
            'max_depth': 999,  # effectively unlimited for a complete backup
        }
    else:
        # Snapshot mode: only the current page with the resources needed
        # to render it.
        options = {
            'download_images': True,
            'download_videos': False,
            'download_css': True,
            'download_js': False,
            'follow_links': False,
            'max_depth': 0,
        }
    options['backup_type'] = 'full' if full_backup else 'snapshot'

    # urlparse only fills ``netloc`` when the host is preceded by "//", so a
    # URL typed without scheme ("example.com/page") is re-parsed as a
    # network-path reference to still get a meaningful directory name.
    website_name = urlparse(url).netloc
    if not website_name:
        website_name = urlparse(f"//{url}").netloc
    if website_name.startswith('www.'):
        website_name = website_name[len('www.'):]
    website_name = website_name.replace(':', '_')  # ':' is invalid in Windows names
    if not website_name:
        website_name = 'website'

    date_str = datetime.now().strftime("%y%m%d")
    backup_type = 'complete' if full_backup else 'snapshot'
    dir_name = f"{date_str}_{website_name}_{backup_type}"
    final_save_path = os.path.join(save_path, dir_name)

    # Prepare the UI for the running download.
    self.start_button.setEnabled(False)
    self.stop_button.setEnabled(True)
    self.view_button.setEnabled(False)
    self.report_button.setEnabled(False)
    self._set_inputs_enabled(False)

    self.progress_bar.setValue(0)
    self.status_text.clear()

    # Remember the real backup directory for "view website" before the
    # worker starts, so a very fast run cannot finish with a stale path.
    self.last_save_path = final_save_path

    self.crawler_thread = CrawlerThread(url, final_save_path, options)
    self.crawler_thread.progress.connect(self.update_progress)
    self.crawler_thread.status.connect(self.update_status)
    self.crawler_thread.finished.connect(self.download_finished)
    self.crawler_thread.report_ready.connect(self.open_report)
    self.crawler_thread.start()


def stop_download(self):
    """Abort the running download and try to write an abort report."""
    from datetime import datetime

    thread = self.crawler_thread
    if not (thread and thread.isRunning()):
        return

    self.update_status("Download wird abgebrochen...")

    # Stop the worker first and wait for it: the report below iterates the
    # crawler's result dicts, which must no longer be mutated concurrently.
    thread.terminate()
    thread.wait()

    crawler = thread.crawler
    if crawler:
        try:
            report_path = PDFReport().generate_report(
                url=thread.url,
                save_path=thread.save_path,
                start_time=crawler.start_time,
                end_time=datetime.now(),
                downloaded_resources=crawler.downloaded_resources,
                skipped_urls=crawler.skipped_urls,
                options=thread.options,
                success=False,
                error_message="Download manuell abgebrochen",
            )
            self.last_report_path = report_path
            self.report_button.setEnabled(True)
            self.update_status(f"Abbruch-Bericht erstellt: {report_path}")
        except Exception as e:
            # Best effort: a failing report must not block the abort itself.
            self.update_status(f"Fehler beim Erstellen des Abbruch-Berichts: {str(e)}")

    self.download_finished(False)


def update_progress(self, value):
    """Mirror the crawler's progress value in the progress bar."""
    self.progress_bar.setValue(value)


def update_status(self, message):
    """Append a message to the status log and show it in the status bar."""
    self.status_text.append(message)
    self.status_bar.showMessage(message)


def download_finished(self, success):
    """Unlock the UI again and tell the user how the download ended.

    ``last_save_path`` is intentionally left untouched here: it already
    holds the dated backup directory set by ``start_download``. Resetting it
    to the text field's parent folder would make "view website" serve the
    wrong directory.
    """
    self.start_button.setEnabled(True)
    self.stop_button.setEnabled(False)
    self._set_inputs_enabled(True)

    if success:
        self.view_button.setEnabled(True)
        self.progress_bar.setValue(100)
        QMessageBox.information(self, "Erfolg", "Website wurde erfolgreich heruntergeladen!")
    else:
        # Even after a failure the report button is offered if a report exists.
        if self.last_report_path and os.path.exists(self.last_report_path):
            self.report_button.setEnabled(True)
        QMessageBox.warning(self, "Fehler", "Der Download wurde unterbrochen oder es ist ein Fehler aufgetreten.\nDetails finden Sie im PDF-Bericht.")

    self.status_bar.showMessage("Bereit")


def apply_theme(self):
    """Apply the dark or light style sheet according to ``dark_mode``."""
    self.setStyleSheet(DARK_THEME if self.dark_mode else LIGHT_THEME)


def toggle_theme(self):
    """Switch the theme, persist the choice and refresh theme-dependent art."""
    self.dark_mode = not self.dark_mode
    self.settings.setValue('dark_mode', self.dark_mode)
    self.apply_theme()
    self.update_mode_icon()
    self.refresh_logo()


def update_mode_icon(self):
    """Show the icon of the theme the toggle would switch to (sun/moon)."""
    icon_name = 'sun.svg' if self.dark_mode else 'moon.svg'
    base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    icon_path = os.path.join(base_path, 'resources', 'icons', icon_name)

    if os.path.exists(icon_path):
        self.mode_toggle.setIcon(QIcon(icon_path))


def refresh_logo(self):
    """Load the logo matching the current theme; fall back to plain text."""
    base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    resources_path = os.path.join(base_path, 'resources', 'logo')

    if self.dark_mode:
        logo_filename = 'intelsight-name-dark.svg'
    else:
        logo_filename = 'intelsight-name-light.svg'
    logo_path = os.path.join(resources_path, logo_filename)

    if not os.path.exists(logo_path):
        print(f"Logo-Datei nicht gefunden: {logo_path}")
        self.logo_label.setText("IntelSight")  # fallback text
        return

    pixmap = QPixmap(logo_path)
    if pixmap.isNull():
        print(f"Fehler beim Laden des Logos: {logo_path}")
        self.logo_label.setText("IntelSight")  # fallback text
        return

    # Slightly smaller than the label so some padding remains.
    scaled_pixmap = pixmap.scaled(
        290, 65,
        Qt.AspectRatioMode.KeepAspectRatio,
        Qt.TransformationMode.SmoothTransformation
    )
    self.logo_label.setPixmap(scaled_pixmap)

    print(f"Logo erfolgreich geladen: {logo_filename} (Dark Mode: {self.dark_mode})")


def open_report(self, report_path: str):
    """Remember the finished report's path and enable the report button.

    Despite its name this slot does not open anything; the report is opened
    on demand by ``open_last_report``.
    """
    self.last_report_path = report_path
    self.report_button.setEnabled(True)


def open_last_report(self):
    """Open the most recent PDF report with the system's default viewer."""
    if self.last_report_path and os.path.exists(self.last_report_path):
        QDesktopServices.openUrl(QUrl.fromLocalFile(self.last_report_path))
    else:
        QMessageBox.information(self, "Info", "Kein Bericht vorhanden.")


def view_website(self):
    """Serve the last backup through a local web server and open a browser."""
    if not self.last_save_path or not os.path.exists(self.last_save_path):
        QMessageBox.information(self, "Info", "Keine gesicherte Webseite vorhanden.")
        return

    try:
        # Only one server at a time: stop a previous one first.
        if self.local_server:
            self.local_server.stop()

        self.local_server = LocalWebServer(self.last_save_path)
        url = self.local_server.start()
        self.local_server.open_in_browser()

        QMessageBox.information(
            self,
            "Webserver gestartet",
            f"Die Webseite wird auf {url} bereitgestellt.\n\n"
            "Der Server läuft im Hintergrund und wird beim Beenden der Anwendung gestoppt."
        )

    except Exception as e:
        QMessageBox.critical(self, "Fehler", f"Fehler beim Starten des Webservers:\n{str(e)}")


def closeEvent(self, event):
    """Stop server and worker thread and persist settings on window close."""
    if self.local_server:
        self.local_server.stop()

    self.settings.setValue('dark_mode', self.dark_mode)

    # A still-running download is terminated; wait() makes sure the thread
    # is really gone before the window is destroyed.
    if self.crawler_thread and self.crawler_thread.isRunning():
        self.crawler_thread.terminate()
        self.crawler_thread.wait()

    event.accept()
class LocalWebServer:
    """Simple local web server for viewing a saved website.

    The server only listens on the loopback interface and serves
    ``directory`` without changing the process' working directory.
    """

    def __init__(self, directory: str, port: int = 8000):
        """Remember what to serve; nothing is bound until ``start()``.

        Args:
            directory: Folder containing the saved website.
            port: First port to try; ``start()`` counts upwards from here.
        """
        self.directory = Path(directory).resolve()
        self.port = port
        self.server = None
        self.server_thread = None

    def start(self):
        """Start serving in a daemon thread and return the base URL.

        Raises:
            OSError: If no free port up to 9000 could be bound.
        """
        directory = str(self.directory)

        # Bind the directory to the handler instead of os.chdir(): changing
        # the working directory would affect the whole (GUI) process.
        class _DirectoryHandler(http.server.SimpleHTTPRequestHandler):
            def __init__(self, *args, **kwargs):
                super().__init__(*args, directory=directory, **kwargs)

        # Find a free port, counting upwards from the requested one.
        while True:
            try:
                # Loopback only: the backup must not be exposed to the LAN.
                self.server = socketserver.TCPServer(
                    ("127.0.0.1", self.port), _DirectoryHandler
                )
                break
            except OSError as err:
                self.port += 1
                if self.port > 9000:
                    raise OSError(
                        "Kein freier Port zwischen 8000 und 9000 gefunden"
                    ) from err

        self.server_thread = threading.Thread(target=self.server.serve_forever)
        self.server_thread.daemon = True
        self.server_thread.start()

        return f"http://localhost:{self.port}"

    def stop(self):
        """Stop the server and release its socket; safe to call repeatedly."""
        if self.server is None:
            return
        self.server.shutdown()
        if self.server_thread is not None:
            self.server_thread.join()
        # shutdown() only ends serve_forever(); the listening socket stays
        # open (and the port blocked) until server_close() is called.
        self.server.server_close()
        self.server = None
        self.server_thread = None

    def open_in_browser(self):
        """Open the served site in the default browser and return its URL."""
        url = f"http://localhost:{self.port}"
        webbrowser.open(url)
        return url


def serve_website(directory: str, auto_open: bool = True):
    """
    Start a local web server for the saved website and block until Ctrl+C.

    Args:
        directory: Path to the folder containing the saved website
        auto_open: Open the browser automatically
    """
    server = LocalWebServer(directory)
    url = server.start()

    print(f"Webserver gestartet auf {url}")
    print("Drücken Sie Strg+C zum Beenden")

    if auto_open:
        server.open_in_browser()

    try:
        # Sleep in short joins instead of spinning at 100 % CPU; the timeout
        # keeps the wait interruptible by Ctrl+C on every platform.
        while server.server_thread.is_alive():
            server.server_thread.join(timeout=0.5)
    except KeyboardInterrupt:
        print("\nServer wird beendet...")
        server.stop()


if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        serve_website(sys.argv[1])
    else:
        print("Verwendung: python local_server.py <Verzeichnis>")
class PDFReport:
    """Render the PDF backup report of a crawl with ReportLab.

    Usage: ``PDFReport().generate_report(...)`` returns the path of the
    written PDF.
    """

    # Display category -> file extensions that belong to it. 'Sonstige'
    # collects everything that matches no other category.
    FILE_CATEGORIES = {
        'HTML/Webseiten': ['.html', '.htm', '.xhtml', '.php', '.asp', '.aspx', '.jsp'],
        'Bilder': ['.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico', '.bmp', '.tiff'],
        'CSS/Stylesheets': ['.css', '.scss', '.sass', '.less'],
        'JavaScript': ['.js', '.mjs', '.jsx', '.ts', '.tsx'],
        'Videos': ['.mp4', '.webm', '.ogg', '.avi', '.mov', '.flv', '.wmv', '.m4v'],
        'Schriften': ['.woff', '.woff2', '.ttf', '.otf', '.eot'],
        'Dokumente': ['.pdf', '.doc', '.docx', '.txt', '.rtf'],
        'Sonstige': [],
    }
    MAX_FILES_PER_CATEGORY = 20  # rows listed per file category
    MAX_URLS_PER_REASON = 10     # rows listed per skip reason
    GRID_COLOR = '#E9ECEF'
    SHADE_COLOR = '#F8F9FA'

    def __init__(self):
        self.styles = getSampleStyleSheet()
        self._create_custom_styles()

    def _create_custom_styles(self):
        """Register the report's paragraph styles in ``self.styles``."""
        navy = colors.HexColor('#232D53')
        style_specs = [
            ('CustomTitle', 'Heading1',
             dict(fontSize=24, textColor=navy, spaceAfter=30, alignment=TA_CENTER)),
            ('CustomSubtitle', 'Normal',
             dict(fontSize=14, textColor=colors.HexColor('#6C757D'),
                  spaceAfter=20, alignment=TA_CENTER)),
            ('SectionHeader', 'Heading2',
             dict(fontSize=16, textColor=navy, spaceAfter=12, spaceBefore=20)),
            ('InfoText', 'Normal',
             dict(fontSize=11, textColor=colors.HexColor('#495057'),
                  alignment=TA_JUSTIFY)),
            ('ErrorText', 'Normal',
             dict(fontSize=10, textColor=colors.HexColor('#FF4444'), leftIndent=20)),
            ('SuccessText', 'Normal',
             dict(fontSize=10, textColor=colors.HexColor('#4CAF50'), leftIndent=20)),
            # wordWrap='CJK' lets long URLs break anywhere inside a table cell.
            ('TableCell', 'Normal', dict(fontSize=9, leading=11, wordWrap='CJK')),
            ('SmallTableCell', 'Normal', dict(fontSize=8, leading=10, wordWrap='CJK')),
        ]
        for name, parent, attrs in style_specs:
            self.styles.add(ParagraphStyle(name=name, parent=self.styles[parent], **attrs))

    @staticmethod
    def _escape(text):
        """Escape ``&``, ``<`` and ``>`` so Paragraph does not read them as markup."""
        from xml.sax.saxutils import escape
        return escape(str(text))

    @staticmethod
    def _file_extension(url, local_path=""):
        """Return the lower-cased extension of a downloaded resource.

        Only the URL's path is inspected (query string and fragment are
        ignored); if it carries no extension, the local file name decides.
        """
        from urllib.parse import urlparse
        try:
            path = urlparse(url).path
        except ValueError:
            path = url  # malformed URL: fall back to the raw string
        ext = os.path.splitext(path)[1].lower()
        if not ext and local_path:
            ext = os.path.splitext(local_path)[1].lower()
        return ext

    @classmethod
    def _categorize_resources(cls, downloaded_resources):
        """Group ``{url: local_path}`` by file category.

        Returns:
            ``(categorized, numeric_extensions)``: a dict mapping each
            category to its ``(url, local_path)`` tuples, and the set of
            purely numeric extensions (``.0``, ``.1`` ...) that were seen.
        """
        category_of = {
            ext: category
            for category, extensions in cls.FILE_CATEGORIES.items()
            for ext in extensions
        }
        categorized = {category: [] for category in cls.FILE_CATEGORIES}
        numeric_extensions = set()
        for resource_url, local_path in downloaded_resources.items():
            ext = cls._file_extension(resource_url, local_path)
            if ext[1:].isdigit():
                numeric_extensions.add(ext)
            categorized[category_of.get(ext, 'Sonstige')].append((resource_url, local_path))
        return categorized, numeric_extensions

    def generate_report(self,
                       url: str,
                       save_path: str,
                       start_time: datetime,
                       end_time: datetime,
                       downloaded_resources: Dict[str, str],
                       skipped_urls: Dict[str, str],
                       options: Dict,
                       success: bool,
                       output_path: Optional[str] = None,
                       error_message: Optional[str] = None) -> str:
        """Generate the PDF report and return its path.

        Args:
            url: Start URL of the crawl.
            save_path: Directory the website was saved to.
            start_time: When the crawl started.
            end_time: When the crawl ended or was aborted.
            downloaded_resources: Mapping of resource URL to local file path.
            skipped_urls: Mapping of skipped URL to the reason it was skipped.
            options: Crawl options; ``backup_type`` selects what is listed.
            success: Whether the crawl finished successfully.
            output_path: Target file; defaults to a time-stamped
                ``crawler_report_*.pdf`` inside ``save_path``.
            error_message: Optional error text shown in the report.

        Returns:
            The path of the written PDF file.
        """
        downloaded_resources = downloaded_resources or {}
        skipped_urls = skipped_urls or {}
        options = options or {}

        if not output_path:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"crawler_report_{timestamp}.pdf"
            output_path = os.path.join(save_path, filename)

        # An aborted run may not have created the target directory yet.
        target_dir = os.path.dirname(output_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        doc = SimpleDocTemplate(
            output_path,
            pagesize=A4,
            rightMargin=2*cm,
            leftMargin=2*cm,
            topMargin=2*cm,
            bottomMargin=2*cm
        )

        # The "story" is ReportLab's ordered list of flowables.
        story = [
            Paragraph("IntelSight Webseiten-Crawler", self.styles['CustomTitle']),
            Paragraph("Sicherungsbericht", self.styles['CustomSubtitle']),
            Spacer(1, 0.5*inch),
        ]
        story += self._summary_section(
            url, save_path, start_time, end_time,
            len(downloaded_resources), len(skipped_urls), success, error_message
        )
        if not success and error_message:
            story += self._error_section(error_message)
        story += self._options_section(options)
        if downloaded_resources:
            story += self._files_section(downloaded_resources)
        if skipped_urls:
            story += self._skipped_section(skipped_urls)

        # Footer
        story.append(Spacer(1, 0.5*inch))
        story.append(Paragraph(
            f"Bericht erstellt am {datetime.now().strftime('%d.%m.%Y um %H:%M:%S')}",
            self.styles['CustomSubtitle']
        ))

        doc.build(story)
        return output_path

    def _summary_section(self, url, save_path, start_time, end_time,
                         file_count, skipped_count, success, error_message):
        """Build the key/value summary table at the top of the report."""
        status_text = 'Erfolgreich' if success else 'Fehlgeschlagen'
        if error_message:
            status_text += f' - {error_message}'
        duration = end_time - start_time

        cell_style = self.styles['TableCell']
        label_style = ParagraphStyle(
            name='LabelCell',
            parent=cell_style,
            alignment=TA_RIGHT,
            fontName='Helvetica-Bold'
        )
        rows = [
            ('Status:', status_text),
            ('URL:', url),
            ('Speicherort:', save_path),
            ('Startzeit:', start_time.strftime('%d.%m.%Y %H:%M:%S')),
            ('Endzeit:', end_time.strftime('%d.%m.%Y %H:%M:%S')),
            ('Dauer:', f"{duration.total_seconds():.1f} Sekunden"),
            ('Gesicherte Dateien:', str(file_count)),
            ('Übersprungene URLs:', str(skipped_count)),
        ]
        # Paragraph cells wrap automatically; values are escaped because
        # URLs and paths may contain '&' or '<'.
        summary_data = [
            [Paragraph(label, label_style), Paragraph(self._escape(value), cell_style)]
            for label, value in rows
        ]

        # Wide label column: the German labels are comparatively long.
        summary_table = Table(summary_data, colWidths=[4.5*cm, 11*cm])
        summary_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (0, -1), colors.HexColor(self.SHADE_COLOR)),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor(self.GRID_COLOR)),
            ('ROWBACKGROUNDS', (0, 0), (-1, -1), [colors.white, colors.HexColor(self.SHADE_COLOR)]),
            ('TOPPADDING', (0, 0), (-1, -1), 8),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
            ('LEFTPADDING', (0, 0), (0, -1), 5),
            ('RIGHTPADDING', (0, 0), (0, -1), 10),
        ]))

        return [
            Paragraph("Zusammenfassung", self.styles['SectionHeader']),
            summary_table,
            Spacer(1, 0.3*inch),
        ]

    def _error_section(self, error_message):
        """Build the highlighted error-details block."""
        error_style = ParagraphStyle(
            name='ErrorDetail',
            parent=self.styles['Normal'],
            fontSize=12,
            textColor=colors.HexColor('#DC3545'),
            leftIndent=20,
            spaceBefore=6,
            spaceAfter=6
        )
        return [
            Paragraph("Fehlerdetails", self.styles['SectionHeader']),
            Paragraph(self._escape(error_message), error_style),
            Spacer(1, 0.3*inch),
        ]

    def _options_section(self, options):
        """Build the table of download options used for this run."""
        cell_style = self.styles['TableCell']

        def yes_no(key):
            return Paragraph('Ja' if options.get(key, False) else 'Nein', cell_style)

        backup_type = options.get('backup_type', 'snapshot')
        backup_type_text = 'Webseiten-Snapshot' if backup_type == 'snapshot' else 'Gesamte Webseite'
        options_data = [
            ['Sicherungsart:', Paragraph(backup_type_text, cell_style)],
        ]

        # The individual switches only apply to a full backup.
        if backup_type == 'full':
            options_data.extend([
                ['Bilder herunterladen:', yes_no('download_images')],
                ['Videos herunterladen:', yes_no('download_videos')],
                ['CSS-Dateien herunterladen:', yes_no('download_css')],
                ['JavaScript herunterladen:', yes_no('download_js')],
                ['Maximale Tiefe:', Paragraph(str(options.get('max_depth', 0)), cell_style)],
            ])

        options_table = Table(options_data, colWidths=[6*cm, 9.5*cm])
        options_table.setStyle(TableStyle([
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor(self.GRID_COLOR)),
            ('TOPPADDING', (0, 0), (-1, -1), 6),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
        ]))

        return [
            Paragraph("Download-Optionen", self.styles['SectionHeader']),
            options_table,
            Spacer(1, 0.3*inch),
        ]

    def _files_section(self, downloaded_resources):
        """Build the per-category overview and listing of saved files."""
        info_style = self.styles['InfoText']
        small_style = self.styles['SmallTableCell']
        grid = colors.HexColor(self.GRID_COLOR)

        flowables = [
            PageBreak(),
            Paragraph("Gesicherte Dateien", self.styles['SectionHeader']),
            Paragraph(
                f"Insgesamt wurden {len(downloaded_resources)} Dateien erfolgreich gesichert.",
                info_style
            ),
        ]

        categorized_files, unknown_extensions = self._categorize_resources(downloaded_resources)

        # Overview: one row per non-empty category.
        flowables.append(Spacer(1, 0.3*inch))
        flowables.append(Paragraph("Übersicht nach Kategorie:", info_style))
        category_data = [
            [category, str(len(files))]
            for category, files in categorized_files.items() if files
        ]
        if category_data:
            category_table = Table(category_data, colWidths=[6*cm, 2*cm])
            category_table.setStyle(TableStyle([
                ('FONTSIZE', (0, 0), (-1, -1), 10),
                ('GRID', (0, 0), (-1, -1), 0.5, grid),
                ('TOPPADDING', (0, 0), (-1, -1), 4),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
                ('BACKGROUND', (0, 0), (0, -1), colors.HexColor(self.SHADE_COLOR)),
                ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
            ]))
            flowables.append(category_table)

        if unknown_extensions:
            flowables.append(Spacer(1, 0.2*inch))
            flowables.append(Paragraph(
                f"Hinweis: Dateien mit numerischen Endungen ({', '.join(sorted(unknown_extensions))}) " +
                "sind vermutlich versionierte Ressourcen oder Chunks von größeren Dateien.",
                info_style
            ))

        # Detailed listing, capped per category to keep the report short.
        limit = self.MAX_FILES_PER_CATEGORY
        for category, files in categorized_files.items():
            if not files:
                continue
            flowables.append(Spacer(1, 0.3*inch))
            flowables.append(Paragraph(f"{category} ({len(files)} Dateien):", info_style))
            if len(files) > limit:
                flowables.append(Paragraph(
                    f"(Zeige erste {limit} von {len(files)} Dateien)",
                    self.styles['CustomSubtitle']
                ))

            file_data = [
                [
                    Paragraph(self._escape(resource_url), small_style),
                    Paragraph(self._escape(os.path.basename(local_path)), small_style),
                ]
                for resource_url, local_path in files[:limit]
            ]
            # Most of the width goes to the URL column.
            file_table = Table(file_data, colWidths=[12*cm, 3.5*cm])
            file_table.setStyle(TableStyle([
                ('FONTSIZE', (0, 0), (-1, -1), 8),
                ('GRID', (0, 0), (-1, -1), 0.5, grid),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ]))
            flowables.append(file_table)

        return flowables

    def _skipped_section(self, skipped_urls):
        """Build the listing of skipped URLs, grouped by skip reason."""
        info_style = self.styles['InfoText']
        small_style = self.styles['SmallTableCell']
        limit = self.MAX_URLS_PER_REASON

        flowables = [
            PageBreak(),
            Paragraph("Übersprungene URLs", self.styles['SectionHeader']),
            Paragraph(f"Es wurden {len(skipped_urls)} URLs übersprungen.", info_style),
            Spacer(1, 0.3*inch),
        ]

        skip_reasons = {}
        for skipped_url, reason in skipped_urls.items():
            skip_reasons.setdefault(reason, []).append(skipped_url)

        # Sorted so that the report is stable between runs.
        for reason, urls in sorted(skip_reasons.items(), key=lambda item: str(item[0])):
            flowables.append(Paragraph(
                f"{self._escape(reason)} ({len(urls)} URLs):", info_style
            ))
            if len(urls) > limit:
                flowables.append(Paragraph(
                    f"(Zeige erste {limit} von {len(urls)} URLs)",
                    self.styles['CustomSubtitle']
                ))

            skip_data = [
                [Paragraph(self._escape(skipped_url), small_style)]
                for skipped_url in urls[:limit]
            ]
            skip_table = Table(skip_data, colWidths=[15.5*cm])
            skip_table.setStyle(TableStyle([
                ('FONTSIZE', (0, 0), (-1, -1), 8),
                ('GRID', (0, 0), (-1, -1), 0.5, colors.HexColor(self.GRID_COLOR)),
                ('VALIGN', (0, 0), (-1, -1), 'TOP'),
                ('TOPPADDING', (0, 0), (-1, -1), 2),
                ('BOTTOMPADDING', (0, 0), (-1, -1), 2),
            ]))
            flowables.append(skip_table)
            flowables.append(Spacer(1, 0.2*inch))

        return flowables