Add complete Mail Fine-Tuning Web-App for macOS Apple Silicon

Implemented a full-stack web application for fine-tuning LLMs on email data, optimized for Apple Silicon (M4 Pro with 24GB RAM).

Features:
- Mail import with drag & drop support (.mbox, .eml, .txt)
- Automated mail cleaning and preprocessing
- Interactive labeling interface with keyboard shortcuts
- Training data export to JSONL format
- MLX-based LoRA fine-tuning with live updates
- Model evaluation and comparison interface
- Server-Sent Events for real-time training progress
- Dark theme UI optimized for extended use

Technical Stack:
- Backend: FastAPI with SQLite database
- Frontend: Vanilla HTML/CSS/JavaScript (no external dependencies)
- ML Framework: MLX for Apple Silicon optimization
- Models: Support for Mistral 7B and Llama 3 8B via MLX

Components:
- data_manager.py: SQLite operations for mail storage and labeling
- mail_parser.py: Parser for multiple mail formats with cleaning
- training.py: MLX training wrapper with LoRA support
- inference.py: Model loading and inference for evaluation
- main.py: FastAPI backend with REST API and SSE
- Frontend: Complete UI with all features

Documentation:
- Comprehensive README with installation and usage guide
- Quick-start guide for rapid setup
- Example mails for testing
- Troubleshooting and best practices

Ready for local deployment and fine-tuning workflows.
2025-12-03 07:35:35 +00:00
commit 1456995462
20 changed files with 3884 additions and 0 deletions
@@ -0,0 +1,264 @@
+"""
+Mail Parser für verschiedene Formate
+Bereinigt und normalisiert Mail-Inhalte
+"""
+
+import email
+import mailbox
+import re
+from bs4 import BeautifulSoup
+from typing import List, Dict, Optional
+from pathlib import Path
+import chardet
+
+
+class MailParser:
+    """Parst und bereinigt Mail-Dateien"""
+
+    # Häufige Footer/Disclaimer Pattern
+    FOOTER_PATTERNS = [
+        r'(?i)^--\s*$.*',  # Standard signature delimiter
+        r'(?i)Diese E-Mail.*vertraulich.*',
+        r'(?i)This email.*confidential.*',
+        r'(?i)Disclaimer:.*',
+        r'(?i)Get Outlook for.*',
+        r'(?i)Sent from my iPhone.*',
+        r'(?i)Von meinem.*gesendet.*',
+        r'(?i)Diese Nachricht.*Virenfrei.*',
+    ]
+
+    @staticmethod
+    def detect_encoding(file_path: Path) -> str:
+        """Erkennt das Encoding einer Datei"""
+        with open(file_path, 'rb') as f:
+            raw_data = f.read()
+            result = chardet.detect(raw_data)
+            return result['encoding'] or 'utf-8'
+
+    @staticmethod
+    def html_to_text(html: str) -> str:
+        """Konvertiert HTML zu Plain Text"""
+        soup = BeautifulSoup(html, 'html.parser')
+
+        # Entferne Script und Style Tags
+        for script in soup(['script', 'style']):
+            script.decompose()
+
+        # Extrahiere Text
+        text = soup.get_text()
+
+        # Bereinige Whitespace
+        lines = (line.strip() for line in text.splitlines())
+        chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
+        text = ' '.join(chunk for chunk in chunks if chunk)
+
+        return text
+
+    @staticmethod
+    def remove_multiple_newlines(text: str) -> str:
+        """Entfernt mehrfache Leerzeilen"""
+        return re.sub(r'\n{3,}', '\n\n', text)
+
+    @staticmethod
+    def remove_footers(text: str) -> str:
+        """Entfernt häufige Footer und Disclaimer"""
+        for pattern in MailParser.FOOTER_PATTERNS:
+            # Suche Pattern und entferne alles danach
+            match = re.search(pattern, text, re.MULTILINE | re.DOTALL)
+            if match:
+                text = text[:match.start()].strip()
+
+        return text
+
+    @staticmethod
+    def clean_quoted_text(text: str) -> str:
+        """Entfernt oder markiert quoted Text (> oder |)"""
+        lines = text.split('\n')
+        cleaned_lines = []
+
+        for line in lines:
+            # Überspringe Zeilen die mit > oder | beginnen (quoted text)
+            if not line.strip().startswith('>') and not line.strip().startswith('|'):
+                cleaned_lines.append(line)
+
+        return '\n'.join(cleaned_lines)
+
+    @staticmethod
+    def normalize_whitespace(text: str) -> str:
+        """Normalisiert Whitespace"""
+        # Entferne trailing spaces
+        lines = [line.rstrip() for line in text.split('\n')]
+        text = '\n'.join(lines)
+
+        # Entferne mehrfache Spaces
+        text = re.sub(r' {2,}', ' ', text)
+
+        # Entferne mehrfache Leerzeilen
+        text = MailParser.remove_multiple_newlines(text)
+
+        return text.strip()
+
+    @staticmethod
+    def clean_text(text: str, is_html: bool = False) -> str:
+        """Vollständige Bereinigung eines Texts"""
+        if is_html:
+            text = MailParser.html_to_text(text)
+
+        text = MailParser.remove_footers(text)
+        text = MailParser.clean_quoted_text(text)
+        text = MailParser.normalize_whitespace(text)
+
+        return text
+
+    @staticmethod
+    def parse_eml(file_path: Path) -> Dict:
+        """Parst eine .eml Datei"""
+        encoding = MailParser.detect_encoding(file_path)
+
+        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
+            msg = email.message_from_file(f)
+
+        subject = msg.get('Subject', 'No Subject')
+        sender = msg.get('From', 'Unknown')
+        recipient = msg.get('To', 'Unknown')
+        date = msg.get('Date', '')
+
+        # Body extrahieren
+        body = ""
+        is_html = False
+
+        if msg.is_multipart():
+            for part in msg.walk():
+                content_type = part.get_content_type()
+                if content_type == 'text/plain':
+                    body = part.get_payload(decode=True).decode(errors='ignore')
+                    break
+                elif content_type == 'text/html' and not body:
+                    body = part.get_payload(decode=True).decode(errors='ignore')
+                    is_html = True
+        else:
+            body = msg.get_payload(decode=True).decode(errors='ignore')
+            if msg.get_content_type() == 'text/html':
+                is_html = True
+
+        # Bereinige Body
+        body = MailParser.clean_text(body, is_html)
+
+        return {
+            'subject': subject,
+            'sender': sender,
+            'recipient': recipient,
+            'date': date,
+            'body': body,
+            'original_format': 'eml'
+        }
+
+    @staticmethod
+    def parse_mbox(file_path: Path) -> List[Dict]:
+        """Parst eine .mbox Datei"""
+        mails = []
+
+        try:
+            mbox = mailbox.mbox(str(file_path))
+
+            for message in mbox:
+                subject = message.get('Subject', 'No Subject')
+                sender = message.get('From', 'Unknown')
+                recipient = message.get('To', 'Unknown')
+                date = message.get('Date', '')
+
+                body = ""
+                is_html = False
+
+                if message.is_multipart():
+                    for part in message.walk():
+                        content_type = part.get_content_type()
+                        if content_type == 'text/plain':
+                            payload = part.get_payload(decode=True)
+                            if payload:
+                                body = payload.decode(errors='ignore')
+                            break
+                        elif content_type == 'text/html' and not body:
+                            payload = part.get_payload(decode=True)
+                            if payload:
+                                body = payload.decode(errors='ignore')
+                                is_html = True
+                else:
+                    payload = message.get_payload(decode=True)
+                    if payload:
+                        body = payload.decode(errors='ignore')
+                        if message.get_content_type() == 'text/html':
+                            is_html = True
+
+                body = MailParser.clean_text(body, is_html)
+
+                mails.append({
+                    'subject': subject,
+                    'sender': sender,
+                    'recipient': recipient,
+                    'date': date,
+                    'body': body,
+                    'original_format': 'mbox'
+                })
+
+        except Exception as e:
+            raise Exception(f"Error parsing mbox: {str(e)}")
+
+        return mails
+
+    @staticmethod
+    def parse_txt(file_path: Path) -> Dict:
+        """Parst eine .txt Datei (simple Mail als Text)"""
+        encoding = MailParser.detect_encoding(file_path)
+
+        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
+            content = f.read()
+
+        # Einfache Struktur: Versuche Subject/From/To zu erkennen
+        lines = content.split('\n')
+        subject = 'No Subject'
+        sender = 'Unknown'
+        recipient = 'Unknown'
+        date = ''
+        body_start = 0
+
+        for i, line in enumerate(lines[:10]):  # Erste 10 Zeilen prüfen
+            if line.lower().startswith('subject:'):
+                subject = line[8:].strip()
+                body_start = max(body_start, i + 1)
+            elif line.lower().startswith('from:'):
+                sender = line[5:].strip()
+                body_start = max(body_start, i + 1)
+            elif line.lower().startswith('to:'):
+                recipient = line[3:].strip()
+                body_start = max(body_start, i + 1)
+            elif line.lower().startswith('date:'):
+                date = line[5:].strip()
+                body_start = max(body_start, i + 1)
+
+        # Body ist der Rest
+        body = '\n'.join(lines[body_start:])
+        body = MailParser.clean_text(body)
+
+        return {
+            'subject': subject,
+            'sender': sender,
+            'recipient': recipient,
+            'date': date,
+            'body': body,
+            'original_format': 'txt'
+        }
+
+    @staticmethod
+    def parse_file(file_path: Path) -> List[Dict]:
+        """Parst eine Mail-Datei basierend auf Endung"""
+        suffix = file_path.suffix.lower()
+
+        if suffix == '.eml':
+            return [MailParser.parse_eml(file_path)]
+        elif suffix == '.mbox':
+            return MailParser.parse_mbox(file_path)
+        elif suffix == '.txt':
+            return [MailParser.parse_txt(file_path)]
+        else:
+            raise ValueError(f"Unsupported file format: {suffix}")