Add complete Mail Fine-Tuning Web-App for macOS Apple Silicon
Implemented a full-stack web application for fine-tuning LLMs on email data, optimized for Apple Silicon (M4 Pro with 24GB RAM).

Features:
- Mail import with drag & drop support (.mbox, .eml, .txt)
- Automated mail cleaning and preprocessing
- Interactive labeling interface with keyboard shortcuts
- Training data export to JSONL format
- MLX-based LoRA fine-tuning with live updates
- Model evaluation and comparison interface
- Server-Sent Events for real-time training progress
- Dark theme UI optimized for extended use

Technical Stack:
- Backend: FastAPI with SQLite database
- Frontend: Vanilla HTML/CSS/JavaScript (no external dependencies)
- ML Framework: MLX for Apple Silicon optimization
- Models: Support for Mistral 7B and Llama 3 8B via MLX

Components:
- data_manager.py: SQLite operations for mail storage and labeling
- mail_parser.py: Parser for multiple mail formats with cleaning
- training.py: MLX training wrapper with LoRA support
- inference.py: Model loading and inference for evaluation
- main.py: FastAPI backend with REST API and SSE
- Frontend: Complete UI with all features

Documentation:
- Comprehensive README with installation and usage guide
- Quick-start guide for rapid setup
- Example mails for testing
- Troubleshooting and best practices

Ready for local deployment and fine-tuning workflows.
This commit is contained in:
@@ -0,0 +1,264 @@
|
||||
"""
Mail parser for several file formats.
Cleans and normalizes mail contents.
"""
|
||||
|
||||
import email
|
||||
import mailbox
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Dict, Optional
|
||||
from pathlib import Path
|
||||
import chardet
|
||||
|
||||
|
||||
class MailParser:
    """Parse mail files (.eml, .mbox, .txt) and clean their bodies.

    Every parser returns plain dictionaries with the keys ``subject``,
    ``sender``, ``recipient``, ``date``, ``body`` and ``original_format``.
    All methods are static; the class is used as a namespace.
    """

    # Common footer / disclaimer patterns. ``remove_footers`` cuts the text at
    # the start of the first match of each pattern.
    FOOTER_PATTERNS = [
        r'(?i)^--\s*$.*',  # Standard signature delimiter
        r'(?i)Diese E-Mail.*vertraulich.*',
        r'(?i)This email.*confidential.*',
        r'(?i)Disclaimer:.*',
        r'(?i)Get Outlook for.*',
        r'(?i)Sent from my iPhone.*',
        r'(?i)Von meinem.*gesendet.*',
        r'(?i)Diese Nachricht.*Virenfrei.*',
    ]

    # Header prefixes recognised by ``parse_txt`` and the record key they fill.
    _TXT_HEADERS = (
        ('subject:', 'subject'),
        ('from:', 'sender'),
        ('to:', 'recipient'),
        ('date:', 'date'),
    )

    @staticmethod
    def detect_encoding(file_path: Path) -> str:
        """Return chardet's encoding guess for a file, or 'utf-8' if it has none."""
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        return chardet.detect(raw_data)['encoding'] or 'utf-8'

    @staticmethod
    def html_to_text(html: str) -> str:
        """Convert HTML to plain text.

        Script and style elements are dropped; the remaining text is split
        into non-empty chunks that are joined with single spaces.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Script and style contents are not visible text.
        for tag in soup(['script', 'style']):
            tag.decompose()

        text = soup.get_text()

        # Collapse whitespace: strip every line, split on spaces, drop empties.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return ' '.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def remove_multiple_newlines(text: str) -> str:
        """Collapse runs of three or more newlines into exactly two."""
        return re.sub(r'\n{3,}', '\n\n', text)

    @staticmethod
    def remove_footers(text: str) -> str:
        """Cut common footers and disclaimers off the end of a text.

        Each pattern in ``FOOTER_PATTERNS`` is searched in turn; on a match
        everything from the match start onwards is discarded.

        NOTE(review): the search uses DOTALL, so ``.*`` inside a pattern
        spans line breaks. The two halves of a pattern such as
        ``This email.*confidential`` may therefore be many lines apart and
        still trigger truncation.
        """
        for pattern in MailParser.FOOTER_PATTERNS:
            match = re.search(pattern, text, re.MULTILINE | re.DOTALL)
            if match:
                text = text[:match.start()].strip()
        return text

    @staticmethod
    def clean_quoted_text(text: str) -> str:
        """Drop quoted lines, i.e. lines whose first non-blank char is '>' or '|'."""
        return '\n'.join(
            line for line in text.split('\n')
            if not line.strip().startswith(('>', '|'))
        )

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Strip trailing spaces, collapse repeated spaces and blank lines."""
        text = '\n'.join(line.rstrip() for line in text.split('\n'))
        text = re.sub(r' {2,}', ' ', text)
        text = MailParser.remove_multiple_newlines(text)
        return text.strip()

    @staticmethod
    def clean_text(text: str, is_html: bool = False) -> str:
        """Run the full cleaning pipeline on a mail body.

        Args:
            text: Raw body text (or HTML markup when ``is_html`` is true).
            is_html: Convert ``text`` from HTML to plain text first.

        Returns:
            The body without footers, quoted lines and redundant whitespace.
        """
        if is_html:
            text = MailParser.html_to_text(text)
        text = MailParser.remove_footers(text)
        text = MailParser.clean_quoted_text(text)
        return MailParser.normalize_whitespace(text)

    @staticmethod
    def _decode_bytes(payload: bytes, charset: Optional[str] = None) -> str:
        """Decode bytes using the declared charset, then UTF-8, then a guess."""
        for candidate in (charset, 'utf-8'):
            if not candidate:
                continue
            try:
                return payload.decode(candidate)
            except (LookupError, UnicodeDecodeError):
                continue
        # Neither the declared charset nor UTF-8 fits: fall back to detection
        # and drop whatever still cannot be decoded.
        guessed = chardet.detect(payload)['encoding'] or 'utf-8'
        try:
            return payload.decode(guessed, errors='ignore')
        except LookupError:
            return payload.decode('utf-8', errors='ignore')

    @staticmethod
    def _read_text(file_path: Path) -> str:
        """Read a text file as str with '\\n' line endings."""
        raw = Path(file_path).read_bytes()
        # 'utf-8-sig' also strips a leading byte-order mark.
        text = MailParser._decode_bytes(raw, 'utf-8-sig')
        return text.replace('\r\n', '\n').replace('\r', '\n')

    @staticmethod
    def _decode_header(value, default: str) -> str:
        """Return a header as readable text, decoding RFC 2047 encoded words."""
        if value is None:
            return default
        from email.errors import HeaderParseError
        from email.header import decode_header, make_header
        try:
            text = str(make_header(decode_header(value)))
        except (LookupError, ValueError, HeaderParseError):
            # Malformed encoded word or unknown charset: keep the raw value.
            text = str(value)
        # Unfold continuation lines (a line break followed by whitespace).
        return re.sub(r'\r?\n(?=[ \t])', '', text)

    @staticmethod
    def _extract_body(message) -> tuple:
        """Return ``(text, is_html)`` for the best body part of a message.

        For multipart messages the first non-empty ``text/plain`` part wins;
        the first non-empty ``text/html`` part is used only if no plain part
        exists. Parts marked as attachments are skipped.
        """
        if not message.is_multipart():
            payload = message.get_payload(decode=True)
            if not payload:
                return '', False
            text = MailParser._decode_bytes(payload, message.get_content_charset())
            return text, message.get_content_type() == 'text/html'

        html_fallback = None
        for part in message.walk():
            content_type = part.get_content_type()
            if content_type not in ('text/plain', 'text/html'):
                continue
            if part.get_content_disposition() == 'attachment':
                continue
            payload = part.get_payload(decode=True)
            if not payload:
                continue
            text = MailParser._decode_bytes(payload, part.get_content_charset())
            if content_type == 'text/plain':
                # Plain text always wins, even if an HTML part came first.
                return text, False
            if html_fallback is None:
                html_fallback = text

        if html_fallback is not None:
            return html_fallback, True
        return '', False

    @staticmethod
    def _message_to_record(message, original_format: str) -> Dict:
        """Build the result dictionary for one parsed message object."""
        body, is_html = MailParser._extract_body(message)
        return {
            'subject': MailParser._decode_header(message.get('Subject'), 'No Subject'),
            'sender': MailParser._decode_header(message.get('From'), 'Unknown'),
            'recipient': MailParser._decode_header(message.get('To'), 'Unknown'),
            'date': MailParser._decode_header(message.get('Date'), ''),
            'body': MailParser.clean_text(body, is_html),
            'original_format': original_format,
        }

    @staticmethod
    def parse_eml(file_path: Path) -> Dict:
        """Parse a single .eml file into a mail record.

        The file is parsed from bytes so that 8-bit bodies reach the
        per-part charset decoding unchanged.
        """
        with open(file_path, 'rb') as f:
            msg = email.message_from_binary_file(f)
        return MailParser._message_to_record(msg, 'eml')

    @staticmethod
    def parse_mbox(file_path: Path) -> List[Dict]:
        """Parse every message of an .mbox file into mail records.

        Raises:
            Exception: If the mailbox is missing or cannot be parsed; the
                original error is chained as ``__cause__``.
        """
        mbox = None
        try:
            # create=False: a missing path is an error, not a new empty mailbox.
            mbox = mailbox.mbox(str(file_path), create=False)
            return [MailParser._message_to_record(message, 'mbox') for message in mbox]
        except Exception as e:
            raise Exception(f"Error parsing mbox: {str(e)}") from e
        finally:
            if mbox is not None:
                mbox.close()

    @staticmethod
    def parse_txt(file_path: Path) -> Dict:
        """Parse a .txt file holding one mail as plain text.

        ``Subject:``, ``From:``, ``To:`` and ``Date:`` lines are looked for
        within the first ten lines. The scan stops at the first blank line
        that follows a recognised header, so body lines that merely look
        like headers stay in the body. Everything after the last recognised
        header line is the body.
        """
        lines = MailParser._read_text(file_path).split('\n')
        fields = {
            'subject': 'No Subject',
            'sender': 'Unknown',
            'recipient': 'Unknown',
            'date': '',
        }
        body_start = 0
        header_seen = False

        for i, line in enumerate(lines[:10]):
            if not line.strip():
                if header_seen:
                    break  # blank separator: the header block is over
                continue
            lowered = line.lower()
            for prefix, key in MailParser._TXT_HEADERS:
                if lowered.startswith(prefix):
                    fields[key] = line[len(prefix):].strip()
                    body_start = i + 1
                    header_seen = True
                    break

        body = MailParser.clean_text('\n'.join(lines[body_start:]))

        return {
            'subject': fields['subject'],
            'sender': fields['sender'],
            'recipient': fields['recipient'],
            'date': fields['date'],
            'body': body,
            'original_format': 'txt',
        }

    @staticmethod
    def parse_file(file_path: Path) -> List[Dict]:
        """Parse a mail file, choosing the parser by file extension.

        Args:
            file_path: Path (or path string) of a .eml, .mbox or .txt file.

        Returns:
            A list of mail records; one entry for .eml and .txt files.

        Raises:
            ValueError: If the extension is not supported.
        """
        file_path = Path(file_path)
        suffix = file_path.suffix.lower()

        if suffix == '.eml':
            return [MailParser.parse_eml(file_path)]
        if suffix == '.mbox':
            return MailParser.parse_mbox(file_path)
        if suffix == '.txt':
            return [MailParser.parse_txt(file_path)]
        raise ValueError(f"Unsupported file format: {suffix}")
|
||||
Reference in New Issue
Block a user