1456995462
Implemented a full-stack web application for fine-tuning LLMs on email data, optimized for Apple Silicon (M4 Pro with 24GB RAM). Features: - Mail import with drag & drop support (.mbox, .eml, .txt) - Automated mail cleaning and preprocessing - Interactive labeling interface with keyboard shortcuts - Training data export to JSONL format - MLX-based LoRA fine-tuning with live updates - Model evaluation and comparison interface - Server-Sent Events for real-time training progress - Dark theme UI optimized for extended use Technical Stack: - Backend: FastAPI with SQLite database - Frontend: Vanilla HTML/CSS/JavaScript (no external dependencies) - ML Framework: MLX for Apple Silicon optimization - Models: Support for Mistral 7B and Llama 3 8B via MLX Components: - data_manager.py: SQLite operations for mail storage and labeling - mail_parser.py: Parser for multiple mail formats with cleaning - training.py: MLX training wrapper with LoRA support - inference.py: Model loading and inference for evaluation - main.py: FastAPI backend with REST API and SSE - Frontend: Complete UI with all features Documentation: - Comprehensive README with installation and usage guide - Quick-start guide for rapid setup - Example mails for testing - Troubleshooting and best practices Ready for local deployment and fine-tuning workflows.
265 lines
8.6 KiB
Python
265 lines
8.6 KiB
Python
"""
|
|
Mail Parser für verschiedene Formate
|
|
Bereinigt und normalisiert Mail-Inhalte
|
|
"""
|
|
|
|
import email
|
|
import mailbox
|
|
import re
|
|
from bs4 import BeautifulSoup
|
|
from typing import List, Dict, Optional
|
|
from pathlib import Path
|
|
import chardet
|
|
|
|
|
|
class MailParser:
    """Parse mail files (.eml, .mbox, .txt) and clean up their bodies.

    All methods are static; the class is used purely as a namespace. Every
    parser yields dicts with the keys ``subject``, ``sender``, ``recipient``,
    ``date``, ``body`` and ``original_format``.
    """

    # Common footer/disclaimer patterns. ``remove_footers`` drops everything
    # from the first match of any of these patterns to the end of the text.
    FOOTER_PATTERNS = [
        r'(?i)^--\s*$.*',  # Standard signature delimiter
        r'(?i)Diese E-Mail.*vertraulich.*',
        r'(?i)This email.*confidential.*',
        r'(?i)Disclaimer:.*',
        r'(?i)Get Outlook for.*',
        r'(?i)Sent from my iPhone.*',
        r'(?i)Von meinem.*gesendet.*',
        r'(?i)Diese Nachricht.*Virenfrei.*',
    ]

    @staticmethod
    def detect_encoding(file_path: Path) -> str:
        """Guess the text encoding of a file with chardet.

        Args:
            file_path: File to inspect; it is read completely in binary mode.

        Returns:
            The encoding name reported by chardet, or ``'utf-8'`` when
            chardet reports no encoding.
        """
        with open(file_path, 'rb') as f:
            raw_data = f.read()
        result = chardet.detect(raw_data)
        return result['encoding'] or 'utf-8'

    @staticmethod
    def html_to_text(html: str) -> str:
        """Convert an HTML document to plain text.

        ``<script>`` and ``<style>`` elements are removed before the text is
        extracted. The result is a single line: every non-empty chunk is
        joined with one space, so line breaks are not preserved.

        Args:
            html: HTML markup.

        Returns:
            The visible text with whitespace collapsed.
        """
        soup = BeautifulSoup(html, 'html.parser')

        # Script and style contents are never visible text.
        for tag in soup(['script', 'style']):
            tag.decompose()

        text = soup.get_text()

        # Strip every line, split it on spaces and keep the non-empty chunks.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        return ' '.join(chunk for chunk in chunks if chunk)

    @staticmethod
    def remove_multiple_newlines(text: str) -> str:
        """Collapse runs of three or more newlines into exactly two."""
        return re.sub(r'\n{3,}', '\n\n', text)

    @staticmethod
    def remove_footers(text: str) -> str:
        """Cut the text at the first occurrence of each footer pattern.

        The patterns in ``FOOTER_PATTERNS`` are applied one after another;
        whenever one matches, everything from the start of the match onwards
        is discarded and the remainder is stripped.

        Args:
            text: Mail body.

        Returns:
            The body without trailing signature/disclaimer text.
        """
        for pattern in MailParser.FOOTER_PATTERNS:
            # MULTILINE lets ``^``/``$`` anchor on single lines; DOTALL lets
            # ``.`` span newlines, so a pattern may match across lines.
            match = re.search(pattern, text, re.MULTILINE | re.DOTALL)
            if match:
                text = text[:match.start()].strip()
        return text

    @staticmethod
    def clean_quoted_text(text: str) -> str:
        """Drop quoted lines, i.e. lines whose first non-blank char is > or |."""
        kept = [
            line for line in text.split('\n')
            if not line.lstrip().startswith(('>', '|'))
        ]
        return '\n'.join(kept)

    @staticmethod
    def normalize_whitespace(text: str) -> str:
        """Normalize spaces and blank lines.

        Removes trailing whitespace on every line, collapses runs of two or
        more spaces into one, reduces three or more consecutive newlines to
        two and strips the result.
        """
        text = '\n'.join(line.rstrip() for line in text.split('\n'))
        text = re.sub(r' {2,}', ' ', text)
        text = MailParser.remove_multiple_newlines(text)
        return text.strip()

    @staticmethod
    def clean_text(text: str, is_html: bool = False) -> str:
        """Run the complete cleaning pipeline on a mail body.

        Args:
            text: Raw body (plain text or HTML markup).
            is_html: When true, ``text`` is converted to plain text first.

        Returns:
            The body without quoted lines, footers and superfluous whitespace.
        """
        if is_html:
            text = MailParser.html_to_text(text)

        # Quoted lines must go first: a footer phrase inside a quote (e.g.
        # "> Sent from my iPhone") would otherwise truncate a reply that was
        # written below the quoted block.
        text = MailParser.clean_quoted_text(text)
        text = MailParser.remove_footers(text)
        return MailParser.normalize_whitespace(text)

    @staticmethod
    def _decode_header_value(value, default: str) -> str:
        """Return a header value as ``str`` with RFC 2047 words decoded.

        ``value`` is what ``Message.get`` returned (``str``, ``Header`` or
        ``None``); ``default`` is returned when the header is missing.
        """
        from email.errors import MessageError
        from email.header import decode_header, make_header

        if value is None:
            return default
        try:
            return str(make_header(decode_header(value)))
        except (LookupError, UnicodeError, MessageError):
            # Unknown charset or malformed encoded word: keep the raw value.
            return str(value)

    @staticmethod
    def _decode_part(part: 'email.message.Message') -> str:
        """Decode the payload of one non-multipart part to text.

        The transfer encoding is undone by ``get_payload(decode=True)``; the
        bytes are then decoded with the charset declared on the part, falling
        back to UTF-8. Undecodable bytes are ignored.
        """
        payload = part.get_payload(decode=True)
        if not payload:
            return ""
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='ignore')
        except LookupError:
            # The declared charset is not known to Python.
            return payload.decode('utf-8', errors='ignore')

    @staticmethod
    def _extract_body(msg: 'email.message.Message') -> tuple:
        """Pick the body of a message and report whether it is HTML.

        For multipart messages the first non-empty ``text/plain`` part wins;
        the first non-empty ``text/html`` part is used only when no plain
        part exists. Parts marked as attachments are skipped.

        Returns:
            A ``(body, is_html)`` tuple.
        """
        if not msg.is_multipart():
            return MailParser._decode_part(msg), msg.get_content_type() == 'text/html'

        html_body = ""
        for part in msg.walk():
            if part.get_content_disposition() == 'attachment':
                continue
            content_type = part.get_content_type()
            if content_type == 'text/plain':
                plain_body = MailParser._decode_part(part)
                if plain_body:
                    return plain_body, False
            elif content_type == 'text/html' and not html_body:
                html_body = MailParser._decode_part(part)
        return html_body, bool(html_body)

    @staticmethod
    def _message_to_dict(msg: 'email.message.Message', original_format: str) -> Dict:
        """Build the result dict shared by ``parse_eml`` and ``parse_mbox``."""
        body, is_html = MailParser._extract_body(msg)
        decode = MailParser._decode_header_value
        return {
            'subject': decode(msg.get('Subject'), 'No Subject'),
            'sender': decode(msg.get('From'), 'Unknown'),
            'recipient': decode(msg.get('To'), 'Unknown'),
            'date': decode(msg.get('Date'), ''),
            'body': MailParser.clean_text(body, is_html),
            'original_format': original_format,
        }

    @staticmethod
    def _read_message_bytes(file_path: Path) -> bytes:
        """Read a mail file as bytes that the email byte parser can handle.

        A UTF-8 byte order mark is removed; files starting with a UTF-16 or
        UTF-32 byte order mark are transcoded to UTF-8.
        """
        import codecs

        with open(file_path, 'rb') as f:
            raw = f.read()

        if raw.startswith(codecs.BOM_UTF8):
            return raw[len(codecs.BOM_UTF8):]
        # UTF-32 must be tested first: its little-endian BOM starts with the
        # UTF-16 little-endian BOM.
        if raw.startswith((codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE)):
            return raw.decode('utf-32', errors='ignore').encode('utf-8')
        if raw.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            return raw.decode('utf-16', errors='ignore').encode('utf-8')
        return raw

    @staticmethod
    def parse_eml(file_path: Path) -> Dict:
        """Parse a single ``.eml`` file.

        The file is parsed as bytes so that every part can be decoded with
        the charset it declares itself.

        Args:
            file_path: Path of the ``.eml`` file.

        Returns:
            Mail dict with ``original_format`` set to ``'eml'``.
        """
        msg = email.message_from_bytes(MailParser._read_message_bytes(file_path))
        return MailParser._message_to_dict(msg, 'eml')

    @staticmethod
    def parse_mbox(file_path: Path) -> List[Dict]:
        """Parse every message of an ``.mbox`` file.

        Args:
            file_path: Path of the mbox file. It must exist; it is opened
                with ``create=False`` so a missing file is never created.

        Returns:
            One mail dict per message, ``original_format`` set to ``'mbox'``.

        Raises:
            Exception: Any failure while opening or parsing, wrapped with the
                message ``"Error parsing mbox: ..."`` and chained to its cause.
        """
        try:
            mbox = mailbox.mbox(str(file_path), create=False)
            try:
                return [MailParser._message_to_dict(message, 'mbox') for message in mbox]
            finally:
                mbox.close()
        except Exception as e:
            raise Exception(f"Error parsing mbox: {str(e)}") from e

    @staticmethod
    def _split_txt_headers(content: str) -> tuple:
        """Split a plain-text mail into recognised header fields and body.

        Only the first 10 lines are searched for ``Subject:``, ``From:``,
        ``To:`` and ``Date:`` (case-insensitive). Once a header has been
        found, the first blank line ends the header block so that body lines
        which merely look like headers stay in the body.

        Returns:
            A ``(fields, body)`` tuple; ``fields`` has the keys ``subject``,
            ``sender``, ``recipient`` and ``date``.
        """
        fields = {
            'subject': 'No Subject',
            'sender': 'Unknown',
            'recipient': 'Unknown',
            'date': '',
        }
        prefixes = (
            ('subject:', 'subject'),
            ('from:', 'sender'),
            ('to:', 'recipient'),
            ('date:', 'date'),
        )

        lines = content.split('\n')
        body_start = 0
        for index, line in enumerate(lines[:10]):
            if body_start and not line.strip():
                break
            lowered = line.lower()
            for prefix, key in prefixes:
                if lowered.startswith(prefix):
                    fields[key] = line[len(prefix):].strip()
                    # Everything up to and including this line is header.
                    body_start = index + 1
                    break

        return fields, '\n'.join(lines[body_start:])

    @staticmethod
    def parse_txt(file_path: Path) -> Dict:
        """Parse a ``.txt`` file containing a simple mail as text.

        Args:
            file_path: Path of the text file; its encoding is guessed with
                ``detect_encoding`` and undecodable bytes are ignored.

        Returns:
            Mail dict with ``original_format`` set to ``'txt'``.
        """
        encoding = MailParser.detect_encoding(file_path)
        with open(file_path, 'r', encoding=encoding, errors='ignore') as f:
            content = f.read()

        fields, body = MailParser._split_txt_headers(content)
        return {
            'subject': fields['subject'],
            'sender': fields['sender'],
            'recipient': fields['recipient'],
            'date': fields['date'],
            'body': MailParser.clean_text(body),
            'original_format': 'txt',
        }

    @staticmethod
    def parse_file(file_path: Path) -> List[Dict]:
        """Parse a mail file, choosing the parser by file extension.

        Args:
            file_path: Path ending in ``.eml``, ``.mbox`` or ``.txt``
                (case-insensitive).

        Returns:
            A list of mail dicts (a single element for ``.eml``/``.txt``).

        Raises:
            ValueError: If the extension is not supported.
        """
        suffix = file_path.suffix.lower()

        if suffix == '.eml':
            return [MailParser.parse_eml(file_path)]
        if suffix == '.mbox':
            return MailParser.parse_mbox(file_path)
        if suffix == '.txt':
            return [MailParser.parse_txt(file_path)]
        raise ValueError(f"Unsupported file format: {suffix}")
|