diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index e90ef19..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore index 382d186..8b0731e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,27 +1,108 @@ -# IDE & Cache +# ───────────────────────────── +# IDEs & System Files +# ───────────────────────────── .idea/ +.vscode/ __pycache__/ *.pyc .DS_Store +*.log -# Whisper Modelle & Cache +# ───────────────────────────── +# Cache / Modelle / Checkpoints +# ───────────────────────────── whisper-cache/ models/ *.pt +*.onnx +*.bin +*.safetensors -# Output/Temp Files +# ───────────────────────────── +# Datenbank / temporäre Dateien +# ───────────────────────────── +*.db +*.sqlite +logs/ +temp/ +tmp/ +*.tmp + +# ───────────────────────────── +# Transkripte / KI-Zwischenausgaben +# ───────────────────────────── +/data/transkripte/ +/transcripts/ +/outputs/ +/results/ +*_segments.json +*_timed.txt +*_suspect_lines.txt + +# ───────────────────────────── +# Video / Audio Outputs +# ───────────────────────────── *.mp4 *.mov -*.db +*.mkv *.wav -*.json -temp.* -logs/ +*.webm +*.mp3 -# Eingebettete Repos +# ───────────────────────────── +# Generierte Teil-/Ergebnis-Ordner +# ───────────────────────────── +/raw_clips/ +/face_combined/ +/face_crop_centers/ +/cropped/ +/subtitled/ +/segments/ +/highlight_clips/ +/output/ +/renders/ +/exports/ + +# ───────────────────────────── +# Eingebettete Repos oder externe Module +# ───────────────────────────── +/whisper.cpp/ +/text-clustering/ +/venv/ +/.env/ +/.env.local +.envrc +.env.* + +# ───────────────────────────── +# Backups / Sonstiges +# ───────────────────────────── +*.bak +*.old +*.orig +*.swp +*.zip +*.tar +*.gz + +# IDE/System +.idea/ +.DS_Store +__pycache__/ +*.pyc + +# Secrets/Umgebung +.env +config.py + +# Große/ausgeleitete Daten +data/ +transkripte/ +whisper-cache/ +models/ +*.db +*.mp4 *.mov *.mkv *.wav *.mp3 *.webm +logs/ tmp/ temp/ +# embedded / external text-clustering/ whisper.cpp/ - -# Video-Rohmaterial -*.mov - diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/BachlorArbeit.iml b/.idea/BachlorArbeit.iml deleted file mode 100644 index 106b3db..0000000 --- a/.idea/BachlorArbeit.iml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml deleted file mode 100644 index 34586b5..0000000 --- a/.idea/dataSources.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - sqlite.xerial - true - org.sqlite.JDBC - jdbc:sqlite:$PROJECT_DIR$/segments.db - $ProjectFileDir$ - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar - - - - - sqlite.xerial - true - org.sqlite.JDBC - jdbc:sqlite:$PROJECT_DIR$/clips_openai.db - $ProjectFileDir$ - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar - - - - - \ No newline at end of file diff --git 
a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 1733c19..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 5be715f..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 6bdb7e2..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/README.md b/README.md index e69de29..fa355d2 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,250 @@ +# Bachelorarbeit – Pipeline: Automatisierte Highlight-Erkennung & 9:16-Aufbereitung + +Diese Repository enthält eine vollständige, skriptbasierte Pipeline, um aus Langvideos automatisch Social‑Media‑taugliche 9:16‑Highlights zu erzeugen – inkl. Transkription, KI‑gestützter Clip‑Selektion, Gesichts‑/Mundaktivitätsanalyse, Auto‑Cropping, Untertitel (Word‑Caps) und finalem Export. + +## Inhaltsverzeichnis +- [Features](#features) +- [Ordnerstruktur](#ordnerstruktur) +- [Voraussetzungen](#voraussetzungen) +- [Installation](#installation) +- [Schnellstart (empfohlener Workflow)](#schnellstart-empfohlener-workflow) +- [Skripte & CLI](#skripte--cli) +- [Tipps & Troubleshooting](#tipps--troubleshooting) +- [Reproduzierbarkeit](#reproduzierbarkeit) +- [Lizenz / Danksagung](#lizenz--danksagung) + +--- + +## Features +- **Transkription mit Wort‑Zeitstempeln (Whisper, chunked ohne Grenz‑Doppler)** +- **LLM‑gestützte Clip‑Selektion** (Viralität/Emotionalität etc. in SQLite gespeichert) +- **Face‑Detection (YOLOv8‑face) & Mundaktivität (MediaPipe)** +- **Stabiles 9:16‑Auto‑Cropping** (Median + EMA, Deadband, Szenenschnitt‑Erkennung, Switch‑Cooldown) +- **Word‑Caps Untertitel** (ASS generiert, per ffmpeg eingebrannt) +- **Batch‑Export der Highlights** (MoviePy, Längen‑/Grenz‑Checks) + +## Ordnerstruktur +Die Pfade werden zentral in `config.py` definiert: +``` +PROJECT_ROOT/ +├─ data/ +│ ├─ input/ # Eingabevideo(s) +│ ├─ transkripte/ # Whisper-Outputs (*_segments.json, *_timed.txt ...) +│ ├─ segments/ # LLM-Clip-Auswahl, DB etc. +│ ├─ output/ +│ │ └─ raw_clips/ # Roh-Highlight-Clips (aus cutClips.py) +│ ├─ face_data_combined/ # faces.json je Clip (YOLO + MediaPipe) +│ └─ face_crop_centers/ # (optional) Center-Listen +├─ output/ +│ ├─ output_9x16_final/ # Auto-cropped 9:16 Videos +│ ├─ output_9x16_final_subbed_word/ # 9:16 mit eingebrannten Word-Caps +│ └─ debug/ # Debug-Previews/Artefakte +├─ models/ # YOLO-Weights (z. B. yolov8n-face.pt) +├─ whisper-cache/ # Whisper Modell-Cache +└─ src/... (optional projektspezifisch) +``` +> Beim ersten Start legt `config.py` fehlende Verzeichnisse automatisch an. + +## Voraussetzungen +**System‑Tools** +- `ffmpeg` (inkl. 
`ffprobe`) im `PATH` + +**Python** +- Python 3.10+ empfohlen +- Pakete (Beispiel): + `openai-whisper`, `torch`, `ffmpeg-python`, `ultralytics`, `opencv-python`, `mediapipe`, `moviepy`, `tqdm`, `numpy`, `regex` +- Optional/abhängig vom Codepfad: `pydub`, `scikit-image` (falls in Erweiterungen verwendet) + +**Modelle & Keys** +- **Whisper**: lädt Modelle automatisch in `whisper-cache/` (steuerbar via `WHISPER_MODEL`) +- **YOLOv8‑face**: `models/yolov8n-face.pt` (oder größeres Modell) +- **OpenAI API Key** (für `segment_transcript.py` & `rateCluster.py`): `export OPENAI_API_KEY=...` + - Default‑Modell ggf. per `export OPENAI_MODEL=gpt-4o` setzen + +## Installation +```bash +# 1) Python-Umgebung +python3 -m venv .venv +source .venv/bin/activate + +# 2) Systemabhängigkeiten +# ffmpeg installieren (Mac: brew install ffmpeg, Ubuntu: apt install ffmpeg) + +# 3) Python-Pakete (Beispiel) +pip install --upgrade pip +pip install openai-whisper torch ffmpeg-python ultralytics opencv-python mediapipe moviepy tqdm numpy regex + +# 4) Modelle/Dateien +# YOLO-Weights: +# Download yolov8n-face.pt → ./models/yolov8n-face.pt +# API Key für LLM: +export OPENAI_API_KEY="sk-..." +export OPENAI_MODEL="gpt-4o" +``` + +## Schnellstart (empfohlener Workflow) +1) **Eingabe ablegen** + Lege dein Langvideo in `data/input/` (z. B. `meinvideo.mp4`). + +2) **Transkription (Whisper, chunked & doppler-sicher)** +```bash +python transcription.py --input data/input/meinvideo.mp4 --model small --lang de +``` + → erzeugt `*_segments.json` + `*_timed.txt` in `data/transkripte/`. + +3) **Clips mit LLM selektieren & in DB speichern** +```bash +export OPENAI_API_KEY="..."; export OPENAI_MODEL="gpt-4o" +python segment_transcript.py --base meinvideo --block 60 --min 6.0 --max 30.0 +``` + → schreibt Clips in SQLite (`data/clips_openai.db` o. ä.) + +4) **Highlights aus dem Originalvideo schneiden** +```bash +python cutClips.py --file meinvideo.mp4 --limit 10 --order score +``` + → exportiert `highlight_*.mp4` nach `data/output/raw_clips/` + +5) **Face‑Detection + Mundaktivität** +```bash +python main_detect_faces.py --model models/yolov8n-face.pt --input-dir data/output/raw_clips --output-dir data/face_data_combined --frame-skip 1 --downscale 0.5 +``` + +6) **Targets je Frame bauen (Zentren/Größe glätten)** +```bash +python make_segments.py --pattern "highlight_*.mp4" --fps 0 --smooth 7 --overwrite +``` + +7) **9:16 Auto‑Crop anwenden** +```bash +python main_apply_crop.py --pattern "highlight_*.mp4" --median 7 --ema 0.5 --deadband 16 --cut_detect --mux_audio --overwrite +``` + → fertige 9:16‑Clips in `output/output_9x16_final/` + +8) **Word‑Caps Untertitel einbrennen (optional)** +```bash +python add_subtitles.py --clips_dir output/output_9x16_final --out_dir output/output_9x16_final_subbed_word --model small --limit 20 +``` + → fertige Videos mit eingebrannten Word‑Caps in `output/output_9x16_final_subbed_word/` + +> 💡 Du kannst viele Parameter (Fensterbreiten, Deadband, Erkennungsschwellen, Limits) über die CLI anpassen. + +## Skripte & CLI +### `transcription.py` +Chunked‑Transkription mit Wortzeitstempeln. +``` +--input PATH # Eingabevideo/-audio (Default: erstes File in data/input/) +--outdir PATH # Ausgabeverzeichnis (Default: data/transkripte/) +--model NAME # Whisper-Modell (tiny/base/small/medium/large; env: WHISPER_MODEL) +--lang CODE # Sprachcode (z. B. 
de) oder leer/None für Auto-Detect +--chunk FLOAT # Chunk-Länge in s (Default 60) +--overlap FLOAT # Überlappung in s (Default 2.0) +--min-dur FLOAT # Mindest-Segmentdauer (s) +--max-gap FLOAT # Max. Zeit-Gap beim Mergen (s) +--fp16 # Nur sinnvoll mit GPU +``` + +### `segment_transcript.py` +LLM‑Selektion & Speichern in SQLite. +``` +--base STR # Basename der Transkriptdateien (z. B. 'meinvideo') +--block FLOAT # Blocklänge s für den Prompt +--min FLOAT # minimale Clip-Länge s +--max FLOAT # maximale Clip-Länge s +# env: OPENAI_API_KEY, OPENAI_MODEL (z. B. gpt-4o) +``` + +### `cutClips.py` +Schneidet ausgewählte Highlights als Einzelclips. +``` +--file NAME # Name der Input-Datei in data/input (Default: erstes Video) +--limit INT # Anzahl zu exportierender Clips (Default 10) +--order {score,start} # Sortierung: Score (absteigend) oder Startzeit +``` + +### `main_detect_faces.py` +YOLOv8‑face + MediaPipe → `faces.json` pro Clip. +``` +--input-dir PATH # Default: data/output/raw_clips +--output-dir PATH # Default: data/face_data_combined +--model PATH # YOLOv8-face Weights (Default: models/yolov8n-face.pt) +--conf-thresh FLOAT # Default 0.35 +--frame-skip INT # z. B. 1 = jeden Frame, 2 = jeden von zwei ... +--downscale FLOAT # Frame-Downscale vor YOLO (0..1, z. B. 0.5) +--expansion FLOAT # Margin Pass 1 (relativ) +--expansion2 FLOAT # Margin Pass 2 (relativ) +--min-crop INT # minimale Croplänge (px) +--faces-upscale INT # min. Kantenlänge für FaceMesh (kleine Crops hochskalieren) +--imgsz INT # YOLO input size (Default 448) +--max-det INT # Max Detects / Frame +--use-refine # MediaPipe refine_landmarks aktivieren +``` + +### `make_segments.py` +Erzeugt `*_target_by_frame.json` (Zentren+Side pro Frame) aus Face/Center‑Daten. +``` +--pattern STR # Dateimuster in raw_clips (Default: highlight_*.mp4) +--fps FLOAT # FPS erzwingen (0 = aus Video lesen) +--smooth INT # MA-Fensterbreite (ungerade) +--overwrite # bestehende target_by_frame.json überschreiben +``` + +### `main_apply_crop.py` +Wendet 9:16‑Crop mit Glättung/Szenenschnitt an. +``` +--pattern STR # Dateimuster in raw_clips (Default: *.mp4) +--out_w INT # Output-Breite (Default 1080) +--out_h INT # Output-Höhe (Default 1920) +--zoom_pad FLOAT # Zoom-Pad (0..1) +--median INT # Median-Fenster (>=3, ungerade) +--ema FLOAT # EMA-Alpha (0..1) +--deadband FLOAT # Totband in Pixel +--switch_cd INT # Cooldown-Frames nach Trackwechsel +--cut_detect # Szenenschnitt-Erkennung aktivieren +--cut_corr FLOAT # Schwellwert Korrelation (0..1) +--cut_cd INT # Cooldown-Frames nach Cut +--mux_audio # Original-Audio unterlegen +--debug # Debug-Overlay anzeigen +--debug_scale FLOAT # Debug-Preview skaliert rendern +--overwrite # vorhandene Ausgaben überschreiben +``` + +### `add_subtitles.py` +Generiert Word‑Caps mit Whisper & brennt sie ein. +``` +--clips_dir PATH # Quelle (Default: output/output_9x16_final) +--out_dir PATH # Ziel (Default: output/output_9x16_final_subbed_word) +--pattern STR # z. B. *.mp4 +--limit INT # Nur die ersten N Clips +--model NAME # Whisper-Modell (tiny/base/small/medium/large) +--lang CODE # Sprachcode oder Auto +``` + +### `rateCluster.py` (optional) +Lässt LLM Scores (Viralität, Emotion, Humor, Provokation) nachtragen. +> Modelliere Standard‑Modell via `OPENAI_MODEL` (z. B. `gpt-4o`). + +--- + +## Tipps & Troubleshooting +- **Modelle/Performance** + - CPU‑only ist möglich (Whisper/YOLO langsamer). Auf Apple Silicon wird automatisch **MPS** genutzt; auf NVIDIA **CUDA**. 
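    Zur Einordnung eine minimale Skizze der Geräteauswahl (setzt nur ein installiertes `torch` voraus und entspricht sinngemäß der Logik in `main_detect_faces.py`):

    ```python
    import torch

    def pick_device() -> str:
        # Bevorzugt Apple-GPU (MPS), dann NVIDIA (CUDA), sonst CPU.
        if torch.backends.mps.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"
    ```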
+ - `--frame-skip` und `--downscale` in `main_detect_faces.py` beschleunigen die Face‑Detection deutlich. +- **ffmpeg‑Muxing prüfen** (`main_apply_crop.py --mux_audio`): Falls Ton fehlt, `ffmpeg`-Installation checken. Rückgabecode im Log prüfen. +- **Fehlende Dateien** + - Kein Input? → `data/input/` prüfen. + - Fehlende Transkript‑Paare? → `*_timed.txt` und `*_segments.json` müssen existieren (aus `transcription.py`). + - Fehlende Faces? → Pfad zu `models/yolov8n-face.pt` korrekt? +- **Datenbank** + - Highlights liegen in SQLite (siehe `config.py`: `DB_PATH`). Bei Wiederholungen kann ein `DELETE FROM highlights; VACUUM;` sinnvoll sein. +- **Cache/Verzeichnisse** + - Whisper‑Cache via `XDG_CACHE_HOME` → `whisper-cache/` neben dem Projekt. Speicherplatz beachten. + +## Reproduzierbarkeit +- Lege eine `requirements.txt` mit exakten Versionen an (Freeze deiner funktionierenden Umgebung). +- Dokumentiere verwendete **Modell‑Versionsstände** (YOLO Weights, Whisper‑Modellgröße, OPENAI_MODEL). +- Fixiere Random‑Seeds, falls nötig (hier meist deterministisch durch externe Modelle/Bibliotheken). + +## Lizenz / Danksagung +- Verwendung von **OpenAI Whisper**, **Ultralytics YOLOv8**, **MediaPipe**, **OpenCV**, **MoviePy**, **ffmpeg**. +- Die jeweiligen Lizenzen der Bibliotheken beachten. diff --git a/code/text/cutClips.py b/code/text/cutClips.py deleted file mode 100644 index a58331e..0000000 --- a/code/text/cutClips.py +++ /dev/null @@ -1,38 +0,0 @@ -from moviepy.video.io.VideoFileClip import VideoFileClip -from pathlib import Path -import sqlite3 - -# === Setup === -input_video = Path("input/testVideoShort.mov") -output_dir = Path("output") -output_dir.mkdir(parents=True, exist_ok=True) - -# === SQLite DB lesen === -db_path = "clips_openai.db" -conn = sqlite3.connect(db_path) -cursor = conn.cursor() - -# Nur die Top 10 Clips mit höchstem score_total -cursor.execute(""" - SELECT start, end, text - FROM highlights - ORDER BY score_total DESC - LIMIT 10 -""") -highlights = cursor.fetchall() - -# === Video laden === -video = VideoFileClip(str(input_video)) - -# === Clips schneiden === -for i, (start, end, text) in enumerate(highlights): - output_file = output_dir / f"highlight_{i+1}.mp4" - end = min(end, video.duration) # Sicherstellen, dass das Ende nicht über das Video hinausgeht - print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}") - clip = video.subclipped(start, end) - clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac") - -# === Cleanup === -conn.close() -video.close() -print("✅ Top 10 Clips exportiert.") diff --git a/code/text/segment_transcript.py b/code/text/segment_transcript.py deleted file mode 100644 index d8eba8b..0000000 --- a/code/text/segment_transcript.py +++ /dev/null @@ -1,196 +0,0 @@ -import json -import sqlite3 -import re -from pathlib import Path -from openai import OpenAI -from datetime import datetime -import time -import nltk - -nltk.download("punkt") - -# === SETTINGS === -TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt") -DB_PATH = Path("clips_openai.db") -LOG_DIR = Path("logs") -LOG_DIR.mkdir(exist_ok=True) -BLOCK_DURATION = 300 -MIN_CLIP_LEN = 5 -MAX_CLIP_LEN = 90 - -client = OpenAI(api_key="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") - -# === HILFSFUNKTIONEN === -def log_text(filename, content): - (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8") - -def 
append_error_log(content): - with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f: - f.write(content + "\n\n") - -def extract_json(text): - match = re.search(r"\[.*\]", text.strip(), re.DOTALL) - if match: - try: - return json.loads(match.group()) - except Exception as e: - append_error_log(f"❌ JSON-Fehler: {e}\n{text}") - return [] - -def get_original_text(clip, segments, debug=False): - texts = [] - used_segments = [] - for s in segments: - # Überschneidung: Segment und Clip teilen sich Zeit - if not (s["end"] < clip["start"] or s["start"] > clip["end"]): - texts.append(s["text"]) - used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}") - if debug: - print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" + - "\n".join(used_segments)) - return " ".join(texts).strip() - -# === TRANSKRIPT EINLESEN === -lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines() -segments = [] -for line in lines: - match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line) - if match: - start, end, text = match.groups() - start = float(start) - end = float(end) - if end - start >= 2.0: - segments.append({"start": start, "end": end, "text": text.strip()}) - -if not segments: - raise RuntimeError("🚫 Keine gültigen Segmente gefunden.") -print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.") - -# === BLÖCKE BILDEN -blocks = [] -current_block = [] -current_start = 0.0 -for seg in segments: - if seg["end"] - current_start > BLOCK_DURATION: - blocks.append(current_block) - current_block = [] - current_start = seg["start"] - current_block.append(seg) -if current_block: - blocks.append(current_block) -print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).") - -# === KI: CLIP-AUSWAHL -all_clips = [] -start_time = time.perf_counter() - -for i, block in enumerate(blocks): - if not block: - continue - - print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...") - - block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block]) - prompt = f""" -Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen. -Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann. - -Ein guter Clip: -- ist abgeschlossen und verständlich -- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment -- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline -- ist **mindestens 30 Sekunden lang** - -Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden. 
- -Gib ein valides JSON-Array zurück im Format: -[ - {{ - "start": float, - "end": float, - "summary": "Kurze Beschreibung des Inhalts" - }} -] - -TRANSKRIPT: -{block_text} -""" - log_text(f"block_prompt_{i+1}.txt", prompt) - - try: - response = client.chat.completions.create( - model="gpt-4o", - messages=[{"role": "user", "content": prompt}], - temperature=0.4 - ) - raw = response.choices[0].message.content - log_text(f"block_output_{i+1}.txt", raw) - clips = extract_json(raw) - - print(f"✅ {len(clips)} Clips empfangen in Block {i+1}") - - for clip in clips: - try: - dur = float(clip["end"]) - float(clip["start"]) - if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN: - clip["duration"] = round(dur, 2) - all_clips.append(clip) - except Exception as e: - append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}") - - print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}") - - # ETA berechnen - elapsed = time.perf_counter() - start_time - avg_time = elapsed / (i + 1) - eta = avg_time * (len(blocks) - (i + 1)) - print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden") - - except Exception as e: - append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}") - print(f"❌ Fehler bei Block {i+1}: {e}") - -# === DB SPEICHERN -conn = sqlite3.connect(DB_PATH) -cur = conn.cursor() -cur.execute("DROP TABLE IF EXISTS segments") -cur.execute(""" -CREATE TABLE segments ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file TEXT, - start REAL, - end REAL, - duration REAL, - text TEXT, - summary TEXT -) -""") - -inserted = 0 -failed = 0 -for clip in all_clips: - try: - start = float(clip["start"]) - end = float(clip["end"]) - duration = float(clip["duration"]) - summary = clip.get("summary", "") - # debug=True für print aller Segment-Texte pro Clip - original_text = get_original_text(clip, segments, debug=False) - if end <= start or start < 0: - raise ValueError("Ungültige Zeiten") - cur.execute( - "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)", - (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip()) - ) - inserted += 1 - except Exception as e: - failed += 1 - append_error_log(f"❌ DB-Fehler: {clip}\n{e}") - -conn.commit() -conn.close() - -print("\n📊 Ergebnisse:") -print(f" ✅ Clips gespeichert: {inserted}") -print(f" ❌ Fehlerhafte Clips: {failed}") -print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}") diff --git a/code/text/transcription.py b/code/text/transcription.py deleted file mode 100644 index 82ee81d..0000000 --- a/code/text/transcription.py +++ /dev/null @@ -1,108 +0,0 @@ -# transcription_chunked.py -import whisper -from pathlib import Path -import os -import json -import ffmpeg -import tempfile - -# === Einstellungen === -input_file = Path("input/testVideoShort.mov") -output_dir = Path("transkripte") -output_dir.mkdir(parents=True, exist_ok=True) - -output_txt = output_dir / f"{input_file.stem}_timed.txt" -output_json = output_dir / f"{input_file.stem}_segments.json" -suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt" - -CHUNKS = 4 # Anzahl Chunks (anpassen!) 
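# Hinweis zur Chunk-Logik: Jeder Chunk deckt rund duration/CHUNKS Sekunden ab und wird an den Rändern um
# OVERLAP Sekunden erweitert (der Start erst ab dem zweiten Chunk). Nach der Transkription werden die
# Segmentzeiten um den jeweiligen chunk_start verschoben; Duplikate aus den Überlappungsbereichen werden
# später über den Hash (gerundeter Start, gerundetes Ende, Text) entfernt.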
-OVERLAP = 2.0 # Sekunden Überlappung - -os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache") - -probe = ffmpeg.probe(str(input_file)) -duration = float(probe["format"]["duration"]) -print(f"🎥 Videolänge: {duration:.2f} Sekunden") - -def extract_audio_chunk(start_time, duration, output_path): - ffmpeg.input(str(input_file), ss=start_time, t=duration).output( - str(output_path), - format="wav", - acodec="pcm_s16le", - ac=1, - ar="16000", - loglevel="error" - ).overwrite_output().run() - -def is_suspect(text): - words = text.strip().lower().split() - if not words: - return True - most_common = max([words.count(w) for w in set(words)]) - return most_common / len(words) > 0.6 or most_common > 20 - -tmp_dir = Path(tempfile.mkdtemp()) -all_segments = [] - -print(f"✂️ Teile Audio in {CHUNKS} Chunks ...") -for i in range(CHUNKS): - chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0) - chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP) - chunk_dur = chunk_end - chunk_start - chunk_file = tmp_dir / f"chunk_{i}.wav" - print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s") - extract_audio_chunk(chunk_start, chunk_dur, chunk_file) - - print(f"🧠 Transkribiere Chunk {i+1} ...") - model = whisper.load_model("small") # Wechsel zu "medium" oder "large" falls gewünscht - result = model.transcribe( - str(chunk_file), - language="de", - fp16=False, - word_timestamps=False, - condition_on_previous_text=True, - temperature=0, - verbose=False - ) - - segments = result["segments"] - # Zeitversatz für den aktuellen Chunk hinzufügen - offset = chunk_start - for seg in segments: - seg["start"] += offset - seg["end"] += offset - all_segments.extend(segments) - -# === Sortiere und filtere doppelte/überlappende Segmente -all_segments.sort(key=lambda x: x["start"]) - -def segment_hash(seg): - return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower()) - -unique_segments = [] -seen = set() -for seg in all_segments: - h = segment_hash(seg) - if h not in seen: - seen.add(h) - unique_segments.append(seg) - -print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.") - -with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus: - for seg in unique_segments: - start = seg["start"] - end = seg["end"] - text = seg["text"].strip() - line = f"[{start:.2f} – {end:.2f}] {text}\n" - f.write(line) # IMMER ins Haupttranskript! - if is_suspect(text): - f_sus.write(line) - - -print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}") -print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}") - -with open(output_json, "w", encoding="utf-8") as f: - json.dump(unique_segments, f, ensure_ascii=False, indent=2) -print(f"💾 Segmentdaten gespeichert unter: {output_json}") diff --git a/main.py b/main.py new file mode 100644 index 0000000..5bbd77c --- /dev/null +++ b/main.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +"""Run the full Bachelor pipeline end-to-end with timing, errors, and flexible flags. 
+ +Steps: + 1) transcription.py → Whisper transcripts (segments + timed words) + 2) segment_transcript.py → LLM selects highlight candidates → SQLite + 3) cutClips.py → export highlight_*.mp4 (raw clips) + 4) main_detect_faces.py → YOLO + MediaPipe → faces.json per clip + 5) make_segments.py → *_target_by_frame.json (center+side per frame) + 6) main_apply_crop.py → 9:16 crop with smoothing & optional audio mux + 7) rateCluster.py → (optional) LLM scoring (virality, emotion, ...) + 8) add_subtitles.py → (optional) word-cap subtitles burned in + +Usage examples: + python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o + python main.py --no-rate --no-subs +""" + +from __future__ import annotations +import argparse +import os +import sys +import subprocess +import time +from datetime import datetime +from pathlib import Path + +# --- Import project config --- +try: + from config import ( + PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR, + WHISPER_CACHE_DIR + ) +except Exception: + PROJECT_ROOT = Path(__file__).resolve().parent + sys.path.insert(0, str(PROJECT_ROOT)) + from config import ( + PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR, + WHISPER_CACHE_DIR + ) + +LOGS_DIR = PROJECT_ROOT / "logs" +LOGS_DIR.mkdir(parents=True, exist_ok=True) + +# --- korrekte Pfade zu den Skripten --- +SCRIPTS = { + "transcription": str(PROJECT_ROOT / "src" / "text" / "transcription.py"), + "segment_transcript": str(PROJECT_ROOT / "src" / "text" / "segment_transcript.py"), + "cutClips": str(PROJECT_ROOT / "src" / "text" / "cutClips.py"), + "detect_faces": str(PROJECT_ROOT / "src" / "reformat" / "main_detect_faces.py"), + "make_segments": str(PROJECT_ROOT / "src" / "reformat" / "make_segments.py"), + "apply_crop": str(PROJECT_ROOT / "src" / "reformat" / "main_apply_crop.py"), + "rateCluster": str(PROJECT_ROOT / "src" / "text" / "rateCluster.py"), + "add_subtitles": str(PROJECT_ROOT / "src" / "subtitles" / "add_subtitles.py"), +} + +def shlex_join(cmd): + return " ".join(str(c) for c in cmd) + +def run_step(cmd: list[str], name: str, env: dict[str, str] | None = None) -> float: + """Run a subprocess step, raise on error, return duration in seconds.""" + t0 = time.perf_counter() + print(f"\n===== {name} =====") + print(" ", shlex_join(cmd)) + cp = subprocess.run(cmd, env=env) + dt = time.perf_counter() - t0 + if cp.returncode != 0: + print(f"❌ Fehler in {name} (Exit {cp.returncode}) nach {dt:.2f}s") + print(" → Prüfe das Logfile oben für Details und stelle sicher, dass Abhängigkeiten installiert sind:") + print(" - ffmpeg/ffprobe im PATH") + print(" - Python-Pakete: openai-whisper, torch, ffmpeg-python, ultralytics, opencv-python, mediapipe, moviepy, tqdm, numpy") + print(" - OPENAI_API_KEY gesetzt (für LLM-Schritte)") + raise SystemExit(cp.returncode) + print(f"✅ {name} in {dt:.2f}s") + return dt + +def infer_base_from_input(input_path: Path) -> str: + return input_path.stem + +def default_input() -> Path | None: + if not INPUT_DIR.exists(): + return None + for p in sorted(INPUT_DIR.iterdir()): + if p.suffix.lower() in {".mp4", ".mov", ".mkv", ".m4v", ".mp3", ".wav"}: + return p + return None + +def main(): + ap = argparse.ArgumentParser(description="Bachelor Pipeline Runner") + ap.add_argument("--input", type=str, default=None, help="Pfad zu Eingabedatei (Default: erstes File in data/input)") + ap.add_argument("--limit", type=int, default=10, help="Anzahl Highlights (cutClips)") + ap.add_argument("--whisper-model", type=str, 
default=os.getenv("WHISPER_MODEL", "small")) + ap.add_argument("--lang", type=str, default=None, help="Sprachcode (z. B. de)") + ap.add_argument("--openai-model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o")) + ap.add_argument("--pattern", type=str, default="highlight_*.mp4") + ap.add_argument("--overwrite", action="store_true") + ap.add_argument("--no-rate", action="store_true") + ap.add_argument("--no-subs", action="store_true") + ap.add_argument("--no-detect", action="store_true") + ap.add_argument("--no-make", action="store_true") + ap.add_argument("--no-apply", action="store_true") + ap.add_argument("--logfile", type=str, default=None) + args = ap.parse_args() + + os.chdir(PROJECT_ROOT) + + env = os.environ.copy() + env.setdefault("OPENAI_MODEL", args.openai_model) + env.setdefault("XDG_CACHE_HOME", str(WHISPER_CACHE_DIR)) + + if not env.get("OPENAI_API_KEY"): + print("⚠️ OPENAI_API_KEY ist nicht gesetzt – LLM-Schritte könnten fehlschlagen.") + + # Input-Datei bestimmen + if args.input: + input_path = Path(args.input) + if not input_path.is_file(): + candidate = INPUT_DIR / args.input + if candidate.is_file(): + input_path = candidate + else: + raise SystemExit(f"Input nicht gefunden: {args.input}") + else: + picked = default_input() + if not picked: + raise SystemExit(f"Kein Input in {INPUT_DIR} gefunden. Bitte --input setzen.") + input_path = picked + + base = infer_base_from_input(input_path) + print(f"📥 Input: {input_path}") + print(f"🔤 Whisper: {args.whisper_model} | 🌐 LLM: {env.get('OPENAI_MODEL')}") + print(f"🧩 Base: {base}") + + # Logfile + if args.logfile: + log_path = Path(args.logfile) + else: + log_path = LOGS_DIR / f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" + + # Tee: schreibe in Datei UND Konsole + try: + log_fh = open(log_path, "w", encoding="utf-8") + class _Tee: + def __init__(self, *streams): self.streams = streams + def write(self, data): + for s in self.streams: + try: s.write(data); s.flush() + except Exception: pass + def flush(self): + for s in self.streams: + try: s.flush() + except Exception: pass + sys.stdout = _Tee(sys.__stdout__, log_fh) + sys.stderr = _Tee(sys.__stderr__, log_fh) + print(f"📝 Logfile: {log_path}") + except Exception: + print(f"⚠️ Konnte Logfile nicht initialisieren: {log_path}") + + durations = [] + started = datetime.now() + print(f"🚀 Start: {started:%Y-%m-%d %H:%M:%S}") + + try: + # 1) Transcription + t_args = [sys.executable, SCRIPTS["transcription"], "--input", str(input_path), "--model", args.whisper_model] + if args.lang: t_args += ["--lang", args.lang] + durations.append(("Transcription", run_step(t_args, "Transcription", env=env))) + + # 2) LLM Segmentierung + st_args = [sys.executable, SCRIPTS["segment_transcript"], "--base", base] + durations.append(("Segment Transcript", run_step(st_args, "Segment Transcript", env=env))) + + # 3) Highlights schneiden + cut_filename = input_path.name + cc_args = [sys.executable, SCRIPTS["cutClips"], "--file", cut_filename, "--limit", str(args.limit)] + durations.append(("Cut Clips", run_step(cc_args, "Cut Clips", env=env))) + + # 4) Faces + if not args.no_detect: + df_args = [sys.executable, SCRIPTS["detect_faces"]] + durations.append(("Detect Faces", run_step(df_args, "Detect Faces", env=env))) + else: + print("⏭️ Detect Faces übersprungen.") + + # 5) Make Targets + if not args.no_make: + ms_args = [sys.executable, SCRIPTS["make_segments"], "--pattern", args.pattern] + durations.append(("Make Targets", run_step(ms_args, "Make Targets", env=env))) + else: + print("⏭️ Make 
Targets übersprungen.") + + # 6) Crop + if not args.no_apply: + ac_args = [sys.executable, SCRIPTS["apply_crop"], "--pattern", args.pattern, "--mux_audio"] + if args.overwrite: ac_args.append("--overwrite") + durations.append(("Apply Crop", run_step(ac_args, "Apply Crop", env=env))) + else: + print("⏭️ Apply Crop übersprungen.") + + # 7) Bewertung + if not args.no_rate: + rc_args = [sys.executable, SCRIPTS["rateCluster"]] + durations.append(("Rate Clusters", run_step(rc_args, "Rate Clusters", env=env))) + else: + print("⏭️ Rate Clusters übersprungen.") + + # 8) Untertitel + if not args.no_subs: + as_args = [sys.executable, SCRIPTS["add_subtitles"]] + durations.append(("Subtitles", run_step(as_args, "Subtitles", env=env))) + else: + print("⏭️ Subtitles übersprungen.") + + except KeyboardInterrupt: + print("\n⛔ Abgebrochen (Ctrl+C).") + finally: + finished = datetime.now() + total = sum(dt for _, dt in durations) + print("\n======================== ZUSAMMENFASSUNG ============================") + for name, dt in durations: + print(f"✅ {name:<24} {dt:7.2f}s") + print("---------------------------------------------------------------------") + print(f"⏱️ Gesamtdauer: {total:.2f}s") + print(f"🕒 Start : {started:%Y-%m-%d %H:%M:%S}") + print(f"🕒 Ende : {finished:%Y-%m-%d %H:%M:%S}") + print(f"📂 Output:") + print(f" Raw Clips : {RAW_CLIPS_DIR}") + print(f" 9:16 : {CROPPED_DIR}") + print(f" Subbed : {SUBTITLED_DIR}") + print("=====================================================================") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..422cc64 --- /dev/null +++ b/src/main.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Einfaches Master-Skript, das alle Unter-Skripte nacheinander startet – ohne Argumente. +""" +import subprocess +import sys +from pathlib import Path + +# Reihenfolge der auszuführenden Skripte (relativer Pfad) +SCRIPTS = [ + "text/transcription.py", + "text/segment_transcript.py", + "text/rateCluster.py", + "text/cutClips.py", + "reformat/track_faces_Yolo.py", + "reformat/detect_speaking_faces.py", + "reformat/crop_to_speaker.py", +] + + +def run_script(script_path: str): + """ + Führt ein Python-Skript ohne weitere Argumente aus. 
+ """ + print(f"🔄 Running: {script_path}") + full_path = Path(__file__).parent / script_path + try: + subprocess.check_call([sys.executable, str(full_path)]) + print(f"✔️ {script_path} erfolgreich abgeschlossen.\n") + except subprocess.CalledProcessError as e: + print(f"❌ Fehler in {script_path}: Rückgabecode {e.returncode}") + sys.exit(e.returncode) + + +def main(): + print("\n=== Starte komplette Podcast-Pipeline ===\n") + for script in SCRIPTS: + run_script(script) + print("✅ Alle Schritte erfolgreich abgeschlossen.") + + +if __name__ == '__main__': + main() diff --git a/src/reformat/main_apply_crop.py b/src/reformat/main_apply_crop.py new file mode 100644 index 0000000..da71e4f --- /dev/null +++ b/src/reformat/main_apply_crop.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +# src/reformat/new/main_apply_crop.py +from __future__ import annotations +import logging, json, math, subprocess, argparse +from pathlib import Path +from typing import Optional, Tuple, List, Dict, Any +from collections import deque +import sys + +import cv2 +import numpy as np + +# ── Projektwurzel importierbar machen +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) +from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR, CROPPED_DIR + +# ==== Defaults (per CLI überschreibbar) ====================================== +OUT_W_DEFAULT, OUT_H_DEFAULT = 1080, 1920 # 9:16 +DEBUG_SCALE_DEFAULT = 0.6 +MEDIAN_WIN_DEFAULT = 5 +EMA_ALPHA_DEFAULT = 0.22 +DEADBAND_PX_DEFAULT = 8.0 +SWITCH_COOLDOWN_FR_DEFAULT = 12 +ZOOM_PAD_FRAC_DEFAULT = 0.10 + +USE_CUT_DETECT_DEFAULT = True +CUT_CORR_THRESH_DEFAULT = 0.65 +CUT_COOLDOWN_DEFAULT = 6 + +MUX_AUDIO_DEFAULT = True +FFMPEG_BIN = "ffmpeg" +# ============================================================================ + +def clamp(v, lo, hi): return max(lo, min(hi, v)) + +def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int, + out_w: int, out_h: int, zoom_pad_frac: float) -> tuple[int,int,int,int]: + """9:16 (out_w:out_h) Crop um (cx,cy) — ohne Squeeze, mit Zoom-Pad, im Bild gehalten.""" + target_ar = out_w / out_h + src_ar = src_w / src_h + if src_ar >= target_ar: + base_h = src_h + base_w = int(round(base_h * target_ar)) + else: + base_w = src_w + base_h = int(round(base_w / target_ar)) + + desired_scale = 1.0 + zoom_pad_frac + s = min(desired_scale, src_w / base_w, src_h / base_h) + w = int(round(base_w * s)) + h = int(round(base_h * s)) + half_w, half_h = w // 2, h // 2 + + cx = clamp(cx, half_w, src_w - half_w) + cy = clamp(cy, half_h, src_h - half_h) + x = int(round(cx - half_w)) + y = int(round(cy - half_h)) + return x, y, w, h + +def draw_center(img, pt, color, label=None): + if pt is None: return + x, y = int(pt[0]), int(pt[1]) + cv2.circle(img, (x, y), 6, color, -1) + if label: + cv2.putText(img, label, (x + 8, y - 8), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA) + +def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float: + a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV) + b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV) + ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256]) + hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256]) + cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX) + return float((cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + 1.0)/2.0) + +def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path): + cmd = [ + FFMPEG_BIN, "-y", + "-i", str(src_video), + "-i", str(silent_video), + "-map", "1:v:0", + "-map", "0:a:0?", + "-c:v", 
"copy", + "-c:a", "aac", "-b:a", "192k", + "-shortest", + str(out_video), + ] + subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + +def load_faces(name: str) -> List[Dict[str,Any]]: + p = FACE_COMBINED_DIR / f"{name}_faces.json" + if not p.exists(): return [] + return json.loads(p.read_text(encoding="utf-8")) + +def load_target_map_or_segments(name: str, total_frames: int) -> List[Optional[int] | Dict]: + """ + Bevorzugt *_target_by_frame.json (Liste Dicts mit t,cx,cy,w,h). + Fallback: *_segments.json (pro Frame Track-ID). + Gibt Liste gleicher Länge wie total_frames zurück. + """ + map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json" + if map_p.exists(): + target = json.loads(map_p.read_text(encoding="utf-8")) + # Falls es Dicts sind (cx,cy,w,h pro frame), einfach zurückgeben: + if target and isinstance(target[0], dict): + if len(target) < total_frames: + last = target[-1] if target else {"t":0,"cx":0.5,"cy":0.5,"w":0.6,"h":0.6} + target += [last] * (total_frames - len(target)) + return target[:total_frames] + # Falls numerische IDs drin wären, fällt es unten durch auf segs-Logik + seg_p = SEGMENTS_DIR / f"{name}_segments.json" + if seg_p.exists(): + segs = json.loads(seg_p.read_text(encoding="utf-8")) + target_tid = [None]*total_frames + for s in segs: + a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"] + for t in range(max(0,a), min(total_frames, b+1)): + target_tid[t] = tid + return target_tid + return [None]*total_frames + +def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]: + if target_tid is None: + return fallback + faces = faces_frame.get("faces", []) + for f in faces: + if int(f.get("track_id", -1)) == int(target_tid): + x,y,w,h = f.get("bbox", [None,None,None,None]) + if None not in (x,y,w,h): + return (float(x + w/2), float(y + h/2)) + return fallback + +def parse_args(): + p = argparse.ArgumentParser(description="Apply 9:16 Auto-Crop auf Rohclips mit Face-/Target-Daten.") + p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: *.mp4)") + p.add_argument("--out_w", type=int, default=OUT_W_DEFAULT, help="Output-Breite (Default: 1080)") + p.add_argument("--out_h", type=int, default=OUT_H_DEFAULT, help="Output-Höhe (Default: 1920)") + p.add_argument("--zoom_pad", type=float, default=ZOOM_PAD_FRAC_DEFAULT, help="Zoom-Pad (0..1, Default 0.10)") + p.add_argument("--median", type=int, default=MEDIAN_WIN_DEFAULT, help="Median-Fenster (ungerade, >=3)") + p.add_argument("--ema", type=float, default=EMA_ALPHA_DEFAULT, help="EMA-Alpha (0..1)") + p.add_argument("--deadband", type=float, default=DEADBAND_PX_DEFAULT, help="Totband in Pixel") + p.add_argument("--switch_cd", type=int, default=SWITCH_COOLDOWN_FR_DEFAULT, help="Cooldown-Frames nach Trackwechsel") + p.add_argument("--cut_detect", action="store_true", default=USE_CUT_DETECT_DEFAULT, help="Szenenschnitt-Erkennung aktivieren") + p.add_argument("--cut_corr", type=float, default=CUT_CORR_THRESH_DEFAULT, help="Korrelation-Schwelle (0..1)") + p.add_argument("--cut_cd", type=int, default=CUT_COOLDOWN_DEFAULT, help="Cooldown-Frames nach Cut") + p.add_argument("--mux_audio", action="store_true", default=MUX_AUDIO_DEFAULT, help="Audio vom Original muxen") + p.add_argument("--debug", action="store_true", help="Debug-Overlay anzeigen (langsam)") + p.add_argument("--debug_scale", type=float, default=DEBUG_SCALE_DEFAULT, help="Skalierung Debug-Preview") + 
p.add_argument("--overwrite", action="store_true", help="Existierende Outputs überschreiben") + return p.parse_args() + +def main(): + args = parse_args() + OUT_DIR = CROPPED_DIR + OUT_DIR.mkdir(parents=True, exist_ok=True) + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + clips = sorted(list(RAW_CLIPS_DIR.glob(args.pattern))) + if not clips: + print(f"⚠️ Keine Clips in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'") + return + + print(f"🔎 {len(clips)} Clips gefunden …") + for video_path in clips: + name = video_path.stem + out_path = OUT_DIR / f"{name}_9x16.mp4" + if out_path.exists() and not args.overwrite: + print(f"⏭️ Skip (existiert): {out_path.name}") + continue + + # Video öffnen + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + print(f"❌ Kann Video nicht öffnen: {video_path.name}") + continue + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + # Face/Target laden + faces_all = load_faces(name) + if faces_all and len(faces_all) < total: + faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all)) + target_by_frame = load_target_map_or_segments(name, total) + + # Writer vorbereiten + writer = cv2.VideoWriter(str(out_path), + cv2.VideoWriter_fourcc(*"mp4v"), + fps, (args.out_w, args.out_h)) + + median_buf = deque(maxlen=max(3, args.median if args.median % 2 else args.median+1)) + ema_center: Optional[Tuple[float,float]] = None + last_center: Optional[Tuple[float,float]] = (width/2, height/2) + switch_cooldown = 0 + + prev_small = None + cut_cd = 0 + + print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}") + + for t in range(total): + ret, frame = cap.read() + if not ret: break + + # Ziel bestimmen: + desired = None + tgt = target_by_frame[t] if t < len(target_by_frame) else None + + # Fall A: target_by_frame.json mit direkten Zentren (Dict) + if isinstance(tgt, dict) and all(k in tgt for k in ("cx","cy","w","h")): + desired = (float(tgt["cx"])*width, float(tgt["cy"])*height) + else: + # Fall B: numerische Track-ID + target_tid = tgt if tgt is None or isinstance(tgt, (int, float)) else None + faces_fr = faces_all[t] if (faces_all and t < len(faces_all)) else {"faces":[]} + desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2)) + + # Szenenschnitt? 
+ if args.cut_detect: + small = cv2.resize(frame, (128, 72)) + if prev_small is not None: + corr = scene_corr(prev_small, small) + if corr < args.cut_corr: + ema_center = desired + last_center = desired + switch_cooldown = args.switch_cd + cut_cd = args.cut_cd + prev_small = small + + # Median-Filter + median_buf.append(desired) + if len(median_buf) >= 3: + xs = sorted(p[0] for p in median_buf) + ys = sorted(p[1] for p in median_buf) + m = len(median_buf)//2 + desired_f = (xs[m], ys[m]) + else: + desired_f = desired + + # Trackwechsel erkennen (nur bei Track-IDs sauber möglich) + if t > 0: + prev_tgt = target_by_frame[t-1] if t-1 < len(target_by_frame) else None + else: + prev_tgt = tgt + is_switch = (not isinstance(tgt, dict)) and (tgt != prev_tgt) + + if ema_center is None: + ema_center = desired_f + if last_center is None: + last_center = desired_f + + if is_switch: + ema_center = desired_f + last_center = desired_f + switch_cooldown = args.switch_cd + else: + dx = desired_f[0] - ema_center[0] + dy = desired_f[1] - ema_center[1] + dist = math.hypot(dx, dy) + if cut_cd > 0: + ema_center = desired_f + cut_cd -= 1 + else: + if dist > args.deadband: + ema_center = (ema_center[0] + dx*args.ema, + ema_center[1] + dy*args.ema) + + last_center = desired_f + + # 9:16 Crop anwenden + x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height, + args.out_w, args.out_h, args.zoom_pad) + cropped = frame[y:y+h, x:x+w] + if cropped.size == 0: cropped = frame + final = cv2.resize(cropped, (args.out_w, args.out_h), interpolation=cv2.INTER_AREA) + writer.write(final) + + if args.debug: + dbg = frame.copy() + cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2) + draw_center(dbg, desired, (128,128,255), "desired") + draw_center(dbg, desired_f, (255,255, 0), "median") + draw_center(dbg, ema_center, ( 0,255,255), "ema") + cv2.putText(dbg, f"t={t+1}/{total}", (12, height-14), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA) + disp = cv2.resize(dbg, (int(width*args.debug_scale), int(height*args.debug_scale))) + cv2.imshow("Apply Debug", disp) + if cv2.waitKey(1) & 0xFF == ord("q"): + print("🛑 Abgebrochen (q).") + break + + writer.release() + cap.release() + + # Audio muxen? 
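        # Ablauf: Das stumm geschriebene 9:16-Video wird nach *.tmp.mp4 umbenannt; ffmpeg übernimmt daraus die
        # Videospur per Stream-Copy (-map 1:v:0, -c:v copy) und legt, falls vorhanden, die Audiospur des
        # Original-Clips als AAC darunter (-map 0:a:0?, -shortest). Anschließend wird die tmp-Datei gelöscht.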
+ if args.mux_audio: + tmp = out_path.with_suffix(".tmp.mp4") + try: + out_path.rename(tmp) + mux_audio_from_source(video_path, tmp, out_path) + finally: + if tmp.exists(): + try: tmp.unlink() + except: pass + print(f"✅ Fertig (mit Audio): {out_path.name}") + else: + print(f"✅ Fertig: {out_path.name}") + + if args.debug: + cv2.destroyAllWindows() + +if __name__ == "__main__": + main() diff --git a/src/reformat/main_detect_faces.py b/src/reformat/main_detect_faces.py new file mode 100644 index 0000000..44f0300 --- /dev/null +++ b/src/reformat/main_detect_faces.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Face-Detection + Mouth-Openness (YOLOv8-face + MediaPipe) +- liest Rohclips aus RAW_CLIPS_DIR +- schreibt pro Video eine faces.json in FACE_COMBINED_DIR +- optionaler Fortschrittsbalken (tqdm) +""" + +from __future__ import annotations +import argparse +import logging +import json +import time +from pathlib import Path +from contextlib import nullcontext +from typing import List, Dict, Any +from src.reformat.speaking import get_mouth_openness + +import cv2 +import numpy as np +import torch +from ultralytics import YOLO +import mediapipe as mp +import sys + +# ── Projekt-Root + zentrale Pfade laden +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) +from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR # zentrale Verzeichnisse + +# Fortschritt hübsch, wenn verfügbar +try: + from tqdm import tqdm + _HAS_TQDM = True +except Exception: + _HAS_TQDM = False + +# ---------- Performance Tweaks ---------- +torch.set_float32_matmul_precision("high") +cv2.setUseOptimized(True) + +# ---------- Hilfsfunktionen ---------- +def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop): + cx = (x1 + x2) * 0.5 + cy = (y1 + y2) * 0.5 + w = (x2 - x1) * (1.0 + 2.0 * margin_scale) + h = (y2 - y1) * (1.0 + 2.0 * margin_scale) + side = max(w, h, float(min_crop)) + half = side * 0.5 + + sx1 = int(max(0, round(cx - half))) + sy1 = int(max(0, round(cy - half))) + sx2 = int(min(W, round(cx + half))) + sy2 = int(min(H, round(cy + half))) + + side_w = max(0, sx2 - sx1) + side_h = max(0, sy2 - sy1) + side = max(2, min(side_w, side_h)) + sx2 = sx1 + side + sy2 = sy1 + side + return sx1, sy1, sx2, sy2 + + +def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h): + if not lm_lists: + return None + cx_t, cy_t = crop_w * 0.5, crop_h * 0.5 + best, best_d = None, 1e12 + for lms in lm_lists: + xs = [p.x * crop_w for p in lms.landmark] + ys = [p.y * crop_h for p in lms.landmark] + cx = sum(xs) / len(xs) + cy = sum(ys) / len(ys) + d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2 + if d < best_d: + best, best_d = lms, d + return best + + +def run_mesh(face_mesh, crop_bgr, upscale_if_small): + if crop_bgr.size == 0: + return None, 0.0 + ch, cw = crop_bgr.shape[:2] + if max(ch, cw) < upscale_if_small: + scale = float(upscale_if_small) / max(ch, cw) + new_w = max(1, int(round(cw * scale))) + new_h = max(1, int(round(ch * scale))) + crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + ch, cw = new_h, new_w + rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + res = face_mesh.process(rgb) + if not res.multi_face_landmarks: + return None, 0.0 + chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch) + if chosen is None: + return None, 0.0 + mo = get_mouth_openness(chosen.landmark, ch) + return chosen, float(mo) + +# ---------- Kernprozess ---------- +def process_video(video_path: Path, + output_path: Path, + model: YOLO, + face_mesh, + 
conf_thresh: float, + frame_skip: int, + downscale: float, + expansion_1: float, + expansion_2: float, + min_crop: int, + faces_upscale: int, + imgsz: int, + device: str, + max_det: int): + print(f"🎬 Starte Detection: {video_path.name}") + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + logging.error(f"❌ Kann Video nicht öffnen: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + total_to_process = None + if total_frames_raw > 0: + total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip) + + scaled_w = max(1, int(round(orig_w * downscale))) + scaled_h = max(1, int(round(orig_h * downscale))) + + data: List[Dict[str, Any]] = [] + frame_idx = 0 + processed_frames = 0 + + sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0 + sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0 + + autocast_ctx = ( + torch.autocast(device_type=device, dtype=torch.float16) + if device in ("mps", "cuda") else nullcontext() + ) + + bar = None + start_t = time.time() + if _HAS_TQDM and total_to_process: + bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True) + + while True: + ret, frame = cap.read() + if not ret: + break + + if frame_skip > 1 and (frame_idx % frame_skip != 0): + frame_idx += 1 + continue + + frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA) + + with torch.no_grad(): + with autocast_ctx: + # Ultralytics 8 API: __call__ statt .predict() (beide funktionieren) + result = model(frame_infer, imgsz=imgsz, device=device, verbose=False, + conf=conf_thresh, iou=0.5, max_det=max_det) + detections = result[0] + + faces = [] + for i in range(len(detections.boxes)): + box = detections.boxes[i] + conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf) + if conf < conf_thresh: + continue + x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()] + if downscale != 1.0: + x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy + x1 = max(0.0, min(x1, orig_w - 1)) + y1 = max(0.0, min(y1, orig_h - 1)) + x2 = max(0.0, min(x2, orig_w - 1)) + y2 = max(0.0, min(y2, orig_h - 1)) + + w = max(1.0, x2 - x1) + h = max(1.0, y2 - y1) + cx = x1 + w / 2.0 + cy = y1 + h / 2.0 + + # Pass 1 + sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop) + if sx2 - sx1 < 4 or sy2 - sy1 < 4: + continue + face_crop = frame[sy1:sy2, sx1:sx2] + _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale) + + # Pass 2 nur wenn nötig + if mouth_open == 0.0: + sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop) + if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4: + face_crop_b = frame[sy1b:sy2b, sx1b:sx2b] + _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale) + + faces.append({ + "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))], + "conf": round(conf, 3), + "center": [round(cx, 1), round(cy, 1)], + "mouth_openness": round(float(mouth_open), 3) + }) + + data.append({ + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 3), + "W": orig_w, + "H": orig_h, + "faces": faces + }) + frame_idx += 1 + processed_frames += 1 + + # Fortschritt + if bar is not None: + bar.update(1) + else: + if processed_frames % 30 == 0: + elapsed = time.time() - start_t + rate = processed_frames / max(1e-6, elapsed) # frames/sec + if 
total_to_process: + remaining = max(0, total_to_process - processed_frames) + eta_sec = remaining / max(1e-6, rate) + print(f"[{video_path.name}] {processed_frames}/{total_to_process} " + f"({processed_frames/total_to_process*100:.1f}%) " + f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min") + else: + print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s") + + cap.release() + if bar is not None: + bar.close() + + # schön formatiertes JSON + output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"✅ Faces gespeichert: {output_path.name}") + +# ---------- CLI ---------- +def parse_args(): + p = argparse.ArgumentParser(description="YOLOv8-face + MediaPipe FaceMesh → faces.json pro Clip") + # Verzeichnisse (Default aus config.py) + p.add_argument("--input-dir", type=Path, default=RAW_CLIPS_DIR, help=f"Rohclips (Default: {RAW_CLIPS_DIR})") + p.add_argument("--output-dir", type=Path, default=FACE_COMBINED_DIR, help=f"Zielordner (Default: {FACE_COMBINED_DIR})") + # Modell + p.add_argument("--model", type=Path, default=ROOT / "models" / "yolov8n-face.pt", + help="Pfad zum YOLOv8-face Modell (.pt)") + # Optimierte Defaults + p.add_argument("--conf-thresh", type=float, default=0.35) + p.add_argument("--frame-skip", type=int, default=1, help="Nur jeden n-ten Frame verarbeiten") + p.add_argument("--downscale", type=float, default=0.5, help="Eingangsframe auf Faktor verkleinern (0..1)") + p.add_argument("--expansion", type=float, default=0.4, help="Crop-Margin Pass 1 (relativ)") + p.add_argument("--expansion2", type=float, default=0.8, help="Crop-Margin Pass 2 (relativ)") + p.add_argument("--min-crop", type=int, default=160, help="Minimaler Croprand in Pixeln (quadratisch)") + p.add_argument("--faces-upscale", type=int, default=192, help="Minimale Kantenlänge für FaceMesh (bei kleineren Crops upscalen)") + p.add_argument("--imgsz", type=int, default=448) + p.add_argument("--max-det", type=int, default=20) + p.add_argument("--use-refine", action="store_true", default=False, help="MediaPipe mit refine_landmarks") + return p.parse_args() + +def main(): + args = parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + args.output_dir.mkdir(parents=True, exist_ok=True) + + # YOLO Modell & Device + yolo = YOLO(str(args.model)) + if torch.backends.mps.is_available(): + device = "mps" + elif torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + yolo.to(device) + print(f"🖥️ Inference-Device: {device}") + + # Warmup + try: + with torch.no_grad(): + dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8) + _ = yolo(source=[dummy], imgsz=args.imgsz, verbose=False, device=device) + except Exception: + pass + + # Eingabedateien anzeigen + videos = sorted([*args.input_dir.glob("*.mp4"), *args.input_dir.glob("*.mov"), *args.input_dir.glob("*.mkv")]) + print(f"🔍 Input-Ordner: {args.input_dir.resolve()}") + if not videos: + print("⚠️ Keine passenden Videos gefunden.") + return + print("📁 Dateien:") + for p in videos: + print(" →", p.name) + + outer = None + if _HAS_TQDM: + outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False) + + with mp.solutions.face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=10, + refine_landmarks=args.use_refine, + min_detection_confidence=0.5, + min_tracking_confidence=0.5 + ) as face_mesh: + for vid in videos: + out = args.output_dir / f"{vid.stem}_faces.json" + process_video( + video_path=vid, + output_path=out, + model=yolo, + 
face_mesh=face_mesh, + conf_thresh=args.conf_thresh, + frame_skip=args.frame_skip, + downscale=args.downscale, + expansion_1=args.expansion, + expansion_2=args.expansion2, + min_crop=args.min_crop, + faces_upscale=args.faces_upscale, + imgsz=args.imgsz, + device=device, + max_det=args.max_det + ) + if outer is not None: + outer.update(1) + + if outer is not None: + outer.close() + +if __name__ == "__main__": + main() diff --git a/src/reformat/main_track_faces.py b/src/reformat/main_track_faces.py new file mode 100644 index 0000000..258bf02 --- /dev/null +++ b/src/reformat/main_track_faces.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import logging, json +from pathlib import Path +from typing import List, Dict, Any +import sys + +# Projekt-Root verfügbar machen +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import FACE_COMBINED_DIR, FACE_CROP_CENTERS # ggf. SEGMENTS_DIR, wenn du dorthin schreibst + + +def iou(boxA, boxB): + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2]) + yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3]) + interW, interH = max(0, xB-xA), max(0, yB-yA) + inter = interW * interH + union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter + return inter/union if union > 0 else 0.0 + +def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3): + next_id = 0 + last_boxes = {} # track_id -> bbox + for frame in faces_all: + new_boxes = {} + for face in frame["faces"]: + box = face["bbox"] + # match gegen bestehende + best_id, best_iou = None, 0.0 + for tid, prev_box in last_boxes.items(): + ov = iou(box, prev_box) + if ov > best_iou: + best_id, best_iou = tid, ov + if best_iou > iou_thresh: + face["track_id"] = best_id + new_boxes[best_id] = box + else: + face["track_id"] = next_id + new_boxes[next_id] = box + next_id += 1 + last_boxes = new_boxes + return faces_all + +def main(): + # Eingabe: erkannte Gesichter/Tracks + FACE_DIR = FACE_COMBINED_DIR + # Ausgabe: z. B. 
berechnete Center pro Frame + OUT_DIR = FACE_CROP_CENTERS + OUT_DIR.mkdir(parents=True, exist_ok=True) + + for f in FACE_DIR.glob("*_faces.json"): + try: + faces_all = json.loads(f.read_text(encoding="utf-8")) + except Exception as e: + print(f"❌ Fehler beim Laden {f.name}: {e}") + continue + + tracked = track_faces(faces_all) + f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8") + print(f"✅ Track-IDs ergänzt: {f.name}") + + # zusätzlich centers.json (dominant = höchster mouth_openness pro Frame) + centers = [] + for fr in tracked: + if fr["faces"]: + best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0)) + centers.append([best["center"][0], best["center"][1]]) + else: + centers.append([fr["W"]/2, fr["H"]/2]) + centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json") + centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8") + print(f"📝 Centers gespeichert: {centers_path.name}") + +if __name__ == "__main__": + main() diff --git a/src/reformat/make_segments.py b/src/reformat/make_segments.py new file mode 100644 index 0000000..1c438f5 --- /dev/null +++ b/src/reformat/make_segments.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +# make_segments.py — erzeugt pro Highlight eine Zielspur (target_by_frame.json) fürs Cropping + +from __future__ import annotations +import json +import argparse +from dataclasses import dataclass +from typing import List, Dict, Optional, Tuple +from pathlib import Path +import sys + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/reformat/) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS, SEGMENTS_DIR + +try: + from moviepy.video.io.VideoFileClip import VideoFileClip + MOVIEPY_OK = True +except Exception: + MOVIEPY_OK = False + + +# ────────────────────────────────────────────────────────────────────────────── +# Hilfsstrukturen +# ────────────────────────────────────────────────────────────────────────────── + +@dataclass +class FaceDet: + t: float # Sekunden + cx: float # Zentrum x (0..1) + cy: float # Zentrum y (0..1) + w: float # Breite rel. (0..1) + h: float # Höhe rel. 
(0..1) + track_id: Optional[int] = None + mouth_prob: Optional[float] = None + +def moving_average(xs: List[float], win: int) -> List[float]: + if win <= 1 or len(xs) <= 2: + return xs[:] + # ungerade Fensterbreite erzwingen + win = win if win % 2 == 1 else win + 1 + r = win // 2 + out = [] + for i in range(len(xs)): + a = max(0, i - r) + b = min(len(xs), i + r + 1) + out.append(sum(xs[a:b]) / (b - a)) + return out + +def clamp01(x: float) -> float: + return max(0.0, min(1.0, x)) + + +# ────────────────────────────────────────────────────────────────────────────── +# Lesen möglicher Eingabeformate (robust, schema-tolerant) +# ────────────────────────────────────────────────────────────────────────────── + +def _parse_face_like(obj: Dict, t: float, W: float | None = None, H: float | None = None) -> FaceDet: + """ + Erwartet entweder: + - bbox=[x,y,w,h] in Pixel → wird via W,H auf 0..1 normiert + - oder bereits normierte Felder cx,cy,w,h in 0..1 + Optional: track_id, mouth_prob / mouth_open / talking_prob + """ + if "bbox" in obj and isinstance(obj["bbox"], (list, tuple)) and len(obj["bbox"]) >= 4: + x, y, w, h = [float(v) for v in obj["bbox"][:4]] + if W and H and W > 0 and H > 0: + cx = (x + w * 0.5) / W + cy = (y + h * 0.5) / H + w = w / W + h = h / H + else: + # Falls Maße fehlen: best effort, danach clampen + cx = x + w * 0.5 + cy = y + h * 0.5 + cx, cy = clamp01(cx), clamp01(cy) + w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h)) + else: + cx = float(obj.get("cx", 0.5)) + cy = float(obj.get("cy", 0.5)) + w = float(obj.get("w", 0.3)) + h = float(obj.get("h", 0.3)) + cx, cy = clamp01(cx), clamp01(cy) + w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h)) + + track_id = obj.get("track_id") + mouth_prob = obj.get("mouth_prob") or obj.get("mouth_open") or obj.get("talking_prob") + mouth_prob = None if mouth_prob is None else float(mouth_prob) + + return FaceDet(t=t, cx=cx, cy=cy, w=w, h=h, track_id=track_id, mouth_prob=mouth_prob) + + +def load_faces_or_centers(stem: str, fps_hint: float | None = None) -> List[FaceDet]: + """ + Lädt die beste verfügbare Gesichts/Center-Quelle für ein Highlight. + Suchreihenfolge: + 1) FACE_COMBINED_DIR/{stem}_faces.json (Liste von Frames mit 'faces') + 2) FACE_CROP_CENTERS/{stem}_centers.json + - akzeptiert entweder [[cx,cy], ...] oder [{t,cx,cy,w,h}, ...] + """ + candidates = [ + (FACE_COMBINED_DIR / f"{stem}_faces.json", "faces"), + (FACE_CROP_CENTERS / f"{stem}_centers.json", "centers"), + ] + path = kind = None + for p, k in candidates: + if p.exists(): + path, kind = p, k + break + + if path is None: + print(f"⚠️ Keine Face/Centers-Datei gefunden für {stem}. Fallback später → (0.5,0.5).") + return [] + + try: + raw = path.read_text(encoding="utf-8") + data = json.loads(raw) + except Exception as e: + print(f"❌ Konnte {path.name} nicht lesen: {e}") + return [] + + dets: List[FaceDet] = [] + + # 1) Liste von Frames: [{ "W":..,"H":..,"timestamp"/"t":.., "faces":[...] }, ...] + if isinstance(data, list) and data and isinstance(data[0], dict) and "faces" in data[0]: + for fr in data: + W = float(fr.get("W") or 0.0) + H = float(fr.get("H") or 0.0) + t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0) + for f in fr.get("faces", []): + dets.append(_parse_face_like(f, t, W, H)) + + # 2) Dict mit "frames": [...] 
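+ #    (same per-frame schema as case 1, only wrapped in a top-level {"frames": [...]} object)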
+ elif isinstance(data, dict) and "frames" in data: + for fr in data["frames"]: + W = float(fr.get("W") or 0.0) + H = float(fr.get("H") or 0.0) + t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0) + for f in fr.get("faces", []): + dets.append(_parse_face_like(f, t, W, H)) + + # 3) centers.json als Liste von Listen: [[cx,cy], ...] + elif isinstance(data, list) and data and isinstance(data[0], (list, tuple)) and len(data[0]) >= 2: + fps = float(fps_hint or 25.0) + for i, pair in enumerate(data): + cx, cy = float(pair[0]), float(pair[1]) + dets.append(FaceDet(t=i / fps, cx=clamp01(cx), cy=clamp01(cy), w=0.6, h=0.6)) + + # 4) Liste von Dicts mit evtl. bereits normierten Feldern + elif isinstance(data, list) and data and isinstance(data[0], dict): + for item in data: + t = float(item.get("t") or item.get("time") or 0.0) + dets.append(_parse_face_like(item, t)) + + else: + print(f"⚠️ Unbekanntes JSON-Format in {path.name}.") + return [] + + # filtern & sortieren + dets = [d for d in dets if 0.0 <= d.cx <= 1.0 and 0.0 <= d.cy <= 1.0] + dets.sort(key=lambda d: d.t) + print(f"✅ {len(dets)} Detektionen aus {path.name} ({kind}).") + return dets + + +# ────────────────────────────────────────────────────────────────────────────── +# Zielspur berechnen +# ────────────────────────────────────────────────────────────────────────────── + +def build_target_by_frame( + faces: List[FaceDet], + duration: float, + fps: float, + smooth_win: int = 7 +) -> List[Dict]: + """ + Wählt pro Frame eine Zielposition (cx,cy,w,h). + Heuristik: + - bevorzuge Gesicht mit höchster mouth_prob (wenn vorhanden), + - sonst größtes Bounding-Box-Areal (w*h), + - halte IDs stabil (nicht zu häufige Sprünge). + Anschließend leichte Glättung (Moving Average) der Zentren/Größen. 
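+     Per frame, candidates are all detections within ±1/fps of the frame time
+     (or the temporally nearest detection if that window is empty); they are ranked
+     by the tuple (mouth_prob, box area, "same track as previous frame") and the
+     best one wins. Centers are smoothed with `smooth_win`, sizes with the shorter
+     window max(3, smooth_win // 2).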
+ """ + if fps <= 0: + fps = 25.0 + total_frames = max(1, int(round(duration * fps))) + if not faces: + # Fallback: center track + return [{"frame": i, "t": round(i / fps, 4), "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6} for i in range(total_frames)] + + frame_targets: List[Tuple[float, float, float, float]] = [] # (cx, cy, w, h) + last_track: Optional[int] = None + + # lineare Suche über faces (bei Bedarf später bucketisieren) + for i in range(total_frames): + t = i / fps + lo, hi = t - 1.0 / fps, t + 1.0 / fps + + cand: List[FaceDet] = [d for d in faces if lo <= d.t <= hi] + if not cand: + # Nimm den zeitlich nächsten + nearest = min(faces, key=lambda d: abs(d.t - t)) + cand = [nearest] + + def score(d: FaceDet) -> Tuple[float, float, float]: + mouth = -1.0 if d.mouth_prob is None else float(d.mouth_prob) # None schlechter als 0 + area = float(d.w) * float(d.h) + stable = 1.0 if (last_track is not None and d.track_id == last_track) else 0.0 + return (mouth, area, stable) + + cand.sort(key=score, reverse=True) + best = cand[0] + if best.track_id is not None: + last_track = best.track_id + frame_targets.append((best.cx, best.cy, best.w, best.h)) + + # Glätten + cxs = moving_average([c for c, _, _, _ in frame_targets], smooth_win) + cys = moving_average([c for _, c, _, _ in frame_targets], smooth_win) + ws = moving_average([w for *_, w, _ in frame_targets], max(3, smooth_win // 2)) + hs = moving_average([h for *_, _, h in frame_targets], max(3, smooth_win // 2)) + + out = [] + for i, (cx, cy, w, h) in enumerate(zip(cxs, cys, ws, hs)): + t = i / fps + out.append({ + "frame": i, + "t": round(t, 4), + "cx": round(clamp01(cx), 4), + "cy": round(clamp01(cy), 4), + "w": round(max(0.05, min(1.0, w)), 4), + "h": round(max(0.05, min(1.0, h)), 4), + }) + return out + + +# ────────────────────────────────────────────────────────────────────────────── +# I/O +# ────────────────────────────────────────────────────────────────────────────── + +def write_target_json(stem: str, target: List[Dict]) -> Path: + SEGMENTS_DIR.mkdir(parents=True, exist_ok=True) + out_path = SEGMENTS_DIR / f"{stem}_target_by_frame.json" + out_path.write_text(json.dumps(target, ensure_ascii=False, indent=2), encoding="utf-8") + return out_path + + +# ────────────────────────────────────────────────────────────────────────────── +# CLI / Main +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser(description="Erzeugt target_by_frame.json aus Face/Center-Detektionen für Cropping.") + p.add_argument("--pattern", type=str, default="highlight_*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: highlight_*.mp4)") + p.add_argument("--fps", type=float, default=0.0, help="FPS erzwingen (0 = aus Video lesen).") + p.add_argument("--smooth", type=int, default=7, help="Fensterbreite für Moving-Average-Glättung (ungerade).") + p.add_argument("--overwrite", action="store_true", help="Existierende target_by_frame.json überschreiben.") + return p.parse_args() + + +def main(): + if not MOVIEPY_OK: + raise RuntimeError("moviepy ist nicht installiert. Bitte `pip install moviepy` ausführen.") + + args = parse_args() + + vids = sorted(RAW_CLIPS_DIR.glob(args.pattern)) + if not vids: + print(f"⚠️ Keine Rohclips gefunden in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'.") + return + + print(f"🔎 Finde {len(vids)} Clips …") + + for vid in vids: + stem = vid.stem # z. B. 
highlight_3 + out_json = SEGMENTS_DIR / f"{stem}_target_by_frame.json" + if out_json.exists() and not args.overwrite: + print(f"⏭️ {out_json.name} existiert bereits – überspringe (nutze --overwrite zum Ersetzen).") + continue + + # Video-Metadaten + try: + with VideoFileClip(str(vid)) as V: + duration = float(V.duration or 0.0) + fps = float(args.fps or (V.fps or 25.0)) + except Exception as e: + print(f"❌ Kann Video {vid.name} nicht öffnen: {e} – Fallback duration/fps (10s/25fps).") + duration, fps = 10.0, (args.fps or 25.0) + + # Face/Centers laden (fps_hint durchreichen, wichtig für centers-Listen) + faces = load_faces_or_centers(stem, fps_hint=fps) + + # Zielspur bauen + target = build_target_by_frame(faces, duration=duration, fps=fps, smooth_win=args.smooth) + + # Schreiben + out = write_target_json(stem, target) + print(f"💾 geschrieben: {out}") + + print("🎉 Fertig.") + + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/analyze_mouth_activity.py b/src/reformat/new/analyze_mouth_activity.py new file mode 100644 index 0000000..41f71e4 --- /dev/null +++ b/src/reformat/new/analyze_mouth_activity.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# src/reformat/new/analyze_mouth_activity.py +import logging +from pathlib import Path +from typing import List, Dict, Any, Tuple, Optional + +# OpenAI optional; aktuell nicht genutzt (Flag fehlt bewusst) +# from openai import OpenAI + +# === HARTE DEFAULTS: einfach Play drücken === +PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") +RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined" +TIMED_DIR = PROJECT_ROOT / "data" / "transkripte" +CENTERS_DIR = PROJECT_ROOT / "data" / "face_crop_centers" + +def parse_timed_file(path: Path) -> List[Tuple[float, float]]: + """ + Erwartet Zeilen wie: + [00:00.00 - 00:05.20] Text... + Gibt Liste [(start_sec, end_sec)] zurück. Falls keine Treffer: leere Liste. 
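+     Example: "[00:01.50 - 00:04.25] Hallo" → (1.5, 4.25); the digits after the
+     dot are read as hundredths of a second (sms / 100.0).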
+ """ + import re + rx = re.compile(r"\[(\d+):(\d+)\.(\d+)\s*-\s*(\d+):(\d+)\.(\d+)\]") + segs = [] + try: + for line in path.read_text(encoding="utf-8").splitlines(): + m = rx.search(line) + if not m: + continue + smin, ssec, sms, emin, esec, ems = map(int, m.groups()) + start = smin * 60 + ssec + sms / 100.0 + end = emin * 60 + esec + ems / 100.0 + if end > start: + segs.append((start, end)) + except FileNotFoundError: + pass + return segs + +def select_speaker_center(faces: List[Dict[str, Any]]) -> Tuple[float, float]: + """Priorität: mouth_openness, Fallback: größte Fläche; sonst Bildmitte.""" + if not faces: + return (960.0, 540.0) + def area(f): + bx = f.get("bbox",[0,0,0,0]); return float(bx[2]*bx[3]) + best = max( + faces, + key=lambda f: (float(f.get("mouth_openness", 0.0)), area(f)) + ) + x, y, w, h = best["bbox"] + return (x + w/2.0, y + h/2.0) + +def load_json(path: Path): + import json + return json.loads(path.read_text(encoding="utf-8")) + +def save_json(obj, path: Path): + import json + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8") + +def process_one(base_name: str) -> bool: + faces_path = FACES_DIR / f"{base_name}_faces.json" + timed_path = TIMED_DIR / f"{base_name}_timed.txt" + centers_path = CENTERS_DIR / f"{base_name}_centers.json" + + if not faces_path.exists(): + logging.warning("Skip %-18s | Faces fehlen: %s", base_name, faces_path) + return False + if centers_path.exists(): + logging.info("Skip %-18s | Centers existieren schon: %s", base_name, centers_path.name) + return True + + try: + face_data: List[Dict[str, Any]] = load_json(faces_path) + except Exception as e: + logging.error("Fehler beim Lesen von %s: %s", faces_path, e) + return False + + segments = parse_timed_file(timed_path) + if not segments: + logging.warning("[%s] Keine Segmente erkannt oder Datei fehlt: %s", base_name, timed_path.name) + + centers: List[List[float]] = [] + for entry in face_data: + faces = entry.get("faces", []) + cx, cy = select_speaker_center(faces) + centers.append([float(cx), float(cy)]) + + save_json(centers, centers_path) + logging.info("OK %-18s | Centers gespeichert: %s (frames=%d)", base_name, centers_path.name, len(centers)) + return True + +def main(): + logging.basicConfig( + format="%(asctime)s %(levelname)s: %(message)s", + level=logging.INFO + ) + + if not RAW_DIR.exists(): + logging.error("RAW_DIR existiert nicht: %s", RAW_DIR) + return + + clips = sorted(RAW_DIR.glob("*.mp4")) + if not clips: + logging.warning("Keine Clips gefunden in %s", RAW_DIR) + return + + logging.info("Analyze (mouth) Batch-Mode: %d Clips", len(clips)) + ok = 0 + for clip in clips: + base = clip.stem + if process_one(base): + ok += 1 + logging.info("Fertig. 
%d/%d Clips verarbeitet.", ok, len(clips)) + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/main_apply_crop.py b/src/reformat/new/main_apply_crop.py new file mode 100644 index 0000000..cf90a2c --- /dev/null +++ b/src/reformat/new/main_apply_crop.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# src/reformat/new/main_apply_crop.py +from __future__ import annotations +import logging, json, math, subprocess +from pathlib import Path +from typing import Optional, Tuple, List, Dict, Any +from collections import deque + +import cv2 +import numpy as np + +# ==== Pfade ================================================================= +PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") +INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined" +SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" +OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +OUT_W, OUT_H = 1080, 1920 +TARGET_AR = OUT_W / OUT_H # 0.5625 + +# ==== Debug ================================================================= +DEBUG_MODE = False +DEBUG_SCALE = 0.6 +DRAW_GUIDES = True + +# ==== Smooth / Switch ======================================================= +MEDIAN_WIN = 5 +EMA_ALPHA = 0.22 +DEADBAND_PX = 8.0 +SWITCH_COOLDOWN_FRAMES = 12 # kurze Ruhe nach Segmentwechsel +ZOOM_PAD_FRAC = 0.10 + +# ==== Scene-Cut-Erkennung =================================================== +USE_CUT_DETECT = True +CUT_CORR_THRESH = 0.65 +CUT_COOLDOWN = 6 + +# ==== Audio-Mux ============================================================= +MUX_AUDIO = True +FFMPEG_BIN = "ffmpeg" +# ============================================================================ + +def clamp(v, lo, hi): return max(lo, min(hi, v)) + +def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int) -> tuple[int,int,int,int]: + """ + Liefert ein 9:16-Croprechteck (x,y,w,h) um (cx,cy). 
+ - AR bleibt IMMER exakt 9:16 (kein Squeeze) + - ZOOM_PAD_FRAC wirkt als uniformer Scale auf Breite und Höhe + - Rechteck bleibt vollständig im Bild + """ + src_ar = src_w / src_h + + if src_ar >= TARGET_AR: + base_h = src_h + base_w = int(round(base_h * TARGET_AR)) + else: + base_w = src_w + base_h = int(round(base_w / TARGET_AR)) + + desired_scale = 1.0 + ZOOM_PAD_FRAC + max_scale_w = src_w / base_w + max_scale_h = src_h / base_h + s = min(desired_scale, max_scale_w, max_scale_h) + + w = int(round(base_w * s)) + h = int(round(base_h * s)) + + half_w, half_h = w // 2, h // 2 + + cx = clamp(cx, half_w, src_w - half_w) + cy = clamp(cy, half_h, src_h - half_h) + + x = int(round(cx - half_w)) + y = int(round(cy - half_h)) + return x, y, w, h + +def draw_center(img, pt, color, label=None): + if pt is None: return + x, y = int(pt[0]), int(pt[1]) + cv2.circle(img, (x, y), 6, color, -1) + if label: + cv2.putText(img, label, (x + 8, y - 8), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA) + +def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float: + a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV) + b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV) + ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256]) + hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256]) + cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX) + corr = cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + return float((corr + 1.0)/2.0) + +def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path): + cmd = [ + FFMPEG_BIN, "-y", + "-i", str(src_video), + "-i", str(silent_video), + "-map", "1:v:0", + "-map", "0:a:0?", + "-c:v", "copy", + "-c:a", "aac", "-b:a", "192k", + "-shortest", + str(out_video), + ] + subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + +def load_faces(name: str) -> List[Dict[str,Any]]: + p = FACE_COMBINED_DIR / f"{name}_faces.json" + return json.loads(p.read_text(encoding="utf-8")) + +def load_segments(name: str, total_frames: int) -> List[Optional[int]]: + seg_p = SEGMENTS_DIR / f"{name}_segments.json" + map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json" + if map_p.exists(): + target = json.loads(map_p.read_text(encoding="utf-8")) + if len(target) < total_frames: + target += [target[-1] if target else None] * (total_frames - len(target)) + return target[:total_frames] + if seg_p.exists(): + segs = json.loads(seg_p.read_text(encoding="utf-8")) + target = [None]*total_frames + for s in segs: + a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"] + for t in range(max(0,a), min(total_frames, b+1)): + target[t] = tid + return target + return [None]*total_frames + +def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]: + if target_tid is None: + return fallback + faces = faces_frame.get("faces", []) + for f in faces: + if int(f.get("track_id", -1)) == int(target_tid): + x,y,w,h = f.get("bbox", [None,None,None,None]) + if None not in (x,y,w,h): + return (float(x + w/2), float(y + h/2)) + return fallback + +def main(): + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + clips = sorted(list(INPUT_VIDEO_DIR.glob("*.mp4")) + list(INPUT_VIDEO_DIR.glob("*.mov"))) + if not clips: + print(f"⚠️ Keine Clips in {INPUT_VIDEO_DIR}") + return + + for video_path in clips: + name = video_path.stem + faces_path = FACE_COMBINED_DIR / f"{name}_faces.json" + if not faces_path.exists(): + print(f"⏭️ Skip 
(keine Faces): {faces_path.name}") + continue + + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + print(f"❌ Kann Video nicht öffnen: {video_path.name}") + continue + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + faces_all = load_faces(name) + if len(faces_all) < total: + faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all)) + + target_by_frame = load_segments(name, total) + + out_path = OUTPUT_DIR / f"{name}_9x16.mp4" + if out_path.exists(): + print(f"⏭️ Skip: Output existiert bereits → {out_path.name}") + cap.release() + continue + + writer = cv2.VideoWriter( + str(out_path), + cv2.VideoWriter_fourcc(*"mp4v"), + fps, + (OUT_W, OUT_H) + ) + + median_buf = deque(maxlen=max(3, MEDIAN_WIN if MEDIAN_WIN % 2 else MEDIAN_WIN+1)) + ema_center: Optional[Tuple[float,float]] = None + last_center: Optional[Tuple[float,float]] = (width/2, height/2) + switch_cooldown = 0 + + prev_small = None + cut_cd = 0 + + print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}") + + for t in range(total): + ret, frame = cap.read() + if not ret: break + + target_tid = target_by_frame[t] if t < len(target_by_frame) else None + faces_fr = faces_all[t] if t < len(faces_all) else {"faces":[]} + desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2)) + + if USE_CUT_DETECT: + small = cv2.resize(frame, (128, 72)) + if prev_small is not None: + corr = scene_corr(prev_small, small) + if corr < CUT_CORR_THRESH: + ema_center = desired + last_center = desired + switch_cooldown = SWITCH_COOLDOWN_FRAMES + cut_cd = CUT_COOLDOWN + prev_small = small + + median_buf.append(desired) + if len(median_buf) >= 3: + xs = sorted(p[0] for p in median_buf) + ys = sorted(p[1] for p in median_buf) + m = len(median_buf)//2 + desired_f = (xs[m], ys[m]) + else: + desired_f = desired + + if t > 0: + prev_tid = target_by_frame[t-1] if t-1 < len(target_by_frame) else None + else: + prev_tid = target_tid + + if ema_center is None: + ema_center = desired_f + if last_center is None: + last_center = desired_f + + if target_tid != prev_tid: + ema_center = desired_f + last_center = desired_f + switch_cooldown = SWITCH_COOLDOWN_FRAMES + else: + dx = desired_f[0] - ema_center[0] + dy = desired_f[1] - ema_center[1] + dist = math.hypot(dx, dy) + if cut_cd > 0: + ema_center = desired_f + cut_cd -= 1 + else: + if dist > DEADBAND_PX: + ema_center = (ema_center[0] + dx*EMA_ALPHA, + ema_center[1] + dy*EMA_ALPHA) + + last_center = desired_f + + # neuer 9:16 Crop + x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height) + cropped = frame[y:y+h, x:x+w] + if cropped.size == 0: cropped = frame + final = cv2.resize(cropped, (OUT_W, OUT_H), interpolation=cv2.INTER_AREA) + writer.write(final) + + if DEBUG_MODE: + dbg = frame.copy() + cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2) + if DRAW_GUIDES: + draw_center(dbg, desired, (128,128,255), "desired") + draw_center(dbg, desired_f, (255,255, 0), "median") + draw_center(dbg, ema_center, ( 0,255,255), "ema") + cv2.putText(dbg, f"t={t+1}/{total} tid={target_tid}", + (12, height-14), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA) + disp = cv2.resize(dbg, (int(width*DEBUG_SCALE), int(height*DEBUG_SCALE))) + cv2.imshow("Apply Debug", disp) + if cv2.waitKey(1) & 0xFF == ord("q"): + print("🛑 Abgebrochen (q).") + break + + writer.release() + 
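+         # The cv2.VideoWriter output is video-only; below, the rendered 9:16 stream is
+         # stream-copied and the audio track is taken from the original raw clip and
+         # re-encoded to AAC via ffmpeg (see mux_audio_from_source above).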
cap.release() + + if MUX_AUDIO: + tmp = out_path.with_suffix(".tmp.mp4") + try: + out_path.rename(tmp) + mux_audio_from_source(video_path, tmp, out_path) + finally: + if tmp.exists(): + try: tmp.unlink() + except: pass + print(f"✅ Fertig (mit Audio): {out_path.name}") + else: + print(f"✅ Fertig: {out_path.name}") + + if DEBUG_MODE: + cv2.destroyAllWindows() + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/main_detect_faces.py b/src/reformat/new/main_detect_faces.py new file mode 100644 index 0000000..12094ec --- /dev/null +++ b/src/reformat/new/main_detect_faces.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import logging +import json +import time +from pathlib import Path +from contextlib import nullcontext + +import cv2 +import numpy as np +import torch +from ultralytics import YOLO +import mediapipe as mp + +# Fortschritt hübsch, wenn verfügbar +try: + from tqdm import tqdm + _HAS_TQDM = True +except Exception: + _HAS_TQDM = False + +from src.reformat.new.speaking import get_mouth_openness + +# ---------- Performance Tweaks ---------- +torch.set_float32_matmul_precision("high") +cv2.setUseOptimized(True) + +# ---------- Hilfsfunktionen ---------- + +def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop): + cx = (x1 + x2) * 0.5 + cy = (y1 + y2) * 0.5 + w = (x2 - x1) * (1.0 + 2.0 * margin_scale) + h = (y2 - y1) * (1.0 + 2.0 * margin_scale) + side = max(w, h, float(min_crop)) + half = side * 0.5 + + sx1 = int(max(0, round(cx - half))) + sy1 = int(max(0, round(cy - half))) + sx2 = int(min(W, round(cx + half))) + sy2 = int(min(H, round(cy + half))) + + side_w = max(0, sx2 - sx1) + side_h = max(0, sy2 - sy1) + side = max(2, min(side_w, side_h)) + sx2 = sx1 + side + sy2 = sy1 + side + return sx1, sy1, sx2, sy2 + + +def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h): + if not lm_lists: + return None + cx_t, cy_t = crop_w * 0.5, crop_h * 0.5 + best, best_d = None, 1e12 + for lms in lm_lists: + xs = [p.x * crop_w for p in lms.landmark] + ys = [p.y * crop_h for p in lms.landmark] + cx = sum(xs) / len(xs) + cy = sum(ys) / len(ys) + d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2 + if d < best_d: + best, best_d = lms, d + return best + + +def run_mesh(face_mesh, crop_bgr, upscale_if_small): + if crop_bgr.size == 0: + return None, 0.0 + ch, cw = crop_bgr.shape[:2] + if max(ch, cw) < upscale_if_small: + scale = float(upscale_if_small) / max(ch, cw) + new_w = max(1, int(round(cw * scale))) + new_h = max(1, int(round(ch * scale))) + crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + ch, cw = new_h, new_w + rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + res = face_mesh.process(rgb) + if not res.multi_face_landmarks: + return None, 0.0 + chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch) + if chosen is None: + return None, 0.0 + mo = get_mouth_openness(chosen.landmark, ch) + return chosen, float(mo) + +# ---------- Kernprozess ---------- + +def process_video(video_path: Path, + output_path: Path, + model: YOLO, + face_mesh, + conf_thresh: float, + frame_skip: int, + downscale: float, + expansion_1: float, + expansion_2: float, + min_crop: int, + faces_upscale: int, + imgsz: int, + device: str, + max_det: int): + print(f"🎬 Starte Detection: {video_path.name}") + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + logging.error(f"❌ Kann Video nicht öffnen: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + orig_w = 
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + # Wenn frame_skip > 1, reduziert sich die tatsächlich verarbeitete Anzahl + total_to_process = None + if total_frames_raw > 0: + total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip) + + scaled_w = max(1, int(round(orig_w * downscale))) + scaled_h = max(1, int(round(orig_h * downscale))) + + data = [] + frame_idx = 0 + processed_frames = 0 + + sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0 + sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0 + + autocast_ctx = ( + torch.autocast(device_type=device, dtype=torch.float16) + if device in ("mps", "cuda") else nullcontext() + ) + + # Fortschrittsbalken pro Video + bar = None + start_t = time.time() + if _HAS_TQDM: + bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True) + + while True: + ret, frame = cap.read() + if not ret: + break + + if frame_skip > 1 and (frame_idx % frame_skip != 0): + frame_idx += 1 + continue + + frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA) + + with torch.no_grad(): + with autocast_ctx: + detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False, + conf=conf_thresh, iou=0.5, max_det=max_det)[0] + + faces = [] + for i in range(len(detections.boxes)): + box = detections.boxes[i] + conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf) + if conf < conf_thresh: + continue + x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()] + if downscale != 1.0: + x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy + x1 = max(0.0, min(x1, orig_w - 1)) + y1 = max(0.0, min(y1, orig_h - 1)) + x2 = max(0.0, min(x2, orig_w - 1)) + y2 = max(0.0, min(y2, orig_h - 1)) + + w = max(1.0, x2 - x1) + h = max(1.0, y2 - y1) + cx = x1 + w / 2.0 + cy = y1 + h / 2.0 + + # Pass 1 + sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop) + if sx2 - sx1 < 4 or sy2 - sy1 < 4: + continue + face_crop = frame[sy1:sy2, sx1:sx2] + _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale) + + # Pass 2 nur wenn nötig + if mouth_open == 0.0: + sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop) + if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4: + face_crop_b = frame[sy1b:sy2b, sx1b:sx2b] + _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale) + + faces.append({ + "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))], + "conf": round(conf, 3), + "center": [round(cx, 1), round(cy, 1)], + "mouth_openness": round(float(mouth_open), 3) + }) + + data.append({ + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 3), + "W": orig_w, + "H": orig_h, + "faces": faces + }) + frame_idx += 1 + processed_frames += 1 + + # Fortschritt aktualisieren + if _HAS_TQDM: + bar.update(1) + else: + # leichter Fallback: ETA Ausgabe alle 30 verarbeitete Frames + if processed_frames % 30 == 0: + elapsed = time.time() - start_t + rate = processed_frames / max(1e-6, elapsed) # frames/sec + if total_to_process: + remaining = max(0, total_to_process - processed_frames) + eta_sec = remaining / max(1e-6, rate) + print(f"[{video_path.name}] {processed_frames}/{total_to_process} " + f"({processed_frames/total_to_process*100:.1f}%) " + f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min") + else: + print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s") + + 
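+     # End of frame loop: release the capture, close the progress bar, then write all
+     # per-frame detections to the per-clip *_faces.json file (compact JSON, ensure_ascii=False).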
cap.release() + if _HAS_TQDM and bar is not None: + bar.close() + + output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8") + print(f"✅ Faces gespeichert: {output_path.name}") + +def main(): + parser = argparse.ArgumentParser() + # Verzeichnisse + parser.add_argument("--input-dir", type=Path, + default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips")) + parser.add_argument("--output-dir", type=Path, + default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined")) + parser.add_argument("--model", type=Path, + default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt")) + # Optimierte Defaults (keine Presets nötig) + parser.add_argument("--conf-thresh", type=float, default=0.35) + parser.add_argument("--frame-skip", type=int, default=1) + parser.add_argument("--downscale", type=float, default=0.5) + parser.add_argument("--expansion", type=float, default=0.4) + parser.add_argument("--expansion2", type=float, default=0.8) + parser.add_argument("--min-crop", type=int, default=160) + parser.add_argument("--faces-upscale", type=int, default=192) + parser.add_argument("--imgsz", type=int, default=448) + parser.add_argument("--max-det", type=int, default=20) + parser.add_argument("--use-refine", action="store_true", default=False) + args = parser.parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + args.output_dir.mkdir(parents=True, exist_ok=True) + + # Model & Device + yolo = YOLO(str(args.model)) + if torch.backends.mps.is_available(): + device = "mps" + elif torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + yolo.to(device) + print(f"🖥️ Inference-Device: {device}") + + # Warmup (reduziert Anlaufschwankungen) + try: + with torch.no_grad(): + dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8) + _ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device) + except Exception: + pass + + # Liste der Videos (für Gesamt-Fortschritt) + videos = sorted(args.input_dir.glob("*.mp4")) + print(f"🔍 Input-Ordner: {args.input_dir.resolve()}") + print("📁 Dateien:") + for p in sorted(args.input_dir.glob("*")): + print(" →", p.name) + + # Gesamt-Fortschrittsbalken pro Datei + outer = None + if _HAS_TQDM: + outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False) + + with mp.solutions.face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=10, + refine_landmarks=args.use_refine, + min_detection_confidence=0.5, + min_tracking_confidence=0.5 + ) as face_mesh: + for vid in videos: + out = args.output_dir / f"{vid.stem}_faces.json" + process_video( + video_path=vid, + output_path=out, + model=yolo, + face_mesh=face_mesh, + conf_thresh=args.conf_thresh, + frame_skip=args.frame_skip, + downscale=args.downscale, + expansion_1=args.expansion, + expansion_2=args.expansion2, + min_crop=args.min_crop, + faces_upscale=args.faces_upscale, + imgsz=args.imgsz, + device=device, + max_det=args.max_det + ) + if _HAS_TQDM and outer is not None: + outer.update(1) + + if _HAS_TQDM and outer is not None: + outer.close() + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/main_track_faces.py b/src/reformat/new/main_track_faces.py new file mode 100644 index 0000000..53d7347 --- /dev/null +++ b/src/reformat/new/main_track_faces.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +import logging, json +from pathlib import Path +from typing import List, Dict, Any + +def iou(boxA, boxB): + 
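+     """Intersection-over-Union of two [x, y, w, h] boxes; returns 0.0 if the union is empty."""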
xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2]) + yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3]) + interW, interH = max(0, xB-xA), max(0, yB-yA) + inter = interW * interH + union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter + return inter/union if union > 0 else 0.0 + +def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3): + next_id = 0 + last_boxes = {} # track_id -> bbox + for frame in faces_all: + new_boxes = {} + for face in frame["faces"]: + box = face["bbox"] + # match gegen bestehende + best_id, best_iou = None, 0.0 + for tid, prev_box in last_boxes.items(): + ov = iou(box, prev_box) + if ov > best_iou: + best_id, best_iou = tid, ov + if best_iou > iou_thresh: + face["track_id"] = best_id + new_boxes[best_id] = box + else: + face["track_id"] = next_id + new_boxes[next_id] = box + next_id += 1 + last_boxes = new_boxes + return faces_all + +def main(): + PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") + FACE_DIR = PROJECT_ROOT / "data" / "face_data_combined" + + for f in FACE_DIR.glob("*_faces.json"): + try: + faces_all = json.loads(f.read_text(encoding="utf-8")) + except Exception as e: + print(f"❌ Fehler beim Laden {f.name}: {e}") + continue + + tracked = track_faces(faces_all) + f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8") + print(f"✅ Track-IDs ergänzt: {f.name}") + + # zusätzlich centers.json (dominant = höchster mouth_openness pro Frame) + centers = [] + for fr in tracked: + if fr["faces"]: + best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0)) + centers.append([best["center"][0], best["center"][1]]) + else: + centers.append([fr["W"]/2, fr["H"]/2]) + centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json") + centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8") + print(f"📝 Centers gespeichert: {centers_path.name}") + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/make_segments.py b/src/reformat/new/make_segments.py new file mode 100644 index 0000000..c661485 --- /dev/null +++ b/src/reformat/new/make_segments.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# src/reformat/new/make_segments.py +from __future__ import annotations +import json, math +from pathlib import Path +from dataclasses import dataclass +from typing import List, Dict, Any, Optional +import numpy as np +import cv2 + +# ==== Pfade (an dein Projekt angepasst) ===================================== +PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") +RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" # Videos +FACE_COMBINED_DIR= PROJECT_ROOT / "data" / "face_data_combined" # *_faces.json +SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" # Output +SEGMENTS_DIR.mkdir(parents=True, exist_ok=True) +# =========================================================================== + +# === Segment-Parameter === +WIN_SEC = 1.2 # Fensterlänge +STRIDE_SEC = 0.6 # Schrittweite +HYSTERESIS_FACTOR = 1.25 # neuer Sprecher muss +25% besser sein +MIN_SEG_SEC = 1.0 # kürzere Segmente werden an Nachbarn gemerged +CONF_MIN = 0.35 # Sichtbarkeits-Threshold +AREA_CAP_FRAC = 0.12 # ab 12% Framefläche kappen wir den Flächenbonus + +@dataclass +class Segment: + start_f: int + end_f: int + track_id: Optional[int] + +def robust_minmax(vals, p_lo=5, p_hi=95): + v = np.array(vals, dtype=float) + lo, hi = np.percentile(v, [p_lo, p_hi]) + if hi <= lo: hi = lo + 1e-6 + return float(lo), float(hi) + +def score_face(face: 
Dict[str,Any], W: int, H: int, cx: float, cy: float, + lo: float, hi: float) -> float: + # Mundaktivität robust normalisieren + mo = float(face.get("mouth_openness", 0.0)) + mo = (mo - lo) / (hi - lo + 1e-9) + mo = float(np.clip(mo, 0.0, 1.0)) + + x, y, w, h = map(float, face.get("bbox", [0,0,0,0])) + conf = float(face.get("conf", 1.0)) + if conf < CONF_MIN or w <= 5 or h <= 5: # sehr kleine/unsichere Gesichter raus + return 0.0 + + area = (w*h) / (W*H + 1e-9) + size_w = min(1.0, area / AREA_CAP_FRAC) # Flächengewicht + fx = x + w/2; fy = y + h/2 + dist = math.hypot(fx - cx, fy - cy) / math.hypot(W/2, H/2) + center_w = max(0.0, 1.0 - dist**2) # Mitte leicht bevorzugen + + # MO dominiert, Fläche und Mitte geben Stabilität + return mo * (0.6 + 0.3*size_w + 0.1*center_w) + +def build_segments_for_clip(faces_per_frame: List[Dict[str,Any]], fps: float) -> (List[Segment], List[Optional[int]]): + T = len(faces_per_frame) + if T == 0: + return [], [] + + # Framegröße + W = faces_per_frame[0].get("W") or faces_per_frame[0].get("width") + H = faces_per_frame[0].get("H") or faces_per_frame[0].get("height") + if not W or not H: + # Versuch, aus BBox-Max abzuleiten (Fallback) + max_w = max((f["bbox"][0]+f["bbox"][2]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1920 + max_h = max((f["bbox"][1]+f["bbox"][3]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1080 + W, H = int(max_w), int(max_h) + + # Mundwerte für robuste Normierung sammeln + all_mo = [float(f.get("mouth_openness", 0.0)) + for fr in faces_per_frame for f in fr.get("faces", [])] + lo, hi = robust_minmax(all_mo) if all_mo else (0.0, 1.0) + + win = max(1, int(round(WIN_SEC * fps))) + stride = max(1, int(round(STRIDE_SEC * fps))) + minseg = max(1, int(round(MIN_SEG_SEC * fps))) + + chosen_by_frame: List[Optional[int]] = [None]*T + last_track: Optional[int] = None + + for start in range(0, T, stride): + end = min(T, start + win) + sums: Dict[int, float] = {} + for t in range(start, end): + faces = faces_per_frame[t].get("faces", []) + if not faces: continue + for face in faces: + tid = face.get("track_id") + if tid is None: + continue + s = score_face(face, W, H, W/2, H/2, lo, hi) + if s <= 0: + continue + tid = int(tid) + sums[tid] = sums.get(tid, 0.0) + s + + if not sums: + chosen = last_track + else: + best_tid, best_val = max(sums.items(), key=lambda kv: kv[1]) + if last_track is None: + chosen = best_tid + else: + prev_val = sums.get(last_track, 0.0) + chosen = best_tid if best_val > prev_val * HYSTERESIS_FACTOR else last_track + + for t in range(start, end): + chosen_by_frame[t] = chosen + last_track = chosen + + # Lücken auffüllen + for t in range(T): + if chosen_by_frame[t] is None: + chosen_by_frame[t] = last_track + + # Segmente bauen + segs: List[Segment] = [] + cur = chosen_by_frame[0] + seg_start = 0 + for t in range(1, T): + if chosen_by_frame[t] != cur: + segs.append(Segment(seg_start, t-1, cur)) + cur = chosen_by_frame[t] + seg_start = t + segs.append(Segment(seg_start, T-1, cur)) + + # Mindestlänge: zu kurze an vorheriges mergen + out: List[Segment] = [] + for s in segs: + if out and (s.end_f - s.start_f + 1) < minseg: + out[-1].end_f = s.end_f + else: + out.append(s) + + return out, chosen_by_frame + +def main(): + clips = sorted(list(RAW_DIR.glob("*.mp4")) + list(RAW_DIR.glob("*.mov"))) + if not clips: + print(f"⚠️ Keine Videos in {RAW_DIR}") + return + + for vid in 
clips: + name = vid.stem + faces_path = FACE_COMBINED_DIR / f"{name}_faces.json" + if not faces_path.exists(): + print(f"⏭️ Skip (keine Faces): {faces_path.name}") + continue + + # FPS vom Video + cap = cv2.VideoCapture(str(vid)) + if not cap.isOpened(): + print(f"❌ Kann Video nicht öffnen: {vid.name}") + continue + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + cap.release() + + try: + faces_per_frame = json.loads(faces_path.read_text(encoding="utf-8")) + except Exception as e: + print(f"❌ Fehler beim Lesen {faces_path.name}: {e}") + continue + + segs, chosen = build_segments_for_clip(faces_per_frame, fps) + + seg_out = SEGMENTS_DIR / f"{name}_segments.json" + map_out = SEGMENTS_DIR / f"{name}_target_by_frame.json" + seg_out.write_text(json.dumps([s.__dict__ for s in segs], ensure_ascii=False), encoding="utf-8") + map_out.write_text(json.dumps(chosen, ensure_ascii=False), encoding="utf-8") + + print(f"✅ Segmente erzeugt: {seg_out.name} ({len(segs)} Segmente)") + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/smart_speaker_tracker.py b/src/reformat/new/smart_speaker_tracker.py new file mode 100644 index 0000000..5875c54 --- /dev/null +++ b/src/reformat/new/smart_speaker_tracker.py @@ -0,0 +1,58 @@ +from typing import Dict, List, Tuple, Optional +from .tracking import FaceTracker + +class SmartSpeakerTracker: + def __init__(self): + self.face_tracker = FaceTracker() + self.movement_per_id: Dict[int, float] = {} + self.prev_openness: Dict[int, float] = {} + self.confirmation_counter: Dict[int, int] = {} + self.speaker_threshold = 3.0 # wie viel Lippenbewegung braucht es mind. + self.decay_factor = 0.9 # wie schnell "verblasst" die Bewegung + self.speaker_confirm_frames = 25 # wie viele Frames muss ein Sprecher dominieren + self.speaker_id: Optional[int] = None + + def update(self, faces: List[Dict]) -> Tuple[float, float]: + if not faces: + return self.face_tracker.update([]) + + # Lippenbewegung analysieren + for face in faces: + id = face.get("id") + openness = face.get("mouth_openness", 0.0) + prev = self.prev_openness.get(id, openness) + movement = abs(openness - prev) + + # Bewegung aufaddieren mit Decay + old_score = self.movement_per_id.get(id, 0.0) * self.decay_factor + self.movement_per_id[id] = old_score + movement + self.prev_openness[id] = openness + + # Finde ID mit größter Bewegung + if self.movement_per_id: + top_id = max(self.movement_per_id, key=self.movement_per_id.get) + top_movement = self.movement_per_id[top_id] + + if top_movement >= self.speaker_threshold: + self.confirmation_counter[top_id] = self.confirmation_counter.get(top_id, 0) + 1 + # Andere runterzählen + for other_id in self.confirmation_counter: + if other_id != top_id: + self.confirmation_counter[other_id] = max(0, self.confirmation_counter[other_id] - 1) + + # Wenn lange genug bestätigt, neuer Sprecher + if self.confirmation_counter[top_id] >= self.speaker_confirm_frames: + self.speaker_id = top_id + else: + # Wenn keiner über der Schwelle → kein neuer Sprecher + self.confirmation_counter = {k: max(0, v - 1) for k, v in self.confirmation_counter.items()} + + # Sprecher vorhanden → dahin zentrieren + if self.speaker_id is not None: + for face in faces: + if face.get("id") == self.speaker_id: + return tuple(face["center"]) + + # Fallback: stabiler Durchschnitt + centers = [tuple(face["center"]) for face in faces] + return self.face_tracker.update(centers) diff --git a/src/reformat/new/speaker_crop_from_segments.py b/src/reformat/new/speaker_crop_from_segments.py new file mode 100644 index 
0000000..5d25c3f --- /dev/null +++ b/src/reformat/new/speaker_crop_from_segments.py @@ -0,0 +1,67 @@ +import json +from pathlib import Path +from typing import List, Dict + +# === Pfade === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[2] + +FACES_PATH = PROJECT_ROOT / "data" / "face_data_combined" / "testVideoShort_faces.json" +SEGMENTS_PATH = PROJECT_ROOT / "data" / "transkripte" / "testVideoShort_segments.json" +OUTPUT_PATH = PROJECT_ROOT / "data" / "face_crop_centers" / "testVideoShort_centers.json" + +FPS = 25 # Muss zur Framerate deines Videos passen + +# === Dateien laden === +with open(FACES_PATH) as f: + face_data = json.load(f) + +with open(SEGMENTS_PATH) as f: + segments = json.load(f) + +# === Zentrierungen pro Frame bestimmen === +frame_centers: List[List[float]] = [] + +for segment in segments: + start_sec = segment["start"] + end_sec = segment["end"] + start_f = int(start_sec * FPS) + end_f = int(end_sec * FPS) + + # Lippenbewegung pro ID in diesem Segment aufaddieren + movement: Dict[int, float] = {} + count: Dict[int, int] = {} + + for f in range(start_f, min(end_f, len(face_data))): + for face in face_data[f]["faces"]: + id = face.get("id") + openness = face.get("mouth_openness", 0.0) + movement[id] = movement.get(id, 0.0) + openness + count[id] = count.get(id, 0) + 1 + + # Durchschnitt berechnen + avg_movement = {id: movement[id] / count[id] for id in movement if count[id] > 0} + if not avg_movement: + speaker_id = None + else: + speaker_id = max(avg_movement, key=avg_movement.get) + + # Für jedes Frame in diesem Segment den Sprecher zentrieren + for f in range(start_f, min(end_f, len(face_data))): + faces = face_data[f].get("faces", []) + center = [960.0, 540.0] # Fallback + + if speaker_id is not None: + for face in faces: + if face.get("id") == speaker_id: + center = face["center"][:2] + break + + frame_centers.append([round(center[0], 2), round(center[1], 2)]) + +# === Ergebnis speichern === +OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) +with open(OUTPUT_PATH, "w") as f: + json.dump(frame_centers, f, indent=2) + +print(f"✅ Zentrierung auf Sprecher für {len(frame_centers)} Frames gespeichert unter:\n{OUTPUT_PATH}") diff --git a/src/reformat/new/tracking.py b/src/reformat/new/tracking.py new file mode 100644 index 0000000..838b6c6 --- /dev/null +++ b/src/reformat/new/tracking.py @@ -0,0 +1,84 @@ +from typing import List, Tuple, Optional + + +class FaceTracker: + def __init__( + self, + dist_threshold: float = 200.0, + switch_frames: int = 5, + panning_window: int = 10, + panning_threshold: float = 40.0, + smooth_window: int = 3, + scene_jump_threshold: float = 400.0 + ): + self.dist_threshold = dist_threshold + self.switch_frames = switch_frames + self.panning_window = panning_window + self.panning_threshold = panning_threshold + self.smooth_window = smooth_window + self.scene_jump_threshold = scene_jump_threshold + + self.current_center: Tuple[float, float] = (960.0, 540.0) # Default Mitte (bei 1920x1080) + self.raw_center: Tuple[float, float] = self.current_center + self.prev_center: Tuple[float, float] = self.current_center + self.prev_raw: Tuple[float, float] = self.current_center + self.candidate_center: Optional[Tuple[float, float]] = None + self.switch_counter = 0 + + self.recent_raw_centers: List[Tuple[float, float]] = [] + self.recent_final_centers: List[Tuple[float, float]] = [] + + def update(self, candidates: List[Tuple[float, float]]) -> Tuple[float, float]: + if not candidates: + # kein Gesicht → verwende alten 
Wert + self.recent_raw_centers.append(self.raw_center) + self.recent_final_centers.append(self.current_center) + return self.current_center + + # nehme das Gesicht, das am nächsten zur vorherigen Position ist + new_center = min(candidates, key=lambda pt: self._distance(self.prev_center, pt)) + self.raw_center = new_center + self.recent_raw_centers.append(new_center) + + dist = self._distance(self.prev_raw, new_center) + if dist > self.scene_jump_threshold: + self.current_center = new_center + self.prev_center = new_center + self.prev_raw = new_center + self._smooth_reset() + return self.current_center + + if dist > self.dist_threshold: + if self.candidate_center != new_center: + self.candidate_center = new_center + self.switch_counter = 1 + else: + self.switch_counter += 1 + if self.switch_counter >= self.switch_frames: + self.prev_center = self.current_center + self.current_center = new_center + self.prev_raw = new_center + self.switch_counter = 0 + else: + self.switch_counter = 0 + self.prev_raw = new_center + + # Smoothes Nachziehen + smoothed = self._moving_average(self.current_center, new_center, self.smooth_window) + self.prev_center = self.current_center + self.current_center = smoothed + self.recent_final_centers.append(smoothed) + + return smoothed + + def _moving_average(self, old, new, factor): + x = (old[0] * (factor - 1) + new[0]) / factor + y = (old[1] * (factor - 1) + new[1]) / factor + return (x, y) + + def _distance(self, pt1, pt2): + return ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5 + + def _smooth_reset(self): + self.recent_raw_centers.clear() + self.recent_final_centers.clear() diff --git a/src/reformat/new/utils.py b/src/reformat/new/utils.py new file mode 100644 index 0000000..0ea37c6 --- /dev/null +++ b/src/reformat/new/utils.py @@ -0,0 +1,129 @@ +# src/utils.py +from __future__ import annotations +import json +import logging +import os +from pathlib import Path +from typing import Any, Dict, Tuple + +try: + import cv2 +except Exception: + cv2 = None # erlaubt Import ohne OpenCV (z.B. 
beim reinen Testen) + +# --- Logging --------------------------------------------------------------- + +def setup_logging(debug: bool = False) -> None: + level = logging.DEBUG if debug else logging.INFO + logging.basicConfig( + level=level, + format="%(asctime)s | %(levelname)s | %(message)s", + ) + +# --- Mathe/Helpers --------------------------------------------------------- + +def clamp(v: float, lo: float, hi: float) -> float: + return max(lo, min(hi, v)) + +def compute_crop_width(orig_w: int, orig_h: int, out_w: int = 1080, out_h: int = 1920) -> int: + # Für 9:16 Ziel: Breite = (9/16) * orig_h, standardmäßig 1080x1920 + return int((out_w / out_h) * orig_h) + +def iou(boxA, boxB) -> float: + """Berechnet Intersection-over-Union zweier Bounding-Boxes.""" + ax1, ay1, aw, ah = boxA + ax2, ay2 = ax1 + aw, ay1 + ah + bx1, by1, bw, bh = boxB + bx2, by2 = bx1 + bw, by1 + bh + + inter_x1 = max(ax1, bx1) + inter_y1 = max(ay1, by1) + inter_x2 = min(ax2, bx2) + inter_y2 = min(ay2, by2) + inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) + + union_area = aw * ah + bw * bh - inter_area + return inter_area / union_area if union_area > 0 else 0 + +# --- IO -------------------------------------------------------------------- + +def load_json(path: Path) -> Any: + if not path.exists(): + raise FileNotFoundError(f"Datei fehlt: {path}") + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + +def save_json(obj: Any, path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, ensure_ascii=False, indent=2) + +def ensure_exists(path: Path, what: str = "Datei/Ordner") -> None: + if not path.exists(): + raise FileNotFoundError(f"{what} nicht gefunden: {path}") + +# --- Video / Pfade --------------------------------------------------------- + +def get_fps(video_path: Path, fallback: float = 25.0) -> float: + if cv2 is None: + logging.warning("OpenCV nicht verfügbar – nutze FPS-Fallback %.2f", fallback) + return fallback + cap = cv2.VideoCapture(str(video_path)) + fps = cap.get(5) # cv2.CAP_PROP_FPS + cap.release() + if not fps or fps <= 1e-3: + logging.warning("Konnte FPS nicht lesen – nutze Fallback %.2f", fallback) + return fallback + return float(fps) + +def project_root_from(file: Path) -> Path: + # Dein Projekt nutzt häufig parents[2]; kapseln: + return file.resolve().parents[3] + +def resolve_paths(project_root: Path, base_name: str) -> Dict[str, Path]: + data = project_root / "data" + return { + "timed_path": data / "transkripte" / f"{base_name}_timed.txt", + "segments_path":data / "transkripte" / f"{base_name}_segments.json", + "faces_path": data / "face_data_combined" / f"{base_name}_faces.json", + "centers_path": data / "face_crop_centers" / f"{base_name}_centers.json", + "video_path": data / "output" / "raw_clips" / f"{base_name}.mp4", + "out_9x16_dir": project_root / "output" / "output_9x16_final", + "face_debug_dir": project_root / "output" / "debug" / "faces", + } + +def require_api_key(env_name: str = "OPENAI_API_KEY") -> str: + key = os.getenv(env_name) + if not key: + raise RuntimeError( + f"Umgebungsvariable {env_name} fehlt. 
" + f"Exportiere sie z.B.: export {env_name}='sk-...'") + return key + +# --- Simple smoothing for centers ------------------------------------------ + +from typing import List, Optional + +class CenterSmoother: + """Glättet Zentren mit Moving Average und optionaler Jump-Erkennung.""" + def __init__(self, window: int = 7, jump_thresh: float = 120.0): + self.window = window + self.jump_thresh = jump_thresh + self.buffer: List[Tuple[float, float]] = [] + self.prev: Optional[Tuple[float, float]] = None + + def push(self, cx: float, cy: float) -> Tuple[float, float]: + if self.prev is not None: + dx = abs(cx - self.prev[0]) + abs(cy - self.prev[1]) + if dx > self.jump_thresh: + # harter Cut: reset buffer + self.buffer.clear() + + self.buffer.append((cx, cy)) + if len(self.buffer) > self.window: + self.buffer.pop(0) + + avgx = sum(p[0] for p in self.buffer) / len(self.buffer) + avgy = sum(p[1] for p in self.buffer) / len(self.buffer) + self.prev = (avgx, avgy) + return self.prev diff --git a/src/reformat/old/analyze_crop_position.py b/src/reformat/old/analyze_crop_position.py new file mode 100644 index 0000000..33a832a --- /dev/null +++ b/src/reformat/old/analyze_crop_position.py @@ -0,0 +1,235 @@ +import argparse +import json +import logging +import math +import random +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + + +class FaceTracker: + def __init__( + self, + dist_threshold: float, + switch_frames: int, + panning_window: int, + panning_threshold: float, + smooth_window: int, + scene_jump_threshold: float, + ): + self.dist_threshold = dist_threshold + self.switch_frames = switch_frames + self.panning_window = panning_window + self.panning_threshold = panning_threshold + self.smooth_window = smooth_window + self.scene_jump_threshold = scene_jump_threshold + + self.current_center: Tuple[float, float] = (960.0, 540.0) + self.raw_center: Tuple[float, float] = self.current_center + self.prev_center: Tuple[float, float] = self.current_center + self.prev_raw: Tuple[float, float] = self.current_center + self.candidate_center: Optional[Tuple[float, float]] = None + self.switch_counter: int = 0 + self.last_speaker_set: bool = False + self.random_center: Optional[Tuple[float, float]] = None + + self.panning_buffer: List[float] = [] + self.smooth_buffer: List[Tuple[float, float]] = [] + + def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]: + valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None] + all_faces = [f for f in faces if f.get("center")] + + # Speaker tracking + if valid_faces: + self._update_speaker(valid_faces) + else: + self._retain_or_random_center(all_faces) + + # Panning detection + is_panning = self._detect_panning() + + # Smooth / moving average + center = self._smooth_center() + + return (int(center[0]), int(center[1])), is_panning + + def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None: + best = max(valid_faces, key=lambda x: x["mouth_openness"]) + cx, cy, *_ = best["center"] + new_center = (cx, cy) + + dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1]) + if dist < self.dist_threshold: + self.raw_center = new_center + self.candidate_center = None + self.switch_counter = 0 + else: + if ( + self.candidate_center is None + or math.hypot( + new_center[0] - self.candidate_center[0], new_center[1] - self.candidate_center[1] + ) + > self.dist_threshold + ): + self.candidate_center = new_center + self.switch_counter = 1 + else: + 
self.switch_counter += 1 + + if self.switch_counter >= self.switch_frames: + self.raw_center = self.candidate_center # type: ignore + self.candidate_center = None + self.switch_counter = 0 + + self.random_center = None + self.last_speaker_set = True + + def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None: + if self.last_speaker_set: + # keep previous raw_center + pass + elif self.random_center is not None: + self.raw_center = self.random_center + elif all_faces: + f = random.choice(all_faces) + cx, cy, *_ = f["center"] + self.random_center = (cx, cy) + self.raw_center = self.random_center + + def _detect_panning(self) -> bool: + dx = self.raw_center[0] - self.prev_raw[0] + self.panning_buffer.append(dx) + if len(self.panning_buffer) > self.panning_window: + self.panning_buffer.pop(0) + avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer) + is_panning = avg_dx > self.panning_threshold + self.prev_raw = self.raw_center + return is_panning + + def _smooth_center(self) -> Tuple[float, float]: + sudden_jump = ( + math.hypot( + self.raw_center[0] - self.prev_center[0], + self.raw_center[1] - self.prev_center[1], + ) + > self.scene_jump_threshold + ) + if not sudden_jump: + self.smooth_buffer.append(self.raw_center) + if len(self.smooth_buffer) > self.smooth_window: + self.smooth_buffer.pop(0) + avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer) + avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer) + center = (avg_x, avg_y) + else: + center = self.raw_center + self.smooth_buffer.clear() + + self.prev_center = center + return center + + +def parse_args() -> argparse.Namespace: + script_dir = Path(__file__).resolve().parent + project_root = script_dir.parents[1] + default_input = project_root / "data" / "face_data_combined" + default_output = project_root / "data" / "face_crop_centers" + + parser = argparse.ArgumentParser( + description="Track and smooth face crop centers based on mouth openness." 
+ ) + parser.add_argument( + "-i", "--input-dir", type=Path, + default=default_input, + help=f"Directory containing *_faces.json files (default: {default_input})" + ) + parser.add_argument( + "-o", "--output-dir", type=Path, + default=default_output, + help=f"Directory to save *_centers.json files (default: {default_output})" + ) + parser.add_argument( + "--dist-threshold", type=float, default=30.0, + help="Pixel distance threshold to switch speaker" + ) + parser.add_argument( + "--switch-frames", type=int, default=20, + help="Number of consecutive frames required to confirm speaker switch" + ) + parser.add_argument( + "--panning-window", type=int, default=30, + help="Frame window size for panning detection" + ) + parser.add_argument( + "--panning-threshold", type=float, default=3.0, + help="Average dx threshold for panning detection" + ) + parser.add_argument( + "--smooth-window", type=int, default=8, + help="Moving average window for smoothing" + ) + parser.add_argument( + "--scene-jump-threshold", type=float, default=300.0, + help="Jump threshold to detect scene cuts" + ) + return parser.parse_args() + + +def setup_logging() -> None: + logging.basicConfig( + format="%(asctime)s %(levelname)s: %(message)s", + level=logging.INFO, + ) + + +def main() -> None: + setup_logging() + args = parse_args() + + input_dir: Path = args.input_dir.resolve() + output_dir: Path = args.output_dir.resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + tracker = FaceTracker( + dist_threshold=args.dist_threshold, + switch_frames=args.switch_frames, + panning_window=args.panning_window, + panning_threshold=args.panning_threshold, + smooth_window=args.smooth_window, + scene_jump_threshold=args.scene_jump_threshold, + ) + + json_files = sorted(input_dir.glob("*_faces.json")) + if not json_files: + logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir) + return + + logging.info("Gefundene Dateien: %d", len(json_files)) + + for json_path in json_files: + logging.info("Verarbeite %s", json_path.name) + try: + frames_data = json.loads(json_path.read_text()) + except json.JSONDecodeError as e: + logging.error("JSON-Fehler in %s: %s", json_path.name, e) + continue + + out_data: List[Dict[str, Any]] = [] + for frame_idx, frame in enumerate(frames_data): + faces = frame.get("faces", []) + center, is_panning = tracker.process_frame(faces) + out_data.append({ + "frame": frame_idx, + "center": [center[0], center[1]], + "panning": is_panning, + }) + + out_path = output_dir / f"{json_path.stem.replace('_faces', '')}_centers.json" + with out_path.open("w") as f: + json.dump(out_data, f, indent=2) + logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data)) + + +if __name__ == "__main__": + main() diff --git a/src/reformat/old/crop_to_speaker.py b/src/reformat/old/crop_to_speaker.py new file mode 100644 index 0000000..553bbbd --- /dev/null +++ b/src/reformat/old/crop_to_speaker.py @@ -0,0 +1,180 @@ +import json +import cv2 +import subprocess +from pathlib import Path + +# === Pfade & globale Settings === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] + +INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +INPUT_CENTER_DIR = PROJECT_ROOT / "data" / "face_crop_centers" +INPUT_FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined" +OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +OUT_W, OUT_H = 1080, 1920 + +DEBUG_MODE = True +DEBUG_SCALE = 0.75 +# Ab welcher Offenheit wir 
"Bewegung" annehmen +DEBUG_MOUTH_THRESHOLD = 0.02 + +# === Hilfsfunktionen === +def clamp(v, lo, hi): + return max(lo, min(hi, v)) + +def compute_crop_width(orig_w, orig_h): + return int((OUT_W / OUT_H) * orig_h) + +# === Verarbeitung === +for center_path in sorted(INPUT_CENTER_DIR.glob("*_centers.json")): + stem = center_path.stem.replace("_centers", "") + video_path = INPUT_VIDEO_DIR / f"{stem}.mp4" + faces_path = INPUT_FACES_DIR / f"{stem}_faces.json" + + if not video_path.exists(): + print(f"⚠️ Video fehlt: {stem}.mp4") + continue + if not faces_path.exists(): + print(f"⚠️ Gesichtsdaten fehlen: {stem}_faces.json") + continue + + centers_data = json.loads(center_path.read_text()) + faces_data = json.loads(faces_path.read_text()) + + # Debug-Liste pro Video anlegen + if DEBUG_MODE: + debug_results: list = [] + + cap = cv2.VideoCapture(str(video_path)) + fps = cap.get(cv2.CAP_PROP_FPS) + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + crop_w = compute_crop_width(orig_w, orig_h) + crop_h = orig_h + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + temp_vid = OUTPUT_DIR / f"{stem}_cropped.mp4" + out_vid = cv2.VideoWriter(str(temp_vid), fourcc, fps, (OUT_W, OUT_H)) + if not out_vid.isOpened(): + print(f"❌ Kann nicht schreiben: {temp_vid.name}") + continue + + if DEBUG_MODE: + cv2.namedWindow("Debug", cv2.WINDOW_NORMAL) + + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret or frame_idx >= len(centers_data): + break + + # Crop-Infos + info = centers_data[frame_idx] + cx, cy = info["center"] + is_panning = info.get("panning", False) + if is_panning: + cx = orig_w // 2 + + x0 = int(cx - crop_w / 2) + x0 = clamp(x0, 0, orig_w - crop_w) + y0 = 0 + + # Ausschneiden + Resize + crop = frame[y0:y0+crop_h, x0:x0+crop_w] + if crop.shape[1] != crop_w or crop.shape[0] != crop_h: + crop = cv2.copyMakeBorder( + crop, 0, crop_h - crop.shape[0], + 0, crop_w - crop.shape[1], + cv2.BORDER_CONSTANT, value=[0, 0, 0] + ) + out_frame = cv2.resize(crop, (OUT_W, OUT_H), interpolation=cv2.INTER_LINEAR) + out_vid.write(out_frame) + + if DEBUG_MODE: + debug_frame = frame.copy() + frame_faces = faces_data[frame_idx].get("faces", []) + + # Build debug entry for this frame + dbg_faces = [] + for f in frame_faces: + # center und Offenheit + cx_f, cy_f = map(int, f["center"][:2]) + openness = f.get("mouth_openness", 0.0) + moving = openness > DEBUG_MOUTH_THRESHOLD + dbg_faces.append({ + "center": [cx_f, cy_f], + "mouth_openness": openness, + "mouth_moving": moving + }) + + # Anzeige im Debug-Fenster + cv2.circle(debug_frame, (cx_f, cy_f), 4, (180, 180, 180), -1) + cv2.putText( + debug_frame, + f"{round(openness,2)}", + (cx_f + 6, cy_f - 6), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + cv2.LINE_AA + ) + # roter Punkt, wenn Bewegung + color = (0,0,255) if moving else (0,255,255) + cv2.circle(debug_frame, (cx_f, cy_f), 6, color, 1) + + debug_results.append({ + "frame": frame_idx, + "faces": dbg_faces + }) + + # Haupt-Center & Crop-Rahmen + cv2.circle(debug_frame, (int(cx), int(cy)), 18, (0, 255, 0), 2) + cv2.rectangle(debug_frame, (x0, 0), (x0 + crop_w, crop_h), (0, 0, 255), 2) + + dbg = cv2.resize( + debug_frame, + (int(orig_w * DEBUG_SCALE), int(orig_h * DEBUG_SCALE)) + ) + cv2.imshow("Debug", dbg) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + + frame_idx += 1 + + cap.release() + out_vid.release() + if DEBUG_MODE: + cv2.destroyAllWindows() + + # Audio extrahieren & muxen + audio_tmp = OUTPUT_DIR / f"{stem}_audio.aac" + final_vid = OUTPUT_DIR 
/ f"{stem}.mp4" + try: + subprocess.run( + ["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "copy", str(audio_tmp)], + check=True + ) + subprocess.run( + ["ffmpeg", "-y", "-i", str(temp_vid), "-i", str(audio_tmp), + "-c:v", "copy", "-c:a", "aac", "-b:a", "128k", str(final_vid)], + check=True + ) + finally: + try: temp_vid.unlink() + except: pass + try: audio_tmp.unlink() + except: pass + + # Debug-JSON schreiben + if DEBUG_MODE: + dbg_path = OUTPUT_DIR / f"{stem}_debug.json" + with dbg_path.open("w") as f: + json.dump(debug_results, f, indent=2) + print(f"🛠️ Debug-Daten: {dbg_path.name}") + + print(f"✅ Finales Video: {final_vid.name}") + +print("\n🏁 Alle Videos fertig in:", OUTPUT_DIR.resolve()) diff --git a/src/reformat/old/detect_speaking_faces.py b/src/reformat/old/detect_speaking_faces.py new file mode 100644 index 0000000..f439d30 --- /dev/null +++ b/src/reformat/old/detect_speaking_faces.py @@ -0,0 +1,126 @@ +import json +from pathlib import Path +from collections import defaultdict +import numpy as np + +# === Einstellungen === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] +INPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined" +OUTPUT_PATH = INPUT_DIR / "dominant_faces.json" + +SEGMENT_LENGTH = 2.0 # Länge jedes Segments in Sekunden +MOUTH_THRESHOLD = 0.01 # minimale Mundöffnung, um einen Sprecher zu zählen +SMOOTH_WINDOW = 5 # Fenstergröße (in Segmenten) für Moving Average + +def analyze_clip_timed(path): + # 1) JSON einlesen + try: + data = json.loads(path.read_text()) + except Exception as e: + print(f"❌ Fehler beim Lesen von {path.name}: {e}") + return None + + # 2) Nur valide Frames verwenden + frames = [d for d in data if "timestamp" in d and isinstance(d.get("faces"), list)] + if not frames: + print(f"⚠️ Keine validen Frames in {path.name}") + return None + + frames.sort(key=lambda x: x["timestamp"]) + max_time = frames[-1]["timestamp"] + + # 3) Segmente erzeugen und dominanten Sprecher per Segment finden + segments = [] + t = 0.0 + while t < max_time: + t_end = t + SEGMENT_LENGTH + face_scores = defaultdict(list) # mouth_openness pro bbox + face_boxes = defaultdict(list) # raw bbox pro bbox-key + face_centers = defaultdict(list) # center [cx,cy,w,h] pro bbox-key + + # alle Frames durchsuchen, die in dieses Segment fallen + for f in frames: + ts = f["timestamp"] + if t <= ts < t_end: + for face in f["faces"]: + bbox = face["bbox"] # [x,y,w,h] + score = face.get("mouth_openness", 0.0) + center = face.get("center", None) # [cx,cy,w,h] + key = tuple(bbox) + + if score >= MOUTH_THRESHOLD and center is not None: + face_scores[key].append(score) + face_boxes[key].append(bbox) + face_centers[key].append(center) + + if face_scores: + # den Key mit dem höchsten Durchschnittsscore wählen + avg_scores = {k: np.mean(v) for k, v in face_scores.items()} + dominant_key = max(avg_scores, key=avg_scores.get) + + # mittlere Bounding‑Box und mittleres Center berechnen + avg_bbox = np.mean(face_boxes[dominant_key], axis=0).astype(int).tolist() + avg_center = np.mean(face_centers[dominant_key], axis=0).tolist() # [cx,cy,w,h] + + segments.append({ + "start": round(t, 2), + "end": round(t_end if t_end < max_time else max_time, 2), + "bbox": avg_bbox, + "center": [float(avg_center[0]), float(avg_center[1]), float(avg_center[2]), float(avg_center[3])] + }) + + t += SEGMENT_LENGTH + + if not segments: + print(f"⚠️ Keine Segmente für Clip {path.name}") + return None + + # 4) Glätten der Segment‑Zentren mit Moving Average + seg_centers = [s["center"] for 
s in segments] # Liste von [cx,cy,w,h] + sm_centers = [] + n = len(seg_centers) + half = SMOOTH_WINDOW // 2 + + for i in range(n): + start = max(0, i - half) + end = min(n, i + half + 1) + window = seg_centers[start:end] + avg = np.mean(window, axis=0) # ergibt [cx,cy,w,h] + sm_centers.append(avg.tolist()) + + # 5) Ausgabe des geglätteten Pfades in die Konsole + print(f"\n🔄 Smoothed path für Clip {path.stem}:") + for i, s in enumerate(segments): + cx, cy, w, h = sm_centers[i] + print(f" Segment {i} [{s['start']}–{s['end']}s]: " + f"center=({cx:.1f}, {cy:.1f}), size=({w:.1f}×{h:.1f})") + + # 6) Neue Felder für Ausgabe‑JSON bauen + sm_segments = [] + for i, s in enumerate(segments): + cx, cy, w, h = sm_centers[i] + x0 = int(cx - w/2) + y0 = int(cy - h/2) + sm_segments.append({ + "start": s["start"], + "end": s["end"], + "bbox": [x0, y0, int(w), int(h)] + }) + + return { + "clip": path.stem.replace("_faces", "") + ".mp4", + "segments": sm_segments + } + + +# === Hauptschleife über alle Clips === +results = [] +for json_file in sorted(INPUT_DIR.glob("*_faces.json")): + out = analyze_clip_timed(json_file) + if out: + results.append(out) + +OUTPUT_PATH.write_text(json.dumps(results, indent=2)) +print(f"\n✅ Analyse abgeschlossen – {len(results)} Clips erkannt.") +print(f"📄 Gespeichert in: {OUTPUT_PATH.resolve()}") diff --git a/src/reformat/old/grid_faces_from_yolo.py b/src/reformat/old/grid_faces_from_yolo.py new file mode 100644 index 0000000..b9de56f --- /dev/null +++ b/src/reformat/old/grid_faces_from_yolo.py @@ -0,0 +1,114 @@ +import json +import cv2 +import numpy as np +from pathlib import Path +from tqdm import tqdm +from collections import defaultdict, Counter +from sklearn.cluster import DBSCAN + +# === Einstellungen === +SCRIPT_DIR = Path(__file__).resolve().parent +VIDEO_DIR = SCRIPT_DIR.parents[1] / "output" +FACE_JSON_DIR = SCRIPT_DIR / "face_data_yolo" +OUTPUT_DIR = SCRIPT_DIR.parents[1] / "output_stacked_faces" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +OUT_WIDTH = 1080 +OUT_HEIGHT = 1920 +GRID_ROWS = 4 +FACE_CROP_HEIGHT = OUT_HEIGHT // GRID_ROWS +FACE_CROP_WIDTH = OUT_WIDTH + +# === Hilfsfunktion +def bbox_center(bbox): + x, y, w, h = bbox + return int(x + w // 2), int(y + h // 2) + +# === Hauptverarbeitung === +for json_path in tqdm(sorted(FACE_JSON_DIR.glob("*_faces.json")), desc="🔍 Erzeuge Grid-Clips"): + video_name = json_path.stem.replace("_faces", "") + ".mp4" + video_path = VIDEO_DIR / video_name + if not video_path.exists(): + print(f"❌ Video nicht gefunden: {video_name}") + continue + + data = json.loads(json_path.read_text()) + + # === Alle Gesichtszentren sammeln + all_faces = [] + for frame in data: + for face in frame["faces"]: + center = bbox_center(face["bbox"]) + all_faces.append((center, face["bbox"])) + + if not all_faces: + print(f"⚠️ Keine Gesichter erkannt in {video_name}") + continue + + # === Clustern + coords = [pos for pos, _ in all_faces] + clustering = DBSCAN(eps=80, min_samples=5).fit(coords) + cluster_labels = clustering.labels_ + label_counts = Counter(cluster_labels) + most_common_labels = [lbl for lbl, _ in label_counts.most_common(GRID_ROWS) if lbl != -1] + + if not most_common_labels: + print(f"⚠️ Keine gültigen Cluster in {video_name}") + continue + + # === Zuordnung: cluster_id → feste Zeile + cluster_faces = defaultdict(list) + for (_, bbox), label in zip(all_faces, cluster_labels): + if label in most_common_labels: + cluster_faces[label].append(bbox) + + def cluster_y(label): + return np.mean([bbox[1] for bbox in cluster_faces[label]]) + 
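
The clustering step just above drives the row assignment that follows: DBSCAN groups every face center observed across the clip, the GRID_ROWS largest clusters are kept, and each cluster is pinned to one output row ordered top-to-bottom by its mean y position. A minimal, self-contained sketch of that idea with synthetic centers (hypothetical values, not data from the pipeline):

```python
# Illustration only: synthetic face centers, hypothetical values (not pipeline data).
import numpy as np
from collections import Counter
from sklearn.cluster import DBSCAN

centers = np.array(
    [[400, 300]] * 30 +      # person A, stable position upper-left
    [[1500, 320]] * 30 +     # person B, stable position upper-right
    [[900, 900]]             # single stray detection -> DBSCAN noise (-1)
)

labels = DBSCAN(eps=80, min_samples=5).fit(centers).labels_
counts = Counter(int(l) for l in labels if l != -1)
top_labels = [lbl for lbl, _ in counts.most_common(2)]   # keep the GRID_ROWS biggest clusters

# Order clusters by mean y so the upper face ends up in row 0.
mean_y = {lbl: centers[labels == lbl][:, 1].mean() for lbl in top_labels}
label_to_row = {lbl: row for row, lbl in enumerate(sorted(top_labels, key=mean_y.get))}
print(label_to_row)   # e.g. {0: 0, 1: 1} -> cluster with the smaller mean y gets row 0
```
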
+ sorted_labels = sorted(most_common_labels, key=cluster_y) + label_to_row = {lbl: idx for idx, lbl in enumerate(sorted_labels)} + + # === cluster_id zu jedem Gesicht hinzufügen + for frame in data: + for face in frame["faces"]: + center = bbox_center(face["bbox"]) + distances = [np.linalg.norm(np.array(center) - np.array(c)) for c in coords] + nearest = np.argmin(distances) + label = cluster_labels[nearest] + face["cluster_id"] = label + + # === Video verarbeiten + cap = cv2.VideoCapture(str(video_path)) + fps = cap.get(cv2.CAP_PROP_FPS) + out_path = OUTPUT_DIR / video_name.replace(".mp4", "_stacked.mp4") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(str(out_path), fourcc, fps, (OUT_WIDTH, OUT_HEIGHT)) + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret or frame_idx >= len(data): + break + + output_frame = np.zeros((OUT_HEIGHT, OUT_WIDTH, 3), dtype=np.uint8) + for face in data[frame_idx]["faces"]: + label = face.get("cluster_id", -1) + if label not in label_to_row: + continue + row = label_to_row[label] + x, y, w, h = face["bbox"] + crop = frame[y:y+h, x:x+w] + if crop.size == 0: + continue + resized = cv2.resize(crop, (FACE_CROP_WIDTH, FACE_CROP_HEIGHT)) + y_offset = row * FACE_CROP_HEIGHT + output_frame[y_offset:y_offset+FACE_CROP_HEIGHT, :] = resized + + writer.write(output_frame) + frame_idx += 1 + + cap.release() + writer.release() + print(f"✅ Exportiert: {out_path.name}") + +print("🏁 Alle Grid-Videos fertig.") diff --git a/src/reformat/old/preview_faces.py b/src/reformat/old/preview_faces.py new file mode 100644 index 0000000..dc777fc --- /dev/null +++ b/src/reformat/old/preview_faces.py @@ -0,0 +1,75 @@ +import cv2 +import json +from pathlib import Path +from tqdm import tqdm + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_DIR = SCRIPT_DIR.parents[1] # ← geht von /src/reformat zu /BachlorArbeit + +FACES_DIR = PROJECT_DIR / "data" / "face_data_combined" +INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips" +OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# === Alle *_faces.json Dateien durchgehen === +face_files = sorted(FACES_DIR.glob("*_faces.json")) + +for face_file in tqdm(face_files, desc="🔍 Erzeuge Vorschau mit Sprechererkennung"): + clip_name = face_file.stem.replace("_faces", "") + ".mp4" + input_path = INPUT_VIDEO_DIR / clip_name + output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4") + + if not input_path.exists(): + print(f"❌ Clip nicht gefunden: {clip_name}") + continue + + # Video-Setup + cap = cv2.VideoCapture(str(input_path)) + fps = cap.get(cv2.CAP_PROP_FPS) + fps = fps if fps > 1 else 25 # fallback falls FPS = 0 + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fourcc = cv2.VideoWriter_fourcc(*"avc1") # Kompatibler als mp4v + out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + + # Gesichts-Daten laden + data = json.loads(face_file.read_text()) + data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]} + + print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}") + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + faces = data_by_frame.get(frame_idx, []) + speaker_idx = None + + # Sprecher anhand Mundöffnung + if faces and all("mouth_openness" in f for f in faces): + mouth_vals = [f["mouth_openness"] for f in faces] + if any(v > 0.01 for v in mouth_vals): # einfache Aktivitäts-Schwelle + 
speaker_idx = mouth_vals.index(max(mouth_vals)) + + for i, face in enumerate(faces): + x, y, w, h = face["bbox"] + color = (0, 255, 0) if i == speaker_idx else (255, 255, 255) + label = f"Mouth: {face.get('mouth_openness', 0):.2f}" + + # Debug-Ausgabe (optional) + print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}") + + cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2) + cv2.putText(frame, label, (x, y - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + out.write(frame) + frame_idx += 1 + + cap.release() + out.release() + print(f"✅ Vorschau exportiert: {output_path.name}") + +print("🏁 Alle Vorschauvideos mit Sprecherkennung erstellt.") diff --git a/src/reformat/old/track_faces.py b/src/reformat/old/track_faces.py new file mode 100644 index 0000000..f335069 --- /dev/null +++ b/src/reformat/old/track_faces.py @@ -0,0 +1,92 @@ +import cv2 +import mediapipe as mp +import json +from pathlib import Path +from tqdm import tqdm + +# === Einstellungen === +INPUT_DIR = Path(__file__).resolve().parents[2] / "output" +OUTPUT_DIR = Path(__file__).resolve().parent / "face_data" +OUTPUT_DIR.mkdir(exist_ok=True) +FRAME_SKIP = 1 # analysiere jeden Frame für maximale Genauigkeit +PADDING = 30 # Pixel Padding um Gesicht + +mp_face_mesh = mp.solutions.face_mesh + +# Erweiterte Lippen-Landmarks (innen) +TOP_LIPS = [13, 78, 82] +BOTTOM_LIPS = [14, 87, 317] + +def mouth_openness(landmarks, image_height): + try: + top_avg = sum([landmarks[i].y for i in TOP_LIPS]) / len(TOP_LIPS) + bottom_avg = sum([landmarks[i].y for i in BOTTOM_LIPS]) / len(BOTTOM_LIPS) + return abs(bottom_avg - top_avg) + except: + return 0.0 + +def process_video(path): + cap = cv2.VideoCapture(str(path)) + fps = cap.get(cv2.CAP_PROP_FPS) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + results = [] + + with mp_face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=5, + refine_landmarks=True, + min_detection_confidence=0.6, + min_tracking_confidence=0.6 + ) as face_mesh: + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + if frame_idx % FRAME_SKIP != 0: + frame_idx += 1 + continue + + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + output = face_mesh.process(rgb) + + faces = [] + if output.multi_face_landmarks: + for landmarks in output.multi_face_landmarks: + mouth = mouth_openness(landmarks.landmark, height) + + xs = [lm.x * width for lm in landmarks.landmark] + ys = [lm.y * height for lm in landmarks.landmark] + x1 = max(0, int(min(xs)) - PADDING) + y1 = max(0, int(min(ys)) - PADDING) + x2 = min(width, int(max(xs)) + PADDING) + y2 = min(height, int(max(ys)) + PADDING) + bbox = [x1, y1, x2 - x1, y2 - y1] + + faces.append({ + "bbox": bbox, + "mouth_openness": round(mouth, 4) + }) + + results.append({ + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 2), + "faces": faces + }) + + frame_idx += 1 + + cap.release() + out_path = OUTPUT_DIR / f"{path.stem}_faces.json" + out_path.write_text(json.dumps(results, indent=2)) + print(f"✅ {path.name} verarbeitet → {out_path.name}") + +# === Alle Videos im output/ Ordner durchgehen +videos = list(INPUT_DIR.glob("*.mp4")) +print(f"🎬 {len(videos)} Videos gefunden in: {INPUT_DIR}") + +for video in tqdm(videos): + process_video(video) diff --git a/src/reformat/old/track_faces_Yolo.py b/src/reformat/old/track_faces_Yolo.py new file mode 100644 index 0000000..d7f5d1f --- /dev/null +++ b/src/reformat/old/track_faces_Yolo.py @@ -0,0 +1,206 @@ +#!/usr/bin/env 
python3 +import argparse +import logging +import json +from pathlib import Path + +import cv2 +from ultralytics import YOLO +import mediapipe as mp + +# === Pfade und Standardwerte === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] +DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined" +DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt" + +# Stelle sicher, dass das Standard-Output-Verzeichnis existiert +DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# === Landmarks für Lippen === +TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409] +BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291] + + +def get_mouth_openness(landmarks, image_height): + """ + Berechnet die Mundöffnung in Pixeln basierend auf normierten Landmark-Koordinaten. + """ + top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS) + bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS) + return abs(bottom_avg - top_avg) * image_height + + +def iou(boxA, boxB): + """Berechnet Intersection-over-Union zweier Bounding-Boxes im Format (x, y, w, h).""" + ax1, ay1, aw, ah = boxA + ax2, ay2 = ax1 + aw, ay1 + ah + bx1, by1, bw, bh = boxB + bx2, by2 = bx1 + bw, by1 + bh + + inter_x1 = max(ax1, bx1) + inter_y1 = max(ay1, by1) + inter_x2 = min(ax2, bx2) + inter_y2 = min(ay2, by2) + inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) + + union_area = aw * ah + bw * bh - inter_area + return inter_area / union_area if union_area > 0 else 0 + + +def process_video( + video_path: Path, + output_path: Path, + model: YOLO, + face_mesh: mp.solutions.face_mesh.FaceMesh, + conf_thresh: float, + frame_skip: int, + downscale: float, +): + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + logging.error(f"Kann Video nicht öffnen: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale) + + # JSON-Ausgabe mit Streaming + with output_path.open('w', encoding='utf-8') as f_out: + f_out.write('[\n') + first = True + frame_idx = 0 + + while True: + ret, frame = cap.read() + if not ret: + break + if frame_skip > 1 and frame_idx % frame_skip != 0: + frame_idx += 1 + continue + + if downscale != 1.0: + frame = cv2.resize(frame, (width, height)) + + detections = model(frame, verbose=False)[0] + yolo_boxes = [] + for box in detections.boxes: + conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf) + if conf < conf_thresh: + continue + coords = box.xyxy[0].cpu().numpy() + x1, y1, x2, y2 = map(int, coords) + yolo_boxes.append([x1, y1, x2 - x1, y2 - y1]) + + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + mp_result = face_mesh.process(rgb) + mp_faces = [] + if mp_result.multi_face_landmarks: + for landmarks in mp_result.multi_face_landmarks: + mouth_px = get_mouth_openness(landmarks.landmark, height) + xs = [lm.x * width for lm in landmarks.landmark] + ys = [lm.y * height for lm in landmarks.landmark] + x1, y1 = int(min(xs)), int(min(ys)) + x2, y2 = int(max(xs)), int(max(ys)) + mp_faces.append({ + "bbox": [x1, y1, x2 - x1, y2 - y1], + "mouth_openness": round(mouth_px, 1) + }) + + combined = [] + for yb in yolo_boxes: + if mp_faces: + best = max(mp_faces, key=lambda m: iou(yb, m["bbox"])) + best_iou = iou(yb, best["bbox"]) + mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0 + else: + mouth = 0.0 + + x, y, 
w, h = yb + cx, cy = x + w / 2, y + h / 2 + combined.append({ + "bbox": yb, + "mouth_openness": round(mouth, 1), + "center": [round(cx, 1), round(cy, 1), w, h] + }) + + result = { + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 3), + "faces": combined + } + + if not first: + f_out.write(',\n') + json.dump(result, f_out, ensure_ascii=False) + first = False + frame_idx += 1 + + f_out.write('\n]') + + cap.release() + logging.info(f"Verarbeitet: {video_path.name} → {output_path.name}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyse von Videos: Gesichter und Mundöffnung erkennen" + ) + parser.add_argument( + "--input-dir", type=Path, + default=DEFAULT_INPUT_DIR, + help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})" + ) + parser.add_argument( + "--output-dir", type=Path, + default=DEFAULT_OUTPUT_DIR, + help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})" + ) + parser.add_argument( + "--model", type=Path, + default=DEFAULT_MODEL_PATH, + help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})" + ) + parser.add_argument( + "--conf-thresh", type=float, default=0.5, + help="Schwelle für YOLO-Confidence" + ) + parser.add_argument( + "--frame-skip", type=int, default=1, + help="Nur jede n-te Frame verarbeiten" + ) + parser.add_argument( + "--downscale", type=float, default=1.0, + help="Skalierungsfaktor für Frames" + ) + args = parser.parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + args.output_dir.mkdir(parents=True, exist_ok=True) + + yolo = YOLO(str(args.model)) + face_mesh = mp.solutions.face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=5, + refine_landmarks=True, + min_detection_confidence=0.5, + min_tracking_confidence=0.5 + ) + + for video_path in sorted(args.input_dir.glob("*.mp4")): + out_path = args.output_dir / f"{video_path.stem}_faces.json" + process_video( + video_path, + out_path, + yolo, + face_mesh, + args.conf_thresh, + args.frame_skip, + args.downscale, + ) + + +if __name__ == "__main__": + main() diff --git a/src/reformat/speaking.py b/src/reformat/speaking.py new file mode 100644 index 0000000..4d7b83c --- /dev/null +++ b/src/reformat/speaking.py @@ -0,0 +1,12 @@ +# src/speaking.py + +TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409] +BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291] + +def get_mouth_openness(landmarks, image_height): + """ + Berechnet die Mundöffnung basierend auf MediaPipe-Landmarks. + """ + top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS) + bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS) + return abs(bottom_avg - top_avg) * image_height diff --git a/src/subtitles/add_subtitles.py b/src/subtitles/add_subtitles.py new file mode 100644 index 0000000..2f3448c --- /dev/null +++ b/src/subtitles/add_subtitles.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +add_subtitles.py — TikTok-Word-Caps mit OpenAI Whisper (CPU) +- läuft Ordner-weise über 9:16-Kurzclips +- transkribiert mit word_timestamps=True +- erzeugt ASS (ein Wort pro Zeile, Pop-Animation, Bottom-Center) +- brennt via ffmpeg in *_subtitled.mp4 +""" + +import os +import re +import glob +import json +import subprocess +import tempfile +import traceback +import argparse +from typing import List, Tuple, Optional +from pathlib import Path +import sys + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. 
unter src/subtitles/) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import CROPPED_DIR, SUBTITLED_DIR # zentrale Pfade + +# --- Stabil auf CPU (vermeidet MPS-Sparse-Fehler) --- +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "" + +def log(*a): print("[LOG]", *a) +def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True) + +def has_audio_stream(video_path: str) -> bool: + cmd = ["ffprobe","-v","error","-select_streams","a","-show_entries","stream=index","-of","json",video_path] + try: + out = subprocess.check_output(cmd).decode("utf-8") + data = json.loads(out) + return bool(data.get("streams")) + except Exception: + return False + +def load_whisper_cpu(model_name: str): + import whisper # openai-whisper + device = "cpu" + model = whisper.load_model(model_name, device=device) + fp16 = False + return model, device, fp16 + +def transcribe_words_whisper(model, media_path: str, language: Optional[str], fp16: bool) -> List[Tuple[float,float,str]]: + """ + Nutzt 'openai-whisper' mit word_timestamps=True. + Fallback: wenn 'words' fehlen, werden Segmenttexte approx. auf Wörter verteilt. + """ + result = model.transcribe( + media_path, + language=language, + task="transcribe", + word_timestamps=True, + condition_on_previous_text=False, + verbose=False, + fp16=fp16 + ) + words: List[Tuple[float,float,str]] = [] + segs = result.get("segments", []) or [] + for seg in segs: + wlist = seg.get("words") + if isinstance(wlist, list) and wlist and all(isinstance(w, dict) for w in wlist): + for w in wlist: + t = (w.get("word") or w.get("text") or "").strip() + if not t: + continue + ws = w.get("start"); we = w.get("end") + if ws is None or we is None: + continue + t = re.sub(r"\s+", " ", t) + if t: + words.append((float(ws), float(we), t)) + else: + # Fallback: Segment auf Wörter aufteilen & Zeiten gleichmäßig verteilen + text = (seg.get("text") or "").strip() + if not text: + continue + seg_start = float(seg.get("start", 0.0)) + seg_end = float(seg.get("end", seg_start)) + toks = [w for w in re.split(r"(\s+)", text) if w.strip()] + if not toks or seg_end <= seg_start: + continue + dur = seg_end - seg_start + step = dur / len(toks) + for i, tok in enumerate(toks): + ws = seg_start + i * step + we = seg_start + (i+1) * step + words.append((ws, we, tok)) + return words + +def ass_time(t: float) -> str: + if t < 0: t = 0 + h = int(t // 3600); m = int((t % 3600)//60); s = int(t % 60); cs = int(round((t - int(t))*100)) + return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}" + +def write_ass_words(words: List[Tuple[float,float,str]], ass_path: Path, font_size: int, margin_v: int, uppercase: bool): + """ + Ein Wort pro Zeile, ohne Überlappung: + - Ende = min(eigene Endzeit, Start nächstes Wort - 0.02) + - Pop-Animation 150ms, fette Outline, Bottom-Center (PlayResY=1920) + """ + header = f"""[Script Info] +ScriptType: v4.00+ +Collisions: Normal +PlayResX: 1080 +PlayResY: 1920 +ScaledBorderAndShadow: yes + +[V4+ Styles] +Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding +Style: WordCap,Inter,{font_size},&H00FFFFFF,&H00FFFFFF,&H00101010,&H64000000,1,0,0,0,100,100,0,0,1,6,0.8,2,80,80,{margin_v},1 + +[Events] +Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text +""" + # Zeiten glätten, damit immer nur ein Wort sichtbar ist + adjusted = 
[] + for i, (s, e, t) in enumerate(words): + nstart = words[i+1][0] if i+1 < len(words) else e + new_end = min(e, nstart - 0.02) if nstart > s else e + if new_end <= s: + new_end = s + 0.06 + adjusted.append((s, new_end, t)) + + with open(ass_path, "w", encoding="utf-8") as f: + f.write(header) + for s, e, t in adjusted: + st, en = ass_time(s), ass_time(e) + txt = t.upper() if uppercase else t + # \fs sichere Größe, \blur für weiche Outline, \fad Ein/Aus, + # \fscx135\fscy135 → Start groß, \t(...) schrumpft in 150ms auf 100% = Pop + overrides = r"\blur1\bord8\1c&H0000FFFF&\3c&H000000&\4c&H000000&\fad(50,20)\fscx135\fscy135\t(0,150,\fscx100\fscy100)" + f.write(f"Dialogue: 0,{st},{en},WordCap,,0,0,0,,{{{overrides}}}{txt}\n") + +def ffmpeg_escape_for_subtitles(path: Path) -> str: + """ + Pfad für -vf subtitles=… escapen (für Leerzeichen, Doppelpunkte etc.). + ffmpeg erwartet Backslash-escaping für Filter-Argumente. + """ + s = str(path) + s = s.replace("\\", "\\\\") + s = s.replace(":", "\\:") + s = s.replace("'", "\\'") + s = s.replace(",", "\\,") + s = s.replace("[", "\\[") + s = s.replace("]", "\\]") + s = s.replace(";", "\\;") + s = s.replace("=", "\\=") + return s + +def burn(video_in: Path, ass_file: Path, out_path: Path, crf=18, preset="medium") -> int: + vf = f"subtitles={ffmpeg_escape_for_subtitles(ass_file)}" + cmd = [ + "ffmpeg","-y","-i",str(video_in), + "-vf", vf, + "-c:v","libx264","-preset",preset,"-crf",str(crf), + "-c:a","copy", + str(out_path) + ] + log("FFmpeg:", " ".join(cmd)) + return subprocess.call(cmd) + +def parse_args(): + p = argparse.ArgumentParser(description="Brennt Word-Caps (ASS) via Whisper-Transkription in 9:16-Clips.") + p.add_argument("--clips_dir", type=Path, default=CROPPED_DIR, help=f"Quellordner (Default: {CROPPED_DIR})") + p.add_argument("--out_dir", type=Path, default=SUBTITLED_DIR, help=f"Zielordner (Default: {SUBTITLED_DIR})") + p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster (Default: *.mp4)") + p.add_argument("--limit", type=int, default=None, help="Nur die ersten N Clips verarbeiten") + p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "medium"), help="Whisper-Modell") + p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. de, en, None=Auto)") + p.add_argument("--uppercase", action="store_true", help="Text in Großbuchstaben rendern") + p.add_argument("--font_size", type=int, default=108, help="ASS-Fontgröße") + p.add_argument("--margin_v", type=int, default=320, help="ASS-MarginV (Abstand vom unteren Rand)") + p.add_argument("--crf", type=int, default=18, help="ffmpeg CRF (Qualität)") + p.add_argument("--preset", type=str, default="medium", help="ffmpeg Preset") + return p.parse_args() + +def main(): + args = parse_args() + + clips_dir = args.clips_dir + output_dir = args.out_dir + ensure_dir(output_dir) + + log("Starte TikTok Word-Caps (Whisper)") + log("CLIPS_DIR =", clips_dir) + log("OUTPUT_DIR =", output_dir) + + clips: List[str] = [] + for pat in (args.pattern,): + clips += glob.glob(str(clips_dir / pat)) + clips.sort() + log(f"{len(clips)} Clips gefunden.") + if args.limit: + clips = clips[:args.limit] + log(f"LIMIT aktiv: {args.limit}") + + if not clips: + log("Keine Clips gefunden. 
Pfad/Pattern checken.") + return + + # Whisper laden (CPU) + try: + model, device, fp16 = load_whisper_cpu(args.model) + log(f"Whisper geladen: {args.model} auf {device} (fp16={fp16})") + log("Hinweis: Beim ersten Lauf kann das Modell nachgeladen werden.") + except Exception as e: + print("[ERROR] Whisper konnte nicht geladen werden:", e) + traceback.print_exc() + return + + lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang + + for clip in clips: + base = os.path.basename(clip) + stem, _ = os.path.splitext(base) + log("="*60) + log("Clip:", base) + + if not has_audio_stream(clip): + log("WARN: Keine Audio-Spur → übersprungen.") + continue + + # Transkription + try: + log("Transkription startet …") + words = transcribe_words_whisper(model, clip, language=lang, fp16=fp16) + log(f"Transkription fertig. {len(words)} Wörter.") + if not words: + log("WARN: 0 Wörter erkannt → übersprungen.") + continue + except Exception as e: + print("[ERROR] Transkription fehlgeschlagen:", e) + traceback.print_exc() + continue + + # ASS erzeugen & brennen + with tempfile.NamedTemporaryFile(suffix=".ass", delete=False) as tmp: + ass_path = Path(tmp.name) + try: + log("Erzeuge ASS …") + write_ass_words(words, ass_path, font_size=args.font_size, margin_v=args.margin_v, uppercase=args.uppercase) + out_path = output_dir / f"{stem}_subtitled.mp4" + log("Brenne Untertitel …") + rc = burn(Path(clip), ass_path, out_path, crf=args.crf, preset=args.preset) + if rc == 0: + log("OK:", out_path) + else: + log("ERROR: ffmpeg fehlgeschlagen, code", rc) + finally: + try: ass_path.unlink(missing_ok=True) + except Exception: pass + + log("Fertig.") + +if __name__ == "__main__": + main() diff --git a/src/subtitles/run_subtitles.py b/src/subtitles/run_subtitles.py new file mode 100644 index 0000000..1ce2f40 --- /dev/null +++ b/src/subtitles/run_subtitles.py @@ -0,0 +1,25 @@ +import os +import tempfile +from add_subtitles import process # wir nutzen die Logik aus dem großen Skript + +# ==== HIER EINSTELLEN ==== +VIDEO_PATH = "data/input.mp4" # Dein Video +TRANSCRIPT_PATH = "data/transcript.srt" # Oder .json (Whisper) +OUTPUT_DIR = "data/output" # Ordner für Ergebnisse +CLIPS_PATH = None # Optional: "data/clips.csv" oder "data/clips.json" +CRF = 18 +PRESET = "medium" +STYLE = r"\\bord4\\shad4\\outline3\\fs64\\b1\\1c&HFFFFFF&\\3c&H000000&\\4c&H000000&" +# ========================== + +if __name__ == "__main__": + os.makedirs(OUTPUT_DIR, exist_ok=True) + process( + video_path=VIDEO_PATH, + transcript_path=TRANSCRIPT_PATH, + output_dir=OUTPUT_DIR, + clips_path=CLIPS_PATH, + crf=CRF, + preset=PRESET, + style_overrides=STYLE, + ) diff --git a/src/text/cutClips.py b/src/text/cutClips.py new file mode 100644 index 0000000..ae314b8 --- /dev/null +++ b/src/text/cutClips.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# cutClips.py — exportiert Clips aus dem ersten gefundenen Video oder aus angegebener Datei + +from pathlib import Path +import sqlite3 +import argparse +from moviepy.video.io.VideoFileClip import VideoFileClip +import sys + +# ── Projektwurzel in sys.path aufnehmen +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import INPUT_DIR, RAW_CLIPS_DIR, DB_PATH + + +def parse_args(): + p = argparse.ArgumentParser(description="Exportiert Highlights aus dem Video gemäß SQLite-DB.") + p.add_argument("--file", type=str, default=None, + help="Name der Input-Datei im INPUT_DIR. 
Wenn leer, wird das erste Video im Ordner verwendet.") + p.add_argument("--limit", type=int, default=10, + help="Anzahl der zu exportierenden Clips (Default: 10)") + p.add_argument("--order", type=str, choices=["score", "start"], default="score", + help="Sortierung: 'score' (score_total absteigend) oder 'start' (zeitlich).") + return p.parse_args() + + +def find_first_video(directory: Path): + """Suche nach der ersten Videodatei im Verzeichnis (mp4, mov, mkv).""" + for ext in ("*.mp4", "*.mov", "*.mkv"): + files = sorted(directory.glob(ext)) + if files: + return files[0] + return None + + +def main(): + args = parse_args() + + # === Eingabevideo bestimmen === + if args.file: + input_video = INPUT_DIR / args.file + else: + input_video = find_first_video(INPUT_DIR) + if not input_video: + raise FileNotFoundError(f"🚫 Kein Video im Eingabeordner {INPUT_DIR} gefunden.") + print(f"📂 Kein --file angegeben → verwende automatisch: {input_video.name}") + + if not input_video.exists(): + raise FileNotFoundError(f"🚫 Input-Video nicht gefunden: {input_video}") + + output_dir = RAW_CLIPS_DIR + output_dir.mkdir(parents=True, exist_ok=True) + + # === SQLite DB lesen === + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + + order_clause = "ORDER BY score_total DESC" if args.order == "score" else "ORDER BY start ASC" + cursor.execute(f""" + SELECT start, end, text + FROM highlights + {order_clause} + LIMIT ? + """, (args.limit,)) + highlights = cursor.fetchall() + + if not highlights: + print("⚠️ Keine Highlights in der Datenbank gefunden.") + conn.close() + return + + # === Video laden === + video = VideoFileClip(str(input_video)) + + # === Clips schneiden === + for i, (start, end, text) in enumerate(highlights, start=1): + if start >= video.duration: + print(f"⚠️ Clip {i} übersprungen – Startzeit {start:.2f}s liegt außerhalb der Videolänge ({video.duration:.2f}s).") + continue + + end = min(end, video.duration) + output_file = output_dir / f"highlight_{i}.mp4" + print(f"🎬 Exportiere Clip {i}: {start:.2f}s – {end:.2f}s → {output_file.name}") + + try: + clip = video.subclipped(start, end) + clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac", logger=None) + clip.close() + except Exception as e: + print(f"❌ Fehler beim Export von Clip {i}: {e}") + + # === Cleanup === + conn.close() + video.close() + print(f"✅ {len(highlights)} Clips exportiert nach {output_dir}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code/text/rateCluster.py b/src/text/rateCluster.py similarity index 69% rename from code/text/rateCluster.py rename to src/text/rateCluster.py index 0c9cf07..fe5e2f6 100644 --- a/code/text/rateCluster.py +++ b/src/text/rateCluster.py @@ -2,44 +2,41 @@ import sqlite3 import re from openai import OpenAI from time import sleep +from pathlib import Path +import os + +from pathlib import Path +import sys + +# Projekt-Root einfügen (2 Ebenen hoch von src/* ausgehend) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import DB_PATH + -# === Einstellungen === -DB_PATH = "clips_openai.db" -VIDEO_ID = "testVideoShort" MAX_CLIPS = 5 # oder "all" -OPENAI_API_KEY = "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" -client = OpenAI(api_key=OPENAI_API_KEY) +# === OPENAI-CLIENT (API-Key aus Env) === +if not os.getenv("OPENAI_API_KEY"): + raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung") 
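
With this change the API key is read from the environment instead of being hard-coded; note that the previously hard-coded key is still visible in the removed line (and in Git history) and should be rotated. A minimal sketch of one way to supply the key locally, assuming the optional python-dotenv package, which is not part of the project's stated requirements:

```python
# Sketch: load the key from a local .env file; python-dotenv is an assumption here,
# `export OPENAI_API_KEY='sk-...'` in the shell before running the scripts works too.
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()                      # picks up a .env in the working directory, if present
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY fehlt in der Umgebung")
client = OpenAI(api_key=api_key)
```
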
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) # === DB-Verbindung conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() -cursor.execute("DROP TABLE IF EXISTS highlights") +# === Unbewertete Highlights laden cursor.execute(""" -CREATE TABLE highlights ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file TEXT, - start REAL, - end REAL, - text TEXT, - viralitaet INTEGER, - emotionalitaet INTEGER, - witz INTEGER, - provokation INTEGER, - score_total INTEGER -) + SELECT id, start, end, text FROM highlights + WHERE viralitaet IS NULL OR emotionalitaet IS NULL + ORDER BY start """) -conn.commit() -print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}") - -# === Segmente laden -cursor.execute("SELECT start, end, text FROM segments ORDER BY start") segments = cursor.fetchall() -print(f"📥 {len(segments)} Segmente (Originaltext) geladen.") +print(f"📥 {len(segments)} unbewertete Highlights geladen.") # === Bewertungsfunktion (GPT-4o) -def analyse_segment(text, start, end): +def analyse_segment(clip_id, text, start, end): print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s") prompt = f""" @@ -86,19 +83,19 @@ Provokation: [Zahl] if all(v is not None for v in values.values()): total_score = sum(values.values()) cursor.execute(""" - INSERT INTO highlights ( - file, start, end, text, - viralitaet, emotionalitaet, witz, provokation, score_total - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + UPDATE highlights SET + viralitaet = ?, emotionalitaet = ?, witz = ?, provokation = ?, score_total = ? + WHERE id = ? """, ( - VIDEO_ID, start, end, text.strip(), values["viralitaet"], values["emotionalitaet"], values["witz"], values["provokation"], - total_score + total_score, + clip_id )) conn.commit() return { + "id": clip_id, "start": start, "end": end, "text": text.strip(), @@ -113,8 +110,8 @@ Provokation: [Zahl] # === Clips bewerten rated = [] -for start, end, text in segments: - result = analyse_segment(text, float(start), float(end)) +for clip_id, start, end, text in segments: + result = analyse_segment(clip_id, text, float(start), float(end)) if result: rated.append(result) sleep(1.2) # Anti-Rate-Limit @@ -123,7 +120,7 @@ for start, end, text in segments: rated.sort(key=lambda x: x["total"], reverse=True) selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)] -print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}") +print(f"\n🎬 Beste {len(selected)} Highlights nach Bewertung:") for clip in selected: print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s") print(f"🎙️ {clip['text'][:200]}...") diff --git a/src/text/segment_transcript.py b/src/text/segment_transcript.py new file mode 100644 index 0000000..7e8e577 --- /dev/null +++ b/src/text/segment_transcript.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# clip_selector_optimized.py — word-based text rebuild (no duplicates) + +import os +import re +import json +import sqlite3 +import time +from pathlib import Path +from datetime import datetime +import argparse +import sys +from typing import List, Dict, Optional + +from openai import OpenAI + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript kann z. B. unter src/text/ liegen) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import TRANSCRIPTS_DIR, DB_PATH # zentrale Pfade + +LOG_DIR = ROOT / "logs" +LOG_DIR.mkdir(exist_ok=True, parents=True) + +# === DEFAULTS (per CLI überschreibbar) === +DEFAULT_BLOCK_DURATION = 300.0 # Sek. 
pro Block +DEFAULT_MIN_CLIP_LEN = 30.0 # konsistent mit Prompt +DEFAULT_MAX_CLIP_LEN = 90.0 + +# === OPENAI-CLIENT (API-Key aus Env) === +API_KEY = os.getenv("OPENAI_API_KEY") +if not API_KEY: + raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung") +OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # bei Bedarf überschreiben +client = OpenAI(api_key=API_KEY) + +# ────────────────────────────────────────────────────────────────────────────── +# Hilfsfunktionen +# ────────────────────────────────────────────────────────────────────────────── + +def log_text(filename: str, content: str) -> None: + (LOG_DIR / filename).write_text((content or "").strip(), encoding="utf-8") + +def append_error_log(content: str) -> None: + with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f: + f.write(f"{datetime.now().isoformat()} {content}\n\n") + +def extract_json(text: str) -> list: + """Nur für Debug: versucht JSON-Array aus beliebigem Text zu extrahieren.""" + txt = (text or "").strip() + txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.IGNORECASE | re.DOTALL) + m = re.search(r"\[\s*{.*?}\s*\]", txt, re.DOTALL) + if not m: + append_error_log(f"❌ Kein JSON-Array gefunden.\n{txt}") + return [] + try: + return json.loads(m.group(0)) + except Exception as e: + append_error_log(f"❌ JSON-Fehler: {e}\n{txt}") + return [] + +def get_json_snippets_for_clip(start: float, end: float, segment_json: List[Dict]) -> List[Dict]: + """halb-offenes Fenster [start, end)""" + return [s for s in segment_json if not (float(s["end"]) <= start or float(s["start"]) >= end)] + +def _norm_space(s: str) -> str: + return re.sub(r"\s+", " ", (s or "").strip()) + +def explode_segments_to_words(segments: List[Dict]) -> List[Dict]: + """ + Baut eine globale Wortliste. Bevorzugt echte 'words' aus JSON, + fällt ansonsten auf lineare Interpolation über Segmentdauer zurück. + Ausgabe-Items: {idx, mid, text} + """ + words = [] + idx = 0 + for seg in sorted(segments, key=lambda s: (float(s["start"]), float(s["end"]))): + s0, s1 = float(seg["start"]), float(seg["end"]) + txt = (seg.get("text") or "").strip() + seg_words = seg.get("words") or [] + if seg_words: + for w in seg_words: + t = (w.get("text") or w.get("word") or "").strip() + if not t: + continue + w0 = float(w["start"]); w1 = float(w["end"]) + words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": t}) + idx += 1 + else: + toks = txt.split() + n = len(toks) + if n == 0: + continue + dur = max(1e-6, s1 - s0) + for i, tok in enumerate(toks): + w0 = s0 + (i / n) * dur + w1 = s0 + ((i + 1) / n) * dur + words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": tok}) + idx += 1 + return words + +def build_text_strict_from_words(clip_start: float, clip_end: float, WORDS: List[Dict]) -> str: + """Nimmt jedes Wort genau einmal, wenn mid ∈ [start, end).""" + sel = [w for w in WORDS if clip_start <= w["mid"] < clip_end] + sel.sort(key=lambda w: w["idx"]) + return _norm_space(" ".join(w["text"] for w in sel)) + +def find_transcript_pair(base: Optional[str]) -> tuple[Path, Path, str]: + """ + Finde (timed.txt, segments.json) in TRANSCRIPTS_DIR. + - Wenn base übergeben: benutzt {base}_timed.txt und {base}_segments.json. + - Sonst: nimmt das lexikographisch erste *_timed.txt und leitet die JSON davon ab. 
+ """ + if base: + txt = TRANSCRIPTS_DIR / f"{base}_timed.txt" + jsn = TRANSCRIPTS_DIR / f"{base}_segments.json" + if not txt.exists(): + raise FileNotFoundError(f"Transkript nicht gefunden: {txt}") + if not jsn.exists(): + raise FileNotFoundError(f"Segment-JSON nicht gefunden: {jsn}") + return txt, jsn, base + + # auto-detect + candidates = sorted(TRANSCRIPTS_DIR.glob("*_timed.txt")) + if not candidates: + raise FileNotFoundError(f"Keine *_timed.txt in {TRANSCRIPTS_DIR} gefunden.") + txt = candidates[0] + stem = txt.stem.replace("_timed", "") + jsn = TRANSCRIPTS_DIR / f"{stem}_segments.json" + if not jsn.exists(): + raise FileNotFoundError(f"Gefundenes TXT: {txt.name}, aber JSON fehlt: {jsn.name}") + return txt, jsn, stem + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser(description="Selektiert Social-Media-taugliche Clips aus Transkripten (LLM-gestützt).") + p.add_argument("--base", type=str, default=None, + help="Basename der Transkriptdateien (z. B. 'testVideoShort' für *_timed.txt und *_segments.json).") + p.add_argument("--block", type=float, default=DEFAULT_BLOCK_DURATION, help="Blocklänge in Sekunden für die Prompt-Bildung.") + p.add_argument("--min", type=float, default=DEFAULT_MIN_CLIP_LEN, help="Minimale Clip-Länge (Sekunden).") + p.add_argument("--max", type=float, default=DEFAULT_MAX_CLIP_LEN, help="Maximale Clip-Länge (Sekunden).") + return p.parse_args() + +# ────────────────────────────────────────────────────────────────────────────── +# Main +# ────────────────────────────────────────────────────────────────────────────── + +def main(): + args = parse_args() + BLOCK_DURATION = float(args.block) + MIN_CLIP_LEN = float(args.min) + MAX_CLIP_LEN = float(args.max) + + # --- Transkriptdateien finden --- + TRANSCRIPT_PATH, SEGMENT_JSON_PATH, base = find_transcript_pair(args.base) + print(f"📄 TXT : {TRANSCRIPT_PATH}") + print(f"🧾 JSON: {SEGMENT_JSON_PATH}") + + # === TRANSKRIPT EINLESEN (TXT) -> NUR für Blockbildung & Promptanzeige === + lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines() + segments_txt: List[Dict] = [] + for line in lines: + m = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(?:[A-Z_0-9]+:)?\s*(.*)", line) + if not m: + continue + start, end, text = m.groups() + start, end = float(start), float(end) + if end - start >= 2.0: + segments_txt.append({"start": start, "end": end, "text": (text or "").strip()}) + + if not segments_txt: + raise RuntimeError("🚫 Keine gültigen TXT-Segmente gefunden.") + print(f"✅ {len(segments_txt)} gültige TXT-Segmente geladen.") + + # === TRANSKRIPT EINLESEN (JSON) -> Quelle für DB-Text/Wörter === + segment_json_data = json.loads(SEGMENT_JSON_PATH.read_text(encoding="utf-8")) + if not isinstance(segment_json_data, list) or not segment_json_data: + raise RuntimeError("🚫 JSON-Segmente leer/ungültig.") + print(f"✅ {len(segment_json_data)} JSON-Segmente geladen.") + + # Globale Wörterliste einmal berechnen (bevor wir Clips bilden) + WORDS = explode_segments_to_words(segment_json_data) + print(f"🔤 Globale Wörter im Korpus: {len(WORDS)}") + + # === BLÖCKE BILDEN (aus TXT) === + segments_txt.sort(key=lambda s: (s["start"], s["end"])) + blocks, current_block, current_start = [], [], 0.0 + for seg in segments_txt: + if not current_block: + current_start = seg["start"] + # Blockwechsel, wenn Dauer überschritten + if seg["end"] - current_start > 
BLOCK_DURATION: + blocks.append(current_block) + current_block = [] + current_start = seg["start"] + current_block.append(seg) + if current_block: + blocks.append(current_block) + print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION:.0f}s).") + + # === KI: CLIP-AUSWAHL === + all_clips = [] + t0 = time.perf_counter() + + for i, block in enumerate(blocks, start=1): + if not block: + continue + print(f"\n🤖 Sende Block {i}/{len(blocks)} an {OPENAI_MODEL} …") + block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block]) + + prompt = f""" +Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Social Media Clips eignen. +Ein guter Clip: +- ist abgeschlossen und verständlich +- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment +- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline +- ist mindestens {MIN_CLIP_LEN:.0f} Sekunden lang +Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden. + +Gib ein JSON-Objekt zurück im Format: +{{ + "clips": [ + {{ + "start": float, + "end": float, + "summary": "Kurze Beschreibung des Inhalts" + }} + ] +}} + +TRANSKRIPT: +{block_text} +""".strip() + + log_text(f"block_prompt_{i:02d}.txt", prompt) + + # --- robuster API-Call mit Schema (Root=object) und kleinem Retry --- + import time as _time + clips = [] + for attempt in range(3): + try: + resp = client.chat.completions.create( + model=OPENAI_MODEL, + messages=[{"role": "user", "content": prompt}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "clips_payload", + "schema": { + "type": "object", + "properties": { + "clips": { + "type": "array", + "items": { + "type": "object", + "properties": { + "start": {"type": "number"}, + "end": {"type": "number"}, + "summary": {"type": "string"} + }, + "required": ["start", "end", "summary"], + "additionalProperties": False + } + } + }, + "required": ["clips"], + "additionalProperties": False + } + } + } + ) + msg = resp.choices[0].message + payload = getattr(msg, "parsed", None) + if payload is None: + payload = json.loads(msg.content) + + clips = payload.get("clips", []) or [] + + try: + log_text(f"block_output_{i:02d}.txt", json.dumps(payload, ensure_ascii=False, indent=2)) + except Exception: + pass + break + except Exception as e: + if attempt == 2: + append_error_log(f"❌ OpenAI-Fehler Block {i}: {e}") + print(f"❌ Fehler bei Block {i}: {e}") + else: + _time.sleep(1.5 * (attempt + 1)) + + print(f"✅ {len(clips)} Clips empfangen in Block {i}") + + # --- Clips filtern & clampen --- + for clip in clips: + try: + b_start, b_end = block[0]["start"], block[-1]["end"] + start = max(b_start, min(float(clip["start"]), b_end)) + end = max(b_start, min(float(clip["end"]), b_end)) + dur = end - start + if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN: + clip["start"] = start + clip["end"] = end + clip["duration"] = round(dur, 2) + all_clips.append(clip) + except Exception as e: + append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}") + + elapsed = time.perf_counter() - t0 + avg = elapsed / i + eta = max(0.0, avg * (len(blocks) - i)) + print(f"⏱️ Geschätzte Restzeit: {eta:.1f} s") + + # --- Duplikate entfernen (auf 2 Dezimalen) --- + dedup, seen = [], set() + for c in all_clips: + k = (round(c["start"], 2), round(c["end"], 2)) + if k in seen: + continue + seen.add(k) + dedup.append(c) + all_clips = dedup + + print(f"\n📈 Gesamtclips vor DB-Insert: {len(all_clips)}") + + # === DB SPEICHERN === 
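
Before the rows are inserted below, the text stored per highlight is rebuilt from the global word list via `build_text_strict_from_words`: a word belongs to a clip exactly when its midpoint lies in the half-open window [start, end), so no word is counted twice at clip or chunk boundaries. A tiny illustration with made-up timings:

```python
# Illustration only — hypothetical word midpoints, not data from the pipeline.
WORDS = [
    {"idx": 0, "mid": 9.8,  "text": "wirklich"},
    {"idx": 1, "mid": 10.1, "text": "verrückte"},
    {"idx": 2, "mid": 10.6, "text": "Geschichte"},
    {"idx": 3, "mid": 12.0, "text": "übrigens"},
]

def build_text(start, end, words):
    sel = [w for w in words if start <= w["mid"] < end]   # half-open: [start, end)
    return " ".join(w["text"] for w in sorted(sel, key=lambda w: w["idx"]))

print(build_text(10.0, 12.0, WORDS))  # -> "verrückte Geschichte"  (12.0 itself is excluded)
print(build_text(12.0, 14.0, WORDS))  # -> "übrigens"              (no duplicate at the boundary)
```
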
+ conn = sqlite3.connect(DB_PATH) + cur = conn.cursor() + + cur.execute(""" + CREATE TABLE IF NOT EXISTS highlights ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file TEXT, + start REAL, + end REAL, + duration REAL, + text TEXT, + summary TEXT, + json_raw TEXT, + viralitaet INTEGER, + emotionalitaet INTEGER, + witz INTEGER, + provokation INTEGER, + score_total INTEGER, + UNIQUE(file,start,end) + ) + """) + + # --- Tabelle vor neuem Lauf komplett leeren --- + cur.execute("DELETE FROM highlights") + conn.commit() # Transaktion schließen, damit VACUUM außerhalb läuft + + # VACUUM separat (optional) + try: + conn.execute("VACUUM") # oder: sqlite3.connect(DB_PATH).execute("VACUUM").close() + print("🧹 Alte Highlights gelöscht und Datenbank komprimiert.") + except sqlite3.OperationalError as e: + print(f"⚠️ VACUUM übersprungen: {e}") + + inserted = 0 + failed = 0 + + for clip in all_clips: + try: + start = float(clip["start"]) + end = float(clip["end"]) + duration = float(clip["duration"]) + summary = (clip.get("summary") or "").strip() + + if end <= start or start < 0: + raise ValueError("Ungültige Zeiten") + + # JSON-Segmente (zur Nachvollziehbarkeit) + Wort-basierter Text (dopplerfrei) + json_snippets = get_json_snippets_for_clip(start, end, segment_json_data) + json_raw = json.dumps(json_snippets, ensure_ascii=False) + + original_text = build_text_strict_from_words(start, end, WORDS) + + cur.execute(""" + INSERT OR IGNORE INTO highlights ( + file, start, end, duration, text, summary, json_raw, + viralitaet, emotionalitaet, witz, provokation, score_total + ) + VALUES (?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL, NULL, NULL) + """, ( + # 'file' = Basename (z. B. testVideoShort) + Path(base).name, + start, end, duration, + original_text, summary, json_raw + )) + if cur.rowcount > 0: + inserted += 1 + except Exception as e: + failed += 1 + append_error_log(f"❌ DB-Fehler: {clip}\n{e}") + + conn.commit() + conn.close() + + print("\n📊 Ergebnisse:") + print(f" ✅ Highlights gespeichert: {inserted}") + print(f" ❌ Fehlerhafte Clips: {failed}") + print(f"📁 Logs: {LOG_DIR.resolve()}") + +if __name__ == "__main__": + main() diff --git a/src/text/transcription.py b/src/text/transcription.py new file mode 100644 index 0000000..0c8ee69 --- /dev/null +++ b/src/text/transcription.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# transcription_chunked_words.py — Whisper mit Wortzeitstempeln, doppler-sicher + +import os +import sys +import json +import argparse +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import List, Dict, Tuple, Optional + +import ffmpeg +import whisper + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. 
unter src/text/) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import INPUT_DIR, TRANSCRIPTS_DIR # zentrale Pfade + +# ────────────────────────────────────────────────────────────────────────────── +# Utilities +# ────────────────────────────────────────────────────────────────────────────── + +def probe_duration(path: Path) -> float: + """Ermittle die Videodauer in Sekunden (ffmpeg.probe).""" + try: + meta = ffmpeg.probe(str(path)) + except ffmpeg.Error as e: + raise RuntimeError(f"ffmpeg.probe fehlgeschlagen für {path}: {e.stderr.decode('utf-8','ignore') if hasattr(e, 'stderr') else e}") from e + + dur = meta.get("format", {}).get("duration") + if dur is not None: + return float(dur) + + cand = 0.0 + for s in meta.get("streams", []) or []: + d = s.get("duration") + if d: + cand = max(cand, float(d)) + if cand > 0: + return cand + raise RuntimeError(f"Konnte Videodauer nicht bestimmen: {path}") + +def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float,float]]: + """Zerteile [0,total] in überlappende Intervalle.""" + if chunk_seconds <= 0: + return [(0.0, total)] + s, out = 0.0, [] + while s < total: + e = min(s + chunk_seconds, total) + out.append((s, e)) + if e >= total: + break + s = max(0.0, e - overlap) + return out + +def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None: + """Extrahiere [start,end] als Mono-16kHz-WAV.""" + ( + ffmpeg + .input(str(src_video), ss=start, to=end) + .output( + str(out_wav), + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="error", + ) + .overwrite_output() + .run() + ) + +def is_suspect(text: str) -> bool: + """Heuristik: leere/loopende/zweifelhafte Zeilen markieren.""" + t = (text or "").strip().lower() + if not t: + return True + words = t.split() + if not words: + return True + counts = {w: words.count(w) for w in set(words)} + most_common = max(counts.values()) + return most_common / len(words) > 0.6 or most_common > 20 + +def merge_overlaps_keep_best( + segments: List[Dict], + max_gap: float = 0.15, + min_dur: float = 0.30 +) -> List[Dict]: + """ + Zeitlich sortieren, kleine Gaps schließen. 
Bei Überlappung: + - keine Text-Konkatenation + - behalte das "bessere" Segment (längere Dauer, dann längerer Text) + - words: vom "best" übernehmen (falls vorhanden) + """ + cleaned = [] + for s in segments: + s0 = float(s["start"]); s1 = float(s["end"]) + txt = (s.get("text") or "").strip() + if s1 - s0 >= min_dur and txt: + cleaned.append({ + "start": s0, "end": s1, + "text": txt, + "words": s.get("words", []) + }) + if not cleaned: + return [] + + cleaned.sort(key=lambda x: (x["start"], x["end"])) + out = [cleaned[0]] + + def score(x: Dict) -> tuple: + return (x["end"] - x["start"], len(x.get("text", ""))) + + for s in cleaned[1:]: + m = out[-1] + if s["start"] <= m["end"] + max_gap: + best = s if score(s) > score(m) else m + out[-1] = { + "start": min(m["start"], s["start"]), + "end": max(m["end"], s["end"]), + "text": best["text"], + "words": best.get("words", []), + } + else: + out.append(s) + return out + +def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True): + """Schreibe _timed.txt, _suspect_lines.txt und _segments.json.""" + out_dir.mkdir(parents=True, exist_ok=True) + dash = "-" if ascii_dash else "–" + + out_txt = out_dir / f"{base.stem}_timed.txt" + out_sus = out_dir / f"{base.stem}_suspect_lines.txt" + out_json = out_dir / f"{base.stem}_segments.json" + + # TXT nur zur Ansicht + with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus: + for s in segments: + line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n" + f_txt.write(line) + if is_suspect(s["text"]): + f_sus.write(line) + + # JSON für die Weiterverarbeitung (inkl. words) + with open(out_json, "w", encoding="utf-8") as f_json: + json.dump(segments, f_json, ensure_ascii=False, indent=2) + + return out_txt, out_sus, out_json + +def find_default_input() -> Optional[Path]: + """Nimm das erste Video aus INPUT_DIR, falls kein --input übergeben wurde.""" + exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3") + for p in sorted(INPUT_DIR.iterdir()): + if p.suffix.lower() in exts: + return p + return None + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser( + description="Chunked Whisper Transcription mit Wortzeitstempeln & doppler-sicherem Stitching." + ) + p.add_argument("--input", type=Path, default=None, help=f"Eingabevideo/-audio. Default: erstes File in {INPUT_DIR}") + p.add_argument("--outdir", type=Path, default=None, help=f"Ausgabeverzeichnis. Default: {TRANSCRIPTS_DIR}") + p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper-Modell (tiny/base/small/medium/large)") + p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. 
'de') oder leer/None für Auto-Detect") + p.add_argument("--chunk", type=float, default=60.0, help="Chunk-Länge in Sekunden (0 = ganzes File)") + p.add_argument("--overlap", type=float, default=2.0, help="Overlap in Sekunden") + p.add_argument("--min-dur", type=float, default=0.30, help="Mindest-Segmentdauer (Sekunden)") + p.add_argument("--max-gap", type=float, default=0.15, help="Maximaler Zeit-Gap für Merge (Sekunden)") + p.add_argument("--fp16", action="store_true", help="fp16 aktivieren (nur sinnvoll mit GPU)") + return p.parse_args() + +# ────────────────────────────────────────────────────────────────────────────── +# Main +# ────────────────────────────────────────────────────────────────────────────── + +def main(): + # Whisper-Cache (damit Modelle lokal landen) + os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache")) + + args = parse_args() + input_path = args.input or find_default_input() + out_dir = args.outdir or TRANSCRIPTS_DIR + + print("📁 Projekt-Root:", ROOT) + print("📄 Input:", input_path if input_path else "—") + if not input_path or not input_path.exists(): + raise FileNotFoundError(f"Kein gültiges Eingabefile gefunden. Lege ein Video/Audio in {INPUT_DIR} oder nutze --input.") + + out_dir.mkdir(parents=True, exist_ok=True) + + duration = probe_duration(input_path) + print(f"🎬 Dauer: {duration:.2f}s") + + chunks = make_chunks(duration, args.chunk, args.overlap) + print(f"🔪 {len(chunks)} Chunks à {args.chunk:.1f}s mit {args.overlap:.1f}s Overlap") + + # Whisper laden + print(f"🧠 Lade Whisper-Modell: {args.model}") + try: + model = whisper.load_model(args.model) + except Exception as e: + raise RuntimeError(f"Whisper-Modell '{args.model}' konnte nicht geladen werden. Installiert? (pip install openai-whisper)\n{e}") from e + + all_segments: List[Dict] = [] + with TemporaryDirectory() as tmpdir_str: + tmpdir = Path(tmpdir_str) + for i, (start, end) in enumerate(chunks, 1): + print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s") + wav = tmpdir / f"chunk_{i:03d}.wav" + extract_audio_segment(input_path, start, end, wav) + + # Sprache: ''/none = Auto-Detect + lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang + + # Transkribieren mit Wortzeiten, ohne Cross-Chunk-Kontext + result = model.transcribe( + str(wav), + language=lang, + fp16=args.fp16, + word_timestamps=True, + condition_on_previous_text=False, + temperature=0, + verbose=False, + ) + + # Center-Cut: nur Mittelteil behalten (verhindert Grenz-Doppler) + keep_start = start if i == 1 else start + args.overlap / 2.0 + keep_end = end if i == len(chunks) else end - args.overlap / 2.0 + + for seg in result.get("segments", []) or []: + s0 = float(seg["start"]) + start + s1 = float(seg["end"]) + start + mid = (s0 + s1) / 2.0 + if not (keep_start <= mid < keep_end): + continue + + # Wörter mit absoluten Zeiten übernehmen + words = [] + for w in (seg.get("words") or []): + txt = (w.get("word") or w.get("text") or "").strip() + if not txt: + continue + words.append({ + "start": float(w["start"]) + start, + "end": float(w["end"]) + start, + "text": txt + }) + + all_segments.append({ + "start": s0, + "end": s1, + "text": (seg.get("text") or "").strip(), + "words": words + }) + + print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …") + merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur) + print(f"✅ Gemergte Segmente: {len(merged)}") + + out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True) + 
print(f"📝 TXT: {out_txt}") + print(f"⚠️ SUSPECT: {out_sus}") + print(f"💾 JSON: {out_json}") + print("🎉 Fertig.") + +if __name__ == "__main__": + main() diff --git a/src/text/transcription_with_speaker.py b/src/text/transcription_with_speaker.py new file mode 100644 index 0000000..e5912b7 --- /dev/null +++ b/src/text/transcription_with_speaker.py @@ -0,0 +1,88 @@ +import os +import json +import ffmpeg +import whisper +import tempfile +import torch +from tqdm import tqdm +from pathlib import Path +from pyannote.audio import Pipeline + +# === HUGGING FACE TOKEN (für pyannote) === +HF_TOKEN = "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" + +# === Torch Optimierung (optional) === +torch.set_float32_matmul_precision("medium") + +# === Einstellungen === +PROJECT_ROOT = Path(__file__).resolve().parents[2] +input_file = PROJECT_ROOT / "input" / "testVideoShort.mov" +output_dir = PROJECT_ROOT / "transkripte" +output_dir.mkdir(parents=True, exist_ok=True) + +output_txt = output_dir / f"{input_file.stem}_timed.txt" +output_json = output_dir / f"{input_file.stem}_segments.json" + +# === Video in Audio konvertieren === +print("🎞️ Extrahiere Audio ...") +tmp_dir = Path(tempfile.mkdtemp()) +wav_file = tmp_dir / "audio.wav" +ffmpeg.input(str(input_file)).output( + str(wav_file), + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="error" +).overwrite_output().run() + +# === Transkription mit Whisper === +print("🧠 Starte Transkription mit Whisper ...") +model = whisper.load_model("small") +result = model.transcribe( + str(wav_file), + language="de", + fp16=False, + word_timestamps=False, + condition_on_previous_text=True, + temperature=0, + verbose=False +) +segments = result["segments"] + +# === Diarisation mit Pyannote === +print("🗣️ Starte Sprecheranalyse mit Pyannote (das dauert jetzt etwas) ...") +pipeline = Pipeline.from_pretrained( + "pyannote/speaker-diarization-3.1", + use_auth_token=HF_TOKEN +) +pipeline.to(torch.device("mps")) # ⬅️ Apple GPU beschleunigen + +diarization = pipeline(str(wav_file)) + +# === Sprecher zuordnen === +def assign_speakers_to_segments(segments, diarization): + assigned = [] + for seg in tqdm(segments, desc="🎙️ Weise Sprecher zu"): + speaker = "unknown" + for turn, _, label in diarization.itertracks(yield_label=True): + if turn.start <= seg["start"] <= turn.end: + speaker = label + break + seg["speaker"] = speaker + assigned.append(seg) + return assigned + +segments_with_speaker = assign_speakers_to_segments(segments, diarization) + +# === Speichern als TXT +with open(output_txt, "w", encoding="utf-8") as f: + for seg in segments_with_speaker: + line = f"[{seg['start']:.2f} – {seg['end']:.2f}] {seg['speaker'].upper()}: {seg['text'].strip()}\n" + f.write(line) + +# === Speichern als JSON +with open(output_json, "w", encoding="utf-8") as f: + json.dump(segments_with_speaker, f, ensure_ascii=False, indent=2) + +print(f"✅ Transkript mit Sprecherinfos gespeichert unter:\n📄 {output_txt}\n📄 {output_json}") diff --git a/text-clustering b/text-clustering deleted file mode 160000 index 7815f8b..0000000 --- a/text-clustering +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb diff --git a/transkripte/.DS_Store b/transkripte/.DS_Store deleted file mode 100644 index 1a1bbf7..0000000 Binary files a/transkripte/.DS_Store and /dev/null differ diff --git a/whisper.cpp b/whisper.cpp deleted file mode 160000 index 2e310b8..0000000 --- a/whisper.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243