diff --git a/.DS_Store b/.DS_Store
deleted file mode 100644
index e90ef19..0000000
Binary files a/.DS_Store and /dev/null differ
diff --git a/.gitignore b/.gitignore
index 382d186..8b0731e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,27 +1,108 @@
-# IDE & Cache
+# ─────────────────────────────
+# IDEs & System Files
+# ─────────────────────────────
.idea/
+.vscode/
__pycache__/
*.pyc
.DS_Store
+*.log
-# Whisper Modelle & Cache
+# ─────────────────────────────
+# Cache / Models / Checkpoints
+# ─────────────────────────────
whisper-cache/
models/
*.pt
+*.onnx
+*.bin
+*.safetensors
-# Output/Temp Files
+# ─────────────────────────────
+# Database / temporary files
+# ─────────────────────────────
+*.db
+*.sqlite
+logs/
+temp/
+tmp/
+*.tmp
+
+# ─────────────────────────────
+# Transcripts / intermediate AI outputs
+# ─────────────────────────────
+/data/transkripte/
+/transcripts/
+/outputs/
+/results/
+*_segments.json
+*_timed.txt
+*_suspect_lines.txt
+
+# ─────────────────────────────
+# Video / Audio Outputs
+# ─────────────────────────────
*.mp4
*.mov
-*.db
+*.mkv
*.wav
-*.json
-temp.*
-logs/
+*.webm
+*.mp3
-# Eingebettete Repos
+# ─────────────────────────────
+# Generated intermediate/result folders
+# ─────────────────────────────
+/raw_clips/
+/face_combined/
+/face_crop_centers/
+/cropped/
+/subtitled/
+/segments/
+/highlight_clips/
+/output/
+/renders/
+/exports/
+
+# ─────────────────────────────
+# Embedded repos or external modules
+# ─────────────────────────────
+/whisper.cpp/
+/text-clustering/
+/venv/
+/.env/
+/.env.local
+.envrc
+.env.*
+
+# ─────────────────────────────
+# Backups / Misc
+# ─────────────────────────────
+*.bak
+*.old
+*.orig
+*.swp
+*.zip
+*.tar
+*.gz
+
+# ─────────────────────────────
+# Secrets / Environment
+# ─────────────────────────────
+.env
+config.py
+
+# ─────────────────────────────
+# Large / derived data (not already covered above)
+# ─────────────────────────────
+data/
+transkripte/
+
+# embedded / external
text-clustering/
whisper.cpp/
-
-# Video-Rohmaterial
-*.mov
-
diff --git a/.idea/.gitignore b/.idea/.gitignore
deleted file mode 100644
index 13566b8..0000000
--- a/.idea/.gitignore
+++ /dev/null
@@ -1,8 +0,0 @@
-# Default ignored files
-/shelf/
-/workspace.xml
-# Editor-based HTTP Client requests
-/httpRequests/
-# Datasource local storage ignored files
-/dataSources/
-/dataSources.local.xml
diff --git a/.idea/BachlorArbeit.iml b/.idea/BachlorArbeit.iml
deleted file mode 100644
index 106b3db..0000000
--- a/.idea/BachlorArbeit.iml
+++ /dev/null
@@ -1,11 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml
deleted file mode 100644
index 34586b5..0000000
--- a/.idea/dataSources.xml
+++ /dev/null
@@ -1,35 +0,0 @@
-
-
-
-
- sqlite.xerial
- true
- org.sqlite.JDBC
- jdbc:sqlite:$PROJECT_DIR$/segments.db
- $ProjectFileDir$
-
-
- file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar
-
-
- file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar
-
-
-
-
- sqlite.xerial
- true
- org.sqlite.JDBC
- jdbc:sqlite:$PROJECT_DIR$/clips_openai.db
- $ProjectFileDir$
-
-
- file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar
-
-
- file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
deleted file mode 100644
index 105ce2d..0000000
--- a/.idea/inspectionProfiles/profiles_settings.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
deleted file mode 100644
index 1733c19..0000000
--- a/.idea/misc.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
deleted file mode 100644
index 5be715f..0000000
--- a/.idea/modules.xml
+++ /dev/null
@@ -1,8 +0,0 @@
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
deleted file mode 100644
index 6bdb7e2..0000000
--- a/.idea/vcs.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/README.md b/README.md
index e69de29..fa355d2 100644
--- a/README.md
+++ b/README.md
@@ -0,0 +1,250 @@
+# Bachelor's Thesis – Pipeline: Automated Highlight Detection & 9:16 Reformatting
+
+This repository contains a complete, script-based pipeline that turns long-form videos into social-media-ready 9:16 highlights – including transcription, LLM-assisted clip selection, face/mouth-activity analysis, auto-cropping, word-cap subtitles, and final export.
+
+## Table of Contents
+- [Features](#features)
+- [Directory Structure](#directory-structure)
+- [Requirements](#requirements)
+- [Installation](#installation)
+- [Quick Start (Recommended Workflow)](#quick-start-recommended-workflow)
+- [Scripts & CLI](#scripts--cli)
+- [Tips & Troubleshooting](#tips--troubleshooting)
+- [Reproducibility](#reproducibility)
+- [License / Acknowledgements](#license--acknowledgements)
+
+---
+
+## Features
+- **Transcription with word-level timestamps (Whisper, chunked without boundary duplicates)**
+- **LLM-assisted clip selection** (virality/emotionality scores etc. stored in SQLite)
+- **Face detection (YOLOv8-face) & mouth activity (MediaPipe)**
+- **Stable 9:16 auto-cropping** (median + EMA, deadband, scene-cut detection, switch cooldown)
+- **Word-cap subtitles** (ASS generated, burned in via ffmpeg)
+- **Batch export of highlights** (MoviePy, length/boundary checks)
+
+## Directory Structure
+All paths are defined centrally in `config.py`:
+```
+PROJECT_ROOT/
+├─ data/
+│  ├─ input/                  # input video(s)
+│  ├─ transkripte/            # Whisper outputs (*_segments.json, *_timed.txt ...)
+│  ├─ segments/               # LLM clip selection, DB, etc.
+│  ├─ output/
+│  │  └─ raw_clips/           # raw highlight clips (from cutClips.py)
+│  ├─ face_data_combined/     # faces.json per clip (YOLO + MediaPipe)
+│  └─ face_crop_centers/      # (optional) center lists
+├─ output/
+│  ├─ output_9x16_final/              # auto-cropped 9:16 videos
+│  ├─ output_9x16_final_subbed_word/  # 9:16 with burned-in word caps
+│  └─ debug/                          # debug previews/artifacts
+├─ models/                    # YOLO weights (e.g. yolov8n-face.pt)
+├─ whisper-cache/             # Whisper model cache
+└─ src/...                    # (optional, project-specific)
+```
+> On first run, `config.py` automatically creates any missing directories.
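+
+The actual `config.py` is not part of this diff. For orientation, a minimal sketch of what it is expected to provide – the names match the imports used by `main.py` and the `src/reformat/` scripts, while the concrete paths are assumptions based on the tree above:
+
+```python
+# config.py – minimal sketch (assumed values; adjust to your layout)
+from pathlib import Path
+
+PROJECT_ROOT = Path(__file__).resolve().parent
+
+INPUT_DIR         = PROJECT_ROOT / "data" / "input"
+RAW_CLIPS_DIR     = PROJECT_ROOT / "data" / "output" / "raw_clips"
+FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+FACE_CROP_CENTERS = PROJECT_ROOT / "data" / "face_crop_centers"
+SEGMENTS_DIR      = PROJECT_ROOT / "data" / "segments"
+CROPPED_DIR       = PROJECT_ROOT / "output" / "output_9x16_final"
+SUBTITLED_DIR     = PROJECT_ROOT / "output" / "output_9x16_final_subbed_word"
+WHISPER_CACHE_DIR = PROJECT_ROOT / "whisper-cache"
+DB_PATH           = PROJECT_ROOT / "data" / "clips_openai.db"
+
+# create any missing directories on import
+for _d in (INPUT_DIR, RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS,
+           SEGMENTS_DIR, CROPPED_DIR, SUBTITLED_DIR, WHISPER_CACHE_DIR):
+    _d.mkdir(parents=True, exist_ok=True)
+```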
+
+## Requirements
+**System tools**
+- `ffmpeg` (incl. `ffprobe`) on the `PATH`
+
+**Python**
+- Python 3.10+ recommended
+- Packages (example):
+  `openai-whisper`, `torch`, `ffmpeg-python`, `ultralytics`, `opencv-python`, `mediapipe`, `moviepy`, `tqdm`, `numpy`, `regex`
+- Optional/depending on the code path: `pydub`, `scikit-image` (if used in extensions)
+
+**Models & keys**
+- **Whisper**: downloads models automatically into `whisper-cache/` (controlled via `WHISPER_MODEL`)
+- **YOLOv8-face**: `models/yolov8n-face.pt` (or a larger model)
+- **OpenAI API key** (for `segment_transcript.py` & `rateCluster.py`): `export OPENAI_API_KEY=...`
+  - Set the default model if needed via `export OPENAI_MODEL=gpt-4o`
+
+## Installation
+```bash
+# 1) Python environment
+python3 -m venv .venv
+source .venv/bin/activate
+
+# 2) System dependencies
+# install ffmpeg (macOS: brew install ffmpeg, Ubuntu: apt install ffmpeg)
+
+# 3) Python packages (example)
+pip install --upgrade pip
+pip install openai-whisper torch ffmpeg-python ultralytics opencv-python mediapipe moviepy tqdm numpy regex
+
+# 4) Models/files
+# YOLO weights:
+#   Download yolov8n-face.pt → ./models/yolov8n-face.pt
+# API key for the LLM:
+export OPENAI_API_KEY="sk-..."
+export OPENAI_MODEL="gpt-4o"
+```
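+
+A quick optional sanity check of the setup – it only verifies that `ffmpeg` is on the `PATH` and that the core Python packages import:
+
+```bash
+ffmpeg -version | head -n 1
+ffprobe -version | head -n 1
+python -c "import whisper, torch, cv2, mediapipe, moviepy, ultralytics; print('imports ok')"
+```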
+
+## Quick Start (Recommended Workflow)
+1) **Place the input**
+   Put your long-form video into `data/input/` (e.g. `meinvideo.mp4`).
+
+2) **Transcription (Whisper, chunked & duplicate-safe)**
+```bash
+python transcription.py --input data/input/meinvideo.mp4 --model small --lang de
+```
+   → writes `*_segments.json` + `*_timed.txt` to `data/transkripte/`.
+
+3) **Select clips with the LLM & store them in the DB**
+```bash
+export OPENAI_API_KEY="..."; export OPENAI_MODEL="gpt-4o"
+python segment_transcript.py --base meinvideo --block 60 --min 6.0 --max 30.0
+```
+   → writes the clips to SQLite (`data/clips_openai.db` or similar)
+
+4) **Cut the highlights from the original video**
+```bash
+python cutClips.py --file meinvideo.mp4 --limit 10 --order score
+```
+   → exports `highlight_*.mp4` to `data/output/raw_clips/`
+
+5) **Face detection + mouth activity**
+```bash
+python main_detect_faces.py --model models/yolov8n-face.pt --input-dir data/output/raw_clips --output-dir data/face_data_combined --frame-skip 1 --downscale 0.5
+```
+
+6) **Build per-frame targets (smoothed centers/size)**
+```bash
+python make_segments.py --pattern "highlight_*.mp4" --fps 0 --smooth 7 --overwrite
+```
+
+7) **Apply the 9:16 auto-crop**
+```bash
+python main_apply_crop.py --pattern "highlight_*.mp4" --median 7 --ema 0.5 --deadband 16 --cut_detect --mux_audio --overwrite
+```
+   → finished 9:16 clips in `output/output_9x16_final/`
+
+8) **Burn in word-cap subtitles (optional)**
+```bash
+python add_subtitles.py --clips_dir output/output_9x16_final --out_dir output/output_9x16_final_subbed_word --model small --limit 20
+```
+   → finished videos with burned-in word caps in `output/output_9x16_final_subbed_word/`
+
+> 💡 Many parameters (window widths, deadband, detection thresholds, limits) can be adjusted via the CLI.
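+
+Alternatively, the repository also ships `main.py`, a runner that chains the steps above with per-step timing and skip flags; the invocations below are taken from its docstring:
+
+```bash
+python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o
+python main.py --no-rate --no-subs   # skip the optional rating and subtitle steps
+```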
+
+## Scripts & CLI
+### `transcription.py`
+Chunked transcription with word-level timestamps.
+```
+--input PATH      # input video/audio (default: first file in data/input/)
+--outdir PATH     # output directory (default: data/transkripte/)
+--model NAME      # Whisper model (tiny/base/small/medium/large; env: WHISPER_MODEL)
+--lang CODE       # language code (e.g. de) or empty/None for auto-detect
+--chunk FLOAT     # chunk length in s (default 60)
+--overlap FLOAT   # overlap in s (default 2.0)
+--min-dur FLOAT   # minimum segment duration (s)
+--max-gap FLOAT   # max. time gap when merging (s)
+--fp16            # only useful with a GPU
+```
+
+### `segment_transcript.py`
+LLM selection & storage in SQLite.
+```
+--base STR        # basename of the transcript files (e.g. 'meinvideo')
+--block FLOAT     # block length in s for the prompt
+--min FLOAT       # minimum clip length in s
+--max FLOAT       # maximum clip length in s
+# env: OPENAI_API_KEY, OPENAI_MODEL (e.g. gpt-4o)
+```
+
+### `cutClips.py`
+Cuts the selected highlights into individual clips.
+```
+--file NAME            # name of the input file in data/input (default: first video)
+--limit INT            # number of clips to export (default 10)
+--order {score,start}  # sort order: score (descending) or start time
+```
+
+### `main_detect_faces.py`
+YOLOv8-face + MediaPipe → `faces.json` per clip.
+```
+--input-dir PATH      # default: data/output/raw_clips
+--output-dir PATH     # default: data/face_data_combined
+--model PATH          # YOLOv8-face weights (default: models/yolov8n-face.pt)
+--conf-thresh FLOAT   # default 0.35
+--frame-skip INT      # e.g. 1 = every frame, 2 = every second frame ...
+--downscale FLOAT     # frame downscale before YOLO (0..1, e.g. 0.5)
+--expansion FLOAT     # margin pass 1 (relative)
+--expansion2 FLOAT    # margin pass 2 (relative)
+--min-crop INT        # minimum crop side length (px)
+--faces-upscale INT   # min. edge length for FaceMesh (small crops are upscaled)
+--imgsz INT           # YOLO input size (default 448)
+--max-det INT         # max detections / frame
+--use-refine          # enable MediaPipe refine_landmarks
+```
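+
+For long inputs, a speed-oriented call could look like this (a sketch using only the flags listed above; the values are starting points, not tuned defaults):
+
+```bash
+python main_detect_faces.py --frame-skip 2 --downscale 0.5 --imgsz 448
+```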
+
+### `make_segments.py`
+Builds `*_target_by_frame.json` (center + side per frame) from the face/center data.
+```
+--pattern STR   # file pattern in raw_clips (default: highlight_*.mp4)
+--fps FLOAT     # force FPS (0 = read from video)
+--smooth INT    # moving-average window width (odd)
+--overwrite     # overwrite existing target_by_frame.json
+```
+
+### `main_apply_crop.py`
+Applies the 9:16 crop with smoothing/scene-cut handling.
+```
+--pattern STR        # file pattern in raw_clips (default: *.mp4)
+--out_w INT          # output width (default 1080)
+--out_h INT          # output height (default 1920)
+--zoom_pad FLOAT     # zoom pad (0..1)
+--median INT         # median window (>=3, odd)
+--ema FLOAT          # EMA alpha (0..1)
+--deadband FLOAT     # deadband in pixels
+--switch_cd INT      # cooldown frames after a track switch
+--cut_detect         # enable scene-cut detection
+--cut_corr FLOAT     # correlation threshold (0..1)
+--cut_cd INT         # cooldown frames after a cut
+--mux_audio          # mux in the original audio
+--debug              # show debug overlay
+--debug_scale FLOAT  # render debug preview scaled
+--overwrite          # overwrite existing outputs
+```
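+
+To inspect the crop path before a full render, the debug preview can be enabled (sketch; uses only the flags listed above):
+
+```bash
+python main_apply_crop.py --pattern "highlight_*.mp4" --debug --debug_scale 0.5
+```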
+
+### `add_subtitles.py`
+Generates word caps with Whisper & burns them in.
+```
+--clips_dir PATH  # source (default: output/output_9x16_final)
+--out_dir PATH    # target (default: output/output_9x16_final_subbed_word)
+--pattern STR     # e.g. *.mp4
+--limit INT       # only the first N clips
+--model NAME      # Whisper model (tiny/base/small/medium/large)
+--lang CODE       # language code or auto
+```
+
+### `rateCluster.py` (optional)
+Uses the LLM to add scores (virality, emotion, humor, provocation) to the stored clips.
+> Set the default model via `OPENAI_MODEL` (e.g. `gpt-4o`).
+
+---
+
+## Tips & Troubleshooting
+- **Models/performance**
+  - CPU-only works (Whisper/YOLO just run slower). Apple Silicon automatically uses **MPS**; NVIDIA uses **CUDA**.
+  - `--frame-skip` and `--downscale` in `main_detect_faces.py` speed up face detection considerably.
+- **Check the ffmpeg muxing** (`main_apply_crop.py --mux_audio`): if the audio is missing, check the `ffmpeg` installation and the return code in the log.
+- **Missing files**
+  - No input? → check `data/input/`.
+  - Missing transcript pairs? → `*_timed.txt` and `*_segments.json` must both exist (produced by `transcription.py`).
+  - No faces? → is the path to `models/yolov8n-face.pt` correct?
+- **Database**
+  - The highlights live in SQLite (see `config.py`: `DB_PATH`). For repeated runs a `DELETE FROM highlights; VACUUM;` can be useful – see the example below this list.
+- **Cache/directories**
+  - The Whisper cache goes via `XDG_CACHE_HOME` → `whisper-cache/` next to the project. Keep an eye on disk space.
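+
+For example, to reset the stored highlights before a fresh run (the database filename may differ in your setup – see `DB_PATH` in `config.py`):
+
+```bash
+sqlite3 data/clips_openai.db "DELETE FROM highlights; VACUUM;"
+```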
+
+## Reproducibility
+- Create a `requirements.txt` with exact versions (freeze your working environment – see the snippet below).
+- Document the **model versions** used (YOLO weights, Whisper model size, OPENAI_MODEL).
+- Fix random seeds if necessary (here mostly deterministic via the external models/libraries).
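+
+A minimal way to capture the environment and the model versions, assuming the virtualenv from the installation step is active:
+
+```bash
+pip freeze > requirements.txt
+pip show torch ultralytics openai-whisper | grep -E "Name|Version"
+```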
+
+## License / Acknowledgements
+- Uses **OpenAI Whisper**, **Ultralytics YOLOv8**, **MediaPipe**, **OpenCV**, **MoviePy**, **ffmpeg**.
+- Observe the respective licenses of these libraries.
diff --git a/code/text/cutClips.py b/code/text/cutClips.py
deleted file mode 100644
index a58331e..0000000
--- a/code/text/cutClips.py
+++ /dev/null
@@ -1,38 +0,0 @@
-from moviepy.video.io.VideoFileClip import VideoFileClip
-from pathlib import Path
-import sqlite3
-
-# === Setup ===
-input_video = Path("input/testVideoShort.mov")
-output_dir = Path("output")
-output_dir.mkdir(parents=True, exist_ok=True)
-
-# === SQLite DB lesen ===
-db_path = "clips_openai.db"
-conn = sqlite3.connect(db_path)
-cursor = conn.cursor()
-
-# Nur die Top 10 Clips mit höchstem score_total
-cursor.execute("""
- SELECT start, end, text
- FROM highlights
- ORDER BY score_total DESC
- LIMIT 10
-""")
-highlights = cursor.fetchall()
-
-# === Video laden ===
-video = VideoFileClip(str(input_video))
-
-# === Clips schneiden ===
-for i, (start, end, text) in enumerate(highlights):
- output_file = output_dir / f"highlight_{i+1}.mp4"
- end = min(end, video.duration) # Sicherstellen, dass das Ende nicht über das Video hinausgeht
- print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
- clip = video.subclipped(start, end)
- clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")
-
-# === Cleanup ===
-conn.close()
-video.close()
-print("✅ Top 10 Clips exportiert.")
diff --git a/code/text/segment_transcript.py b/code/text/segment_transcript.py
deleted file mode 100644
index d8eba8b..0000000
--- a/code/text/segment_transcript.py
+++ /dev/null
@@ -1,196 +0,0 @@
-import json
-import sqlite3
-import re
-from pathlib import Path
-from openai import OpenAI
-from datetime import datetime
-import time
-import nltk
-
-nltk.download("punkt")
-
-# === SETTINGS ===
-TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
-DB_PATH = Path("clips_openai.db")
-LOG_DIR = Path("logs")
-LOG_DIR.mkdir(exist_ok=True)
-BLOCK_DURATION = 300
-MIN_CLIP_LEN = 5
-MAX_CLIP_LEN = 90
-
-client = OpenAI(api_key="sk-***REDACTED***")
-
-# === HILFSFUNKTIONEN ===
-def log_text(filename, content):
- (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")
-
-def append_error_log(content):
- with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
- f.write(content + "\n\n")
-
-def extract_json(text):
- match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
- if match:
- try:
- return json.loads(match.group())
- except Exception as e:
- append_error_log(f"❌ JSON-Fehler: {e}\n{text}")
- return []
-
-def get_original_text(clip, segments, debug=False):
- texts = []
- used_segments = []
- for s in segments:
- # Überschneidung: Segment und Clip teilen sich Zeit
- if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
- texts.append(s["text"])
- used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
- if debug:
- print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" +
- "\n".join(used_segments))
- return " ".join(texts).strip()
-
-# === TRANSKRIPT EINLESEN ===
-lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
-segments = []
-for line in lines:
- match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
- if match:
- start, end, text = match.groups()
- start = float(start)
- end = float(end)
- if end - start >= 2.0:
- segments.append({"start": start, "end": end, "text": text.strip()})
-
-if not segments:
- raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
-print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.")
-
-# === BLÖCKE BILDEN
-blocks = []
-current_block = []
-current_start = 0.0
-for seg in segments:
- if seg["end"] - current_start > BLOCK_DURATION:
- blocks.append(current_block)
- current_block = []
- current_start = seg["start"]
- current_block.append(seg)
-if current_block:
- blocks.append(current_block)
-print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
-
-# === KI: CLIP-AUSWAHL
-all_clips = []
-start_time = time.perf_counter()
-
-for i, block in enumerate(blocks):
- if not block:
- continue
-
- print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")
-
- block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
- prompt = f"""
-Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
-Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.
-
-Ein guter Clip:
-- ist abgeschlossen und verständlich
-- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
-- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
-- ist **mindestens 30 Sekunden lang**
-
-Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
-
-Gib ein valides JSON-Array zurück im Format:
-[
- {{
- "start": float,
- "end": float,
- "summary": "Kurze Beschreibung des Inhalts"
- }}
-]
-
-TRANSKRIPT:
-{block_text}
-"""
- log_text(f"block_prompt_{i+1}.txt", prompt)
-
- try:
- response = client.chat.completions.create(
- model="gpt-4o",
- messages=[{"role": "user", "content": prompt}],
- temperature=0.4
- )
- raw = response.choices[0].message.content
- log_text(f"block_output_{i+1}.txt", raw)
- clips = extract_json(raw)
-
- print(f"✅ {len(clips)} Clips empfangen in Block {i+1}")
-
- for clip in clips:
- try:
- dur = float(clip["end"]) - float(clip["start"])
- if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
- clip["duration"] = round(dur, 2)
- all_clips.append(clip)
- except Exception as e:
- append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
-
- print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")
-
- # ETA berechnen
- elapsed = time.perf_counter() - start_time
- avg_time = elapsed / (i + 1)
- eta = avg_time * (len(blocks) - (i + 1))
- print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
-
- except Exception as e:
- append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
- print(f"❌ Fehler bei Block {i+1}: {e}")
-
-# === DB SPEICHERN
-conn = sqlite3.connect(DB_PATH)
-cur = conn.cursor()
-cur.execute("DROP TABLE IF EXISTS segments")
-cur.execute("""
-CREATE TABLE segments (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- file TEXT,
- start REAL,
- end REAL,
- duration REAL,
- text TEXT,
- summary TEXT
-)
-""")
-
-inserted = 0
-failed = 0
-for clip in all_clips:
- try:
- start = float(clip["start"])
- end = float(clip["end"])
- duration = float(clip["duration"])
- summary = clip.get("summary", "")
- # debug=True für print aller Segment-Texte pro Clip
- original_text = get_original_text(clip, segments, debug=False)
- if end <= start or start < 0:
- raise ValueError("Ungültige Zeiten")
- cur.execute(
- "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
- (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
- )
- inserted += 1
- except Exception as e:
- failed += 1
- append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
-
-conn.commit()
-conn.close()
-
-print("\n📊 Ergebnisse:")
-print(f" ✅ Clips gespeichert: {inserted}")
-print(f" ❌ Fehlerhafte Clips: {failed}")
-print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")
diff --git a/code/text/transcription.py b/code/text/transcription.py
deleted file mode 100644
index 82ee81d..0000000
--- a/code/text/transcription.py
+++ /dev/null
@@ -1,108 +0,0 @@
-# transcription_chunked.py
-import whisper
-from pathlib import Path
-import os
-import json
-import ffmpeg
-import tempfile
-
-# === Einstellungen ===
-input_file = Path("input/testVideoShort.mov")
-output_dir = Path("transkripte")
-output_dir.mkdir(parents=True, exist_ok=True)
-
-output_txt = output_dir / f"{input_file.stem}_timed.txt"
-output_json = output_dir / f"{input_file.stem}_segments.json"
-suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"
-
-CHUNKS = 4 # Anzahl Chunks (anpassen!)
-OVERLAP = 2.0 # Sekunden Überlappung
-
-os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")
-
-probe = ffmpeg.probe(str(input_file))
-duration = float(probe["format"]["duration"])
-print(f"🎥 Videolänge: {duration:.2f} Sekunden")
-
-def extract_audio_chunk(start_time, duration, output_path):
- ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
- str(output_path),
- format="wav",
- acodec="pcm_s16le",
- ac=1,
- ar="16000",
- loglevel="error"
- ).overwrite_output().run()
-
-def is_suspect(text):
- words = text.strip().lower().split()
- if not words:
- return True
- most_common = max([words.count(w) for w in set(words)])
- return most_common / len(words) > 0.6 or most_common > 20
-
-tmp_dir = Path(tempfile.mkdtemp())
-all_segments = []
-
-print(f"✂️ Teile Audio in {CHUNKS} Chunks ...")
-for i in range(CHUNKS):
- chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
- chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
- chunk_dur = chunk_end - chunk_start
- chunk_file = tmp_dir / f"chunk_{i}.wav"
- print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
- extract_audio_chunk(chunk_start, chunk_dur, chunk_file)
-
- print(f"🧠 Transkribiere Chunk {i+1} ...")
- model = whisper.load_model("small") # Wechsel zu "medium" oder "large" falls gewünscht
- result = model.transcribe(
- str(chunk_file),
- language="de",
- fp16=False,
- word_timestamps=False,
- condition_on_previous_text=True,
- temperature=0,
- verbose=False
- )
-
- segments = result["segments"]
- # Zeitversatz für den aktuellen Chunk hinzufügen
- offset = chunk_start
- for seg in segments:
- seg["start"] += offset
- seg["end"] += offset
- all_segments.extend(segments)
-
-# === Sortiere und filtere doppelte/überlappende Segmente
-all_segments.sort(key=lambda x: x["start"])
-
-def segment_hash(seg):
- return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())
-
-unique_segments = []
-seen = set()
-for seg in all_segments:
- h = segment_hash(seg)
- if h not in seen:
- seen.add(h)
- unique_segments.append(seg)
-
-print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.")
-
-with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
- for seg in unique_segments:
- start = seg["start"]
- end = seg["end"]
- text = seg["text"].strip()
- line = f"[{start:.2f} – {end:.2f}] {text}\n"
- f.write(line) # IMMER ins Haupttranskript!
- if is_suspect(text):
- f_sus.write(line)
-
-
-print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}")
-print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}")
-
-with open(output_json, "w", encoding="utf-8") as f:
- json.dump(unique_segments, f, ensure_ascii=False, indent=2)
-print(f"💾 Segmentdaten gespeichert unter: {output_json}")
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..5bbd77c
--- /dev/null
+++ b/main.py
@@ -0,0 +1,233 @@
+#!/usr/bin/env python3
+"""Run the full Bachelor pipeline end-to-end with timing, errors, and flexible flags.
+
+Steps:
+ 1) transcription.py → Whisper transcripts (segments + timed words)
+ 2) segment_transcript.py → LLM selects highlight candidates → SQLite
+ 3) cutClips.py → export highlight_*.mp4 (raw clips)
+ 4) main_detect_faces.py → YOLO + MediaPipe → faces.json per clip
+ 5) make_segments.py → *_target_by_frame.json (center+side per frame)
+ 6) main_apply_crop.py → 9:16 crop with smoothing & optional audio mux
+ 7) rateCluster.py → (optional) LLM scoring (virality, emotion, ...)
+ 8) add_subtitles.py → (optional) word-cap subtitles burned in
+
+Usage examples:
+ python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o
+ python main.py --no-rate --no-subs
+"""
+
+from __future__ import annotations
+import argparse
+import os
+import sys
+import subprocess
+import time
+from datetime import datetime
+from pathlib import Path
+
+# --- Import project config ---
+try:
+ from config import (
+ PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
+ WHISPER_CACHE_DIR
+ )
+except Exception:
+ PROJECT_ROOT = Path(__file__).resolve().parent
+ sys.path.insert(0, str(PROJECT_ROOT))
+ from config import (
+ PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
+ WHISPER_CACHE_DIR
+ )
+
+LOGS_DIR = PROJECT_ROOT / "logs"
+LOGS_DIR.mkdir(parents=True, exist_ok=True)
+
+# --- paths to the pipeline scripts ---
+SCRIPTS = {
+ "transcription": str(PROJECT_ROOT / "src" / "text" / "transcription.py"),
+ "segment_transcript": str(PROJECT_ROOT / "src" / "text" / "segment_transcript.py"),
+ "cutClips": str(PROJECT_ROOT / "src" / "text" / "cutClips.py"),
+ "detect_faces": str(PROJECT_ROOT / "src" / "reformat" / "main_detect_faces.py"),
+ "make_segments": str(PROJECT_ROOT / "src" / "reformat" / "make_segments.py"),
+ "apply_crop": str(PROJECT_ROOT / "src" / "reformat" / "main_apply_crop.py"),
+ "rateCluster": str(PROJECT_ROOT / "src" / "text" / "rateCluster.py"),
+ "add_subtitles": str(PROJECT_ROOT / "src" / "subtitles" / "add_subtitles.py"),
+}
+
+def shlex_join(cmd):
+ return " ".join(str(c) for c in cmd)
+
+def run_step(cmd: list[str], name: str, env: dict[str, str] | None = None) -> float:
+ """Run a subprocess step, raise on error, return duration in seconds."""
+ t0 = time.perf_counter()
+ print(f"\n===== {name} =====")
+ print(" ", shlex_join(cmd))
+ cp = subprocess.run(cmd, env=env)
+ dt = time.perf_counter() - t0
+ if cp.returncode != 0:
+ print(f"❌ Fehler in {name} (Exit {cp.returncode}) nach {dt:.2f}s")
+ print(" → Prüfe das Logfile oben für Details und stelle sicher, dass Abhängigkeiten installiert sind:")
+ print(" - ffmpeg/ffprobe im PATH")
+ print(" - Python-Pakete: openai-whisper, torch, ffmpeg-python, ultralytics, opencv-python, mediapipe, moviepy, tqdm, numpy")
+ print(" - OPENAI_API_KEY gesetzt (für LLM-Schritte)")
+ raise SystemExit(cp.returncode)
+ print(f"✅ {name} in {dt:.2f}s")
+ return dt
+
+def infer_base_from_input(input_path: Path) -> str:
+ return input_path.stem
+
+def default_input() -> Path | None:
+ if not INPUT_DIR.exists():
+ return None
+ for p in sorted(INPUT_DIR.iterdir()):
+ if p.suffix.lower() in {".mp4", ".mov", ".mkv", ".m4v", ".mp3", ".wav"}:
+ return p
+ return None
+
+def main():
+ ap = argparse.ArgumentParser(description="Bachelor Pipeline Runner")
+ ap.add_argument("--input", type=str, default=None, help="Pfad zu Eingabedatei (Default: erstes File in data/input)")
+ ap.add_argument("--limit", type=int, default=10, help="Anzahl Highlights (cutClips)")
+ ap.add_argument("--whisper-model", type=str, default=os.getenv("WHISPER_MODEL", "small"))
+ ap.add_argument("--lang", type=str, default=None, help="Sprachcode (z. B. de)")
+ ap.add_argument("--openai-model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o"))
+ ap.add_argument("--pattern", type=str, default="highlight_*.mp4")
+ ap.add_argument("--overwrite", action="store_true")
+ ap.add_argument("--no-rate", action="store_true")
+ ap.add_argument("--no-subs", action="store_true")
+ ap.add_argument("--no-detect", action="store_true")
+ ap.add_argument("--no-make", action="store_true")
+ ap.add_argument("--no-apply", action="store_true")
+ ap.add_argument("--logfile", type=str, default=None)
+ args = ap.parse_args()
+
+ os.chdir(PROJECT_ROOT)
+
+ env = os.environ.copy()
+ env.setdefault("OPENAI_MODEL", args.openai_model)
+ env.setdefault("XDG_CACHE_HOME", str(WHISPER_CACHE_DIR))
+
+ if not env.get("OPENAI_API_KEY"):
+ print("⚠️ OPENAI_API_KEY ist nicht gesetzt – LLM-Schritte könnten fehlschlagen.")
+
+    # determine the input file
+ if args.input:
+ input_path = Path(args.input)
+ if not input_path.is_file():
+ candidate = INPUT_DIR / args.input
+ if candidate.is_file():
+ input_path = candidate
+ else:
+ raise SystemExit(f"Input nicht gefunden: {args.input}")
+ else:
+ picked = default_input()
+ if not picked:
+ raise SystemExit(f"Kein Input in {INPUT_DIR} gefunden. Bitte --input setzen.")
+ input_path = picked
+
+ base = infer_base_from_input(input_path)
+ print(f"📥 Input: {input_path}")
+ print(f"🔤 Whisper: {args.whisper_model} | 🌐 LLM: {env.get('OPENAI_MODEL')}")
+ print(f"🧩 Base: {base}")
+
+ # Logfile
+ if args.logfile:
+ log_path = Path(args.logfile)
+ else:
+ log_path = LOGS_DIR / f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
+
+    # tee: write to the logfile AND the console
+ try:
+ log_fh = open(log_path, "w", encoding="utf-8")
+ class _Tee:
+ def __init__(self, *streams): self.streams = streams
+ def write(self, data):
+ for s in self.streams:
+ try: s.write(data); s.flush()
+ except Exception: pass
+ def flush(self):
+ for s in self.streams:
+ try: s.flush()
+ except Exception: pass
+ sys.stdout = _Tee(sys.__stdout__, log_fh)
+ sys.stderr = _Tee(sys.__stderr__, log_fh)
+ print(f"📝 Logfile: {log_path}")
+ except Exception:
+ print(f"⚠️ Konnte Logfile nicht initialisieren: {log_path}")
+
+ durations = []
+ started = datetime.now()
+ print(f"🚀 Start: {started:%Y-%m-%d %H:%M:%S}")
+
+ try:
+ # 1) Transcription
+ t_args = [sys.executable, SCRIPTS["transcription"], "--input", str(input_path), "--model", args.whisper_model]
+ if args.lang: t_args += ["--lang", args.lang]
+ durations.append(("Transcription", run_step(t_args, "Transcription", env=env)))
+
+        # 2) LLM segmentation
+ st_args = [sys.executable, SCRIPTS["segment_transcript"], "--base", base]
+ durations.append(("Segment Transcript", run_step(st_args, "Segment Transcript", env=env)))
+
+        # 3) Cut highlights
+ cut_filename = input_path.name
+ cc_args = [sys.executable, SCRIPTS["cutClips"], "--file", cut_filename, "--limit", str(args.limit)]
+ durations.append(("Cut Clips", run_step(cc_args, "Cut Clips", env=env)))
+
+ # 4) Faces
+ if not args.no_detect:
+ df_args = [sys.executable, SCRIPTS["detect_faces"]]
+ durations.append(("Detect Faces", run_step(df_args, "Detect Faces", env=env)))
+ else:
+ print("⏭️ Detect Faces übersprungen.")
+
+ # 5) Make Targets
+ if not args.no_make:
+ ms_args = [sys.executable, SCRIPTS["make_segments"], "--pattern", args.pattern]
+ durations.append(("Make Targets", run_step(ms_args, "Make Targets", env=env)))
+ else:
+ print("⏭️ Make Targets übersprungen.")
+
+ # 6) Crop
+ if not args.no_apply:
+ ac_args = [sys.executable, SCRIPTS["apply_crop"], "--pattern", args.pattern, "--mux_audio"]
+ if args.overwrite: ac_args.append("--overwrite")
+ durations.append(("Apply Crop", run_step(ac_args, "Apply Crop", env=env)))
+ else:
+ print("⏭️ Apply Crop übersprungen.")
+
+        # 7) Rating
+ if not args.no_rate:
+ rc_args = [sys.executable, SCRIPTS["rateCluster"]]
+ durations.append(("Rate Clusters", run_step(rc_args, "Rate Clusters", env=env)))
+ else:
+ print("⏭️ Rate Clusters übersprungen.")
+
+        # 8) Subtitles
+ if not args.no_subs:
+ as_args = [sys.executable, SCRIPTS["add_subtitles"]]
+ durations.append(("Subtitles", run_step(as_args, "Subtitles", env=env)))
+ else:
+ print("⏭️ Subtitles übersprungen.")
+
+ except KeyboardInterrupt:
+ print("\n⛔ Abgebrochen (Ctrl+C).")
+ finally:
+ finished = datetime.now()
+ total = sum(dt for _, dt in durations)
+ print("\n======================== ZUSAMMENFASSUNG ============================")
+ for name, dt in durations:
+ print(f"✅ {name:<24} {dt:7.2f}s")
+ print("---------------------------------------------------------------------")
+ print(f"⏱️ Gesamtdauer: {total:.2f}s")
+ print(f"🕒 Start : {started:%Y-%m-%d %H:%M:%S}")
+ print(f"🕒 Ende : {finished:%Y-%m-%d %H:%M:%S}")
+ print(f"📂 Output:")
+ print(f" Raw Clips : {RAW_CLIPS_DIR}")
+ print(f" 9:16 : {CROPPED_DIR}")
+ print(f" Subbed : {SUBTITLED_DIR}")
+ print("=====================================================================")
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/src/main.py b/src/main.py
new file mode 100644
index 0000000..422cc64
--- /dev/null
+++ b/src/main.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python3
+"""
+Simple master script that runs all sub-scripts in sequence, without any arguments.
+"""
+import subprocess
+import sys
+from pathlib import Path
+
+# Order of the scripts to run (paths relative to this file)
+SCRIPTS = [
+ "text/transcription.py",
+ "text/segment_transcript.py",
+ "text/rateCluster.py",
+ "text/cutClips.py",
+ "reformat/track_faces_Yolo.py",
+ "reformat/detect_speaking_faces.py",
+ "reformat/crop_to_speaker.py",
+]
+
+
+def run_script(script_path: str):
+ """
+    Runs a Python script without any further arguments.
+ """
+ print(f"🔄 Running: {script_path}")
+ full_path = Path(__file__).parent / script_path
+ try:
+ subprocess.check_call([sys.executable, str(full_path)])
+ print(f"✔️ {script_path} erfolgreich abgeschlossen.\n")
+ except subprocess.CalledProcessError as e:
+ print(f"❌ Fehler in {script_path}: Rückgabecode {e.returncode}")
+ sys.exit(e.returncode)
+
+
+def main():
+ print("\n=== Starte komplette Podcast-Pipeline ===\n")
+ for script in SCRIPTS:
+ run_script(script)
+ print("✅ Alle Schritte erfolgreich abgeschlossen.")
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/reformat/main_apply_crop.py b/src/reformat/main_apply_crop.py
new file mode 100644
index 0000000..da71e4f
--- /dev/null
+++ b/src/reformat/main_apply_crop.py
@@ -0,0 +1,315 @@
+#!/usr/bin/env python3
+# src/reformat/main_apply_crop.py
+from __future__ import annotations
+import logging, json, math, subprocess, argparse
+from pathlib import Path
+from typing import Optional, Tuple, List, Dict, Any
+from collections import deque
+import sys
+
+import cv2
+import numpy as np
+
+# ── make the project root importable
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR, CROPPED_DIR
+
+# ==== Defaults (overridable via CLI) ========================================
+OUT_W_DEFAULT, OUT_H_DEFAULT = 1080, 1920 # 9:16
+DEBUG_SCALE_DEFAULT = 0.6
+MEDIAN_WIN_DEFAULT = 5
+EMA_ALPHA_DEFAULT = 0.22
+DEADBAND_PX_DEFAULT = 8.0
+SWITCH_COOLDOWN_FR_DEFAULT = 12
+ZOOM_PAD_FRAC_DEFAULT = 0.10
+
+USE_CUT_DETECT_DEFAULT = True
+CUT_CORR_THRESH_DEFAULT = 0.65
+CUT_COOLDOWN_DEFAULT = 6
+
+MUX_AUDIO_DEFAULT = True
+FFMPEG_BIN = "ffmpeg"
+# ============================================================================
+
+def clamp(v, lo, hi): return max(lo, min(hi, v))
+
+def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int,
+ out_w: int, out_h: int, zoom_pad_frac: float) -> tuple[int,int,int,int]:
+    """9:16 (out_w:out_h) crop around (cx, cy): no squeeze, zoom pad applied, kept inside the frame."""
+ target_ar = out_w / out_h
+ src_ar = src_w / src_h
+ if src_ar >= target_ar:
+ base_h = src_h
+ base_w = int(round(base_h * target_ar))
+ else:
+ base_w = src_w
+ base_h = int(round(base_w / target_ar))
+
+ desired_scale = 1.0 + zoom_pad_frac
+ s = min(desired_scale, src_w / base_w, src_h / base_h)
+ w = int(round(base_w * s))
+ h = int(round(base_h * s))
+ half_w, half_h = w // 2, h // 2
+
+ cx = clamp(cx, half_w, src_w - half_w)
+ cy = clamp(cy, half_h, src_h - half_h)
+ x = int(round(cx - half_w))
+ y = int(round(cy - half_h))
+ return x, y, w, h
+
+def draw_center(img, pt, color, label=None):
+ if pt is None: return
+ x, y = int(pt[0]), int(pt[1])
+ cv2.circle(img, (x, y), 6, color, -1)
+ if label:
+ cv2.putText(img, label, (x + 8, y - 8),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
+
+def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
+ a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
+ b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
+ ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
+ hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
+ cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
+ return float((cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + 1.0)/2.0)
+
+def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
+ cmd = [
+ FFMPEG_BIN, "-y",
+ "-i", str(src_video),
+ "-i", str(silent_video),
+ "-map", "1:v:0",
+ "-map", "0:a:0?",
+ "-c:v", "copy",
+ "-c:a", "aac", "-b:a", "192k",
+ "-shortest",
+ str(out_video),
+ ]
+ subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+def load_faces(name: str) -> List[Dict[str,Any]]:
+ p = FACE_COMBINED_DIR / f"{name}_faces.json"
+ if not p.exists(): return []
+ return json.loads(p.read_text(encoding="utf-8"))
+
+def load_target_map_or_segments(name: str, total_frames: int) -> List[Optional[int] | Dict]:
+    """
+    Prefers *_target_by_frame.json (list of dicts with t, cx, cy, w, h).
+    Fallback: *_segments.json (track ID per frame).
+    Returns a list with the same length as total_frames.
+    """
+ map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
+ if map_p.exists():
+ target = json.loads(map_p.read_text(encoding="utf-8"))
+        # if these are dicts (cx,cy,w,h per frame), just return them:
+ if target and isinstance(target[0], dict):
+ if len(target) < total_frames:
+ last = target[-1] if target else {"t":0,"cx":0.5,"cy":0.5,"w":0.6,"h":0.6}
+ target += [last] * (total_frames - len(target))
+ return target[:total_frames]
+        # if numeric IDs were stored instead, fall through to the segments logic below
+ seg_p = SEGMENTS_DIR / f"{name}_segments.json"
+ if seg_p.exists():
+ segs = json.loads(seg_p.read_text(encoding="utf-8"))
+ target_tid = [None]*total_frames
+ for s in segs:
+ a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
+ for t in range(max(0,a), min(total_frames, b+1)):
+ target_tid[t] = tid
+ return target_tid
+ return [None]*total_frames
+
+def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
+ if target_tid is None:
+ return fallback
+ faces = faces_frame.get("faces", [])
+ for f in faces:
+ if int(f.get("track_id", -1)) == int(target_tid):
+ x,y,w,h = f.get("bbox", [None,None,None,None])
+ if None not in (x,y,w,h):
+ return (float(x + w/2), float(y + h/2))
+ return fallback
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Apply 9:16 Auto-Crop auf Rohclips mit Face-/Target-Daten.")
+ p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: *.mp4)")
+ p.add_argument("--out_w", type=int, default=OUT_W_DEFAULT, help="Output-Breite (Default: 1080)")
+ p.add_argument("--out_h", type=int, default=OUT_H_DEFAULT, help="Output-Höhe (Default: 1920)")
+ p.add_argument("--zoom_pad", type=float, default=ZOOM_PAD_FRAC_DEFAULT, help="Zoom-Pad (0..1, Default 0.10)")
+ p.add_argument("--median", type=int, default=MEDIAN_WIN_DEFAULT, help="Median-Fenster (ungerade, >=3)")
+ p.add_argument("--ema", type=float, default=EMA_ALPHA_DEFAULT, help="EMA-Alpha (0..1)")
+ p.add_argument("--deadband", type=float, default=DEADBAND_PX_DEFAULT, help="Totband in Pixel")
+ p.add_argument("--switch_cd", type=int, default=SWITCH_COOLDOWN_FR_DEFAULT, help="Cooldown-Frames nach Trackwechsel")
+ p.add_argument("--cut_detect", action="store_true", default=USE_CUT_DETECT_DEFAULT, help="Szenenschnitt-Erkennung aktivieren")
+ p.add_argument("--cut_corr", type=float, default=CUT_CORR_THRESH_DEFAULT, help="Korrelation-Schwelle (0..1)")
+ p.add_argument("--cut_cd", type=int, default=CUT_COOLDOWN_DEFAULT, help="Cooldown-Frames nach Cut")
+ p.add_argument("--mux_audio", action="store_true", default=MUX_AUDIO_DEFAULT, help="Audio vom Original muxen")
+ p.add_argument("--debug", action="store_true", help="Debug-Overlay anzeigen (langsam)")
+ p.add_argument("--debug_scale", type=float, default=DEBUG_SCALE_DEFAULT, help="Skalierung Debug-Preview")
+ p.add_argument("--overwrite", action="store_true", help="Existierende Outputs überschreiben")
+ return p.parse_args()
+
+def main():
+ args = parse_args()
+ OUT_DIR = CROPPED_DIR
+ OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
+ clips = sorted(list(RAW_CLIPS_DIR.glob(args.pattern)))
+ if not clips:
+ print(f"⚠️ Keine Clips in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'")
+ return
+
+ print(f"🔎 {len(clips)} Clips gefunden …")
+ for video_path in clips:
+ name = video_path.stem
+ out_path = OUT_DIR / f"{name}_9x16.mp4"
+ if out_path.exists() and not args.overwrite:
+ print(f"⏭️ Skip (existiert): {out_path.name}")
+ continue
+
+ # Video öffnen
+ cap = cv2.VideoCapture(str(video_path))
+ if not cap.isOpened():
+ print(f"❌ Kann Video nicht öffnen: {video_path.name}")
+ continue
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+ total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+ # Face/Target laden
+ faces_all = load_faces(name)
+ if faces_all and len(faces_all) < total:
+ faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
+ target_by_frame = load_target_map_or_segments(name, total)
+
+ # Writer vorbereiten
+ writer = cv2.VideoWriter(str(out_path),
+ cv2.VideoWriter_fourcc(*"mp4v"),
+ fps, (args.out_w, args.out_h))
+
+ median_buf = deque(maxlen=max(3, args.median if args.median % 2 else args.median+1))
+ ema_center: Optional[Tuple[float,float]] = None
+ last_center: Optional[Tuple[float,float]] = (width/2, height/2)
+ switch_cooldown = 0
+
+ prev_small = None
+ cut_cd = 0
+
+ print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
+
+ for t in range(total):
+ ret, frame = cap.read()
+ if not ret: break
+
+            # determine the target:
+ desired = None
+ tgt = target_by_frame[t] if t < len(target_by_frame) else None
+
+            # Case A: target_by_frame.json with direct centers (dict)
+ if isinstance(tgt, dict) and all(k in tgt for k in ("cx","cy","w","h")):
+ desired = (float(tgt["cx"])*width, float(tgt["cy"])*height)
+ else:
+                # Case B: numeric track ID
+ target_tid = tgt if tgt is None or isinstance(tgt, (int, float)) else None
+ faces_fr = faces_all[t] if (faces_all and t < len(faces_all)) else {"faces":[]}
+ desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
+
+            # scene cut?
+ if args.cut_detect:
+ small = cv2.resize(frame, (128, 72))
+ if prev_small is not None:
+ corr = scene_corr(prev_small, small)
+ if corr < args.cut_corr:
+ ema_center = desired
+ last_center = desired
+ switch_cooldown = args.switch_cd
+ cut_cd = args.cut_cd
+ prev_small = small
+
+            # median filter
+ median_buf.append(desired)
+ if len(median_buf) >= 3:
+ xs = sorted(p[0] for p in median_buf)
+ ys = sorted(p[1] for p in median_buf)
+ m = len(median_buf)//2
+ desired_f = (xs[m], ys[m])
+ else:
+ desired_f = desired
+
+            # detect a track switch (only reliable with track IDs)
+ if t > 0:
+ prev_tgt = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
+ else:
+ prev_tgt = tgt
+ is_switch = (not isinstance(tgt, dict)) and (tgt != prev_tgt)
+
+ if ema_center is None:
+ ema_center = desired_f
+ if last_center is None:
+ last_center = desired_f
+
+ if is_switch:
+ ema_center = desired_f
+ last_center = desired_f
+ switch_cooldown = args.switch_cd
+ else:
+ dx = desired_f[0] - ema_center[0]
+ dy = desired_f[1] - ema_center[1]
+ dist = math.hypot(dx, dy)
+ if cut_cd > 0:
+ ema_center = desired_f
+ cut_cd -= 1
+ else:
+ if dist > args.deadband:
+ ema_center = (ema_center[0] + dx*args.ema,
+ ema_center[1] + dy*args.ema)
+
+ last_center = desired_f
+
+            # apply the 9:16 crop
+ x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height,
+ args.out_w, args.out_h, args.zoom_pad)
+ cropped = frame[y:y+h, x:x+w]
+ if cropped.size == 0: cropped = frame
+ final = cv2.resize(cropped, (args.out_w, args.out_h), interpolation=cv2.INTER_AREA)
+ writer.write(final)
+
+ if args.debug:
+ dbg = frame.copy()
+ cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
+ draw_center(dbg, desired, (128,128,255), "desired")
+ draw_center(dbg, desired_f, (255,255, 0), "median")
+ draw_center(dbg, ema_center, ( 0,255,255), "ema")
+ cv2.putText(dbg, f"t={t+1}/{total}", (12, height-14),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
+ disp = cv2.resize(dbg, (int(width*args.debug_scale), int(height*args.debug_scale)))
+ cv2.imshow("Apply Debug", disp)
+ if cv2.waitKey(1) & 0xFF == ord("q"):
+ print("🛑 Abgebrochen (q).")
+ break
+
+ writer.release()
+ cap.release()
+
+        # mux audio?
+ if args.mux_audio:
+ tmp = out_path.with_suffix(".tmp.mp4")
+ try:
+ out_path.rename(tmp)
+ mux_audio_from_source(video_path, tmp, out_path)
+ finally:
+ if tmp.exists():
+ try: tmp.unlink()
+ except: pass
+ print(f"✅ Fertig (mit Audio): {out_path.name}")
+ else:
+ print(f"✅ Fertig: {out_path.name}")
+
+ if args.debug:
+ cv2.destroyAllWindows()
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/main_detect_faces.py b/src/reformat/main_detect_faces.py
new file mode 100644
index 0000000..44f0300
--- /dev/null
+++ b/src/reformat/main_detect_faces.py
@@ -0,0 +1,335 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Face-Detection + Mouth-Openness (YOLOv8-face + MediaPipe)
+- liest Rohclips aus RAW_CLIPS_DIR
+- schreibt pro Video eine faces.json in FACE_COMBINED_DIR
+- optionaler Fortschrittsbalken (tqdm)
+"""
+
+from __future__ import annotations
+import argparse
+import logging
+import json
+import time
+from pathlib import Path
+from contextlib import nullcontext
+from typing import List, Dict, Any
+import cv2
+import numpy as np
+import torch
+from ultralytics import YOLO
+import mediapipe as mp
+import sys
+
+# ── load project root + central paths
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR  # central directories
+# imported after ROOT is on sys.path so the src package resolves when run as a script
+from src.reformat.speaking import get_mouth_openness
+
+# nicer progress output if tqdm is available
+try:
+ from tqdm import tqdm
+ _HAS_TQDM = True
+except Exception:
+ _HAS_TQDM = False
+
+# ---------- Performance Tweaks ----------
+torch.set_float32_matmul_precision("high")
+cv2.setUseOptimized(True)
+
+# ---------- Helper functions ----------
+def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
+ cx = (x1 + x2) * 0.5
+ cy = (y1 + y2) * 0.5
+ w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
+ h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
+ side = max(w, h, float(min_crop))
+ half = side * 0.5
+
+ sx1 = int(max(0, round(cx - half)))
+ sy1 = int(max(0, round(cy - half)))
+ sx2 = int(min(W, round(cx + half)))
+ sy2 = int(min(H, round(cy + half)))
+
+ side_w = max(0, sx2 - sx1)
+ side_h = max(0, sy2 - sy1)
+ side = max(2, min(side_w, side_h))
+ sx2 = sx1 + side
+ sy2 = sy1 + side
+ return sx1, sy1, sx2, sy2
+
+
+def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
+ if not lm_lists:
+ return None
+ cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
+ best, best_d = None, 1e12
+ for lms in lm_lists:
+ xs = [p.x * crop_w for p in lms.landmark]
+ ys = [p.y * crop_h for p in lms.landmark]
+ cx = sum(xs) / len(xs)
+ cy = sum(ys) / len(ys)
+ d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
+ if d < best_d:
+ best, best_d = lms, d
+ return best
+
+
+def run_mesh(face_mesh, crop_bgr, upscale_if_small):
+ if crop_bgr.size == 0:
+ return None, 0.0
+ ch, cw = crop_bgr.shape[:2]
+ if max(ch, cw) < upscale_if_small:
+ scale = float(upscale_if_small) / max(ch, cw)
+ new_w = max(1, int(round(cw * scale)))
+ new_h = max(1, int(round(ch * scale)))
+ crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+ ch, cw = new_h, new_w
+ rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
+ res = face_mesh.process(rgb)
+ if not res.multi_face_landmarks:
+ return None, 0.0
+ chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
+ if chosen is None:
+ return None, 0.0
+ mo = get_mouth_openness(chosen.landmark, ch)
+ return chosen, float(mo)
+
+# ---------- Core processing ----------
+def process_video(video_path: Path,
+ output_path: Path,
+ model: YOLO,
+ face_mesh,
+ conf_thresh: float,
+ frame_skip: int,
+ downscale: float,
+ expansion_1: float,
+ expansion_2: float,
+ min_crop: int,
+ faces_upscale: int,
+ imgsz: int,
+ device: str,
+ max_det: int):
+ print(f"🎬 Starte Detection: {video_path.name}")
+ cap = cv2.VideoCapture(str(video_path))
+ if not cap.isOpened():
+ logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
+ return
+
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+ orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+ total_to_process = None
+ if total_frames_raw > 0:
+ total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
+
+ scaled_w = max(1, int(round(orig_w * downscale)))
+ scaled_h = max(1, int(round(orig_h * downscale)))
+
+ data: List[Dict[str, Any]] = []
+ frame_idx = 0
+ processed_frames = 0
+
+ sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
+ sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
+
+ autocast_ctx = (
+ torch.autocast(device_type=device, dtype=torch.float16)
+ if device in ("mps", "cuda") else nullcontext()
+ )
+
+ bar = None
+ start_t = time.time()
+ if _HAS_TQDM and total_to_process:
+ bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
+
+ while True:
+ ret, frame = cap.read()
+ if not ret:
+ break
+
+ if frame_skip > 1 and (frame_idx % frame_skip != 0):
+ frame_idx += 1
+ continue
+
+ frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
+
+ with torch.no_grad():
+ with autocast_ctx:
+                # Ultralytics 8 API: __call__ instead of .predict() (both work)
+ result = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
+ conf=conf_thresh, iou=0.5, max_det=max_det)
+ detections = result[0]
+
+ faces = []
+ for i in range(len(detections.boxes)):
+ box = detections.boxes[i]
+ conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
+ if conf < conf_thresh:
+ continue
+ x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
+ if downscale != 1.0:
+ x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
+ x1 = max(0.0, min(x1, orig_w - 1))
+ y1 = max(0.0, min(y1, orig_h - 1))
+ x2 = max(0.0, min(x2, orig_w - 1))
+ y2 = max(0.0, min(y2, orig_h - 1))
+
+ w = max(1.0, x2 - x1)
+ h = max(1.0, y2 - y1)
+ cx = x1 + w / 2.0
+ cy = y1 + h / 2.0
+
+ # Pass 1
+ sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
+ if sx2 - sx1 < 4 or sy2 - sy1 < 4:
+ continue
+ face_crop = frame[sy1:sy2, sx1:sx2]
+ _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
+
+                # Pass 2 only if needed
+ if mouth_open == 0.0:
+ sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
+ if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
+ face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
+ _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
+
+ faces.append({
+ "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
+ "conf": round(conf, 3),
+ "center": [round(cx, 1), round(cy, 1)],
+ "mouth_openness": round(float(mouth_open), 3)
+ })
+
+ data.append({
+ "frame": frame_idx,
+ "timestamp": round(frame_idx / fps, 3),
+ "W": orig_w,
+ "H": orig_h,
+ "faces": faces
+ })
+ frame_idx += 1
+ processed_frames += 1
+
+            # progress reporting
+ if bar is not None:
+ bar.update(1)
+ else:
+ if processed_frames % 30 == 0:
+ elapsed = time.time() - start_t
+ rate = processed_frames / max(1e-6, elapsed) # frames/sec
+ if total_to_process:
+ remaining = max(0, total_to_process - processed_frames)
+ eta_sec = remaining / max(1e-6, rate)
+ print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
+ f"({processed_frames/total_to_process*100:.1f}%) "
+ f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
+ else:
+ print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
+
+ cap.release()
+ if bar is not None:
+ bar.close()
+
+    # pretty-printed JSON
+ output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
+ print(f"✅ Faces gespeichert: {output_path.name}")
+
+# ---------- CLI ----------
+def parse_args():
+ p = argparse.ArgumentParser(description="YOLOv8-face + MediaPipe FaceMesh → faces.json pro Clip")
+    # directories (defaults come from config.py)
+ p.add_argument("--input-dir", type=Path, default=RAW_CLIPS_DIR, help=f"Rohclips (Default: {RAW_CLIPS_DIR})")
+ p.add_argument("--output-dir", type=Path, default=FACE_COMBINED_DIR, help=f"Zielordner (Default: {FACE_COMBINED_DIR})")
+ # Modell
+ p.add_argument("--model", type=Path, default=ROOT / "models" / "yolov8n-face.pt",
+ help="Pfad zum YOLOv8-face Modell (.pt)")
+ # Optimierte Defaults
+ p.add_argument("--conf-thresh", type=float, default=0.35)
+ p.add_argument("--frame-skip", type=int, default=1, help="Nur jeden n-ten Frame verarbeiten")
+ p.add_argument("--downscale", type=float, default=0.5, help="Eingangsframe auf Faktor verkleinern (0..1)")
+ p.add_argument("--expansion", type=float, default=0.4, help="Crop-Margin Pass 1 (relativ)")
+ p.add_argument("--expansion2", type=float, default=0.8, help="Crop-Margin Pass 2 (relativ)")
+ p.add_argument("--min-crop", type=int, default=160, help="Minimaler Croprand in Pixeln (quadratisch)")
+ p.add_argument("--faces-upscale", type=int, default=192, help="Minimale Kantenlänge für FaceMesh (bei kleineren Crops upscalen)")
+ p.add_argument("--imgsz", type=int, default=448)
+ p.add_argument("--max-det", type=int, default=20)
+ p.add_argument("--use-refine", action="store_true", default=False, help="MediaPipe mit refine_landmarks")
+ return p.parse_args()
+
+def main():
+ args = parse_args()
+
+ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
+ args.output_dir.mkdir(parents=True, exist_ok=True)
+
+ # YOLO Modell & Device
+ yolo = YOLO(str(args.model))
+ if torch.backends.mps.is_available():
+ device = "mps"
+ elif torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+ yolo.to(device)
+ print(f"🖥️ Inference-Device: {device}")
+
+ # Warmup
+ try:
+ with torch.no_grad():
+ dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
+ _ = yolo(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
+ except Exception:
+ pass
+
+ # Eingabedateien anzeigen
+ videos = sorted([*args.input_dir.glob("*.mp4"), *args.input_dir.glob("*.mov"), *args.input_dir.glob("*.mkv")])
+ print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
+ if not videos:
+ print("⚠️ Keine passenden Videos gefunden.")
+ return
+ print("📁 Dateien:")
+ for p in videos:
+ print(" →", p.name)
+
+ outer = None
+ if _HAS_TQDM:
+ outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
+
+ with mp.solutions.face_mesh.FaceMesh(
+ static_image_mode=False,
+ max_num_faces=10,
+ refine_landmarks=args.use_refine,
+ min_detection_confidence=0.5,
+ min_tracking_confidence=0.5
+ ) as face_mesh:
+ for vid in videos:
+ out = args.output_dir / f"{vid.stem}_faces.json"
+ process_video(
+ video_path=vid,
+ output_path=out,
+ model=yolo,
+ face_mesh=face_mesh,
+ conf_thresh=args.conf_thresh,
+ frame_skip=args.frame_skip,
+ downscale=args.downscale,
+ expansion_1=args.expansion,
+ expansion_2=args.expansion2,
+ min_crop=args.min_crop,
+ faces_upscale=args.faces_upscale,
+ imgsz=args.imgsz,
+ device=device,
+ max_det=args.max_det
+ )
+ if outer is not None:
+ outer.update(1)
+
+ if outer is not None:
+ outer.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/main_track_faces.py b/src/reformat/main_track_faces.py
new file mode 100644
index 0000000..258bf02
--- /dev/null
+++ b/src/reformat/main_track_faces.py
@@ -0,0 +1,78 @@
+#!/usr/bin/env python3
+import logging, json
+from pathlib import Path
+from typing import List, Dict, Any
+import sys
+
+# Projekt-Root verfügbar machen
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import FACE_COMBINED_DIR, FACE_CROP_CENTERS # ggf. SEGMENTS_DIR, wenn du dorthin schreibst
+
+
+def iou(boxA, boxB):
+ xA = max(boxA[0], boxB[0])
+ yA = max(boxA[1], boxB[1])
+ xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
+ yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
+ interW, interH = max(0, xB-xA), max(0, yB-yA)
+ inter = interW * interH
+ union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
+ return inter/union if union > 0 else 0.0
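+
+# Hedged sanity check (illustrative values, never called by the pipeline): two
+# 100x100 boxes offset by 50 px overlap on 5000 of 15000 union pixels, i.e. IoU is about 1/3.
+def _demo_iou() -> None:
+    assert abs(iou([0, 0, 100, 100], [50, 0, 100, 100]) - 1 / 3) < 1e-9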
+
+def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
+ next_id = 0
+ last_boxes = {} # track_id -> bbox
+ for frame in faces_all:
+ new_boxes = {}
+ for face in frame["faces"]:
+ box = face["bbox"]
+ # match gegen bestehende
+ best_id, best_iou = None, 0.0
+ for tid, prev_box in last_boxes.items():
+ ov = iou(box, prev_box)
+ if ov > best_iou:
+ best_id, best_iou = tid, ov
+ if best_iou > iou_thresh:
+ face["track_id"] = best_id
+ new_boxes[best_id] = box
+ else:
+ face["track_id"] = next_id
+ new_boxes[next_id] = box
+ next_id += 1
+ last_boxes = new_boxes
+ return faces_all
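+
+# Hedged usage sketch (illustrative boxes, not part of the pipeline): a box that keeps
+# overlapping its predecessor keeps its track id, a distant box opens a new one.
+def _demo_track_faces() -> None:
+    frames = [
+        {"faces": [{"bbox": [100, 100, 80, 80]}]},
+        {"faces": [{"bbox": [110, 100, 80, 80]}, {"bbox": [600, 100, 80, 80]}]},
+    ]
+    tracked = track_faces(frames)
+    assert tracked[0]["faces"][0]["track_id"] == 0   # first face opens track 0
+    assert tracked[1]["faces"][0]["track_id"] == 0   # IoU ≈ 0.78 > 0.3 keeps track 0
+    assert tracked[1]["faces"][1]["track_id"] == 1   # no overlap, new track id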
+
+def main():
+ # Eingabe: erkannte Gesichter/Tracks
+ FACE_DIR = FACE_COMBINED_DIR
+ # Ausgabe: z. B. berechnete Center pro Frame
+ OUT_DIR = FACE_CROP_CENTERS
+ OUT_DIR.mkdir(parents=True, exist_ok=True)
+
+ for f in FACE_DIR.glob("*_faces.json"):
+ try:
+ faces_all = json.loads(f.read_text(encoding="utf-8"))
+ except Exception as e:
+ print(f"❌ Fehler beim Laden {f.name}: {e}")
+ continue
+
+ tracked = track_faces(faces_all)
+ f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
+ print(f"✅ Track-IDs ergänzt: {f.name}")
+
+ # zusätzlich centers.json (dominant = höchster mouth_openness pro Frame)
+ centers = []
+ for fr in tracked:
+ if fr["faces"]:
+ best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
+ centers.append([best["center"][0], best["center"][1]])
+ else:
+ centers.append([fr["W"]/2, fr["H"]/2])
+        centers_path = OUT_DIR / (f.stem.replace("_faces", "_centers") + ".json")
+ centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
+ print(f"📝 Centers gespeichert: {centers_path.name}")
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/make_segments.py b/src/reformat/make_segments.py
new file mode 100644
index 0000000..1c438f5
--- /dev/null
+++ b/src/reformat/make_segments.py
@@ -0,0 +1,306 @@
+#!/usr/bin/env python3
+# make_segments.py — erzeugt pro Highlight eine Zielspur (target_by_frame.json) fürs Cropping
+
+from __future__ import annotations
+import json
+import argparse
+from dataclasses import dataclass
+from typing import List, Dict, Optional, Tuple
+from pathlib import Path
+import sys
+
+# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/reformat/)
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS, SEGMENTS_DIR
+
+try:
+ from moviepy.video.io.VideoFileClip import VideoFileClip
+ MOVIEPY_OK = True
+except Exception:
+ MOVIEPY_OK = False
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Hilfsstrukturen
+# ──────────────────────────────────────────────────────────────────────────────
+
+@dataclass
+class FaceDet:
+ t: float # Sekunden
+ cx: float # Zentrum x (0..1)
+ cy: float # Zentrum y (0..1)
+ w: float # Breite rel. (0..1)
+ h: float # Höhe rel. (0..1)
+ track_id: Optional[int] = None
+ mouth_prob: Optional[float] = None
+
+def moving_average(xs: List[float], win: int) -> List[float]:
+ if win <= 1 or len(xs) <= 2:
+ return xs[:]
+ # ungerade Fensterbreite erzwingen
+ win = win if win % 2 == 1 else win + 1
+ r = win // 2
+ out = []
+ for i in range(len(xs)):
+ a = max(0, i - r)
+ b = min(len(xs), i + r + 1)
+ out.append(sum(xs[a:b]) / (b - a))
+ return out
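+
+# Hedged sanity check (illustrative values): the window shrinks at the edges, so the
+# first and last samples are averaged over fewer neighbours.
+def _demo_moving_average() -> None:
+    assert moving_average([0, 10, 20, 30, 40], 3) == [5.0, 10.0, 20.0, 30.0, 35.0]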
+
+def clamp01(x: float) -> float:
+ return max(0.0, min(1.0, x))
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Lesen möglicher Eingabeformate (robust, schema-tolerant)
+# ──────────────────────────────────────────────────────────────────────────────
+
+def _parse_face_like(obj: Dict, t: float, W: float | None = None, H: float | None = None) -> FaceDet:
+ """
+ Erwartet entweder:
+ - bbox=[x,y,w,h] in Pixel → wird via W,H auf 0..1 normiert
+ - oder bereits normierte Felder cx,cy,w,h in 0..1
+ Optional: track_id, mouth_prob / mouth_open / talking_prob
+ """
+ if "bbox" in obj and isinstance(obj["bbox"], (list, tuple)) and len(obj["bbox"]) >= 4:
+ x, y, w, h = [float(v) for v in obj["bbox"][:4]]
+ if W and H and W > 0 and H > 0:
+ cx = (x + w * 0.5) / W
+ cy = (y + h * 0.5) / H
+ w = w / W
+ h = h / H
+ else:
+ # Falls Maße fehlen: best effort, danach clampen
+ cx = x + w * 0.5
+ cy = y + h * 0.5
+ cx, cy = clamp01(cx), clamp01(cy)
+ w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
+ else:
+ cx = float(obj.get("cx", 0.5))
+ cy = float(obj.get("cy", 0.5))
+ w = float(obj.get("w", 0.3))
+ h = float(obj.get("h", 0.3))
+ cx, cy = clamp01(cx), clamp01(cy)
+ w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
+
+ track_id = obj.get("track_id")
+ mouth_prob = obj.get("mouth_prob") or obj.get("mouth_open") or obj.get("talking_prob")
+ mouth_prob = None if mouth_prob is None else float(mouth_prob)
+
+ return FaceDet(t=t, cx=cx, cy=cy, w=w, h=h, track_id=track_id, mouth_prob=mouth_prob)
+
+
+def load_faces_or_centers(stem: str, fps_hint: float | None = None) -> List[FaceDet]:
+ """
+ Lädt die beste verfügbare Gesichts/Center-Quelle für ein Highlight.
+ Suchreihenfolge:
+ 1) FACE_COMBINED_DIR/{stem}_faces.json (Liste von Frames mit 'faces')
+ 2) FACE_CROP_CENTERS/{stem}_centers.json
+ - akzeptiert entweder [[cx,cy], ...] oder [{t,cx,cy,w,h}, ...]
+ """
+ candidates = [
+ (FACE_COMBINED_DIR / f"{stem}_faces.json", "faces"),
+ (FACE_CROP_CENTERS / f"{stem}_centers.json", "centers"),
+ ]
+ path = kind = None
+ for p, k in candidates:
+ if p.exists():
+ path, kind = p, k
+ break
+
+ if path is None:
+ print(f"⚠️ Keine Face/Centers-Datei gefunden für {stem}. Fallback später → (0.5,0.5).")
+ return []
+
+ try:
+ raw = path.read_text(encoding="utf-8")
+ data = json.loads(raw)
+ except Exception as e:
+ print(f"❌ Konnte {path.name} nicht lesen: {e}")
+ return []
+
+ dets: List[FaceDet] = []
+
+ # 1) Liste von Frames: [{ "W":..,"H":..,"timestamp"/"t":.., "faces":[...] }, ...]
+ if isinstance(data, list) and data and isinstance(data[0], dict) and "faces" in data[0]:
+ for fr in data:
+ W = float(fr.get("W") or 0.0)
+ H = float(fr.get("H") or 0.0)
+ t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
+ for f in fr.get("faces", []):
+ dets.append(_parse_face_like(f, t, W, H))
+
+ # 2) Dict mit "frames": [...]
+ elif isinstance(data, dict) and "frames" in data:
+ for fr in data["frames"]:
+ W = float(fr.get("W") or 0.0)
+ H = float(fr.get("H") or 0.0)
+ t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
+ for f in fr.get("faces", []):
+ dets.append(_parse_face_like(f, t, W, H))
+
+ # 3) centers.json als Liste von Listen: [[cx,cy], ...]
+ elif isinstance(data, list) and data and isinstance(data[0], (list, tuple)) and len(data[0]) >= 2:
+ fps = float(fps_hint or 25.0)
+ for i, pair in enumerate(data):
+ cx, cy = float(pair[0]), float(pair[1])
+ dets.append(FaceDet(t=i / fps, cx=clamp01(cx), cy=clamp01(cy), w=0.6, h=0.6))
+
+ # 4) Liste von Dicts mit evtl. bereits normierten Feldern
+ elif isinstance(data, list) and data and isinstance(data[0], dict):
+ for item in data:
+ t = float(item.get("t") or item.get("time") or 0.0)
+ dets.append(_parse_face_like(item, t))
+
+ else:
+ print(f"⚠️ Unbekanntes JSON-Format in {path.name}.")
+ return []
+
+ # filtern & sortieren
+ dets = [d for d in dets if 0.0 <= d.cx <= 1.0 and 0.0 <= d.cy <= 1.0]
+ dets.sort(key=lambda d: d.t)
+ print(f"✅ {len(dets)} Detektionen aus {path.name} ({kind}).")
+ return dets
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Zielspur berechnen
+# ──────────────────────────────────────────────────────────────────────────────
+
+def build_target_by_frame(
+ faces: List[FaceDet],
+ duration: float,
+ fps: float,
+ smooth_win: int = 7
+) -> List[Dict]:
+ """
+ Wählt pro Frame eine Zielposition (cx,cy,w,h).
+ Heuristik:
+ - bevorzuge Gesicht mit höchster mouth_prob (wenn vorhanden),
+ - sonst größtes Bounding-Box-Areal (w*h),
+ - halte IDs stabil (nicht zu häufige Sprünge).
+ Anschließend leichte Glättung (Moving Average) der Zentren/Größen.
+ """
+ if fps <= 0:
+ fps = 25.0
+ total_frames = max(1, int(round(duration * fps)))
+ if not faces:
+ # Fallback: center track
+ return [{"frame": i, "t": round(i / fps, 4), "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6} for i in range(total_frames)]
+
+ frame_targets: List[Tuple[float, float, float, float]] = [] # (cx, cy, w, h)
+ last_track: Optional[int] = None
+
+ # lineare Suche über faces (bei Bedarf später bucketisieren)
+ for i in range(total_frames):
+ t = i / fps
+ lo, hi = t - 1.0 / fps, t + 1.0 / fps
+
+ cand: List[FaceDet] = [d for d in faces if lo <= d.t <= hi]
+ if not cand:
+ # Nimm den zeitlich nächsten
+ nearest = min(faces, key=lambda d: abs(d.t - t))
+ cand = [nearest]
+
+ def score(d: FaceDet) -> Tuple[float, float, float]:
+ mouth = -1.0 if d.mouth_prob is None else float(d.mouth_prob) # None schlechter als 0
+ area = float(d.w) * float(d.h)
+ stable = 1.0 if (last_track is not None and d.track_id == last_track) else 0.0
+ return (mouth, area, stable)
+
+ cand.sort(key=score, reverse=True)
+ best = cand[0]
+ if best.track_id is not None:
+ last_track = best.track_id
+ frame_targets.append((best.cx, best.cy, best.w, best.h))
+
+ # Glätten
+ cxs = moving_average([c for c, _, _, _ in frame_targets], smooth_win)
+ cys = moving_average([c for _, c, _, _ in frame_targets], smooth_win)
+ ws = moving_average([w for *_, w, _ in frame_targets], max(3, smooth_win // 2))
+ hs = moving_average([h for *_, _, h in frame_targets], max(3, smooth_win // 2))
+
+ out = []
+ for i, (cx, cy, w, h) in enumerate(zip(cxs, cys, ws, hs)):
+ t = i / fps
+ out.append({
+ "frame": i,
+ "t": round(t, 4),
+ "cx": round(clamp01(cx), 4),
+ "cy": round(clamp01(cy), 4),
+ "w": round(max(0.05, min(1.0, w)), 4),
+ "h": round(max(0.05, min(1.0, h)), 4),
+ })
+ return out
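+
+# Hedged sanity check of the fallback path (not called by the pipeline): without any
+# detections every frame gets the static, centred 0.6x0.6 target window.
+def _demo_build_target_by_frame() -> None:
+    track = build_target_by_frame([], duration=0.2, fps=10.0)
+    assert len(track) == 2
+    assert track[0] == {"frame": 0, "t": 0.0, "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6}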
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# I/O
+# ──────────────────────────────────────────────────────────────────────────────
+
+def write_target_json(stem: str, target: List[Dict]) -> Path:
+ SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
+ out_path = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
+ out_path.write_text(json.dumps(target, ensure_ascii=False, indent=2), encoding="utf-8")
+ return out_path
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# CLI / Main
+# ──────────────────────────────────────────────────────────────────────────────
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Erzeugt target_by_frame.json aus Face/Center-Detektionen für Cropping.")
+ p.add_argument("--pattern", type=str, default="highlight_*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: highlight_*.mp4)")
+ p.add_argument("--fps", type=float, default=0.0, help="FPS erzwingen (0 = aus Video lesen).")
+ p.add_argument("--smooth", type=int, default=7, help="Fensterbreite für Moving-Average-Glättung (ungerade).")
+ p.add_argument("--overwrite", action="store_true", help="Existierende target_by_frame.json überschreiben.")
+ return p.parse_args()
+
+
+def main():
+ if not MOVIEPY_OK:
+ raise RuntimeError("moviepy ist nicht installiert. Bitte `pip install moviepy` ausführen.")
+
+ args = parse_args()
+
+ vids = sorted(RAW_CLIPS_DIR.glob(args.pattern))
+ if not vids:
+ print(f"⚠️ Keine Rohclips gefunden in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'.")
+ return
+
+ print(f"🔎 Finde {len(vids)} Clips …")
+
+ for vid in vids:
+ stem = vid.stem # z. B. highlight_3
+ out_json = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
+ if out_json.exists() and not args.overwrite:
+ print(f"⏭️ {out_json.name} existiert bereits – überspringe (nutze --overwrite zum Ersetzen).")
+ continue
+
+ # Video-Metadaten
+ try:
+ with VideoFileClip(str(vid)) as V:
+ duration = float(V.duration or 0.0)
+ fps = float(args.fps or (V.fps or 25.0))
+ except Exception as e:
+ print(f"❌ Kann Video {vid.name} nicht öffnen: {e} – Fallback duration/fps (10s/25fps).")
+ duration, fps = 10.0, (args.fps or 25.0)
+
+ # Face/Centers laden (fps_hint durchreichen, wichtig für centers-Listen)
+ faces = load_faces_or_centers(stem, fps_hint=fps)
+
+ # Zielspur bauen
+ target = build_target_by_frame(faces, duration=duration, fps=fps, smooth_win=args.smooth)
+
+ # Schreiben
+ out = write_target_json(stem, target)
+ print(f"💾 geschrieben: {out}")
+
+ print("🎉 Fertig.")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/new/analyze_mouth_activity.py b/src/reformat/new/analyze_mouth_activity.py
new file mode 100644
index 0000000..41f71e4
--- /dev/null
+++ b/src/reformat/new/analyze_mouth_activity.py
@@ -0,0 +1,118 @@
+#!/usr/bin/env python3
+# src/reformat/new/analyze_mouth_activity.py
+import logging
+from pathlib import Path
+from typing import List, Dict, Any, Tuple, Optional
+
+# OpenAI optional; aktuell nicht genutzt (Flag fehlt bewusst)
+# from openai import OpenAI
+
+# === HARTE DEFAULTS: einfach Play drücken ===
+PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
+RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
+FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+TIMED_DIR = PROJECT_ROOT / "data" / "transkripte"
+CENTERS_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
+
+def parse_timed_file(path: Path) -> List[Tuple[float, float]]:
+ """
+ Erwartet Zeilen wie:
+ [00:00.00 - 00:05.20] Text...
+ Gibt Liste [(start_sec, end_sec)] zurück. Falls keine Treffer: leere Liste.
+ """
+ import re
+ rx = re.compile(r"\[(\d+):(\d+)\.(\d+)\s*-\s*(\d+):(\d+)\.(\d+)\]")
+ segs = []
+ try:
+ for line in path.read_text(encoding="utf-8").splitlines():
+ m = rx.search(line)
+ if not m:
+ continue
+ smin, ssec, sms, emin, esec, ems = map(int, m.groups())
+ start = smin * 60 + ssec + sms / 100.0
+ end = emin * 60 + esec + ems / 100.0
+ if end > start:
+ segs.append((start, end))
+ except FileNotFoundError:
+ pass
+ return segs
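+
+# Hedged sanity check (illustrative transcript line, written to a throwaway temp file):
+# "[00:01.50 - 00:04.25] Hallo zusammen" parses to (1.5, 4.25); lines without a
+# timestamp bracket are ignored.
+def _demo_parse_timed_file() -> None:
+    import tempfile
+    with tempfile.NamedTemporaryFile("w", suffix="_timed.txt", delete=False) as tf:
+        tf.write("[00:01.50 - 00:04.25] Hallo zusammen\nkein Zeitstempel\n")
+        tmp = Path(tf.name)
+    assert parse_timed_file(tmp) == [(1.5, 4.25)]
+    tmp.unlink()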
+
+def select_speaker_center(faces: List[Dict[str, Any]]) -> Tuple[float, float]:
+ """Priorität: mouth_openness, Fallback: größte Fläche; sonst Bildmitte."""
+ if not faces:
+ return (960.0, 540.0)
+ def area(f):
+ bx = f.get("bbox",[0,0,0,0]); return float(bx[2]*bx[3])
+ best = max(
+ faces,
+ key=lambda f: (float(f.get("mouth_openness", 0.0)), area(f))
+ )
+ x, y, w, h = best["bbox"]
+ return (x + w/2.0, y + h/2.0)
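+
+# Hedged sanity check (illustrative faces): the face with the larger mouth_openness
+# wins even if it is smaller; an empty list falls back to the 1920x1080 frame centre.
+def _demo_select_speaker_center() -> None:
+    faces = [
+        {"bbox": [100, 100, 200, 200], "mouth_openness": 0.1},
+        {"bbox": [500, 100, 100, 100], "mouth_openness": 0.4},
+    ]
+    assert select_speaker_center(faces) == (550.0, 150.0)
+    assert select_speaker_center([]) == (960.0, 540.0)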
+
+def load_json(path: Path):
+ import json
+ return json.loads(path.read_text(encoding="utf-8"))
+
+def save_json(obj, path: Path):
+ import json
+ path.parent.mkdir(parents=True, exist_ok=True)
+ path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
+
+def process_one(base_name: str) -> bool:
+ faces_path = FACES_DIR / f"{base_name}_faces.json"
+ timed_path = TIMED_DIR / f"{base_name}_timed.txt"
+ centers_path = CENTERS_DIR / f"{base_name}_centers.json"
+
+ if not faces_path.exists():
+ logging.warning("Skip %-18s | Faces fehlen: %s", base_name, faces_path)
+ return False
+ if centers_path.exists():
+ logging.info("Skip %-18s | Centers existieren schon: %s", base_name, centers_path.name)
+ return True
+
+ try:
+ face_data: List[Dict[str, Any]] = load_json(faces_path)
+ except Exception as e:
+ logging.error("Fehler beim Lesen von %s: %s", faces_path, e)
+ return False
+
+ segments = parse_timed_file(timed_path)
+ if not segments:
+ logging.warning("[%s] Keine Segmente erkannt oder Datei fehlt: %s", base_name, timed_path.name)
+
+ centers: List[List[float]] = []
+ for entry in face_data:
+ faces = entry.get("faces", [])
+ cx, cy = select_speaker_center(faces)
+ centers.append([float(cx), float(cy)])
+
+ save_json(centers, centers_path)
+ logging.info("OK %-18s | Centers gespeichert: %s (frames=%d)", base_name, centers_path.name, len(centers))
+ return True
+
+def main():
+ logging.basicConfig(
+ format="%(asctime)s %(levelname)s: %(message)s",
+ level=logging.INFO
+ )
+
+ if not RAW_DIR.exists():
+ logging.error("RAW_DIR existiert nicht: %s", RAW_DIR)
+ return
+
+ clips = sorted(RAW_DIR.glob("*.mp4"))
+ if not clips:
+ logging.warning("Keine Clips gefunden in %s", RAW_DIR)
+ return
+
+ logging.info("Analyze (mouth) Batch-Mode: %d Clips", len(clips))
+ ok = 0
+ for clip in clips:
+ base = clip.stem
+ if process_one(base):
+ ok += 1
+ logging.info("Fertig. %d/%d Clips verarbeitet.", ok, len(clips))
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/new/main_apply_crop.py b/src/reformat/new/main_apply_crop.py
new file mode 100644
index 0000000..cf90a2c
--- /dev/null
+++ b/src/reformat/new/main_apply_crop.py
@@ -0,0 +1,294 @@
+#!/usr/bin/env python3
+# src/reformat/new/main_apply_crop.py
+from __future__ import annotations
+import logging, json, math, subprocess
+from pathlib import Path
+from typing import Optional, Tuple, List, Dict, Any
+from collections import deque
+
+import cv2
+import numpy as np
+
+# ==== Pfade =================================================================
+PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
+INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
+FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments"
+OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+OUT_W, OUT_H = 1080, 1920
+TARGET_AR = OUT_W / OUT_H # 0.5625
+
+# ==== Debug =================================================================
+DEBUG_MODE = False
+DEBUG_SCALE = 0.6
+DRAW_GUIDES = True
+
+# ==== Smooth / Switch =======================================================
+MEDIAN_WIN = 5
+EMA_ALPHA = 0.22
+DEADBAND_PX = 8.0
+SWITCH_COOLDOWN_FRAMES = 12 # kurze Ruhe nach Segmentwechsel
+ZOOM_PAD_FRAC = 0.10
+
+# ==== Scene-Cut-Erkennung ===================================================
+USE_CUT_DETECT = True
+CUT_CORR_THRESH = 0.65
+CUT_COOLDOWN = 6
+
+# ==== Audio-Mux =============================================================
+MUX_AUDIO = True
+FFMPEG_BIN = "ffmpeg"
+# ============================================================================
+
+def clamp(v, lo, hi): return max(lo, min(hi, v))
+
+def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int) -> tuple[int,int,int,int]:
+ """
+ Liefert ein 9:16-Croprechteck (x,y,w,h) um (cx,cy).
+ - AR bleibt IMMER exakt 9:16 (kein Squeeze)
+ - ZOOM_PAD_FRAC wirkt als uniformer Scale auf Breite und Höhe
+ - Rechteck bleibt vollständig im Bild
+ """
+ src_ar = src_w / src_h
+
+ if src_ar >= TARGET_AR:
+ base_h = src_h
+ base_w = int(round(base_h * TARGET_AR))
+ else:
+ base_w = src_w
+ base_h = int(round(base_w / TARGET_AR))
+
+ desired_scale = 1.0 + ZOOM_PAD_FRAC
+ max_scale_w = src_w / base_w
+ max_scale_h = src_h / base_h
+ s = min(desired_scale, max_scale_w, max_scale_h)
+
+ w = int(round(base_w * s))
+ h = int(round(base_h * s))
+
+ half_w, half_h = w // 2, h // 2
+
+ cx = clamp(cx, half_w, src_w - half_w)
+ cy = clamp(cy, half_h, src_h - half_h)
+
+ x = int(round(cx - half_w))
+ y = int(round(cy - half_h))
+ return x, y, w, h
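+
+# Hedged worked example (illustrative 1080p frame, centred target): the base 9:16 box
+# is 608x1080 (1080 * 0.5625 rounds to 608); ZOOM_PAD_FRAC cannot widen it further
+# because the height already spans the whole source, so the crop is (656, 0, 608, 1080).
+def _demo_compute_crop_rect() -> None:
+    assert compute_crop_rect(960.0, 540.0, 1920, 1080) == (656, 0, 608, 1080)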
+
+def draw_center(img, pt, color, label=None):
+ if pt is None: return
+ x, y = int(pt[0]), int(pt[1])
+ cv2.circle(img, (x, y), 6, color, -1)
+ if label:
+ cv2.putText(img, label, (x + 8, y - 8),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
+
+def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
+ a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
+ b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
+ ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
+ hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
+ cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
+ corr = cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL)
+ return float((corr + 1.0)/2.0)
+
+def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
+ cmd = [
+ FFMPEG_BIN, "-y",
+ "-i", str(src_video),
+ "-i", str(silent_video),
+ "-map", "1:v:0",
+ "-map", "0:a:0?",
+ "-c:v", "copy",
+ "-c:a", "aac", "-b:a", "192k",
+ "-shortest",
+ str(out_video),
+ ]
+ subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
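+
+# The argument list above corresponds roughly to this shell command (hedged example,
+# the file names are made up): the video stream is copied from the silent render
+# (input 1) and the audio, if the source has any, is re-encoded from the original
+# clip (input 0).
+#
+#   ffmpeg -y -i highlight_3.mp4 -i highlight_3_9x16.tmp.mp4 \
+#          -map 1:v:0 -map "0:a:0?" -c:v copy -c:a aac -b:a 192k -shortest \
+#          highlight_3_9x16.mp4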
+
+def load_faces(name: str) -> List[Dict[str,Any]]:
+ p = FACE_COMBINED_DIR / f"{name}_faces.json"
+ return json.loads(p.read_text(encoding="utf-8"))
+
+def load_segments(name: str, total_frames: int) -> List[Optional[int]]:
+ seg_p = SEGMENTS_DIR / f"{name}_segments.json"
+ map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
+ if map_p.exists():
+ target = json.loads(map_p.read_text(encoding="utf-8"))
+ if len(target) < total_frames:
+ target += [target[-1] if target else None] * (total_frames - len(target))
+ return target[:total_frames]
+ if seg_p.exists():
+ segs = json.loads(seg_p.read_text(encoding="utf-8"))
+ target = [None]*total_frames
+ for s in segs:
+ a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
+ for t in range(max(0,a), min(total_frames, b+1)):
+ target[t] = tid
+ return target
+ return [None]*total_frames
+
+def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
+ if target_tid is None:
+ return fallback
+ faces = faces_frame.get("faces", [])
+ for f in faces:
+ if int(f.get("track_id", -1)) == int(target_tid):
+ x,y,w,h = f.get("bbox", [None,None,None,None])
+ if None not in (x,y,w,h):
+ return (float(x + w/2), float(y + h/2))
+ return fallback
+
+def main():
+ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
+ clips = sorted(list(INPUT_VIDEO_DIR.glob("*.mp4")) + list(INPUT_VIDEO_DIR.glob("*.mov")))
+ if not clips:
+ print(f"⚠️ Keine Clips in {INPUT_VIDEO_DIR}")
+ return
+
+ for video_path in clips:
+ name = video_path.stem
+ faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
+ if not faces_path.exists():
+ print(f"⏭️ Skip (keine Faces): {faces_path.name}")
+ continue
+
+ cap = cv2.VideoCapture(str(video_path))
+ if not cap.isOpened():
+ print(f"❌ Kann Video nicht öffnen: {video_path.name}")
+ continue
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+ total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+
+ faces_all = load_faces(name)
+ if len(faces_all) < total:
+ faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
+
+ target_by_frame = load_segments(name, total)
+
+ out_path = OUTPUT_DIR / f"{name}_9x16.mp4"
+ if out_path.exists():
+ print(f"⏭️ Skip: Output existiert bereits → {out_path.name}")
+ cap.release()
+ continue
+
+ writer = cv2.VideoWriter(
+ str(out_path),
+ cv2.VideoWriter_fourcc(*"mp4v"),
+ fps,
+ (OUT_W, OUT_H)
+ )
+
+ median_buf = deque(maxlen=max(3, MEDIAN_WIN if MEDIAN_WIN % 2 else MEDIAN_WIN+1))
+ ema_center: Optional[Tuple[float,float]] = None
+ last_center: Optional[Tuple[float,float]] = (width/2, height/2)
+ switch_cooldown = 0
+
+ prev_small = None
+ cut_cd = 0
+
+ print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
+
+ for t in range(total):
+ ret, frame = cap.read()
+ if not ret: break
+
+ target_tid = target_by_frame[t] if t < len(target_by_frame) else None
+ faces_fr = faces_all[t] if t < len(faces_all) else {"faces":[]}
+ desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
+
+ if USE_CUT_DETECT:
+ small = cv2.resize(frame, (128, 72))
+ if prev_small is not None:
+ corr = scene_corr(prev_small, small)
+ if corr < CUT_CORR_THRESH:
+ ema_center = desired
+ last_center = desired
+ switch_cooldown = SWITCH_COOLDOWN_FRAMES
+ cut_cd = CUT_COOLDOWN
+ prev_small = small
+
+ median_buf.append(desired)
+ if len(median_buf) >= 3:
+ xs = sorted(p[0] for p in median_buf)
+ ys = sorted(p[1] for p in median_buf)
+ m = len(median_buf)//2
+ desired_f = (xs[m], ys[m])
+ else:
+ desired_f = desired
+
+ if t > 0:
+ prev_tid = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
+ else:
+ prev_tid = target_tid
+
+ if ema_center is None:
+ ema_center = desired_f
+ if last_center is None:
+ last_center = desired_f
+
+ if target_tid != prev_tid:
+ ema_center = desired_f
+ last_center = desired_f
+ switch_cooldown = SWITCH_COOLDOWN_FRAMES
+ else:
+ dx = desired_f[0] - ema_center[0]
+ dy = desired_f[1] - ema_center[1]
+ dist = math.hypot(dx, dy)
+ if cut_cd > 0:
+ ema_center = desired_f
+ cut_cd -= 1
+ else:
+ if dist > DEADBAND_PX:
+ ema_center = (ema_center[0] + dx*EMA_ALPHA,
+ ema_center[1] + dy*EMA_ALPHA)
+
+ last_center = desired_f
+
+ # neuer 9:16 Crop
+ x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height)
+ cropped = frame[y:y+h, x:x+w]
+ if cropped.size == 0: cropped = frame
+ final = cv2.resize(cropped, (OUT_W, OUT_H), interpolation=cv2.INTER_AREA)
+ writer.write(final)
+
+ if DEBUG_MODE:
+ dbg = frame.copy()
+ cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
+ if DRAW_GUIDES:
+ draw_center(dbg, desired, (128,128,255), "desired")
+ draw_center(dbg, desired_f, (255,255, 0), "median")
+ draw_center(dbg, ema_center, ( 0,255,255), "ema")
+ cv2.putText(dbg, f"t={t+1}/{total} tid={target_tid}",
+ (12, height-14), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
+ disp = cv2.resize(dbg, (int(width*DEBUG_SCALE), int(height*DEBUG_SCALE)))
+ cv2.imshow("Apply Debug", disp)
+ if cv2.waitKey(1) & 0xFF == ord("q"):
+ print("🛑 Abgebrochen (q).")
+ break
+
+ writer.release()
+ cap.release()
+
+ if MUX_AUDIO:
+ tmp = out_path.with_suffix(".tmp.mp4")
+ try:
+ out_path.rename(tmp)
+ mux_audio_from_source(video_path, tmp, out_path)
+            finally:
+                if tmp.exists():
+                    if out_path.exists():
+                        try:
+                            tmp.unlink()
+                        except OSError:
+                            pass
+                    else:
+                        # ffmpeg failed silently (check=False): keep the silent render
+                        tmp.rename(out_path)
+ print(f"✅ Fertig (mit Audio): {out_path.name}")
+ else:
+ print(f"✅ Fertig: {out_path.name}")
+
+ if DEBUG_MODE:
+ cv2.destroyAllWindows()
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/new/main_detect_faces.py b/src/reformat/new/main_detect_faces.py
new file mode 100644
index 0000000..12094ec
--- /dev/null
+++ b/src/reformat/new/main_detect_faces.py
@@ -0,0 +1,319 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+import argparse
+import logging
+import json
+import time
+from pathlib import Path
+from contextlib import nullcontext
+
+import cv2
+import numpy as np
+import torch
+from ultralytics import YOLO
+import mediapipe as mp
+
+# Fortschritt hübsch, wenn verfügbar
+try:
+ from tqdm import tqdm
+ _HAS_TQDM = True
+except Exception:
+ _HAS_TQDM = False
+
+from src.reformat.new.speaking import get_mouth_openness
+
+# ---------- Performance Tweaks ----------
+torch.set_float32_matmul_precision("high")
+cv2.setUseOptimized(True)
+
+# ---------- Hilfsfunktionen ----------
+
+def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
+ cx = (x1 + x2) * 0.5
+ cy = (y1 + y2) * 0.5
+ w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
+ h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
+ side = max(w, h, float(min_crop))
+ half = side * 0.5
+
+ sx1 = int(max(0, round(cx - half)))
+ sy1 = int(max(0, round(cy - half)))
+ sx2 = int(min(W, round(cx + half)))
+ sy2 = int(min(H, round(cy + half)))
+
+ side_w = max(0, sx2 - sx1)
+ side_h = max(0, sy2 - sy1)
+ side = max(2, min(side_w, side_h))
+ sx2 = sx1 + side
+ sy2 = sy1 + side
+ return sx1, sy1, sx2, sy2
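+
+# Hedged sanity check (illustrative values): a 100x100 box at (100,100) with a 0.4
+# margin on each side becomes a 180x180 square, well inside a 1920x1080 frame.
+def _demo_make_square_crop() -> None:
+    assert make_square_crop(100, 100, 200, 200, 1920, 1080, 0.4, 160) == (60, 60, 240, 240)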
+
+
+def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
+ if not lm_lists:
+ return None
+ cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
+ best, best_d = None, 1e12
+ for lms in lm_lists:
+ xs = [p.x * crop_w for p in lms.landmark]
+ ys = [p.y * crop_h for p in lms.landmark]
+ cx = sum(xs) / len(xs)
+ cy = sum(ys) / len(ys)
+ d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
+ if d < best_d:
+ best, best_d = lms, d
+ return best
+
+
+def run_mesh(face_mesh, crop_bgr, upscale_if_small):
+ if crop_bgr.size == 0:
+ return None, 0.0
+ ch, cw = crop_bgr.shape[:2]
+ if max(ch, cw) < upscale_if_small:
+ scale = float(upscale_if_small) / max(ch, cw)
+ new_w = max(1, int(round(cw * scale)))
+ new_h = max(1, int(round(ch * scale)))
+ crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
+ ch, cw = new_h, new_w
+ rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
+ res = face_mesh.process(rgb)
+ if not res.multi_face_landmarks:
+ return None, 0.0
+ chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
+ if chosen is None:
+ return None, 0.0
+ mo = get_mouth_openness(chosen.landmark, ch)
+ return chosen, float(mo)
+
+# ---------- Kernprozess ----------
+
+def process_video(video_path: Path,
+ output_path: Path,
+ model: YOLO,
+ face_mesh,
+ conf_thresh: float,
+ frame_skip: int,
+ downscale: float,
+ expansion_1: float,
+ expansion_2: float,
+ min_crop: int,
+ faces_upscale: int,
+ imgsz: int,
+ device: str,
+ max_det: int):
+ print(f"🎬 Starte Detection: {video_path.name}")
+ cap = cv2.VideoCapture(str(video_path))
+ if not cap.isOpened():
+ logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
+ return
+
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+ orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+ # Wenn frame_skip > 1, reduziert sich die tatsächlich verarbeitete Anzahl
+ total_to_process = None
+ if total_frames_raw > 0:
+ total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
+
+ scaled_w = max(1, int(round(orig_w * downscale)))
+ scaled_h = max(1, int(round(orig_h * downscale)))
+
+ data = []
+ frame_idx = 0
+ processed_frames = 0
+
+ sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
+ sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
+
+ autocast_ctx = (
+ torch.autocast(device_type=device, dtype=torch.float16)
+ if device in ("mps", "cuda") else nullcontext()
+ )
+
+ # Fortschrittsbalken pro Video
+ bar = None
+ start_t = time.time()
+ if _HAS_TQDM:
+ bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
+
+ while True:
+ ret, frame = cap.read()
+ if not ret:
+ break
+
+ if frame_skip > 1 and (frame_idx % frame_skip != 0):
+ frame_idx += 1
+ continue
+
+ frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
+
+ with torch.no_grad():
+ with autocast_ctx:
+ detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
+ conf=conf_thresh, iou=0.5, max_det=max_det)[0]
+
+ faces = []
+ for i in range(len(detections.boxes)):
+ box = detections.boxes[i]
+ conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
+ if conf < conf_thresh:
+ continue
+ x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
+ if downscale != 1.0:
+ x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
+ x1 = max(0.0, min(x1, orig_w - 1))
+ y1 = max(0.0, min(y1, orig_h - 1))
+ x2 = max(0.0, min(x2, orig_w - 1))
+ y2 = max(0.0, min(y2, orig_h - 1))
+
+ w = max(1.0, x2 - x1)
+ h = max(1.0, y2 - y1)
+ cx = x1 + w / 2.0
+ cy = y1 + h / 2.0
+
+ # Pass 1
+ sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
+ if sx2 - sx1 < 4 or sy2 - sy1 < 4:
+ continue
+ face_crop = frame[sy1:sy2, sx1:sx2]
+ _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
+
+ # Pass 2 nur wenn nötig
+ if mouth_open == 0.0:
+ sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
+ if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
+ face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
+ _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
+
+ faces.append({
+ "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
+ "conf": round(conf, 3),
+ "center": [round(cx, 1), round(cy, 1)],
+ "mouth_openness": round(float(mouth_open), 3)
+ })
+
+ data.append({
+ "frame": frame_idx,
+ "timestamp": round(frame_idx / fps, 3),
+ "W": orig_w,
+ "H": orig_h,
+ "faces": faces
+ })
+ frame_idx += 1
+ processed_frames += 1
+
+ # Fortschritt aktualisieren
+ if _HAS_TQDM:
+ bar.update(1)
+ else:
+ # leichter Fallback: ETA Ausgabe alle 30 verarbeitete Frames
+ if processed_frames % 30 == 0:
+ elapsed = time.time() - start_t
+ rate = processed_frames / max(1e-6, elapsed) # frames/sec
+ if total_to_process:
+ remaining = max(0, total_to_process - processed_frames)
+ eta_sec = remaining / max(1e-6, rate)
+ print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
+ f"({processed_frames/total_to_process*100:.1f}%) "
+ f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
+ else:
+ print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
+
+ cap.release()
+ if _HAS_TQDM and bar is not None:
+ bar.close()
+
+ output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
+ print(f"✅ Faces gespeichert: {output_path.name}")
+
+def main():
+ parser = argparse.ArgumentParser()
+ # Verzeichnisse
+ parser.add_argument("--input-dir", type=Path,
+ default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips"))
+ parser.add_argument("--output-dir", type=Path,
+ default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined"))
+ parser.add_argument("--model", type=Path,
+ default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt"))
+ # Optimierte Defaults (keine Presets nötig)
+ parser.add_argument("--conf-thresh", type=float, default=0.35)
+ parser.add_argument("--frame-skip", type=int, default=1)
+ parser.add_argument("--downscale", type=float, default=0.5)
+ parser.add_argument("--expansion", type=float, default=0.4)
+ parser.add_argument("--expansion2", type=float, default=0.8)
+ parser.add_argument("--min-crop", type=int, default=160)
+ parser.add_argument("--faces-upscale", type=int, default=192)
+ parser.add_argument("--imgsz", type=int, default=448)
+ parser.add_argument("--max-det", type=int, default=20)
+ parser.add_argument("--use-refine", action="store_true", default=False)
+ args = parser.parse_args()
+
+ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
+ args.output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Model & Device
+ yolo = YOLO(str(args.model))
+ if torch.backends.mps.is_available():
+ device = "mps"
+ elif torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+ yolo.to(device)
+ print(f"🖥️ Inference-Device: {device}")
+
+ # Warmup (reduziert Anlaufschwankungen)
+ try:
+ with torch.no_grad():
+ dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
+ _ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
+ except Exception:
+ pass
+
+ # Liste der Videos (für Gesamt-Fortschritt)
+ videos = sorted(args.input_dir.glob("*.mp4"))
+ print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
+ print("📁 Dateien:")
+ for p in sorted(args.input_dir.glob("*")):
+ print(" →", p.name)
+
+ # Gesamt-Fortschrittsbalken pro Datei
+ outer = None
+ if _HAS_TQDM:
+ outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
+
+ with mp.solutions.face_mesh.FaceMesh(
+ static_image_mode=False,
+ max_num_faces=10,
+ refine_landmarks=args.use_refine,
+ min_detection_confidence=0.5,
+ min_tracking_confidence=0.5
+ ) as face_mesh:
+ for vid in videos:
+ out = args.output_dir / f"{vid.stem}_faces.json"
+ process_video(
+ video_path=vid,
+ output_path=out,
+ model=yolo,
+ face_mesh=face_mesh,
+ conf_thresh=args.conf_thresh,
+ frame_skip=args.frame_skip,
+ downscale=args.downscale,
+ expansion_1=args.expansion,
+ expansion_2=args.expansion2,
+ min_crop=args.min_crop,
+ faces_upscale=args.faces_upscale,
+ imgsz=args.imgsz,
+ device=device,
+ max_det=args.max_det
+ )
+ if _HAS_TQDM and outer is not None:
+ outer.update(1)
+
+ if _HAS_TQDM and outer is not None:
+ outer.close()
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/new/main_track_faces.py b/src/reformat/new/main_track_faces.py
new file mode 100644
index 0000000..53d7347
--- /dev/null
+++ b/src/reformat/new/main_track_faces.py
@@ -0,0 +1,67 @@
+#!/usr/bin/env python3
+import logging, json
+from pathlib import Path
+from typing import List, Dict, Any
+
+def iou(boxA, boxB):
+ xA = max(boxA[0], boxB[0])
+ yA = max(boxA[1], boxB[1])
+ xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
+ yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
+ interW, interH = max(0, xB-xA), max(0, yB-yA)
+ inter = interW * interH
+ union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
+ return inter/union if union > 0 else 0.0
+
+def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
+ next_id = 0
+ last_boxes = {} # track_id -> bbox
+ for frame in faces_all:
+ new_boxes = {}
+ for face in frame["faces"]:
+ box = face["bbox"]
+ # match gegen bestehende
+ best_id, best_iou = None, 0.0
+ for tid, prev_box in last_boxes.items():
+ ov = iou(box, prev_box)
+ if ov > best_iou:
+ best_id, best_iou = tid, ov
+ if best_iou > iou_thresh:
+ face["track_id"] = best_id
+ new_boxes[best_id] = box
+ else:
+ face["track_id"] = next_id
+ new_boxes[next_id] = box
+ next_id += 1
+ last_boxes = new_boxes
+ return faces_all
+
+def main():
+ PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
+ FACE_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+
+ for f in FACE_DIR.glob("*_faces.json"):
+ try:
+ faces_all = json.loads(f.read_text(encoding="utf-8"))
+ except Exception as e:
+ print(f"❌ Fehler beim Laden {f.name}: {e}")
+ continue
+
+ tracked = track_faces(faces_all)
+ f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
+ print(f"✅ Track-IDs ergänzt: {f.name}")
+
+ # zusätzlich centers.json (dominant = höchster mouth_openness pro Frame)
+ centers = []
+ for fr in tracked:
+ if fr["faces"]:
+ best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
+ centers.append([best["center"][0], best["center"][1]])
+ else:
+ centers.append([fr["W"]/2, fr["H"]/2])
+ centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json")
+ centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
+ print(f"📝 Centers gespeichert: {centers_path.name}")
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/new/make_segments.py b/src/reformat/new/make_segments.py
new file mode 100644
index 0000000..c661485
--- /dev/null
+++ b/src/reformat/new/make_segments.py
@@ -0,0 +1,179 @@
+#!/usr/bin/env python3
+# src/reformat/new/make_segments.py
+from __future__ import annotations
+import json, math
+from pathlib import Path
+from dataclasses import dataclass
+from typing import List, Dict, Any, Optional, Tuple
+import numpy as np
+import cv2
+
+# ==== Pfade (an dein Projekt angepasst) =====================================
+PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
+RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" # Videos
+FACE_COMBINED_DIR= PROJECT_ROOT / "data" / "face_data_combined" # *_faces.json
+SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" # Output
+SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
+# ===========================================================================
+
+# === Segment-Parameter ===
+WIN_SEC = 1.2 # Fensterlänge
+STRIDE_SEC = 0.6 # Schrittweite
+HYSTERESIS_FACTOR = 1.25 # neuer Sprecher muss +25% besser sein
+MIN_SEG_SEC = 1.0 # kürzere Segmente werden an Nachbarn gemerged
+CONF_MIN = 0.35 # Sichtbarkeits-Threshold
+AREA_CAP_FRAC = 0.12 # ab 12% Framefläche kappen wir den Flächenbonus
+
+@dataclass
+class Segment:
+ start_f: int
+ end_f: int
+ track_id: Optional[int]
+
+def robust_minmax(vals, p_lo=5, p_hi=95):
+ v = np.array(vals, dtype=float)
+ lo, hi = np.percentile(v, [p_lo, p_hi])
+ if hi <= lo: hi = lo + 1e-6
+ return float(lo), float(hi)
+
+def score_face(face: Dict[str,Any], W: int, H: int, cx: float, cy: float,
+ lo: float, hi: float) -> float:
+ # Mundaktivität robust normalisieren
+ mo = float(face.get("mouth_openness", 0.0))
+ mo = (mo - lo) / (hi - lo + 1e-9)
+ mo = float(np.clip(mo, 0.0, 1.0))
+
+ x, y, w, h = map(float, face.get("bbox", [0,0,0,0]))
+ conf = float(face.get("conf", 1.0))
+ if conf < CONF_MIN or w <= 5 or h <= 5: # sehr kleine/unsichere Gesichter raus
+ return 0.0
+
+ area = (w*h) / (W*H + 1e-9)
+ size_w = min(1.0, area / AREA_CAP_FRAC) # Flächengewicht
+ fx = x + w/2; fy = y + h/2
+ dist = math.hypot(fx - cx, fy - cy) / math.hypot(W/2, H/2)
+ center_w = max(0.0, 1.0 - dist**2) # Mitte leicht bevorzugen
+
+ # MO dominiert, Fläche und Mitte geben Stabilität
+ return mo * (0.6 + 0.3*size_w + 0.1*center_w)
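+
+# Hedged sanity check (illustrative faces, mouth values assumed already normalised to
+# lo=0.0 / hi=1.0): a small centred talking face outscores a large silent one, because
+# mouth activity multiplies the whole score.
+def _demo_score_face() -> None:
+    talking = {"mouth_openness": 0.8, "conf": 0.9, "bbox": [760, 390, 400, 300]}
+    silent = {"mouth_openness": 0.0, "conf": 0.9, "bbox": [200, 100, 900, 700]}
+    s_talk = score_face(talking, 1920, 1080, 960, 540, lo=0.0, hi=1.0)
+    s_silent = score_face(silent, 1920, 1080, 960, 540, lo=0.0, hi=1.0)
+    assert s_silent == 0.0 and s_talk > 0.5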
+
+def build_segments_for_clip(faces_per_frame: List[Dict[str,Any]], fps: float) -> Tuple[List[Segment], List[Optional[int]]]:
+ T = len(faces_per_frame)
+ if T == 0:
+ return [], []
+
+ # Framegröße
+ W = faces_per_frame[0].get("W") or faces_per_frame[0].get("width")
+ H = faces_per_frame[0].get("H") or faces_per_frame[0].get("height")
+ if not W or not H:
+        # Fallback: derive the frame size from the maximum bbox extents
+        boxes = [f["bbox"] for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f]
+        W = int(max(b[0] + b[2] for b in boxes)) if boxes else 1920
+        H = int(max(b[1] + b[3] for b in boxes)) if boxes else 1080
+
+ # Mundwerte für robuste Normierung sammeln
+ all_mo = [float(f.get("mouth_openness", 0.0))
+ for fr in faces_per_frame for f in fr.get("faces", [])]
+ lo, hi = robust_minmax(all_mo) if all_mo else (0.0, 1.0)
+
+ win = max(1, int(round(WIN_SEC * fps)))
+ stride = max(1, int(round(STRIDE_SEC * fps)))
+ minseg = max(1, int(round(MIN_SEG_SEC * fps)))
+
+ chosen_by_frame: List[Optional[int]] = [None]*T
+ last_track: Optional[int] = None
+
+ for start in range(0, T, stride):
+ end = min(T, start + win)
+ sums: Dict[int, float] = {}
+ for t in range(start, end):
+ faces = faces_per_frame[t].get("faces", [])
+ if not faces: continue
+ for face in faces:
+ tid = face.get("track_id")
+ if tid is None:
+ continue
+ s = score_face(face, W, H, W/2, H/2, lo, hi)
+ if s <= 0:
+ continue
+ tid = int(tid)
+ sums[tid] = sums.get(tid, 0.0) + s
+
+ if not sums:
+ chosen = last_track
+ else:
+ best_tid, best_val = max(sums.items(), key=lambda kv: kv[1])
+ if last_track is None:
+ chosen = best_tid
+ else:
+ prev_val = sums.get(last_track, 0.0)
+ chosen = best_tid if best_val > prev_val * HYSTERESIS_FACTOR else last_track
+
+ for t in range(start, end):
+ chosen_by_frame[t] = chosen
+ last_track = chosen
+
+ # Lücken auffüllen
+ for t in range(T):
+ if chosen_by_frame[t] is None:
+ chosen_by_frame[t] = last_track
+
+ # Segmente bauen
+ segs: List[Segment] = []
+ cur = chosen_by_frame[0]
+ seg_start = 0
+ for t in range(1, T):
+ if chosen_by_frame[t] != cur:
+ segs.append(Segment(seg_start, t-1, cur))
+ cur = chosen_by_frame[t]
+ seg_start = t
+ segs.append(Segment(seg_start, T-1, cur))
+
+ # Mindestlänge: zu kurze an vorheriges mergen
+ out: List[Segment] = []
+ for s in segs:
+ if out and (s.end_f - s.start_f + 1) < minseg:
+ out[-1].end_f = s.end_f
+ else:
+ out.append(s)
+
+ return out, chosen_by_frame
+
+def main():
+ clips = sorted(list(RAW_DIR.glob("*.mp4")) + list(RAW_DIR.glob("*.mov")))
+ if not clips:
+ print(f"⚠️ Keine Videos in {RAW_DIR}")
+ return
+
+ for vid in clips:
+ name = vid.stem
+ faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
+ if not faces_path.exists():
+ print(f"⏭️ Skip (keine Faces): {faces_path.name}")
+ continue
+
+ # FPS vom Video
+ cap = cv2.VideoCapture(str(vid))
+ if not cap.isOpened():
+ print(f"❌ Kann Video nicht öffnen: {vid.name}")
+ continue
+ fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
+ cap.release()
+
+ try:
+ faces_per_frame = json.loads(faces_path.read_text(encoding="utf-8"))
+ except Exception as e:
+ print(f"❌ Fehler beim Lesen {faces_path.name}: {e}")
+ continue
+
+ segs, chosen = build_segments_for_clip(faces_per_frame, fps)
+
+ seg_out = SEGMENTS_DIR / f"{name}_segments.json"
+ map_out = SEGMENTS_DIR / f"{name}_target_by_frame.json"
+ seg_out.write_text(json.dumps([s.__dict__ for s in segs], ensure_ascii=False), encoding="utf-8")
+ map_out.write_text(json.dumps(chosen, ensure_ascii=False), encoding="utf-8")
+
+ print(f"✅ Segmente erzeugt: {seg_out.name} ({len(segs)} Segmente)")
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/new/smart_speaker_tracker.py b/src/reformat/new/smart_speaker_tracker.py
new file mode 100644
index 0000000..5875c54
--- /dev/null
+++ b/src/reformat/new/smart_speaker_tracker.py
@@ -0,0 +1,58 @@
+from typing import Dict, List, Tuple, Optional
+from .tracking import FaceTracker
+
+class SmartSpeakerTracker:
+ def __init__(self):
+ self.face_tracker = FaceTracker()
+ self.movement_per_id: Dict[int, float] = {}
+ self.prev_openness: Dict[int, float] = {}
+ self.confirmation_counter: Dict[int, int] = {}
+ self.speaker_threshold = 3.0 # wie viel Lippenbewegung braucht es mind.
+ self.decay_factor = 0.9 # wie schnell "verblasst" die Bewegung
+ self.speaker_confirm_frames = 25 # wie viele Frames muss ein Sprecher dominieren
+ self.speaker_id: Optional[int] = None
+
+ def update(self, faces: List[Dict]) -> Tuple[float, float]:
+ if not faces:
+ return self.face_tracker.update([])
+
+        # analyse lip movement per face id
+        for face in faces:
+            face_id = face.get("id")
+            openness = face.get("mouth_openness", 0.0)
+            prev = self.prev_openness.get(face_id, openness)
+            movement = abs(openness - prev)
+
+            # accumulate movement with decay
+            old_score = self.movement_per_id.get(face_id, 0.0) * self.decay_factor
+            self.movement_per_id[face_id] = old_score + movement
+            self.prev_openness[face_id] = openness
+
+ # Finde ID mit größter Bewegung
+ if self.movement_per_id:
+ top_id = max(self.movement_per_id, key=self.movement_per_id.get)
+ top_movement = self.movement_per_id[top_id]
+
+ if top_movement >= self.speaker_threshold:
+ self.confirmation_counter[top_id] = self.confirmation_counter.get(top_id, 0) + 1
+ # Andere runterzählen
+ for other_id in self.confirmation_counter:
+ if other_id != top_id:
+ self.confirmation_counter[other_id] = max(0, self.confirmation_counter[other_id] - 1)
+
+ # Wenn lange genug bestätigt, neuer Sprecher
+ if self.confirmation_counter[top_id] >= self.speaker_confirm_frames:
+ self.speaker_id = top_id
+ else:
+ # Wenn keiner über der Schwelle → kein neuer Sprecher
+ self.confirmation_counter = {k: max(0, v - 1) for k, v in self.confirmation_counter.items()}
+
+ # Sprecher vorhanden → dahin zentrieren
+ if self.speaker_id is not None:
+ for face in faces:
+ if face.get("id") == self.speaker_id:
+ return tuple(face["center"])
+
+ # Fallback: stabiler Durchschnitt
+ centers = [tuple(face["center"]) for face in faces]
+ return self.face_tracker.update(centers)
diff --git a/src/reformat/new/speaker_crop_from_segments.py b/src/reformat/new/speaker_crop_from_segments.py
new file mode 100644
index 0000000..5d25c3f
--- /dev/null
+++ b/src/reformat/new/speaker_crop_from_segments.py
@@ -0,0 +1,67 @@
+import json
+from pathlib import Path
+from typing import List, Dict
+
+# === Pfade ===
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parents[2]
+
+FACES_PATH = PROJECT_ROOT / "data" / "face_data_combined" / "testVideoShort_faces.json"
+SEGMENTS_PATH = PROJECT_ROOT / "data" / "transkripte" / "testVideoShort_segments.json"
+OUTPUT_PATH = PROJECT_ROOT / "data" / "face_crop_centers" / "testVideoShort_centers.json"
+
+FPS = 25 # Muss zur Framerate deines Videos passen
+
+# === Dateien laden ===
+with open(FACES_PATH) as f:
+ face_data = json.load(f)
+
+with open(SEGMENTS_PATH) as f:
+ segments = json.load(f)
+
+# === Zentrierungen pro Frame bestimmen ===
+frame_centers: List[List[float]] = []
+
+for segment in segments:
+ start_sec = segment["start"]
+ end_sec = segment["end"]
+ start_f = int(start_sec * FPS)
+ end_f = int(end_sec * FPS)
+
+ # Lippenbewegung pro ID in diesem Segment aufaddieren
+ movement: Dict[int, float] = {}
+ count: Dict[int, int] = {}
+
+ for f in range(start_f, min(end_f, len(face_data))):
+ for face in face_data[f]["faces"]:
+ id = face.get("id")
+ openness = face.get("mouth_openness", 0.0)
+ movement[id] = movement.get(id, 0.0) + openness
+ count[id] = count.get(id, 0) + 1
+
+ # Durchschnitt berechnen
+ avg_movement = {id: movement[id] / count[id] for id in movement if count[id] > 0}
+ if not avg_movement:
+ speaker_id = None
+ else:
+ speaker_id = max(avg_movement, key=avg_movement.get)
+
+ # Für jedes Frame in diesem Segment den Sprecher zentrieren
+ for f in range(start_f, min(end_f, len(face_data))):
+ faces = face_data[f].get("faces", [])
+ center = [960.0, 540.0] # Fallback
+
+ if speaker_id is not None:
+ for face in faces:
+ if face.get("id") == speaker_id:
+ center = face["center"][:2]
+ break
+
+ frame_centers.append([round(center[0], 2), round(center[1], 2)])
+
+# === Ergebnis speichern ===
+OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
+with open(OUTPUT_PATH, "w") as f:
+ json.dump(frame_centers, f, indent=2)
+
+print(f"✅ Zentrierung auf Sprecher für {len(frame_centers)} Frames gespeichert unter:\n{OUTPUT_PATH}")
diff --git a/src/reformat/new/tracking.py b/src/reformat/new/tracking.py
new file mode 100644
index 0000000..838b6c6
--- /dev/null
+++ b/src/reformat/new/tracking.py
@@ -0,0 +1,84 @@
+from typing import List, Tuple, Optional
+
+
+class FaceTracker:
+ def __init__(
+ self,
+ dist_threshold: float = 200.0,
+ switch_frames: int = 5,
+ panning_window: int = 10,
+ panning_threshold: float = 40.0,
+ smooth_window: int = 3,
+ scene_jump_threshold: float = 400.0
+ ):
+ self.dist_threshold = dist_threshold
+ self.switch_frames = switch_frames
+ self.panning_window = panning_window
+ self.panning_threshold = panning_threshold
+ self.smooth_window = smooth_window
+ self.scene_jump_threshold = scene_jump_threshold
+
+ self.current_center: Tuple[float, float] = (960.0, 540.0) # Default Mitte (bei 1920x1080)
+ self.raw_center: Tuple[float, float] = self.current_center
+ self.prev_center: Tuple[float, float] = self.current_center
+ self.prev_raw: Tuple[float, float] = self.current_center
+ self.candidate_center: Optional[Tuple[float, float]] = None
+ self.switch_counter = 0
+
+ self.recent_raw_centers: List[Tuple[float, float]] = []
+ self.recent_final_centers: List[Tuple[float, float]] = []
+
+ def update(self, candidates: List[Tuple[float, float]]) -> Tuple[float, float]:
+ if not candidates:
+ # kein Gesicht → verwende alten Wert
+ self.recent_raw_centers.append(self.raw_center)
+ self.recent_final_centers.append(self.current_center)
+ return self.current_center
+
+ # nehme das Gesicht, das am nächsten zur vorherigen Position ist
+ new_center = min(candidates, key=lambda pt: self._distance(self.prev_center, pt))
+ self.raw_center = new_center
+ self.recent_raw_centers.append(new_center)
+
+ dist = self._distance(self.prev_raw, new_center)
+ if dist > self.scene_jump_threshold:
+ self.current_center = new_center
+ self.prev_center = new_center
+ self.prev_raw = new_center
+ self._smooth_reset()
+ return self.current_center
+
+ if dist > self.dist_threshold:
+ if self.candidate_center != new_center:
+ self.candidate_center = new_center
+ self.switch_counter = 1
+ else:
+ self.switch_counter += 1
+ if self.switch_counter >= self.switch_frames:
+ self.prev_center = self.current_center
+ self.current_center = new_center
+ self.prev_raw = new_center
+ self.switch_counter = 0
+ else:
+ self.switch_counter = 0
+ self.prev_raw = new_center
+
+ # Smoothes Nachziehen
+ smoothed = self._moving_average(self.current_center, new_center, self.smooth_window)
+ self.prev_center = self.current_center
+ self.current_center = smoothed
+ self.recent_final_centers.append(smoothed)
+
+ return smoothed
+
+ def _moving_average(self, old, new, factor):
+ x = (old[0] * (factor - 1) + new[0]) / factor
+ y = (old[1] * (factor - 1) + new[1]) / factor
+ return (x, y)
+
+ def _distance(self, pt1, pt2):
+ return ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5
+
+ def _smooth_reset(self):
+ self.recent_raw_centers.clear()
+ self.recent_final_centers.clear()
diff --git a/src/reformat/new/utils.py b/src/reformat/new/utils.py
new file mode 100644
index 0000000..0ea37c6
--- /dev/null
+++ b/src/reformat/new/utils.py
@@ -0,0 +1,129 @@
+# src/utils.py
+from __future__ import annotations
+import json
+import logging
+import os
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+try:
+ import cv2
+except Exception:
+ cv2 = None # erlaubt Import ohne OpenCV (z.B. beim reinen Testen)
+
+# --- Logging ---------------------------------------------------------------
+
+def setup_logging(debug: bool = False) -> None:
+ level = logging.DEBUG if debug else logging.INFO
+ logging.basicConfig(
+ level=level,
+ format="%(asctime)s | %(levelname)s | %(message)s",
+ )
+
+# --- Mathe/Helpers ---------------------------------------------------------
+
+def clamp(v: float, lo: float, hi: float) -> float:
+ return max(lo, min(hi, v))
+
+def compute_crop_width(orig_w: int, orig_h: int, out_w: int = 1080, out_h: int = 1920) -> int:
+ # Für 9:16 Ziel: Breite = (9/16) * orig_h, standardmäßig 1080x1920
+ return int((out_w / out_h) * orig_h)
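+
+# Hedged sanity check: a 1920x1080 source yields a 9:16 crop 607 px wide
+# (int() truncates 607.5).
+def _demo_compute_crop_width() -> None:
+    assert compute_crop_width(1920, 1080) == 607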
+
+def iou(boxA, boxB) -> float:
+ """Berechnet Intersection-over-Union zweier Bounding-Boxes."""
+ ax1, ay1, aw, ah = boxA
+ ax2, ay2 = ax1 + aw, ay1 + ah
+ bx1, by1, bw, bh = boxB
+ bx2, by2 = bx1 + bw, by1 + bh
+
+ inter_x1 = max(ax1, bx1)
+ inter_y1 = max(ay1, by1)
+ inter_x2 = min(ax2, bx2)
+ inter_y2 = min(ay2, by2)
+ inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
+
+ union_area = aw * ah + bw * bh - inter_area
+ return inter_area / union_area if union_area > 0 else 0
+
+# --- IO --------------------------------------------------------------------
+
+def load_json(path: Path) -> Any:
+ if not path.exists():
+ raise FileNotFoundError(f"Datei fehlt: {path}")
+ with open(path, "r", encoding="utf-8") as f:
+ return json.load(f)
+
+def save_json(obj: Any, path: Path) -> None:
+ path.parent.mkdir(parents=True, exist_ok=True)
+ with open(path, "w", encoding="utf-8") as f:
+ json.dump(obj, f, ensure_ascii=False, indent=2)
+
+def ensure_exists(path: Path, what: str = "Datei/Ordner") -> None:
+ if not path.exists():
+ raise FileNotFoundError(f"{what} nicht gefunden: {path}")
+
+# --- Video / Pfade ---------------------------------------------------------
+
+def get_fps(video_path: Path, fallback: float = 25.0) -> float:
+ if cv2 is None:
+ logging.warning("OpenCV nicht verfügbar – nutze FPS-Fallback %.2f", fallback)
+ return fallback
+ cap = cv2.VideoCapture(str(video_path))
+    fps = cap.get(cv2.CAP_PROP_FPS)
+ cap.release()
+ if not fps or fps <= 1e-3:
+ logging.warning("Konnte FPS nicht lesen – nutze Fallback %.2f", fallback)
+ return fallback
+ return float(fps)
+
+def project_root_from(file: Path) -> Path:
+    # callers are expected under src/reformat/new/, i.e. the project root is parents[3] of the file
+ return file.resolve().parents[3]
+
+def resolve_paths(project_root: Path, base_name: str) -> Dict[str, Path]:
+ data = project_root / "data"
+ return {
+ "timed_path": data / "transkripte" / f"{base_name}_timed.txt",
+ "segments_path":data / "transkripte" / f"{base_name}_segments.json",
+ "faces_path": data / "face_data_combined" / f"{base_name}_faces.json",
+ "centers_path": data / "face_crop_centers" / f"{base_name}_centers.json",
+ "video_path": data / "output" / "raw_clips" / f"{base_name}.mp4",
+ "out_9x16_dir": project_root / "output" / "output_9x16_final",
+ "face_debug_dir": project_root / "output" / "debug" / "faces",
+ }
+
+def require_api_key(env_name: str = "OPENAI_API_KEY") -> str:
+ key = os.getenv(env_name)
+ if not key:
+ raise RuntimeError(
+ f"Umgebungsvariable {env_name} fehlt. "
+ f"Exportiere sie z.B.: export {env_name}='sk-...'")
+ return key
+
+# --- Simple smoothing for centers ------------------------------------------
+
+from typing import List, Optional
+
+class CenterSmoother:
+ """Glättet Zentren mit Moving Average und optionaler Jump-Erkennung."""
+ def __init__(self, window: int = 7, jump_thresh: float = 120.0):
+ self.window = window
+ self.jump_thresh = jump_thresh
+ self.buffer: List[Tuple[float, float]] = []
+ self.prev: Optional[Tuple[float, float]] = None
+
+ def push(self, cx: float, cy: float) -> Tuple[float, float]:
+ if self.prev is not None:
+ dx = abs(cx - self.prev[0]) + abs(cy - self.prev[1])
+ if dx > self.jump_thresh:
+ # harter Cut: reset buffer
+ self.buffer.clear()
+
+ self.buffer.append((cx, cy))
+ if len(self.buffer) > self.window:
+ self.buffer.pop(0)
+
+ avgx = sum(p[0] for p in self.buffer) / len(self.buffer)
+ avgy = sum(p[1] for p in self.buffer) / len(self.buffer)
+ self.prev = (avgx, avgy)
+ return self.prev
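+
+# Hypothetisches Anwendungsbeispiel (nur Skizze, nicht Teil der Pipeline):
+#   smoother = CenterSmoother(window=7, jump_thresh=120.0)
+#   for cx, cy in [(500, 400), (505, 402), (900, 410)]:   # großer Sprung im 3. Wert
+#       print(smoother.push(cx, cy))
+#   # → (500.0, 400.0), (502.5, 401.0), (900.0, 410.0) – der Sprung > 120 px leert den Puffer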
diff --git a/src/reformat/old/analyze_crop_position.py b/src/reformat/old/analyze_crop_position.py
new file mode 100644
index 0000000..33a832a
--- /dev/null
+++ b/src/reformat/old/analyze_crop_position.py
@@ -0,0 +1,235 @@
+import argparse
+import json
+import logging
+import math
+import random
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+
+
+class FaceTracker:
+ def __init__(
+ self,
+ dist_threshold: float,
+ switch_frames: int,
+ panning_window: int,
+ panning_threshold: float,
+ smooth_window: int,
+ scene_jump_threshold: float,
+ ):
+ self.dist_threshold = dist_threshold
+ self.switch_frames = switch_frames
+ self.panning_window = panning_window
+ self.panning_threshold = panning_threshold
+ self.smooth_window = smooth_window
+ self.scene_jump_threshold = scene_jump_threshold
+
+ self.current_center: Tuple[float, float] = (960.0, 540.0)
+ self.raw_center: Tuple[float, float] = self.current_center
+ self.prev_center: Tuple[float, float] = self.current_center
+ self.prev_raw: Tuple[float, float] = self.current_center
+ self.candidate_center: Optional[Tuple[float, float]] = None
+ self.switch_counter: int = 0
+ self.last_speaker_set: bool = False
+ self.random_center: Optional[Tuple[float, float]] = None
+
+ self.panning_buffer: List[float] = []
+ self.smooth_buffer: List[Tuple[float, float]] = []
+
+ def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]:
+ valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None]
+ all_faces = [f for f in faces if f.get("center")]
+
+ # Speaker tracking
+ if valid_faces:
+ self._update_speaker(valid_faces)
+ else:
+ self._retain_or_random_center(all_faces)
+
+ # Panning detection
+ is_panning = self._detect_panning()
+
+ # Smooth / moving average
+ center = self._smooth_center()
+
+ return (int(center[0]), int(center[1])), is_panning
+
+ def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None:
+ best = max(valid_faces, key=lambda x: x["mouth_openness"])
+ cx, cy, *_ = best["center"]
+ new_center = (cx, cy)
+
+ dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1])
+ if dist < self.dist_threshold:
+ self.raw_center = new_center
+ self.candidate_center = None
+ self.switch_counter = 0
+ else:
+ if (
+ self.candidate_center is None
+ or math.hypot(
+ new_center[0] - self.candidate_center[0], new_center[1] - self.candidate_center[1]
+ )
+ > self.dist_threshold
+ ):
+ self.candidate_center = new_center
+ self.switch_counter = 1
+ else:
+ self.switch_counter += 1
+
+ if self.switch_counter >= self.switch_frames:
+ self.raw_center = self.candidate_center # type: ignore
+ self.candidate_center = None
+ self.switch_counter = 0
+
+ self.random_center = None
+ self.last_speaker_set = True
+
+ def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None:
+ if self.last_speaker_set:
+ # keep previous raw_center
+ pass
+ elif self.random_center is not None:
+ self.raw_center = self.random_center
+ elif all_faces:
+ f = random.choice(all_faces)
+ cx, cy, *_ = f["center"]
+ self.random_center = (cx, cy)
+ self.raw_center = self.random_center
+
+ def _detect_panning(self) -> bool:
+ dx = self.raw_center[0] - self.prev_raw[0]
+ self.panning_buffer.append(dx)
+ if len(self.panning_buffer) > self.panning_window:
+ self.panning_buffer.pop(0)
+ avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer)
+ is_panning = avg_dx > self.panning_threshold
+ self.prev_raw = self.raw_center
+ return is_panning
+
+ def _smooth_center(self) -> Tuple[float, float]:
+ sudden_jump = (
+ math.hypot(
+ self.raw_center[0] - self.prev_center[0],
+ self.raw_center[1] - self.prev_center[1],
+ )
+ > self.scene_jump_threshold
+ )
+ if not sudden_jump:
+ self.smooth_buffer.append(self.raw_center)
+ if len(self.smooth_buffer) > self.smooth_window:
+ self.smooth_buffer.pop(0)
+ avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer)
+ avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer)
+ center = (avg_x, avg_y)
+ else:
+ center = self.raw_center
+ self.smooth_buffer.clear()
+
+ self.prev_center = center
+ return center
+
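+# Hypothetische Mini-Verwendung (nur zur Illustration des erwarteten Datenformats):
+#   tracker = FaceTracker(30.0, 20, 30, 3.0, 8, 300.0)
+#   frame_faces = [{"center": [800.0, 450.0, 120, 160], "mouth_openness": 6.3},
+#                  {"center": [1400.0, 500.0, 110, 150], "mouth_openness": 0.4}]
+#   (cx, cy), panning = tracker.process_frame(frame_faces)
+#   # → das Gesicht mit der größten Mundöffnung bestimmt das (geglättete) Crop-Zentrum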
+
+def parse_args() -> argparse.Namespace:
+ script_dir = Path(__file__).resolve().parent
+ project_root = script_dir.parents[1]
+ default_input = project_root / "data" / "face_data_combined"
+ default_output = project_root / "data" / "face_crop_centers"
+
+ parser = argparse.ArgumentParser(
+ description="Track and smooth face crop centers based on mouth openness."
+ )
+ parser.add_argument(
+ "-i", "--input-dir", type=Path,
+ default=default_input,
+ help=f"Directory containing *_faces.json files (default: {default_input})"
+ )
+ parser.add_argument(
+ "-o", "--output-dir", type=Path,
+ default=default_output,
+ help=f"Directory to save *_centers.json files (default: {default_output})"
+ )
+ parser.add_argument(
+ "--dist-threshold", type=float, default=30.0,
+ help="Pixel distance threshold to switch speaker"
+ )
+ parser.add_argument(
+ "--switch-frames", type=int, default=20,
+ help="Number of consecutive frames required to confirm speaker switch"
+ )
+ parser.add_argument(
+ "--panning-window", type=int, default=30,
+ help="Frame window size for panning detection"
+ )
+ parser.add_argument(
+ "--panning-threshold", type=float, default=3.0,
+ help="Average dx threshold for panning detection"
+ )
+ parser.add_argument(
+ "--smooth-window", type=int, default=8,
+ help="Moving average window for smoothing"
+ )
+ parser.add_argument(
+ "--scene-jump-threshold", type=float, default=300.0,
+ help="Jump threshold to detect scene cuts"
+ )
+ return parser.parse_args()
+
+
+def setup_logging() -> None:
+ logging.basicConfig(
+ format="%(asctime)s %(levelname)s: %(message)s",
+ level=logging.INFO,
+ )
+
+
+def main() -> None:
+ setup_logging()
+ args = parse_args()
+
+ input_dir: Path = args.input_dir.resolve()
+ output_dir: Path = args.output_dir.resolve()
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ tracker = FaceTracker(
+ dist_threshold=args.dist_threshold,
+ switch_frames=args.switch_frames,
+ panning_window=args.panning_window,
+ panning_threshold=args.panning_threshold,
+ smooth_window=args.smooth_window,
+ scene_jump_threshold=args.scene_jump_threshold,
+ )
+
+ json_files = sorted(input_dir.glob("*_faces.json"))
+ if not json_files:
+ logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir)
+ return
+
+ logging.info("Gefundene Dateien: %d", len(json_files))
+
+ for json_path in json_files:
+ logging.info("Verarbeite %s", json_path.name)
+ try:
+ frames_data = json.loads(json_path.read_text())
+ except json.JSONDecodeError as e:
+ logging.error("JSON-Fehler in %s: %s", json_path.name, e)
+ continue
+
+ out_data: List[Dict[str, Any]] = []
+ for frame_idx, frame in enumerate(frames_data):
+ faces = frame.get("faces", [])
+ center, is_panning = tracker.process_frame(faces)
+ out_data.append({
+ "frame": frame_idx,
+ "center": [center[0], center[1]],
+ "panning": is_panning,
+ })
+
+ out_path = output_dir / f"{json_path.stem.replace('_faces', '')}_centers.json"
+ with out_path.open("w") as f:
+ json.dump(out_data, f, indent=2)
+ logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data))
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/old/crop_to_speaker.py b/src/reformat/old/crop_to_speaker.py
new file mode 100644
index 0000000..553bbbd
--- /dev/null
+++ b/src/reformat/old/crop_to_speaker.py
@@ -0,0 +1,180 @@
+import json
+import cv2
+import subprocess
+from pathlib import Path
+
+# === Pfade & globale Settings ===
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parents[1]
+
+INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
+INPUT_CENTER_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
+INPUT_FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+OUT_W, OUT_H = 1080, 1920
+
+DEBUG_MODE = True
+DEBUG_SCALE = 0.75
+# Ab welcher Offenheit wir "Bewegung" annehmen
+DEBUG_MOUTH_THRESHOLD = 0.02
+
+# === Hilfsfunktionen ===
+def clamp(v, lo, hi):
+ return max(lo, min(hi, v))
+
+def compute_crop_width(orig_w, orig_h):
+ return int((OUT_W / OUT_H) * orig_h)
+
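+# Rechenbeispiel (zur Einordnung): bei einem 1920x1080-Quellvideo ist
+# crop_w = int((1080 / 1920) * 1080) = 607, es wird also ein 607x1080-Fenster
+# ausgeschnitten und anschließend auf 1080x1920 skaliert.
+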
+# === Verarbeitung ===
+for center_path in sorted(INPUT_CENTER_DIR.glob("*_centers.json")):
+ stem = center_path.stem.replace("_centers", "")
+ video_path = INPUT_VIDEO_DIR / f"{stem}.mp4"
+ faces_path = INPUT_FACES_DIR / f"{stem}_faces.json"
+
+ if not video_path.exists():
+ print(f"⚠️ Video fehlt: {stem}.mp4")
+ continue
+ if not faces_path.exists():
+ print(f"⚠️ Gesichtsdaten fehlen: {stem}_faces.json")
+ continue
+
+ centers_data = json.loads(center_path.read_text())
+ faces_data = json.loads(faces_path.read_text())
+
+ # Debug-Liste pro Video anlegen
+ if DEBUG_MODE:
+ debug_results: list = []
+
+ cap = cv2.VideoCapture(str(video_path))
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ crop_w = compute_crop_width(orig_w, orig_h)
+ crop_h = orig_h
+
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+ temp_vid = OUTPUT_DIR / f"{stem}_cropped.mp4"
+ out_vid = cv2.VideoWriter(str(temp_vid), fourcc, fps, (OUT_W, OUT_H))
+ if not out_vid.isOpened():
+ print(f"❌ Kann nicht schreiben: {temp_vid.name}")
+ continue
+
+ if DEBUG_MODE:
+ cv2.namedWindow("Debug", cv2.WINDOW_NORMAL)
+
+ frame_idx = 0
+ while True:
+ ret, frame = cap.read()
+ if not ret or frame_idx >= len(centers_data):
+ break
+
+ # Crop-Infos
+ info = centers_data[frame_idx]
+ cx, cy = info["center"]
+ is_panning = info.get("panning", False)
+ if is_panning:
+ cx = orig_w // 2
+
+ x0 = int(cx - crop_w / 2)
+ x0 = clamp(x0, 0, orig_w - crop_w)
+ y0 = 0
+
+ # Ausschneiden + Resize
+ crop = frame[y0:y0+crop_h, x0:x0+crop_w]
+ if crop.shape[1] != crop_w or crop.shape[0] != crop_h:
+ crop = cv2.copyMakeBorder(
+ crop, 0, crop_h - crop.shape[0],
+ 0, crop_w - crop.shape[1],
+ cv2.BORDER_CONSTANT, value=[0, 0, 0]
+ )
+ out_frame = cv2.resize(crop, (OUT_W, OUT_H), interpolation=cv2.INTER_LINEAR)
+ out_vid.write(out_frame)
+
+ if DEBUG_MODE:
+ debug_frame = frame.copy()
+ frame_faces = faces_data[frame_idx].get("faces", [])
+
+ # Build debug entry for this frame
+ dbg_faces = []
+ for f in frame_faces:
+ # center und Offenheit
+ cx_f, cy_f = map(int, f["center"][:2])
+ openness = f.get("mouth_openness", 0.0)
+ moving = openness > DEBUG_MOUTH_THRESHOLD
+ dbg_faces.append({
+ "center": [cx_f, cy_f],
+ "mouth_openness": openness,
+ "mouth_moving": moving
+ })
+
+ # Anzeige im Debug-Fenster
+ cv2.circle(debug_frame, (cx_f, cy_f), 4, (180, 180, 180), -1)
+ cv2.putText(
+ debug_frame,
+ f"{round(openness,2)}",
+ (cx_f + 6, cy_f - 6),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.5,
+ (255, 255, 255),
+ 1,
+ cv2.LINE_AA
+ )
+ # roter Punkt, wenn Bewegung
+ color = (0,0,255) if moving else (0,255,255)
+ cv2.circle(debug_frame, (cx_f, cy_f), 6, color, 1)
+
+ debug_results.append({
+ "frame": frame_idx,
+ "faces": dbg_faces
+ })
+
+ # Haupt-Center & Crop-Rahmen
+ cv2.circle(debug_frame, (int(cx), int(cy)), 18, (0, 255, 0), 2)
+ cv2.rectangle(debug_frame, (x0, 0), (x0 + crop_w, crop_h), (0, 0, 255), 2)
+
+ dbg = cv2.resize(
+ debug_frame,
+ (int(orig_w * DEBUG_SCALE), int(orig_h * DEBUG_SCALE))
+ )
+ cv2.imshow("Debug", dbg)
+ if cv2.waitKey(1) & 0xFF == ord('q'):
+ break
+
+ frame_idx += 1
+
+ cap.release()
+ out_vid.release()
+ if DEBUG_MODE:
+ cv2.destroyAllWindows()
+
+ # Audio extrahieren & muxen
+ audio_tmp = OUTPUT_DIR / f"{stem}_audio.aac"
+ final_vid = OUTPUT_DIR / f"{stem}.mp4"
+ try:
+ subprocess.run(
+ ["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "copy", str(audio_tmp)],
+ check=True
+ )
+ subprocess.run(
+ ["ffmpeg", "-y", "-i", str(temp_vid), "-i", str(audio_tmp),
+ "-c:v", "copy", "-c:a", "aac", "-b:a", "128k", str(final_vid)],
+ check=True
+ )
+ finally:
+        try: temp_vid.unlink()
+        except OSError: pass
+        try: audio_tmp.unlink()
+        except OSError: pass
+
+ # Debug-JSON schreiben
+ if DEBUG_MODE:
+ dbg_path = OUTPUT_DIR / f"{stem}_debug.json"
+ with dbg_path.open("w") as f:
+ json.dump(debug_results, f, indent=2)
+ print(f"🛠️ Debug-Daten: {dbg_path.name}")
+
+ print(f"✅ Finales Video: {final_vid.name}")
+
+print("\n🏁 Alle Videos fertig in:", OUTPUT_DIR.resolve())
diff --git a/src/reformat/old/detect_speaking_faces.py b/src/reformat/old/detect_speaking_faces.py
new file mode 100644
index 0000000..f439d30
--- /dev/null
+++ b/src/reformat/old/detect_speaking_faces.py
@@ -0,0 +1,126 @@
+import json
+from pathlib import Path
+from collections import defaultdict
+import numpy as np
+
+# === Einstellungen ===
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parents[1]
+INPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+OUTPUT_PATH = INPUT_DIR / "dominant_faces.json"
+
+SEGMENT_LENGTH = 2.0 # Länge jedes Segments in Sekunden
+MOUTH_THRESHOLD = 0.01 # minimale Mundöffnung, um einen Sprecher zu zählen
+SMOOTH_WINDOW = 5 # Fenstergröße (in Segmenten) für Moving Average
+
+def analyze_clip_timed(path):
+ # 1) JSON einlesen
+ try:
+ data = json.loads(path.read_text())
+ except Exception as e:
+ print(f"❌ Fehler beim Lesen von {path.name}: {e}")
+ return None
+
+ # 2) Nur valide Frames verwenden
+ frames = [d for d in data if "timestamp" in d and isinstance(d.get("faces"), list)]
+ if not frames:
+ print(f"⚠️ Keine validen Frames in {path.name}")
+ return None
+
+ frames.sort(key=lambda x: x["timestamp"])
+ max_time = frames[-1]["timestamp"]
+
+ # 3) Segmente erzeugen und dominanten Sprecher per Segment finden
+ segments = []
+ t = 0.0
+ while t < max_time:
+ t_end = t + SEGMENT_LENGTH
+ face_scores = defaultdict(list) # mouth_openness pro bbox
+ face_boxes = defaultdict(list) # raw bbox pro bbox-key
+ face_centers = defaultdict(list) # center [cx,cy,w,h] pro bbox-key
+
+ # alle Frames durchsuchen, die in dieses Segment fallen
+ for f in frames:
+ ts = f["timestamp"]
+ if t <= ts < t_end:
+ for face in f["faces"]:
+ bbox = face["bbox"] # [x,y,w,h]
+ score = face.get("mouth_openness", 0.0)
+ center = face.get("center", None) # [cx,cy,w,h]
+ key = tuple(bbox)
+
+ if score >= MOUTH_THRESHOLD and center is not None:
+ face_scores[key].append(score)
+ face_boxes[key].append(bbox)
+ face_centers[key].append(center)
+
+ if face_scores:
+ # den Key mit dem höchsten Durchschnittsscore wählen
+ avg_scores = {k: np.mean(v) for k, v in face_scores.items()}
+ dominant_key = max(avg_scores, key=avg_scores.get)
+
+ # mittlere Bounding‑Box und mittleres Center berechnen
+ avg_bbox = np.mean(face_boxes[dominant_key], axis=0).astype(int).tolist()
+ avg_center = np.mean(face_centers[dominant_key], axis=0).tolist() # [cx,cy,w,h]
+
+ segments.append({
+ "start": round(t, 2),
+ "end": round(t_end if t_end < max_time else max_time, 2),
+ "bbox": avg_bbox,
+ "center": [float(avg_center[0]), float(avg_center[1]), float(avg_center[2]), float(avg_center[3])]
+ })
+
+ t += SEGMENT_LENGTH
+
+ if not segments:
+ print(f"⚠️ Keine Segmente für Clip {path.name}")
+ return None
+
+ # 4) Glätten der Segment‑Zentren mit Moving Average
+ seg_centers = [s["center"] for s in segments] # Liste von [cx,cy,w,h]
+ sm_centers = []
+ n = len(seg_centers)
+ half = SMOOTH_WINDOW // 2
+
+ for i in range(n):
+ start = max(0, i - half)
+ end = min(n, i + half + 1)
+ window = seg_centers[start:end]
+ avg = np.mean(window, axis=0) # ergibt [cx,cy,w,h]
+ sm_centers.append(avg.tolist())
+
+ # 5) Ausgabe des geglätteten Pfades in die Konsole
+ print(f"\n🔄 Smoothed path für Clip {path.stem}:")
+ for i, s in enumerate(segments):
+ cx, cy, w, h = sm_centers[i]
+ print(f" Segment {i} [{s['start']}–{s['end']}s]: "
+ f"center=({cx:.1f}, {cy:.1f}), size=({w:.1f}×{h:.1f})")
+
+ # 6) Neue Felder für Ausgabe‑JSON bauen
+ sm_segments = []
+ for i, s in enumerate(segments):
+ cx, cy, w, h = sm_centers[i]
+ x0 = int(cx - w/2)
+ y0 = int(cy - h/2)
+ sm_segments.append({
+ "start": s["start"],
+ "end": s["end"],
+ "bbox": [x0, y0, int(w), int(h)]
+ })
+
+ return {
+ "clip": path.stem.replace("_faces", "") + ".mp4",
+ "segments": sm_segments
+ }
+
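+# Rechenbeispiel zum Moving Average oben (rein illustrativ): bei SMOOTH_WINDOW = 5 und
+# den cx-Werten [100, 110, 180, 120, 115] mittelt Index 2 über alle fünf Werte
+# → (100 + 110 + 180 + 120 + 115) / 5 = 125; der Ausreißer 180 wird so deutlich gedämpft,
+# an den Rändern schrumpft das Fenster entsprechend.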
+
+# === Hauptschleife über alle Clips ===
+results = []
+for json_file in sorted(INPUT_DIR.glob("*_faces.json")):
+ out = analyze_clip_timed(json_file)
+ if out:
+ results.append(out)
+
+OUTPUT_PATH.write_text(json.dumps(results, indent=2))
+print(f"\n✅ Analyse abgeschlossen – {len(results)} Clips erkannt.")
+print(f"📄 Gespeichert in: {OUTPUT_PATH.resolve()}")
diff --git a/src/reformat/old/grid_faces_from_yolo.py b/src/reformat/old/grid_faces_from_yolo.py
new file mode 100644
index 0000000..b9de56f
--- /dev/null
+++ b/src/reformat/old/grid_faces_from_yolo.py
@@ -0,0 +1,114 @@
+import json
+import cv2
+import numpy as np
+from pathlib import Path
+from tqdm import tqdm
+from collections import defaultdict, Counter
+from sklearn.cluster import DBSCAN
+
+# === Einstellungen ===
+SCRIPT_DIR = Path(__file__).resolve().parent
+VIDEO_DIR = SCRIPT_DIR.parents[1] / "output"
+FACE_JSON_DIR = SCRIPT_DIR / "face_data_yolo"
+OUTPUT_DIR = SCRIPT_DIR.parents[1] / "output_stacked_faces"
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+OUT_WIDTH = 1080
+OUT_HEIGHT = 1920
+GRID_ROWS = 4
+FACE_CROP_HEIGHT = OUT_HEIGHT // GRID_ROWS
+FACE_CROP_WIDTH = OUT_WIDTH
+
+# === Hilfsfunktion
+def bbox_center(bbox):
+ x, y, w, h = bbox
+ return int(x + w // 2), int(y + h // 2)
+
+# === Hauptverarbeitung ===
+for json_path in tqdm(sorted(FACE_JSON_DIR.glob("*_faces.json")), desc="🔍 Erzeuge Grid-Clips"):
+ video_name = json_path.stem.replace("_faces", "") + ".mp4"
+ video_path = VIDEO_DIR / video_name
+ if not video_path.exists():
+ print(f"❌ Video nicht gefunden: {video_name}")
+ continue
+
+ data = json.loads(json_path.read_text())
+
+ # === Alle Gesichtszentren sammeln
+ all_faces = []
+ for frame in data:
+ for face in frame["faces"]:
+ center = bbox_center(face["bbox"])
+ all_faces.append((center, face["bbox"]))
+
+ if not all_faces:
+ print(f"⚠️ Keine Gesichter erkannt in {video_name}")
+ continue
+
+ # === Clustern
+ coords = [pos for pos, _ in all_faces]
+ clustering = DBSCAN(eps=80, min_samples=5).fit(coords)
+ cluster_labels = clustering.labels_
+ label_counts = Counter(cluster_labels)
+ most_common_labels = [lbl for lbl, _ in label_counts.most_common(GRID_ROWS) if lbl != -1]
+
+ if not most_common_labels:
+ print(f"⚠️ Keine gültigen Cluster in {video_name}")
+ continue
+
+ # === Zuordnung: cluster_id → feste Zeile
+ cluster_faces = defaultdict(list)
+ for (_, bbox), label in zip(all_faces, cluster_labels):
+ if label in most_common_labels:
+ cluster_faces[label].append(bbox)
+
+ def cluster_y(label):
+ return np.mean([bbox[1] for bbox in cluster_faces[label]])
+
+ sorted_labels = sorted(most_common_labels, key=cluster_y)
+ label_to_row = {lbl: idx for idx, lbl in enumerate(sorted_labels)}
+
+ # === cluster_id zu jedem Gesicht hinzufügen
+ for frame in data:
+ for face in frame["faces"]:
+ center = bbox_center(face["bbox"])
+ distances = [np.linalg.norm(np.array(center) - np.array(c)) for c in coords]
+ nearest = np.argmin(distances)
+ label = cluster_labels[nearest]
+ face["cluster_id"] = label
+
+ # === Video verarbeiten
+ cap = cv2.VideoCapture(str(video_path))
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ out_path = OUTPUT_DIR / video_name.replace(".mp4", "_stacked.mp4")
+ fourcc = cv2.VideoWriter_fourcc(*"mp4v")
+ writer = cv2.VideoWriter(str(out_path), fourcc, fps, (OUT_WIDTH, OUT_HEIGHT))
+
+ frame_idx = 0
+ while cap.isOpened():
+ ret, frame = cap.read()
+ if not ret or frame_idx >= len(data):
+ break
+
+ output_frame = np.zeros((OUT_HEIGHT, OUT_WIDTH, 3), dtype=np.uint8)
+ for face in data[frame_idx]["faces"]:
+ label = face.get("cluster_id", -1)
+ if label not in label_to_row:
+ continue
+ row = label_to_row[label]
+ x, y, w, h = face["bbox"]
+ crop = frame[y:y+h, x:x+w]
+ if crop.size == 0:
+ continue
+ resized = cv2.resize(crop, (FACE_CROP_WIDTH, FACE_CROP_HEIGHT))
+ y_offset = row * FACE_CROP_HEIGHT
+ output_frame[y_offset:y_offset+FACE_CROP_HEIGHT, :] = resized
+
+ writer.write(output_frame)
+ frame_idx += 1
+
+ cap.release()
+ writer.release()
+ print(f"✅ Exportiert: {out_path.name}")
+
+print("🏁 Alle Grid-Videos fertig.")
diff --git a/src/reformat/old/preview_faces.py b/src/reformat/old/preview_faces.py
new file mode 100644
index 0000000..dc777fc
--- /dev/null
+++ b/src/reformat/old/preview_faces.py
@@ -0,0 +1,75 @@
+import cv2
+import json
+from pathlib import Path
+from tqdm import tqdm
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_DIR = SCRIPT_DIR.parents[1] # ← geht von /src/reformat zu /BachlorArbeit
+
+FACES_DIR = PROJECT_DIR / "data" / "face_data_combined"
+INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips"
+OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces"
+OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+# === Alle *_faces.json Dateien durchgehen ===
+face_files = sorted(FACES_DIR.glob("*_faces.json"))
+
+for face_file in tqdm(face_files, desc="🔍 Erzeuge Vorschau mit Sprechererkennung"):
+ clip_name = face_file.stem.replace("_faces", "") + ".mp4"
+ input_path = INPUT_VIDEO_DIR / clip_name
+ output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4")
+
+ if not input_path.exists():
+ print(f"❌ Clip nicht gefunden: {clip_name}")
+ continue
+
+ # Video-Setup
+ cap = cv2.VideoCapture(str(input_path))
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ fps = fps if fps > 1 else 25 # fallback falls FPS = 0
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ fourcc = cv2.VideoWriter_fourcc(*"avc1") # Kompatibler als mp4v
+ out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
+
+ # Gesichts-Daten laden
+ data = json.loads(face_file.read_text())
+ data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]}
+
+ print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}")
+
+ frame_idx = 0
+ while cap.isOpened():
+ ret, frame = cap.read()
+ if not ret:
+ break
+
+ faces = data_by_frame.get(frame_idx, [])
+ speaker_idx = None
+
+ # Sprecher anhand Mundöffnung
+ if faces and all("mouth_openness" in f for f in faces):
+ mouth_vals = [f["mouth_openness"] for f in faces]
+ if any(v > 0.01 for v in mouth_vals): # einfache Aktivitäts-Schwelle
+ speaker_idx = mouth_vals.index(max(mouth_vals))
+
+ for i, face in enumerate(faces):
+ x, y, w, h = face["bbox"]
+ color = (0, 255, 0) if i == speaker_idx else (255, 255, 255)
+ label = f"Mouth: {face.get('mouth_openness', 0):.2f}"
+
+ # Debug-Ausgabe (optional)
+ print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}")
+
+ cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
+ cv2.putText(frame, label, (x, y - 10),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
+
+ out.write(frame)
+ frame_idx += 1
+
+ cap.release()
+ out.release()
+ print(f"✅ Vorschau exportiert: {output_path.name}")
+
+print("🏁 Alle Vorschauvideos mit Sprechererkennung erstellt.")
diff --git a/src/reformat/old/track_faces.py b/src/reformat/old/track_faces.py
new file mode 100644
index 0000000..f335069
--- /dev/null
+++ b/src/reformat/old/track_faces.py
@@ -0,0 +1,92 @@
+import cv2
+import mediapipe as mp
+import json
+from pathlib import Path
+from tqdm import tqdm
+
+# === Einstellungen ===
+INPUT_DIR = Path(__file__).resolve().parents[2] / "output"
+OUTPUT_DIR = Path(__file__).resolve().parent / "face_data"
+OUTPUT_DIR.mkdir(exist_ok=True)
+FRAME_SKIP = 1 # analysiere jeden Frame für maximale Genauigkeit
+PADDING = 30 # Pixel Padding um Gesicht
+
+mp_face_mesh = mp.solutions.face_mesh
+
+# Erweiterte Lippen-Landmarks (innen)
+TOP_LIPS = [13, 78, 82]
+BOTTOM_LIPS = [14, 87, 317]
+
+def mouth_openness(landmarks, image_height):
+    # Liefert die normierte Mundöffnung (0–1); image_height wird hier derzeit nicht verwendet.
+    try:
+        top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
+        bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
+        return abs(bottom_avg - top_avg)
+    except Exception:
+        return 0.0
+
+def process_video(path):
+ cap = cv2.VideoCapture(str(path))
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
+ results = []
+
+ with mp_face_mesh.FaceMesh(
+ static_image_mode=False,
+ max_num_faces=5,
+ refine_landmarks=True,
+ min_detection_confidence=0.6,
+ min_tracking_confidence=0.6
+ ) as face_mesh:
+
+ frame_idx = 0
+ while cap.isOpened():
+ ret, frame = cap.read()
+ if not ret:
+ break
+
+ if frame_idx % FRAME_SKIP != 0:
+ frame_idx += 1
+ continue
+
+ rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+ output = face_mesh.process(rgb)
+
+ faces = []
+ if output.multi_face_landmarks:
+ for landmarks in output.multi_face_landmarks:
+ mouth = mouth_openness(landmarks.landmark, height)
+
+ xs = [lm.x * width for lm in landmarks.landmark]
+ ys = [lm.y * height for lm in landmarks.landmark]
+ x1 = max(0, int(min(xs)) - PADDING)
+ y1 = max(0, int(min(ys)) - PADDING)
+ x2 = min(width, int(max(xs)) + PADDING)
+ y2 = min(height, int(max(ys)) + PADDING)
+ bbox = [x1, y1, x2 - x1, y2 - y1]
+
+ faces.append({
+ "bbox": bbox,
+ "mouth_openness": round(mouth, 4)
+ })
+
+ results.append({
+ "frame": frame_idx,
+ "timestamp": round(frame_idx / fps, 2),
+ "faces": faces
+ })
+
+ frame_idx += 1
+
+ cap.release()
+ out_path = OUTPUT_DIR / f"{path.stem}_faces.json"
+ out_path.write_text(json.dumps(results, indent=2))
+ print(f"✅ {path.name} verarbeitet → {out_path.name}")
+
+# === Alle Videos im output/ Ordner durchgehen
+videos = list(INPUT_DIR.glob("*.mp4"))
+print(f"🎬 {len(videos)} Videos gefunden in: {INPUT_DIR}")
+
+for video in tqdm(videos):
+ process_video(video)
diff --git a/src/reformat/old/track_faces_Yolo.py b/src/reformat/old/track_faces_Yolo.py
new file mode 100644
index 0000000..d7f5d1f
--- /dev/null
+++ b/src/reformat/old/track_faces_Yolo.py
@@ -0,0 +1,206 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import json
+from pathlib import Path
+
+import cv2
+from ultralytics import YOLO
+import mediapipe as mp
+
+# === Pfade und Standardwerte ===
+SCRIPT_DIR = Path(__file__).resolve().parent
+PROJECT_ROOT = SCRIPT_DIR.parents[1]
+DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
+DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
+DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt"
+
+# Stelle sicher, dass das Standard-Output-Verzeichnis existiert
+DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+
+# === Landmarks für Lippen ===
+TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
+BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
+
+
+def get_mouth_openness(landmarks, image_height):
+ """
+ Berechnet die Mundöffnung in Pixeln basierend auf normierten Landmark-Koordinaten.
+ """
+ top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
+ bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
+ return abs(bottom_avg - top_avg) * image_height
+
+
+def iou(boxA, boxB):
+ """Berechnet Intersection-over-Union zweier Bounding-Boxes im Format (x, y, w, h)."""
+ ax1, ay1, aw, ah = boxA
+ ax2, ay2 = ax1 + aw, ay1 + ah
+ bx1, by1, bw, bh = boxB
+ bx2, by2 = bx1 + bw, by1 + bh
+
+ inter_x1 = max(ax1, bx1)
+ inter_y1 = max(ay1, by1)
+ inter_x2 = min(ax2, bx2)
+ inter_y2 = min(ay2, by2)
+ inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
+
+ union_area = aw * ah + bw * bh - inter_area
+ return inter_area / union_area if union_area > 0 else 0
+
+
+def process_video(
+ video_path: Path,
+ output_path: Path,
+ model: YOLO,
+ face_mesh: mp.solutions.face_mesh.FaceMesh,
+ conf_thresh: float,
+ frame_skip: int,
+ downscale: float,
+):
+ cap = cv2.VideoCapture(str(video_path))
+ if not cap.isOpened():
+ logging.error(f"Kann Video nicht öffnen: {video_path}")
+ return
+
+ fps = cap.get(cv2.CAP_PROP_FPS)
+ width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
+ height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)
+
+ # JSON-Ausgabe mit Streaming
+ with output_path.open('w', encoding='utf-8') as f_out:
+ f_out.write('[\n')
+ first = True
+ frame_idx = 0
+
+ while True:
+ ret, frame = cap.read()
+ if not ret:
+ break
+ if frame_skip > 1 and frame_idx % frame_skip != 0:
+ frame_idx += 1
+ continue
+
+ if downscale != 1.0:
+ frame = cv2.resize(frame, (width, height))
+
+ detections = model(frame, verbose=False)[0]
+ yolo_boxes = []
+ for box in detections.boxes:
+ conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf)
+ if conf < conf_thresh:
+ continue
+ coords = box.xyxy[0].cpu().numpy()
+ x1, y1, x2, y2 = map(int, coords)
+ yolo_boxes.append([x1, y1, x2 - x1, y2 - y1])
+
+ rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+ mp_result = face_mesh.process(rgb)
+ mp_faces = []
+ if mp_result.multi_face_landmarks:
+ for landmarks in mp_result.multi_face_landmarks:
+ mouth_px = get_mouth_openness(landmarks.landmark, height)
+ xs = [lm.x * width for lm in landmarks.landmark]
+ ys = [lm.y * height for lm in landmarks.landmark]
+ x1, y1 = int(min(xs)), int(min(ys))
+ x2, y2 = int(max(xs)), int(max(ys))
+ mp_faces.append({
+ "bbox": [x1, y1, x2 - x1, y2 - y1],
+ "mouth_openness": round(mouth_px, 1)
+ })
+
+ combined = []
+ for yb in yolo_boxes:
+ if mp_faces:
+ best = max(mp_faces, key=lambda m: iou(yb, m["bbox"]))
+ best_iou = iou(yb, best["bbox"])
+ mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0
+ else:
+ mouth = 0.0
+
+ x, y, w, h = yb
+ cx, cy = x + w / 2, y + h / 2
+ combined.append({
+ "bbox": yb,
+ "mouth_openness": round(mouth, 1),
+ "center": [round(cx, 1), round(cy, 1), w, h]
+ })
+
+ result = {
+ "frame": frame_idx,
+ "timestamp": round(frame_idx / fps, 3),
+ "faces": combined
+ }
+
+ if not first:
+ f_out.write(',\n')
+ json.dump(result, f_out, ensure_ascii=False)
+ first = False
+ frame_idx += 1
+
+ f_out.write('\n]')
+
+ cap.release()
+ logging.info(f"Verarbeitet: {video_path.name} → {output_path.name}")
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description="Analyse von Videos: Gesichter und Mundöffnung erkennen"
+ )
+ parser.add_argument(
+ "--input-dir", type=Path,
+ default=DEFAULT_INPUT_DIR,
+ help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})"
+ )
+ parser.add_argument(
+ "--output-dir", type=Path,
+ default=DEFAULT_OUTPUT_DIR,
+ help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})"
+ )
+ parser.add_argument(
+ "--model", type=Path,
+ default=DEFAULT_MODEL_PATH,
+ help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})"
+ )
+ parser.add_argument(
+ "--conf-thresh", type=float, default=0.5,
+ help="Schwelle für YOLO-Confidence"
+ )
+ parser.add_argument(
+ "--frame-skip", type=int, default=1,
+        help="Nur jeden n-ten Frame verarbeiten"
+ )
+ parser.add_argument(
+ "--downscale", type=float, default=1.0,
+ help="Skalierungsfaktor für Frames"
+ )
+ args = parser.parse_args()
+
+ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
+ args.output_dir.mkdir(parents=True, exist_ok=True)
+
+ yolo = YOLO(str(args.model))
+ face_mesh = mp.solutions.face_mesh.FaceMesh(
+ static_image_mode=False,
+ max_num_faces=5,
+ refine_landmarks=True,
+ min_detection_confidence=0.5,
+ min_tracking_confidence=0.5
+ )
+
+ for video_path in sorted(args.input_dir.glob("*.mp4")):
+ out_path = args.output_dir / f"{video_path.stem}_faces.json"
+ process_video(
+ video_path,
+ out_path,
+ yolo,
+ face_mesh,
+ args.conf_thresh,
+ args.frame_skip,
+ args.downscale,
+ )
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/reformat/speaking.py b/src/reformat/speaking.py
new file mode 100644
index 0000000..4d7b83c
--- /dev/null
+++ b/src/reformat/speaking.py
@@ -0,0 +1,12 @@
+# src/reformat/speaking.py
+
+TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
+BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
+
+def get_mouth_openness(landmarks, image_height):
+ """
+ Berechnet die Mundöffnung basierend auf MediaPipe-Landmarks.
+ """
+ top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
+ bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
+ return abs(bottom_avg - top_avg) * image_height
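+
+# Hypothetische Mini-Verwendung mit Platzhalter-Landmarks (nur Skizze – in der Pipeline
+# kommen die Landmarks aus MediaPipe FaceMesh):
+#   from types import SimpleNamespace
+#   lms = {i: SimpleNamespace(y=0.40) for i in TOP_LIPS}
+#   lms.update({i: SimpleNamespace(y=0.45) for i in BOTTOM_LIPS})
+#   get_mouth_openness(lms, image_height=1080)  # → |0.45 - 0.40| * 1080 = 54.0 Pixel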
diff --git a/src/subtitles/add_subtitles.py b/src/subtitles/add_subtitles.py
new file mode 100644
index 0000000..2f3448c
--- /dev/null
+++ b/src/subtitles/add_subtitles.py
@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+add_subtitles.py — TikTok-Word-Caps mit OpenAI Whisper (CPU)
+- läuft Ordner-weise über 9:16-Kurzclips
+- transkribiert mit word_timestamps=True
+- erzeugt ASS (ein Wort pro Zeile, Pop-Animation, Bottom-Center)
+- brennt via ffmpeg in *_subtitled.mp4
+"""
+
+import os
+import re
+import glob
+import json
+import subprocess
+import tempfile
+import traceback
+import argparse
+from typing import List, Tuple, Optional
+from pathlib import Path
+import sys
+
+# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/subtitles/)
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import CROPPED_DIR, SUBTITLED_DIR # zentrale Pfade
+
+# --- Stabil auf CPU (vermeidet MPS-Sparse-Fehler) ---
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
+os.environ["CUDA_VISIBLE_DEVICES"] = ""
+
+def log(*a): print("[LOG]", *a)
+def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True)
+
+def has_audio_stream(video_path: str) -> bool:
+ cmd = ["ffprobe","-v","error","-select_streams","a","-show_entries","stream=index","-of","json",video_path]
+ try:
+ out = subprocess.check_output(cmd).decode("utf-8")
+ data = json.loads(out)
+ return bool(data.get("streams"))
+ except Exception:
+ return False
+
+def load_whisper_cpu(model_name: str):
+ import whisper # openai-whisper
+ device = "cpu"
+ model = whisper.load_model(model_name, device=device)
+ fp16 = False
+ return model, device, fp16
+
+def transcribe_words_whisper(model, media_path: str, language: Optional[str], fp16: bool) -> List[Tuple[float,float,str]]:
+ """
+ Nutzt 'openai-whisper' mit word_timestamps=True.
+ Fallback: wenn 'words' fehlen, werden Segmenttexte approx. auf Wörter verteilt.
+ """
+ result = model.transcribe(
+ media_path,
+ language=language,
+ task="transcribe",
+ word_timestamps=True,
+ condition_on_previous_text=False,
+ verbose=False,
+ fp16=fp16
+ )
+ words: List[Tuple[float,float,str]] = []
+ segs = result.get("segments", []) or []
+ for seg in segs:
+ wlist = seg.get("words")
+ if isinstance(wlist, list) and wlist and all(isinstance(w, dict) for w in wlist):
+ for w in wlist:
+ t = (w.get("word") or w.get("text") or "").strip()
+ if not t:
+ continue
+ ws = w.get("start"); we = w.get("end")
+ if ws is None or we is None:
+ continue
+ t = re.sub(r"\s+", " ", t)
+ if t:
+ words.append((float(ws), float(we), t))
+ else:
+ # Fallback: Segment auf Wörter aufteilen & Zeiten gleichmäßig verteilen
+ text = (seg.get("text") or "").strip()
+ if not text:
+ continue
+ seg_start = float(seg.get("start", 0.0))
+ seg_end = float(seg.get("end", seg_start))
+ toks = [w for w in re.split(r"(\s+)", text) if w.strip()]
+ if not toks or seg_end <= seg_start:
+ continue
+ dur = seg_end - seg_start
+ step = dur / len(toks)
+ for i, tok in enumerate(toks):
+ ws = seg_start + i * step
+ we = seg_start + (i+1) * step
+ words.append((ws, we, tok))
+ return words
+
+def ass_time(t: float) -> str:
+ if t < 0: t = 0
+ h = int(t // 3600); m = int((t % 3600)//60); s = int(t % 60); cs = int(round((t - int(t))*100))
+ return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}"
+
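+# Beispiel (nur zur Veranschaulichung des Formats): ass_time(75.5) → "0:01:15.50",
+# ass_time(3601.237) → "1:00:01.24" – also H:MM:SS.cc, wie ASS es erwartet.
+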
+def write_ass_words(words: List[Tuple[float,float,str]], ass_path: Path, font_size: int, margin_v: int, uppercase: bool):
+ """
+ Ein Wort pro Zeile, ohne Überlappung:
+ - Ende = min(eigene Endzeit, Start nächstes Wort - 0.02)
+ - Pop-Animation 150ms, fette Outline, Bottom-Center (PlayResY=1920)
+ """
+ header = f"""[Script Info]
+ScriptType: v4.00+
+Collisions: Normal
+PlayResX: 1080
+PlayResY: 1920
+ScaledBorderAndShadow: yes
+
+[V4+ Styles]
+Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: WordCap,Inter,{font_size},&H00FFFFFF,&H00FFFFFF,&H00101010,&H64000000,1,0,0,0,100,100,0,0,1,6,0.8,2,80,80,{margin_v},1
+
+[Events]
+Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
+"""
+ # Zeiten glätten, damit immer nur ein Wort sichtbar ist
+ adjusted = []
+ for i, (s, e, t) in enumerate(words):
+ nstart = words[i+1][0] if i+1 < len(words) else e
+ new_end = min(e, nstart - 0.02) if nstart > s else e
+ if new_end <= s:
+ new_end = s + 0.06
+ adjusted.append((s, new_end, t))
+
+ with open(ass_path, "w", encoding="utf-8") as f:
+ f.write(header)
+ for s, e, t in adjusted:
+ st, en = ass_time(s), ass_time(e)
+ txt = t.upper() if uppercase else t
+ # \fs sichere Größe, \blur für weiche Outline, \fad Ein/Aus,
+ # \fscx135\fscy135 → Start groß, \t(...) schrumpft in 150ms auf 100% = Pop
+ overrides = r"\blur1\bord8\1c&H0000FFFF&\3c&H000000&\4c&H000000&\fad(50,20)\fscx135\fscy135\t(0,150,\fscx100\fscy100)"
+ f.write(f"Dialogue: 0,{st},{en},WordCap,,0,0,0,,{{{overrides}}}{txt}\n")
+
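+# Beispielhafte Ausgabezeile (Werte rein illustrativ), wie write_ass_words sie erzeugt:
+# Dialogue: 0,0:00:01.20,0:00:01.55,WordCap,,0,0,0,,{\blur1\bord8…\t(0,150,\fscx100\fscy100)}HALLO
+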
+def ffmpeg_escape_for_subtitles(path: Path) -> str:
+ """
+ Pfad für -vf subtitles=… escapen (für Leerzeichen, Doppelpunkte etc.).
+ ffmpeg erwartet Backslash-escaping für Filter-Argumente.
+ """
+ s = str(path)
+ s = s.replace("\\", "\\\\")
+ s = s.replace(":", "\\:")
+ s = s.replace("'", "\\'")
+ s = s.replace(",", "\\,")
+ s = s.replace("[", "\\[")
+ s = s.replace("]", "\\]")
+ s = s.replace(";", "\\;")
+ s = s.replace("=", "\\=")
+ return s
+
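+# Beispiel: ffmpeg_escape_for_subtitles(Path("/tmp/clip:v1.ass")) → "/tmp/clip\:v1.ass",
+# sodass der Filter als -vf "subtitles=/tmp/clip\:v1.ass" übergeben werden kann.
+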
+def burn(video_in: Path, ass_file: Path, out_path: Path, crf=18, preset="medium") -> int:
+ vf = f"subtitles={ffmpeg_escape_for_subtitles(ass_file)}"
+ cmd = [
+ "ffmpeg","-y","-i",str(video_in),
+ "-vf", vf,
+ "-c:v","libx264","-preset",preset,"-crf",str(crf),
+ "-c:a","copy",
+ str(out_path)
+ ]
+ log("FFmpeg:", " ".join(cmd))
+ return subprocess.call(cmd)
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Brennt Word-Caps (ASS) via Whisper-Transkription in 9:16-Clips.")
+ p.add_argument("--clips_dir", type=Path, default=CROPPED_DIR, help=f"Quellordner (Default: {CROPPED_DIR})")
+ p.add_argument("--out_dir", type=Path, default=SUBTITLED_DIR, help=f"Zielordner (Default: {SUBTITLED_DIR})")
+ p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster (Default: *.mp4)")
+ p.add_argument("--limit", type=int, default=None, help="Nur die ersten N Clips verarbeiten")
+ p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "medium"), help="Whisper-Modell")
+ p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. de, en, None=Auto)")
+ p.add_argument("--uppercase", action="store_true", help="Text in Großbuchstaben rendern")
+ p.add_argument("--font_size", type=int, default=108, help="ASS-Fontgröße")
+ p.add_argument("--margin_v", type=int, default=320, help="ASS-MarginV (Abstand vom unteren Rand)")
+ p.add_argument("--crf", type=int, default=18, help="ffmpeg CRF (Qualität)")
+ p.add_argument("--preset", type=str, default="medium", help="ffmpeg Preset")
+ return p.parse_args()
+
+def main():
+ args = parse_args()
+
+ clips_dir = args.clips_dir
+ output_dir = args.out_dir
+ ensure_dir(output_dir)
+
+ log("Starte TikTok Word-Caps (Whisper)")
+ log("CLIPS_DIR =", clips_dir)
+ log("OUTPUT_DIR =", output_dir)
+
+ clips: List[str] = []
+ for pat in (args.pattern,):
+ clips += glob.glob(str(clips_dir / pat))
+ clips.sort()
+ log(f"{len(clips)} Clips gefunden.")
+ if args.limit:
+ clips = clips[:args.limit]
+ log(f"LIMIT aktiv: {args.limit}")
+
+ if not clips:
+ log("Keine Clips gefunden. Pfad/Pattern checken.")
+ return
+
+ # Whisper laden (CPU)
+ try:
+ model, device, fp16 = load_whisper_cpu(args.model)
+ log(f"Whisper geladen: {args.model} auf {device} (fp16={fp16})")
+ log("Hinweis: Beim ersten Lauf kann das Modell nachgeladen werden.")
+ except Exception as e:
+ print("[ERROR] Whisper konnte nicht geladen werden:", e)
+ traceback.print_exc()
+ return
+
+ lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
+
+ for clip in clips:
+ base = os.path.basename(clip)
+ stem, _ = os.path.splitext(base)
+ log("="*60)
+ log("Clip:", base)
+
+ if not has_audio_stream(clip):
+ log("WARN: Keine Audio-Spur → übersprungen.")
+ continue
+
+ # Transkription
+ try:
+ log("Transkription startet …")
+ words = transcribe_words_whisper(model, clip, language=lang, fp16=fp16)
+ log(f"Transkription fertig. {len(words)} Wörter.")
+ if not words:
+ log("WARN: 0 Wörter erkannt → übersprungen.")
+ continue
+ except Exception as e:
+ print("[ERROR] Transkription fehlgeschlagen:", e)
+ traceback.print_exc()
+ continue
+
+ # ASS erzeugen & brennen
+ with tempfile.NamedTemporaryFile(suffix=".ass", delete=False) as tmp:
+ ass_path = Path(tmp.name)
+ try:
+ log("Erzeuge ASS …")
+ write_ass_words(words, ass_path, font_size=args.font_size, margin_v=args.margin_v, uppercase=args.uppercase)
+ out_path = output_dir / f"{stem}_subtitled.mp4"
+ log("Brenne Untertitel …")
+ rc = burn(Path(clip), ass_path, out_path, crf=args.crf, preset=args.preset)
+ if rc == 0:
+ log("OK:", out_path)
+ else:
+ log("ERROR: ffmpeg fehlgeschlagen, code", rc)
+ finally:
+ try: ass_path.unlink(missing_ok=True)
+ except Exception: pass
+
+ log("Fertig.")
+
+if __name__ == "__main__":
+ main()
diff --git a/src/subtitles/run_subtitles.py b/src/subtitles/run_subtitles.py
new file mode 100644
index 0000000..1ce2f40
--- /dev/null
+++ b/src/subtitles/run_subtitles.py
@@ -0,0 +1,25 @@
+import os
+import tempfile
+from add_subtitles import process # wir nutzen die Logik aus dem großen Skript
+
+# ==== HIER EINSTELLEN ====
+VIDEO_PATH = "data/input.mp4" # Dein Video
+TRANSCRIPT_PATH = "data/transcript.srt" # Oder .json (Whisper)
+OUTPUT_DIR = "data/output" # Ordner für Ergebnisse
+CLIPS_PATH = None # Optional: "data/clips.csv" oder "data/clips.json"
+CRF = 18
+PRESET = "medium"
+STYLE = r"\\bord4\\shad4\\outline3\\fs64\\b1\\1c&HFFFFFF&\\3c&H000000&\\4c&H000000&"
+# ==========================
+
+if __name__ == "__main__":
+ os.makedirs(OUTPUT_DIR, exist_ok=True)
+ process(
+ video_path=VIDEO_PATH,
+ transcript_path=TRANSCRIPT_PATH,
+ output_dir=OUTPUT_DIR,
+ clips_path=CLIPS_PATH,
+ crf=CRF,
+ preset=PRESET,
+ style_overrides=STYLE,
+ )
diff --git a/src/text/cutClips.py b/src/text/cutClips.py
new file mode 100644
index 0000000..ae314b8
--- /dev/null
+++ b/src/text/cutClips.py
@@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+# cutClips.py — exportiert Clips aus dem ersten gefundenen Video oder aus angegebener Datei
+
+from pathlib import Path
+import sqlite3
+import argparse
+from moviepy.video.io.VideoFileClip import VideoFileClip
+import sys
+
+# ── Projektwurzel in sys.path aufnehmen
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import INPUT_DIR, RAW_CLIPS_DIR, DB_PATH
+
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Exportiert Highlights aus dem Video gemäß SQLite-DB.")
+ p.add_argument("--file", type=str, default=None,
+ help="Name der Input-Datei im INPUT_DIR. Wenn leer, wird das erste Video im Ordner verwendet.")
+ p.add_argument("--limit", type=int, default=10,
+ help="Anzahl der zu exportierenden Clips (Default: 10)")
+ p.add_argument("--order", type=str, choices=["score", "start"], default="score",
+ help="Sortierung: 'score' (score_total absteigend) oder 'start' (zeitlich).")
+ return p.parse_args()
+
+
+def find_first_video(directory: Path):
+ """Suche nach der ersten Videodatei im Verzeichnis (mp4, mov, mkv)."""
+ for ext in ("*.mp4", "*.mov", "*.mkv"):
+ files = sorted(directory.glob(ext))
+ if files:
+ return files[0]
+ return None
+
+
+def main():
+ args = parse_args()
+
+ # === Eingabevideo bestimmen ===
+ if args.file:
+ input_video = INPUT_DIR / args.file
+ else:
+ input_video = find_first_video(INPUT_DIR)
+ if not input_video:
+ raise FileNotFoundError(f"🚫 Kein Video im Eingabeordner {INPUT_DIR} gefunden.")
+ print(f"📂 Kein --file angegeben → verwende automatisch: {input_video.name}")
+
+ if not input_video.exists():
+ raise FileNotFoundError(f"🚫 Input-Video nicht gefunden: {input_video}")
+
+ output_dir = RAW_CLIPS_DIR
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # === SQLite DB lesen ===
+ conn = sqlite3.connect(DB_PATH)
+ cursor = conn.cursor()
+
+ order_clause = "ORDER BY score_total DESC" if args.order == "score" else "ORDER BY start ASC"
+ cursor.execute(f"""
+ SELECT start, end, text
+ FROM highlights
+ {order_clause}
+ LIMIT ?
+ """, (args.limit,))
+ highlights = cursor.fetchall()
+
+ if not highlights:
+ print("⚠️ Keine Highlights in der Datenbank gefunden.")
+ conn.close()
+ return
+
+ # === Video laden ===
+ video = VideoFileClip(str(input_video))
+
+ # === Clips schneiden ===
+ for i, (start, end, text) in enumerate(highlights, start=1):
+ if start >= video.duration:
+ print(f"⚠️ Clip {i} übersprungen – Startzeit {start:.2f}s liegt außerhalb der Videolänge ({video.duration:.2f}s).")
+ continue
+
+ end = min(end, video.duration)
+ output_file = output_dir / f"highlight_{i}.mp4"
+ print(f"🎬 Exportiere Clip {i}: {start:.2f}s – {end:.2f}s → {output_file.name}")
+
+ try:
+ clip = video.subclipped(start, end)
+ clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac", logger=None)
+ clip.close()
+ except Exception as e:
+ print(f"❌ Fehler beim Export von Clip {i}: {e}")
+
+ # === Cleanup ===
+ conn.close()
+ video.close()
+ print(f"✅ {len(highlights)} Clips exportiert nach {output_dir}")
+
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff --git a/code/text/rateCluster.py b/src/text/rateCluster.py
similarity index 69%
rename from code/text/rateCluster.py
rename to src/text/rateCluster.py
index 0c9cf07..fe5e2f6 100644
--- a/code/text/rateCluster.py
+++ b/src/text/rateCluster.py
@@ -2,44 +2,41 @@ import sqlite3
import re
from openai import OpenAI
from time import sleep
+from pathlib import Path
+import os
+import sys
+
+# Projekt-Root einfügen (2 Ebenen hoch von src/* ausgehend)
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import DB_PATH
+
-# === Einstellungen ===
-DB_PATH = "clips_openai.db"
-VIDEO_ID = "testVideoShort"
MAX_CLIPS = 5 # oder "all"
-OPENAI_API_KEY = "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"
-client = OpenAI(api_key=OPENAI_API_KEY)
+# === OPENAI-CLIENT (API-Key aus Env) ===
+if not os.getenv("OPENAI_API_KEY"):
+ raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# === DB-Verbindung
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
-cursor.execute("DROP TABLE IF EXISTS highlights")
+# === Unbewertete Highlights laden
cursor.execute("""
-CREATE TABLE highlights (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- file TEXT,
- start REAL,
- end REAL,
- text TEXT,
- viralitaet INTEGER,
- emotionalitaet INTEGER,
- witz INTEGER,
- provokation INTEGER,
- score_total INTEGER
-)
+ SELECT id, start, end, text FROM highlights
+ WHERE viralitaet IS NULL OR emotionalitaet IS NULL
+ ORDER BY start
""")
-conn.commit()
-print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}")
-
-# === Segmente laden
-cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
segments = cursor.fetchall()
-print(f"📥 {len(segments)} Segmente (Originaltext) geladen.")
+print(f"📥 {len(segments)} unbewertete Highlights geladen.")
# === Bewertungsfunktion (GPT-4o)
-def analyse_segment(text, start, end):
+def analyse_segment(clip_id, text, start, end):
print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s")
prompt = f"""
@@ -86,19 +83,19 @@ Provokation: [Zahl]
if all(v is not None for v in values.values()):
total_score = sum(values.values())
cursor.execute("""
- INSERT INTO highlights (
- file, start, end, text,
- viralitaet, emotionalitaet, witz, provokation, score_total
- ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
+ UPDATE highlights SET
+ viralitaet = ?, emotionalitaet = ?, witz = ?, provokation = ?, score_total = ?
+ WHERE id = ?
""", (
- VIDEO_ID, start, end, text.strip(),
values["viralitaet"], values["emotionalitaet"],
values["witz"], values["provokation"],
- total_score
+ total_score,
+ clip_id
))
conn.commit()
return {
+ "id": clip_id,
"start": start,
"end": end,
"text": text.strip(),
@@ -113,8 +110,8 @@ Provokation: [Zahl]
# === Clips bewerten
rated = []
-for start, end, text in segments:
- result = analyse_segment(text, float(start), float(end))
+for clip_id, start, end, text in segments:
+ result = analyse_segment(clip_id, text, float(start), float(end))
if result:
rated.append(result)
sleep(1.2) # Anti-Rate-Limit
@@ -123,7 +120,7 @@ for start, end, text in segments:
rated.sort(key=lambda x: x["total"], reverse=True)
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]
-print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}")
+print(f"\n🎬 Beste {len(selected)} Highlights nach Bewertung:")
for clip in selected:
print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
print(f"🎙️ {clip['text'][:200]}...")
diff --git a/src/text/segment_transcript.py b/src/text/segment_transcript.py
new file mode 100644
index 0000000..7e8e577
--- /dev/null
+++ b/src/text/segment_transcript.py
@@ -0,0 +1,409 @@
+#!/usr/bin/env python3
+# segment_transcript.py — word-based text rebuild (no duplicates)
+
+import os
+import re
+import json
+import sqlite3
+import time
+from pathlib import Path
+from datetime import datetime
+import argparse
+import sys
+from typing import List, Dict, Optional
+
+from openai import OpenAI
+
+# ── Projektwurzel in sys.path aufnehmen (dieses Skript kann z. B. unter src/text/ liegen)
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import TRANSCRIPTS_DIR, DB_PATH # zentrale Pfade
+
+LOG_DIR = ROOT / "logs"
+LOG_DIR.mkdir(exist_ok=True, parents=True)
+
+# === DEFAULTS (per CLI überschreibbar) ===
+DEFAULT_BLOCK_DURATION = 300.0 # Sek. pro Block
+DEFAULT_MIN_CLIP_LEN = 30.0 # konsistent mit Prompt
+DEFAULT_MAX_CLIP_LEN = 90.0
+
+# === OPENAI-CLIENT (API-Key aus Env) ===
+API_KEY = os.getenv("OPENAI_API_KEY")
+if not API_KEY:
+ raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # bei Bedarf überschreiben
+client = OpenAI(api_key=API_KEY)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Hilfsfunktionen
+# ──────────────────────────────────────────────────────────────────────────────
+
+def log_text(filename: str, content: str) -> None:
+ (LOG_DIR / filename).write_text((content or "").strip(), encoding="utf-8")
+
+def append_error_log(content: str) -> None:
+ with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
+ f.write(f"{datetime.now().isoformat()} {content}\n\n")
+
+def extract_json(text: str) -> list:
+ """Nur für Debug: versucht JSON-Array aus beliebigem Text zu extrahieren."""
+ txt = (text or "").strip()
+ txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.IGNORECASE | re.DOTALL)
+ m = re.search(r"\[\s*{.*?}\s*\]", txt, re.DOTALL)
+ if not m:
+ append_error_log(f"❌ Kein JSON-Array gefunden.\n{txt}")
+ return []
+ try:
+ return json.loads(m.group(0))
+ except Exception as e:
+ append_error_log(f"❌ JSON-Fehler: {e}\n{txt}")
+ return []
+
+def get_json_snippets_for_clip(start: float, end: float, segment_json: List[Dict]) -> List[Dict]:
+ """halb-offenes Fenster [start, end)"""
+ return [s for s in segment_json if not (float(s["end"]) <= start or float(s["start"]) >= end)]
+
+def _norm_space(s: str) -> str:
+ return re.sub(r"\s+", " ", (s or "").strip())
+
+def explode_segments_to_words(segments: List[Dict]) -> List[Dict]:
+ """
+ Baut eine globale Wortliste. Bevorzugt echte 'words' aus JSON,
+ fällt ansonsten auf lineare Interpolation über Segmentdauer zurück.
+ Ausgabe-Items: {idx, mid, text}
+ """
+ words = []
+ idx = 0
+ for seg in sorted(segments, key=lambda s: (float(s["start"]), float(s["end"]))):
+ s0, s1 = float(seg["start"]), float(seg["end"])
+ txt = (seg.get("text") or "").strip()
+ seg_words = seg.get("words") or []
+ if seg_words:
+ for w in seg_words:
+ t = (w.get("text") or w.get("word") or "").strip()
+ if not t:
+ continue
+ w0 = float(w["start"]); w1 = float(w["end"])
+ words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": t})
+ idx += 1
+ else:
+ toks = txt.split()
+ n = len(toks)
+ if n == 0:
+ continue
+ dur = max(1e-6, s1 - s0)
+ for i, tok in enumerate(toks):
+ w0 = s0 + (i / n) * dur
+ w1 = s0 + ((i + 1) / n) * dur
+ words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": tok})
+ idx += 1
+ return words
+
+def build_text_strict_from_words(clip_start: float, clip_end: float, WORDS: List[Dict]) -> str:
+ """Nimmt jedes Wort genau einmal, wenn mid ∈ [start, end)."""
+ sel = [w for w in WORDS if clip_start <= w["mid"] < clip_end]
+ sel.sort(key=lambda w: w["idx"])
+ return _norm_space(" ".join(w["text"] for w in sel))
+
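+# Hypothetisches Mini-Beispiel (nur Skizze): ein Segment 0.0–2.0 s mit "hallo schöne welt"
+# und ohne 'words'-Liste wird zu drei Wörtern mit mid ≈ 0.33 / 1.00 / 1.67 interpoliert;
+# build_text_strict_from_words(0.0, 1.5, WORDS) liefert dann "hallo schöne", weil nur
+# Wörter mit mid ∈ [0.0, 1.5) übernommen werden – jedes Wort genau einmal.
+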
+def find_transcript_pair(base: Optional[str]) -> tuple[Path, Path, str]:
+ """
+ Finde (timed.txt, segments.json) in TRANSCRIPTS_DIR.
+ - Wenn base übergeben: benutzt {base}_timed.txt und {base}_segments.json.
+ - Sonst: nimmt das lexikographisch erste *_timed.txt und leitet die JSON davon ab.
+ """
+ if base:
+ txt = TRANSCRIPTS_DIR / f"{base}_timed.txt"
+ jsn = TRANSCRIPTS_DIR / f"{base}_segments.json"
+ if not txt.exists():
+ raise FileNotFoundError(f"Transkript nicht gefunden: {txt}")
+ if not jsn.exists():
+ raise FileNotFoundError(f"Segment-JSON nicht gefunden: {jsn}")
+ return txt, jsn, base
+
+ # auto-detect
+ candidates = sorted(TRANSCRIPTS_DIR.glob("*_timed.txt"))
+ if not candidates:
+ raise FileNotFoundError(f"Keine *_timed.txt in {TRANSCRIPTS_DIR} gefunden.")
+ txt = candidates[0]
+ stem = txt.stem.replace("_timed", "")
+ jsn = TRANSCRIPTS_DIR / f"{stem}_segments.json"
+ if not jsn.exists():
+ raise FileNotFoundError(f"Gefundenes TXT: {txt.name}, aber JSON fehlt: {jsn.name}")
+ return txt, jsn, stem
+
+# ──────────────────────────────────────────────────────────────────────────────
+# CLI
+# ──────────────────────────────────────────────────────────────────────────────
+
+def parse_args():
+ p = argparse.ArgumentParser(description="Selektiert Social-Media-taugliche Clips aus Transkripten (LLM-gestützt).")
+ p.add_argument("--base", type=str, default=None,
+ help="Basename der Transkriptdateien (z. B. 'testVideoShort' für *_timed.txt und *_segments.json).")
+ p.add_argument("--block", type=float, default=DEFAULT_BLOCK_DURATION, help="Blocklänge in Sekunden für die Prompt-Bildung.")
+ p.add_argument("--min", type=float, default=DEFAULT_MIN_CLIP_LEN, help="Minimale Clip-Länge (Sekunden).")
+ p.add_argument("--max", type=float, default=DEFAULT_MAX_CLIP_LEN, help="Maximale Clip-Länge (Sekunden).")
+ return p.parse_args()
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Main
+# ──────────────────────────────────────────────────────────────────────────────
+
+def main():
+ args = parse_args()
+ BLOCK_DURATION = float(args.block)
+ MIN_CLIP_LEN = float(args.min)
+ MAX_CLIP_LEN = float(args.max)
+
+ # --- Locate the transcript files ---
+ TRANSCRIPT_PATH, SEGMENT_JSON_PATH, base = find_transcript_pair(args.base)
+ print(f"📄 TXT : {TRANSCRIPT_PATH}")
+ print(f"🧾 JSON: {SEGMENT_JSON_PATH}")
+
+ # === READ TRANSCRIPT (TXT) -> used ONLY for block building & prompt display ===
+ lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
+ segments_txt: List[Dict] = []
+ for line in lines:
+ m = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(?:[A-Z_0-9]+:)?\s*(.*)", line)
+ if not m:
+ continue
+ start, end, text = m.groups()
+ start, end = float(start), float(end)
+ if end - start >= 2.0:
+ segments_txt.append({"start": start, "end": end, "text": (text or "").strip()})
+
+ if not segments_txt:
+ raise RuntimeError("🚫 Keine gültigen TXT-Segmente gefunden.")
+ print(f"✅ {len(segments_txt)} gültige TXT-Segmente geladen.")
+
+ # === READ TRANSCRIPT (JSON) -> source for DB text/words ===
+ segment_json_data = json.loads(SEGMENT_JSON_PATH.read_text(encoding="utf-8"))
+ if not isinstance(segment_json_data, list) or not segment_json_data:
+ raise RuntimeError("🚫 JSON-Segmente leer/ungültig.")
+ print(f"✅ {len(segment_json_data)} JSON-Segmente geladen.")
+
+ # Compute the global word list once (before building clips)
+ WORDS = explode_segments_to_words(segment_json_data)
+ print(f"🔤 Globale Wörter im Korpus: {len(WORDS)}")
+
+ # === BUILD BLOCKS (from TXT) ===
+ segments_txt.sort(key=lambda s: (s["start"], s["end"]))
+ blocks, current_block, current_start = [], [], 0.0
+ for seg in segments_txt:
+ if not current_block:
+ current_start = seg["start"]
+ # Start a new block once the duration is exceeded
+ if seg["end"] - current_start > BLOCK_DURATION:
+ blocks.append(current_block)
+ current_block = []
+ current_start = seg["start"]
+ current_block.append(seg)
+ if current_block:
+ blocks.append(current_block)
+ print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION:.0f}s).")
+
+ # === LLM: CLIP SELECTION ===
+ all_clips = []
+ t0 = time.perf_counter()
+
+ for i, block in enumerate(blocks, start=1):
+ if not block:
+ continue
+ print(f"\n🤖 Sende Block {i}/{len(blocks)} an {OPENAI_MODEL} …")
+ block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
+
+ prompt = f"""
+Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Social Media Clips eignen.
+Ein guter Clip:
+- ist abgeschlossen und verständlich
+- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
+- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
+- ist mindestens {MIN_CLIP_LEN:.0f} Sekunden lang
+Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
+
+Gib ein JSON-Objekt zurück im Format:
+{{
+ "clips": [
+ {{
+ "start": float,
+ "end": float,
+ "summary": "Kurze Beschreibung des Inhalts"
+ }}
+ ]
+}}
+
+TRANSKRIPT:
+{block_text}
+""".strip()
+
+ log_text(f"block_prompt_{i:02d}.txt", prompt)
+
+ # --- robust API call with a JSON schema (root = object) and a small retry loop ---
+ clips = []
+ for attempt in range(3):
+ try:
+ resp = client.chat.completions.create(
+ model=OPENAI_MODEL,
+ messages=[{"role": "user", "content": prompt}],
+ response_format={
+ "type": "json_schema",
+ "json_schema": {
+ "name": "clips_payload",
+ "schema": {
+ "type": "object",
+ "properties": {
+ "clips": {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "start": {"type": "number"},
+ "end": {"type": "number"},
+ "summary": {"type": "string"}
+ },
+ "required": ["start", "end", "summary"],
+ "additionalProperties": False
+ }
+ }
+ },
+ "required": ["clips"],
+ "additionalProperties": False
+ }
+ }
+ }
+ )
+ msg = resp.choices[0].message
+ payload = getattr(msg, "parsed", None)
+ if payload is None:
+ payload = json.loads(msg.content)
+
+ clips = payload.get("clips", []) or []
+
+ try:
+ log_text(f"block_output_{i:02d}.txt", json.dumps(payload, ensure_ascii=False, indent=2))
+ except Exception:
+ pass
+ break
+ except Exception as e:
+ if attempt == 2:
+ append_error_log(f"❌ OpenAI-Fehler Block {i}: {e}")
+ print(f"❌ Fehler bei Block {i}: {e}")
+ else:
+ time.sleep(1.5 * (attempt + 1))
+
+ print(f"✅ {len(clips)} Clips empfangen in Block {i}")
+
+ # --- Filter & clamp clips ---
+ for clip in clips:
+ try:
+ b_start, b_end = block[0]["start"], block[-1]["end"]
+ start = max(b_start, min(float(clip["start"]), b_end))
+ end = max(b_start, min(float(clip["end"]), b_end))
+ dur = end - start
+ if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
+ clip["start"] = start
+ clip["end"] = end
+ clip["duration"] = round(dur, 2)
+ all_clips.append(clip)
+ except Exception as e:
+ append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
+
+ elapsed = time.perf_counter() - t0
+ avg = elapsed / i
+ eta = max(0.0, avg * (len(blocks) - i))
+ print(f"⏱️ Geschätzte Restzeit: {eta:.1f} s")
+
+ # --- Remove duplicates (rounded to 2 decimals) ---
+ dedup, seen = [], set()
+ for c in all_clips:
+ k = (round(c["start"], 2), round(c["end"], 2))
+ if k in seen:
+ continue
+ seen.add(k)
+ dedup.append(c)
+ all_clips = dedup
+
+ print(f"\n📈 Gesamtclips vor DB-Insert: {len(all_clips)}")
+
+ # === SAVE TO DB ===
+ conn = sqlite3.connect(DB_PATH)
+ cur = conn.cursor()
+
+ cur.execute("""
+ CREATE TABLE IF NOT EXISTS highlights (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ file TEXT,
+ start REAL,
+ end REAL,
+ duration REAL,
+ text TEXT,
+ summary TEXT,
+ json_raw TEXT,
+ viralitaet INTEGER,
+ emotionalitaet INTEGER,
+ witz INTEGER,
+ provokation INTEGER,
+ score_total INTEGER,
+ UNIQUE(file,start,end)
+ )
+ """)
+
+ # --- Clear the table completely before a new run ---
+ cur.execute("DELETE FROM highlights")
+ conn.commit() # close the transaction so that VACUUM runs outside of it
+
+ # Run VACUUM separately (optional)
+ try:
+ conn.execute("VACUUM") # or: sqlite3.connect(DB_PATH).execute("VACUUM").close()
+ print("🧹 Alte Highlights gelöscht und Datenbank komprimiert.")
+ except sqlite3.OperationalError as e:
+ print(f"⚠️ VACUUM übersprungen: {e}")
+
+ inserted = 0
+ failed = 0
+
+ for clip in all_clips:
+ try:
+ start = float(clip["start"])
+ end = float(clip["end"])
+ duration = float(clip["duration"])
+ summary = (clip.get("summary") or "").strip()
+
+ if end <= start or start < 0:
+ raise ValueError("Ungültige Zeiten")
+
+ # JSON snippets (for traceability) + word-based text (duplicate-free)
+ json_snippets = get_json_snippets_for_clip(start, end, segment_json_data)
+ json_raw = json.dumps(json_snippets, ensure_ascii=False)
+
+ original_text = build_text_strict_from_words(start, end, WORDS)
+
+ cur.execute("""
+ INSERT OR IGNORE INTO highlights (
+ file, start, end, duration, text, summary, json_raw,
+ viralitaet, emotionalitaet, witz, provokation, score_total
+ )
+ VALUES (?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL, NULL, NULL)
+ """, (
+ # 'file' = basename (e.g. testVideoShort)
+ Path(base).name,
+ start, end, duration,
+ original_text, summary, json_raw
+ ))
+ if cur.rowcount > 0:
+ inserted += 1
+ except Exception as e:
+ failed += 1
+ append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
+
+ conn.commit()
+ conn.close()
+
+ print("\n📊 Ergebnisse:")
+ print(f" ✅ Highlights gespeichert: {inserted}")
+ print(f" ❌ Fehlerhafte Clips: {failed}")
+ print(f"📁 Logs: {LOG_DIR.resolve()}")
+
+if __name__ == "__main__":
+ main()
diff --git a/src/text/transcription.py b/src/text/transcription.py
new file mode 100644
index 0000000..0c8ee69
--- /dev/null
+++ b/src/text/transcription.py
@@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+# transcription.py — chunked Whisper transcription with word timestamps, duplication-safe
+
+import os
+import sys
+import json
+import argparse
+from pathlib import Path
+from tempfile import TemporaryDirectory
+from typing import List, Dict, Tuple, Optional
+
+import ffmpeg
+import whisper
+
+# ── Add the project root to sys.path (this script lives e.g. under src/text/)
+ROOT = Path(__file__).resolve().parents[2]
+sys.path.insert(0, str(ROOT))
+
+from config import INPUT_DIR, TRANSCRIPTS_DIR # central paths
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Utilities
+# ──────────────────────────────────────────────────────────────────────────────
+
+def probe_duration(path: Path) -> float:
+ """Ermittle die Videodauer in Sekunden (ffmpeg.probe)."""
+ try:
+ meta = ffmpeg.probe(str(path))
+ except ffmpeg.Error as e:
+ raise RuntimeError(f"ffmpeg.probe fehlgeschlagen für {path}: {e.stderr.decode('utf-8','ignore') if hasattr(e, 'stderr') else e}") from e
+
+ dur = meta.get("format", {}).get("duration")
+ if dur is not None:
+ return float(dur)
+
+ cand = 0.0
+ for s in meta.get("streams", []) or []:
+ d = s.get("duration")
+ if d:
+ cand = max(cand, float(d))
+ if cand > 0:
+ return cand
+ raise RuntimeError(f"Konnte Videodauer nicht bestimmen: {path}")
+
+def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float,float]]:
+ """Zerteile [0,total] in überlappende Intervalle."""
+ if chunk_seconds <= 0:
+ return [(0.0, total)]
+ s, out = 0.0, []
+ while s < total:
+ e = min(s + chunk_seconds, total)
+ out.append((s, e))
+ if e >= total:
+ break
+ s = max(0.0, e - overlap)
+ return out
+
+def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None:
+ """Extrahiere [start,end] als Mono-16kHz-WAV."""
+ (
+ ffmpeg
+ .input(str(src_video), ss=start, to=end)
+ .output(
+ str(out_wav),
+ format="wav",
+ acodec="pcm_s16le",
+ ac=1,
+ ar="16000",
+ loglevel="error",
+ )
+ .overwrite_output()
+ .run()
+ )
+
+def is_suspect(text: str) -> bool:
+ """Heuristik: leere/loopende/zweifelhafte Zeilen markieren."""
+ t = (text or "").strip().lower()
+ if not t:
+ return True
+ words = t.split()
+ if not words:
+ return True
+ counts = {w: words.count(w) for w in set(words)}
+ most_common = max(counts.values())
+ return most_common / len(words) > 0.6 or most_common > 20
+
+def merge_overlaps_keep_best(
+ segments: List[Dict],
+ max_gap: float = 0.15,
+ min_dur: float = 0.30
+) -> List[Dict]:
+ """
+ Sort by time and close small gaps. On overlap:
+ - no text concatenation
+ - keep the "better" segment (longer duration, then longer text)
+ - words: taken from the "best" segment (if present)
+ """
+ cleaned = []
+ for s in segments:
+ s0 = float(s["start"]); s1 = float(s["end"])
+ txt = (s.get("text") or "").strip()
+ if s1 - s0 >= min_dur and txt:
+ cleaned.append({
+ "start": s0, "end": s1,
+ "text": txt,
+ "words": s.get("words", [])
+ })
+ if not cleaned:
+ return []
+
+ cleaned.sort(key=lambda x: (x["start"], x["end"]))
+ out = [cleaned[0]]
+
+ def score(x: Dict) -> tuple:
+ return (x["end"] - x["start"], len(x.get("text", "")))
+
+ for s in cleaned[1:]:
+ m = out[-1]
+ if s["start"] <= m["end"] + max_gap:
+ best = s if score(s) > score(m) else m
+ out[-1] = {
+ "start": min(m["start"], s["start"]),
+ "end": max(m["end"], s["end"]),
+ "text": best["text"],
+ "words": best.get("words", []),
+ }
+ else:
+ out.append(s)
+ return out
+
+def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True):
+ """Schreibe _timed.txt, _suspect_lines.txt und _segments.json."""
+ out_dir.mkdir(parents=True, exist_ok=True)
+ dash = "-" if ascii_dash else "–"
+
+ out_txt = out_dir / f"{base.stem}_timed.txt"
+ out_sus = out_dir / f"{base.stem}_suspect_lines.txt"
+ out_json = out_dir / f"{base.stem}_segments.json"
+
+ # The TXT is for viewing only
+ with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus:
+ for s in segments:
+ line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n"
+ f_txt.write(line)
+ if is_suspect(s["text"]):
+ f_sus.write(line)
+
+ # JSON for downstream processing (incl. words)
+ with open(out_json, "w", encoding="utf-8") as f_json:
+ json.dump(segments, f_json, ensure_ascii=False, indent=2)
+
+ return out_txt, out_sus, out_json
+
+def find_default_input() -> Optional[Path]:
+ """Nimm das erste Video aus INPUT_DIR, falls kein --input übergeben wurde."""
+ exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3")
+ for p in sorted(INPUT_DIR.iterdir()):
+ if p.suffix.lower() in exts:
+ return p
+ return None
+
+# ──────────────────────────────────────────────────────────────────────────────
+# CLI
+# ──────────────────────────────────────────────────────────────────────────────
+
+def parse_args():
+ p = argparse.ArgumentParser(
+ description="Chunked Whisper Transcription mit Wortzeitstempeln & doppler-sicherem Stitching."
+ )
+ p.add_argument("--input", type=Path, default=None, help=f"Eingabevideo/-audio. Default: erstes File in {INPUT_DIR}")
+ p.add_argument("--outdir", type=Path, default=None, help=f"Ausgabeverzeichnis. Default: {TRANSCRIPTS_DIR}")
+ p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper-Modell (tiny/base/small/medium/large)")
+ p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. 'de') oder leer/None für Auto-Detect")
+ p.add_argument("--chunk", type=float, default=60.0, help="Chunk-Länge in Sekunden (0 = ganzes File)")
+ p.add_argument("--overlap", type=float, default=2.0, help="Overlap in Sekunden")
+ p.add_argument("--min-dur", type=float, default=0.30, help="Mindest-Segmentdauer (Sekunden)")
+ p.add_argument("--max-gap", type=float, default=0.15, help="Maximaler Zeit-Gap für Merge (Sekunden)")
+ p.add_argument("--fp16", action="store_true", help="fp16 aktivieren (nur sinnvoll mit GPU)")
+ return p.parse_args()
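+
+# Hedged usage sketch (illustrative paths/values):
+#   python src/text/transcription.py --input input/testVideoShort.mov --model small --lang de --chunk 60 --overlap 2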
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Main
+# ──────────────────────────────────────────────────────────────────────────────
+
+def main():
+ # Whisper cache (so that models are stored locally)
+ os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache"))
+
+ args = parse_args()
+ input_path = args.input or find_default_input()
+ out_dir = args.outdir or TRANSCRIPTS_DIR
+
+ print("📁 Projekt-Root:", ROOT)
+ print("📄 Input:", input_path if input_path else "—")
+ if not input_path or not input_path.exists():
+ raise FileNotFoundError(f"Kein gültiges Eingabefile gefunden. Lege ein Video/Audio in {INPUT_DIR} oder nutze --input.")
+
+ out_dir.mkdir(parents=True, exist_ok=True)
+
+ duration = probe_duration(input_path)
+ print(f"🎬 Dauer: {duration:.2f}s")
+
+ chunks = make_chunks(duration, args.chunk, args.overlap)
+ print(f"🔪 {len(chunks)} Chunks à {args.chunk:.1f}s mit {args.overlap:.1f}s Overlap")
+
+ # Load Whisper
+ print(f"🧠 Lade Whisper-Modell: {args.model}")
+ try:
+ model = whisper.load_model(args.model)
+ except Exception as e:
+ raise RuntimeError(f"Whisper-Modell '{args.model}' konnte nicht geladen werden. Installiert? (pip install openai-whisper)\n{e}") from e
+
+ all_segments: List[Dict] = []
+ with TemporaryDirectory() as tmpdir_str:
+ tmpdir = Path(tmpdir_str)
+ for i, (start, end) in enumerate(chunks, 1):
+ print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s")
+ wav = tmpdir / f"chunk_{i:03d}.wav"
+ extract_audio_segment(input_path, start, end, wav)
+
+ # Language: ''/none = auto-detect
+ lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
+
+ # Transcribe with word timestamps, without cross-chunk context
+ result = model.transcribe(
+ str(wav),
+ language=lang,
+ fp16=args.fp16,
+ word_timestamps=True,
+ condition_on_previous_text=False,
+ temperature=0,
+ verbose=False,
+ )
+
+ # Center cut: keep only the middle part (prevents duplicates at chunk borders)
+ keep_start = start if i == 1 else start + args.overlap / 2.0
+ keep_end = end if i == len(chunks) else end - args.overlap / 2.0
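+ # Hedged example (assumed values): a middle chunk covering 58–118 s with 2 s overlap
+ # keeps segment midpoints in [59.0, 117.0); a segment with midpoint 58.5 s is skipped
+ # here and picked up by the previous chunk instead.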
+
+ for seg in result.get("segments", []) or []:
+ s0 = float(seg["start"]) + start
+ s1 = float(seg["end"]) + start
+ mid = (s0 + s1) / 2.0
+ if not (keep_start <= mid < keep_end):
+ continue
+
+ # Carry over the words with absolute timestamps
+ words = []
+ for w in (seg.get("words") or []):
+ txt = (w.get("word") or w.get("text") or "").strip()
+ if not txt:
+ continue
+ words.append({
+ "start": float(w["start"]) + start,
+ "end": float(w["end"]) + start,
+ "text": txt
+ })
+
+ all_segments.append({
+ "start": s0,
+ "end": s1,
+ "text": (seg.get("text") or "").strip(),
+ "words": words
+ })
+
+ print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …")
+ merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur)
+ print(f"✅ Gemergte Segmente: {len(merged)}")
+
+ out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True)
+ print(f"📝 TXT: {out_txt}")
+ print(f"⚠️ SUSPECT: {out_sus}")
+ print(f"💾 JSON: {out_json}")
+ print("🎉 Fertig.")
+
+if __name__ == "__main__":
+ main()
diff --git a/src/text/transcription_with_speaker.py b/src/text/transcription_with_speaker.py
new file mode 100644
index 0000000..e5912b7
--- /dev/null
+++ b/src/text/transcription_with_speaker.py
@@ -0,0 +1,88 @@
+import os
+import json
+import ffmpeg
+import whisper
+import tempfile
+import torch
+from tqdm import tqdm
+from pathlib import Path
+from pyannote.audio import Pipeline
+
+# === HUGGING FACE TOKEN (for pyannote) — read from the environment, never hard-code secrets ===
+HF_TOKEN = os.getenv("HF_TOKEN")
+if not HF_TOKEN:
+ raise RuntimeError("Please set the HF_TOKEN environment variable (Hugging Face access token for pyannote).")
+
+# === Torch optimization (optional) ===
+torch.set_float32_matmul_precision("medium")
+
+# === Settings ===
+PROJECT_ROOT = Path(__file__).resolve().parents[2]
+input_file = PROJECT_ROOT / "input" / "testVideoShort.mov"
+output_dir = PROJECT_ROOT / "transkripte"
+output_dir.mkdir(parents=True, exist_ok=True)
+
+output_txt = output_dir / f"{input_file.stem}_timed.txt"
+output_json = output_dir / f"{input_file.stem}_segments.json"
+
+# === Convert video to audio ===
+print("🎞️ Extrahiere Audio ...")
+tmp_dir = Path(tempfile.mkdtemp())
+wav_file = tmp_dir / "audio.wav"
+ffmpeg.input(str(input_file)).output(
+ str(wav_file),
+ format="wav",
+ acodec="pcm_s16le",
+ ac=1,
+ ar="16000",
+ loglevel="error"
+).overwrite_output().run()
+
+# === Transcription with Whisper ===
+print("🧠 Starte Transkription mit Whisper ...")
+model = whisper.load_model("small")
+result = model.transcribe(
+ str(wav_file),
+ language="de",
+ fp16=False,
+ word_timestamps=False,
+ condition_on_previous_text=True,
+ temperature=0,
+ verbose=False
+)
+segments = result["segments"]
+
+# === Diarization with pyannote ===
+print("🗣️ Starte Sprecheranalyse mit Pyannote (das dauert jetzt etwas) ...")
+pipeline = Pipeline.from_pretrained(
+ "pyannote/speaker-diarization-3.1",
+ use_auth_token=HF_TOKEN
+)
+# Use the best available device: MPS on Apple Silicon, otherwise CUDA, otherwise CPU
+if torch.backends.mps.is_available():
+ device = torch.device("mps")
+elif torch.cuda.is_available():
+ device = torch.device("cuda")
+else:
+ device = torch.device("cpu")
+pipeline.to(device)
+
+diarization = pipeline(str(wav_file))
+
+# === Assign speakers ===
+def assign_speakers_to_segments(segments, diarization):
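+ # Hedged example (values assumed): if diarization produced a turn [10.2, 15.8]
+ # labelled "SPEAKER_00", a Whisper segment starting at 12.3 s is tagged
+ # "SPEAKER_00"; segments whose start falls into no turn keep "unknown".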
+ assigned = []
+ for seg in tqdm(segments, desc="🎙️ Weise Sprecher zu"):
+ speaker = "unknown"
+ for turn, _, label in diarization.itertracks(yield_label=True):
+ if turn.start <= seg["start"] <= turn.end:
+ speaker = label
+ break
+ seg["speaker"] = speaker
+ assigned.append(seg)
+ return assigned
+
+segments_with_speaker = assign_speakers_to_segments(segments, diarization)
+
+# === Save as TXT
+with open(output_txt, "w", encoding="utf-8") as f:
+ for seg in segments_with_speaker:
+ line = f"[{seg['start']:.2f} – {seg['end']:.2f}] {seg['speaker'].upper()}: {seg['text'].strip()}\n"
+ f.write(line)
+
+# === Save as JSON
+with open(output_json, "w", encoding="utf-8") as f:
+ json.dump(segments_with_speaker, f, ensure_ascii=False, indent=2)
+
+print(f"✅ Transkript mit Sprecherinfos gespeichert unter:\n📄 {output_txt}\n📄 {output_json}")
diff --git a/text-clustering b/text-clustering
deleted file mode 160000
index 7815f8b..0000000
--- a/text-clustering
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb
diff --git a/transkripte/.DS_Store b/transkripte/.DS_Store
deleted file mode 100644
index 1a1bbf7..0000000
Binary files a/transkripte/.DS_Store and /dev/null differ
diff --git a/whisper.cpp b/whisper.cpp
deleted file mode 160000
index 2e310b8..0000000
--- a/whisper.cpp
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243