cleanup: ignore text-clustering & whisper.cpp

This commit is contained in:
Jupp Kerschbaum 2025-10-19 16:22:26 +02:00
parent a9d700b20e
commit 0c9b43af42
46 changed files with 5202 additions and 473 deletions

BIN
.DS_Store vendored

Binary file not shown.

105
.gitignore vendored
View File

@@ -1,27 +1,108 @@
# IDE & Cache
# ─────────────────────────────
# IDEs & System Files
# ─────────────────────────────
.idea/
.vscode/
__pycache__/
*.pyc
.DS_Store
*.log
# Whisper Modelle & Cache
# ─────────────────────────────
# Cache / Modelle / Checkpoints
# ─────────────────────────────
whisper-cache/
models/
*.pt
*.onnx
*.bin
*.safetensors
# Output/Temp Files
# ─────────────────────────────
# Datenbank / temporäre Dateien
# ─────────────────────────────
*.db
*.sqlite
logs/
temp/
tmp/
*.tmp
# ─────────────────────────────
# Transkripte / KI-Zwischenausgaben
# ─────────────────────────────
/data/transkripte/
/transcripts/
/outputs/
/results/
*_segments.json
*_timed.txt
*_suspect_lines.txt
# ─────────────────────────────
# Video / Audio Outputs
# ─────────────────────────────
*.mp4
*.mov
*.db
*.mkv
*.wav
*.json
temp.*
logs/
*.webm
*.mp3
# Eingebettete Repos
# ─────────────────────────────
# Generierte Teil-/Ergebnis-Ordner
# ─────────────────────────────
/raw_clips/
/face_combined/
/face_crop_centers/
/cropped/
/subtitled/
/segments/
/highlight_clips/
/output/
/renders/
/exports/
# ─────────────────────────────
# Eingebettete Repos oder externe Module
# ─────────────────────────────
/whisper.cpp/
/text-clustering/
/venv/
/.env/
/.env.local
.envrc
.env.*
# ─────────────────────────────
# Backups / Sonstiges
# ─────────────────────────────
*.bak
*.old
*.orig
*.swp
*.zip
*.tar
*.gz
# IDE/System
.idea/
.DS_Store
__pycache__/
*.pyc
# Secrets/Umgebung
.env
config.py
# Große/ausgeleitete Daten
data/
transkripte/
whisper-cache/
models/
*.db
*.mp4 *.mov *.mkv *.wav *.mp3 *.webm
logs/ tmp/ temp/
# embedded / external
text-clustering/
whisper.cpp/
# Video-Rohmaterial
*.mov

8
.idea/.gitignore generated vendored
View File

@@ -1,8 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View File

@@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
<excludeFolder url="file://$MODULE_DIR$/.venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

35
.idea/dataSources.xml generated
View File

@@ -1,35 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
<data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
<driver-ref>sqlite.xerial</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
<libraries>
<library>
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
</library>
<library>
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
</library>
</libraries>
</data-source>
<data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
<driver-ref>sqlite.xerial</driver-ref>
<synchronize>true</synchronize>
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
<working-dir>$ProjectFileDir$</working-dir>
<libraries>
<library>
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
</library>
<library>
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
</library>
</libraries>
</data-source>
</component>
</project>

View File

@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

6
.idea/misc.xml generated
View File

@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="Black">
<option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
</component>
</project>

8
.idea/modules.xml generated
View File

@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
</modules>
</component>
</project>

9
.idea/vcs.xml generated
View File

@@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
<mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
<mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
<mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
</component>
</project>

250
README.md
View File

@@ -0,0 +1,250 @@
# Bachelor Thesis Pipeline: Automated Highlight Detection & 9:16 Reformatting
This repository contains a complete, script-based pipeline for automatically turning long-form videos into social-media-ready 9:16 highlights, including transcription, AI-assisted clip selection, face and mouth-activity analysis, auto-cropping, subtitles (word caps), and final export.
## Table of Contents
- [Features](#features)
- [Directory Structure](#directory-structure)
- [Requirements](#requirements)
- [Installation](#installation)
- [Quick Start (Recommended Workflow)](#quick-start-recommended-workflow)
- [Scripts & CLI](#scripts--cli)
- [Tips & Troubleshooting](#tips--troubleshooting)
- [Reproducibility](#reproducibility)
- [License / Acknowledgements](#license--acknowledgements)
---
## Features
- **Transcription with word timestamps** (Whisper, chunked without duplicate segments at chunk boundaries)
- **AI-assisted clip selection** (virality/emotionality scores etc. stored in SQLite)
- **Face detection (YOLOv8-face) & mouth activity (MediaPipe)**
- **Stable 9:16 auto-cropping** (median + EMA smoothing, deadband, scene-cut detection, switch cooldown)
- **Word-cap subtitles** (ASS generated, burned in via ffmpeg)
- **Batch export of the highlights** (MoviePy, length/boundary checks)
## Directory Structure
All paths are defined centrally in `config.py`:
```
PROJECT_ROOT/
├─ data/
│ ├─ input/ # input video(s)
│ ├─ transkripte/ # Whisper outputs (*_segments.json, *_timed.txt ...)
│ ├─ segments/ # LLM clip selection, DB, etc.
│ ├─ output/
│ │ └─ raw_clips/ # raw highlight clips (from cutClips.py)
│ ├─ face_data_combined/ # faces.json per clip (YOLO + MediaPipe)
│ └─ face_crop_centers/ # (optional) center lists
├─ output/
│ ├─ output_9x16_final/ # auto-cropped 9:16 videos
│ ├─ output_9x16_final_subbed_word/ # 9:16 with burned-in word caps
│ └─ debug/ # debug previews/artifacts
├─ models/ # YOLO weights (e.g. yolov8n-face.pt)
├─ whisper-cache/ # Whisper model cache
└─ src/... (optional, project-specific)
```
> On first start, `config.py` automatically creates any missing directories.
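
A minimal sketch of what such a central `config.py` can look like, assuming the layout above. The variable names (`PROJECT_ROOT`, `INPUT_DIR`, `RAW_CLIPS_DIR`, `FACE_COMBINED_DIR`, `FACE_CROP_CENTERS`, `SEGMENTS_DIR`, `CROPPED_DIR`, `SUBTITLED_DIR`, `WHISPER_CACHE_DIR`, `DB_PATH`) follow the imports used by the scripts in this commit; `TRANSCRIPT_DIR` and `MODELS_DIR` are illustrative additions:
```python
# config.py: central path definitions (sketch, adjust to your own layout)
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent

# input / intermediate data
INPUT_DIR         = PROJECT_ROOT / "data" / "input"
TRANSCRIPT_DIR    = PROJECT_ROOT / "data" / "transkripte"
SEGMENTS_DIR      = PROJECT_ROOT / "data" / "segments"
RAW_CLIPS_DIR     = PROJECT_ROOT / "data" / "output" / "raw_clips"
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
FACE_CROP_CENTERS = PROJECT_ROOT / "data" / "face_crop_centers"

# final outputs
CROPPED_DIR   = PROJECT_ROOT / "output" / "output_9x16_final"
SUBTITLED_DIR = PROJECT_ROOT / "output" / "output_9x16_final_subbed_word"

# models, caches, database
MODELS_DIR        = PROJECT_ROOT / "models"
WHISPER_CACHE_DIR = PROJECT_ROOT / "whisper-cache"
DB_PATH           = PROJECT_ROOT / "data" / "clips_openai.db"

# create missing directories on import
for _d in (INPUT_DIR, TRANSCRIPT_DIR, SEGMENTS_DIR, RAW_CLIPS_DIR,
           FACE_COMBINED_DIR, FACE_CROP_CENTERS, CROPPED_DIR,
           SUBTITLED_DIR, MODELS_DIR, WHISPER_CACHE_DIR):
    _d.mkdir(parents=True, exist_ok=True)
```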
## Requirements
**System tools**
- `ffmpeg` (including `ffprobe`) on the `PATH`
**Python**
- Python 3.10+ recommended
- Packages (example):
`openai-whisper`, `torch`, `ffmpeg-python`, `ultralytics`, `opencv-python`, `mediapipe`, `moviepy`, `tqdm`, `numpy`, `regex`
- Optional, depending on the code path: `pydub`, `scikit-image` (if used in extensions)
**Models & keys**
- **Whisper**: downloads models automatically into `whisper-cache/` (configurable via `WHISPER_MODEL`)
- **YOLOv8-face**: `models/yolov8n-face.pt` (or a larger model)
- **OpenAI API key** (for `segment_transcript.py` & `rateCluster.py`): `export OPENAI_API_KEY=...`
- Set the default model via `export OPENAI_MODEL=gpt-4o` if needed
## Installation
```bash
# 1) Python environment
python3 -m venv .venv
source .venv/bin/activate
# 2) System dependencies
# install ffmpeg (macOS: brew install ffmpeg, Ubuntu: apt install ffmpeg)
# 3) Python packages (example)
pip install --upgrade pip
pip install openai-whisper torch ffmpeg-python ultralytics opencv-python mediapipe moviepy tqdm numpy regex
# 4) Models/files
# YOLO weights:
# download yolov8n-face.pt → ./models/yolov8n-face.pt
# API key for the LLM steps:
export OPENAI_API_KEY="sk-..."
export OPENAI_MODEL="gpt-4o"
```
## Quick Start (Recommended Workflow)
1) **Drop in your input**
   Place your long-form video in `data/input/` (e.g. `meinvideo.mp4`).
2) **Transcription (Whisper, chunked & duplicate-safe)**
```bash
python transcription.py --input data/input/meinvideo.mp4 --model small --lang de
```
→ produces `*_segments.json` + `*_timed.txt` in `data/transkripte/`.
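
For a quick sanity check of the transcription output, a short snippet like the following can list the first segments; it assumes the Whisper segment dictionaries contain at least `start`, `end`, and `text`, as written by `transcription.py` (the file name is hypothetical):
```python
import json
from pathlib import Path

# inspect the chunked Whisper output
segments = json.loads(Path("data/transkripte/meinvideo_segments.json").read_text(encoding="utf-8"))
for seg in segments[:5]:
    print(f"[{seg['start']:7.2f}s - {seg['end']:7.2f}s] {seg['text'].strip()}")
print(f"{len(segments)} segments total")
```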
3) **Select clips with the LLM & store them in the DB**
```bash
export OPENAI_API_KEY="..."; export OPENAI_MODEL="gpt-4o"
python segment_transcript.py --base meinvideo --block 60 --min 6.0 --max 30.0
```
→ writes the selected clips to SQLite (`data/clips_openai.db` or similar)
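
To see what was selected, the database can be queried directly. This sketch assumes a `highlights` table with `start`, `end`, `text`, and `score_total` columns (the names `cutClips.py` reads); adjust path and schema if your `config.py` differs:
```python
import sqlite3

# peek at the top-rated clips stored by the LLM step
con = sqlite3.connect("data/clips_openai.db")
rows = con.execute(
    "SELECT start, end, score_total, substr(text, 1, 60) FROM highlights "
    "ORDER BY score_total DESC LIMIT 5"
).fetchall()
for start, end, score, text in rows:
    print(f"{start:8.2f}s -> {end:8.2f}s  score={score}  {text}...")
con.close()
```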
4) **Cut the highlights from the original video**
```bash
python cutClips.py --file meinvideo.mp4 --limit 10 --order score
```
→ exports `highlight_*.mp4` to `data/output/raw_clips/`
5) **Face detection + mouth activity**
```bash
python main_detect_faces.py --model models/yolov8n-face.pt --input-dir data/output/raw_clips --output-dir data/face_data_combined --frame-skip 1 --downscale 0.5
```
6) **Build per-frame targets (smooth centers/size)**
```bash
python make_segments.py --pattern "highlight_*.mp4" --fps 0 --smooth 7 --overwrite
```
7) **Apply the 9:16 auto-crop**
```bash
python main_apply_crop.py --pattern "highlight_*.mp4" --median 7 --ema 0.5 --deadband 16 --cut_detect --mux_audio --overwrite
```
→ finished 9:16 clips land in `output/output_9x16_final/`
8) **Burn in word-cap subtitles (optional)**
```bash
python add_subtitles.py --clips_dir output/output_9x16_final --out_dir output/output_9x16_final_subbed_word --model small --limit 20
```
→ finished videos with burned-in word caps in `output/output_9x16_final_subbed_word/`
> 💡 Many parameters (window widths, deadband, detection thresholds, limits) can be adjusted via the CLI.
## Scripts & CLI
### `transcription.py`
Chunked transcription with word timestamps.
```
--input PATH # input video/audio (default: first file in data/input/)
--outdir PATH # output directory (default: data/transkripte/)
--model NAME # Whisper model (tiny/base/small/medium/large; env: WHISPER_MODEL)
--lang CODE # language code (e.g. de) or empty/None for auto-detect
--chunk FLOAT # chunk length in s (default 60)
--overlap FLOAT # overlap in s (default 2.0)
--min-dur FLOAT # minimum segment duration (s)
--max-gap FLOAT # maximum time gap when merging (s)
--fp16 # only useful with a GPU
```
### `segment_transcript.py`
LLM selection & storage in SQLite.
```
--base STR # basename of the transcript files (e.g. 'meinvideo')
--block FLOAT # block length in s for the prompt
--min FLOAT # minimum clip length in s
--max FLOAT # maximum clip length in s
# env: OPENAI_API_KEY, OPENAI_MODEL (e.g. gpt-4o)
```
### `cutClips.py`
Cuts the selected highlights into individual clips.
```
--file NAME # name of the input file in data/input (default: first video)
--limit INT # number of clips to export (default 10)
--order {score,start} # sort order: score (descending) or start time
```
### `main_detect_faces.py`
YOLOv8-face + MediaPipe → one `faces.json` per clip.
```
--input-dir PATH # default: data/output/raw_clips
--output-dir PATH # default: data/face_data_combined
--model PATH # YOLOv8-face weights (default: models/yolov8n-face.pt)
--conf-thresh FLOAT # default 0.35
--frame-skip INT # e.g. 1 = every frame, 2 = every second frame ...
--downscale FLOAT # downscale frames before YOLO (0..1, e.g. 0.5)
--expansion FLOAT # crop margin, pass 1 (relative)
--expansion2 FLOAT # crop margin, pass 2 (relative)
--min-crop INT # minimum crop side length (px)
--faces-upscale INT # minimum edge length for FaceMesh (upscale smaller crops)
--imgsz INT # YOLO input size (default 448)
--max-det INT # maximum detections per frame
--use-refine # enable MediaPipe refine_landmarks
```
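
Each `faces.json` holds one entry per processed frame with the detected faces (bounding box, confidence, center, mouth openness), as written by `main_detect_faces.py`. A small inspection snippet with a hypothetical clip name:
```python
import json
from pathlib import Path

# print the most "active" mouth per frame for the first few frames
frames = json.loads(Path("data/face_data_combined/highlight_1_faces.json").read_text(encoding="utf-8"))
for fr in frames[:10]:
    if fr["faces"]:
        best = max(fr["faces"], key=lambda f: f.get("mouth_openness", 0.0))
        print(fr["frame"], fr["timestamp"], best["center"], best["mouth_openness"])
    else:
        print(fr["frame"], fr["timestamp"], "no face detected")
```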
### `make_segments.py`
Builds `*_target_by_frame.json` (center + size per frame) from the face/center data.
```
--pattern STR # file pattern in raw_clips (default: highlight_*.mp4)
--fps FLOAT # force FPS (0 = read from the video)
--smooth INT # moving-average window width (odd)
--overwrite # overwrite existing target_by_frame.json files
```
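
The resulting `*_target_by_frame.json` is what `main_apply_crop.py` consumes: one entry per frame with a timestamp `t` and a normalized crop target `cx`, `cy`, `w`, `h` in the 0..1 range. A quick look (clip name hypothetical, path assumed from the layout above):
```python
import json
from pathlib import Path

targets = json.loads(Path("data/segments/highlight_1_target_by_frame.json").read_text(encoding="utf-8"))
print(len(targets), "frames")
print(targets[0])  # e.g. {"t": 0.0, "cx": 0.52, "cy": 0.41, "w": 0.30, "h": 0.30}
```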
### `main_apply_crop.py`
Applies the 9:16 crop with smoothing and scene-cut handling.
```
--pattern STR # file pattern in raw_clips (default: *.mp4)
--out_w INT # output width (default 1080)
--out_h INT # output height (default 1920)
--zoom_pad FLOAT # zoom padding (0..1)
--median INT # median window (>= 3, odd)
--ema FLOAT # EMA alpha (0..1)
--deadband FLOAT # deadband in pixels
--switch_cd INT # cooldown frames after a track switch
--cut_detect # enable scene-cut detection
--cut_corr FLOAT # correlation threshold (0..1)
--cut_cd INT # cooldown frames after a cut
--mux_audio # mux in the original audio
--debug # show the debug overlay
--debug_scale FLOAT # render the debug preview scaled
--overwrite # overwrite existing outputs
```
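
How `--median`, `--ema`, and `--deadband` interact is easiest to see in isolation. The following is a minimal sketch of the center-smoothing idea only; the real `main_apply_crop.py` additionally handles track switches, scene cuts, and the crop-rectangle math:
```python
from collections import deque
import math

def smooth_centers(raw_centers, median_win=7, ema_alpha=0.5, deadband_px=16.0):
    """Median-filter the raw face centers, then follow them with a deadbanded EMA."""
    buf = deque(maxlen=max(3, median_win | 1))  # force an odd window size
    ema = None
    smoothed = []
    for cx, cy in raw_centers:
        buf.append((cx, cy))
        xs = sorted(p[0] for p in buf)
        ys = sorted(p[1] for p in buf)
        mx, my = xs[len(buf) // 2], ys[len(buf) // 2]   # median of the window
        if ema is None:
            ema = (mx, my)
        else:
            dx, dy = mx - ema[0], my - ema[1]
            if math.hypot(dx, dy) > deadband_px:        # ignore jitter below the deadband
                ema = (ema[0] + dx * ema_alpha, ema[1] + dy * ema_alpha)
        smoothed.append(ema)
    return smoothed

# example: a jittery center slowly drifting to the right
print(smooth_centers([(960 + 3 * i + 5 * (i % 2), 540.0) for i in range(30)])[-1])
```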
### `add_subtitles.py`
Generates word caps with Whisper & burns them into the video.
```
--clips_dir PATH # source (default: output/output_9x16_final)
--out_dir PATH # destination (default: output/output_9x16_final_subbed_word)
--pattern STR # e.g. *.mp4
--limit INT # only the first N clips
--model NAME # Whisper model (tiny/base/small/medium/large)
--lang CODE # language code or auto-detect
```
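
`add_subtitles.py` generates ASS captions internally; purely as an illustration of the word-cap idea, here is a simplified sketch that writes per-word SRT cues instead and burns them in with ffmpeg's `subtitles` filter (file names and word timings are made up):
```python
import subprocess

def _srt_ts(seconds: float) -> str:
    total = int(seconds)
    ms = int(round((seconds - total) * 1000))
    return f"{total // 3600:02d}:{(total % 3600) // 60:02d}:{total % 60:02d},{ms:03d}"

def burn_word_caps(video: str, words, out: str) -> None:
    """words: list of (text, start, end) tuples with word-level timestamps (e.g. from Whisper)."""
    srt = "".join(
        f"{i}\n{_srt_ts(s)} --> {_srt_ts(e)}\n{w}\n\n" for i, (w, s, e) in enumerate(words, 1)
    )
    with open("word_caps.srt", "w", encoding="utf-8") as f:
        f.write(srt)
    # requires an ffmpeg build with libass (standard builds include it)
    subprocess.run(
        ["ffmpeg", "-y", "-i", video, "-vf", "subtitles=word_caps.srt", "-c:a", "copy", out],
        check=True,
    )

burn_word_caps("output/output_9x16_final/highlight_1_9x16.mp4",
               [("Hallo", 0.4, 0.7), ("zusammen!", 0.7, 1.2)],
               "highlight_1_words.mp4")
```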
### `rateCluster.py` (optional)
Has the LLM add scores (virality, emotion, humor, provocation) to the stored clips.
> Set the default model via `OPENAI_MODEL` (e.g. `gpt-4o`).
---
## Tips & Troubleshooting
- **Models/performance**
  - CPU-only works (Whisper/YOLO just run slower). On Apple Silicon, **MPS** is used automatically; on NVIDIA, **CUDA**.
  - `--frame-skip` and `--downscale` in `main_detect_faces.py` speed up face detection considerably.
- **Check ffmpeg muxing** (`main_apply_crop.py --mux_audio`): if the audio is missing, verify the `ffmpeg` installation and check the return code in the log.
- **Missing files**
  - No input? → check `data/input/`.
  - Missing transcript pairs? → `*_timed.txt` and `*_segments.json` must both exist (produced by `transcription.py`).
  - Missing faces? → is the path to `models/yolov8n-face.pt` correct?
- **Database**
  - Highlights are stored in SQLite (see `config.py`: `DB_PATH`). For repeated runs, a `DELETE FROM highlights; VACUUM;` can be useful; see the snippet after this list.
- **Cache/directories**
  - The Whisper cache goes to `whisper-cache/` next to the project via `XDG_CACHE_HOME`. Keep an eye on disk space.
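
A small helper for that reset, assuming the `highlights` table and the default database path (use `DB_PATH` from `config.py` in practice):
```python
import sqlite3

# remove previously selected highlights before a fresh run
con = sqlite3.connect("data/clips_openai.db")
con.execute("DELETE FROM highlights")
con.commit()
con.execute("VACUUM")  # reclaim space; runs outside a transaction
con.close()
```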
## Reproducibility
- Create a `requirements.txt` with exact versions (freeze your working environment).
- Document the **model versions** used (YOLO weights, Whisper model size, OPENAI_MODEL).
- Pin random seeds where necessary (the pipeline is mostly deterministic via the external models/libraries).
## License / Acknowledgements
- Uses **OpenAI Whisper**, **Ultralytics YOLOv8**, **MediaPipe**, **OpenCV**, **MoviePy**, and **ffmpeg**.
- Observe the respective licenses of these libraries.

View File

@@ -1,38 +0,0 @@
from moviepy.video.io.VideoFileClip import VideoFileClip
from pathlib import Path
import sqlite3
# === Setup ===
input_video = Path("input/testVideoShort.mov")
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)
# === SQLite DB lesen ===
db_path = "clips_openai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()
# Nur die Top 10 Clips mit höchstem score_total
cursor.execute("""
SELECT start, end, text
FROM highlights
ORDER BY score_total DESC
LIMIT 10
""")
highlights = cursor.fetchall()
# === Video laden ===
video = VideoFileClip(str(input_video))
# === Clips schneiden ===
for i, (start, end, text) in enumerate(highlights):
output_file = output_dir / f"highlight_{i+1}.mp4"
end = min(end, video.duration) # Sicherstellen, dass das Ende nicht über das Video hinausgeht
print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s {end:.2f}s → {output_file.name}")
clip = video.subclipped(start, end)
clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")
# === Cleanup ===
conn.close()
video.close()
print("✅ Top 10 Clips exportiert.")

View File

@@ -1,196 +0,0 @@
import json
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
from datetime import datetime
import time
import nltk
nltk.download("punkt")
# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300
MIN_CLIP_LEN = 5
MAX_CLIP_LEN = 90
client = OpenAI(api_key="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")
# === HILFSFUNKTIONEN ===
def log_text(filename, content):
(LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")
def append_error_log(content):
with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
f.write(content + "\n\n")
def extract_json(text):
match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
if match:
try:
return json.loads(match.group())
except Exception as e:
append_error_log(f"❌ JSON-Fehler: {e}\n{text}")
return []
def get_original_text(clip, segments, debug=False):
texts = []
used_segments = []
for s in segments:
# Überschneidung: Segment und Clip teilen sich Zeit
if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
texts.append(s["text"])
used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
if debug:
print(f"\n🟢 Clip {clip['start']} – {clip['end']} nutzt Segmente:\n" +
"\n".join(used_segments))
return " ".join(texts).strip()
# === TRANSKRIPT EINLESEN ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for line in lines:
match = re.match(r"\[(\d+(?:\.\d+)?)\s*[-–]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
if match:
start, end, text = match.groups()
start = float(start)
end = float(end)
if end - start >= 2.0:
segments.append({"start": start, "end": end, "text": text.strip()})
if not segments:
raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
print(f"{len(segments)} gültige Transkriptsegmente geladen.")
# === BLÖCKE BILDEN
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
if seg["end"] - current_start > BLOCK_DURATION:
blocks.append(current_block)
current_block = []
current_start = seg["start"]
current_block.append(seg)
if current_block:
blocks.append(current_block)
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
# === KI: CLIP-AUSWAHL
all_clips = []
start_time = time.perf_counter()
for i, block in enumerate(blocks):
if not block:
continue
print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")
block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf, dass es abgeschlossene Abschnitte sind, die auch als eigenständiger Clip funktionieren.
Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**
Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
Gib ein valides JSON-Array zurück im Format:
[
{{
"start": float,
"end": float,
"summary": "Kurze Beschreibung des Inhalts"
}}
]
TRANSKRIPT:
{block_text}
"""
log_text(f"block_prompt_{i+1}.txt", prompt)
try:
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
temperature=0.4
)
raw = response.choices[0].message.content
log_text(f"block_output_{i+1}.txt", raw)
clips = extract_json(raw)
print(f"{len(clips)} Clips empfangen in Block {i+1}")
for clip in clips:
try:
dur = float(clip["end"]) - float(clip["start"])
if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
clip["duration"] = round(dur, 2)
all_clips.append(clip)
except Exception as e:
append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")
# ETA berechnen
elapsed = time.perf_counter() - start_time
avg_time = elapsed / (i + 1)
eta = avg_time * (len(blocks) - (i + 1))
print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
except Exception as e:
append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
print(f"❌ Fehler bei Block {i+1}: {e}")
# === DB SPEICHERN
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS segments")
cur.execute("""
CREATE TABLE segments (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file TEXT,
start REAL,
end REAL,
duration REAL,
text TEXT,
summary TEXT
)
""")
inserted = 0
failed = 0
for clip in all_clips:
try:
start = float(clip["start"])
end = float(clip["end"])
duration = float(clip["duration"])
summary = clip.get("summary", "")
# debug=True für print aller Segment-Texte pro Clip
original_text = get_original_text(clip, segments, debug=False)
if end <= start or start < 0:
raise ValueError("Ungültige Zeiten")
cur.execute(
"INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
(TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
)
inserted += 1
except Exception as e:
failed += 1
append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
conn.commit()
conn.close()
print("\n📊 Ergebnisse:")
print(f" ✅ Clips gespeichert: {inserted}")
print(f" ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")

View File

@@ -1,108 +0,0 @@
# transcription_chunked.py
import whisper
from pathlib import Path
import os
import json
import ffmpeg
import tempfile
# === Einstellungen ===
input_file = Path("input/testVideoShort.mov")
output_dir = Path("transkripte")
output_dir.mkdir(parents=True, exist_ok=True)
output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"
CHUNKS = 4 # Anzahl Chunks (anpassen!)
OVERLAP = 2.0 # Sekunden Überlappung
os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")
probe = ffmpeg.probe(str(input_file))
duration = float(probe["format"]["duration"])
print(f"🎥 Videolänge: {duration:.2f} Sekunden")
def extract_audio_chunk(start_time, duration, output_path):
ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
str(output_path),
format="wav",
acodec="pcm_s16le",
ac=1,
ar="16000",
loglevel="error"
).overwrite_output().run()
def is_suspect(text):
words = text.strip().lower().split()
if not words:
return True
most_common = max([words.count(w) for w in set(words)])
return most_common / len(words) > 0.6 or most_common > 20
tmp_dir = Path(tempfile.mkdtemp())
all_segments = []
print(f"✂️ Teile Audio in {CHUNKS} Chunks ...")
for i in range(CHUNKS):
chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
chunk_dur = chunk_end - chunk_start
chunk_file = tmp_dir / f"chunk_{i}.wav"
print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s {chunk_end:.2f}s")
extract_audio_chunk(chunk_start, chunk_dur, chunk_file)
print(f"🧠 Transkribiere Chunk {i+1} ...")
model = whisper.load_model("small") # Wechsel zu "medium" oder "large" falls gewünscht
result = model.transcribe(
str(chunk_file),
language="de",
fp16=False,
word_timestamps=False,
condition_on_previous_text=True,
temperature=0,
verbose=False
)
segments = result["segments"]
# Zeitversatz für den aktuellen Chunk hinzufügen
offset = chunk_start
for seg in segments:
seg["start"] += offset
seg["end"] += offset
all_segments.extend(segments)
# === Sortiere und filtere doppelte/überlappende Segmente
all_segments.sort(key=lambda x: x["start"])
def segment_hash(seg):
return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())
unique_segments = []
seen = set()
for seg in all_segments:
h = segment_hash(seg)
if h not in seen:
seen.add(h)
unique_segments.append(seg)
print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.")
with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
for seg in unique_segments:
start = seg["start"]
end = seg["end"]
text = seg["text"].strip()
line = f"[{start:.2f} – {end:.2f}] {text}\n"
f.write(line) # IMMER ins Haupttranskript!
if is_suspect(text):
f_sus.write(line)
print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}")
print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}")
with open(output_json, "w", encoding="utf-8") as f:
json.dump(unique_segments, f, ensure_ascii=False, indent=2)
print(f"💾 Segmentdaten gespeichert unter: {output_json}")

233
main.py Normal file
View File

@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""Run the full Bachelor pipeline end-to-end with timing, errors, and flexible flags.
Steps:
1) transcription.py Whisper transcripts (segments + timed words)
2) segment_transcript.py LLM selects highlight candidates SQLite
3) cutClips.py export highlight_*.mp4 (raw clips)
4) main_detect_faces.py YOLO + MediaPipe faces.json per clip
5) make_segments.py *_target_by_frame.json (center+side per frame)
6) main_apply_crop.py 9:16 crop with smoothing & optional audio mux
7) rateCluster.py (optional) LLM scoring (virality, emotion, ...)
8) add_subtitles.py (optional) word-cap subtitles burned in
Usage examples:
python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o
python main.py --no-rate --no-subs
"""
from __future__ import annotations
import argparse
import os
import sys
import subprocess
import time
from datetime import datetime
from pathlib import Path
# --- Import project config ---
try:
from config import (
PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
WHISPER_CACHE_DIR
)
except Exception:
PROJECT_ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(PROJECT_ROOT))
from config import (
PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
WHISPER_CACHE_DIR
)
LOGS_DIR = PROJECT_ROOT / "logs"
LOGS_DIR.mkdir(parents=True, exist_ok=True)
# --- korrekte Pfade zu den Skripten ---
SCRIPTS = {
"transcription": str(PROJECT_ROOT / "src" / "text" / "transcription.py"),
"segment_transcript": str(PROJECT_ROOT / "src" / "text" / "segment_transcript.py"),
"cutClips": str(PROJECT_ROOT / "src" / "text" / "cutClips.py"),
"detect_faces": str(PROJECT_ROOT / "src" / "reformat" / "main_detect_faces.py"),
"make_segments": str(PROJECT_ROOT / "src" / "reformat" / "make_segments.py"),
"apply_crop": str(PROJECT_ROOT / "src" / "reformat" / "main_apply_crop.py"),
"rateCluster": str(PROJECT_ROOT / "src" / "text" / "rateCluster.py"),
"add_subtitles": str(PROJECT_ROOT / "src" / "subtitles" / "add_subtitles.py"),
}
def shlex_join(cmd):
return " ".join(str(c) for c in cmd)
def run_step(cmd: list[str], name: str, env: dict[str, str] | None = None) -> float:
"""Run a subprocess step, raise on error, return duration in seconds."""
t0 = time.perf_counter()
print(f"\n===== {name} =====")
print(" ", shlex_join(cmd))
cp = subprocess.run(cmd, env=env)
dt = time.perf_counter() - t0
if cp.returncode != 0:
print(f"❌ Fehler in {name} (Exit {cp.returncode}) nach {dt:.2f}s")
print(" → Prüfe das Logfile oben für Details und stelle sicher, dass Abhängigkeiten installiert sind:")
print(" - ffmpeg/ffprobe im PATH")
print(" - Python-Pakete: openai-whisper, torch, ffmpeg-python, ultralytics, opencv-python, mediapipe, moviepy, tqdm, numpy")
print(" - OPENAI_API_KEY gesetzt (für LLM-Schritte)")
raise SystemExit(cp.returncode)
print(f"{name} in {dt:.2f}s")
return dt
def infer_base_from_input(input_path: Path) -> str:
return input_path.stem
def default_input() -> Path | None:
if not INPUT_DIR.exists():
return None
for p in sorted(INPUT_DIR.iterdir()):
if p.suffix.lower() in {".mp4", ".mov", ".mkv", ".m4v", ".mp3", ".wav"}:
return p
return None
def main():
ap = argparse.ArgumentParser(description="Bachelor Pipeline Runner")
ap.add_argument("--input", type=str, default=None, help="Pfad zu Eingabedatei (Default: erstes File in data/input)")
ap.add_argument("--limit", type=int, default=10, help="Anzahl Highlights (cutClips)")
ap.add_argument("--whisper-model", type=str, default=os.getenv("WHISPER_MODEL", "small"))
ap.add_argument("--lang", type=str, default=None, help="Sprachcode (z. B. de)")
ap.add_argument("--openai-model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o"))
ap.add_argument("--pattern", type=str, default="highlight_*.mp4")
ap.add_argument("--overwrite", action="store_true")
ap.add_argument("--no-rate", action="store_true")
ap.add_argument("--no-subs", action="store_true")
ap.add_argument("--no-detect", action="store_true")
ap.add_argument("--no-make", action="store_true")
ap.add_argument("--no-apply", action="store_true")
ap.add_argument("--logfile", type=str, default=None)
args = ap.parse_args()
os.chdir(PROJECT_ROOT)
env = os.environ.copy()
env.setdefault("OPENAI_MODEL", args.openai_model)
env.setdefault("XDG_CACHE_HOME", str(WHISPER_CACHE_DIR))
if not env.get("OPENAI_API_KEY"):
print("⚠️ OPENAI_API_KEY ist nicht gesetzt LLM-Schritte könnten fehlschlagen.")
# Input-Datei bestimmen
if args.input:
input_path = Path(args.input)
if not input_path.is_file():
candidate = INPUT_DIR / args.input
if candidate.is_file():
input_path = candidate
else:
raise SystemExit(f"Input nicht gefunden: {args.input}")
else:
picked = default_input()
if not picked:
raise SystemExit(f"Kein Input in {INPUT_DIR} gefunden. Bitte --input setzen.")
input_path = picked
base = infer_base_from_input(input_path)
print(f"📥 Input: {input_path}")
print(f"🔤 Whisper: {args.whisper_model} | 🌐 LLM: {env.get('OPENAI_MODEL')}")
print(f"🧩 Base: {base}")
# Logfile
if args.logfile:
log_path = Path(args.logfile)
else:
log_path = LOGS_DIR / f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
# Tee: schreibe in Datei UND Konsole
try:
log_fh = open(log_path, "w", encoding="utf-8")
class _Tee:
def __init__(self, *streams): self.streams = streams
def write(self, data):
for s in self.streams:
try: s.write(data); s.flush()
except Exception: pass
def flush(self):
for s in self.streams:
try: s.flush()
except Exception: pass
sys.stdout = _Tee(sys.__stdout__, log_fh)
sys.stderr = _Tee(sys.__stderr__, log_fh)
print(f"📝 Logfile: {log_path}")
except Exception:
print(f"⚠️ Konnte Logfile nicht initialisieren: {log_path}")
durations = []
started = datetime.now()
print(f"🚀 Start: {started:%Y-%m-%d %H:%M:%S}")
try:
# 1) Transcription
t_args = [sys.executable, SCRIPTS["transcription"], "--input", str(input_path), "--model", args.whisper_model]
if args.lang: t_args += ["--lang", args.lang]
durations.append(("Transcription", run_step(t_args, "Transcription", env=env)))
# 2) LLM Segmentierung
st_args = [sys.executable, SCRIPTS["segment_transcript"], "--base", base]
durations.append(("Segment Transcript", run_step(st_args, "Segment Transcript", env=env)))
# 3) Highlights schneiden
cut_filename = input_path.name
cc_args = [sys.executable, SCRIPTS["cutClips"], "--file", cut_filename, "--limit", str(args.limit)]
durations.append(("Cut Clips", run_step(cc_args, "Cut Clips", env=env)))
# 4) Faces
if not args.no_detect:
df_args = [sys.executable, SCRIPTS["detect_faces"]]
durations.append(("Detect Faces", run_step(df_args, "Detect Faces", env=env)))
else:
print("⏭️ Detect Faces übersprungen.")
# 5) Make Targets
if not args.no_make:
ms_args = [sys.executable, SCRIPTS["make_segments"], "--pattern", args.pattern]
durations.append(("Make Targets", run_step(ms_args, "Make Targets", env=env)))
else:
print("⏭️ Make Targets übersprungen.")
# 6) Crop
if not args.no_apply:
ac_args = [sys.executable, SCRIPTS["apply_crop"], "--pattern", args.pattern, "--mux_audio"]
if args.overwrite: ac_args.append("--overwrite")
durations.append(("Apply Crop", run_step(ac_args, "Apply Crop", env=env)))
else:
print("⏭️ Apply Crop übersprungen.")
# 7) Bewertung
if not args.no_rate:
rc_args = [sys.executable, SCRIPTS["rateCluster"]]
durations.append(("Rate Clusters", run_step(rc_args, "Rate Clusters", env=env)))
else:
print("⏭️ Rate Clusters übersprungen.")
# 8) Untertitel
if not args.no_subs:
as_args = [sys.executable, SCRIPTS["add_subtitles"]]
durations.append(("Subtitles", run_step(as_args, "Subtitles", env=env)))
else:
print("⏭️ Subtitles übersprungen.")
except KeyboardInterrupt:
print("\n⛔ Abgebrochen (Ctrl+C).")
finally:
finished = datetime.now()
total = sum(dt for _, dt in durations)
print("\n======================== ZUSAMMENFASSUNG ============================")
for name, dt in durations:
print(f"{name:<24} {dt:7.2f}s")
print("---------------------------------------------------------------------")
print(f"⏱️ Gesamtdauer: {total:.2f}s")
print(f"🕒 Start : {started:%Y-%m-%d %H:%M:%S}")
print(f"🕒 Ende : {finished:%Y-%m-%d %H:%M:%S}")
print(f"📂 Output:")
print(f" Raw Clips : {RAW_CLIPS_DIR}")
print(f" 9:16 : {CROPPED_DIR}")
print(f" Subbed : {SUBTITLED_DIR}")
print("=====================================================================")
if __name__ == "__main__":
main()

43
src/main.py Normal file
View File

@@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""
Einfaches Master-Skript, das alle Unter-Skripte nacheinander startet – ohne Argumente.
"""
import subprocess
import sys
from pathlib import Path
# Reihenfolge der auszuführenden Skripte (relativer Pfad)
SCRIPTS = [
"text/transcription.py",
"text/segment_transcript.py",
"text/rateCluster.py",
"text/cutClips.py",
"reformat/track_faces_Yolo.py",
"reformat/detect_speaking_faces.py",
"reformat/crop_to_speaker.py",
]
def run_script(script_path: str):
"""
Führt ein Python-Skript ohne weitere Argumente aus.
"""
print(f"🔄 Running: {script_path}")
full_path = Path(__file__).parent / script_path
try:
subprocess.check_call([sys.executable, str(full_path)])
print(f"✔️ {script_path} erfolgreich abgeschlossen.\n")
except subprocess.CalledProcessError as e:
print(f"❌ Fehler in {script_path}: Rückgabecode {e.returncode}")
sys.exit(e.returncode)
def main():
print("\n=== Starte komplette Podcast-Pipeline ===\n")
for script in SCRIPTS:
run_script(script)
print("✅ Alle Schritte erfolgreich abgeschlossen.")
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,315 @@
#!/usr/bin/env python3
# src/reformat/new/main_apply_crop.py
from __future__ import annotations
import logging, json, math, subprocess, argparse
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any
from collections import deque
import sys
import cv2
import numpy as np
# ── Projektwurzel importierbar machen
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR, CROPPED_DIR
# ==== Defaults (per CLI überschreibbar) ======================================
OUT_W_DEFAULT, OUT_H_DEFAULT = 1080, 1920 # 9:16
DEBUG_SCALE_DEFAULT = 0.6
MEDIAN_WIN_DEFAULT = 5
EMA_ALPHA_DEFAULT = 0.22
DEADBAND_PX_DEFAULT = 8.0
SWITCH_COOLDOWN_FR_DEFAULT = 12
ZOOM_PAD_FRAC_DEFAULT = 0.10
USE_CUT_DETECT_DEFAULT = True
CUT_CORR_THRESH_DEFAULT = 0.65
CUT_COOLDOWN_DEFAULT = 6
MUX_AUDIO_DEFAULT = True
FFMPEG_BIN = "ffmpeg"
# ============================================================================
def clamp(v, lo, hi): return max(lo, min(hi, v))
def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int,
out_w: int, out_h: int, zoom_pad_frac: float) -> tuple[int,int,int,int]:
"""9:16 (out_w:out_h) Crop um (cx,cy) — ohne Squeeze, mit Zoom-Pad, im Bild gehalten."""
target_ar = out_w / out_h
src_ar = src_w / src_h
if src_ar >= target_ar:
base_h = src_h
base_w = int(round(base_h * target_ar))
else:
base_w = src_w
base_h = int(round(base_w / target_ar))
desired_scale = 1.0 + zoom_pad_frac
s = min(desired_scale, src_w / base_w, src_h / base_h)
w = int(round(base_w * s))
h = int(round(base_h * s))
half_w, half_h = w // 2, h // 2
cx = clamp(cx, half_w, src_w - half_w)
cy = clamp(cy, half_h, src_h - half_h)
x = int(round(cx - half_w))
y = int(round(cy - half_h))
return x, y, w, h
def draw_center(img, pt, color, label=None):
if pt is None: return
x, y = int(pt[0]), int(pt[1])
cv2.circle(img, (x, y), 6, color, -1)
if label:
cv2.putText(img, label, (x + 8, y - 8),
cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
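# scene_corr: compares hue/saturation histograms (HSV) of two downscaled frames;
# cv2.compareHist with HISTCMP_CORREL returns a value in [-1, 1], which is rescaled
# here to [0, 1] so it can be checked against the --cut_corr threshold.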
def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
return float((cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + 1.0)/2.0)
def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
cmd = [
FFMPEG_BIN, "-y",
"-i", str(src_video),
"-i", str(silent_video),
"-map", "1:v:0",
"-map", "0:a:0?",
"-c:v", "copy",
"-c:a", "aac", "-b:a", "192k",
"-shortest",
str(out_video),
]
subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def load_faces(name: str) -> List[Dict[str,Any]]:
p = FACE_COMBINED_DIR / f"{name}_faces.json"
if not p.exists(): return []
return json.loads(p.read_text(encoding="utf-8"))
def load_target_map_or_segments(name: str, total_frames: int) -> List[Optional[int] | Dict]:
"""
Bevorzugt *_target_by_frame.json (Liste Dicts mit t,cx,cy,w,h).
Fallback: *_segments.json (pro Frame Track-ID).
Gibt Liste gleicher Länge wie total_frames zurück.
"""
map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
if map_p.exists():
target = json.loads(map_p.read_text(encoding="utf-8"))
# Falls es Dicts sind (cx,cy,w,h pro frame), einfach zurückgeben:
if target and isinstance(target[0], dict):
if len(target) < total_frames:
last = target[-1] if target else {"t":0,"cx":0.5,"cy":0.5,"w":0.6,"h":0.6}
target += [last] * (total_frames - len(target))
return target[:total_frames]
# Falls numerische IDs drin wären, fällt es unten durch auf segs-Logik
seg_p = SEGMENTS_DIR / f"{name}_segments.json"
if seg_p.exists():
segs = json.loads(seg_p.read_text(encoding="utf-8"))
target_tid = [None]*total_frames
for s in segs:
a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
for t in range(max(0,a), min(total_frames, b+1)):
target_tid[t] = tid
return target_tid
return [None]*total_frames
def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
if target_tid is None:
return fallback
faces = faces_frame.get("faces", [])
for f in faces:
if int(f.get("track_id", -1)) == int(target_tid):
x,y,w,h = f.get("bbox", [None,None,None,None])
if None not in (x,y,w,h):
return (float(x + w/2), float(y + h/2))
return fallback
def parse_args():
p = argparse.ArgumentParser(description="Apply 9:16 Auto-Crop auf Rohclips mit Face-/Target-Daten.")
p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: *.mp4)")
p.add_argument("--out_w", type=int, default=OUT_W_DEFAULT, help="Output-Breite (Default: 1080)")
p.add_argument("--out_h", type=int, default=OUT_H_DEFAULT, help="Output-Höhe (Default: 1920)")
p.add_argument("--zoom_pad", type=float, default=ZOOM_PAD_FRAC_DEFAULT, help="Zoom-Pad (0..1, Default 0.10)")
p.add_argument("--median", type=int, default=MEDIAN_WIN_DEFAULT, help="Median-Fenster (ungerade, >=3)")
p.add_argument("--ema", type=float, default=EMA_ALPHA_DEFAULT, help="EMA-Alpha (0..1)")
p.add_argument("--deadband", type=float, default=DEADBAND_PX_DEFAULT, help="Totband in Pixel")
p.add_argument("--switch_cd", type=int, default=SWITCH_COOLDOWN_FR_DEFAULT, help="Cooldown-Frames nach Trackwechsel")
p.add_argument("--cut_detect", action="store_true", default=USE_CUT_DETECT_DEFAULT, help="Szenenschnitt-Erkennung aktivieren")
p.add_argument("--cut_corr", type=float, default=CUT_CORR_THRESH_DEFAULT, help="Korrelation-Schwelle (0..1)")
p.add_argument("--cut_cd", type=int, default=CUT_COOLDOWN_DEFAULT, help="Cooldown-Frames nach Cut")
p.add_argument("--mux_audio", action="store_true", default=MUX_AUDIO_DEFAULT, help="Audio vom Original muxen")
p.add_argument("--debug", action="store_true", help="Debug-Overlay anzeigen (langsam)")
p.add_argument("--debug_scale", type=float, default=DEBUG_SCALE_DEFAULT, help="Skalierung Debug-Preview")
p.add_argument("--overwrite", action="store_true", help="Existierende Outputs überschreiben")
return p.parse_args()
def main():
args = parse_args()
OUT_DIR = CROPPED_DIR
OUT_DIR.mkdir(parents=True, exist_ok=True)
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
clips = sorted(list(RAW_CLIPS_DIR.glob(args.pattern)))
if not clips:
print(f"⚠️ Keine Clips in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'")
return
print(f"🔎 {len(clips)} Clips gefunden …")
for video_path in clips:
name = video_path.stem
out_path = OUT_DIR / f"{name}_9x16.mp4"
if out_path.exists() and not args.overwrite:
print(f"⏭️ Skip (existiert): {out_path.name}")
continue
# Video öffnen
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
print(f"❌ Kann Video nicht öffnen: {video_path.name}")
continue
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Face/Target laden
faces_all = load_faces(name)
if faces_all and len(faces_all) < total:
faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
target_by_frame = load_target_map_or_segments(name, total)
# Writer vorbereiten
writer = cv2.VideoWriter(str(out_path),
cv2.VideoWriter_fourcc(*"mp4v"),
fps, (args.out_w, args.out_h))
median_buf = deque(maxlen=max(3, args.median if args.median % 2 else args.median+1))
ema_center: Optional[Tuple[float,float]] = None
last_center: Optional[Tuple[float,float]] = (width/2, height/2)
switch_cooldown = 0
prev_small = None
cut_cd = 0
print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
for t in range(total):
ret, frame = cap.read()
if not ret: break
# Ziel bestimmen:
desired = None
tgt = target_by_frame[t] if t < len(target_by_frame) else None
# Fall A: target_by_frame.json mit direkten Zentren (Dict)
if isinstance(tgt, dict) and all(k in tgt for k in ("cx","cy","w","h")):
desired = (float(tgt["cx"])*width, float(tgt["cy"])*height)
else:
# Fall B: numerische Track-ID
target_tid = tgt if tgt is None or isinstance(tgt, (int, float)) else None
faces_fr = faces_all[t] if (faces_all and t < len(faces_all)) else {"faces":[]}
desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
# Szenenschnitt?
if args.cut_detect:
small = cv2.resize(frame, (128, 72))
if prev_small is not None:
corr = scene_corr(prev_small, small)
if corr < args.cut_corr:
ema_center = desired
last_center = desired
switch_cooldown = args.switch_cd
cut_cd = args.cut_cd
prev_small = small
# Median-Filter
median_buf.append(desired)
if len(median_buf) >= 3:
xs = sorted(p[0] for p in median_buf)
ys = sorted(p[1] for p in median_buf)
m = len(median_buf)//2
desired_f = (xs[m], ys[m])
else:
desired_f = desired
# Trackwechsel erkennen (nur bei Track-IDs sauber möglich)
if t > 0:
prev_tgt = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
else:
prev_tgt = tgt
is_switch = (not isinstance(tgt, dict)) and (tgt != prev_tgt)
if ema_center is None:
ema_center = desired_f
if last_center is None:
last_center = desired_f
if is_switch:
ema_center = desired_f
last_center = desired_f
switch_cooldown = args.switch_cd
else:
dx = desired_f[0] - ema_center[0]
dy = desired_f[1] - ema_center[1]
dist = math.hypot(dx, dy)
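# Right after a detected scene cut (cut_cd > 0) the EMA snaps straight to the new target;
# otherwise movements below the deadband are ignored and larger ones are followed
# with exponential smoothing (alpha = --ema).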
if cut_cd > 0:
ema_center = desired_f
cut_cd -= 1
else:
if dist > args.deadband:
ema_center = (ema_center[0] + dx*args.ema,
ema_center[1] + dy*args.ema)
last_center = desired_f
# 9:16 Crop anwenden
x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height,
args.out_w, args.out_h, args.zoom_pad)
cropped = frame[y:y+h, x:x+w]
if cropped.size == 0: cropped = frame
final = cv2.resize(cropped, (args.out_w, args.out_h), interpolation=cv2.INTER_AREA)
writer.write(final)
if args.debug:
dbg = frame.copy()
cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
draw_center(dbg, desired, (128,128,255), "desired")
draw_center(dbg, desired_f, (255,255, 0), "median")
draw_center(dbg, ema_center, ( 0,255,255), "ema")
cv2.putText(dbg, f"t={t+1}/{total}", (12, height-14),
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
disp = cv2.resize(dbg, (int(width*args.debug_scale), int(height*args.debug_scale)))
cv2.imshow("Apply Debug", disp)
if cv2.waitKey(1) & 0xFF == ord("q"):
print("🛑 Abgebrochen (q).")
break
writer.release()
cap.release()
# Audio muxen?
if args.mux_audio:
tmp = out_path.with_suffix(".tmp.mp4")
try:
out_path.rename(tmp)
mux_audio_from_source(video_path, tmp, out_path)
finally:
if tmp.exists():
try: tmp.unlink()
except OSError: pass
print(f"✅ Fertig (mit Audio): {out_path.name}")
else:
print(f"✅ Fertig: {out_path.name}")
if args.debug:
cv2.destroyAllWindows()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,335 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Face-Detection + Mouth-Openness (YOLOv8-face + MediaPipe)
- liest Rohclips aus RAW_CLIPS_DIR
- schreibt pro Video eine faces.json in FACE_COMBINED_DIR
- optionaler Fortschrittsbalken (tqdm)
"""
from __future__ import annotations
import argparse
import logging
import json
import time
from pathlib import Path
from contextlib import nullcontext
from typing import List, Dict, Any
import cv2
import numpy as np
import torch
from ultralytics import YOLO
import mediapipe as mp
import sys
# ── Projekt-Root + zentrale Pfade laden (muss vor den Projekt-Imports geschehen)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR # zentrale Verzeichnisse
from src.reformat.speaking import get_mouth_openness # Mundöffnungs-Helfer aus dem Projekt
# Fortschritt hübsch, wenn verfügbar
try:
from tqdm import tqdm
_HAS_TQDM = True
except Exception:
_HAS_TQDM = False
# ---------- Performance Tweaks ----------
torch.set_float32_matmul_precision("high")
cv2.setUseOptimized(True)
# ---------- Hilfsfunktionen ----------
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
cx = (x1 + x2) * 0.5
cy = (y1 + y2) * 0.5
w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
side = max(w, h, float(min_crop))
half = side * 0.5
sx1 = int(max(0, round(cx - half)))
sy1 = int(max(0, round(cy - half)))
sx2 = int(min(W, round(cx + half)))
sy2 = int(min(H, round(cy + half)))
side_w = max(0, sx2 - sx1)
side_h = max(0, sy2 - sy1)
side = max(2, min(side_w, side_h))
sx2 = sx1 + side
sy2 = sy1 + side
return sx1, sy1, sx2, sy2
def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
if not lm_lists:
return None
cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
best, best_d = None, 1e12
for lms in lm_lists:
xs = [p.x * crop_w for p in lms.landmark]
ys = [p.y * crop_h for p in lms.landmark]
cx = sum(xs) / len(xs)
cy = sum(ys) / len(ys)
d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
if d < best_d:
best, best_d = lms, d
return best
def run_mesh(face_mesh, crop_bgr, upscale_if_small):
if crop_bgr.size == 0:
return None, 0.0
ch, cw = crop_bgr.shape[:2]
if max(ch, cw) < upscale_if_small:
scale = float(upscale_if_small) / max(ch, cw)
new_w = max(1, int(round(cw * scale)))
new_h = max(1, int(round(ch * scale)))
crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
ch, cw = new_h, new_w
rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
res = face_mesh.process(rgb)
if not res.multi_face_landmarks:
return None, 0.0
chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
if chosen is None:
return None, 0.0
mo = get_mouth_openness(chosen.landmark, ch)
return chosen, float(mo)
# ---------- Kernprozess ----------
def process_video(video_path: Path,
output_path: Path,
model: YOLO,
face_mesh,
conf_thresh: float,
frame_skip: int,
downscale: float,
expansion_1: float,
expansion_2: float,
min_crop: int,
faces_upscale: int,
imgsz: int,
device: str,
max_det: int):
print(f"🎬 Starte Detection: {video_path.name}")
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
return
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
total_to_process = None
if total_frames_raw > 0:
total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
scaled_w = max(1, int(round(orig_w * downscale)))
scaled_h = max(1, int(round(orig_h * downscale)))
data: List[Dict[str, Any]] = []
frame_idx = 0
processed_frames = 0
sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
autocast_ctx = (
torch.autocast(device_type=device, dtype=torch.float16)
if device in ("mps", "cuda") else nullcontext()
)
bar = None
start_t = time.time()
if _HAS_TQDM and total_to_process:
bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
while True:
ret, frame = cap.read()
if not ret:
break
if frame_skip > 1 and (frame_idx % frame_skip != 0):
frame_idx += 1
continue
frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
with torch.no_grad():
with autocast_ctx:
# Ultralytics 8 API: __call__ statt .predict() (beide funktionieren)
result = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
conf=conf_thresh, iou=0.5, max_det=max_det)
detections = result[0]
faces = []
for i in range(len(detections.boxes)):
box = detections.boxes[i]
conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
if conf < conf_thresh:
continue
x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
if downscale != 1.0:
x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
x1 = max(0.0, min(x1, orig_w - 1))
y1 = max(0.0, min(y1, orig_h - 1))
x2 = max(0.0, min(x2, orig_w - 1))
y2 = max(0.0, min(y2, orig_h - 1))
w = max(1.0, x2 - x1)
h = max(1.0, y2 - y1)
cx = x1 + w / 2.0
cy = y1 + h / 2.0
# Pass 1
sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
if sx2 - sx1 < 4 or sy2 - sy1 < 4:
continue
face_crop = frame[sy1:sy2, sx1:sx2]
_, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
# Pass 2 nur wenn nötig
if mouth_open == 0.0:
sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
_, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
faces.append({
"bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
"conf": round(conf, 3),
"center": [round(cx, 1), round(cy, 1)],
"mouth_openness": round(float(mouth_open), 3)
})
data.append({
"frame": frame_idx,
"timestamp": round(frame_idx / fps, 3),
"W": orig_w,
"H": orig_h,
"faces": faces
})
frame_idx += 1
processed_frames += 1
# Fortschritt
if bar is not None:
bar.update(1)
else:
if processed_frames % 30 == 0:
elapsed = time.time() - start_t
rate = processed_frames / max(1e-6, elapsed) # frames/sec
if total_to_process:
remaining = max(0, total_to_process - processed_frames)
eta_sec = remaining / max(1e-6, rate)
print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
f"({processed_frames/total_to_process*100:.1f}%) "
f"{rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
else:
print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
cap.release()
if bar is not None:
bar.close()
# schön formatiertes JSON
output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
print(f"✅ Faces gespeichert: {output_path.name}")
# ---------- CLI ----------
def parse_args():
p = argparse.ArgumentParser(description="YOLOv8-face + MediaPipe FaceMesh → faces.json pro Clip")
# Verzeichnisse (Default aus config.py)
p.add_argument("--input-dir", type=Path, default=RAW_CLIPS_DIR, help=f"Rohclips (Default: {RAW_CLIPS_DIR})")
p.add_argument("--output-dir", type=Path, default=FACE_COMBINED_DIR, help=f"Zielordner (Default: {FACE_COMBINED_DIR})")
# Modell
p.add_argument("--model", type=Path, default=ROOT / "models" / "yolov8n-face.pt",
help="Pfad zum YOLOv8-face Modell (.pt)")
# Optimierte Defaults
p.add_argument("--conf-thresh", type=float, default=0.35)
p.add_argument("--frame-skip", type=int, default=1, help="Nur jeden n-ten Frame verarbeiten")
p.add_argument("--downscale", type=float, default=0.5, help="Eingangsframe auf Faktor verkleinern (0..1)")
p.add_argument("--expansion", type=float, default=0.4, help="Crop-Margin Pass 1 (relativ)")
p.add_argument("--expansion2", type=float, default=0.8, help="Crop-Margin Pass 2 (relativ)")
p.add_argument("--min-crop", type=int, default=160, help="Minimaler Croprand in Pixeln (quadratisch)")
p.add_argument("--faces-upscale", type=int, default=192, help="Minimale Kantenlänge für FaceMesh (bei kleineren Crops upscalen)")
p.add_argument("--imgsz", type=int, default=448)
p.add_argument("--max-det", type=int, default=20)
p.add_argument("--use-refine", action="store_true", default=False, help="MediaPipe mit refine_landmarks")
return p.parse_args()
def main():
args = parse_args()
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
args.output_dir.mkdir(parents=True, exist_ok=True)
# YOLO Modell & Device
yolo = YOLO(str(args.model))
if torch.backends.mps.is_available():
device = "mps"
elif torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
yolo.to(device)
print(f"🖥️ Inference-Device: {device}")
# Warmup
try:
with torch.no_grad():
dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
_ = yolo(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
except Exception:
pass
# Eingabedateien anzeigen
videos = sorted([*args.input_dir.glob("*.mp4"), *args.input_dir.glob("*.mov"), *args.input_dir.glob("*.mkv")])
print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
if not videos:
print("⚠️ Keine passenden Videos gefunden.")
return
print("📁 Dateien:")
for p in videos:
print("", p.name)
outer = None
if _HAS_TQDM:
outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
with mp.solutions.face_mesh.FaceMesh(
static_image_mode=False,
max_num_faces=10,
refine_landmarks=args.use_refine,
min_detection_confidence=0.5,
min_tracking_confidence=0.5
) as face_mesh:
for vid in videos:
out = args.output_dir / f"{vid.stem}_faces.json"
process_video(
video_path=vid,
output_path=out,
model=yolo,
face_mesh=face_mesh,
conf_thresh=args.conf_thresh,
frame_skip=args.frame_skip,
downscale=args.downscale,
expansion_1=args.expansion,
expansion_2=args.expansion2,
min_crop=args.min_crop,
faces_upscale=args.faces_upscale,
imgsz=args.imgsz,
device=device,
max_det=args.max_det
)
if outer is not None:
outer.update(1)
if outer is not None:
outer.close()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,78 @@
#!/usr/bin/env python3
import logging, json
from pathlib import Path
from typing import List, Dict, Any
import sys
# Projekt-Root verfügbar machen
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import FACE_COMBINED_DIR, FACE_CROP_CENTERS # ggf. SEGMENTS_DIR, wenn du dorthin schreibst
def iou(boxA, boxB):
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
interW, interH = max(0, xB-xA), max(0, yB-yA)
inter = interW * interH
union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
return inter/union if union > 0 else 0.0
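# Greedy IoU tracker: every detection is matched to the previous frame's box with the
# highest overlap; if the best IoU stays below iou_thresh, a new track_id is opened.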
def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
next_id = 0
last_boxes = {} # track_id -> bbox
for frame in faces_all:
new_boxes = {}
for face in frame["faces"]:
box = face["bbox"]
# match gegen bestehende
best_id, best_iou = None, 0.0
for tid, prev_box in last_boxes.items():
ov = iou(box, prev_box)
if ov > best_iou:
best_id, best_iou = tid, ov
if best_iou > iou_thresh:
face["track_id"] = best_id
new_boxes[best_id] = box
else:
face["track_id"] = next_id
new_boxes[next_id] = box
next_id += 1
last_boxes = new_boxes
return faces_all
def main():
# Eingabe: erkannte Gesichter/Tracks
FACE_DIR = FACE_COMBINED_DIR
# Ausgabe: z. B. berechnete Center pro Frame
OUT_DIR = FACE_CROP_CENTERS
OUT_DIR.mkdir(parents=True, exist_ok=True)
for f in FACE_DIR.glob("*_faces.json"):
try:
faces_all = json.loads(f.read_text(encoding="utf-8"))
except Exception as e:
print(f"❌ Fehler beim Laden {f.name}: {e}")
continue
tracked = track_faces(faces_all)
f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
print(f"✅ Track-IDs ergänzt: {f.name}")
# zusätzlich centers.json (dominant = höchster mouth_openness pro Frame)
centers = []
for fr in tracked:
if fr["faces"]:
best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
centers.append([best["center"][0], best["center"][1]])
else:
centers.append([fr["W"]/2, fr["H"]/2])
centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json")
centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
print(f"📝 Centers gespeichert: {centers_path.name}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,306 @@
#!/usr/bin/env python3
# make_segments.py — erzeugt pro Highlight eine Zielspur (target_by_frame.json) fürs Cropping
from __future__ import annotations
import json
import argparse
from dataclasses import dataclass
from typing import List, Dict, Optional, Tuple
from pathlib import Path
import sys
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/reformat/)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS, SEGMENTS_DIR
try:
from moviepy.video.io.VideoFileClip import VideoFileClip
MOVIEPY_OK = True
except Exception:
MOVIEPY_OK = False
# ──────────────────────────────────────────────────────────────────────────────
# Hilfsstrukturen
# ──────────────────────────────────────────────────────────────────────────────
@dataclass
class FaceDet:
t: float # Sekunden
cx: float # Zentrum x (0..1)
cy: float # Zentrum y (0..1)
w: float # Breite rel. (0..1)
h: float # Höhe rel. (0..1)
track_id: Optional[int] = None
mouth_prob: Optional[float] = None
def moving_average(xs: List[float], win: int) -> List[float]:
if win <= 1 or len(xs) <= 2:
return xs[:]
# ungerade Fensterbreite erzwingen
win = win if win % 2 == 1 else win + 1
r = win // 2
out = []
for i in range(len(xs)):
a = max(0, i - r)
b = min(len(xs), i + r + 1)
out.append(sum(xs[a:b]) / (b - a))
return out
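# Quick sanity example (illustrative): a centred window of width 3 spreads a single
# spike over its neighbours, while the edge positions use a shrunken window:
#   moving_average([0, 0, 10, 0, 0], 3) -> [0.0, 3.33, 3.33, 3.33, 0.0] (rounded)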
def clamp01(x: float) -> float:
return max(0.0, min(1.0, x))
# ──────────────────────────────────────────────────────────────────────────────
# Lesen möglicher Eingabeformate (robust, schema-tolerant)
# ──────────────────────────────────────────────────────────────────────────────
def _parse_face_like(obj: Dict, t: float, W: float | None = None, H: float | None = None) -> FaceDet:
"""
Erwartet entweder:
- bbox=[x,y,w,h] in Pixel wird via W,H auf 0..1 normiert
- oder bereits normierte Felder cx,cy,w,h in 0..1
Optional: track_id, mouth_prob / mouth_open / talking_prob
"""
if "bbox" in obj and isinstance(obj["bbox"], (list, tuple)) and len(obj["bbox"]) >= 4:
x, y, w, h = [float(v) for v in obj["bbox"][:4]]
if W and H and W > 0 and H > 0:
cx = (x + w * 0.5) / W
cy = (y + h * 0.5) / H
w = w / W
h = h / H
else:
# Falls Maße fehlen: best effort, danach clampen
cx = x + w * 0.5
cy = y + h * 0.5
cx, cy = clamp01(cx), clamp01(cy)
w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
else:
cx = float(obj.get("cx", 0.5))
cy = float(obj.get("cy", 0.5))
w = float(obj.get("w", 0.3))
h = float(obj.get("h", 0.3))
cx, cy = clamp01(cx), clamp01(cy)
w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
track_id = obj.get("track_id")
mouth_prob = obj.get("mouth_prob") or obj.get("mouth_open") or obj.get("talking_prob")
mouth_prob = None if mouth_prob is None else float(mouth_prob)
return FaceDet(t=t, cx=cx, cy=cy, w=w, h=h, track_id=track_id, mouth_prob=mouth_prob)
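# Illustrative example (not used by the pipeline): the same detection expressed as a
# pixel bbox plus frame size vs. as pre-normalised fields yields the same FaceDet.
def _demo_parse_face_like() -> None:
    a = _parse_face_like({"bbox": [100, 50, 200, 100]}, t=0.0, W=1920, H=1080)
    b = _parse_face_like({"cx": 200 / 1920, "cy": 100 / 1080, "w": 200 / 1920, "h": 100 / 1080}, t=0.0)
    print(a)  # cx ≈ 0.104, cy ≈ 0.093, w ≈ 0.104, h ≈ 0.093
    print(b)  # same centre and size, since the fields are already normalised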
def load_faces_or_centers(stem: str, fps_hint: float | None = None) -> List[FaceDet]:
"""
Lädt die beste verfügbare Gesichts/Center-Quelle für ein Highlight.
Suchreihenfolge:
1) FACE_COMBINED_DIR/{stem}_faces.json (Liste von Frames mit 'faces')
2) FACE_CROP_CENTERS/{stem}_centers.json
- akzeptiert entweder [[cx,cy], ...] oder [{t,cx,cy,w,h}, ...]
"""
candidates = [
(FACE_COMBINED_DIR / f"{stem}_faces.json", "faces"),
(FACE_CROP_CENTERS / f"{stem}_centers.json", "centers"),
]
path = kind = None
for p, k in candidates:
if p.exists():
path, kind = p, k
break
if path is None:
print(f"⚠️ Keine Face/Centers-Datei gefunden für {stem}. Fallback später → (0.5,0.5).")
return []
try:
raw = path.read_text(encoding="utf-8")
data = json.loads(raw)
except Exception as e:
print(f"❌ Konnte {path.name} nicht lesen: {e}")
return []
dets: List[FaceDet] = []
# 1) Liste von Frames: [{ "W":..,"H":..,"timestamp"/"t":.., "faces":[...] }, ...]
if isinstance(data, list) and data and isinstance(data[0], dict) and "faces" in data[0]:
for fr in data:
W = float(fr.get("W") or 0.0)
H = float(fr.get("H") or 0.0)
t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
for f in fr.get("faces", []):
dets.append(_parse_face_like(f, t, W, H))
# 2) Dict mit "frames": [...]
elif isinstance(data, dict) and "frames" in data:
for fr in data["frames"]:
W = float(fr.get("W") or 0.0)
H = float(fr.get("H") or 0.0)
t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
for f in fr.get("faces", []):
dets.append(_parse_face_like(f, t, W, H))
# 3) centers.json als Liste von Listen: [[cx,cy], ...]
elif isinstance(data, list) and data and isinstance(data[0], (list, tuple)) and len(data[0]) >= 2:
fps = float(fps_hint or 25.0)
for i, pair in enumerate(data):
cx, cy = float(pair[0]), float(pair[1])
dets.append(FaceDet(t=i / fps, cx=clamp01(cx), cy=clamp01(cy), w=0.6, h=0.6))
# 4) Liste von Dicts mit evtl. bereits normierten Feldern
elif isinstance(data, list) and data and isinstance(data[0], dict):
for item in data:
t = float(item.get("t") or item.get("time") or 0.0)
dets.append(_parse_face_like(item, t))
else:
print(f"⚠️ Unbekanntes JSON-Format in {path.name}.")
return []
# filtern & sortieren
dets = [d for d in dets if 0.0 <= d.cx <= 1.0 and 0.0 <= d.cy <= 1.0]
dets.sort(key=lambda d: d.t)
print(f"{len(dets)} Detektionen aus {path.name} ({kind}).")
return dets
# ──────────────────────────────────────────────────────────────────────────────
# Zielspur berechnen
# ──────────────────────────────────────────────────────────────────────────────
def build_target_by_frame(
faces: List[FaceDet],
duration: float,
fps: float,
smooth_win: int = 7
) -> List[Dict]:
"""
Wählt pro Frame eine Zielposition (cx,cy,w,h).
Heuristik:
- bevorzuge Gesicht mit höchster mouth_prob (wenn vorhanden),
- sonst größtes Bounding-Box-Areal (w*h),
- halte IDs stabil (nicht zu häufige Sprünge).
Anschließend leichte Glättung (Moving Average) der Zentren/Größen.
"""
if fps <= 0:
fps = 25.0
total_frames = max(1, int(round(duration * fps)))
if not faces:
# Fallback: center track
return [{"frame": i, "t": round(i / fps, 4), "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6} for i in range(total_frames)]
frame_targets: List[Tuple[float, float, float, float]] = [] # (cx, cy, w, h)
last_track: Optional[int] = None
# lineare Suche über faces (bei Bedarf später bucketisieren)
for i in range(total_frames):
t = i / fps
lo, hi = t - 1.0 / fps, t + 1.0 / fps
cand: List[FaceDet] = [d for d in faces if lo <= d.t <= hi]
if not cand:
# Nimm den zeitlich nächsten
nearest = min(faces, key=lambda d: abs(d.t - t))
cand = [nearest]
def score(d: FaceDet) -> Tuple[float, float, float]:
mouth = -1.0 if d.mouth_prob is None else float(d.mouth_prob) # None schlechter als 0
area = float(d.w) * float(d.h)
stable = 1.0 if (last_track is not None and d.track_id == last_track) else 0.0
return (mouth, area, stable)
cand.sort(key=score, reverse=True)
best = cand[0]
if best.track_id is not None:
last_track = best.track_id
frame_targets.append((best.cx, best.cy, best.w, best.h))
# Glätten
cxs = moving_average([c for c, _, _, _ in frame_targets], smooth_win)
cys = moving_average([c for _, c, _, _ in frame_targets], smooth_win)
ws = moving_average([w for *_, w, _ in frame_targets], max(3, smooth_win // 2))
hs = moving_average([h for *_, _, h in frame_targets], max(3, smooth_win // 2))
out = []
for i, (cx, cy, w, h) in enumerate(zip(cxs, cys, ws, hs)):
t = i / fps
out.append({
"frame": i,
"t": round(t, 4),
"cx": round(clamp01(cx), 4),
"cy": round(clamp01(cy), 4),
"w": round(max(0.05, min(1.0, w)), 4),
"h": round(max(0.05, min(1.0, h)), 4),
})
return out
# ──────────────────────────────────────────────────────────────────────────────
# I/O
# ──────────────────────────────────────────────────────────────────────────────
def write_target_json(stem: str, target: List[Dict]) -> Path:
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
out_path = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
out_path.write_text(json.dumps(target, ensure_ascii=False, indent=2), encoding="utf-8")
return out_path
# ──────────────────────────────────────────────────────────────────────────────
# CLI / Main
# ──────────────────────────────────────────────────────────────────────────────
def parse_args():
p = argparse.ArgumentParser(description="Erzeugt target_by_frame.json aus Face/Center-Detektionen für Cropping.")
p.add_argument("--pattern", type=str, default="highlight_*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: highlight_*.mp4)")
p.add_argument("--fps", type=float, default=0.0, help="FPS erzwingen (0 = aus Video lesen).")
p.add_argument("--smooth", type=int, default=7, help="Fensterbreite für Moving-Average-Glättung (ungerade).")
p.add_argument("--overwrite", action="store_true", help="Existierende target_by_frame.json überschreiben.")
return p.parse_args()
def main():
if not MOVIEPY_OK:
raise RuntimeError("moviepy ist nicht installiert. Bitte `pip install moviepy` ausführen.")
args = parse_args()
vids = sorted(RAW_CLIPS_DIR.glob(args.pattern))
if not vids:
print(f"⚠️ Keine Rohclips gefunden in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'.")
return
print(f"🔎 Finde {len(vids)} Clips …")
for vid in vids:
stem = vid.stem # z. B. highlight_3
out_json = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
if out_json.exists() and not args.overwrite:
print(f"⏭️ {out_json.name} existiert bereits überspringe (nutze --overwrite zum Ersetzen).")
continue
# Video-Metadaten
try:
with VideoFileClip(str(vid)) as V:
duration = float(V.duration or 0.0)
fps = float(args.fps or (V.fps or 25.0))
except Exception as e:
print(f"❌ Kann Video {vid.name} nicht öffnen: {e} Fallback duration/fps (10s/25fps).")
duration, fps = 10.0, (args.fps or 25.0)
# Face/Centers laden (fps_hint durchreichen, wichtig für centers-Listen)
faces = load_faces_or_centers(stem, fps_hint=fps)
# Zielspur bauen
target = build_target_by_frame(faces, duration=duration, fps=fps, smooth_win=args.smooth)
# Schreiben
out = write_target_json(stem, target)
print(f"💾 geschrieben: {out}")
print("🎉 Fertig.")
if __name__ == "__main__":
main()
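# Illustrative invocation (flags as defined in parse_args above):
#   python make_segments.py --pattern "highlight_*.mp4" --smooth 7 --overwrite
# writes SEGMENTS_DIR/<stem>_target_by_frame.json for every matching raw clip.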

View File

@ -0,0 +1,118 @@
#!/usr/bin/env python3
# src/reformat/new/analyze_mouth_activity.py
import logging
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
# OpenAI optional; aktuell nicht genutzt (Flag fehlt bewusst)
# from openai import OpenAI
# === HARTE DEFAULTS: einfach Play drücken ===
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
TIMED_DIR = PROJECT_ROOT / "data" / "transkripte"
CENTERS_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
def parse_timed_file(path: Path) -> List[Tuple[float, float]]:
"""
Erwartet Zeilen wie:
[00:00.00 - 00:05.20] Text...
Gibt Liste [(start_sec, end_sec)] zurück. Falls keine Treffer: leere Liste.
"""
import re
rx = re.compile(r"\[(\d+):(\d+)\.(\d+)\s*-\s*(\d+):(\d+)\.(\d+)\]")
segs = []
try:
for line in path.read_text(encoding="utf-8").splitlines():
m = rx.search(line)
if not m:
continue
smin, ssec, sms, emin, esec, ems = map(int, m.groups())
start = smin * 60 + ssec + sms / 100.0
end = emin * 60 + esec + ems / 100.0
if end > start:
segs.append((start, end))
except FileNotFoundError:
pass
return segs
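# Example (illustrative): a line such as
#   "[00:01.50 - 00:04.20] Hallo zusammen"
# is parsed into the pair (1.5, 4.2); lines without a timestamp are skipped.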
def select_speaker_center(faces: List[Dict[str, Any]]) -> Tuple[float, float]:
"""Priorität: mouth_openness, Fallback: größte Fläche; sonst Bildmitte."""
if not faces:
return (960.0, 540.0)
def area(f):
bx = f.get("bbox",[0,0,0,0]); return float(bx[2]*bx[3])
best = max(
faces,
key=lambda f: (float(f.get("mouth_openness", 0.0)), area(f))
)
x, y, w, h = best["bbox"]
return (x + w/2.0, y + h/2.0)
def load_json(path: Path):
import json
return json.loads(path.read_text(encoding="utf-8"))
def save_json(obj, path: Path):
import json
path.parent.mkdir(parents=True, exist_ok=True)
path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
def process_one(base_name: str) -> bool:
faces_path = FACES_DIR / f"{base_name}_faces.json"
timed_path = TIMED_DIR / f"{base_name}_timed.txt"
centers_path = CENTERS_DIR / f"{base_name}_centers.json"
if not faces_path.exists():
logging.warning("Skip %-18s | Faces fehlen: %s", base_name, faces_path)
return False
if centers_path.exists():
logging.info("Skip %-18s | Centers existieren schon: %s", base_name, centers_path.name)
return True
try:
face_data: List[Dict[str, Any]] = load_json(faces_path)
except Exception as e:
logging.error("Fehler beim Lesen von %s: %s", faces_path, e)
return False
segments = parse_timed_file(timed_path)
if not segments:
logging.warning("[%s] Keine Segmente erkannt oder Datei fehlt: %s", base_name, timed_path.name)
centers: List[List[float]] = []
for entry in face_data:
faces = entry.get("faces", [])
cx, cy = select_speaker_center(faces)
centers.append([float(cx), float(cy)])
save_json(centers, centers_path)
logging.info("OK %-18s | Centers gespeichert: %s (frames=%d)", base_name, centers_path.name, len(centers))
return True
def main():
logging.basicConfig(
format="%(asctime)s %(levelname)s: %(message)s",
level=logging.INFO
)
if not RAW_DIR.exists():
logging.error("RAW_DIR existiert nicht: %s", RAW_DIR)
return
clips = sorted(RAW_DIR.glob("*.mp4"))
if not clips:
logging.warning("Keine Clips gefunden in %s", RAW_DIR)
return
logging.info("Analyze (mouth) Batch-Mode: %d Clips", len(clips))
ok = 0
for clip in clips:
base = clip.stem
if process_one(base):
ok += 1
logging.info("Fertig. %d/%d Clips verarbeitet.", ok, len(clips))
if __name__ == "__main__":
main()

View File

@ -0,0 +1,294 @@
#!/usr/bin/env python3
# src/reformat/new/main_apply_crop.py
from __future__ import annotations
import logging, json, math, subprocess
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any
from collections import deque
import cv2
import numpy as np
# ==== Pfade =================================================================
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments"
OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_W, OUT_H = 1080, 1920
TARGET_AR = OUT_W / OUT_H # 0.5625
# ==== Debug =================================================================
DEBUG_MODE = False
DEBUG_SCALE = 0.6
DRAW_GUIDES = True
# ==== Smooth / Switch =======================================================
MEDIAN_WIN = 5
EMA_ALPHA = 0.22
DEADBAND_PX = 8.0
SWITCH_COOLDOWN_FRAMES = 12 # kurze Ruhe nach Segmentwechsel
ZOOM_PAD_FRAC = 0.10
# ==== Scene-Cut-Erkennung ===================================================
USE_CUT_DETECT = True
CUT_CORR_THRESH = 0.65
CUT_COOLDOWN = 6
# ==== Audio-Mux =============================================================
MUX_AUDIO = True
FFMPEG_BIN = "ffmpeg"
# ============================================================================
def clamp(v, lo, hi): return max(lo, min(hi, v))
def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int) -> tuple[int,int,int,int]:
"""
Liefert ein 9:16-Croprechteck (x,y,w,h) um (cx,cy).
- AR bleibt IMMER exakt 9:16 (kein Squeeze)
- ZOOM_PAD_FRAC wirkt als uniformer Scale auf Breite und Höhe
- Rechteck bleibt vollständig im Bild
"""
src_ar = src_w / src_h
if src_ar >= TARGET_AR:
base_h = src_h
base_w = int(round(base_h * TARGET_AR))
else:
base_w = src_w
base_h = int(round(base_w / TARGET_AR))
desired_scale = 1.0 + ZOOM_PAD_FRAC
max_scale_w = src_w / base_w
max_scale_h = src_h / base_h
s = min(desired_scale, max_scale_w, max_scale_h)
w = int(round(base_w * s))
h = int(round(base_h * s))
half_w, half_h = w // 2, h // 2
cx = clamp(cx, half_w, src_w - half_w)
cy = clamp(cy, half_h, src_h - half_h)
x = int(round(cx - half_w))
y = int(round(cy - half_h))
return x, y, w, h
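# Worked example (illustrative): a 1920x1080 source centred at (960, 540). The 9:16
# base window is 608x1080; the 10% zoom pad cannot be applied because the full height
# is already used (max vertical scale = 1.0), so:
#   compute_crop_rect(960, 540, 1920, 1080) -> (656, 0, 608, 1080)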
def draw_center(img, pt, color, label=None):
if pt is None: return
x, y = int(pt[0]), int(pt[1])
cv2.circle(img, (x, y), 6, color, -1)
if label:
cv2.putText(img, label, (x + 8, y - 8),
cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
corr = cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL)
return float((corr + 1.0)/2.0)
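# Note: HISTCMP_CORREL lies in [-1, 1]; the (corr + 1) / 2 mapping rescales it to
# [0, 1], so CUT_CORR_THRESH = 0.65 corresponds to a raw correlation of 0.3.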
def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
cmd = [
FFMPEG_BIN, "-y",
"-i", str(src_video),
"-i", str(silent_video),
"-map", "1:v:0",
"-map", "0:a:0?",
"-c:v", "copy",
"-c:a", "aac", "-b:a", "192k",
"-shortest",
str(out_video),
]
subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
def load_faces(name: str) -> List[Dict[str,Any]]:
p = FACE_COMBINED_DIR / f"{name}_faces.json"
return json.loads(p.read_text(encoding="utf-8"))
def load_segments(name: str, total_frames: int) -> List[Optional[int]]:
seg_p = SEGMENTS_DIR / f"{name}_segments.json"
map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
if map_p.exists():
target = json.loads(map_p.read_text(encoding="utf-8"))
if len(target) < total_frames:
target += [target[-1] if target else None] * (total_frames - len(target))
return target[:total_frames]
if seg_p.exists():
segs = json.loads(seg_p.read_text(encoding="utf-8"))
target = [None]*total_frames
for s in segs:
a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
for t in range(max(0,a), min(total_frames, b+1)):
target[t] = tid
return target
return [None]*total_frames
def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
if target_tid is None:
return fallback
faces = faces_frame.get("faces", [])
for f in faces:
if int(f.get("track_id", -1)) == int(target_tid):
x,y,w,h = f.get("bbox", [None,None,None,None])
if None not in (x,y,w,h):
return (float(x + w/2), float(y + h/2))
return fallback
def main():
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
clips = sorted(list(INPUT_VIDEO_DIR.glob("*.mp4")) + list(INPUT_VIDEO_DIR.glob("*.mov")))
if not clips:
print(f"⚠️ Keine Clips in {INPUT_VIDEO_DIR}")
return
for video_path in clips:
name = video_path.stem
faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
if not faces_path.exists():
print(f"⏭️ Skip (keine Faces): {faces_path.name}")
continue
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
print(f"❌ Kann Video nicht öffnen: {video_path.name}")
continue
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
faces_all = load_faces(name)
if len(faces_all) < total:
faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
target_by_frame = load_segments(name, total)
out_path = OUTPUT_DIR / f"{name}_9x16.mp4"
if out_path.exists():
print(f"⏭️ Skip: Output existiert bereits → {out_path.name}")
cap.release()
continue
writer = cv2.VideoWriter(
str(out_path),
cv2.VideoWriter_fourcc(*"mp4v"),
fps,
(OUT_W, OUT_H)
)
median_buf = deque(maxlen=max(3, MEDIAN_WIN if MEDIAN_WIN % 2 else MEDIAN_WIN+1))
ema_center: Optional[Tuple[float,float]] = None
last_center: Optional[Tuple[float,float]] = (width/2, height/2)
switch_cooldown = 0
prev_small = None
cut_cd = 0
print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
for t in range(total):
ret, frame = cap.read()
if not ret: break
target_tid = target_by_frame[t] if t < len(target_by_frame) else None
faces_fr = faces_all[t] if t < len(faces_all) else {"faces":[]}
desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
if USE_CUT_DETECT:
small = cv2.resize(frame, (128, 72))
if prev_small is not None:
corr = scene_corr(prev_small, small)
if corr < CUT_CORR_THRESH:
ema_center = desired
last_center = desired
switch_cooldown = SWITCH_COOLDOWN_FRAMES
cut_cd = CUT_COOLDOWN
prev_small = small
median_buf.append(desired)
if len(median_buf) >= 3:
xs = sorted(p[0] for p in median_buf)
ys = sorted(p[1] for p in median_buf)
m = len(median_buf)//2
desired_f = (xs[m], ys[m])
else:
desired_f = desired
if t > 0:
prev_tid = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
else:
prev_tid = target_tid
if ema_center is None:
ema_center = desired_f
if last_center is None:
last_center = desired_f
if target_tid != prev_tid:
ema_center = desired_f
last_center = desired_f
switch_cooldown = SWITCH_COOLDOWN_FRAMES
else:
dx = desired_f[0] - ema_center[0]
dy = desired_f[1] - ema_center[1]
dist = math.hypot(dx, dy)
if cut_cd > 0:
ema_center = desired_f
cut_cd -= 1
else:
if dist > DEADBAND_PX:
ema_center = (ema_center[0] + dx*EMA_ALPHA,
ema_center[1] + dy*EMA_ALPHA)
last_center = desired_f
# neuer 9:16 Crop
x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height)
cropped = frame[y:y+h, x:x+w]
if cropped.size == 0: cropped = frame
final = cv2.resize(cropped, (OUT_W, OUT_H), interpolation=cv2.INTER_AREA)
writer.write(final)
if DEBUG_MODE:
dbg = frame.copy()
cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
if DRAW_GUIDES:
draw_center(dbg, desired, (128,128,255), "desired")
draw_center(dbg, desired_f, (255,255, 0), "median")
draw_center(dbg, ema_center, ( 0,255,255), "ema")
cv2.putText(dbg, f"t={t+1}/{total} tid={target_tid}",
(12, height-14), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
disp = cv2.resize(dbg, (int(width*DEBUG_SCALE), int(height*DEBUG_SCALE)))
cv2.imshow("Apply Debug", disp)
if cv2.waitKey(1) & 0xFF == ord("q"):
print("🛑 Abgebrochen (q).")
break
writer.release()
cap.release()
if MUX_AUDIO:
tmp = out_path.with_suffix(".tmp.mp4")
try:
    out_path.rename(tmp)
    mux_audio_from_source(video_path, tmp, out_path)
finally:
    if tmp.exists():
        if out_path.exists():
            # mux succeeded: drop the silent intermediate file
            try:
                tmp.unlink()
            except OSError:
                pass
        else:
            # mux failed (ffmpeg runs with check=False): restore the silent render instead of losing it
            tmp.rename(out_path)
print(f"✅ Fertig (mit Audio): {out_path.name}")
else:
print(f"✅ Fertig: {out_path.name}")
if DEBUG_MODE:
cv2.destroyAllWindows()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,319 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import logging
import json
import time
from pathlib import Path
from contextlib import nullcontext
import cv2
import numpy as np
import torch
from ultralytics import YOLO
import mediapipe as mp
# Fortschritt hübsch, wenn verfügbar
try:
from tqdm import tqdm
_HAS_TQDM = True
except Exception:
_HAS_TQDM = False
from src.reformat.new.speaking import get_mouth_openness
# ---------- Performance Tweaks ----------
torch.set_float32_matmul_precision("high")
cv2.setUseOptimized(True)
# ---------- Hilfsfunktionen ----------
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
cx = (x1 + x2) * 0.5
cy = (y1 + y2) * 0.5
w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
side = max(w, h, float(min_crop))
half = side * 0.5
sx1 = int(max(0, round(cx - half)))
sy1 = int(max(0, round(cy - half)))
sx2 = int(min(W, round(cx + half)))
sy2 = int(min(H, round(cy + half)))
side_w = max(0, sx2 - sx1)
side_h = max(0, sy2 - sy1)
side = max(2, min(side_w, side_h))
sx2 = sx1 + side
sy2 = sy1 + side
return sx1, sy1, sx2, sy2
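# Worked example (illustrative): a 100x80 px detection at (100, 100)-(200, 180) in a
# 1920x1080 frame with margin_scale=0.4 and min_crop=160 becomes a 180x180 square:
#   make_square_crop(100, 100, 200, 180, 1920, 1080, 0.4, 160) -> (60, 50, 240, 230)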
def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
if not lm_lists:
return None
cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
best, best_d = None, 1e12
for lms in lm_lists:
xs = [p.x * crop_w for p in lms.landmark]
ys = [p.y * crop_h for p in lms.landmark]
cx = sum(xs) / len(xs)
cy = sum(ys) / len(ys)
d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
if d < best_d:
best, best_d = lms, d
return best
def run_mesh(face_mesh, crop_bgr, upscale_if_small):
if crop_bgr.size == 0:
return None, 0.0
ch, cw = crop_bgr.shape[:2]
if max(ch, cw) < upscale_if_small:
scale = float(upscale_if_small) / max(ch, cw)
new_w = max(1, int(round(cw * scale)))
new_h = max(1, int(round(ch * scale)))
crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
ch, cw = new_h, new_w
rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
res = face_mesh.process(rgb)
if not res.multi_face_landmarks:
return None, 0.0
chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
if chosen is None:
return None, 0.0
mo = get_mouth_openness(chosen.landmark, ch)
return chosen, float(mo)
# ---------- Kernprozess ----------
def process_video(video_path: Path,
output_path: Path,
model: YOLO,
face_mesh,
conf_thresh: float,
frame_skip: int,
downscale: float,
expansion_1: float,
expansion_2: float,
min_crop: int,
faces_upscale: int,
imgsz: int,
device: str,
max_det: int):
print(f"🎬 Starte Detection: {video_path.name}")
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
return
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
# Wenn frame_skip > 1, reduziert sich die tatsächlich verarbeitete Anzahl
total_to_process = None
if total_frames_raw > 0:
total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
scaled_w = max(1, int(round(orig_w * downscale)))
scaled_h = max(1, int(round(orig_h * downscale)))
data = []
frame_idx = 0
processed_frames = 0
sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
autocast_ctx = (
torch.autocast(device_type=device, dtype=torch.float16)
if device in ("mps", "cuda") else nullcontext()
)
# Fortschrittsbalken pro Video
bar = None
start_t = time.time()
if _HAS_TQDM:
bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
while True:
ret, frame = cap.read()
if not ret:
break
if frame_skip > 1 and (frame_idx % frame_skip != 0):
frame_idx += 1
continue
frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
with torch.no_grad():
with autocast_ctx:
detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
conf=conf_thresh, iou=0.5, max_det=max_det)[0]
faces = []
for i in range(len(detections.boxes)):
box = detections.boxes[i]
conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
if conf < conf_thresh:
continue
x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
if downscale != 1.0:
x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
x1 = max(0.0, min(x1, orig_w - 1))
y1 = max(0.0, min(y1, orig_h - 1))
x2 = max(0.0, min(x2, orig_w - 1))
y2 = max(0.0, min(y2, orig_h - 1))
w = max(1.0, x2 - x1)
h = max(1.0, y2 - y1)
cx = x1 + w / 2.0
cy = y1 + h / 2.0
# Pass 1
sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
if sx2 - sx1 < 4 or sy2 - sy1 < 4:
continue
face_crop = frame[sy1:sy2, sx1:sx2]
_, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
# Pass 2 nur wenn nötig
if mouth_open == 0.0:
sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
_, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
faces.append({
"bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
"conf": round(conf, 3),
"center": [round(cx, 1), round(cy, 1)],
"mouth_openness": round(float(mouth_open), 3)
})
data.append({
"frame": frame_idx,
"timestamp": round(frame_idx / fps, 3),
"W": orig_w,
"H": orig_h,
"faces": faces
})
frame_idx += 1
processed_frames += 1
# Fortschritt aktualisieren
if _HAS_TQDM:
bar.update(1)
else:
# leichter Fallback: ETA-Ausgabe alle 30 verarbeiteten Frames
if processed_frames % 30 == 0:
elapsed = time.time() - start_t
rate = processed_frames / max(1e-6, elapsed) # frames/sec
if total_to_process:
remaining = max(0, total_to_process - processed_frames)
eta_sec = remaining / max(1e-6, rate)
print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
f"({processed_frames/total_to_process*100:.1f}%) "
f"{rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
else:
print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
cap.release()
if _HAS_TQDM and bar is not None:
bar.close()
output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
print(f"✅ Faces gespeichert: {output_path.name}")
def main():
parser = argparse.ArgumentParser()
# Verzeichnisse
parser.add_argument("--input-dir", type=Path,
default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips"))
parser.add_argument("--output-dir", type=Path,
default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined"))
parser.add_argument("--model", type=Path,
default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt"))
# Optimierte Defaults (keine Presets nötig)
parser.add_argument("--conf-thresh", type=float, default=0.35)
parser.add_argument("--frame-skip", type=int, default=1)
parser.add_argument("--downscale", type=float, default=0.5)
parser.add_argument("--expansion", type=float, default=0.4)
parser.add_argument("--expansion2", type=float, default=0.8)
parser.add_argument("--min-crop", type=int, default=160)
parser.add_argument("--faces-upscale", type=int, default=192)
parser.add_argument("--imgsz", type=int, default=448)
parser.add_argument("--max-det", type=int, default=20)
parser.add_argument("--use-refine", action="store_true", default=False)
args = parser.parse_args()
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
args.output_dir.mkdir(parents=True, exist_ok=True)
# Model & Device
yolo = YOLO(str(args.model))
if torch.backends.mps.is_available():
device = "mps"
elif torch.cuda.is_available():
device = "cuda"
else:
device = "cpu"
yolo.to(device)
print(f"🖥️ Inference-Device: {device}")
# Warmup (reduziert Anlaufschwankungen)
try:
with torch.no_grad():
dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
_ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
except Exception:
pass
# Liste der Videos (für Gesamt-Fortschritt)
videos = sorted(args.input_dir.glob("*.mp4"))
print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
print("📁 Dateien:")
for p in sorted(args.input_dir.glob("*")):
print("", p.name)
# Gesamt-Fortschrittsbalken pro Datei
outer = None
if _HAS_TQDM:
outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
with mp.solutions.face_mesh.FaceMesh(
static_image_mode=False,
max_num_faces=10,
refine_landmarks=args.use_refine,
min_detection_confidence=0.5,
min_tracking_confidence=0.5
) as face_mesh:
for vid in videos:
out = args.output_dir / f"{vid.stem}_faces.json"
process_video(
video_path=vid,
output_path=out,
model=yolo,
face_mesh=face_mesh,
conf_thresh=args.conf_thresh,
frame_skip=args.frame_skip,
downscale=args.downscale,
expansion_1=args.expansion,
expansion_2=args.expansion2,
min_crop=args.min_crop,
faces_upscale=args.faces_upscale,
imgsz=args.imgsz,
device=device,
max_det=args.max_det
)
if _HAS_TQDM and outer is not None:
outer.update(1)
if _HAS_TQDM and outer is not None:
outer.close()
if __name__ == "__main__":
main()

View File

@ -0,0 +1,67 @@
#!/usr/bin/env python3
import logging, json
from pathlib import Path
from typing import List, Dict, Any
def iou(boxA, boxB):
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
interW, interH = max(0, xB-xA), max(0, yB-yA)
inter = interW * interH
union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
return inter/union if union > 0 else 0.0
def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
next_id = 0
last_boxes = {} # track_id -> bbox
for frame in faces_all:
new_boxes = {}
for face in frame["faces"]:
box = face["bbox"]
# match gegen bestehende
best_id, best_iou = None, 0.0
for tid, prev_box in last_boxes.items():
ov = iou(box, prev_box)
if ov > best_iou:
best_id, best_iou = tid, ov
if best_iou > iou_thresh:
face["track_id"] = best_id
new_boxes[best_id] = box
else:
face["track_id"] = next_id
new_boxes[next_id] = box
next_id += 1
last_boxes = new_boxes
return faces_all
def main():
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
FACE_DIR = PROJECT_ROOT / "data" / "face_data_combined"
for f in FACE_DIR.glob("*_faces.json"):
try:
faces_all = json.loads(f.read_text(encoding="utf-8"))
except Exception as e:
print(f"❌ Fehler beim Laden {f.name}: {e}")
continue
tracked = track_faces(faces_all)
f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
print(f"✅ Track-IDs ergänzt: {f.name}")
# zusätzlich centers.json (dominant = höchster mouth_openness pro Frame)
centers = []
for fr in tracked:
if fr["faces"]:
best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
centers.append([best["center"][0], best["center"][1]])
else:
centers.append([fr["W"]/2, fr["H"]/2])
centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json")
centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
print(f"📝 Centers gespeichert: {centers_path.name}")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,179 @@
#!/usr/bin/env python3
# src/reformat/new/make_segments.py
from __future__ import annotations
import json, math
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import cv2
# ==== Pfade (an dein Projekt angepasst) =====================================
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" # Videos
FACE_COMBINED_DIR= PROJECT_ROOT / "data" / "face_data_combined" # *_faces.json
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" # Output
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
# ===========================================================================
# === Segment-Parameter ===
WIN_SEC = 1.2 # Fensterlänge
STRIDE_SEC = 0.6 # Schrittweite
HYSTERESIS_FACTOR = 1.25 # neuer Sprecher muss +25% besser sein
MIN_SEG_SEC = 1.0 # kürzere Segmente werden an Nachbarn gemerged
CONF_MIN = 0.35 # Sichtbarkeits-Threshold
AREA_CAP_FRAC = 0.12 # ab 12% Framefläche kappen wir den Flächenbonus
@dataclass
class Segment:
start_f: int
end_f: int
track_id: Optional[int]
def robust_minmax(vals, p_lo=5, p_hi=95):
v = np.array(vals, dtype=float)
lo, hi = np.percentile(v, [p_lo, p_hi])
if hi <= lo: hi = lo + 1e-6
return float(lo), float(hi)
def score_face(face: Dict[str,Any], W: int, H: int, cx: float, cy: float,
lo: float, hi: float) -> float:
# Mundaktivität robust normalisieren
mo = float(face.get("mouth_openness", 0.0))
mo = (mo - lo) / (hi - lo + 1e-9)
mo = float(np.clip(mo, 0.0, 1.0))
x, y, w, h = map(float, face.get("bbox", [0,0,0,0]))
conf = float(face.get("conf", 1.0))
if conf < CONF_MIN or w <= 5 or h <= 5: # sehr kleine/unsichere Gesichter raus
return 0.0
area = (w*h) / (W*H + 1e-9)
size_w = min(1.0, area / AREA_CAP_FRAC) # Flächengewicht
fx = x + w/2; fy = y + h/2
dist = math.hypot(fx - cx, fy - cy) / math.hypot(W/2, H/2)
center_w = max(0.0, 1.0 - dist**2) # Mitte leicht bevorzugen
# MO dominiert, Fläche und Mitte geben Stabilität
return mo * (0.6 + 0.3*size_w + 0.1*center_w)
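# Worked example (illustrative): a face with normalised mouth activity mo = 0.8 that
# covers 6% of the frame (size_w = 0.06 / AREA_CAP_FRAC = 0.5) and sits exactly in the
# centre (center_w = 1.0) scores 0.8 * (0.6 + 0.3*0.5 + 0.1*1.0) = 0.68.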
def build_segments_for_clip(faces_per_frame: List[Dict[str,Any]], fps: float) -> Tuple[List[Segment], List[Optional[int]]]:
T = len(faces_per_frame)
if T == 0:
return [], []
# Framegröße
W = faces_per_frame[0].get("W") or faces_per_frame[0].get("width")
H = faces_per_frame[0].get("H") or faces_per_frame[0].get("height")
if not W or not H:
# Versuch, aus BBox-Max abzuleiten (Fallback)
max_w = max((f["bbox"][0]+f["bbox"][2]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1920
max_h = max((f["bbox"][1]+f["bbox"][3]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1080
W, H = int(max_w), int(max_h)
# Mundwerte für robuste Normierung sammeln
all_mo = [float(f.get("mouth_openness", 0.0))
for fr in faces_per_frame for f in fr.get("faces", [])]
lo, hi = robust_minmax(all_mo) if all_mo else (0.0, 1.0)
win = max(1, int(round(WIN_SEC * fps)))
stride = max(1, int(round(STRIDE_SEC * fps)))
minseg = max(1, int(round(MIN_SEG_SEC * fps)))
chosen_by_frame: List[Optional[int]] = [None]*T
last_track: Optional[int] = None
for start in range(0, T, stride):
end = min(T, start + win)
sums: Dict[int, float] = {}
for t in range(start, end):
faces = faces_per_frame[t].get("faces", [])
if not faces: continue
for face in faces:
tid = face.get("track_id")
if tid is None:
continue
s = score_face(face, W, H, W/2, H/2, lo, hi)
if s <= 0:
continue
tid = int(tid)
sums[tid] = sums.get(tid, 0.0) + s
if not sums:
chosen = last_track
else:
best_tid, best_val = max(sums.items(), key=lambda kv: kv[1])
if last_track is None:
chosen = best_tid
else:
prev_val = sums.get(last_track, 0.0)
chosen = best_tid if best_val > prev_val * HYSTERESIS_FACTOR else last_track
for t in range(start, end):
chosen_by_frame[t] = chosen
last_track = chosen
# Lücken auffüllen
for t in range(T):
if chosen_by_frame[t] is None:
chosen_by_frame[t] = last_track
# Segmente bauen
segs: List[Segment] = []
cur = chosen_by_frame[0]
seg_start = 0
for t in range(1, T):
if chosen_by_frame[t] != cur:
segs.append(Segment(seg_start, t-1, cur))
cur = chosen_by_frame[t]
seg_start = t
segs.append(Segment(seg_start, T-1, cur))
# Mindestlänge: zu kurze an vorheriges mergen
out: List[Segment] = []
for s in segs:
if out and (s.end_f - s.start_f + 1) < minseg:
out[-1].end_f = s.end_f
else:
out.append(s)
return out, chosen_by_frame
def main():
clips = sorted(list(RAW_DIR.glob("*.mp4")) + list(RAW_DIR.glob("*.mov")))
if not clips:
print(f"⚠️ Keine Videos in {RAW_DIR}")
return
for vid in clips:
name = vid.stem
faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
if not faces_path.exists():
print(f"⏭️ Skip (keine Faces): {faces_path.name}")
continue
# FPS vom Video
cap = cv2.VideoCapture(str(vid))
if not cap.isOpened():
print(f"❌ Kann Video nicht öffnen: {vid.name}")
continue
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
cap.release()
try:
faces_per_frame = json.loads(faces_path.read_text(encoding="utf-8"))
except Exception as e:
print(f"❌ Fehler beim Lesen {faces_path.name}: {e}")
continue
segs, chosen = build_segments_for_clip(faces_per_frame, fps)
seg_out = SEGMENTS_DIR / f"{name}_segments.json"
map_out = SEGMENTS_DIR / f"{name}_target_by_frame.json"
seg_out.write_text(json.dumps([s.__dict__ for s in segs], ensure_ascii=False), encoding="utf-8")
map_out.write_text(json.dumps(chosen, ensure_ascii=False), encoding="utf-8")
print(f"✅ Segmente erzeugt: {seg_out.name} ({len(segs)} Segmente)")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,58 @@
from typing import Dict, List, Tuple, Optional
from .tracking import FaceTracker
class SmartSpeakerTracker:
def __init__(self):
self.face_tracker = FaceTracker()
self.movement_per_id: Dict[int, float] = {}
self.prev_openness: Dict[int, float] = {}
self.confirmation_counter: Dict[int, int] = {}
self.speaker_threshold = 3.0 # wie viel Lippenbewegung braucht es mind.
self.decay_factor = 0.9 # wie schnell "verblasst" die Bewegung
self.speaker_confirm_frames = 25 # wie viele Frames muss ein Sprecher dominieren
self.speaker_id: Optional[int] = None
def update(self, faces: List[Dict]) -> Tuple[float, float]:
if not faces:
return self.face_tracker.update([])
# Lippenbewegung analysieren
for face in faces:
id = face.get("id")
openness = face.get("mouth_openness", 0.0)
prev = self.prev_openness.get(id, openness)
movement = abs(openness - prev)
# Bewegung aufaddieren mit Decay
old_score = self.movement_per_id.get(id, 0.0) * self.decay_factor
self.movement_per_id[id] = old_score + movement
self.prev_openness[id] = openness
# Finde ID mit größter Bewegung
if self.movement_per_id:
top_id = max(self.movement_per_id, key=self.movement_per_id.get)
top_movement = self.movement_per_id[top_id]
if top_movement >= self.speaker_threshold:
self.confirmation_counter[top_id] = self.confirmation_counter.get(top_id, 0) + 1
# Andere runterzählen
for other_id in self.confirmation_counter:
if other_id != top_id:
self.confirmation_counter[other_id] = max(0, self.confirmation_counter[other_id] - 1)
# Wenn lange genug bestätigt, neuer Sprecher
if self.confirmation_counter[top_id] >= self.speaker_confirm_frames:
self.speaker_id = top_id
else:
# Wenn keiner über der Schwelle → kein neuer Sprecher
self.confirmation_counter = {k: max(0, v - 1) for k, v in self.confirmation_counter.items()}
# Sprecher vorhanden → dahin zentrieren
if self.speaker_id is not None:
for face in faces:
if face.get("id") == self.speaker_id:
return tuple(face["center"])
# Fallback: stabiler Durchschnitt
centers = [tuple(face["center"]) for face in faces]
return self.face_tracker.update(centers)
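# Illustrative usage sketch (not wired into the pipeline). It assumes each face dict
# carries "id", "center" [x, y] and "mouth_openness", as the upstream detection emits.
def _demo_smart_speaker_tracker() -> None:
    tracker = SmartSpeakerTracker()
    center = (0.0, 0.0)
    for i in range(40):  # roughly 1.6 s at 25 fps
        faces = [
            {"id": 0, "center": [400.0, 300.0], "mouth_openness": 0.02},                       # silent face
            {"id": 1, "center": [1400.0, 320.0], "mouth_openness": 0.6 if i % 2 else 0.0},     # talking face
        ]
        center = tracker.update(faces)
    # after enough confirmed frames the tracker locks onto face 1
    print(center)  # expected: (1400.0, 320.0)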

View File

@ -0,0 +1,67 @@
import json
from pathlib import Path
from typing import List, Dict
# === Pfade ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[2]
FACES_PATH = PROJECT_ROOT / "data" / "face_data_combined" / "testVideoShort_faces.json"
SEGMENTS_PATH = PROJECT_ROOT / "data" / "transkripte" / "testVideoShort_segments.json"
OUTPUT_PATH = PROJECT_ROOT / "data" / "face_crop_centers" / "testVideoShort_centers.json"
FPS = 25 # Muss zur Framerate deines Videos passen
# === Dateien laden ===
with open(FACES_PATH) as f:
face_data = json.load(f)
with open(SEGMENTS_PATH) as f:
segments = json.load(f)
# === Zentrierungen pro Frame bestimmen ===
frame_centers: List[List[float]] = []
for segment in segments:
start_sec = segment["start"]
end_sec = segment["end"]
start_f = int(start_sec * FPS)
end_f = int(end_sec * FPS)
# Lippenbewegung pro ID in diesem Segment aufaddieren
movement: Dict[int, float] = {}
count: Dict[int, int] = {}
for f in range(start_f, min(end_f, len(face_data))):
for face in face_data[f]["faces"]:
id = face.get("id")
openness = face.get("mouth_openness", 0.0)
movement[id] = movement.get(id, 0.0) + openness
count[id] = count.get(id, 0) + 1
# Durchschnitt berechnen
avg_movement = {id: movement[id] / count[id] for id in movement if count[id] > 0}
if not avg_movement:
speaker_id = None
else:
speaker_id = max(avg_movement, key=avg_movement.get)
# Für jedes Frame in diesem Segment den Sprecher zentrieren
for f in range(start_f, min(end_f, len(face_data))):
faces = face_data[f].get("faces", [])
center = [960.0, 540.0] # Fallback
if speaker_id is not None:
for face in faces:
if face.get("id") == speaker_id:
center = face["center"][:2]
break
frame_centers.append([round(center[0], 2), round(center[1], 2)])
# === Ergebnis speichern ===
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w") as f:
json.dump(frame_centers, f, indent=2)
print(f"✅ Zentrierung auf Sprecher für {len(frame_centers)} Frames gespeichert unter:\n{OUTPUT_PATH}")

View File

@ -0,0 +1,84 @@
from typing import List, Tuple, Optional
class FaceTracker:
def __init__(
self,
dist_threshold: float = 200.0,
switch_frames: int = 5,
panning_window: int = 10,
panning_threshold: float = 40.0,
smooth_window: int = 3,
scene_jump_threshold: float = 400.0
):
self.dist_threshold = dist_threshold
self.switch_frames = switch_frames
self.panning_window = panning_window
self.panning_threshold = panning_threshold
self.smooth_window = smooth_window
self.scene_jump_threshold = scene_jump_threshold
self.current_center: Tuple[float, float] = (960.0, 540.0) # Default Mitte (bei 1920x1080)
self.raw_center: Tuple[float, float] = self.current_center
self.prev_center: Tuple[float, float] = self.current_center
self.prev_raw: Tuple[float, float] = self.current_center
self.candidate_center: Optional[Tuple[float, float]] = None
self.switch_counter = 0
self.recent_raw_centers: List[Tuple[float, float]] = []
self.recent_final_centers: List[Tuple[float, float]] = []
def update(self, candidates: List[Tuple[float, float]]) -> Tuple[float, float]:
if not candidates:
# kein Gesicht → verwende alten Wert
self.recent_raw_centers.append(self.raw_center)
self.recent_final_centers.append(self.current_center)
return self.current_center
# nehme das Gesicht, das am nächsten zur vorherigen Position ist
new_center = min(candidates, key=lambda pt: self._distance(self.prev_center, pt))
self.raw_center = new_center
self.recent_raw_centers.append(new_center)
dist = self._distance(self.prev_raw, new_center)
if dist > self.scene_jump_threshold:
self.current_center = new_center
self.prev_center = new_center
self.prev_raw = new_center
self._smooth_reset()
return self.current_center
if dist > self.dist_threshold:
if self.candidate_center != new_center:
self.candidate_center = new_center
self.switch_counter = 1
else:
self.switch_counter += 1
if self.switch_counter >= self.switch_frames:
self.prev_center = self.current_center
self.current_center = new_center
self.prev_raw = new_center
self.switch_counter = 0
else:
self.switch_counter = 0
self.prev_raw = new_center
# Smoothes Nachziehen
smoothed = self._moving_average(self.current_center, new_center, self.smooth_window)
self.prev_center = self.current_center
self.current_center = smoothed
self.recent_final_centers.append(smoothed)
return smoothed
def _moving_average(self, old, new, factor):
x = (old[0] * (factor - 1) + new[0]) / factor
y = (old[1] * (factor - 1) + new[1]) / factor
return (x, y)
def _distance(self, pt1, pt2):
return ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5
def _smooth_reset(self):
self.recent_raw_centers.clear()
self.recent_final_centers.clear()

129
src/reformat/new/utils.py Normal file
View File

@ -0,0 +1,129 @@
# src/reformat/new/utils.py
from __future__ import annotations
import json
import logging
import os
from pathlib import Path
from typing import Any, Dict, Tuple
try:
import cv2
except Exception:
cv2 = None # erlaubt Import ohne OpenCV (z.B. beim reinen Testen)
# --- Logging ---------------------------------------------------------------
def setup_logging(debug: bool = False) -> None:
level = logging.DEBUG if debug else logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s | %(levelname)s | %(message)s",
)
# --- Mathe/Helpers ---------------------------------------------------------
def clamp(v: float, lo: float, hi: float) -> float:
return max(lo, min(hi, v))
def compute_crop_width(orig_w: int, orig_h: int, out_w: int = 1080, out_h: int = 1920) -> int:
# Für 9:16 Ziel: Breite = (9/16) * orig_h, standardmäßig 1080x1920
return int((out_w / out_h) * orig_h)
def iou(boxA, boxB) -> float:
"""Berechnet Intersection-over-Union zweier Bounding-Boxes."""
ax1, ay1, aw, ah = boxA
ax2, ay2 = ax1 + aw, ay1 + ah
bx1, by1, bw, bh = boxB
bx2, by2 = bx1 + bw, by1 + bh
inter_x1 = max(ax1, bx1)
inter_y1 = max(ay1, by1)
inter_x2 = min(ax2, bx2)
inter_y2 = min(ay2, by2)
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
union_area = aw * ah + bw * bh - inter_area
return inter_area / union_area if union_area > 0 else 0
# --- IO --------------------------------------------------------------------
def load_json(path: Path) -> Any:
if not path.exists():
raise FileNotFoundError(f"Datei fehlt: {path}")
with open(path, "r", encoding="utf-8") as f:
return json.load(f)
def save_json(obj: Any, path: Path) -> None:
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
json.dump(obj, f, ensure_ascii=False, indent=2)
def ensure_exists(path: Path, what: str = "Datei/Ordner") -> None:
if not path.exists():
raise FileNotFoundError(f"{what} nicht gefunden: {path}")
# --- Video / Pfade ---------------------------------------------------------
def get_fps(video_path: Path, fallback: float = 25.0) -> float:
if cv2 is None:
logging.warning("OpenCV nicht verfügbar nutze FPS-Fallback %.2f", fallback)
return fallback
cap = cv2.VideoCapture(str(video_path))
fps = cap.get(cv2.CAP_PROP_FPS)
cap.release()
if not fps or fps <= 1e-3:
logging.warning("Konnte FPS nicht lesen nutze Fallback %.2f", fallback)
return fallback
return float(fps)
def project_root_from(file: Path) -> Path:
# Determine the project root relative to the calling file (here: parents[3]):
return file.resolve().parents[3]
def resolve_paths(project_root: Path, base_name: str) -> Dict[str, Path]:
data = project_root / "data"
return {
"timed_path": data / "transkripte" / f"{base_name}_timed.txt",
"segments_path":data / "transkripte" / f"{base_name}_segments.json",
"faces_path": data / "face_data_combined" / f"{base_name}_faces.json",
"centers_path": data / "face_crop_centers" / f"{base_name}_centers.json",
"video_path": data / "output" / "raw_clips" / f"{base_name}.mp4",
"out_9x16_dir": project_root / "output" / "output_9x16_final",
"face_debug_dir": project_root / "output" / "debug" / "faces",
}
def require_api_key(env_name: str = "OPENAI_API_KEY") -> str:
key = os.getenv(env_name)
if not key:
raise RuntimeError(
f"Umgebungsvariable {env_name} fehlt. "
f"Exportiere sie z.B.: export {env_name}='sk-...'")
return key
# --- Simple smoothing for centers ------------------------------------------
from typing import List, Optional
class CenterSmoother:
"""Glättet Zentren mit Moving Average und optionaler Jump-Erkennung."""
def __init__(self, window: int = 7, jump_thresh: float = 120.0):
self.window = window
self.jump_thresh = jump_thresh
self.buffer: List[Tuple[float, float]] = []
self.prev: Optional[Tuple[float, float]] = None
def push(self, cx: float, cy: float) -> Tuple[float, float]:
if self.prev is not None:
dx = abs(cx - self.prev[0]) + abs(cy - self.prev[1])
if dx > self.jump_thresh:
# harter Cut: reset buffer
self.buffer.clear()
self.buffer.append((cx, cy))
if len(self.buffer) > self.window:
self.buffer.pop(0)
avgx = sum(p[0] for p in self.buffer) / len(self.buffer)
avgy = sum(p[1] for p in self.buffer) / len(self.buffer)
self.prev = (avgx, avgy)
return self.prev
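# Illustrative usage sketch: small jitter is averaged away, while a jump beyond
# jump_thresh resets the buffer so the smoothed centre snaps to the new position.
def _demo_center_smoother() -> None:
    sm = CenterSmoother(window=3, jump_thresh=120.0)
    print(sm.push(100.0, 100.0))   # (100.0, 100.0)
    print(sm.push(106.0, 100.0))   # (103.0, 100.0), averaged with the previous point
    print(sm.push(500.0, 400.0))   # (500.0, 400.0), treated as a hard cut, buffer reset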

View File

@ -0,0 +1,235 @@
import argparse
import json
import logging
import math
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
class FaceTracker:
def __init__(
self,
dist_threshold: float,
switch_frames: int,
panning_window: int,
panning_threshold: float,
smooth_window: int,
scene_jump_threshold: float,
):
self.dist_threshold = dist_threshold
self.switch_frames = switch_frames
self.panning_window = panning_window
self.panning_threshold = panning_threshold
self.smooth_window = smooth_window
self.scene_jump_threshold = scene_jump_threshold
self.current_center: Tuple[float, float] = (960.0, 540.0)
self.raw_center: Tuple[float, float] = self.current_center
self.prev_center: Tuple[float, float] = self.current_center
self.prev_raw: Tuple[float, float] = self.current_center
self.candidate_center: Optional[Tuple[float, float]] = None
self.switch_counter: int = 0
self.last_speaker_set: bool = False
self.random_center: Optional[Tuple[float, float]] = None
self.panning_buffer: List[float] = []
self.smooth_buffer: List[Tuple[float, float]] = []
def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]:
valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None]
all_faces = [f for f in faces if f.get("center")]
# Speaker tracking
if valid_faces:
self._update_speaker(valid_faces)
else:
self._retain_or_random_center(all_faces)
# Panning detection
is_panning = self._detect_panning()
# Smooth / moving average
center = self._smooth_center()
return (int(center[0]), int(center[1])), is_panning
def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None:
best = max(valid_faces, key=lambda x: x["mouth_openness"])
cx, cy, *_ = best["center"]
new_center = (cx, cy)
dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1])
if dist < self.dist_threshold:
self.raw_center = new_center
self.candidate_center = None
self.switch_counter = 0
else:
if (
self.candidate_center is None
or math.hypot(
new_center[0] - self.candidate_center[0], new_center[1] - self.candidate_center[1]
)
> self.dist_threshold
):
self.candidate_center = new_center
self.switch_counter = 1
else:
self.switch_counter += 1
if self.switch_counter >= self.switch_frames:
self.raw_center = self.candidate_center # type: ignore
self.candidate_center = None
self.switch_counter = 0
self.random_center = None
self.last_speaker_set = True
def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None:
if self.last_speaker_set:
# keep previous raw_center
pass
elif self.random_center is not None:
self.raw_center = self.random_center
elif all_faces:
f = random.choice(all_faces)
cx, cy, *_ = f["center"]
self.random_center = (cx, cy)
self.raw_center = self.random_center
def _detect_panning(self) -> bool:
dx = self.raw_center[0] - self.prev_raw[0]
self.panning_buffer.append(dx)
if len(self.panning_buffer) > self.panning_window:
self.panning_buffer.pop(0)
avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer)
is_panning = avg_dx > self.panning_threshold
self.prev_raw = self.raw_center
return is_panning
def _smooth_center(self) -> Tuple[float, float]:
sudden_jump = (
math.hypot(
self.raw_center[0] - self.prev_center[0],
self.raw_center[1] - self.prev_center[1],
)
> self.scene_jump_threshold
)
if not sudden_jump:
self.smooth_buffer.append(self.raw_center)
if len(self.smooth_buffer) > self.smooth_window:
self.smooth_buffer.pop(0)
avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer)
avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer)
center = (avg_x, avg_y)
else:
center = self.raw_center
self.smooth_buffer.clear()
self.prev_center = center
return center
def parse_args() -> argparse.Namespace:
script_dir = Path(__file__).resolve().parent
project_root = script_dir.parents[1]
default_input = project_root / "data" / "face_data_combined"
default_output = project_root / "data" / "face_crop_centers"
parser = argparse.ArgumentParser(
description="Track and smooth face crop centers based on mouth openness."
)
parser.add_argument(
"-i", "--input-dir", type=Path,
default=default_input,
help=f"Directory containing *_faces.json files (default: {default_input})"
)
parser.add_argument(
"-o", "--output-dir", type=Path,
default=default_output,
help=f"Directory to save *_centers.json files (default: {default_output})"
)
parser.add_argument(
"--dist-threshold", type=float, default=30.0,
help="Pixel distance threshold to switch speaker"
)
parser.add_argument(
"--switch-frames", type=int, default=20,
help="Number of consecutive frames required to confirm speaker switch"
)
parser.add_argument(
"--panning-window", type=int, default=30,
help="Frame window size for panning detection"
)
parser.add_argument(
"--panning-threshold", type=float, default=3.0,
help="Average dx threshold for panning detection"
)
parser.add_argument(
"--smooth-window", type=int, default=8,
help="Moving average window for smoothing"
)
parser.add_argument(
"--scene-jump-threshold", type=float, default=300.0,
help="Jump threshold to detect scene cuts"
)
return parser.parse_args()
def setup_logging() -> None:
logging.basicConfig(
format="%(asctime)s %(levelname)s: %(message)s",
level=logging.INFO,
)
def main() -> None:
setup_logging()
args = parse_args()
input_dir: Path = args.input_dir.resolve()
output_dir: Path = args.output_dir.resolve()
output_dir.mkdir(parents=True, exist_ok=True)
tracker = FaceTracker(
dist_threshold=args.dist_threshold,
switch_frames=args.switch_frames,
panning_window=args.panning_window,
panning_threshold=args.panning_threshold,
smooth_window=args.smooth_window,
scene_jump_threshold=args.scene_jump_threshold,
)
json_files = sorted(input_dir.glob("*_faces.json"))
if not json_files:
logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir)
return
logging.info("Gefundene Dateien: %d", len(json_files))
for json_path in json_files:
logging.info("Verarbeite %s", json_path.name)
try:
frames_data = json.loads(json_path.read_text())
except json.JSONDecodeError as e:
logging.error("JSON-Fehler in %s: %s", json_path.name, e)
continue
out_data: List[Dict[str, Any]] = []
for frame_idx, frame in enumerate(frames_data):
faces = frame.get("faces", [])
center, is_panning = tracker.process_frame(faces)
out_data.append({
"frame": frame_idx,
"center": [center[0], center[1]],
"panning": is_panning,
})
out_path = output_dir / f"{json_path.stem.replace('_faces', '')}_centers.json"
with out_path.open("w") as f:
json.dump(out_data, f, indent=2)
logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data))
if __name__ == "__main__":
main()

View File

@ -0,0 +1,180 @@
import json
import cv2
import subprocess
from pathlib import Path
# === Pfade & globale Settings ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
INPUT_CENTER_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
INPUT_FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_W, OUT_H = 1080, 1920
DEBUG_MODE = True
DEBUG_SCALE = 0.75
# Ab welcher Offenheit wir "Bewegung" annehmen
DEBUG_MOUTH_THRESHOLD = 0.02
# === Hilfsfunktionen ===
def clamp(v, lo, hi):
return max(lo, min(hi, v))
def compute_crop_width(orig_w, orig_h):
return int((OUT_W / OUT_H) * orig_h)
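# Worked example (assuming a 1920x1080 source): a 9:16 crop keeps the full
# frame height, so the crop width is 9/16 of the source height:
#     >>> compute_crop_width(1920, 1080)
#     607
# The crop is then resized to OUT_W x OUT_H = 1080x1920.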
# === Verarbeitung ===
for center_path in sorted(INPUT_CENTER_DIR.glob("*_centers.json")):
stem = center_path.stem.replace("_centers", "")
video_path = INPUT_VIDEO_DIR / f"{stem}.mp4"
faces_path = INPUT_FACES_DIR / f"{stem}_faces.json"
if not video_path.exists():
print(f"⚠️ Video fehlt: {stem}.mp4")
continue
if not faces_path.exists():
print(f"⚠️ Gesichtsdaten fehlen: {stem}_faces.json")
continue
centers_data = json.loads(center_path.read_text())
faces_data = json.loads(faces_path.read_text())
# Debug-Liste pro Video anlegen
if DEBUG_MODE:
debug_results: list = []
cap = cv2.VideoCapture(str(video_path))
fps = cap.get(cv2.CAP_PROP_FPS)
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
crop_w = compute_crop_width(orig_w, orig_h)
crop_h = orig_h
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
temp_vid = OUTPUT_DIR / f"{stem}_cropped.mp4"
out_vid = cv2.VideoWriter(str(temp_vid), fourcc, fps, (OUT_W, OUT_H))
if not out_vid.isOpened():
print(f"❌ Kann nicht schreiben: {temp_vid.name}")
continue
if DEBUG_MODE:
cv2.namedWindow("Debug", cv2.WINDOW_NORMAL)
frame_idx = 0
while True:
ret, frame = cap.read()
if not ret or frame_idx >= len(centers_data):
break
# Crop-Infos
info = centers_data[frame_idx]
cx, cy = info["center"]
is_panning = info.get("panning", False)
if is_panning:
cx = orig_w // 2
x0 = int(cx - crop_w / 2)
x0 = clamp(x0, 0, orig_w - crop_w)
y0 = 0
# Ausschneiden + Resize
crop = frame[y0:y0+crop_h, x0:x0+crop_w]
if crop.shape[1] != crop_w or crop.shape[0] != crop_h:
crop = cv2.copyMakeBorder(
crop, 0, crop_h - crop.shape[0],
0, crop_w - crop.shape[1],
cv2.BORDER_CONSTANT, value=[0, 0, 0]
)
out_frame = cv2.resize(crop, (OUT_W, OUT_H), interpolation=cv2.INTER_LINEAR)
out_vid.write(out_frame)
if DEBUG_MODE:
debug_frame = frame.copy()
frame_faces = faces_data[frame_idx].get("faces", [])
# Build debug entry for this frame
dbg_faces = []
for f in frame_faces:
# center und Offenheit
cx_f, cy_f = map(int, f["center"][:2])
openness = f.get("mouth_openness", 0.0)
moving = openness > DEBUG_MOUTH_THRESHOLD
dbg_faces.append({
"center": [cx_f, cy_f],
"mouth_openness": openness,
"mouth_moving": moving
})
# Anzeige im Debug-Fenster
cv2.circle(debug_frame, (cx_f, cy_f), 4, (180, 180, 180), -1)
cv2.putText(
debug_frame,
f"{round(openness,2)}",
(cx_f + 6, cy_f - 6),
cv2.FONT_HERSHEY_SIMPLEX,
0.5,
(255, 255, 255),
1,
cv2.LINE_AA
)
# roter Punkt, wenn Bewegung
color = (0,0,255) if moving else (0,255,255)
cv2.circle(debug_frame, (cx_f, cy_f), 6, color, 1)
debug_results.append({
"frame": frame_idx,
"faces": dbg_faces
})
# Haupt-Center & Crop-Rahmen
cv2.circle(debug_frame, (int(cx), int(cy)), 18, (0, 255, 0), 2)
cv2.rectangle(debug_frame, (x0, 0), (x0 + crop_w, crop_h), (0, 0, 255), 2)
dbg = cv2.resize(
debug_frame,
(int(orig_w * DEBUG_SCALE), int(orig_h * DEBUG_SCALE))
)
cv2.imshow("Debug", dbg)
if cv2.waitKey(1) & 0xFF == ord('q'):
break
frame_idx += 1
cap.release()
out_vid.release()
if DEBUG_MODE:
cv2.destroyAllWindows()
# Audio extrahieren & muxen
audio_tmp = OUTPUT_DIR / f"{stem}_audio.aac"
final_vid = OUTPUT_DIR / f"{stem}.mp4"
try:
subprocess.run(
["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "copy", str(audio_tmp)],
check=True
)
subprocess.run(
["ffmpeg", "-y", "-i", str(temp_vid), "-i", str(audio_tmp),
"-c:v", "copy", "-c:a", "aac", "-b:a", "128k", str(final_vid)],
check=True
)
finally:
try: temp_vid.unlink()
except OSError: pass
try: audio_tmp.unlink()
except OSError: pass
# Debug-JSON schreiben
if DEBUG_MODE:
dbg_path = OUTPUT_DIR / f"{stem}_debug.json"
with dbg_path.open("w") as f:
json.dump(debug_results, f, indent=2)
print(f"🛠️ Debug-Daten: {dbg_path.name}")
print(f"✅ Finales Video: {final_vid.name}")
print("\n🏁 Alle Videos fertig in:", OUTPUT_DIR.resolve())

View File

@ -0,0 +1,126 @@
import json
from pathlib import Path
from collections import defaultdict
import numpy as np
# === Einstellungen ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
INPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
OUTPUT_PATH = INPUT_DIR / "dominant_faces.json"
SEGMENT_LENGTH = 2.0 # Länge jedes Segments in Sekunden
MOUTH_THRESHOLD = 0.01 # minimale Mundöffnung, um einen Sprecher zu zählen
SMOOTH_WINDOW = 5 # Fenstergröße (in Segmenten) für Moving Average
def analyze_clip_timed(path):
# 1) JSON einlesen
try:
data = json.loads(path.read_text())
except Exception as e:
print(f"❌ Fehler beim Lesen von {path.name}: {e}")
return None
# 2) Nur valide Frames verwenden
frames = [d for d in data if "timestamp" in d and isinstance(d.get("faces"), list)]
if not frames:
print(f"⚠️ Keine validen Frames in {path.name}")
return None
frames.sort(key=lambda x: x["timestamp"])
max_time = frames[-1]["timestamp"]
# 3) Segmente erzeugen und dominanten Sprecher per Segment finden
segments = []
t = 0.0
while t < max_time:
t_end = t + SEGMENT_LENGTH
face_scores = defaultdict(list) # mouth_openness pro bbox
face_boxes = defaultdict(list) # raw bbox pro bbox-key
face_centers = defaultdict(list) # center [cx,cy,w,h] pro bbox-key
# alle Frames durchsuchen, die in dieses Segment fallen
for f in frames:
ts = f["timestamp"]
if t <= ts < t_end:
for face in f["faces"]:
bbox = face["bbox"] # [x,y,w,h]
score = face.get("mouth_openness", 0.0)
center = face.get("center", None) # [cx,cy,w,h]
key = tuple(bbox)
if score >= MOUTH_THRESHOLD and center is not None:
face_scores[key].append(score)
face_boxes[key].append(bbox)
face_centers[key].append(center)
if face_scores:
# den Key mit dem höchsten Durchschnittsscore wählen
avg_scores = {k: np.mean(v) for k, v in face_scores.items()}
dominant_key = max(avg_scores, key=avg_scores.get)
# mittlere BoundingBox und mittleres Center berechnen
avg_bbox = np.mean(face_boxes[dominant_key], axis=0).astype(int).tolist()
avg_center = np.mean(face_centers[dominant_key], axis=0).tolist() # [cx,cy,w,h]
segments.append({
"start": round(t, 2),
"end": round(t_end if t_end < max_time else max_time, 2),
"bbox": avg_bbox,
"center": [float(avg_center[0]), float(avg_center[1]), float(avg_center[2]), float(avg_center[3])]
})
t += SEGMENT_LENGTH
if not segments:
print(f"⚠️ Keine Segmente für Clip {path.name}")
return None
# 4) Glätten der SegmentZentren mit Moving Average
seg_centers = [s["center"] for s in segments] # Liste von [cx,cy,w,h]
sm_centers = []
n = len(seg_centers)
half = SMOOTH_WINDOW // 2
for i in range(n):
start = max(0, i - half)
end = min(n, i + half + 1)
window = seg_centers[start:end]
avg = np.mean(window, axis=0) # ergibt [cx,cy,w,h]
sm_centers.append(avg.tolist())
# 5) Ausgabe des geglätteten Pfades in die Konsole
print(f"\n🔄 Smoothed path für Clip {path.stem}:")
for i, s in enumerate(segments):
cx, cy, w, h = sm_centers[i]
print(f" Segment {i} [{s['start']}{s['end']}s]: "
f"center=({cx:.1f}, {cy:.1f}), size=({w:.1f}×{h:.1f})")
# 6) Neue Felder für AusgabeJSON bauen
sm_segments = []
for i, s in enumerate(segments):
cx, cy, w, h = sm_centers[i]
x0 = int(cx - w/2)
y0 = int(cy - h/2)
sm_segments.append({
"start": s["start"],
"end": s["end"],
"bbox": [x0, y0, int(w), int(h)]
})
return {
"clip": path.stem.replace("_faces", "") + ".mp4",
"segments": sm_segments
}
# === Hauptschleife über alle Clips ===
results = []
for json_file in sorted(INPUT_DIR.glob("*_faces.json")):
out = analyze_clip_timed(json_file)
if out:
results.append(out)
OUTPUT_PATH.write_text(json.dumps(results, indent=2))
print(f"\n✅ Analyse abgeschlossen {len(results)} Clips erkannt.")
print(f"📄 Gespeichert in: {OUTPUT_PATH.resolve()}")

View File

@ -0,0 +1,114 @@
import json
import cv2
import numpy as np
from pathlib import Path
from tqdm import tqdm
from collections import defaultdict, Counter
from sklearn.cluster import DBSCAN
# === Einstellungen ===
SCRIPT_DIR = Path(__file__).resolve().parent
VIDEO_DIR = SCRIPT_DIR.parents[1] / "output"
FACE_JSON_DIR = SCRIPT_DIR / "face_data_yolo"
OUTPUT_DIR = SCRIPT_DIR.parents[1] / "output_stacked_faces"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
OUT_WIDTH = 1080
OUT_HEIGHT = 1920
GRID_ROWS = 4
FACE_CROP_HEIGHT = OUT_HEIGHT // GRID_ROWS
FACE_CROP_WIDTH = OUT_WIDTH
# === Hilfsfunktion
def bbox_center(bbox):
x, y, w, h = bbox
return int(x + w // 2), int(y + h // 2)
# === Hauptverarbeitung ===
for json_path in tqdm(sorted(FACE_JSON_DIR.glob("*_faces.json")), desc="🔍 Erzeuge Grid-Clips"):
video_name = json_path.stem.replace("_faces", "") + ".mp4"
video_path = VIDEO_DIR / video_name
if not video_path.exists():
print(f"❌ Video nicht gefunden: {video_name}")
continue
data = json.loads(json_path.read_text())
# === Alle Gesichtszentren sammeln
all_faces = []
for frame in data:
for face in frame["faces"]:
center = bbox_center(face["bbox"])
all_faces.append((center, face["bbox"]))
if not all_faces:
print(f"⚠️ Keine Gesichter erkannt in {video_name}")
continue
# === Clustern
coords = [pos for pos, _ in all_faces]
clustering = DBSCAN(eps=80, min_samples=5).fit(coords)
cluster_labels = clustering.labels_
label_counts = Counter(cluster_labels)
most_common_labels = [lbl for lbl, _ in label_counts.most_common(GRID_ROWS) if lbl != -1]
if not most_common_labels:
print(f"⚠️ Keine gültigen Cluster in {video_name}")
continue
# === Zuordnung: cluster_id → feste Zeile
cluster_faces = defaultdict(list)
for (_, bbox), label in zip(all_faces, cluster_labels):
if label in most_common_labels:
cluster_faces[label].append(bbox)
def cluster_y(label):
return np.mean([bbox[1] for bbox in cluster_faces[label]])
sorted_labels = sorted(most_common_labels, key=cluster_y)
label_to_row = {lbl: idx for idx, lbl in enumerate(sorted_labels)}
# === cluster_id zu jedem Gesicht hinzufügen
for frame in data:
for face in frame["faces"]:
center = bbox_center(face["bbox"])
distances = [np.linalg.norm(np.array(center) - np.array(c)) for c in coords]
nearest = np.argmin(distances)
label = cluster_labels[nearest]
face["cluster_id"] = label
# === Video verarbeiten
cap = cv2.VideoCapture(str(video_path))
fps = cap.get(cv2.CAP_PROP_FPS)
out_path = OUTPUT_DIR / video_name.replace(".mp4", "_stacked.mp4")
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
writer = cv2.VideoWriter(str(out_path), fourcc, fps, (OUT_WIDTH, OUT_HEIGHT))
frame_idx = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret or frame_idx >= len(data):
break
output_frame = np.zeros((OUT_HEIGHT, OUT_WIDTH, 3), dtype=np.uint8)
for face in data[frame_idx]["faces"]:
label = face.get("cluster_id", -1)
if label not in label_to_row:
continue
row = label_to_row[label]
x, y, w, h = face["bbox"]
crop = frame[y:y+h, x:x+w]
if crop.size == 0:
continue
resized = cv2.resize(crop, (FACE_CROP_WIDTH, FACE_CROP_HEIGHT))
y_offset = row * FACE_CROP_HEIGHT
output_frame[y_offset:y_offset+FACE_CROP_HEIGHT, :] = resized
writer.write(output_frame)
frame_idx += 1
cap.release()
writer.release()
print(f"✅ Exportiert: {out_path.name}")
print("🏁 Alle Grid-Videos fertig.")

View File

@ -0,0 +1,75 @@
import cv2
import json
from pathlib import Path
from tqdm import tqdm
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parents[1] # ← geht von /src/reformat zu /BachlorArbeit
FACES_DIR = PROJECT_DIR / "data" / "face_data_combined"
INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips"
OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# === Alle *_faces.json Dateien durchgehen ===
face_files = sorted(FACES_DIR.glob("*_faces.json"))
for face_file in tqdm(face_files, desc="🔍 Erzeuge Vorschau mit Sprechererkennung"):
clip_name = face_file.stem.replace("_faces", "") + ".mp4"
input_path = INPUT_VIDEO_DIR / clip_name
output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4")
if not input_path.exists():
print(f"❌ Clip nicht gefunden: {clip_name}")
continue
# Video-Setup
cap = cv2.VideoCapture(str(input_path))
fps = cap.get(cv2.CAP_PROP_FPS)
fps = fps if fps > 1 else 25 # fallback falls FPS = 0
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fourcc = cv2.VideoWriter_fourcc(*"avc1") # Kompatibler als mp4v
out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
# Gesichts-Daten laden
data = json.loads(face_file.read_text())
data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]}
print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}")
frame_idx = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
faces = data_by_frame.get(frame_idx, [])
speaker_idx = None
# Sprecher anhand Mundöffnung
if faces and all("mouth_openness" in f for f in faces):
mouth_vals = [f["mouth_openness"] for f in faces]
if any(v > 0.01 for v in mouth_vals): # einfache Aktivitäts-Schwelle
speaker_idx = mouth_vals.index(max(mouth_vals))
for i, face in enumerate(faces):
x, y, w, h = face["bbox"]
color = (0, 255, 0) if i == speaker_idx else (255, 255, 255)
label = f"Mouth: {face.get('mouth_openness', 0):.2f}"
# Debug-Ausgabe (optional)
print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}")
cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
cv2.putText(frame, label, (x, y - 10),
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
out.write(frame)
frame_idx += 1
cap.release()
out.release()
print(f"✅ Vorschau exportiert: {output_path.name}")
print("🏁 Alle Vorschauvideos mit Sprecherkennung erstellt.")

View File

@ -0,0 +1,92 @@
import cv2
import mediapipe as mp
import json
from pathlib import Path
from tqdm import tqdm
# === Einstellungen ===
INPUT_DIR = Path(__file__).resolve().parents[2] / "output"
OUTPUT_DIR = Path(__file__).resolve().parent / "face_data"
OUTPUT_DIR.mkdir(exist_ok=True)
FRAME_SKIP = 1 # analysiere jeden Frame für maximale Genauigkeit
PADDING = 30 # Pixel Padding um Gesicht
mp_face_mesh = mp.solutions.face_mesh
# Erweiterte Lippen-Landmarks (innen)
TOP_LIPS = [13, 78, 82]
BOTTOM_LIPS = [14, 87, 317]
def mouth_openness(landmarks, image_height):
try:
top_avg = sum([landmarks[i].y for i in TOP_LIPS]) / len(TOP_LIPS)
bottom_avg = sum([landmarks[i].y for i in BOTTOM_LIPS]) / len(BOTTOM_LIPS)
return abs(bottom_avg - top_avg)
except (IndexError, AttributeError):
return 0.0
def process_video(path):
cap = cv2.VideoCapture(str(path))
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
results = []
with mp_face_mesh.FaceMesh(
static_image_mode=False,
max_num_faces=5,
refine_landmarks=True,
min_detection_confidence=0.6,
min_tracking_confidence=0.6
) as face_mesh:
frame_idx = 0
while cap.isOpened():
ret, frame = cap.read()
if not ret:
break
if frame_idx % FRAME_SKIP != 0:
frame_idx += 1
continue
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
output = face_mesh.process(rgb)
faces = []
if output.multi_face_landmarks:
for landmarks in output.multi_face_landmarks:
mouth = mouth_openness(landmarks.landmark, height)
xs = [lm.x * width for lm in landmarks.landmark]
ys = [lm.y * height for lm in landmarks.landmark]
x1 = max(0, int(min(xs)) - PADDING)
y1 = max(0, int(min(ys)) - PADDING)
x2 = min(width, int(max(xs)) + PADDING)
y2 = min(height, int(max(ys)) + PADDING)
bbox = [x1, y1, x2 - x1, y2 - y1]
faces.append({
"bbox": bbox,
"mouth_openness": round(mouth, 4)
})
results.append({
"frame": frame_idx,
"timestamp": round(frame_idx / fps, 2),
"faces": faces
})
frame_idx += 1
cap.release()
out_path = OUTPUT_DIR / f"{path.stem}_faces.json"
out_path.write_text(json.dumps(results, indent=2))
print(f"{path.name} verarbeitet → {out_path.name}")
# === Alle Videos im output/ Ordner durchgehen
videos = list(INPUT_DIR.glob("*.mp4"))
print(f"🎬 {len(videos)} Videos gefunden in: {INPUT_DIR}")
for video in tqdm(videos):
process_video(video)

View File

@ -0,0 +1,206 @@
#!/usr/bin/env python3
import argparse
import logging
import json
from pathlib import Path
import cv2
from ultralytics import YOLO
import mediapipe as mp
# === Pfade und Standardwerte ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt"
# Stelle sicher, dass das Standard-Output-Verzeichnis existiert
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# === Landmarks für Lippen ===
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
def get_mouth_openness(landmarks, image_height):
"""
Berechnet die Mundöffnung in Pixeln basierend auf normierten Landmark-Koordinaten.
"""
top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
return abs(bottom_avg - top_avg) * image_height
def iou(boxA, boxB):
"""Berechnet Intersection-over-Union zweier Bounding-Boxes im Format (x, y, w, h)."""
ax1, ay1, aw, ah = boxA
ax2, ay2 = ax1 + aw, ay1 + ah
bx1, by1, bw, bh = boxB
bx2, by2 = bx1 + bw, by1 + bh
inter_x1 = max(ax1, bx1)
inter_y1 = max(ay1, by1)
inter_x2 = min(ax2, bx2)
inter_y2 = min(ay2, by2)
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
union_area = aw * ah + bw * bh - inter_area
return inter_area / union_area if union_area > 0 else 0
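# Worked example (hypothetical boxes in (x, y, w, h) format):
#     >>> iou((0, 0, 100, 100), (50, 0, 100, 100))
#     0.3333333333333333
# The boxes overlap in a 50x100 region: 5000 / (10000 + 10000 - 5000) = 1/3.
# Matches above the 0.2 threshold used below are treated as the same face.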
def process_video(
video_path: Path,
output_path: Path,
model: YOLO,
face_mesh: mp.solutions.face_mesh.FaceMesh,
conf_thresh: float,
frame_skip: int,
downscale: float,
):
cap = cv2.VideoCapture(str(video_path))
if not cap.isOpened():
logging.error(f"Kann Video nicht öffnen: {video_path}")
return
fps = cap.get(cv2.CAP_PROP_FPS)
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)
# JSON-Ausgabe mit Streaming
with output_path.open('w', encoding='utf-8') as f_out:
f_out.write('[\n')
first = True
frame_idx = 0
while True:
ret, frame = cap.read()
if not ret:
break
if frame_skip > 1 and frame_idx % frame_skip != 0:
frame_idx += 1
continue
if downscale != 1.0:
frame = cv2.resize(frame, (width, height))
detections = model(frame, verbose=False)[0]
yolo_boxes = []
for box in detections.boxes:
conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf)
if conf < conf_thresh:
continue
coords = box.xyxy[0].cpu().numpy()
x1, y1, x2, y2 = map(int, coords)
yolo_boxes.append([x1, y1, x2 - x1, y2 - y1])
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
mp_result = face_mesh.process(rgb)
mp_faces = []
if mp_result.multi_face_landmarks:
for landmarks in mp_result.multi_face_landmarks:
mouth_px = get_mouth_openness(landmarks.landmark, height)
xs = [lm.x * width for lm in landmarks.landmark]
ys = [lm.y * height for lm in landmarks.landmark]
x1, y1 = int(min(xs)), int(min(ys))
x2, y2 = int(max(xs)), int(max(ys))
mp_faces.append({
"bbox": [x1, y1, x2 - x1, y2 - y1],
"mouth_openness": round(mouth_px, 1)
})
combined = []
for yb in yolo_boxes:
if mp_faces:
best = max(mp_faces, key=lambda m: iou(yb, m["bbox"]))
best_iou = iou(yb, best["bbox"])
mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0
else:
mouth = 0.0
x, y, w, h = yb
cx, cy = x + w / 2, y + h / 2
combined.append({
"bbox": yb,
"mouth_openness": round(mouth, 1),
"center": [round(cx, 1), round(cy, 1), w, h]
})
result = {
"frame": frame_idx,
"timestamp": round(frame_idx / fps, 3),
"faces": combined
}
if not first:
f_out.write(',\n')
json.dump(result, f_out, ensure_ascii=False)
first = False
frame_idx += 1
f_out.write('\n]')
cap.release()
logging.info(f"Verarbeitet: {video_path.name}{output_path.name}")
def main():
parser = argparse.ArgumentParser(
description="Analyse von Videos: Gesichter und Mundöffnung erkennen"
)
parser.add_argument(
"--input-dir", type=Path,
default=DEFAULT_INPUT_DIR,
help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})"
)
parser.add_argument(
"--output-dir", type=Path,
default=DEFAULT_OUTPUT_DIR,
help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})"
)
parser.add_argument(
"--model", type=Path,
default=DEFAULT_MODEL_PATH,
help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})"
)
parser.add_argument(
"--conf-thresh", type=float, default=0.5,
help="Schwelle für YOLO-Confidence"
)
parser.add_argument(
"--frame-skip", type=int, default=1,
help="Nur jede n-te Frame verarbeiten"
)
parser.add_argument(
"--downscale", type=float, default=1.0,
help="Skalierungsfaktor für Frames"
)
args = parser.parse_args()
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
args.output_dir.mkdir(parents=True, exist_ok=True)
yolo = YOLO(str(args.model))
face_mesh = mp.solutions.face_mesh.FaceMesh(
static_image_mode=False,
max_num_faces=5,
refine_landmarks=True,
min_detection_confidence=0.5,
min_tracking_confidence=0.5
)
for video_path in sorted(args.input_dir.glob("*.mp4")):
out_path = args.output_dir / f"{video_path.stem}_faces.json"
process_video(
video_path,
out_path,
yolo,
face_mesh,
args.conf_thresh,
args.frame_skip,
args.downscale,
)
if __name__ == "__main__":
main()

12
src/reformat/speaking.py Normal file
View File

@ -0,0 +1,12 @@
# src/speaking.py
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
def get_mouth_openness(landmarks, image_height):
"""
Berechnet die Mundöffnung basierend auf MediaPipe-Landmarks.
"""
top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
return abs(bottom_avg - top_avg) * image_height
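# Minimal self-check, only run when this helper is executed directly (an
# assumption for illustration; the pipeline itself just imports the function).
# The landmark container is stubbed with a namedtuple, so no MediaPipe
# installation is needed for this sketch.
if __name__ == "__main__":
    from collections import namedtuple
    Point = namedtuple("Point", "y")
    # 478 dummy landmarks: lower-lip indices at y=0.52, everything else at y=0.50
    dummy = [Point(0.52 if i in BOTTOM_LIPS else 0.50) for i in range(478)]
    print(get_mouth_openness(dummy, 1080))  # ≈ 0.02 * 1080 = 21.6 px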

View File

@ -0,0 +1,265 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
add_subtitles.py – TikTok-Word-Caps mit OpenAI Whisper (CPU)
- läuft Ordner-weise über 9:16-Kurzclips
- transkribiert mit word_timestamps=True
- erzeugt ASS (ein Wort pro Zeile, Pop-Animation, Bottom-Center)
- brennt via ffmpeg in *_subtitled.mp4
"""
import os
import re
import glob
import json
import subprocess
import tempfile
import traceback
import argparse
from typing import List, Tuple, Optional
from pathlib import Path
import sys
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/subtitles/)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import CROPPED_DIR, SUBTITLED_DIR # zentrale Pfade
# --- Stabil auf CPU (vermeidet MPS-Sparse-Fehler) ---
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
os.environ["CUDA_VISIBLE_DEVICES"] = ""
def log(*a): print("[LOG]", *a)
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True)
def has_audio_stream(video_path: str) -> bool:
cmd = ["ffprobe","-v","error","-select_streams","a","-show_entries","stream=index","-of","json",video_path]
try:
out = subprocess.check_output(cmd).decode("utf-8")
data = json.loads(out)
return bool(data.get("streams"))
except Exception:
return False
def load_whisper_cpu(model_name: str):
import whisper # openai-whisper
device = "cpu"
model = whisper.load_model(model_name, device=device)
fp16 = False
return model, device, fp16
def transcribe_words_whisper(model, media_path: str, language: Optional[str], fp16: bool) -> List[Tuple[float,float,str]]:
"""
Nutzt 'openai-whisper' mit word_timestamps=True.
Fallback: wenn 'words' fehlen, werden Segmenttexte approx. auf Wörter verteilt.
"""
result = model.transcribe(
media_path,
language=language,
task="transcribe",
word_timestamps=True,
condition_on_previous_text=False,
verbose=False,
fp16=fp16
)
words: List[Tuple[float,float,str]] = []
segs = result.get("segments", []) or []
for seg in segs:
wlist = seg.get("words")
if isinstance(wlist, list) and wlist and all(isinstance(w, dict) for w in wlist):
for w in wlist:
t = (w.get("word") or w.get("text") or "").strip()
if not t:
continue
ws = w.get("start"); we = w.get("end")
if ws is None or we is None:
continue
t = re.sub(r"\s+", " ", t)
if t:
words.append((float(ws), float(we), t))
else:
# Fallback: Segment auf Wörter aufteilen & Zeiten gleichmäßig verteilen
text = (seg.get("text") or "").strip()
if not text:
continue
seg_start = float(seg.get("start", 0.0))
seg_end = float(seg.get("end", seg_start))
toks = [w for w in re.split(r"(\s+)", text) if w.strip()]
if not toks or seg_end <= seg_start:
continue
dur = seg_end - seg_start
step = dur / len(toks)
for i, tok in enumerate(toks):
ws = seg_start + i * step
we = seg_start + (i+1) * step
words.append((ws, we, tok))
return words
def ass_time(t: float) -> str:
if t < 0: t = 0
h = int(t // 3600); m = int((t % 3600)//60); s = int(t % 60); cs = int(round((t - int(t))*100))
return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}"
def write_ass_words(words: List[Tuple[float,float,str]], ass_path: Path, font_size: int, margin_v: int, uppercase: bool):
"""
Ein Wort pro Zeile, ohne Überlappung:
- Ende = min(eigene Endzeit, Start nächstes Wort - 0.02)
- Pop-Animation 150ms, fette Outline, Bottom-Center (PlayResY=1920)
"""
header = f"""[Script Info]
ScriptType: v4.00+
Collisions: Normal
PlayResX: 1080
PlayResY: 1920
ScaledBorderAndShadow: yes
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: WordCap,Inter,{font_size},&H00FFFFFF,&H00FFFFFF,&H00101010,&H64000000,1,0,0,0,100,100,0,0,1,6,0.8,2,80,80,{margin_v},1
[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
# Zeiten glätten, damit immer nur ein Wort sichtbar ist
adjusted = []
for i, (s, e, t) in enumerate(words):
nstart = words[i+1][0] if i+1 < len(words) else e
new_end = min(e, nstart - 0.02) if nstart > s else e
if new_end <= s:
new_end = s + 0.06
adjusted.append((s, new_end, t))
with open(ass_path, "w", encoding="utf-8") as f:
f.write(header)
for s, e, t in adjusted:
st, en = ass_time(s), ass_time(e)
txt = t.upper() if uppercase else t
# \fs sichere Größe, \blur für weiche Outline, \fad Ein/Aus,
# \fscx135\fscy135 → Start groß, \t(...) schrumpft in 150ms auf 100% = Pop
overrides = r"\blur1\bord8\1c&H0000FFFF&\3c&H000000&\4c&H000000&\fad(50,20)\fscx135\fscy135\t(0,150,\fscx100\fscy100)"
f.write(f"Dialogue: 0,{st},{en},WordCap,,0,0,0,,{{{overrides}}}{txt}\n")
def ffmpeg_escape_for_subtitles(path: Path) -> str:
"""
Pfad für -vf subtitles= escapen (für Leerzeichen, Doppelpunkte etc.).
ffmpeg erwartet Backslash-escaping für Filter-Argumente.
"""
s = str(path)
s = s.replace("\\", "\\\\")
s = s.replace(":", "\\:")
s = s.replace("'", "\\'")
s = s.replace(",", "\\,")
s = s.replace("[", "\\[")
s = s.replace("]", "\\]")
s = s.replace(";", "\\;")
s = s.replace("=", "\\=")
return s
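# Example (hypothetical path): for /tmp/clip's captions.ass the apostrophe is
# escaped to \' and a drive-style "C:" prefix becomes "C\:", so the subtitles
# filter argument is not split at the colon. Spaces are left untouched because
# the whole -vf value is passed as a single argv element.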
def burn(video_in: Path, ass_file: Path, out_path: Path, crf=18, preset="medium") -> int:
vf = f"subtitles={ffmpeg_escape_for_subtitles(ass_file)}"
cmd = [
"ffmpeg","-y","-i",str(video_in),
"-vf", vf,
"-c:v","libx264","-preset",preset,"-crf",str(crf),
"-c:a","copy",
str(out_path)
]
log("FFmpeg:", " ".join(cmd))
return subprocess.call(cmd)
def parse_args():
p = argparse.ArgumentParser(description="Brennt Word-Caps (ASS) via Whisper-Transkription in 9:16-Clips.")
p.add_argument("--clips_dir", type=Path, default=CROPPED_DIR, help=f"Quellordner (Default: {CROPPED_DIR})")
p.add_argument("--out_dir", type=Path, default=SUBTITLED_DIR, help=f"Zielordner (Default: {SUBTITLED_DIR})")
p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster (Default: *.mp4)")
p.add_argument("--limit", type=int, default=None, help="Nur die ersten N Clips verarbeiten")
p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "medium"), help="Whisper-Modell")
p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. de, en, None=Auto)")
p.add_argument("--uppercase", action="store_true", help="Text in Großbuchstaben rendern")
p.add_argument("--font_size", type=int, default=108, help="ASS-Fontgröße")
p.add_argument("--margin_v", type=int, default=320, help="ASS-MarginV (Abstand vom unteren Rand)")
p.add_argument("--crf", type=int, default=18, help="ffmpeg CRF (Qualität)")
p.add_argument("--preset", type=str, default="medium", help="ffmpeg Preset")
return p.parse_args()
def main():
args = parse_args()
clips_dir = args.clips_dir
output_dir = args.out_dir
ensure_dir(output_dir)
log("Starte TikTok Word-Caps (Whisper)")
log("CLIPS_DIR =", clips_dir)
log("OUTPUT_DIR =", output_dir)
clips: List[str] = []
for pat in (args.pattern,):
clips += glob.glob(str(clips_dir / pat))
clips.sort()
log(f"{len(clips)} Clips gefunden.")
if args.limit:
clips = clips[:args.limit]
log(f"LIMIT aktiv: {args.limit}")
if not clips:
log("Keine Clips gefunden. Pfad/Pattern checken.")
return
# Whisper laden (CPU)
try:
model, device, fp16 = load_whisper_cpu(args.model)
log(f"Whisper geladen: {args.model} auf {device} (fp16={fp16})")
log("Hinweis: Beim ersten Lauf kann das Modell nachgeladen werden.")
except Exception as e:
print("[ERROR] Whisper konnte nicht geladen werden:", e)
traceback.print_exc()
return
lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
for clip in clips:
base = os.path.basename(clip)
stem, _ = os.path.splitext(base)
log("="*60)
log("Clip:", base)
if not has_audio_stream(clip):
log("WARN: Keine Audio-Spur → übersprungen.")
continue
# Transkription
try:
log("Transkription startet …")
words = transcribe_words_whisper(model, clip, language=lang, fp16=fp16)
log(f"Transkription fertig. {len(words)} Wörter.")
if not words:
log("WARN: 0 Wörter erkannt → übersprungen.")
continue
except Exception as e:
print("[ERROR] Transkription fehlgeschlagen:", e)
traceback.print_exc()
continue
# ASS erzeugen & brennen
with tempfile.NamedTemporaryFile(suffix=".ass", delete=False) as tmp:
ass_path = Path(tmp.name)
try:
log("Erzeuge ASS …")
write_ass_words(words, ass_path, font_size=args.font_size, margin_v=args.margin_v, uppercase=args.uppercase)
out_path = output_dir / f"{stem}_subtitled.mp4"
log("Brenne Untertitel …")
rc = burn(Path(clip), ass_path, out_path, crf=args.crf, preset=args.preset)
if rc == 0:
log("OK:", out_path)
else:
log("ERROR: ffmpeg fehlgeschlagen, code", rc)
finally:
try: ass_path.unlink(missing_ok=True)
except Exception: pass
log("Fertig.")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,25 @@
import os
import tempfile
from add_subtitles import process # wir nutzen die Logik aus dem großen Skript
# ==== HIER EINSTELLEN ====
VIDEO_PATH = "data/input.mp4" # Dein Video
TRANSCRIPT_PATH = "data/transcript.srt" # Oder .json (Whisper)
OUTPUT_DIR = "data/output" # Ordner für Ergebnisse
CLIPS_PATH = None # Optional: "data/clips.csv" oder "data/clips.json"
CRF = 18
PRESET = "medium"
STYLE = r"\\bord4\\shad4\\outline3\\fs64\\b1\\1c&HFFFFFF&\\3c&H000000&\\4c&H000000&"
# ==========================
if __name__ == "__main__":
os.makedirs(OUTPUT_DIR, exist_ok=True)
process(
video_path=VIDEO_PATH,
transcript_path=TRANSCRIPT_PATH,
output_dir=OUTPUT_DIR,
clips_path=CLIPS_PATH,
crf=CRF,
preset=PRESET,
style_overrides=STYLE,
)

100
src/text/cutClips.py Normal file
View File

@ -0,0 +1,100 @@
#!/usr/bin/env python3
# cutClips.py — exportiert Clips aus dem ersten gefundenen Video oder aus angegebener Datei
from pathlib import Path
import sqlite3
import argparse
from moviepy.video.io.VideoFileClip import VideoFileClip
import sys
# ── Projektwurzel in sys.path aufnehmen
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import INPUT_DIR, RAW_CLIPS_DIR, DB_PATH
def parse_args():
p = argparse.ArgumentParser(description="Exportiert Highlights aus dem Video gemäß SQLite-DB.")
p.add_argument("--file", type=str, default=None,
help="Name der Input-Datei im INPUT_DIR. Wenn leer, wird das erste Video im Ordner verwendet.")
p.add_argument("--limit", type=int, default=10,
help="Anzahl der zu exportierenden Clips (Default: 10)")
p.add_argument("--order", type=str, choices=["score", "start"], default="score",
help="Sortierung: 'score' (score_total absteigend) oder 'start' (zeitlich).")
return p.parse_args()
def find_first_video(directory: Path):
"""Suche nach der ersten Videodatei im Verzeichnis (mp4, mov, mkv)."""
for ext in ("*.mp4", "*.mov", "*.mkv"):
files = sorted(directory.glob(ext))
if files:
return files[0]
return None
def main():
args = parse_args()
# === Eingabevideo bestimmen ===
if args.file:
input_video = INPUT_DIR / args.file
else:
input_video = find_first_video(INPUT_DIR)
if not input_video:
raise FileNotFoundError(f"🚫 Kein Video im Eingabeordner {INPUT_DIR} gefunden.")
print(f"📂 Kein --file angegeben → verwende automatisch: {input_video.name}")
if not input_video.exists():
raise FileNotFoundError(f"🚫 Input-Video nicht gefunden: {input_video}")
output_dir = RAW_CLIPS_DIR
output_dir.mkdir(parents=True, exist_ok=True)
# === SQLite DB lesen ===
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
order_clause = "ORDER BY score_total DESC" if args.order == "score" else "ORDER BY start ASC"
cursor.execute(f"""
SELECT start, end, text
FROM highlights
{order_clause}
LIMIT ?
""", (args.limit,))
highlights = cursor.fetchall()
if not highlights:
print("⚠️ Keine Highlights in der Datenbank gefunden.")
conn.close()
return
# === Video laden ===
video = VideoFileClip(str(input_video))
# === Clips schneiden ===
for i, (start, end, text) in enumerate(highlights, start=1):
if start >= video.duration:
print(f"⚠️ Clip {i} übersprungen Startzeit {start:.2f}s liegt außerhalb der Videolänge ({video.duration:.2f}s).")
continue
end = min(end, video.duration)
output_file = output_dir / f"highlight_{i}.mp4"
print(f"🎬 Exportiere Clip {i}: {start:.2f}s {end:.2f}s → {output_file.name}")
try:
clip = video.subclipped(start, end)
clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac", logger=None)
clip.close()
except Exception as e:
print(f"❌ Fehler beim Export von Clip {i}: {e}")
# === Cleanup ===
conn.close()
video.close()
print(f"{len(highlights)} Clips exportiert nach {output_dir}")
if __name__ == "__main__":
main()

View File

@ -2,44 +2,41 @@ import sqlite3
import re
from openai import OpenAI
from time import sleep
from pathlib import Path
import os
from pathlib import Path
import sys
# Projekt-Root einfügen (2 Ebenen hoch von src/* ausgehend)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import DB_PATH
# === Einstellungen ===
DB_PATH = "clips_openai.db"
VIDEO_ID = "testVideoShort"
MAX_CLIPS = 5 # oder "all"
OPENAI_API_KEY = "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"
client = OpenAI(api_key=OPENAI_API_KEY)
# === OPENAI-CLIENT (API-Key aus Env) ===
if not os.getenv("OPENAI_API_KEY"):
raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# === DB-Verbindung
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()
cursor.execute("DROP TABLE IF EXISTS highlights")
# === Unbewertete Highlights laden
cursor.execute("""
CREATE TABLE highlights (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file TEXT,
start REAL,
end REAL,
text TEXT,
viralitaet INTEGER,
emotionalitaet INTEGER,
witz INTEGER,
provokation INTEGER,
score_total INTEGER
)
SELECT id, start, end, text FROM highlights
WHERE viralitaet IS NULL OR emotionalitaet IS NULL
ORDER BY start
""")
conn.commit()
print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}")
# === Segmente laden
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
segments = cursor.fetchall()
print(f"📥 {len(segments)} Segmente (Originaltext) geladen.")
print(f"📥 {len(segments)} unbewertete Highlights geladen.")
# === Bewertungsfunktion (GPT-4o)
def analyse_segment(text, start, end):
def analyse_segment(clip_id, text, start, end):
print(f"\n🔎 Bewerte Clip: {start:.2f}s {end:.2f}s")
prompt = f"""
@ -86,19 +83,19 @@ Provokation: [Zahl]
if all(v is not None for v in values.values()):
total_score = sum(values.values())
cursor.execute("""
INSERT INTO highlights (
file, start, end, text,
viralitaet, emotionalitaet, witz, provokation, score_total
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
UPDATE highlights SET
viralitaet = ?, emotionalitaet = ?, witz = ?, provokation = ?, score_total = ?
WHERE id = ?
""", (
VIDEO_ID, start, end, text.strip(),
values["viralitaet"], values["emotionalitaet"],
values["witz"], values["provokation"],
total_score
total_score,
clip_id
))
conn.commit()
return {
"id": clip_id,
"start": start,
"end": end,
"text": text.strip(),
@ -113,8 +110,8 @@ Provokation: [Zahl]
# === Clips bewerten
rated = []
for start, end, text in segments:
result = analyse_segment(text, float(start), float(end))
for clip_id, start, end, text in segments:
result = analyse_segment(clip_id, text, float(start), float(end))
if result:
rated.append(result)
sleep(1.2) # Anti-Rate-Limit
@ -123,7 +120,7 @@ for start, end, text in segments:
rated.sort(key=lambda x: x["total"], reverse=True)
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]
print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}")
print(f"\n🎬 Beste {len(selected)} Highlights nach Bewertung:")
for clip in selected:
print(f"\n🚀 {clip['start']:.2f}s {clip['end']:.2f}s")
print(f"🎙️ {clip['text'][:200]}...")

View File

@ -0,0 +1,409 @@
#!/usr/bin/env python3
# clip_selector_optimized.py — word-based text rebuild (no duplicates)
import os
import re
import json
import sqlite3
import time
from pathlib import Path
from datetime import datetime
import argparse
import sys
from typing import List, Dict, Optional
from openai import OpenAI
# ── Projektwurzel in sys.path aufnehmen (dieses Skript kann z. B. unter src/text/ liegen)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import TRANSCRIPTS_DIR, DB_PATH # zentrale Pfade
LOG_DIR = ROOT / "logs"
LOG_DIR.mkdir(exist_ok=True, parents=True)
# === DEFAULTS (per CLI überschreibbar) ===
DEFAULT_BLOCK_DURATION = 300.0 # Sek. pro Block
DEFAULT_MIN_CLIP_LEN = 30.0 # konsistent mit Prompt
DEFAULT_MAX_CLIP_LEN = 90.0
# === OPENAI-CLIENT (API-Key aus Env) ===
API_KEY = os.getenv("OPENAI_API_KEY")
if not API_KEY:
raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # bei Bedarf überschreiben
client = OpenAI(api_key=API_KEY)
# ──────────────────────────────────────────────────────────────────────────────
# Hilfsfunktionen
# ──────────────────────────────────────────────────────────────────────────────
def log_text(filename: str, content: str) -> None:
(LOG_DIR / filename).write_text((content or "").strip(), encoding="utf-8")
def append_error_log(content: str) -> None:
with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
f.write(f"{datetime.now().isoformat()} {content}\n\n")
def extract_json(text: str) -> list:
"""Nur für Debug: versucht JSON-Array aus beliebigem Text zu extrahieren."""
txt = (text or "").strip()
txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.IGNORECASE | re.DOTALL)
m = re.search(r"\[\s*{.*?}\s*\]", txt, re.DOTALL)
if not m:
append_error_log(f"❌ Kein JSON-Array gefunden.\n{txt}")
return []
try:
return json.loads(m.group(0))
except Exception as e:
append_error_log(f"❌ JSON-Fehler: {e}\n{txt}")
return []
def get_json_snippets_for_clip(start: float, end: float, segment_json: List[Dict]) -> List[Dict]:
"""halb-offenes Fenster [start, end)"""
return [s for s in segment_json if not (float(s["end"]) <= start or float(s["start"]) >= end)]
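# Example (hypothetical segments): for a clip window [10.0, 20.0), a segment
# spanning 8.0-12.0 overlaps and is kept, while one starting exactly at 20.0
# is dropped because the window is half-open on the right.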
def _norm_space(s: str) -> str:
return re.sub(r"\s+", " ", (s or "").strip())
def explode_segments_to_words(segments: List[Dict]) -> List[Dict]:
"""
Baut eine globale Wortliste. Bevorzugt echte 'words' aus JSON,
fällt ansonsten auf lineare Interpolation über Segmentdauer zurück.
Ausgabe-Items: {idx, mid, text}
"""
words = []
idx = 0
for seg in sorted(segments, key=lambda s: (float(s["start"]), float(s["end"]))):
s0, s1 = float(seg["start"]), float(seg["end"])
txt = (seg.get("text") or "").strip()
seg_words = seg.get("words") or []
if seg_words:
for w in seg_words:
t = (w.get("text") or w.get("word") or "").strip()
if not t:
continue
w0 = float(w["start"]); w1 = float(w["end"])
words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": t})
idx += 1
else:
toks = txt.split()
n = len(toks)
if n == 0:
continue
dur = max(1e-6, s1 - s0)
for i, tok in enumerate(toks):
w0 = s0 + (i / n) * dur
w1 = s0 + ((i + 1) / n) * dur
words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": tok})
idx += 1
return words
def build_text_strict_from_words(clip_start: float, clip_end: float, WORDS: List[Dict]) -> str:
"""Nimmt jedes Wort genau einmal, wenn mid ∈ [start, end)."""
sel = [w for w in WORDS if clip_start <= w["mid"] < clip_end]
sel.sort(key=lambda w: w["idx"])
return _norm_space(" ".join(w["text"] for w in sel))
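# Example (illustrative word list): with word midpoints 9.8, 10.2 and 19.9 and
# a clip window [10.0, 20.0), only the words at 10.2 and 19.9 are selected,
# each exactly once and in corpus order, so overlapping source segments cannot
# duplicate text in the DB.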
def find_transcript_pair(base: Optional[str]) -> tuple[Path, Path, str]:
"""
Finde (timed.txt, segments.json) in TRANSCRIPTS_DIR.
- Wenn base übergeben: benutzt {base}_timed.txt und {base}_segments.json.
- Sonst: nimmt das lexikographisch erste *_timed.txt und leitet die JSON davon ab.
"""
if base:
txt = TRANSCRIPTS_DIR / f"{base}_timed.txt"
jsn = TRANSCRIPTS_DIR / f"{base}_segments.json"
if not txt.exists():
raise FileNotFoundError(f"Transkript nicht gefunden: {txt}")
if not jsn.exists():
raise FileNotFoundError(f"Segment-JSON nicht gefunden: {jsn}")
return txt, jsn, base
# auto-detect
candidates = sorted(TRANSCRIPTS_DIR.glob("*_timed.txt"))
if not candidates:
raise FileNotFoundError(f"Keine *_timed.txt in {TRANSCRIPTS_DIR} gefunden.")
txt = candidates[0]
stem = txt.stem.replace("_timed", "")
jsn = TRANSCRIPTS_DIR / f"{stem}_segments.json"
if not jsn.exists():
raise FileNotFoundError(f"Gefundenes TXT: {txt.name}, aber JSON fehlt: {jsn.name}")
return txt, jsn, stem
# ──────────────────────────────────────────────────────────────────────────────
# CLI
# ──────────────────────────────────────────────────────────────────────────────
def parse_args():
p = argparse.ArgumentParser(description="Selektiert Social-Media-taugliche Clips aus Transkripten (LLM-gestützt).")
p.add_argument("--base", type=str, default=None,
help="Basename der Transkriptdateien (z. B. 'testVideoShort' für *_timed.txt und *_segments.json).")
p.add_argument("--block", type=float, default=DEFAULT_BLOCK_DURATION, help="Blocklänge in Sekunden für die Prompt-Bildung.")
p.add_argument("--min", type=float, default=DEFAULT_MIN_CLIP_LEN, help="Minimale Clip-Länge (Sekunden).")
p.add_argument("--max", type=float, default=DEFAULT_MAX_CLIP_LEN, help="Maximale Clip-Länge (Sekunden).")
return p.parse_args()
# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
def main():
args = parse_args()
BLOCK_DURATION = float(args.block)
MIN_CLIP_LEN = float(args.min)
MAX_CLIP_LEN = float(args.max)
# --- Transkriptdateien finden ---
TRANSCRIPT_PATH, SEGMENT_JSON_PATH, base = find_transcript_pair(args.base)
print(f"📄 TXT : {TRANSCRIPT_PATH}")
print(f"🧾 JSON: {SEGMENT_JSON_PATH}")
# === TRANSKRIPT EINLESEN (TXT) -> NUR für Blockbildung & Promptanzeige ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments_txt: List[Dict] = []
for line in lines:
m = re.match(r"\[(\d+(?:\.\d+)?)\s*[-]\s*(\d+(?:\.\d+)?)\]\s*(?:[A-Z_0-9]+:)?\s*(.*)", line)
if not m:
continue
start, end, text = m.groups()
start, end = float(start), float(end)
if end - start >= 2.0:
segments_txt.append({"start": start, "end": end, "text": (text or "").strip()})
if not segments_txt:
raise RuntimeError("🚫 Keine gültigen TXT-Segmente gefunden.")
print(f"{len(segments_txt)} gültige TXT-Segmente geladen.")
# === TRANSKRIPT EINLESEN (JSON) -> Quelle für DB-Text/Wörter ===
segment_json_data = json.loads(SEGMENT_JSON_PATH.read_text(encoding="utf-8"))
if not isinstance(segment_json_data, list) or not segment_json_data:
raise RuntimeError("🚫 JSON-Segmente leer/ungültig.")
print(f"{len(segment_json_data)} JSON-Segmente geladen.")
# Globale Wörterliste einmal berechnen (bevor wir Clips bilden)
WORDS = explode_segments_to_words(segment_json_data)
print(f"🔤 Globale Wörter im Korpus: {len(WORDS)}")
# === BLÖCKE BILDEN (aus TXT) ===
segments_txt.sort(key=lambda s: (s["start"], s["end"]))
blocks, current_block, current_start = [], [], 0.0
for seg in segments_txt:
if not current_block:
current_start = seg["start"]
# Blockwechsel, wenn Dauer überschritten
if seg["end"] - current_start > BLOCK_DURATION:
blocks.append(current_block)
current_block = []
current_start = seg["start"]
current_block.append(seg)
if current_block:
blocks.append(current_block)
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION:.0f}s).")
# === KI: CLIP-AUSWAHL ===
all_clips = []
t0 = time.perf_counter()
for i, block in enumerate(blocks, start=1):
if not block:
continue
print(f"\n🤖 Sende Block {i}/{len(blocks)} an {OPENAI_MODEL}")
block_text = "\n".join([f"[{s['start']} {s['end']}] {s['text']}" for s in block])
prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Social Media Clips eignen.
Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist mindestens {MIN_CLIP_LEN:.0f} Sekunden lang
Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
Gib ein JSON-Objekt zurück im Format:
{{
"clips": [
{{
"start": float,
"end": float,
"summary": "Kurze Beschreibung des Inhalts"
}}
]
}}
TRANSKRIPT:
{block_text}
""".strip()
log_text(f"block_prompt_{i:02d}.txt", prompt)
# --- robuster API-Call mit Schema (Root=object) und kleinem Retry ---
import time as _time
clips = []
for attempt in range(3):
try:
resp = client.chat.completions.create(
model=OPENAI_MODEL,
messages=[{"role": "user", "content": prompt}],
response_format={
"type": "json_schema",
"json_schema": {
"name": "clips_payload",
"schema": {
"type": "object",
"properties": {
"clips": {
"type": "array",
"items": {
"type": "object",
"properties": {
"start": {"type": "number"},
"end": {"type": "number"},
"summary": {"type": "string"}
},
"required": ["start", "end", "summary"],
"additionalProperties": False
}
}
},
"required": ["clips"],
"additionalProperties": False
}
}
}
)
msg = resp.choices[0].message
payload = getattr(msg, "parsed", None)
if payload is None:
payload = json.loads(msg.content)
clips = payload.get("clips", []) or []
try:
log_text(f"block_output_{i:02d}.txt", json.dumps(payload, ensure_ascii=False, indent=2))
except Exception:
pass
break
except Exception as e:
if attempt == 2:
append_error_log(f"❌ OpenAI-Fehler Block {i}: {e}")
print(f"❌ Fehler bei Block {i}: {e}")
else:
_time.sleep(1.5 * (attempt + 1))
print(f"{len(clips)} Clips empfangen in Block {i}")
# --- Clips filtern & clampen ---
for clip in clips:
try:
b_start, b_end = block[0]["start"], block[-1]["end"]
start = max(b_start, min(float(clip["start"]), b_end))
end = max(b_start, min(float(clip["end"]), b_end))
dur = end - start
if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
clip["start"] = start
clip["end"] = end
clip["duration"] = round(dur, 2)
all_clips.append(clip)
except Exception as e:
append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
elapsed = time.perf_counter() - t0
avg = elapsed / i
eta = max(0.0, avg * (len(blocks) - i))
print(f"⏱️ Geschätzte Restzeit: {eta:.1f} s")
# --- Duplikate entfernen (auf 2 Dezimalen) ---
dedup, seen = [], set()
for c in all_clips:
k = (round(c["start"], 2), round(c["end"], 2))
if k in seen:
continue
seen.add(k)
dedup.append(c)
all_clips = dedup
print(f"\n📈 Gesamtclips vor DB-Insert: {len(all_clips)}")
# === DB SPEICHERN ===
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("""
CREATE TABLE IF NOT EXISTS highlights (
id INTEGER PRIMARY KEY AUTOINCREMENT,
file TEXT,
start REAL,
end REAL,
duration REAL,
text TEXT,
summary TEXT,
json_raw TEXT,
viralitaet INTEGER,
emotionalitaet INTEGER,
witz INTEGER,
provokation INTEGER,
score_total INTEGER,
UNIQUE(file,start,end)
)
""")
# --- Tabelle vor neuem Lauf komplett leeren ---
cur.execute("DELETE FROM highlights")
conn.commit() # Transaktion schließen, damit VACUUM außerhalb läuft
# VACUUM separat (optional)
try:
conn.execute("VACUUM") # oder: sqlite3.connect(DB_PATH).execute("VACUUM").close()
print("🧹 Alte Highlights gelöscht und Datenbank komprimiert.")
except sqlite3.OperationalError as e:
print(f"⚠️ VACUUM übersprungen: {e}")
inserted = 0
failed = 0
for clip in all_clips:
try:
start = float(clip["start"])
end = float(clip["end"])
duration = float(clip["duration"])
summary = (clip.get("summary") or "").strip()
if end <= start or start < 0:
raise ValueError("Ungültige Zeiten")
# JSON-Segmente (zur Nachvollziehbarkeit) + Wort-basierter Text (dopplerfrei)
json_snippets = get_json_snippets_for_clip(start, end, segment_json_data)
json_raw = json.dumps(json_snippets, ensure_ascii=False)
original_text = build_text_strict_from_words(start, end, WORDS)
cur.execute("""
INSERT OR IGNORE INTO highlights (
file, start, end, duration, text, summary, json_raw,
viralitaet, emotionalitaet, witz, provokation, score_total
)
VALUES (?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL, NULL, NULL)
""", (
# 'file' = Basename (z. B. testVideoShort)
Path(base).name,
start, end, duration,
original_text, summary, json_raw
))
if cur.rowcount > 0:
inserted += 1
except Exception as e:
failed += 1
append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
conn.commit()
conn.close()
print("\n📊 Ergebnisse:")
print(f" ✅ Highlights gespeichert: {inserted}")
print(f" ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs: {LOG_DIR.resolve()}")
if __name__ == "__main__":
main()

276
src/text/transcription.py Normal file
View File

@ -0,0 +1,276 @@
#!/usr/bin/env python3
# transcription_chunked_words.py — Whisper mit Wortzeitstempeln, doppler-sicher
import os
import sys
import json
import argparse
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Dict, Tuple, Optional
import ffmpeg
import whisper
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/text/)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import INPUT_DIR, TRANSCRIPTS_DIR # zentrale Pfade
# ──────────────────────────────────────────────────────────────────────────────
# Utilities
# ──────────────────────────────────────────────────────────────────────────────
def probe_duration(path: Path) -> float:
"""Ermittle die Videodauer in Sekunden (ffmpeg.probe)."""
try:
meta = ffmpeg.probe(str(path))
except ffmpeg.Error as e:
raise RuntimeError(f"ffmpeg.probe fehlgeschlagen für {path}: {e.stderr.decode('utf-8','ignore') if hasattr(e, 'stderr') else e}") from e
dur = meta.get("format", {}).get("duration")
if dur is not None:
return float(dur)
cand = 0.0
for s in meta.get("streams", []) or []:
d = s.get("duration")
if d:
cand = max(cand, float(d))
if cand > 0:
return cand
raise RuntimeError(f"Konnte Videodauer nicht bestimmen: {path}")
def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float,float]]:
"""Zerteile [0,total] in überlappende Intervalle."""
if chunk_seconds <= 0:
return [(0.0, total)]
s, out = 0.0, []
while s < total:
e = min(s + chunk_seconds, total)
out.append((s, e))
if e >= total:
break
s = max(0.0, e - overlap)
return out
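# Example of the chunking (values chosen for illustration):
#     >>> make_chunks(125.0, 60.0, 2.0)
#     [(0.0, 60.0), (58.0, 118.0), (116.0, 125.0)]
# Consecutive chunks share a 2 s overlap that the center-cut in main() later
# trims again to avoid duplicated segments at chunk borders.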
def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None:
"""Extrahiere [start,end] als Mono-16kHz-WAV."""
(
ffmpeg
.input(str(src_video), ss=start, to=end)
.output(
str(out_wav),
format="wav",
acodec="pcm_s16le",
ac=1,
ar="16000",
loglevel="error",
)
.overwrite_output()
.run()
)
def is_suspect(text: str) -> bool:
"""Heuristik: leere/loopende/zweifelhafte Zeilen markieren."""
t = (text or "").strip().lower()
if not t:
return True
words = t.split()
if not words:
return True
counts = {w: words.count(w) for w in set(words)}
most_common = max(counts.values())
return most_common / len(words) > 0.6 or most_common > 20
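# Example of the heuristic:
#     >>> is_suspect("ja ja ja ja ja")
#     True
# A single token makes up more than 60% of the words, which is typical for
# Whisper loop artifacts; ordinary sentences with distinct words return False.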
def merge_overlaps_keep_best(
segments: List[Dict],
max_gap: float = 0.15,
min_dur: float = 0.30
) -> List[Dict]:
"""
Zeitlich sortieren, kleine Gaps schließen. Bei Überlappung:
- keine Text-Konkatenation
- behalte das "bessere" Segment (längere Dauer, dann längerer Text)
- words: vom "best" übernehmen (falls vorhanden)
"""
cleaned = []
for s in segments:
s0 = float(s["start"]); s1 = float(s["end"])
txt = (s.get("text") or "").strip()
if s1 - s0 >= min_dur and txt:
cleaned.append({
"start": s0, "end": s1,
"text": txt,
"words": s.get("words", [])
})
if not cleaned:
return []
cleaned.sort(key=lambda x: (x["start"], x["end"]))
out = [cleaned[0]]
def score(x: Dict) -> tuple:
return (x["end"] - x["start"], len(x.get("text", "")))
for s in cleaned[1:]:
m = out[-1]
if s["start"] <= m["end"] + max_gap:
best = s if score(s) > score(m) else m
out[-1] = {
"start": min(m["start"], s["start"]),
"end": max(m["end"], s["end"]),
"text": best["text"],
"words": best.get("words", []),
}
else:
out.append(s)
return out
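# Example (hypothetical segments): [0.00-2.00] "hallo welt" and [1.90-2.40]
# "welt" lie within max_gap, so they merge into one segment spanning 0.00-2.40
# that keeps the higher-scoring text "hallo welt" instead of concatenating both.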
def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True):
"""Schreibe _timed.txt, _suspect_lines.txt und _segments.json."""
out_dir.mkdir(parents=True, exist_ok=True)
dash = "-" if ascii_dash else ""
out_txt = out_dir / f"{base.stem}_timed.txt"
out_sus = out_dir / f"{base.stem}_suspect_lines.txt"
out_json = out_dir / f"{base.stem}_segments.json"
# TXT nur zur Ansicht
with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus:
for s in segments:
line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n"
f_txt.write(line)
if is_suspect(s["text"]):
f_sus.write(line)
# JSON für die Weiterverarbeitung (inkl. words)
with open(out_json, "w", encoding="utf-8") as f_json:
json.dump(segments, f_json, ensure_ascii=False, indent=2)
return out_txt, out_sus, out_json
def find_default_input() -> Optional[Path]:
"""Nimm das erste Video aus INPUT_DIR, falls kein --input übergeben wurde."""
exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3")
for p in sorted(INPUT_DIR.iterdir()):
if p.suffix.lower() in exts:
return p
return None
# ──────────────────────────────────────────────────────────────────────────────
# CLI
# ──────────────────────────────────────────────────────────────────────────────
def parse_args():
p = argparse.ArgumentParser(
description="Chunked Whisper Transcription mit Wortzeitstempeln & doppler-sicherem Stitching."
)
p.add_argument("--input", type=Path, default=None, help=f"Eingabevideo/-audio. Default: erstes File in {INPUT_DIR}")
p.add_argument("--outdir", type=Path, default=None, help=f"Ausgabeverzeichnis. Default: {TRANSCRIPTS_DIR}")
p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper-Modell (tiny/base/small/medium/large)")
p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. 'de') oder leer/None für Auto-Detect")
p.add_argument("--chunk", type=float, default=60.0, help="Chunk-Länge in Sekunden (0 = ganzes File)")
p.add_argument("--overlap", type=float, default=2.0, help="Overlap in Sekunden")
p.add_argument("--min-dur", type=float, default=0.30, help="Mindest-Segmentdauer (Sekunden)")
p.add_argument("--max-gap", type=float, default=0.15, help="Maximaler Zeit-Gap für Merge (Sekunden)")
p.add_argument("--fp16", action="store_true", help="fp16 aktivieren (nur sinnvoll mit GPU)")
return p.parse_args()
# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
def main():
# Whisper-Cache (damit Modelle lokal landen)
os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache"))
args = parse_args()
input_path = args.input or find_default_input()
out_dir = args.outdir or TRANSCRIPTS_DIR
print("📁 Projekt-Root:", ROOT)
print("📄 Input:", input_path if input_path else "")
if not input_path or not input_path.exists():
raise FileNotFoundError(f"No valid input file found. Put a video/audio file into {INPUT_DIR} or use --input.")
out_dir.mkdir(parents=True, exist_ok=True)
duration = probe_duration(input_path)
print(f"🎬 Dauer: {duration:.2f}s")
chunks = make_chunks(duration, args.chunk, args.overlap)
print(f"🔪 {len(chunks)} Chunks à {args.chunk:.1f}s mit {args.overlap:.1f}s Overlap")
# Load Whisper
print(f"🧠 Loading Whisper model: {args.model}")
try:
model = whisper.load_model(args.model)
except Exception as e:
raise RuntimeError(f"Could not load Whisper model '{args.model}'. Is it installed? (pip install openai-whisper)\n{e}") from e
all_segments: List[Dict] = []
with TemporaryDirectory() as tmpdir_str:
tmpdir = Path(tmpdir_str)
for i, (start, end) in enumerate(chunks, 1):
print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s")
wav = tmpdir / f"chunk_{i:03d}.wav"
extract_audio_segment(input_path, start, end, wav)
# Language: ''/none = auto-detect
lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
# Transcribe with word timestamps, without cross-chunk context
result = model.transcribe(
str(wav),
language=lang,
fp16=args.fp16,
word_timestamps=True,
condition_on_previous_text=False,
temperature=0,
verbose=False,
)
# Center cut: keep only the middle part (prevents duplicates at chunk borders)
keep_start = start if i == 1 else start + args.overlap / 2.0
keep_end = end if i == len(chunks) else end - args.overlap / 2.0
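# Worked example (assuming --chunk 60 --overlap 2): a middle chunk spanning
# 58.00-118.00s keeps only segments whose midpoint lies in [59.00, 117.00);
# the trimmed borders are covered by the neighbouring chunks instead.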
for seg in result.get("segments", []) or []:
s0 = float(seg["start"]) + start
s1 = float(seg["end"]) + start
mid = (s0 + s1) / 2.0
if not (keep_start <= mid < keep_end):
continue
# Carry over words with absolute timestamps
words = []
for w in (seg.get("words") or []):
txt = (w.get("word") or w.get("text") or "").strip()
if not txt:
continue
words.append({
"start": float(w["start"]) + start,
"end": float(w["end"]) + start,
"text": txt
})
all_segments.append({
"start": s0,
"end": s1,
"text": (seg.get("text") or "").strip(),
"words": words
})
print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …")
merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur)
print(f"✅ Gemergte Segmente: {len(merged)}")
out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True)
print(f"📝 TXT: {out_txt}")
print(f"⚠️ SUSPECT: {out_sus}")
print(f"💾 JSON: {out_json}")
print("🎉 Fertig.")
if __name__ == "__main__":
main()

View File

@ -0,0 +1,88 @@
import os
import json
import ffmpeg
import whisper
import tempfile
import torch
from tqdm import tqdm
from pathlib import Path
from pyannote.audio import Pipeline
# === Hugging Face token (for pyannote) ===
HF_TOKEN = os.environ.get("HF_TOKEN", "")  # read the token from the environment; never hard-code secrets
# === Torch optimization (optional) ===
torch.set_float32_matmul_precision("medium")
# === Settings ===
PROJECT_ROOT = Path(__file__).resolve().parents[2]
input_file = PROJECT_ROOT / "input" / "testVideoShort.mov"
output_dir = PROJECT_ROOT / "transkripte"
output_dir.mkdir(parents=True, exist_ok=True)
output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
# === Convert video to audio ===
print("🎞️ Extracting audio ...")
tmp_dir = Path(tempfile.mkdtemp())
wav_file = tmp_dir / "audio.wav"
ffmpeg.input(str(input_file)).output(
str(wav_file),
format="wav",
acodec="pcm_s16le",
ac=1,
ar="16000",
loglevel="error"
).overwrite_output().run()
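# Roughly equivalent ffmpeg CLI call, shown for reference only:
#   ffmpeg -y -i testVideoShort.mov -ac 1 -ar 16000 -acodec pcm_s16le -f wav audio.wav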
# === Transcription with Whisper ===
print("🧠 Starting transcription with Whisper ...")
model = whisper.load_model("small")
result = model.transcribe(
str(wav_file),
language="de",
fp16=False,
word_timestamps=False,
condition_on_previous_text=True,
temperature=0,
verbose=False
)
segments = result["segments"]
# === Diarization with pyannote ===
print("🗣️ Starting speaker diarization with pyannote (this will take a while) ...")
pipeline = Pipeline.from_pretrained(
"pyannote/speaker-diarization-3.1",
use_auth_token=HF_TOKEN
)
device = torch.device("mps") if torch.backends.mps.is_available() else torch.device("cpu")
pipeline.to(device)  # use the Apple GPU (MPS) when available, otherwise fall back to CPU
diarization = pipeline(str(wav_file))
# === Assign speakers ===
def assign_speakers_to_segments(segments, diarization):
assigned = []
for seg in tqdm(segments, desc="🎙️ Assigning speakers"):
speaker = "unknown"
for turn, _, label in diarization.itertracks(yield_label=True):
if turn.start <= seg["start"] <= turn.end:
speaker = label
break
seg["speaker"] = speaker
assigned.append(seg)
return assigned
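# Example (illustrative): a segment starting at 12.3s whose start falls inside the
# diarization turn 10.0-15.0s labelled SPEAKER_00 is tagged "SPEAKER_00"; segments
# starting outside every turn keep the fallback "unknown".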
segments_with_speaker = assign_speakers_to_segments(segments, diarization)
# === Save as TXT ===
with open(output_txt, "w", encoding="utf-8") as f:
for seg in segments_with_speaker:
line = f"[{seg['start']:.2f} {seg['end']:.2f}] {seg['speaker'].upper()}: {seg['text'].strip()}\n"
f.write(line)
# === Save as JSON ===
with open(output_json, "w", encoding="utf-8") as f:
json.dump(segments_with_speaker, f, ensure_ascii=False, indent=2)
print(f"✅ Transkript mit Sprecherinfos gespeichert unter:\n📄 {output_txt}\n📄 {output_json}")

@ -1 +0,0 @@
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb

BIN
transkripte/.DS_Store vendored

Binary file not shown.

@ -1 +0,0 @@
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243