cleanup: ignore text-clustering & whisper.cpp
parent a9d700b20e
commit 0c9b43af42

.gitignore (vendored, 105 lines changed)
@@ -1,27 +1,108 @@
-# IDE & Cache
+# ─────────────────────────────
+# IDEs & system files
+# ─────────────────────────────
 .idea/
+.vscode/
 __pycache__/
 *.pyc
 .DS_Store
+*.log
 
-# Whisper models & cache
+# ─────────────────────────────
+# Cache / models / checkpoints
+# ─────────────────────────────
 whisper-cache/
 models/
 *.pt
+*.onnx
+*.bin
+*.safetensors
 
-# Output/temp files
+# ─────────────────────────────
+# Database / temporary files
+# ─────────────────────────────
+*.db
+*.sqlite
+logs/
+temp/
+tmp/
+*.tmp
+
+# ─────────────────────────────
+# Transcripts / AI intermediate outputs
+# ─────────────────────────────
+/data/transkripte/
+/transcripts/
+/outputs/
+/results/
+*_segments.json
+*_timed.txt
+*_suspect_lines.txt
+
+# ─────────────────────────────
+# Video / audio outputs
+# ─────────────────────────────
 *.mp4
 *.mov
-*.db
+*.mkv
 *.wav
-*.json
-temp.*
-logs/
+*.webm
+*.mp3
 
-# Embedded repos
+# ─────────────────────────────
+# Generated partial/result folders
+# ─────────────────────────────
+/raw_clips/
+/face_combined/
+/face_crop_centers/
+/cropped/
+/subtitled/
+/segments/
+/highlight_clips/
+/output/
+/renders/
+/exports/
+
+# ─────────────────────────────
+# Embedded repos or external modules
+# ─────────────────────────────
+/whisper.cpp/
+/text-clustering/
+/venv/
+/.env/
+/.env.local
+.envrc
+.env.*
+
+# ─────────────────────────────
+# Backups / misc
+# ─────────────────────────────
+*.bak
+*.old
+*.orig
+*.swp
+*.zip
+*.tar
+*.gz
+
+# IDE/System
+.idea/
+.DS_Store
+__pycache__/
+*.pyc
+
+# Secrets / environment
+.env
+config.py
+
+# Large / derived data
+data/
+transkripte/
+whisper-cache/
+models/
+*.db
+*.mp4 *.mov *.mkv *.wav *.mp3 *.webm
+logs/ tmp/ temp/
+
+# embedded / external
 text-clustering/
 whisper.cpp/
-
-# Raw video footage
-*.mov
.idea/.gitignore (generated, vendored, 8 lines removed)
@@ -1,8 +0,0 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
.idea/BachlorArbeit.iml (generated, 11 lines removed)
@@ -1,11 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
.idea/dataSources.xml (generated, 35 lines removed)
@@ -1,35 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
    <data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
  </component>
</project>
.idea/inspectionProfiles/profiles_settings.xml (generated, 6 lines removed)
@@ -1,6 +0,0 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
.idea/misc.xml (generated, 6 lines removed)
@@ -1,6 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
  </component>
</project>
.idea/modules.xml (generated, 8 lines removed)
@@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
    </modules>
  </component>
</project>
.idea/vcs.xml (generated, 9 lines removed)
@@ -1,9 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
  </component>
</project>
README.md (new file, 250 lines added)
@@ -0,0 +1,250 @@

# Bachelor's Thesis – Pipeline: Automated Highlight Detection & 9:16 Conversion

This repository contains a complete, script-based pipeline that automatically turns long-form videos into social-media-ready 9:16 highlights – including transcription, LLM-assisted clip selection, face/mouth-activity analysis, auto-cropping, word-cap subtitles, and the final export.

## Table of contents
- [Features](#features)
- [Directory structure](#directory-structure)
- [Requirements](#requirements)
- [Installation](#installation)
- [Quick start (recommended workflow)](#quick-start-recommended-workflow)
- [Scripts & CLI](#scripts--cli)
- [Tips & troubleshooting](#tips--troubleshooting)
- [Reproducibility](#reproducibility)
- [License / acknowledgements](#license--acknowledgements)

---

## Features
- **Transcription with word timestamps (Whisper, chunked without boundary duplicates – see the sketch below)**
- **LLM-assisted clip selection** (virality/emotionality etc. stored in SQLite)
- **Face detection (YOLOv8-face) & mouth activity (MediaPipe)**
- **Stable 9:16 auto-cropping** (median + EMA, deadband, scene-cut detection, switch cooldown)
- **Word-cap subtitles** (ASS generated, burned in via ffmpeg)
- **Batch export of the highlights** (MoviePy, length/boundary checks)
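
The chunked transcription splits the audio into overlapping chunks and then de-duplicates segments that show up in two neighbouring chunks. A minimal sketch of that de-duplication idea (illustrative only – the actual logic lives in `transcription.py`):

```python
# Sketch: drop segments that were transcribed twice because of the chunk overlap.
def dedup_segments(segments):
    seen, unique = set(), []
    for seg in sorted(segments, key=lambda s: s["start"]):
        key = (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())
        if key not in seen:
            seen.add(key)
            unique.append(seg)
    return unique
```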

## Directory structure
The paths are defined centrally in `config.py`:
```
PROJECT_ROOT/
├─ data/
│  ├─ input/                 # input video(s)
│  ├─ transkripte/           # Whisper outputs (*_segments.json, *_timed.txt ...)
│  ├─ segments/              # LLM clip selection, DB etc.
│  ├─ output/
│  │  └─ raw_clips/          # raw highlight clips (from cutClips.py)
│  ├─ face_data_combined/    # faces.json per clip (YOLO + MediaPipe)
│  └─ face_crop_centers/     # (optional) center lists
├─ output/
│  ├─ output_9x16_final/     # auto-cropped 9:16 videos
│  ├─ output_9x16_final_subbed_word/  # 9:16 with burned-in word caps
│  └─ debug/                 # debug previews/artifacts
├─ models/                   # YOLO weights (e.g. yolov8n-face.pt)
├─ whisper-cache/            # Whisper model cache
└─ src/...                   # (optional, project-specific)
```
> On first start, `config.py` automatically creates any missing directories.
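
A minimal sketch of what such a `config.py` can look like – the variable names match those imported by the pipeline scripts, while the concrete paths are assumptions based on the tree above:

```python
# config.py (sketch) – central paths; the concrete locations are assumptions.
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent
INPUT_DIR = PROJECT_ROOT / "data" / "input"
RAW_CLIPS_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments"
CROPPED_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
SUBTITLED_DIR = PROJECT_ROOT / "output" / "output_9x16_final_subbed_word"
WHISPER_CACHE_DIR = PROJECT_ROOT / "whisper-cache"
DB_PATH = PROJECT_ROOT / "data" / "clips_openai.db"

# create any missing directories on import
for d in (INPUT_DIR, RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR,
          CROPPED_DIR, SUBTITLED_DIR, WHISPER_CACHE_DIR):
    d.mkdir(parents=True, exist_ok=True)
```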

## Requirements
**System tools**
- `ffmpeg` (incl. `ffprobe`) on the `PATH`

**Python**
- Python 3.10+ recommended
- Packages (example):
  `openai-whisper`, `torch`, `ffmpeg-python`, `ultralytics`, `opencv-python`, `mediapipe`, `moviepy`, `tqdm`, `numpy`, `regex`
- Optional/depending on the code path: `pydub`, `scikit-image` (if used in extensions)

**Models & keys**
- **Whisper**: downloads models automatically into `whisper-cache/` (controlled via `WHISPER_MODEL`)
- **YOLOv8-face**: `models/yolov8n-face.pt` (or a larger model)
- **OpenAI API key** (for `segment_transcript.py` & `rateCluster.py`): `export OPENAI_API_KEY=...`
- Optionally set the default model via `export OPENAI_MODEL=gpt-4o`

## Installation
```bash
# 1) Python environment
python3 -m venv .venv
source .venv/bin/activate

# 2) System dependencies
# install ffmpeg (macOS: brew install ffmpeg, Ubuntu: apt install ffmpeg)

# 3) Python packages (example)
pip install --upgrade pip
pip install openai-whisper torch ffmpeg-python ultralytics opencv-python mediapipe moviepy tqdm numpy regex

# 4) Models/files
# YOLO weights:
# download yolov8n-face.pt → ./models/yolov8n-face.pt
# API key for the LLM steps:
export OPENAI_API_KEY="sk-..."
export OPENAI_MODEL="gpt-4o"
```

## Quick start (recommended workflow)
1) **Place the input**
   Put your long-form video into `data/input/` (e.g. `meinvideo.mp4`).

2) **Transcription (Whisper, chunked & duplicate-safe)**
```bash
python transcription.py --input data/input/meinvideo.mp4 --model small --lang de
```
→ creates `*_segments.json` + `*_timed.txt` in `data/transkripte/`.

3) **Select clips with the LLM & store them in the DB**
```bash
export OPENAI_API_KEY="..."; export OPENAI_MODEL="gpt-4o"
python segment_transcript.py --base meinvideo --block 60 --min 6.0 --max 30.0
```
→ writes the clips to SQLite (`data/clips_openai.db` or similar)

4) **Cut the highlights from the original video**
```bash
python cutClips.py --file meinvideo.mp4 --limit 10 --order score
```
→ exports `highlight_*.mp4` to `data/output/raw_clips/`

5) **Face detection + mouth activity**
```bash
python main_detect_faces.py --model models/yolov8n-face.pt --input-dir data/output/raw_clips --output-dir data/face_data_combined --frame-skip 1 --downscale 0.5
```

6) **Build per-frame targets (smooth centers/size)**
```bash
python make_segments.py --pattern "highlight_*.mp4" --fps 0 --smooth 7 --overwrite
```

7) **Apply the 9:16 auto-crop**
```bash
python main_apply_crop.py --pattern "highlight_*.mp4" --median 7 --ema 0.5 --deadband 16 --cut_detect --mux_audio --overwrite
```
→ finished 9:16 clips in `output/output_9x16_final/`

8) **Burn in word-cap subtitles (optional)**
```bash
python add_subtitles.py --clips_dir output/output_9x16_final --out_dir output/output_9x16_final_subbed_word --model small --limit 20
```
→ finished videos with burned-in word caps in `output/output_9x16_final_subbed_word/`

> 💡 Many parameters (window widths, deadband, detection thresholds, limits) can be adjusted via the CLI.
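
The whole chain can also be started in one go via the pipeline runner `main.py` (see its docstring for all flags):

```bash
# run every step end-to-end
python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o
# skip LLM scoring and subtitles
python main.py --no-rate --no-subs
```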

## Scripts & CLI
### `transcription.py`
Chunked transcription with word timestamps.
```
--input PATH      # input video/audio (default: first file in data/input/)
--outdir PATH     # output directory (default: data/transkripte/)
--model NAME      # Whisper model (tiny/base/small/medium/large; env: WHISPER_MODEL)
--lang CODE       # language code (e.g. de) or empty/None for auto-detect
--chunk FLOAT     # chunk length in s (default 60)
--overlap FLOAT   # overlap in s (default 2.0)
--min-dur FLOAT   # minimum segment duration (s)
--max-gap FLOAT   # max. time gap when merging (s)
--fp16            # only useful with a GPU
```

### `segment_transcript.py`
LLM selection & storage in SQLite.
```
--base STR    # basename of the transcript files (e.g. 'meinvideo')
--block FLOAT # block length in s for the prompt
--min FLOAT   # minimum clip length in s
--max FLOAT   # maximum clip length in s
# env: OPENAI_API_KEY, OPENAI_MODEL (e.g. gpt-4o)
```

### `cutClips.py`
Cuts the selected highlights into individual clips.
```
--file NAME           # name of the input file in data/input (default: first video)
--limit INT           # number of clips to export (default 10)
--order {score,start} # sort order: score (descending) or start time
```

### `main_detect_faces.py`
YOLOv8-face + MediaPipe → `faces.json` per clip.
```
--input-dir PATH     # default: data/output/raw_clips
--output-dir PATH    # default: data/face_data_combined
--model PATH         # YOLOv8-face weights (default: models/yolov8n-face.pt)
--conf-thresh FLOAT  # default 0.35
--frame-skip INT     # e.g. 1 = every frame, 2 = every other frame ...
--downscale FLOAT    # frame downscale before YOLO (0..1, e.g. 0.5)
--expansion FLOAT    # margin, pass 1 (relative)
--expansion2 FLOAT   # margin, pass 2 (relative)
--min-crop INT       # minimum crop size (px)
--faces-upscale INT  # min. edge length for FaceMesh (upscale small crops)
--imgsz INT          # YOLO input size (default 448)
--max-det INT        # max detections per frame
--use-refine         # enable MediaPipe refine_landmarks
```

### `make_segments.py`
Creates `*_target_by_frame.json` (center + side per frame) from the face/center data.
```
--pattern STR # file pattern in raw_clips (default: highlight_*.mp4)
--fps FLOAT   # force FPS (0 = read from the video)
--smooth INT  # moving-average window width (odd)
--overwrite   # overwrite existing target_by_frame.json
```

### `main_apply_crop.py`
Applies the 9:16 crop with smoothing/scene-cut handling.
```
--pattern STR       # file pattern in raw_clips (default: *.mp4)
--out_w INT         # output width (default 1080)
--out_h INT         # output height (default 1920)
--zoom_pad FLOAT    # zoom pad (0..1)
--median INT        # median window (>=3, odd)
--ema FLOAT         # EMA alpha (0..1)
--deadband FLOAT    # deadband in pixels
--switch_cd INT     # cooldown frames after a track switch
--cut_detect        # enable scene-cut detection
--cut_corr FLOAT    # correlation threshold (0..1)
--cut_cd INT        # cooldown frames after a cut
--mux_audio         # mux in the original audio
--debug             # show the debug overlay
--debug_scale FLOAT # render the debug preview scaled
--overwrite         # overwrite existing outputs
```
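
To illustrate how `--median`, `--ema` and `--deadband` interact per frame, here is a simplified sketch of the center update used for the smoothing (defaults taken from the script; scene cuts and track switches are left out):

```python
import math

def smooth_center(ema, desired, alpha=0.22, deadband=8.0):
    # small jitter inside the deadband keeps the crop perfectly still
    dx, dy = desired[0] - ema[0], desired[1] - ema[1]
    if math.hypot(dx, dy) <= deadband:
        return ema
    # otherwise ease towards the (median-filtered) target with the EMA factor
    return (ema[0] + alpha * dx, ema[1] + alpha * dy)
```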

### `add_subtitles.py`
Generates word caps with Whisper & burns them in.
```
--clips_dir PATH # source (default: output/output_9x16_final)
--out_dir PATH   # target (default: output/output_9x16_final_subbed_word)
--pattern STR    # e.g. *.mp4
--limit INT      # only the first N clips
--model NAME     # Whisper model (tiny/base/small/medium/large)
--lang CODE      # language code or auto
```

### `rateCluster.py` (optional)
Has the LLM add scores (virality, emotion, humor, provocation) afterwards.
> Set the default model via `OPENAI_MODEL` (e.g. `gpt-4o`).

---

## Tips & troubleshooting
- **Models/performance**
  - CPU-only works (Whisper/YOLO are just slower). On Apple Silicon **MPS** is used automatically; on NVIDIA, **CUDA**.
  - `--frame-skip` and `--downscale` in `main_detect_faces.py` speed up face detection considerably.
  - **Check the ffmpeg muxing** (`main_apply_crop.py --mux_audio`): if the audio is missing, check the `ffmpeg` installation and the return code in the log.
- **Missing files**
  - No input? → check `data/input/`.
  - Missing transcript pairs? → `*_timed.txt` and `*_segments.json` must both exist (from `transcription.py`).
  - No faces? → is the path to `models/yolov8n-face.pt` correct?
- **Database**
  - Highlights are stored in SQLite (see `config.py`: `DB_PATH`). For repeated runs, a `DELETE FROM highlights; VACUUM;` can be useful (see the example after this list).
- **Cache/directories**
  - Whisper cache via `XDG_CACHE_HOME` → `whisper-cache/` next to the project. Keep an eye on disk space.
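
For example, from the shell (the database file name is an assumption – use the `DB_PATH` configured in `config.py`):

```bash
# reset the highlights table between runs (path is an assumption)
sqlite3 data/clips_openai.db "DELETE FROM highlights; VACUUM;"
```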

## Reproducibility
- Create a `requirements.txt` with exact versions (a freeze of your working environment, see the example below).
- Document the **model versions** used (YOLO weights, Whisper model size, OPENAI_MODEL).
- Pin random seeds where necessary (here the results are mostly determined by the external models/libraries).
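
For example:

```bash
# pin the exact versions of the environment that is known to work
pip freeze > requirements.txt
```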

## License / acknowledgements
- Uses **OpenAI Whisper**, **Ultralytics YOLOv8**, **MediaPipe**, **OpenCV**, **MoviePy**, **ffmpeg**.
- Observe the licenses of the respective libraries.
@@ -1,38 +0,0 @@
from moviepy.video.io.VideoFileClip import VideoFileClip
from pathlib import Path
import sqlite3

# === Setup ===
input_video = Path("input/testVideoShort.mov")
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# === Read the SQLite DB ===
db_path = "clips_openai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# only the top 10 clips with the highest score_total
cursor.execute("""
    SELECT start, end, text
    FROM highlights
    ORDER BY score_total DESC
    LIMIT 10
""")
highlights = cursor.fetchall()

# === Load the video ===
video = VideoFileClip(str(input_video))

# === Cut the clips ===
for i, (start, end, text) in enumerate(highlights):
    output_file = output_dir / f"highlight_{i+1}.mp4"
    end = min(end, video.duration)  # make sure the end does not run past the video
    print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
    clip = video.subclipped(start, end)
    clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")

# === Cleanup ===
conn.close()
video.close()
print("✅ Top 10 Clips exportiert.")
@@ -1,196 +0,0 @@
import json
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
from datetime import datetime
import time
import nltk

nltk.download("punkt")

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300
MIN_CLIP_LEN = 5
MAX_CLIP_LEN = 90

client = OpenAI(api_key="sk-proj-***")  # hard-coded API key redacted

# === HELPERS ===
def log_text(filename, content):
    (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")

def append_error_log(content):
    with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
        f.write(content + "\n\n")

def extract_json(text):
    match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except Exception as e:
            append_error_log(f"❌ JSON-Fehler: {e}\n{text}")
    return []

def get_original_text(clip, segments, debug=False):
    texts = []
    used_segments = []
    for s in segments:
        # overlap: the segment and the clip share time
        if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
            texts.append(s["text"])
            used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
    if debug:
        print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" +
              "\n".join(used_segments))
    return " ".join(texts).strip()

# === READ THE TRANSCRIPT ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for line in lines:
    match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
    if match:
        start, end, text = match.groups()
        start = float(start)
        end = float(end)
        if end - start >= 2.0:
            segments.append({"start": start, "end": end, "text": text.strip()})

if not segments:
    raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.")

# === BUILD BLOCKS ===
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
    if seg["end"] - current_start > BLOCK_DURATION:
        blocks.append(current_block)
        current_block = []
        current_start = seg["start"]
    current_block.append(seg)
if current_block:
    blocks.append(current_block)
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")

# === LLM: CLIP SELECTION ===
all_clips = []
start_time = time.perf_counter()

for i, block in enumerate(blocks):
    if not block:
        continue

    print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")

    block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.

Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**

Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.

Gib ein valides JSON-Array zurück im Format:
[
  {{
    "start": float,
    "end": float,
    "summary": "Kurze Beschreibung des Inhalts"
  }}
]

TRANSKRIPT:
{block_text}
"""
    log_text(f"block_prompt_{i+1}.txt", prompt)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)

        print(f"✅ {len(clips)} Clips empfangen in Block {i+1}")

        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")

        print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")

        # compute the ETA
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")

    except Exception as e:
        append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
        print(f"❌ Fehler bei Block {i+1}: {e}")

# === SAVE TO THE DB ===
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS segments")
cur.execute("""
    CREATE TABLE segments (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        duration REAL,
        text TEXT,
        summary TEXT
    )
""")

inserted = 0
failed = 0
for clip in all_clips:
    try:
        start = float(clip["start"])
        end = float(clip["end"])
        duration = float(clip["duration"])
        summary = clip.get("summary", "")
        # debug=True prints every segment text used per clip
        original_text = get_original_text(clip, segments, debug=False)
        if end <= start or start < 0:
            raise ValueError("Ungültige Zeiten")
        cur.execute(
            "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
            (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
        )
        inserted += 1
    except Exception as e:
        failed += 1
        append_error_log(f"❌ DB-Fehler: {clip}\n{e}")

conn.commit()
conn.close()

print("\n📊 Ergebnisse:")
print(f" ✅ Clips gespeichert: {inserted}")
print(f" ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")
@@ -1,108 +0,0 @@
# transcription_chunked.py
import whisper
from pathlib import Path
import os
import json
import ffmpeg
import tempfile

# === Settings ===
input_file = Path("input/testVideoShort.mov")
output_dir = Path("transkripte")
output_dir.mkdir(parents=True, exist_ok=True)

output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"

CHUNKS = 4     # number of chunks (adjust!)
OVERLAP = 2.0  # seconds of overlap

os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")

probe = ffmpeg.probe(str(input_file))
duration = float(probe["format"]["duration"])
print(f"🎥 Videolänge: {duration:.2f} Sekunden")

def extract_audio_chunk(start_time, duration, output_path):
    ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
        str(output_path),
        format="wav",
        acodec="pcm_s16le",
        ac=1,
        ar="16000",
        loglevel="error"
    ).overwrite_output().run()

def is_suspect(text):
    words = text.strip().lower().split()
    if not words:
        return True
    most_common = max([words.count(w) for w in set(words)])
    return most_common / len(words) > 0.6 or most_common > 20

tmp_dir = Path(tempfile.mkdtemp())
all_segments = []

print(f"✂️ Teile Audio in {CHUNKS} Chunks ...")
for i in range(CHUNKS):
    chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
    chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
    chunk_dur = chunk_end - chunk_start
    chunk_file = tmp_dir / f"chunk_{i}.wav"
    print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
    extract_audio_chunk(chunk_start, chunk_dur, chunk_file)

    print(f"🧠 Transkribiere Chunk {i+1} ...")
    model = whisper.load_model("small")  # switch to "medium" or "large" if desired
    result = model.transcribe(
        str(chunk_file),
        language="de",
        fp16=False,
        word_timestamps=False,
        condition_on_previous_text=True,
        temperature=0,
        verbose=False
    )

    segments = result["segments"]
    # add the time offset of the current chunk
    offset = chunk_start
    for seg in segments:
        seg["start"] += offset
        seg["end"] += offset
    all_segments.extend(segments)

# === Sort and filter duplicate/overlapping segments ===
all_segments.sort(key=lambda x: x["start"])

def segment_hash(seg):
    return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())

unique_segments = []
seen = set()
for seg in all_segments:
    h = segment_hash(seg)
    if h not in seen:
        seen.add(h)
        unique_segments.append(seg)

print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.")

with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
    for seg in unique_segments:
        start = seg["start"]
        end = seg["end"]
        text = seg["text"].strip()
        line = f"[{start:.2f} – {end:.2f}] {text}\n"
        f.write(line)  # ALWAYS goes into the main transcript!
        if is_suspect(text):
            f_sus.write(line)

print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}")
print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}")

with open(output_json, "w", encoding="utf-8") as f:
    json.dump(unique_segments, f, ensure_ascii=False, indent=2)
print(f"💾 Segmentdaten gespeichert unter: {output_json}")
main.py (new file, 233 lines added)
@@ -0,0 +1,233 @@
#!/usr/bin/env python3
"""Run the full Bachelor pipeline end-to-end with timing, errors, and flexible flags.

Steps:
1) transcription.py → Whisper transcripts (segments + timed words)
2) segment_transcript.py → LLM selects highlight candidates → SQLite
3) cutClips.py → export highlight_*.mp4 (raw clips)
4) main_detect_faces.py → YOLO + MediaPipe → faces.json per clip
5) make_segments.py → *_target_by_frame.json (center+side per frame)
6) main_apply_crop.py → 9:16 crop with smoothing & optional audio mux
7) rateCluster.py → (optional) LLM scoring (virality, emotion, ...)
8) add_subtitles.py → (optional) word-cap subtitles burned in

Usage examples:
  python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o
  python main.py --no-rate --no-subs
"""

from __future__ import annotations
import argparse
import os
import sys
import subprocess
import time
from datetime import datetime
from pathlib import Path

# --- Import project config ---
try:
    from config import (
        PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
        WHISPER_CACHE_DIR
    )
except Exception:
    PROJECT_ROOT = Path(__file__).resolve().parent
    sys.path.insert(0, str(PROJECT_ROOT))
    from config import (
        PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
        WHISPER_CACHE_DIR
    )

LOGS_DIR = PROJECT_ROOT / "logs"
LOGS_DIR.mkdir(parents=True, exist_ok=True)

# --- correct paths to the scripts ---
SCRIPTS = {
    "transcription": str(PROJECT_ROOT / "src" / "text" / "transcription.py"),
    "segment_transcript": str(PROJECT_ROOT / "src" / "text" / "segment_transcript.py"),
    "cutClips": str(PROJECT_ROOT / "src" / "text" / "cutClips.py"),
    "detect_faces": str(PROJECT_ROOT / "src" / "reformat" / "main_detect_faces.py"),
    "make_segments": str(PROJECT_ROOT / "src" / "reformat" / "make_segments.py"),
    "apply_crop": str(PROJECT_ROOT / "src" / "reformat" / "main_apply_crop.py"),
    "rateCluster": str(PROJECT_ROOT / "src" / "text" / "rateCluster.py"),
    "add_subtitles": str(PROJECT_ROOT / "src" / "subtitles" / "add_subtitles.py"),
}

def shlex_join(cmd):
    return " ".join(str(c) for c in cmd)

def run_step(cmd: list[str], name: str, env: dict[str, str] | None = None) -> float:
    """Run a subprocess step, raise on error, return duration in seconds."""
    t0 = time.perf_counter()
    print(f"\n===== {name} =====")
    print(" ", shlex_join(cmd))
    cp = subprocess.run(cmd, env=env)
    dt = time.perf_counter() - t0
    if cp.returncode != 0:
        print(f"❌ Fehler in {name} (Exit {cp.returncode}) nach {dt:.2f}s")
        print(" → Prüfe das Logfile oben für Details und stelle sicher, dass Abhängigkeiten installiert sind:")
        print(" - ffmpeg/ffprobe im PATH")
        print(" - Python-Pakete: openai-whisper, torch, ffmpeg-python, ultralytics, opencv-python, mediapipe, moviepy, tqdm, numpy")
        print(" - OPENAI_API_KEY gesetzt (für LLM-Schritte)")
        raise SystemExit(cp.returncode)
    print(f"✅ {name} in {dt:.2f}s")
    return dt

def infer_base_from_input(input_path: Path) -> str:
    return input_path.stem

def default_input() -> Path | None:
    if not INPUT_DIR.exists():
        return None
    for p in sorted(INPUT_DIR.iterdir()):
        if p.suffix.lower() in {".mp4", ".mov", ".mkv", ".m4v", ".mp3", ".wav"}:
            return p
    return None

def main():
    ap = argparse.ArgumentParser(description="Bachelor Pipeline Runner")
    ap.add_argument("--input", type=str, default=None, help="Pfad zu Eingabedatei (Default: erstes File in data/input)")
    ap.add_argument("--limit", type=int, default=10, help="Anzahl Highlights (cutClips)")
    ap.add_argument("--whisper-model", type=str, default=os.getenv("WHISPER_MODEL", "small"))
    ap.add_argument("--lang", type=str, default=None, help="Sprachcode (z. B. de)")
    ap.add_argument("--openai-model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o"))
    ap.add_argument("--pattern", type=str, default="highlight_*.mp4")
    ap.add_argument("--overwrite", action="store_true")
    ap.add_argument("--no-rate", action="store_true")
    ap.add_argument("--no-subs", action="store_true")
    ap.add_argument("--no-detect", action="store_true")
    ap.add_argument("--no-make", action="store_true")
    ap.add_argument("--no-apply", action="store_true")
    ap.add_argument("--logfile", type=str, default=None)
    args = ap.parse_args()

    os.chdir(PROJECT_ROOT)

    env = os.environ.copy()
    env.setdefault("OPENAI_MODEL", args.openai_model)
    env.setdefault("XDG_CACHE_HOME", str(WHISPER_CACHE_DIR))

    if not env.get("OPENAI_API_KEY"):
        print("⚠️ OPENAI_API_KEY ist nicht gesetzt – LLM-Schritte könnten fehlschlagen.")

    # determine the input file
    if args.input:
        input_path = Path(args.input)
        if not input_path.is_file():
            candidate = INPUT_DIR / args.input
            if candidate.is_file():
                input_path = candidate
            else:
                raise SystemExit(f"Input nicht gefunden: {args.input}")
    else:
        picked = default_input()
        if not picked:
            raise SystemExit(f"Kein Input in {INPUT_DIR} gefunden. Bitte --input setzen.")
        input_path = picked

    base = infer_base_from_input(input_path)
    print(f"📥 Input: {input_path}")
    print(f"🔤 Whisper: {args.whisper_model} | 🌐 LLM: {env.get('OPENAI_MODEL')}")
    print(f"🧩 Base: {base}")

    # logfile
    if args.logfile:
        log_path = Path(args.logfile)
    else:
        log_path = LOGS_DIR / f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"

    # tee: write to the file AND the console
    try:
        log_fh = open(log_path, "w", encoding="utf-8")

        class _Tee:
            def __init__(self, *streams): self.streams = streams
            def write(self, data):
                for s in self.streams:
                    try: s.write(data); s.flush()
                    except Exception: pass
            def flush(self):
                for s in self.streams:
                    try: s.flush()
                    except Exception: pass

        sys.stdout = _Tee(sys.__stdout__, log_fh)
        sys.stderr = _Tee(sys.__stderr__, log_fh)
        print(f"📝 Logfile: {log_path}")
    except Exception:
        print(f"⚠️ Konnte Logfile nicht initialisieren: {log_path}")

    durations = []
    started = datetime.now()
    print(f"🚀 Start: {started:%Y-%m-%d %H:%M:%S}")

    try:
        # 1) Transcription
        t_args = [sys.executable, SCRIPTS["transcription"], "--input", str(input_path), "--model", args.whisper_model]
        if args.lang: t_args += ["--lang", args.lang]
        durations.append(("Transcription", run_step(t_args, "Transcription", env=env)))

        # 2) LLM segmentation
        st_args = [sys.executable, SCRIPTS["segment_transcript"], "--base", base]
        durations.append(("Segment Transcript", run_step(st_args, "Segment Transcript", env=env)))

        # 3) Cut highlights
        cut_filename = input_path.name
        cc_args = [sys.executable, SCRIPTS["cutClips"], "--file", cut_filename, "--limit", str(args.limit)]
        durations.append(("Cut Clips", run_step(cc_args, "Cut Clips", env=env)))

        # 4) Faces
        if not args.no_detect:
            df_args = [sys.executable, SCRIPTS["detect_faces"]]
            durations.append(("Detect Faces", run_step(df_args, "Detect Faces", env=env)))
        else:
            print("⏭️ Detect Faces übersprungen.")

        # 5) Make targets
        if not args.no_make:
            ms_args = [sys.executable, SCRIPTS["make_segments"], "--pattern", args.pattern]
            durations.append(("Make Targets", run_step(ms_args, "Make Targets", env=env)))
        else:
            print("⏭️ Make Targets übersprungen.")

        # 6) Crop
        if not args.no_apply:
            ac_args = [sys.executable, SCRIPTS["apply_crop"], "--pattern", args.pattern, "--mux_audio"]
            if args.overwrite: ac_args.append("--overwrite")
            durations.append(("Apply Crop", run_step(ac_args, "Apply Crop", env=env)))
        else:
            print("⏭️ Apply Crop übersprungen.")

        # 7) Scoring
        if not args.no_rate:
            rc_args = [sys.executable, SCRIPTS["rateCluster"]]
            durations.append(("Rate Clusters", run_step(rc_args, "Rate Clusters", env=env)))
        else:
            print("⏭️ Rate Clusters übersprungen.")

        # 8) Subtitles
        if not args.no_subs:
            as_args = [sys.executable, SCRIPTS["add_subtitles"]]
            durations.append(("Subtitles", run_step(as_args, "Subtitles", env=env)))
        else:
            print("⏭️ Subtitles übersprungen.")

    except KeyboardInterrupt:
        print("\n⛔ Abgebrochen (Ctrl+C).")
    finally:
        finished = datetime.now()
        total = sum(dt for _, dt in durations)
        print("\n======================== ZUSAMMENFASSUNG ============================")
        for name, dt in durations:
            print(f"✅ {name:<24} {dt:7.2f}s")
        print("---------------------------------------------------------------------")
        print(f"⏱️ Gesamtdauer: {total:.2f}s")
        print(f"🕒 Start : {started:%Y-%m-%d %H:%M:%S}")
        print(f"🕒 Ende : {finished:%Y-%m-%d %H:%M:%S}")
        print(f"📂 Output:")
        print(f" Raw Clips : {RAW_CLIPS_DIR}")
        print(f" 9:16 : {CROPPED_DIR}")
        print(f" Subbed : {SUBTITLED_DIR}")
        print("=====================================================================")

if __name__ == "__main__":
    main()
src/main.py (new file, 43 lines added)
@@ -0,0 +1,43 @@
#!/usr/bin/env python3
"""
Simple master script that runs all sub-scripts one after another – without arguments.
"""
import subprocess
import sys
from pathlib import Path

# order of the scripts to run (relative paths)
SCRIPTS = [
    "text/transcription.py",
    "text/segment_transcript.py",
    "text/rateCluster.py",
    "text/cutClips.py",
    "reformat/track_faces_Yolo.py",
    "reformat/detect_speaking_faces.py",
    "reformat/crop_to_speaker.py",
]


def run_script(script_path: str):
    """
    Runs a Python script without any further arguments.
    """
    print(f"🔄 Running: {script_path}")
    full_path = Path(__file__).parent / script_path
    try:
        subprocess.check_call([sys.executable, str(full_path)])
        print(f"✔️ {script_path} erfolgreich abgeschlossen.\n")
    except subprocess.CalledProcessError as e:
        print(f"❌ Fehler in {script_path}: Rückgabecode {e.returncode}")
        sys.exit(e.returncode)


def main():
    print("\n=== Starte komplette Podcast-Pipeline ===\n")
    for script in SCRIPTS:
        run_script(script)
    print("✅ Alle Schritte erfolgreich abgeschlossen.")


if __name__ == '__main__':
    main()
src/reformat/main_apply_crop.py (new file, 315 lines added; the diff is cut off below)
@@ -0,0 +1,315 @@
#!/usr/bin/env python3
# src/reformat/new/main_apply_crop.py
from __future__ import annotations
import logging, json, math, subprocess, argparse
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any
from collections import deque
import sys

import cv2
import numpy as np

# ── make the project root importable
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR, CROPPED_DIR

# ==== Defaults (can be overridden via CLI) ===================================
OUT_W_DEFAULT, OUT_H_DEFAULT = 1080, 1920  # 9:16
DEBUG_SCALE_DEFAULT = 0.6
MEDIAN_WIN_DEFAULT = 5
EMA_ALPHA_DEFAULT = 0.22
DEADBAND_PX_DEFAULT = 8.0
SWITCH_COOLDOWN_FR_DEFAULT = 12
ZOOM_PAD_FRAC_DEFAULT = 0.10

USE_CUT_DETECT_DEFAULT = True
CUT_CORR_THRESH_DEFAULT = 0.65
CUT_COOLDOWN_DEFAULT = 6

MUX_AUDIO_DEFAULT = True
FFMPEG_BIN = "ffmpeg"
# ============================================================================

def clamp(v, lo, hi): return max(lo, min(hi, v))

def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int,
                      out_w: int, out_h: int, zoom_pad_frac: float) -> tuple[int,int,int,int]:
    """9:16 (out_w:out_h) crop around (cx,cy) — no squeeze, with zoom pad, kept inside the frame."""
    target_ar = out_w / out_h
    src_ar = src_w / src_h
    if src_ar >= target_ar:
        base_h = src_h
        base_w = int(round(base_h * target_ar))
    else:
        base_w = src_w
        base_h = int(round(base_w / target_ar))

    desired_scale = 1.0 + zoom_pad_frac
    s = min(desired_scale, src_w / base_w, src_h / base_h)
    w = int(round(base_w * s))
    h = int(round(base_h * s))
    half_w, half_h = w // 2, h // 2

    cx = clamp(cx, half_w, src_w - half_w)
    cy = clamp(cy, half_h, src_h - half_h)
    x = int(round(cx - half_w))
    y = int(round(cy - half_h))
    return x, y, w, h

def draw_center(img, pt, color, label=None):
    if pt is None: return
    x, y = int(pt[0]), int(pt[1])
    cv2.circle(img, (x, y), 6, color, -1)
    if label:
        cv2.putText(img, label, (x + 8, y - 8),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)

def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
    a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
    b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
    ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
    hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
    cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
    return float((cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + 1.0)/2.0)

def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
    cmd = [
        FFMPEG_BIN, "-y",
        "-i", str(src_video),
        "-i", str(silent_video),
        "-map", "1:v:0",
        "-map", "0:a:0?",
        "-c:v", "copy",
        "-c:a", "aac", "-b:a", "192k",
        "-shortest",
        str(out_video),
    ]
    subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

def load_faces(name: str) -> List[Dict[str,Any]]:
    p = FACE_COMBINED_DIR / f"{name}_faces.json"
    if not p.exists(): return []
    return json.loads(p.read_text(encoding="utf-8"))

def load_target_map_or_segments(name: str, total_frames: int) -> List[Optional[int] | Dict]:
    """
    Prefers *_target_by_frame.json (list of dicts with t,cx,cy,w,h).
    Fallback: *_segments.json (track ID per frame).
    Returns a list with the same length as total_frames.
    """
    map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
    if map_p.exists():
        target = json.loads(map_p.read_text(encoding="utf-8"))
        # if these are dicts (cx,cy,w,h per frame), simply return them:
        if target and isinstance(target[0], dict):
            if len(target) < total_frames:
                last = target[-1] if target else {"t":0,"cx":0.5,"cy":0.5,"w":0.6,"h":0.6}
                target += [last] * (total_frames - len(target))
            return target[:total_frames]
        # if numeric IDs were stored instead, fall through to the segments logic below
    seg_p = SEGMENTS_DIR / f"{name}_segments.json"
    if seg_p.exists():
        segs = json.loads(seg_p.read_text(encoding="utf-8"))
        target_tid = [None]*total_frames
        for s in segs:
            a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
            for t in range(max(0,a), min(total_frames, b+1)):
                target_tid[t] = tid
        return target_tid
    return [None]*total_frames

def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
    if target_tid is None:
        return fallback
    faces = faces_frame.get("faces", [])
    for f in faces:
        if int(f.get("track_id", -1)) == int(target_tid):
            x,y,w,h = f.get("bbox", [None,None,None,None])
            if None not in (x,y,w,h):
                return (float(x + w/2), float(y + h/2))
    return fallback

def parse_args():
    p = argparse.ArgumentParser(description="Apply 9:16 Auto-Crop auf Rohclips mit Face-/Target-Daten.")
    p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: *.mp4)")
    p.add_argument("--out_w", type=int, default=OUT_W_DEFAULT, help="Output-Breite (Default: 1080)")
    p.add_argument("--out_h", type=int, default=OUT_H_DEFAULT, help="Output-Höhe (Default: 1920)")
    p.add_argument("--zoom_pad", type=float, default=ZOOM_PAD_FRAC_DEFAULT, help="Zoom-Pad (0..1, Default 0.10)")
    p.add_argument("--median", type=int, default=MEDIAN_WIN_DEFAULT, help="Median-Fenster (ungerade, >=3)")
    p.add_argument("--ema", type=float, default=EMA_ALPHA_DEFAULT, help="EMA-Alpha (0..1)")
    p.add_argument("--deadband", type=float, default=DEADBAND_PX_DEFAULT, help="Totband in Pixel")
    p.add_argument("--switch_cd", type=int, default=SWITCH_COOLDOWN_FR_DEFAULT, help="Cooldown-Frames nach Trackwechsel")
    p.add_argument("--cut_detect", action="store_true", default=USE_CUT_DETECT_DEFAULT, help="Szenenschnitt-Erkennung aktivieren")
    p.add_argument("--cut_corr", type=float, default=CUT_CORR_THRESH_DEFAULT, help="Korrelation-Schwelle (0..1)")
    p.add_argument("--cut_cd", type=int, default=CUT_COOLDOWN_DEFAULT, help="Cooldown-Frames nach Cut")
    p.add_argument("--mux_audio", action="store_true", default=MUX_AUDIO_DEFAULT, help="Audio vom Original muxen")
    p.add_argument("--debug", action="store_true", help="Debug-Overlay anzeigen (langsam)")
    p.add_argument("--debug_scale", type=float, default=DEBUG_SCALE_DEFAULT, help="Skalierung Debug-Preview")
    p.add_argument("--overwrite", action="store_true", help="Existierende Outputs überschreiben")
    return p.parse_args()

def main():
    args = parse_args()
    OUT_DIR = CROPPED_DIR
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    clips = sorted(list(RAW_CLIPS_DIR.glob(args.pattern)))
    if not clips:
        print(f"⚠️ Keine Clips in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'")
        return

    print(f"🔎 {len(clips)} Clips gefunden …")
    for video_path in clips:
        name = video_path.stem
        out_path = OUT_DIR / f"{name}_9x16.mp4"
        if out_path.exists() and not args.overwrite:
            print(f"⏭️ Skip (existiert): {out_path.name}")
            continue

        # open the video
        cap = cv2.VideoCapture(str(video_path))
        if not cap.isOpened():
            print(f"❌ Kann Video nicht öffnen: {video_path.name}")
            continue
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        # load face/target data
        faces_all = load_faces(name)
        if faces_all and len(faces_all) < total:
            faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
        target_by_frame = load_target_map_or_segments(name, total)

        # prepare the writer
        writer = cv2.VideoWriter(str(out_path),
                                 cv2.VideoWriter_fourcc(*"mp4v"),
                                 fps, (args.out_w, args.out_h))

        median_buf = deque(maxlen=max(3, args.median if args.median % 2 else args.median+1))
        ema_center: Optional[Tuple[float,float]] = None
        last_center: Optional[Tuple[float,float]] = (width/2, height/2)
        switch_cooldown = 0

        prev_small = None
        cut_cd = 0

        print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")

        for t in range(total):
            ret, frame = cap.read()
            if not ret: break

            # determine the target:
            desired = None
            tgt = target_by_frame[t] if t < len(target_by_frame) else None

            # case A: target_by_frame.json with direct centers (dict)
            if isinstance(tgt, dict) and all(k in tgt for k in ("cx","cy","w","h")):
                desired = (float(tgt["cx"])*width, float(tgt["cy"])*height)
            else:
                # case B: numeric track ID
                target_tid = tgt if tgt is None or isinstance(tgt, (int, float)) else None
                faces_fr = faces_all[t] if (faces_all and t < len(faces_all)) else {"faces":[]}
                desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))

            # scene cut?
            if args.cut_detect:
                small = cv2.resize(frame, (128, 72))
                if prev_small is not None:
                    corr = scene_corr(prev_small, small)
                    if corr < args.cut_corr:
                        ema_center = desired
                        last_center = desired
                        switch_cooldown = args.switch_cd
                        cut_cd = args.cut_cd
                prev_small = small

            # median filter
            median_buf.append(desired)
            if len(median_buf) >= 3:
                xs = sorted(p[0] for p in median_buf)
                ys = sorted(p[1] for p in median_buf)
                m = len(median_buf)//2
                desired_f = (xs[m], ys[m])
            else:
                desired_f = desired

            # detect a track switch (only cleanly possible with track IDs)
            if t > 0:
                prev_tgt = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
            else:
                prev_tgt = tgt
            is_switch = (not isinstance(tgt, dict)) and (tgt != prev_tgt)

            if ema_center is None:
                ema_center = desired_f
            if last_center is None:
                last_center = desired_f

            if is_switch:
                ema_center = desired_f
                last_center = desired_f
                switch_cooldown = args.switch_cd
            else:
                dx = desired_f[0] - ema_center[0]
                dy = desired_f[1] - ema_center[1]
                dist = math.hypot(dx, dy)
                if cut_cd > 0:
                    ema_center = desired_f
                    cut_cd -= 1
                else:
                    if dist > args.deadband:
                        ema_center = (ema_center[0] + dx*args.ema,
                                      ema_center[1] + dy*args.ema)

            last_center = desired_f

            # apply the 9:16 crop
            x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height,
                                           args.out_w, args.out_h, args.zoom_pad)
            cropped = frame[y:y+h, x:x+w]
            if cropped.size == 0: cropped = frame
            final = cv2.resize(cropped, (args.out_w, args.out_h), interpolation=cv2.INTER_AREA)
            writer.write(final)

            if args.debug:
                dbg = frame.copy()
                cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
                draw_center(dbg, desired, (128,128,255), "desired")
                draw_center(dbg, desired_f, (255,255, 0), "median")
                draw_center(dbg, ema_center, ( 0,255,255), "ema")
                cv2.putText(dbg, f"t={t+1}/{total}", (12, height-14),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
                disp = cv2.resize(dbg, (int(width*args.debug_scale), int(height*args.debug_scale)))
                cv2.imshow("Apply Debug", disp)
                if cv2.waitKey(1) & 0xFF == ord("q"):
|
||||||
|
print("🛑 Abgebrochen (q).")
|
||||||
|
break
|
||||||
|
|
||||||
|
writer.release()
|
||||||
|
cap.release()
|
||||||
|
|
||||||
|
# Audio muxen?
|
||||||
|
if args.mux_audio:
|
||||||
|
tmp = out_path.with_suffix(".tmp.mp4")
|
||||||
|
try:
|
||||||
|
out_path.rename(tmp)
|
||||||
|
mux_audio_from_source(video_path, tmp, out_path)
|
||||||
|
finally:
|
||||||
|
if tmp.exists():
|
||||||
|
try: tmp.unlink()
|
||||||
|
except: pass
|
||||||
|
print(f"✅ Fertig (mit Audio): {out_path.name}")
|
||||||
|
else:
|
||||||
|
print(f"✅ Fertig: {out_path.name}")
|
||||||
|
|
||||||
|
if args.debug:
|
||||||
|
cv2.destroyAllWindows()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
335
src/reformat/main_detect_faces.py
Normal file
@ -0,0 +1,335 @@
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
Face-Detection + Mouth-Openness (YOLOv8-face + MediaPipe)
|
||||||
|
- liest Rohclips aus RAW_CLIPS_DIR
|
||||||
|
- schreibt pro Video eine faces.json in FACE_COMBINED_DIR
|
||||||
|
- optionaler Fortschrittsbalken (tqdm)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from contextlib import nullcontext
|
||||||
|
from typing import List, Dict, Any
|
||||||
|
from src.reformat.speaking import get_mouth_openness
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from ultralytics import YOLO
|
||||||
|
import mediapipe as mp
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# ── Projekt-Root + zentrale Pfade laden
|
||||||
|
ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR # zentrale Verzeichnisse
|
||||||
|
|
||||||
|
# Fortschritt hübsch, wenn verfügbar
|
||||||
|
try:
|
||||||
|
from tqdm import tqdm
|
||||||
|
_HAS_TQDM = True
|
||||||
|
except Exception:
|
||||||
|
_HAS_TQDM = False
|
||||||
|
|
||||||
|
# ---------- Performance Tweaks ----------
|
||||||
|
torch.set_float32_matmul_precision("high")
|
||||||
|
cv2.setUseOptimized(True)
|
||||||
|
|
||||||
|
# ---------- Hilfsfunktionen ----------
|
||||||
|
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
    cx = (x1 + x2) * 0.5
    cy = (y1 + y2) * 0.5
    w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
    h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
    side = max(w, h, float(min_crop))
    half = side * 0.5

    sx1 = int(max(0, round(cx - half)))
    sy1 = int(max(0, round(cy - half)))
    sx2 = int(min(W, round(cx + half)))
    sy2 = int(min(H, round(cy + half)))

    side_w = max(0, sx2 - sx1)
    side_h = max(0, sy2 - sy1)
    side = max(2, min(side_w, side_h))
    sx2 = sx1 + side
    sy2 = sy1 + side
    return sx1, sy1, sx2, sy2
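# Hedged, illustrative example (not part of the original source): for a face box
# (100, 100, 200, 200) in a 1920x1080 frame with margin_scale=0.4 and min_crop=160,
# the expanded square is 180 px wide, so
#   make_square_crop(100, 100, 200, 200, 1920, 1080, 0.4, 160)
# returns (60, 60, 240, 240).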
|
||||||
|
|
||||||
|
|
||||||
|
def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
|
||||||
|
if not lm_lists:
|
||||||
|
return None
|
||||||
|
cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
|
||||||
|
best, best_d = None, 1e12
|
||||||
|
for lms in lm_lists:
|
||||||
|
xs = [p.x * crop_w for p in lms.landmark]
|
||||||
|
ys = [p.y * crop_h for p in lms.landmark]
|
||||||
|
cx = sum(xs) / len(xs)
|
||||||
|
cy = sum(ys) / len(ys)
|
||||||
|
d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
|
||||||
|
if d < best_d:
|
||||||
|
best, best_d = lms, d
|
||||||
|
return best
|
||||||
|
|
||||||
|
|
||||||
|
def run_mesh(face_mesh, crop_bgr, upscale_if_small):
|
||||||
|
if crop_bgr.size == 0:
|
||||||
|
return None, 0.0
|
||||||
|
ch, cw = crop_bgr.shape[:2]
|
||||||
|
if max(ch, cw) < upscale_if_small:
|
||||||
|
scale = float(upscale_if_small) / max(ch, cw)
|
||||||
|
new_w = max(1, int(round(cw * scale)))
|
||||||
|
new_h = max(1, int(round(ch * scale)))
|
||||||
|
crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||||||
|
ch, cw = new_h, new_w
|
||||||
|
rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
|
||||||
|
res = face_mesh.process(rgb)
|
||||||
|
if not res.multi_face_landmarks:
|
||||||
|
return None, 0.0
|
||||||
|
chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
|
||||||
|
if chosen is None:
|
||||||
|
return None, 0.0
|
||||||
|
mo = get_mouth_openness(chosen.landmark, ch)
|
||||||
|
return chosen, float(mo)
|
||||||
|
|
||||||
|
# ---------- Kernprozess ----------
|
||||||
|
def process_video(video_path: Path,
|
||||||
|
output_path: Path,
|
||||||
|
model: YOLO,
|
||||||
|
face_mesh,
|
||||||
|
conf_thresh: float,
|
||||||
|
frame_skip: int,
|
||||||
|
downscale: float,
|
||||||
|
expansion_1: float,
|
||||||
|
expansion_2: float,
|
||||||
|
min_crop: int,
|
||||||
|
faces_upscale: int,
|
||||||
|
imgsz: int,
|
||||||
|
device: str,
|
||||||
|
max_det: int):
|
||||||
|
print(f"🎬 Starte Detection: {video_path.name}")
|
||||||
|
cap = cv2.VideoCapture(str(video_path))
|
||||||
|
if not cap.isOpened():
|
||||||
|
logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
|
||||||
|
return
|
||||||
|
|
||||||
|
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||||
|
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||||
|
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||||
|
total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||||
|
total_to_process = None
|
||||||
|
if total_frames_raw > 0:
|
||||||
|
total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
|
||||||
|
|
||||||
|
scaled_w = max(1, int(round(orig_w * downscale)))
|
||||||
|
scaled_h = max(1, int(round(orig_h * downscale)))
|
||||||
|
|
||||||
|
data: List[Dict[str, Any]] = []
|
||||||
|
frame_idx = 0
|
||||||
|
processed_frames = 0
|
||||||
|
|
||||||
|
sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
|
||||||
|
sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
|
||||||
|
|
||||||
|
autocast_ctx = (
|
||||||
|
torch.autocast(device_type=device, dtype=torch.float16)
|
||||||
|
if device in ("mps", "cuda") else nullcontext()
|
||||||
|
)
|
||||||
|
|
||||||
|
bar = None
|
||||||
|
start_t = time.time()
|
||||||
|
if _HAS_TQDM and total_to_process:
|
||||||
|
bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
ret, frame = cap.read()
|
||||||
|
if not ret:
|
||||||
|
break
|
||||||
|
|
||||||
|
if frame_skip > 1 and (frame_idx % frame_skip != 0):
|
||||||
|
frame_idx += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
|
||||||
|
|
||||||
|
with torch.no_grad():
|
||||||
|
with autocast_ctx:
|
||||||
|
# Ultralytics 8 API: __call__ statt .predict() (beide funktionieren)
|
||||||
|
result = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
|
||||||
|
conf=conf_thresh, iou=0.5, max_det=max_det)
|
||||||
|
detections = result[0]
|
||||||
|
|
||||||
|
faces = []
|
||||||
|
for i in range(len(detections.boxes)):
|
||||||
|
box = detections.boxes[i]
|
||||||
|
conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
|
||||||
|
if conf < conf_thresh:
|
||||||
|
continue
|
||||||
|
x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
|
||||||
|
if downscale != 1.0:
|
||||||
|
x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
|
||||||
|
x1 = max(0.0, min(x1, orig_w - 1))
|
||||||
|
y1 = max(0.0, min(y1, orig_h - 1))
|
||||||
|
x2 = max(0.0, min(x2, orig_w - 1))
|
||||||
|
y2 = max(0.0, min(y2, orig_h - 1))
|
||||||
|
|
||||||
|
w = max(1.0, x2 - x1)
|
||||||
|
h = max(1.0, y2 - y1)
|
||||||
|
cx = x1 + w / 2.0
|
||||||
|
cy = y1 + h / 2.0
|
||||||
|
|
||||||
|
# Pass 1
|
||||||
|
sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
|
||||||
|
if sx2 - sx1 < 4 or sy2 - sy1 < 4:
|
||||||
|
continue
|
||||||
|
face_crop = frame[sy1:sy2, sx1:sx2]
|
||||||
|
_, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
|
||||||
|
|
||||||
|
# Pass 2 nur wenn nötig
|
||||||
|
if mouth_open == 0.0:
|
||||||
|
sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
|
||||||
|
if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
|
||||||
|
face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
|
||||||
|
_, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
|
||||||
|
|
||||||
|
faces.append({
|
||||||
|
"bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
|
||||||
|
"conf": round(conf, 3),
|
||||||
|
"center": [round(cx, 1), round(cy, 1)],
|
||||||
|
"mouth_openness": round(float(mouth_open), 3)
|
||||||
|
})
|
||||||
|
|
||||||
|
data.append({
|
||||||
|
"frame": frame_idx,
|
||||||
|
"timestamp": round(frame_idx / fps, 3),
|
||||||
|
"W": orig_w,
|
||||||
|
"H": orig_h,
|
||||||
|
"faces": faces
|
||||||
|
})
|
||||||
|
frame_idx += 1
|
||||||
|
processed_frames += 1
|
||||||
|
|
||||||
|
# Fortschritt
|
||||||
|
if bar is not None:
|
||||||
|
bar.update(1)
|
||||||
|
else:
|
||||||
|
if processed_frames % 30 == 0:
|
||||||
|
elapsed = time.time() - start_t
|
||||||
|
rate = processed_frames / max(1e-6, elapsed) # frames/sec
|
||||||
|
if total_to_process:
|
||||||
|
remaining = max(0, total_to_process - processed_frames)
|
||||||
|
eta_sec = remaining / max(1e-6, rate)
|
||||||
|
print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
|
||||||
|
f"({processed_frames/total_to_process*100:.1f}%) "
|
||||||
|
f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
|
||||||
|
else:
|
||||||
|
print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
|
||||||
|
|
||||||
|
cap.release()
|
||||||
|
if bar is not None:
|
||||||
|
bar.close()
|
||||||
|
|
||||||
|
# schön formatiertes JSON
|
||||||
|
output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
|
print(f"✅ Faces gespeichert: {output_path.name}")
|
||||||
|
|
||||||
|
# ---------- CLI ----------
|
||||||
|
def parse_args():
|
||||||
|
p = argparse.ArgumentParser(description="YOLOv8-face + MediaPipe FaceMesh → faces.json pro Clip")
|
||||||
|
# Verzeichnisse (Default aus config.py)
|
||||||
|
p.add_argument("--input-dir", type=Path, default=RAW_CLIPS_DIR, help=f"Rohclips (Default: {RAW_CLIPS_DIR})")
|
||||||
|
p.add_argument("--output-dir", type=Path, default=FACE_COMBINED_DIR, help=f"Zielordner (Default: {FACE_COMBINED_DIR})")
|
||||||
|
# Modell
|
||||||
|
p.add_argument("--model", type=Path, default=ROOT / "models" / "yolov8n-face.pt",
|
||||||
|
help="Pfad zum YOLOv8-face Modell (.pt)")
|
||||||
|
# Optimierte Defaults
|
||||||
|
p.add_argument("--conf-thresh", type=float, default=0.35)
|
||||||
|
p.add_argument("--frame-skip", type=int, default=1, help="Nur jeden n-ten Frame verarbeiten")
|
||||||
|
p.add_argument("--downscale", type=float, default=0.5, help="Eingangsframe auf Faktor verkleinern (0..1)")
|
||||||
|
p.add_argument("--expansion", type=float, default=0.4, help="Crop-Margin Pass 1 (relativ)")
|
||||||
|
p.add_argument("--expansion2", type=float, default=0.8, help="Crop-Margin Pass 2 (relativ)")
|
||||||
|
p.add_argument("--min-crop", type=int, default=160, help="Minimaler Croprand in Pixeln (quadratisch)")
|
||||||
|
p.add_argument("--faces-upscale", type=int, default=192, help="Minimale Kantenlänge für FaceMesh (bei kleineren Crops upscalen)")
|
||||||
|
p.add_argument("--imgsz", type=int, default=448)
|
||||||
|
p.add_argument("--max-det", type=int, default=20)
|
||||||
|
p.add_argument("--use-refine", action="store_true", default=False, help="MediaPipe mit refine_landmarks")
|
||||||
|
return p.parse_args()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||||
|
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# YOLO Modell & Device
|
||||||
|
yolo = YOLO(str(args.model))
|
||||||
|
if torch.backends.mps.is_available():
|
||||||
|
device = "mps"
|
||||||
|
elif torch.cuda.is_available():
|
||||||
|
device = "cuda"
|
||||||
|
else:
|
||||||
|
device = "cpu"
|
||||||
|
yolo.to(device)
|
||||||
|
print(f"🖥️ Inference-Device: {device}")
|
||||||
|
|
||||||
|
# Warmup
|
||||||
|
try:
|
||||||
|
with torch.no_grad():
|
||||||
|
dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
|
||||||
|
_ = yolo(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Eingabedateien anzeigen
|
||||||
|
videos = sorted([*args.input_dir.glob("*.mp4"), *args.input_dir.glob("*.mov"), *args.input_dir.glob("*.mkv")])
|
||||||
|
print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
|
||||||
|
if not videos:
|
||||||
|
print("⚠️ Keine passenden Videos gefunden.")
|
||||||
|
return
|
||||||
|
print("📁 Dateien:")
|
||||||
|
for p in videos:
|
||||||
|
print(" →", p.name)
|
||||||
|
|
||||||
|
outer = None
|
||||||
|
if _HAS_TQDM:
|
||||||
|
outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
|
||||||
|
|
||||||
|
with mp.solutions.face_mesh.FaceMesh(
|
||||||
|
static_image_mode=False,
|
||||||
|
max_num_faces=10,
|
||||||
|
refine_landmarks=args.use_refine,
|
||||||
|
min_detection_confidence=0.5,
|
||||||
|
min_tracking_confidence=0.5
|
||||||
|
) as face_mesh:
|
||||||
|
for vid in videos:
|
||||||
|
out = args.output_dir / f"{vid.stem}_faces.json"
|
||||||
|
process_video(
|
||||||
|
video_path=vid,
|
||||||
|
output_path=out,
|
||||||
|
model=yolo,
|
||||||
|
face_mesh=face_mesh,
|
||||||
|
conf_thresh=args.conf_thresh,
|
||||||
|
frame_skip=args.frame_skip,
|
||||||
|
downscale=args.downscale,
|
||||||
|
expansion_1=args.expansion,
|
||||||
|
expansion_2=args.expansion2,
|
||||||
|
min_crop=args.min_crop,
|
||||||
|
faces_upscale=args.faces_upscale,
|
||||||
|
imgsz=args.imgsz,
|
||||||
|
device=device,
|
||||||
|
max_det=args.max_det
|
||||||
|
)
|
||||||
|
if outer is not None:
|
||||||
|
outer.update(1)
|
||||||
|
|
||||||
|
if outer is not None:
|
||||||
|
outer.close()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
78
src/reformat/main_track_faces.py
Normal file
@ -0,0 +1,78 @@
#!/usr/bin/env python3
import logging, json
from pathlib import Path
from typing import List, Dict, Any
import sys

# Make the project root importable
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))

from config import FACE_COMBINED_DIR, FACE_CROP_CENTERS  # add SEGMENTS_DIR if you also write there


def iou(boxA, boxB):
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
    yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
    interW, interH = max(0, xB-xA), max(0, yB-yA)
    inter = interW * interH
    union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
    return inter/union if union > 0 else 0.0
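# Hedged worked example (illustrative, not from the original file): boxes are
# (x, y, w, h), so two 10x10 boxes offset by (5, 5) overlap in a 5x5 patch:
#   >>> iou([0, 0, 10, 10], [5, 5, 10, 10])
#   0.14285714285714285   # 25 / (100 + 100 - 25)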


def track_faces(faces_all: List[Dict[str, Any]], iou_thresh=0.3):
    next_id = 0
    last_boxes = {}  # track_id -> bbox
    for frame in faces_all:
        new_boxes = {}
        for face in frame["faces"]:
            box = face["bbox"]
            # match against existing tracks
            best_id, best_iou = None, 0.0
            for tid, prev_box in last_boxes.items():
                ov = iou(box, prev_box)
                if ov > best_iou:
                    best_id, best_iou = tid, ov
            if best_iou > iou_thresh:
                face["track_id"] = best_id
                new_boxes[best_id] = box
            else:
                face["track_id"] = next_id
                new_boxes[next_id] = box
                next_id += 1
        last_boxes = new_boxes
    return faces_all
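# Hedged sketch of the matching behaviour (illustrative values): if frame 0 has a
# face at bbox [100, 100, 80, 80] it becomes track 0; a frame-1 face at
# [104, 102, 80, 80] overlaps it with IoU ≈ 0.86 > 0.3 and keeps track 0, while a
# face elsewhere in the frame starts track 1.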


def main():
    # Input: detected faces per frame
    FACE_DIR = FACE_COMBINED_DIR
    # Output: e.g. computed crop centers per frame
    OUT_DIR = FACE_CROP_CENTERS
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    for f in FACE_DIR.glob("*_faces.json"):
        try:
            faces_all = json.loads(f.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"❌ Fehler beim Laden {f.name}: {e}")
            continue

        tracked = track_faces(faces_all)
        f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
        print(f"✅ Track-IDs ergänzt: {f.name}")

        # additionally write centers.json (dominant face = highest mouth_openness per frame)
        centers = []
        for fr in tracked:
            if fr["faces"]:
                best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
                centers.append([best["center"][0], best["center"][1]])
            else:
                centers.append([fr["W"]/2, fr["H"]/2])
        centers_path = f.with_name(f.stem.replace("_faces", "_centers") + ".json")
        centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
        print(f"📝 Centers gespeichert: {centers_path.name}")


if __name__ == "__main__":
    main()
|
306
src/reformat/make_segments.py
Normal file
@ -0,0 +1,306 @@
#!/usr/bin/env python3
|
||||||
|
# make_segments.py — erzeugt pro Highlight eine Zielspur (target_by_frame.json) fürs Cropping
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
import json
|
||||||
|
import argparse
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import List, Dict, Optional, Tuple
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/reformat/)
|
||||||
|
ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS, SEGMENTS_DIR
|
||||||
|
|
||||||
|
try:
|
||||||
|
from moviepy.video.io.VideoFileClip import VideoFileClip
|
||||||
|
MOVIEPY_OK = True
|
||||||
|
except Exception:
|
||||||
|
MOVIEPY_OK = False
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Hilfsstrukturen
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class FaceDet:
|
||||||
|
t: float # Sekunden
|
||||||
|
cx: float # Zentrum x (0..1)
|
||||||
|
cy: float # Zentrum y (0..1)
|
||||||
|
w: float # Breite rel. (0..1)
|
||||||
|
h: float # Höhe rel. (0..1)
|
||||||
|
track_id: Optional[int] = None
|
||||||
|
mouth_prob: Optional[float] = None
|
||||||
|
|
||||||
|
def moving_average(xs: List[float], win: int) -> List[float]:
|
||||||
|
if win <= 1 or len(xs) <= 2:
|
||||||
|
return xs[:]
|
||||||
|
# ungerade Fensterbreite erzwingen
|
||||||
|
win = win if win % 2 == 1 else win + 1
|
||||||
|
r = win // 2
|
||||||
|
out = []
|
||||||
|
for i in range(len(xs)):
|
||||||
|
a = max(0, i - r)
|
||||||
|
b = min(len(xs), i + r + 1)
|
||||||
|
out.append(sum(xs[a:b]) / (b - a))
|
||||||
|
return out
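# Hedged, illustrative example (not part of the original file): with win=3 the
# endpoints fall back to a shrunken window,
#   >>> moving_average([1, 2, 3, 4, 5], 3)
#   [1.5, 2.0, 3.0, 4.0, 4.5]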
|
||||||
|
|
||||||
|
def clamp01(x: float) -> float:
|
||||||
|
return max(0.0, min(1.0, x))
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Lesen möglicher Eingabeformate (robust, schema-tolerant)
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _parse_face_like(obj: Dict, t: float, W: float | None = None, H: float | None = None) -> FaceDet:
|
||||||
|
"""
|
||||||
|
Erwartet entweder:
|
||||||
|
- bbox=[x,y,w,h] in Pixel → wird via W,H auf 0..1 normiert
|
||||||
|
- oder bereits normierte Felder cx,cy,w,h in 0..1
|
||||||
|
Optional: track_id, mouth_prob / mouth_open / talking_prob
|
||||||
|
"""
|
||||||
|
if "bbox" in obj and isinstance(obj["bbox"], (list, tuple)) and len(obj["bbox"]) >= 4:
|
||||||
|
x, y, w, h = [float(v) for v in obj["bbox"][:4]]
|
||||||
|
if W and H and W > 0 and H > 0:
|
||||||
|
cx = (x + w * 0.5) / W
|
||||||
|
cy = (y + h * 0.5) / H
|
||||||
|
w = w / W
|
||||||
|
h = h / H
|
||||||
|
else:
|
||||||
|
# Falls Maße fehlen: best effort, danach clampen
|
||||||
|
cx = x + w * 0.5
|
||||||
|
cy = y + h * 0.5
|
||||||
|
cx, cy = clamp01(cx), clamp01(cy)
|
||||||
|
w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
|
||||||
|
else:
|
||||||
|
cx = float(obj.get("cx", 0.5))
|
||||||
|
cy = float(obj.get("cy", 0.5))
|
||||||
|
w = float(obj.get("w", 0.3))
|
||||||
|
h = float(obj.get("h", 0.3))
|
||||||
|
cx, cy = clamp01(cx), clamp01(cy)
|
||||||
|
w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
|
||||||
|
|
||||||
|
track_id = obj.get("track_id")
|
||||||
|
mouth_prob = obj.get("mouth_prob") or obj.get("mouth_open") or obj.get("talking_prob")
|
||||||
|
mouth_prob = None if mouth_prob is None else float(mouth_prob)
|
||||||
|
|
||||||
|
return FaceDet(t=t, cx=cx, cy=cy, w=w, h=h, track_id=track_id, mouth_prob=mouth_prob)
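# Hedged example (illustrative values): a pixel bbox is normalised via W and H,
#   _parse_face_like({"bbox": [100, 200, 300, 400]}, t=1.0, W=1920, H=1080)
# yields roughly cx=0.130, cy=0.370, w=0.156, h=0.370; track_id and mouth_prob stay None.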
|
||||||
|
|
||||||
|
|
||||||
|
def load_faces_or_centers(stem: str, fps_hint: float | None = None) -> List[FaceDet]:
|
||||||
|
"""
|
||||||
|
Lädt die beste verfügbare Gesichts/Center-Quelle für ein Highlight.
|
||||||
|
Suchreihenfolge:
|
||||||
|
1) FACE_COMBINED_DIR/{stem}_faces.json (Liste von Frames mit 'faces')
|
||||||
|
2) FACE_CROP_CENTERS/{stem}_centers.json
|
||||||
|
- akzeptiert entweder [[cx,cy], ...] oder [{t,cx,cy,w,h}, ...]
|
||||||
|
"""
|
||||||
|
candidates = [
|
||||||
|
(FACE_COMBINED_DIR / f"{stem}_faces.json", "faces"),
|
||||||
|
(FACE_CROP_CENTERS / f"{stem}_centers.json", "centers"),
|
||||||
|
]
|
||||||
|
path = kind = None
|
||||||
|
for p, k in candidates:
|
||||||
|
if p.exists():
|
||||||
|
path, kind = p, k
|
||||||
|
break
|
||||||
|
|
||||||
|
if path is None:
|
||||||
|
print(f"⚠️ Keine Face/Centers-Datei gefunden für {stem}. Fallback später → (0.5,0.5).")
|
||||||
|
return []
|
||||||
|
|
||||||
|
try:
|
||||||
|
raw = path.read_text(encoding="utf-8")
|
||||||
|
data = json.loads(raw)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Konnte {path.name} nicht lesen: {e}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
dets: List[FaceDet] = []
|
||||||
|
|
||||||
|
# 1) Liste von Frames: [{ "W":..,"H":..,"timestamp"/"t":.., "faces":[...] }, ...]
|
||||||
|
if isinstance(data, list) and data and isinstance(data[0], dict) and "faces" in data[0]:
|
||||||
|
for fr in data:
|
||||||
|
W = float(fr.get("W") or 0.0)
|
||||||
|
H = float(fr.get("H") or 0.0)
|
||||||
|
t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
|
||||||
|
for f in fr.get("faces", []):
|
||||||
|
dets.append(_parse_face_like(f, t, W, H))
|
||||||
|
|
||||||
|
# 2) Dict mit "frames": [...]
|
||||||
|
elif isinstance(data, dict) and "frames" in data:
|
||||||
|
for fr in data["frames"]:
|
||||||
|
W = float(fr.get("W") or 0.0)
|
||||||
|
H = float(fr.get("H") or 0.0)
|
||||||
|
t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
|
||||||
|
for f in fr.get("faces", []):
|
||||||
|
dets.append(_parse_face_like(f, t, W, H))
|
||||||
|
|
||||||
|
# 3) centers.json als Liste von Listen: [[cx,cy], ...]
|
||||||
|
elif isinstance(data, list) and data and isinstance(data[0], (list, tuple)) and len(data[0]) >= 2:
|
||||||
|
fps = float(fps_hint or 25.0)
|
||||||
|
for i, pair in enumerate(data):
|
||||||
|
cx, cy = float(pair[0]), float(pair[1])
|
||||||
|
dets.append(FaceDet(t=i / fps, cx=clamp01(cx), cy=clamp01(cy), w=0.6, h=0.6))
|
||||||
|
|
||||||
|
# 4) Liste von Dicts mit evtl. bereits normierten Feldern
|
||||||
|
elif isinstance(data, list) and data and isinstance(data[0], dict):
|
||||||
|
for item in data:
|
||||||
|
t = float(item.get("t") or item.get("time") or 0.0)
|
||||||
|
dets.append(_parse_face_like(item, t))
|
||||||
|
|
||||||
|
else:
|
||||||
|
print(f"⚠️ Unbekanntes JSON-Format in {path.name}.")
|
||||||
|
return []
|
||||||
|
|
||||||
|
# filtern & sortieren
|
||||||
|
dets = [d for d in dets if 0.0 <= d.cx <= 1.0 and 0.0 <= d.cy <= 1.0]
|
||||||
|
dets.sort(key=lambda d: d.t)
|
||||||
|
print(f"✅ {len(dets)} Detektionen aus {path.name} ({kind}).")
|
||||||
|
return dets
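# Hedged note (illustrative): branch 3 accepts a plain list of pairs such as
# [[0.48, 0.52], [0.49, 0.52], ...] and timestamps them as i / fps_hint; the values
# are clamped to 0..1, so this path assumes normalised (not pixel) centre coordinates.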
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Zielspur berechnen
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def build_target_by_frame(
|
||||||
|
faces: List[FaceDet],
|
||||||
|
duration: float,
|
||||||
|
fps: float,
|
||||||
|
smooth_win: int = 7
|
||||||
|
) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Wählt pro Frame eine Zielposition (cx,cy,w,h).
|
||||||
|
Heuristik:
|
||||||
|
- bevorzuge Gesicht mit höchster mouth_prob (wenn vorhanden),
|
||||||
|
- sonst größtes Bounding-Box-Areal (w*h),
|
||||||
|
- halte IDs stabil (nicht zu häufige Sprünge).
|
||||||
|
Anschließend leichte Glättung (Moving Average) der Zentren/Größen.
|
||||||
|
"""
|
||||||
|
if fps <= 0:
|
||||||
|
fps = 25.0
|
||||||
|
total_frames = max(1, int(round(duration * fps)))
|
||||||
|
if not faces:
|
||||||
|
# Fallback: center track
|
||||||
|
return [{"frame": i, "t": round(i / fps, 4), "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6} for i in range(total_frames)]
|
||||||
|
|
||||||
|
frame_targets: List[Tuple[float, float, float, float]] = [] # (cx, cy, w, h)
|
||||||
|
last_track: Optional[int] = None
|
||||||
|
|
||||||
|
# lineare Suche über faces (bei Bedarf später bucketisieren)
|
||||||
|
for i in range(total_frames):
|
||||||
|
t = i / fps
|
||||||
|
lo, hi = t - 1.0 / fps, t + 1.0 / fps
|
||||||
|
|
||||||
|
cand: List[FaceDet] = [d for d in faces if lo <= d.t <= hi]
|
||||||
|
if not cand:
|
||||||
|
# Nimm den zeitlich nächsten
|
||||||
|
nearest = min(faces, key=lambda d: abs(d.t - t))
|
||||||
|
cand = [nearest]
|
||||||
|
|
||||||
|
def score(d: FaceDet) -> Tuple[float, float, float]:
|
||||||
|
mouth = -1.0 if d.mouth_prob is None else float(d.mouth_prob) # None schlechter als 0
|
||||||
|
area = float(d.w) * float(d.h)
|
||||||
|
stable = 1.0 if (last_track is not None and d.track_id == last_track) else 0.0
|
||||||
|
return (mouth, area, stable)
|
||||||
|
|
||||||
|
cand.sort(key=score, reverse=True)
|
||||||
|
best = cand[0]
|
||||||
|
if best.track_id is not None:
|
||||||
|
last_track = best.track_id
|
||||||
|
frame_targets.append((best.cx, best.cy, best.w, best.h))
|
||||||
|
|
||||||
|
# Glätten
|
||||||
|
cxs = moving_average([c for c, _, _, _ in frame_targets], smooth_win)
|
||||||
|
cys = moving_average([c for _, c, _, _ in frame_targets], smooth_win)
|
||||||
|
ws = moving_average([w for *_, w, _ in frame_targets], max(3, smooth_win // 2))
|
||||||
|
hs = moving_average([h for *_, _, h in frame_targets], max(3, smooth_win // 2))
|
||||||
|
|
||||||
|
out = []
|
||||||
|
for i, (cx, cy, w, h) in enumerate(zip(cxs, cys, ws, hs)):
|
||||||
|
t = i / fps
|
||||||
|
out.append({
|
||||||
|
"frame": i,
|
||||||
|
"t": round(t, 4),
|
||||||
|
"cx": round(clamp01(cx), 4),
|
||||||
|
"cy": round(clamp01(cy), 4),
|
||||||
|
"w": round(max(0.05, min(1.0, w)), 4),
|
||||||
|
"h": round(max(0.05, min(1.0, h)), 4),
|
||||||
|
})
|
||||||
|
return out
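# Hedged example of one output record (illustrative values):
#   {"frame": 0, "t": 0.0, "cx": 0.5123, "cy": 0.4871, "w": 0.3412, "h": 0.3398}
# i.e. one smoothed, normalised target per frame over the whole clip duration.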
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# I/O
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def write_target_json(stem: str, target: List[Dict]) -> Path:
|
||||||
|
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
out_path = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
|
||||||
|
out_path.write_text(json.dumps(target, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
|
return out_path
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# CLI / Main
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
p = argparse.ArgumentParser(description="Erzeugt target_by_frame.json aus Face/Center-Detektionen für Cropping.")
|
||||||
|
p.add_argument("--pattern", type=str, default="highlight_*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: highlight_*.mp4)")
|
||||||
|
p.add_argument("--fps", type=float, default=0.0, help="FPS erzwingen (0 = aus Video lesen).")
|
||||||
|
p.add_argument("--smooth", type=int, default=7, help="Fensterbreite für Moving-Average-Glättung (ungerade).")
|
||||||
|
p.add_argument("--overwrite", action="store_true", help="Existierende target_by_frame.json überschreiben.")
|
||||||
|
return p.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
if not MOVIEPY_OK:
|
||||||
|
raise RuntimeError("moviepy ist nicht installiert. Bitte `pip install moviepy` ausführen.")
|
||||||
|
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
vids = sorted(RAW_CLIPS_DIR.glob(args.pattern))
|
||||||
|
if not vids:
|
||||||
|
print(f"⚠️ Keine Rohclips gefunden in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'.")
|
||||||
|
return
|
||||||
|
|
||||||
|
print(f"🔎 Finde {len(vids)} Clips …")
|
||||||
|
|
||||||
|
for vid in vids:
|
||||||
|
stem = vid.stem # z. B. highlight_3
|
||||||
|
out_json = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
|
||||||
|
if out_json.exists() and not args.overwrite:
|
||||||
|
print(f"⏭️ {out_json.name} existiert bereits – überspringe (nutze --overwrite zum Ersetzen).")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Video-Metadaten
|
||||||
|
try:
|
||||||
|
with VideoFileClip(str(vid)) as V:
|
||||||
|
duration = float(V.duration or 0.0)
|
||||||
|
fps = float(args.fps or (V.fps or 25.0))
|
||||||
|
except Exception as e:
|
||||||
|
print(f"❌ Kann Video {vid.name} nicht öffnen: {e} – Fallback duration/fps (10s/25fps).")
|
||||||
|
duration, fps = 10.0, (args.fps or 25.0)
|
||||||
|
|
||||||
|
# Face/Centers laden (fps_hint durchreichen, wichtig für centers-Listen)
|
||||||
|
faces = load_faces_or_centers(stem, fps_hint=fps)
|
||||||
|
|
||||||
|
# Zielspur bauen
|
||||||
|
target = build_target_by_frame(faces, duration=duration, fps=fps, smooth_win=args.smooth)
|
||||||
|
|
||||||
|
# Schreiben
|
||||||
|
out = write_target_json(stem, target)
|
||||||
|
print(f"💾 geschrieben: {out}")
|
||||||
|
|
||||||
|
print("🎉 Fertig.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
118
src/reformat/new/analyze_mouth_activity.py
Normal file
@ -0,0 +1,118 @@
#!/usr/bin/env python3
|
||||||
|
# src/reformat/new/analyze_mouth_activity.py
|
||||||
|
import logging
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any, Tuple, Optional
|
||||||
|
|
||||||
|
# OpenAI optional; aktuell nicht genutzt (Flag fehlt bewusst)
|
||||||
|
# from openai import OpenAI
|
||||||
|
|
||||||
|
# === HARTE DEFAULTS: einfach Play drücken ===
|
||||||
|
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
|
||||||
|
RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
|
||||||
|
FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||||
|
TIMED_DIR = PROJECT_ROOT / "data" / "transkripte"
|
||||||
|
CENTERS_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
|
||||||
|
|
||||||
|
def parse_timed_file(path: Path) -> List[Tuple[float, float]]:
    """
    Expects lines such as:
    [00:00.00 - 00:05.20] Text...
    Returns a list of (start_sec, end_sec). If nothing matches: empty list.
    """
    import re
    rx = re.compile(r"\[(\d+):(\d+)\.(\d+)\s*-\s*(\d+):(\d+)\.(\d+)\]")
    segs = []
    try:
        for line in path.read_text(encoding="utf-8").splitlines():
            m = rx.search(line)
            if not m:
                continue
            smin, ssec, sms, emin, esec, ems = map(int, m.groups())
            start = smin * 60 + ssec + sms / 100.0
            end = emin * 60 + esec + ems / 100.0
            if end > start:
                segs.append((start, end))
    except FileNotFoundError:
        pass
    return segs
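# Hedged example of the assumed *_timed.txt format (illustrative line): a line like
#   [00:12.34 - 00:15.60] Hallo zusammen
# parses to the segment (12.34, 15.6); lines that do not match the pattern are skipped.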
|
||||||
|
|
||||||
|
def select_speaker_center(faces: List[Dict[str, Any]]) -> Tuple[float, float]:
|
||||||
|
"""Priorität: mouth_openness, Fallback: größte Fläche; sonst Bildmitte."""
|
||||||
|
if not faces:
|
||||||
|
return (960.0, 540.0)
|
||||||
|
def area(f):
|
||||||
|
bx = f.get("bbox",[0,0,0,0]); return float(bx[2]*bx[3])
|
||||||
|
best = max(
|
||||||
|
faces,
|
||||||
|
key=lambda f: (float(f.get("mouth_openness", 0.0)), area(f))
|
||||||
|
)
|
||||||
|
x, y, w, h = best["bbox"]
|
||||||
|
return (x + w/2.0, y + h/2.0)
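# Hedged, illustrative example: given two faces with bboxes [100, 100, 200, 200]
# (mouth_openness 0.8) and [800, 100, 300, 300] (mouth_openness 0.1), the first wins
# on mouth_openness despite its smaller area, and the function returns (200.0, 200.0).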
|
||||||
|
|
||||||
|
def load_json(path: Path):
|
||||||
|
import json
|
||||||
|
return json.loads(path.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
def save_json(obj, path: Path):
|
||||||
|
import json
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||||
|
|
||||||
|
def process_one(base_name: str) -> bool:
|
||||||
|
faces_path = FACES_DIR / f"{base_name}_faces.json"
|
||||||
|
timed_path = TIMED_DIR / f"{base_name}_timed.txt"
|
||||||
|
centers_path = CENTERS_DIR / f"{base_name}_centers.json"
|
||||||
|
|
||||||
|
if not faces_path.exists():
|
||||||
|
logging.warning("Skip %-18s | Faces fehlen: %s", base_name, faces_path)
|
||||||
|
return False
|
||||||
|
if centers_path.exists():
|
||||||
|
logging.info("Skip %-18s | Centers existieren schon: %s", base_name, centers_path.name)
|
||||||
|
return True
|
||||||
|
|
||||||
|
try:
|
||||||
|
face_data: List[Dict[str, Any]] = load_json(faces_path)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error("Fehler beim Lesen von %s: %s", faces_path, e)
|
||||||
|
return False
|
||||||
|
|
||||||
|
segments = parse_timed_file(timed_path)
|
||||||
|
if not segments:
|
||||||
|
logging.warning("[%s] Keine Segmente erkannt oder Datei fehlt: %s", base_name, timed_path.name)
|
||||||
|
|
||||||
|
centers: List[List[float]] = []
|
||||||
|
for entry in face_data:
|
||||||
|
faces = entry.get("faces", [])
|
||||||
|
cx, cy = select_speaker_center(faces)
|
||||||
|
centers.append([float(cx), float(cy)])
|
||||||
|
|
||||||
|
save_json(centers, centers_path)
|
||||||
|
logging.info("OK %-18s | Centers gespeichert: %s (frames=%d)", base_name, centers_path.name, len(centers))
|
||||||
|
return True
|
||||||
|
|
||||||
|
def main():
|
||||||
|
logging.basicConfig(
|
||||||
|
format="%(asctime)s %(levelname)s: %(message)s",
|
||||||
|
level=logging.INFO
|
||||||
|
)
|
||||||
|
|
||||||
|
if not RAW_DIR.exists():
|
||||||
|
logging.error("RAW_DIR existiert nicht: %s", RAW_DIR)
|
||||||
|
return
|
||||||
|
|
||||||
|
clips = sorted(RAW_DIR.glob("*.mp4"))
|
||||||
|
if not clips:
|
||||||
|
logging.warning("Keine Clips gefunden in %s", RAW_DIR)
|
||||||
|
return
|
||||||
|
|
||||||
|
logging.info("Analyze (mouth) Batch-Mode: %d Clips", len(clips))
|
||||||
|
ok = 0
|
||||||
|
for clip in clips:
|
||||||
|
base = clip.stem
|
||||||
|
if process_one(base):
|
||||||
|
ok += 1
|
||||||
|
logging.info("Fertig. %d/%d Clips verarbeitet.", ok, len(clips))
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
294
src/reformat/new/main_apply_crop.py
Normal file
@ -0,0 +1,294 @@
#!/usr/bin/env python3
|
||||||
|
# src/reformat/new/main_apply_crop.py
|
||||||
|
from __future__ import annotations
|
||||||
|
import logging, json, math, subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, Tuple, List, Dict, Any
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
# ==== Pfade =================================================================
|
||||||
|
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
|
||||||
|
INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
|
||||||
|
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||||
|
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments"
|
||||||
|
OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
|
||||||
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
OUT_W, OUT_H = 1080, 1920
|
||||||
|
TARGET_AR = OUT_W / OUT_H # 0.5625
|
||||||
|
|
||||||
|
# ==== Debug =================================================================
|
||||||
|
DEBUG_MODE = False
|
||||||
|
DEBUG_SCALE = 0.6
|
||||||
|
DRAW_GUIDES = True
|
||||||
|
|
||||||
|
# ==== Smooth / Switch =======================================================
|
||||||
|
MEDIAN_WIN = 5
|
||||||
|
EMA_ALPHA = 0.22
|
||||||
|
DEADBAND_PX = 8.0
|
||||||
|
SWITCH_COOLDOWN_FRAMES = 12 # kurze Ruhe nach Segmentwechsel
|
||||||
|
ZOOM_PAD_FRAC = 0.10
|
||||||
|
|
||||||
|
# ==== Scene-Cut-Erkennung ===================================================
|
||||||
|
USE_CUT_DETECT = True
|
||||||
|
CUT_CORR_THRESH = 0.65
|
||||||
|
CUT_COOLDOWN = 6
|
||||||
|
|
||||||
|
# ==== Audio-Mux =============================================================
|
||||||
|
MUX_AUDIO = True
|
||||||
|
FFMPEG_BIN = "ffmpeg"
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
def clamp(v, lo, hi): return max(lo, min(hi, v))


def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int) -> tuple[int, int, int, int]:
    """
    Returns a 9:16 crop rectangle (x, y, w, h) around (cx, cy).
    - The aspect ratio ALWAYS stays exactly 9:16 (no squeeze)
    - ZOOM_PAD_FRAC acts as a uniform scale on width and height
    - The rectangle stays fully inside the frame
    """
    src_ar = src_w / src_h

    if src_ar >= TARGET_AR:
        base_h = src_h
        base_w = int(round(base_h * TARGET_AR))
    else:
        base_w = src_w
        base_h = int(round(base_w / TARGET_AR))

    desired_scale = 1.0 + ZOOM_PAD_FRAC
    max_scale_w = src_w / base_w
    max_scale_h = src_h / base_h
    s = min(desired_scale, max_scale_w, max_scale_h)

    w = int(round(base_w * s))
    h = int(round(base_h * s))

    half_w, half_h = w // 2, h // 2

    cx = clamp(cx, half_w, src_w - half_w)
    cy = clamp(cy, half_h, src_h - half_h)

    x = int(round(cx - half_w))
    y = int(round(cy - half_h))
    return x, y, w, h
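# Hedged worked example (illustrative, not from the original source): with
# TARGET_AR = 0.5625 and ZOOM_PAD_FRAC = 0.10, a landscape 1920x1080 frame gives a
# base rect of 608x1080; the pad cannot grow past the frame height, so the scale
# clamps to 1.0 and
#   compute_crop_rect(960, 540, 1920, 1080)  ->  (656, 0, 608, 1080)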
|
||||||
|
|
||||||
|
def draw_center(img, pt, color, label=None):
|
||||||
|
if pt is None: return
|
||||||
|
x, y = int(pt[0]), int(pt[1])
|
||||||
|
cv2.circle(img, (x, y), 6, color, -1)
|
||||||
|
if label:
|
||||||
|
cv2.putText(img, label, (x + 8, y - 8),
|
||||||
|
cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
|
||||||
|
|
||||||
|
def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
|
||||||
|
a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
|
||||||
|
b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
|
||||||
|
ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
|
||||||
|
hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
|
||||||
|
cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
|
||||||
|
corr = cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL)
|
||||||
|
return float((corr + 1.0)/2.0)
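# Note (hedged): HISTCMP_CORREL lies in [-1, 1]; the (corr + 1) / 2 mapping rescales
# it to [0, 1], so near-identical frames score ~1.0 and a hard cut typically drops
# the score below CUT_CORR_THRESH (0.65 above).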
|
||||||
|
|
||||||
|
def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
|
||||||
|
cmd = [
|
||||||
|
FFMPEG_BIN, "-y",
|
||||||
|
"-i", str(src_video),
|
||||||
|
"-i", str(silent_video),
|
||||||
|
"-map", "1:v:0",
|
||||||
|
"-map", "0:a:0?",
|
||||||
|
"-c:v", "copy",
|
||||||
|
"-c:a", "aac", "-b:a", "192k",
|
||||||
|
"-shortest",
|
||||||
|
str(out_video),
|
||||||
|
]
|
||||||
|
subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
|
|
||||||
|
def load_faces(name: str) -> List[Dict[str,Any]]:
|
||||||
|
p = FACE_COMBINED_DIR / f"{name}_faces.json"
|
||||||
|
return json.loads(p.read_text(encoding="utf-8"))
|
||||||
|
|
||||||
|
def load_segments(name: str, total_frames: int) -> List[Optional[int]]:
|
||||||
|
seg_p = SEGMENTS_DIR / f"{name}_segments.json"
|
||||||
|
map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
|
||||||
|
if map_p.exists():
|
||||||
|
target = json.loads(map_p.read_text(encoding="utf-8"))
|
||||||
|
if len(target) < total_frames:
|
||||||
|
target += [target[-1] if target else None] * (total_frames - len(target))
|
||||||
|
return target[:total_frames]
|
||||||
|
if seg_p.exists():
|
||||||
|
segs = json.loads(seg_p.read_text(encoding="utf-8"))
|
||||||
|
target = [None]*total_frames
|
||||||
|
for s in segs:
|
||||||
|
a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
|
||||||
|
for t in range(max(0,a), min(total_frames, b+1)):
|
||||||
|
target[t] = tid
|
||||||
|
return target
|
||||||
|
return [None]*total_frames
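# Hedged example of the assumed *_segments.json schema (illustrative values):
#   [{"start_f": 0, "end_f": 120, "track_id": 2}, {"start_f": 121, "end_f": 300, "track_id": 0}]
# which expands to one track id per frame; a *_target_by_frame.json, if present, takes priority.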
|
||||||
|
|
||||||
|
def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
|
||||||
|
if target_tid is None:
|
||||||
|
return fallback
|
||||||
|
faces = faces_frame.get("faces", [])
|
||||||
|
for f in faces:
|
||||||
|
if int(f.get("track_id", -1)) == int(target_tid):
|
||||||
|
x,y,w,h = f.get("bbox", [None,None,None,None])
|
||||||
|
if None not in (x,y,w,h):
|
||||||
|
return (float(x + w/2), float(y + h/2))
|
||||||
|
return fallback
|
||||||
|
|
||||||
|
def main():
|
||||||
|
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||||
|
clips = sorted(list(INPUT_VIDEO_DIR.glob("*.mp4")) + list(INPUT_VIDEO_DIR.glob("*.mov")))
|
||||||
|
if not clips:
|
||||||
|
print(f"⚠️ Keine Clips in {INPUT_VIDEO_DIR}")
|
||||||
|
return
|
||||||
|
|
||||||
|
for video_path in clips:
|
||||||
|
name = video_path.stem
|
||||||
|
faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
|
||||||
|
if not faces_path.exists():
|
||||||
|
print(f"⏭️ Skip (keine Faces): {faces_path.name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
cap = cv2.VideoCapture(str(video_path))
|
||||||
|
if not cap.isOpened():
|
||||||
|
print(f"❌ Kann Video nicht öffnen: {video_path.name}")
|
||||||
|
continue
|
||||||
|
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||||
|
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||||
|
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||||
|
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||||
|
|
||||||
|
faces_all = load_faces(name)
|
||||||
|
if len(faces_all) < total:
|
||||||
|
faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
|
||||||
|
|
||||||
|
target_by_frame = load_segments(name, total)
|
||||||
|
|
||||||
|
out_path = OUTPUT_DIR / f"{name}_9x16.mp4"
|
||||||
|
if out_path.exists():
|
||||||
|
print(f"⏭️ Skip: Output existiert bereits → {out_path.name}")
|
||||||
|
cap.release()
|
||||||
|
continue
|
||||||
|
|
||||||
|
writer = cv2.VideoWriter(
|
||||||
|
str(out_path),
|
||||||
|
cv2.VideoWriter_fourcc(*"mp4v"),
|
||||||
|
fps,
|
||||||
|
(OUT_W, OUT_H)
|
||||||
|
)
|
||||||
|
|
||||||
|
median_buf = deque(maxlen=max(3, MEDIAN_WIN if MEDIAN_WIN % 2 else MEDIAN_WIN+1))
|
||||||
|
ema_center: Optional[Tuple[float,float]] = None
|
||||||
|
last_center: Optional[Tuple[float,float]] = (width/2, height/2)
|
||||||
|
switch_cooldown = 0
|
||||||
|
|
||||||
|
prev_small = None
|
||||||
|
cut_cd = 0
|
||||||
|
|
||||||
|
print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
|
||||||
|
|
||||||
|
for t in range(total):
|
||||||
|
ret, frame = cap.read()
|
||||||
|
if not ret: break
|
||||||
|
|
||||||
|
target_tid = target_by_frame[t] if t < len(target_by_frame) else None
|
||||||
|
faces_fr = faces_all[t] if t < len(faces_all) else {"faces":[]}
|
||||||
|
desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
|
||||||
|
|
||||||
|
if USE_CUT_DETECT:
|
||||||
|
small = cv2.resize(frame, (128, 72))
|
||||||
|
if prev_small is not None:
|
||||||
|
corr = scene_corr(prev_small, small)
|
||||||
|
if corr < CUT_CORR_THRESH:
|
||||||
|
ema_center = desired
|
||||||
|
last_center = desired
|
||||||
|
switch_cooldown = SWITCH_COOLDOWN_FRAMES
|
||||||
|
cut_cd = CUT_COOLDOWN
|
||||||
|
prev_small = small
|
||||||
|
|
||||||
|
median_buf.append(desired)
|
||||||
|
if len(median_buf) >= 3:
|
||||||
|
xs = sorted(p[0] for p in median_buf)
|
||||||
|
ys = sorted(p[1] for p in median_buf)
|
||||||
|
m = len(median_buf)//2
|
||||||
|
desired_f = (xs[m], ys[m])
|
||||||
|
else:
|
||||||
|
desired_f = desired
|
||||||
|
|
||||||
|
if t > 0:
|
||||||
|
prev_tid = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
|
||||||
|
else:
|
||||||
|
prev_tid = target_tid
|
||||||
|
|
||||||
|
if ema_center is None:
|
||||||
|
ema_center = desired_f
|
||||||
|
if last_center is None:
|
||||||
|
last_center = desired_f
|
||||||
|
|
||||||
|
if target_tid != prev_tid:
|
||||||
|
ema_center = desired_f
|
||||||
|
last_center = desired_f
|
||||||
|
switch_cooldown = SWITCH_COOLDOWN_FRAMES
|
||||||
|
else:
|
||||||
|
dx = desired_f[0] - ema_center[0]
|
||||||
|
dy = desired_f[1] - ema_center[1]
|
||||||
|
dist = math.hypot(dx, dy)
|
||||||
|
if cut_cd > 0:
|
||||||
|
ema_center = desired_f
|
||||||
|
cut_cd -= 1
|
||||||
|
else:
|
||||||
|
if dist > DEADBAND_PX:
|
||||||
|
ema_center = (ema_center[0] + dx*EMA_ALPHA,
|
||||||
|
ema_center[1] + dy*EMA_ALPHA)
|
||||||
|
|
||||||
|
last_center = desired_f
|
||||||
|
|
||||||
|
# neuer 9:16 Crop
|
||||||
|
x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height)
|
||||||
|
cropped = frame[y:y+h, x:x+w]
|
||||||
|
if cropped.size == 0: cropped = frame
|
||||||
|
final = cv2.resize(cropped, (OUT_W, OUT_H), interpolation=cv2.INTER_AREA)
|
||||||
|
writer.write(final)
|
||||||
|
|
||||||
|
if DEBUG_MODE:
|
||||||
|
dbg = frame.copy()
|
||||||
|
cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
|
||||||
|
if DRAW_GUIDES:
|
||||||
|
draw_center(dbg, desired, (128,128,255), "desired")
|
||||||
|
draw_center(dbg, desired_f, (255,255, 0), "median")
|
||||||
|
draw_center(dbg, ema_center, ( 0,255,255), "ema")
|
||||||
|
cv2.putText(dbg, f"t={t+1}/{total} tid={target_tid}",
|
||||||
|
(12, height-14), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
|
||||||
|
disp = cv2.resize(dbg, (int(width*DEBUG_SCALE), int(height*DEBUG_SCALE)))
|
||||||
|
cv2.imshow("Apply Debug", disp)
|
||||||
|
if cv2.waitKey(1) & 0xFF == ord("q"):
|
||||||
|
print("🛑 Abgebrochen (q).")
|
||||||
|
break
|
||||||
|
|
||||||
|
writer.release()
|
||||||
|
cap.release()
|
||||||
|
|
||||||
|
if MUX_AUDIO:
|
||||||
|
tmp = out_path.with_suffix(".tmp.mp4")
|
||||||
|
try:
|
||||||
|
out_path.rename(tmp)
|
||||||
|
mux_audio_from_source(video_path, tmp, out_path)
|
||||||
|
finally:
|
||||||
|
if tmp.exists():
|
||||||
|
try: tmp.unlink()
|
||||||
|
except: pass
|
||||||
|
print(f"✅ Fertig (mit Audio): {out_path.name}")
|
||||||
|
else:
|
||||||
|
print(f"✅ Fertig: {out_path.name}")
|
||||||
|
|
||||||
|
if DEBUG_MODE:
|
||||||
|
cv2.destroyAllWindows()
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
319
src/reformat/new/main_detect_faces.py
Normal file
@ -0,0 +1,319 @@
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import logging
|
||||||
|
import json
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from contextlib import nullcontext
|
||||||
|
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import torch
|
||||||
|
from ultralytics import YOLO
|
||||||
|
import mediapipe as mp
|
||||||
|
|
||||||
|
# Fortschritt hübsch, wenn verfügbar
|
||||||
|
try:
|
||||||
|
from tqdm import tqdm
|
||||||
|
_HAS_TQDM = True
|
||||||
|
except Exception:
|
||||||
|
_HAS_TQDM = False
|
||||||
|
|
||||||
|
from src.reformat.new.speaking import get_mouth_openness
|
||||||
|
|
||||||
|
# ---------- Performance Tweaks ----------
|
||||||
|
torch.set_float32_matmul_precision("high")
|
||||||
|
cv2.setUseOptimized(True)
|
||||||
|
|
||||||
|
# ---------- Hilfsfunktionen ----------
|
||||||
|
|
||||||
|
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
|
||||||
|
cx = (x1 + x2) * 0.5
|
||||||
|
cy = (y1 + y2) * 0.5
|
||||||
|
w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
|
||||||
|
h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
|
||||||
|
side = max(w, h, float(min_crop))
|
||||||
|
half = side * 0.5
|
||||||
|
|
||||||
|
sx1 = int(max(0, round(cx - half)))
|
||||||
|
sy1 = int(max(0, round(cy - half)))
|
||||||
|
sx2 = int(min(W, round(cx + half)))
|
||||||
|
sy2 = int(min(H, round(cy + half)))
|
||||||
|
|
||||||
|
side_w = max(0, sx2 - sx1)
|
||||||
|
side_h = max(0, sy2 - sy1)
|
||||||
|
side = max(2, min(side_w, side_h))
|
||||||
|
sx2 = sx1 + side
|
||||||
|
sy2 = sy1 + side
|
||||||
|
return sx1, sy1, sx2, sy2
|
||||||
|
|
||||||
|
|
def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
    if not lm_lists:
        return None
    cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
    best, best_d = None, 1e12
    for lms in lm_lists:
        xs = [p.x * crop_w for p in lms.landmark]
        ys = [p.y * crop_h for p in lms.landmark]
        cx = sum(xs) / len(xs)
        cy = sum(ys) / len(ys)
        d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
        if d < best_d:
            best, best_d = lms, d
    return best


def run_mesh(face_mesh, crop_bgr, upscale_if_small):
    if crop_bgr.size == 0:
        return None, 0.0
    ch, cw = crop_bgr.shape[:2]
    if max(ch, cw) < upscale_if_small:
        scale = float(upscale_if_small) / max(ch, cw)
        new_w = max(1, int(round(cw * scale)))
        new_h = max(1, int(round(ch * scale)))
        crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
        ch, cw = new_h, new_w
    rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
    res = face_mesh.process(rgb)
    if not res.multi_face_landmarks:
        return None, 0.0
    chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
    if chosen is None:
        return None, 0.0
    mo = get_mouth_openness(chosen.landmark, ch)
    return chosen, float(mo)


# ---------- Core process ----------

def process_video(video_path: Path,
                  output_path: Path,
                  model: YOLO,
                  face_mesh,
                  conf_thresh: float,
                  frame_skip: int,
                  downscale: float,
                  expansion_1: float,
                  expansion_2: float,
                  min_crop: int,
                  faces_upscale: int,
                  imgsz: int,
                  device: str,
                  max_det: int):
    print(f"🎬 Starte Detection: {video_path.name}")
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
        return

    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    # If frame_skip > 1, the number of frames actually processed shrinks
    total_to_process = None
    if total_frames_raw > 0:
        total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)

    scaled_w = max(1, int(round(orig_w * downscale)))
    scaled_h = max(1, int(round(orig_h * downscale)))

    data = []
    frame_idx = 0
    processed_frames = 0

    sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
    sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0

    autocast_ctx = (
        torch.autocast(device_type=device, dtype=torch.float16)
        if device in ("mps", "cuda") else nullcontext()
    )

    # Per-video progress bar
    bar = None
    start_t = time.time()
    if _HAS_TQDM:
        bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        if frame_skip > 1 and (frame_idx % frame_skip != 0):
            frame_idx += 1
            continue

        frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)

        with torch.no_grad():
            with autocast_ctx:
                detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
                                   conf=conf_thresh, iou=0.5, max_det=max_det)[0]

        faces = []
        for i in range(len(detections.boxes)):
            box = detections.boxes[i]
            conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
            if conf < conf_thresh:
                continue
            x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
            if downscale != 1.0:
                x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
            x1 = max(0.0, min(x1, orig_w - 1))
            y1 = max(0.0, min(y1, orig_h - 1))
            x2 = max(0.0, min(x2, orig_w - 1))
            y2 = max(0.0, min(y2, orig_h - 1))

            w = max(1.0, x2 - x1)
            h = max(1.0, y2 - y1)
            cx = x1 + w / 2.0
            cy = y1 + h / 2.0

            # Pass 1
            sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
            if sx2 - sx1 < 4 or sy2 - sy1 < 4:
                continue
            face_crop = frame[sy1:sy2, sx1:sx2]
            _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)

            # Pass 2 only if needed
            if mouth_open == 0.0:
                sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
                if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
                    face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
                    _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)

            faces.append({
                "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
                "conf": round(conf, 3),
                "center": [round(cx, 1), round(cy, 1)],
                "mouth_openness": round(float(mouth_open), 3)
            })

        data.append({
            "frame": frame_idx,
            "timestamp": round(frame_idx / fps, 3),
            "W": orig_w,
            "H": orig_h,
            "faces": faces
        })
        frame_idx += 1
        processed_frames += 1

        # Update progress
        if _HAS_TQDM:
            bar.update(1)
        else:
            # lightweight fallback: print an ETA every 30 processed frames
            if processed_frames % 30 == 0:
                elapsed = time.time() - start_t
                rate = processed_frames / max(1e-6, elapsed)  # frames/sec
                if total_to_process:
                    remaining = max(0, total_to_process - processed_frames)
                    eta_sec = remaining / max(1e-6, rate)
                    print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
                          f"({processed_frames/total_to_process*100:.1f}%) "
                          f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
                else:
                    print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")

    cap.release()
    if _HAS_TQDM and bar is not None:
        bar.close()

    output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    print(f"✅ Faces gespeichert: {output_path.name}")


def main():
    parser = argparse.ArgumentParser()
    # Directories
    parser.add_argument("--input-dir", type=Path,
                        default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips"))
    parser.add_argument("--output-dir", type=Path,
                        default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined"))
    parser.add_argument("--model", type=Path,
                        default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt"))
    # Optimized defaults (no presets needed)
    parser.add_argument("--conf-thresh", type=float, default=0.35)
    parser.add_argument("--frame-skip", type=int, default=1)
    parser.add_argument("--downscale", type=float, default=0.5)
    parser.add_argument("--expansion", type=float, default=0.4)
    parser.add_argument("--expansion2", type=float, default=0.8)
    parser.add_argument("--min-crop", type=int, default=160)
    parser.add_argument("--faces-upscale", type=int, default=192)
    parser.add_argument("--imgsz", type=int, default=448)
    parser.add_argument("--max-det", type=int, default=20)
    parser.add_argument("--use-refine", action="store_true", default=False)
    args = parser.parse_args()

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Model & device
    yolo = YOLO(str(args.model))
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    yolo.to(device)
    print(f"🖥️ Inference-Device: {device}")

    # Warm-up (reduces start-up jitter)
    try:
        with torch.no_grad():
            dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
            _ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
    except Exception:
        pass

    # List of videos (for overall progress)
    videos = sorted(args.input_dir.glob("*.mp4"))
    print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
    print("📁 Dateien:")
    for p in sorted(args.input_dir.glob("*")):
        print(" →", p.name)

    # Overall progress bar across files
    outer = None
    if _HAS_TQDM:
        outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)

    with mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=10,
        refine_landmarks=args.use_refine,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    ) as face_mesh:
        for vid in videos:
            out = args.output_dir / f"{vid.stem}_faces.json"
            process_video(
                video_path=vid,
                output_path=out,
                model=yolo,
                face_mesh=face_mesh,
                conf_thresh=args.conf_thresh,
                frame_skip=args.frame_skip,
                downscale=args.downscale,
                expansion_1=args.expansion,
                expansion_2=args.expansion2,
                min_crop=args.min_crop,
                faces_upscale=args.faces_upscale,
                imgsz=args.imgsz,
                device=device,
                max_det=args.max_det
            )
            if _HAS_TQDM and outer is not None:
                outer.update(1)

    if _HAS_TQDM and outer is not None:
        outer.close()

if __name__ == "__main__":
    main()
src/reformat/new/main_track_faces.py (Normal file, 67 lines added)
@@ -0,0 +1,67 @@
#!/usr/bin/env python3
import logging, json
from pathlib import Path
from typing import List, Dict, Any

def iou(boxA, boxB):
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
    yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
    interW, interH = max(0, xB-xA), max(0, yB-yA)
    inter = interW * interH
    union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
    return inter/union if union > 0 else 0.0

def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
    next_id = 0
    last_boxes = {}  # track_id -> bbox
    for frame in faces_all:
        new_boxes = {}
        for face in frame["faces"]:
            box = face["bbox"]
            # match against existing tracks
            best_id, best_iou = None, 0.0
            for tid, prev_box in last_boxes.items():
                ov = iou(box, prev_box)
                if ov > best_iou:
                    best_id, best_iou = tid, ov
            if best_iou > iou_thresh:
                face["track_id"] = best_id
                new_boxes[best_id] = box
            else:
                face["track_id"] = next_id
                new_boxes[next_id] = box
                next_id += 1
        last_boxes = new_boxes
    return faces_all

def main():
    PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
    FACE_DIR = PROJECT_ROOT / "data" / "face_data_combined"

    for f in FACE_DIR.glob("*_faces.json"):
        try:
            faces_all = json.loads(f.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"❌ Fehler beim Laden {f.name}: {e}")
            continue

        tracked = track_faces(faces_all)
        f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
        print(f"✅ Track-IDs ergänzt: {f.name}")

        # additionally write centers.json (dominant = highest mouth_openness per frame)
        centers = []
        for fr in tracked:
            if fr["faces"]:
                best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
                centers.append([best["center"][0], best["center"][1]])
            else:
                centers.append([fr["W"]/2, fr["H"]/2])
        centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json")
        centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
        print(f"📝 Centers gespeichert: {centers_path.name}")

if __name__ == "__main__":
    main()
src/reformat/new/make_segments.py (Normal file, 179 lines added)
@@ -0,0 +1,179 @@
#!/usr/bin/env python3
# src/reformat/new/make_segments.py
from __future__ import annotations
import json, math
from pathlib import Path
from dataclasses import dataclass
from typing import List, Dict, Any, Optional, Tuple
import numpy as np
import cv2

# ==== Paths (adapted to the project) ========================================
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"           # videos
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"   # *_faces.json
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments"                  # output
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
# ===========================================================================

# === Segment parameters ===
WIN_SEC = 1.2             # window length
STRIDE_SEC = 0.6          # stride
HYSTERESIS_FACTOR = 1.25  # a new speaker must score +25% better
MIN_SEG_SEC = 1.0         # shorter segments are merged into neighbours
CONF_MIN = 0.35           # visibility threshold
AREA_CAP_FRAC = 0.12      # cap the area bonus above 12% of the frame area

@dataclass
class Segment:
    start_f: int
    end_f: int
    track_id: Optional[int]

def robust_minmax(vals, p_lo=5, p_hi=95):
    v = np.array(vals, dtype=float)
    lo, hi = np.percentile(v, [p_lo, p_hi])
    if hi <= lo: hi = lo + 1e-6
    return float(lo), float(hi)

def score_face(face: Dict[str,Any], W: int, H: int, cx: float, cy: float,
               lo: float, hi: float) -> float:
    # robustly normalise the mouth activity
    mo = float(face.get("mouth_openness", 0.0))
    mo = (mo - lo) / (hi - lo + 1e-9)
    mo = float(np.clip(mo, 0.0, 1.0))

    x, y, w, h = map(float, face.get("bbox", [0,0,0,0]))
    conf = float(face.get("conf", 1.0))
    if conf < CONF_MIN or w <= 5 or h <= 5:  # drop very small / low-confidence faces
        return 0.0

    area = (w*h) / (W*H + 1e-9)
    size_w = min(1.0, area / AREA_CAP_FRAC)  # area weight
    fx = x + w/2; fy = y + h/2
    dist = math.hypot(fx - cx, fy - cy) / math.hypot(W/2, H/2)
    center_w = max(0.0, 1.0 - dist**2)  # slightly prefer the frame centre

    # mouth openness dominates; area and centring add stability
    return mo * (0.6 + 0.3*size_w + 0.1*center_w)

def build_segments_for_clip(faces_per_frame: List[Dict[str,Any]], fps: float) -> Tuple[List[Segment], List[Optional[int]]]:
    T = len(faces_per_frame)
    if T == 0:
        return [], []

    # frame size
    W = faces_per_frame[0].get("W") or faces_per_frame[0].get("width")
    H = faces_per_frame[0].get("H") or faces_per_frame[0].get("height")
    if not W or not H:
        # try to derive it from the bbox maxima (fallback)
        max_w = max((f["bbox"][0]+f["bbox"][2]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1920
        max_h = max((f["bbox"][1]+f["bbox"][3]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1080
        W, H = int(max_w), int(max_h)

    # collect mouth values for robust normalisation
    all_mo = [float(f.get("mouth_openness", 0.0))
              for fr in faces_per_frame for f in fr.get("faces", [])]
    lo, hi = robust_minmax(all_mo) if all_mo else (0.0, 1.0)

    win = max(1, int(round(WIN_SEC * fps)))
    stride = max(1, int(round(STRIDE_SEC * fps)))
    minseg = max(1, int(round(MIN_SEG_SEC * fps)))

    chosen_by_frame: List[Optional[int]] = [None]*T
    last_track: Optional[int] = None

    for start in range(0, T, stride):
        end = min(T, start + win)
        sums: Dict[int, float] = {}
        for t in range(start, end):
            faces = faces_per_frame[t].get("faces", [])
            if not faces: continue
            for face in faces:
                tid = face.get("track_id")
                if tid is None:
                    continue
                s = score_face(face, W, H, W/2, H/2, lo, hi)
                if s <= 0:
                    continue
                tid = int(tid)
                sums[tid] = sums.get(tid, 0.0) + s

        if not sums:
            chosen = last_track
        else:
            best_tid, best_val = max(sums.items(), key=lambda kv: kv[1])
            if last_track is None:
                chosen = best_tid
            else:
                prev_val = sums.get(last_track, 0.0)
                chosen = best_tid if best_val > prev_val * HYSTERESIS_FACTOR else last_track

        for t in range(start, end):
            chosen_by_frame[t] = chosen
        last_track = chosen

    # fill gaps
    for t in range(T):
        if chosen_by_frame[t] is None:
            chosen_by_frame[t] = last_track

    # build segments
    segs: List[Segment] = []
    cur = chosen_by_frame[0]
    seg_start = 0
    for t in range(1, T):
        if chosen_by_frame[t] != cur:
            segs.append(Segment(seg_start, t-1, cur))
            cur = chosen_by_frame[t]
            seg_start = t
    segs.append(Segment(seg_start, T-1, cur))

    # minimum length: merge too-short segments into the previous one
    out: List[Segment] = []
    for s in segs:
        if out and (s.end_f - s.start_f + 1) < minseg:
            out[-1].end_f = s.end_f
        else:
            out.append(s)

    return out, chosen_by_frame

def main():
    clips = sorted(list(RAW_DIR.glob("*.mp4")) + list(RAW_DIR.glob("*.mov")))
    if not clips:
        print(f"⚠️ Keine Videos in {RAW_DIR}")
        return

    for vid in clips:
        name = vid.stem
        faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
        if not faces_path.exists():
            print(f"⏭️ Skip (keine Faces): {faces_path.name}")
            continue

        # FPS from the video
        cap = cv2.VideoCapture(str(vid))
        if not cap.isOpened():
            print(f"❌ Kann Video nicht öffnen: {vid.name}")
            continue
        fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
        cap.release()

        try:
            faces_per_frame = json.loads(faces_path.read_text(encoding="utf-8"))
        except Exception as e:
            print(f"❌ Fehler beim Lesen {faces_path.name}: {e}")
            continue

        segs, chosen = build_segments_for_clip(faces_per_frame, fps)

        seg_out = SEGMENTS_DIR / f"{name}_segments.json"
        map_out = SEGMENTS_DIR / f"{name}_target_by_frame.json"
        seg_out.write_text(json.dumps([s.__dict__ for s in segs], ensure_ascii=False), encoding="utf-8")
        map_out.write_text(json.dumps(chosen, ensure_ascii=False), encoding="utf-8")

        print(f"✅ Segmente erzeugt: {seg_out.name} ({len(segs)} Segmente)")

if __name__ == "__main__":
    main()
src/reformat/new/smart_speaker_tracker.py (Normal file, 58 lines added)
@@ -0,0 +1,58 @@
from typing import Dict, List, Tuple, Optional
from .tracking import FaceTracker

class SmartSpeakerTracker:
    def __init__(self):
        self.face_tracker = FaceTracker()
        self.movement_per_id: Dict[int, float] = {}
        self.prev_openness: Dict[int, float] = {}
        self.confirmation_counter: Dict[int, int] = {}
        self.speaker_threshold = 3.0       # minimum accumulated lip movement required
        self.decay_factor = 0.9            # how quickly accumulated movement fades
        self.speaker_confirm_frames = 25   # how many frames a speaker must dominate
        self.speaker_id: Optional[int] = None

    def update(self, faces: List[Dict]) -> Tuple[float, float]:
        if not faces:
            return self.face_tracker.update([])

        # analyse lip movement
        for face in faces:
            id = face.get("id")
            openness = face.get("mouth_openness", 0.0)
            prev = self.prev_openness.get(id, openness)
            movement = abs(openness - prev)

            # accumulate movement with decay
            old_score = self.movement_per_id.get(id, 0.0) * self.decay_factor
            self.movement_per_id[id] = old_score + movement
            self.prev_openness[id] = openness

        # find the ID with the largest movement
        if self.movement_per_id:
            top_id = max(self.movement_per_id, key=self.movement_per_id.get)
            top_movement = self.movement_per_id[top_id]

            if top_movement >= self.speaker_threshold:
                self.confirmation_counter[top_id] = self.confirmation_counter.get(top_id, 0) + 1
                # count the others down
                for other_id in self.confirmation_counter:
                    if other_id != top_id:
                        self.confirmation_counter[other_id] = max(0, self.confirmation_counter[other_id] - 1)

                # confirmed long enough -> new speaker
                if self.confirmation_counter[top_id] >= self.speaker_confirm_frames:
                    self.speaker_id = top_id
            else:
                # nobody above the threshold -> no new speaker
                self.confirmation_counter = {k: max(0, v - 1) for k, v in self.confirmation_counter.items()}

        # a speaker exists -> centre on them
        if self.speaker_id is not None:
            for face in faces:
                if face.get("id") == self.speaker_id:
                    return tuple(face["center"])

        # fallback: stable average
        centers = [tuple(face["center"]) for face in faces]
        return self.face_tracker.update(centers)
src/reformat/new/speaker_crop_from_segments.py (Normal file, 67 lines added)
@@ -0,0 +1,67 @@
import json
from pathlib import Path
from typing import List, Dict

# === Paths ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[2]

FACES_PATH = PROJECT_ROOT / "data" / "face_data_combined" / "testVideoShort_faces.json"
SEGMENTS_PATH = PROJECT_ROOT / "data" / "transkripte" / "testVideoShort_segments.json"
OUTPUT_PATH = PROJECT_ROOT / "data" / "face_crop_centers" / "testVideoShort_centers.json"

FPS = 25  # must match the frame rate of the video

# === Load files ===
with open(FACES_PATH) as f:
    face_data = json.load(f)

with open(SEGMENTS_PATH) as f:
    segments = json.load(f)

# === Determine the centre per frame ===
frame_centers: List[List[float]] = []

for segment in segments:
    start_sec = segment["start"]
    end_sec = segment["end"]
    start_f = int(start_sec * FPS)
    end_f = int(end_sec * FPS)

    # sum up lip movement per ID within this segment
    movement: Dict[int, float] = {}
    count: Dict[int, int] = {}

    for f in range(start_f, min(end_f, len(face_data))):
        for face in face_data[f]["faces"]:
            id = face.get("id")
            openness = face.get("mouth_openness", 0.0)
            movement[id] = movement.get(id, 0.0) + openness
            count[id] = count.get(id, 0) + 1

    # compute the average
    avg_movement = {id: movement[id] / count[id] for id in movement if count[id] > 0}
    if not avg_movement:
        speaker_id = None
    else:
        speaker_id = max(avg_movement, key=avg_movement.get)

    # centre on the speaker for every frame in this segment
    for f in range(start_f, min(end_f, len(face_data))):
        faces = face_data[f].get("faces", [])
        center = [960.0, 540.0]  # fallback
        if speaker_id is not None:
            for face in faces:
                if face.get("id") == speaker_id:
                    center = face["center"][:2]
                    break
        frame_centers.append([round(center[0], 2), round(center[1], 2)])

# === Save the result ===
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(OUTPUT_PATH, "w") as f:
    json.dump(frame_centers, f, indent=2)

print(f"✅ Zentrierung auf Sprecher für {len(frame_centers)} Frames gespeichert unter:\n{OUTPUT_PATH}")
src/reformat/new/tracking.py (Normal file, 84 lines added)
@@ -0,0 +1,84 @@
from typing import List, Tuple, Optional


class FaceTracker:
    def __init__(
        self,
        dist_threshold: float = 200.0,
        switch_frames: int = 5,
        panning_window: int = 10,
        panning_threshold: float = 40.0,
        smooth_window: int = 3,
        scene_jump_threshold: float = 400.0
    ):
        self.dist_threshold = dist_threshold
        self.switch_frames = switch_frames
        self.panning_window = panning_window
        self.panning_threshold = panning_threshold
        self.smooth_window = smooth_window
        self.scene_jump_threshold = scene_jump_threshold

        self.current_center: Tuple[float, float] = (960.0, 540.0)  # default centre (for 1920x1080)
        self.raw_center: Tuple[float, float] = self.current_center
        self.prev_center: Tuple[float, float] = self.current_center
        self.prev_raw: Tuple[float, float] = self.current_center
        self.candidate_center: Optional[Tuple[float, float]] = None
        self.switch_counter = 0

        self.recent_raw_centers: List[Tuple[float, float]] = []
        self.recent_final_centers: List[Tuple[float, float]] = []

    def update(self, candidates: List[Tuple[float, float]]) -> Tuple[float, float]:
        if not candidates:
            # no face -> keep the previous value
            self.recent_raw_centers.append(self.raw_center)
            self.recent_final_centers.append(self.current_center)
            return self.current_center

        # take the face closest to the previous position
        new_center = min(candidates, key=lambda pt: self._distance(self.prev_center, pt))
        self.raw_center = new_center
        self.recent_raw_centers.append(new_center)

        dist = self._distance(self.prev_raw, new_center)
        if dist > self.scene_jump_threshold:
            self.current_center = new_center
            self.prev_center = new_center
            self.prev_raw = new_center
            self._smooth_reset()
            return self.current_center

        if dist > self.dist_threshold:
            if self.candidate_center != new_center:
                self.candidate_center = new_center
                self.switch_counter = 1
            else:
                self.switch_counter += 1
                if self.switch_counter >= self.switch_frames:
                    self.prev_center = self.current_center
                    self.current_center = new_center
                    self.prev_raw = new_center
                    self.switch_counter = 0
        else:
            self.switch_counter = 0
            self.prev_raw = new_center

        # smooth follow
        smoothed = self._moving_average(self.current_center, new_center, self.smooth_window)
        self.prev_center = self.current_center
        self.current_center = smoothed
        self.recent_final_centers.append(smoothed)

        return smoothed

    def _moving_average(self, old, new, factor):
        x = (old[0] * (factor - 1) + new[0]) / factor
        y = (old[1] * (factor - 1) + new[1]) / factor
        return (x, y)

    def _distance(self, pt1, pt2):
        return ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5

    def _smooth_reset(self):
        self.recent_raw_centers.clear()
        self.recent_final_centers.clear()
src/reformat/new/utils.py (Normal file, 129 lines added)
@@ -0,0 +1,129 @@
# src/utils.py
from __future__ import annotations
import json
import logging
import os
from pathlib import Path
from typing import Any, Dict, Tuple

try:
    import cv2
except Exception:
    cv2 = None  # allows importing without OpenCV (e.g. for pure unit tests)

# --- Logging ---------------------------------------------------------------

def setup_logging(debug: bool = False) -> None:
    level = logging.DEBUG if debug else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s | %(levelname)s | %(message)s",
    )

# --- Math / helpers ---------------------------------------------------------

def clamp(v: float, lo: float, hi: float) -> float:
    return max(lo, min(hi, v))

def compute_crop_width(orig_w: int, orig_h: int, out_w: int = 1080, out_h: int = 1920) -> int:
    # for a 9:16 target: width = (9/16) * orig_h, 1080x1920 by default
    return int((out_w / out_h) * orig_h)

def iou(boxA, boxB) -> float:
    """Computes the intersection over union of two bounding boxes."""
    ax1, ay1, aw, ah = boxA
    ax2, ay2 = ax1 + aw, ay1 + ah
    bx1, by1, bw, bh = boxB
    bx2, by2 = bx1 + bw, by1 + bh

    inter_x1 = max(ax1, bx1)
    inter_y1 = max(ay1, by1)
    inter_x2 = min(ax2, bx2)
    inter_y2 = min(ay2, by2)
    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)

    union_area = aw * ah + bw * bh - inter_area
    return inter_area / union_area if union_area > 0 else 0

# --- IO --------------------------------------------------------------------

def load_json(path: Path) -> Any:
    if not path.exists():
        raise FileNotFoundError(f"Datei fehlt: {path}")
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

def save_json(obj: Any, path: Path) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

def ensure_exists(path: Path, what: str = "Datei/Ordner") -> None:
    if not path.exists():
        raise FileNotFoundError(f"{what} nicht gefunden: {path}")

# --- Video / paths ---------------------------------------------------------

def get_fps(video_path: Path, fallback: float = 25.0) -> float:
    if cv2 is None:
        logging.warning("OpenCV nicht verfügbar – nutze FPS-Fallback %.2f", fallback)
        return fallback
    cap = cv2.VideoCapture(str(video_path))
    fps = cap.get(cv2.CAP_PROP_FPS)
    cap.release()
    if not fps or fps <= 1e-3:
        logging.warning("Konnte FPS nicht lesen – nutze Fallback %.2f", fallback)
        return fallback
    return float(fps)

def project_root_from(file: Path) -> Path:
    # the project often resolves its root via parents[...]; encapsulated here
    return file.resolve().parents[3]

def resolve_paths(project_root: Path, base_name: str) -> Dict[str, Path]:
    data = project_root / "data"
    return {
        "timed_path":     data / "transkripte" / f"{base_name}_timed.txt",
        "segments_path":  data / "transkripte" / f"{base_name}_segments.json",
        "faces_path":     data / "face_data_combined" / f"{base_name}_faces.json",
        "centers_path":   data / "face_crop_centers" / f"{base_name}_centers.json",
        "video_path":     data / "output" / "raw_clips" / f"{base_name}.mp4",
        "out_9x16_dir":   project_root / "output" / "output_9x16_final",
        "face_debug_dir": project_root / "output" / "debug" / "faces",
    }

def require_api_key(env_name: str = "OPENAI_API_KEY") -> str:
    key = os.getenv(env_name)
    if not key:
        raise RuntimeError(
            f"Umgebungsvariable {env_name} fehlt. "
            f"Exportiere sie z.B.: export {env_name}='sk-...'")
    return key

# --- Simple smoothing for centers ------------------------------------------

from typing import List, Optional

class CenterSmoother:
    """Smooths centres with a moving average and optional jump detection."""
    def __init__(self, window: int = 7, jump_thresh: float = 120.0):
        self.window = window
        self.jump_thresh = jump_thresh
        self.buffer: List[Tuple[float, float]] = []
        self.prev: Optional[Tuple[float, float]] = None

    def push(self, cx: float, cy: float) -> Tuple[float, float]:
        if self.prev is not None:
            dx = abs(cx - self.prev[0]) + abs(cy - self.prev[1])
            if dx > self.jump_thresh:
                # hard cut: reset the buffer
                self.buffer.clear()

        self.buffer.append((cx, cy))
        if len(self.buffer) > self.window:
            self.buffer.pop(0)

        avgx = sum(p[0] for p in self.buffer) / len(self.buffer)
        avgy = sum(p[1] for p in self.buffer) / len(self.buffer)
        self.prev = (avgx, avgy)
        return self.prev
src/reformat/old/analyze_crop_position.py (Normal file, 235 lines added)
@@ -0,0 +1,235 @@
import argparse
import json
import logging
import math
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple


class FaceTracker:
    def __init__(
        self,
        dist_threshold: float,
        switch_frames: int,
        panning_window: int,
        panning_threshold: float,
        smooth_window: int,
        scene_jump_threshold: float,
    ):
        self.dist_threshold = dist_threshold
        self.switch_frames = switch_frames
        self.panning_window = panning_window
        self.panning_threshold = panning_threshold
        self.smooth_window = smooth_window
        self.scene_jump_threshold = scene_jump_threshold

        self.current_center: Tuple[float, float] = (960.0, 540.0)
        self.raw_center: Tuple[float, float] = self.current_center
        self.prev_center: Tuple[float, float] = self.current_center
        self.prev_raw: Tuple[float, float] = self.current_center
        self.candidate_center: Optional[Tuple[float, float]] = None
        self.switch_counter: int = 0
        self.last_speaker_set: bool = False
        self.random_center: Optional[Tuple[float, float]] = None

        self.panning_buffer: List[float] = []
        self.smooth_buffer: List[Tuple[float, float]] = []

    def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]:
        valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None]
        all_faces = [f for f in faces if f.get("center")]

        # Speaker tracking
        if valid_faces:
            self._update_speaker(valid_faces)
        else:
            self._retain_or_random_center(all_faces)

        # Panning detection
        is_panning = self._detect_panning()

        # Smooth / moving average
        center = self._smooth_center()

        return (int(center[0]), int(center[1])), is_panning

    def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None:
        best = max(valid_faces, key=lambda x: x["mouth_openness"])
        cx, cy, *_ = best["center"]
        new_center = (cx, cy)

        dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1])
        if dist < self.dist_threshold:
            self.raw_center = new_center
            self.candidate_center = None
            self.switch_counter = 0
        else:
            if (
                self.candidate_center is None
                or math.hypot(
                    new_center[0] - self.candidate_center[0], new_center[1] - self.candidate_center[1]
                )
                > self.dist_threshold
            ):
                self.candidate_center = new_center
                self.switch_counter = 1
            else:
                self.switch_counter += 1

            if self.switch_counter >= self.switch_frames:
                self.raw_center = self.candidate_center  # type: ignore
                self.candidate_center = None
                self.switch_counter = 0

        self.random_center = None
        self.last_speaker_set = True

    def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None:
        if self.last_speaker_set:
            # keep previous raw_center
            pass
        elif self.random_center is not None:
            self.raw_center = self.random_center
        elif all_faces:
            f = random.choice(all_faces)
            cx, cy, *_ = f["center"]
            self.random_center = (cx, cy)
            self.raw_center = self.random_center

    def _detect_panning(self) -> bool:
        dx = self.raw_center[0] - self.prev_raw[0]
        self.panning_buffer.append(dx)
        if len(self.panning_buffer) > self.panning_window:
            self.panning_buffer.pop(0)
        avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer)
        is_panning = avg_dx > self.panning_threshold
        self.prev_raw = self.raw_center
        return is_panning

    def _smooth_center(self) -> Tuple[float, float]:
        sudden_jump = (
            math.hypot(
                self.raw_center[0] - self.prev_center[0],
                self.raw_center[1] - self.prev_center[1],
            )
            > self.scene_jump_threshold
        )
        if not sudden_jump:
            self.smooth_buffer.append(self.raw_center)
            if len(self.smooth_buffer) > self.smooth_window:
                self.smooth_buffer.pop(0)
            avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer)
            avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer)
            center = (avg_x, avg_y)
        else:
            center = self.raw_center
            self.smooth_buffer.clear()

        self.prev_center = center
        return center


def parse_args() -> argparse.Namespace:
    script_dir = Path(__file__).resolve().parent
    project_root = script_dir.parents[1]
    default_input = project_root / "data" / "face_data_combined"
    default_output = project_root / "data" / "face_crop_centers"

    parser = argparse.ArgumentParser(
        description="Track and smooth face crop centers based on mouth openness."
    )
    parser.add_argument(
        "-i", "--input-dir", type=Path,
        default=default_input,
        help=f"Directory containing *_faces.json files (default: {default_input})"
    )
    parser.add_argument(
        "-o", "--output-dir", type=Path,
        default=default_output,
        help=f"Directory to save *_centers.json files (default: {default_output})"
    )
    parser.add_argument(
        "--dist-threshold", type=float, default=30.0,
        help="Pixel distance threshold to switch speaker"
    )
    parser.add_argument(
        "--switch-frames", type=int, default=20,
        help="Number of consecutive frames required to confirm speaker switch"
    )
    parser.add_argument(
        "--panning-window", type=int, default=30,
        help="Frame window size for panning detection"
    )
    parser.add_argument(
        "--panning-threshold", type=float, default=3.0,
        help="Average dx threshold for panning detection"
    )
    parser.add_argument(
        "--smooth-window", type=int, default=8,
        help="Moving average window for smoothing"
    )
    parser.add_argument(
        "--scene-jump-threshold", type=float, default=300.0,
        help="Jump threshold to detect scene cuts"
    )
    return parser.parse_args()


def setup_logging() -> None:
    logging.basicConfig(
        format="%(asctime)s %(levelname)s: %(message)s",
        level=logging.INFO,
    )


def main() -> None:
    setup_logging()
    args = parse_args()

    input_dir: Path = args.input_dir.resolve()
    output_dir: Path = args.output_dir.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    tracker = FaceTracker(
        dist_threshold=args.dist_threshold,
        switch_frames=args.switch_frames,
        panning_window=args.panning_window,
        panning_threshold=args.panning_threshold,
        smooth_window=args.smooth_window,
        scene_jump_threshold=args.scene_jump_threshold,
    )

    json_files = sorted(input_dir.glob("*_faces.json"))
    if not json_files:
        logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir)
        return

    logging.info("Gefundene Dateien: %d", len(json_files))

    for json_path in json_files:
        logging.info("Verarbeite %s", json_path.name)
        try:
            frames_data = json.loads(json_path.read_text())
        except json.JSONDecodeError as e:
            logging.error("JSON-Fehler in %s: %s", json_path.name, e)
            continue

        out_data: List[Dict[str, Any]] = []
        for frame_idx, frame in enumerate(frames_data):
            faces = frame.get("faces", [])
            center, is_panning = tracker.process_frame(faces)
            out_data.append({
                "frame": frame_idx,
                "center": [center[0], center[1]],
                "panning": is_panning,
            })

        out_path = output_dir / f"{json_path.stem.replace('_faces', '')}_centers.json"
        with out_path.open("w") as f:
            json.dump(out_data, f, indent=2)
        logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data))


if __name__ == "__main__":
    main()
src/reformat/old/crop_to_speaker.py (Normal file, 180 lines added)
@@ -0,0 +1,180 @@
import json
import cv2
import subprocess
from pathlib import Path

# === Paths & global settings ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]

INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
INPUT_CENTER_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
INPUT_FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

OUT_W, OUT_H = 1080, 1920

DEBUG_MODE = True
DEBUG_SCALE = 0.75
# openness above which we assume mouth movement
DEBUG_MOUTH_THRESHOLD = 0.02

# === Helper functions ===
def clamp(v, lo, hi):
    return max(lo, min(hi, v))

def compute_crop_width(orig_w, orig_h):
    return int((OUT_W / OUT_H) * orig_h)

# === Processing ===
for center_path in sorted(INPUT_CENTER_DIR.glob("*_centers.json")):
    stem = center_path.stem.replace("_centers", "")
    video_path = INPUT_VIDEO_DIR / f"{stem}.mp4"
    faces_path = INPUT_FACES_DIR / f"{stem}_faces.json"

    if not video_path.exists():
        print(f"⚠️ Video fehlt: {stem}.mp4")
        continue
    if not faces_path.exists():
        print(f"⚠️ Gesichtsdaten fehlen: {stem}_faces.json")
        continue

    centers_data = json.loads(center_path.read_text())
    faces_data = json.loads(faces_path.read_text())

    # create a debug list per video
    if DEBUG_MODE:
        debug_results: list = []

    cap = cv2.VideoCapture(str(video_path))
    fps = cap.get(cv2.CAP_PROP_FPS)
    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    crop_w = compute_crop_width(orig_w, orig_h)
    crop_h = orig_h

    fourcc = cv2.VideoWriter_fourcc(*"mp4v")
    temp_vid = OUTPUT_DIR / f"{stem}_cropped.mp4"
    out_vid = cv2.VideoWriter(str(temp_vid), fourcc, fps, (OUT_W, OUT_H))
    if not out_vid.isOpened():
        print(f"❌ Kann nicht schreiben: {temp_vid.name}")
        continue

    if DEBUG_MODE:
        cv2.namedWindow("Debug", cv2.WINDOW_NORMAL)

    frame_idx = 0
    while True:
        ret, frame = cap.read()
        if not ret or frame_idx >= len(centers_data):
            break

        # crop info
        info = centers_data[frame_idx]
        cx, cy = info["center"]
        is_panning = info.get("panning", False)
        if is_panning:
            cx = orig_w // 2

        x0 = int(cx - crop_w / 2)
        x0 = clamp(x0, 0, orig_w - crop_w)
        y0 = 0

        # crop + resize
        crop = frame[y0:y0+crop_h, x0:x0+crop_w]
        if crop.shape[1] != crop_w or crop.shape[0] != crop_h:
            crop = cv2.copyMakeBorder(
                crop, 0, crop_h - crop.shape[0],
                0, crop_w - crop.shape[1],
                cv2.BORDER_CONSTANT, value=[0, 0, 0]
            )
        out_frame = cv2.resize(crop, (OUT_W, OUT_H), interpolation=cv2.INTER_LINEAR)
        out_vid.write(out_frame)

        if DEBUG_MODE:
            debug_frame = frame.copy()
            frame_faces = faces_data[frame_idx].get("faces", [])

            # Build debug entry for this frame
            dbg_faces = []
            for f in frame_faces:
                # centre and openness
                cx_f, cy_f = map(int, f["center"][:2])
                openness = f.get("mouth_openness", 0.0)
                moving = openness > DEBUG_MOUTH_THRESHOLD
                dbg_faces.append({
                    "center": [cx_f, cy_f],
                    "mouth_openness": openness,
                    "mouth_moving": moving
                })

                # draw in the debug window
                cv2.circle(debug_frame, (cx_f, cy_f), 4, (180, 180, 180), -1)
                cv2.putText(
                    debug_frame,
                    f"{round(openness,2)}",
                    (cx_f + 6, cy_f - 6),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.5,
                    (255, 255, 255),
                    1,
                    cv2.LINE_AA
                )
                # red dot when moving
                color = (0,0,255) if moving else (0,255,255)
                cv2.circle(debug_frame, (cx_f, cy_f), 6, color, 1)

            debug_results.append({
                "frame": frame_idx,
                "faces": dbg_faces
            })

            # main centre & crop frame
            cv2.circle(debug_frame, (int(cx), int(cy)), 18, (0, 255, 0), 2)
            cv2.rectangle(debug_frame, (x0, 0), (x0 + crop_w, crop_h), (0, 0, 255), 2)

            dbg = cv2.resize(
                debug_frame,
                (int(orig_w * DEBUG_SCALE), int(orig_h * DEBUG_SCALE))
            )
            cv2.imshow("Debug", dbg)
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break

        frame_idx += 1

    cap.release()
    out_vid.release()
    if DEBUG_MODE:
        cv2.destroyAllWindows()

    # extract audio & mux
    audio_tmp = OUTPUT_DIR / f"{stem}_audio.aac"
    final_vid = OUTPUT_DIR / f"{stem}.mp4"
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "copy", str(audio_tmp)],
            check=True
        )
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(temp_vid), "-i", str(audio_tmp),
             "-c:v", "copy", "-c:a", "aac", "-b:a", "128k", str(final_vid)],
            check=True
        )
    finally:
        try: temp_vid.unlink()
        except Exception: pass
        try: audio_tmp.unlink()
        except Exception: pass

    # write the debug JSON
    if DEBUG_MODE:
        dbg_path = OUTPUT_DIR / f"{stem}_debug.json"
        with dbg_path.open("w") as f:
            json.dump(debug_results, f, indent=2)
        print(f"🛠️ Debug-Daten: {dbg_path.name}")

    print(f"✅ Finales Video: {final_vid.name}")

print("\n🏁 Alle Videos fertig in:", OUTPUT_DIR.resolve())
src/reformat/old/detect_speaking_faces.py (Normal file, 126 lines added)
@@ -0,0 +1,126 @@
import json
from pathlib import Path
from collections import defaultdict
import numpy as np

# === Settings ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
INPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
OUTPUT_PATH = INPUT_DIR / "dominant_faces.json"

SEGMENT_LENGTH = 2.0    # length of each segment in seconds
MOUTH_THRESHOLD = 0.01  # minimum mouth openness to count a speaker
SMOOTH_WINDOW = 5       # window size (in segments) for the moving average

def analyze_clip_timed(path):
    # 1) read the JSON
    try:
        data = json.loads(path.read_text())
    except Exception as e:
        print(f"❌ Fehler beim Lesen von {path.name}: {e}")
        return None

    # 2) only use valid frames
    frames = [d for d in data if "timestamp" in d and isinstance(d.get("faces"), list)]
    if not frames:
        print(f"⚠️ Keine validen Frames in {path.name}")
        return None

    frames.sort(key=lambda x: x["timestamp"])
    max_time = frames[-1]["timestamp"]

    # 3) create segments and find the dominant speaker per segment
    segments = []
    t = 0.0
    while t < max_time:
        t_end = t + SEGMENT_LENGTH
        face_scores = defaultdict(list)   # mouth_openness per bbox
        face_boxes = defaultdict(list)    # raw bbox per bbox key
        face_centers = defaultdict(list)  # center [cx,cy,w,h] per bbox key

        # scan all frames that fall into this segment
        for f in frames:
            ts = f["timestamp"]
            if t <= ts < t_end:
                for face in f["faces"]:
                    bbox = face["bbox"]  # [x,y,w,h]
                    score = face.get("mouth_openness", 0.0)
                    center = face.get("center", None)  # [cx,cy,w,h]
                    key = tuple(bbox)

                    if score >= MOUTH_THRESHOLD and center is not None:
                        face_scores[key].append(score)
                        face_boxes[key].append(bbox)
                        face_centers[key].append(center)

        if face_scores:
            # pick the key with the highest average score
            avg_scores = {k: np.mean(v) for k, v in face_scores.items()}
            dominant_key = max(avg_scores, key=avg_scores.get)

            # compute the mean bounding box and mean centre
            avg_bbox = np.mean(face_boxes[dominant_key], axis=0).astype(int).tolist()
            avg_center = np.mean(face_centers[dominant_key], axis=0).tolist()  # [cx,cy,w,h]

            segments.append({
                "start": round(t, 2),
                "end": round(t_end if t_end < max_time else max_time, 2),
                "bbox": avg_bbox,
                "center": [float(avg_center[0]), float(avg_center[1]), float(avg_center[2]), float(avg_center[3])]
            })

        t += SEGMENT_LENGTH

    if not segments:
        print(f"⚠️ Keine Segmente für Clip {path.name}")
        return None

    # 4) smooth the segment centres with a moving average
    seg_centers = [s["center"] for s in segments]  # list of [cx,cy,w,h]
    sm_centers = []
    n = len(seg_centers)
    half = SMOOTH_WINDOW // 2

    for i in range(n):
        start = max(0, i - half)
        end = min(n, i + half + 1)
        window = seg_centers[start:end]
        avg = np.mean(window, axis=0)  # yields [cx,cy,w,h]
        sm_centers.append(avg.tolist())

    # 5) print the smoothed path to the console
    print(f"\n🔄 Smoothed path für Clip {path.stem}:")
    for i, s in enumerate(segments):
        cx, cy, w, h = sm_centers[i]
        print(f" Segment {i} [{s['start']}–{s['end']}s]: "
              f"center=({cx:.1f}, {cy:.1f}), size=({w:.1f}×{h:.1f})")

    # 6) build the new fields for the output JSON
    sm_segments = []
    for i, s in enumerate(segments):
        cx, cy, w, h = sm_centers[i]
        x0 = int(cx - w/2)
        y0 = int(cy - h/2)
        sm_segments.append({
            "start": s["start"],
            "end": s["end"],
            "bbox": [x0, y0, int(w), int(h)]
        })

    return {
        "clip": path.stem.replace("_faces", "") + ".mp4",
        "segments": sm_segments
    }


# === Main loop over all clips ===
results = []
for json_file in sorted(INPUT_DIR.glob("*_faces.json")):
    out = analyze_clip_timed(json_file)
    if out:
        results.append(out)

OUTPUT_PATH.write_text(json.dumps(results, indent=2))
print(f"\n✅ Analyse abgeschlossen – {len(results)} Clips erkannt.")
print(f"📄 Gespeichert in: {OUTPUT_PATH.resolve()}")
src/reformat/old/grid_faces_from_yolo.py (Normal file, 114 lines added)
@@ -0,0 +1,114 @@
import json
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
from pathlib import Path
|
||||||
|
from tqdm import tqdm
|
||||||
|
from collections import defaultdict, Counter
|
||||||
|
from sklearn.cluster import DBSCAN
|
||||||
|
|
||||||
|
# === Einstellungen ===
|
||||||
|
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||||
|
VIDEO_DIR = SCRIPT_DIR.parents[1] / "output"
|
||||||
|
FACE_JSON_DIR = SCRIPT_DIR / "face_data_yolo"
|
||||||
|
OUTPUT_DIR = SCRIPT_DIR.parents[1] / "output_stacked_faces"
|
||||||
|
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
OUT_WIDTH = 1080
|
||||||
|
OUT_HEIGHT = 1920
|
||||||
|
GRID_ROWS = 4
|
||||||
|
FACE_CROP_HEIGHT = OUT_HEIGHT // GRID_ROWS
|
||||||
|
FACE_CROP_WIDTH = OUT_WIDTH
|
||||||
|
|
||||||
|
# === Hilfsfunktion
|
||||||
|
def bbox_center(bbox):
|
||||||
|
x, y, w, h = bbox
|
||||||
|
return int(x + w // 2), int(y + h // 2)
|
||||||
|
|
||||||
|
# === Hauptverarbeitung ===
|
||||||
|
for json_path in tqdm(sorted(FACE_JSON_DIR.glob("*_faces.json")), desc="🔍 Erzeuge Grid-Clips"):
|
||||||
|
video_name = json_path.stem.replace("_faces", "") + ".mp4"
|
||||||
|
video_path = VIDEO_DIR / video_name
|
||||||
|
if not video_path.exists():
|
||||||
|
print(f"❌ Video nicht gefunden: {video_name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
data = json.loads(json_path.read_text())
|
||||||
|
|
||||||
|
# === Alle Gesichtszentren sammeln
|
||||||
|
all_faces = []
|
||||||
|
for frame in data:
|
||||||
|
for face in frame["faces"]:
|
||||||
|
center = bbox_center(face["bbox"])
|
||||||
|
all_faces.append((center, face["bbox"]))
|
||||||
|
|
||||||
|
if not all_faces:
|
||||||
|
print(f"⚠️ Keine Gesichter erkannt in {video_name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# === Clustern
|
||||||
|
coords = [pos for pos, _ in all_faces]
|
||||||
|
clustering = DBSCAN(eps=80, min_samples=5).fit(coords)
|
||||||
|
cluster_labels = clustering.labels_
|
||||||
|
label_counts = Counter(cluster_labels)
|
||||||
|
most_common_labels = [lbl for lbl, _ in label_counts.most_common(GRID_ROWS) if lbl != -1]
|
||||||
|
|
||||||
|
if not most_common_labels:
|
||||||
|
print(f"⚠️ Keine gültigen Cluster in {video_name}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# === Zuordnung: cluster_id → feste Zeile
|
||||||
|
cluster_faces = defaultdict(list)
|
||||||
|
for (_, bbox), label in zip(all_faces, cluster_labels):
|
||||||
|
if label in most_common_labels:
|
||||||
|
cluster_faces[label].append(bbox)
|
||||||
|
|
||||||
|
def cluster_y(label):
|
||||||
|
return np.mean([bbox[1] for bbox in cluster_faces[label]])
|
||||||
|
|
||||||
|
sorted_labels = sorted(most_common_labels, key=cluster_y)
|
||||||
|
label_to_row = {lbl: idx for idx, lbl in enumerate(sorted_labels)}
|
||||||
|
|
||||||
|
# === cluster_id zu jedem Gesicht hinzufügen
|
||||||
|
for frame in data:
|
||||||
|
for face in frame["faces"]:
|
||||||
|
center = bbox_center(face["bbox"])
|
||||||
|
distances = [np.linalg.norm(np.array(center) - np.array(c)) for c in coords]
|
||||||
|
nearest = np.argmin(distances)
|
||||||
|
label = cluster_labels[nearest]
|
||||||
|
face["cluster_id"] = label
|
||||||
|
|
||||||
|
# === Video verarbeiten
|
||||||
|
cap = cv2.VideoCapture(str(video_path))
|
||||||
|
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||||
|
out_path = OUTPUT_DIR / video_name.replace(".mp4", "_stacked.mp4")
|
||||||
|
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
||||||
|
writer = cv2.VideoWriter(str(out_path), fourcc, fps, (OUT_WIDTH, OUT_HEIGHT))
|
||||||
|
|
||||||
|
frame_idx = 0
|
||||||
|
while cap.isOpened():
|
||||||
|
ret, frame = cap.read()
|
||||||
|
if not ret or frame_idx >= len(data):
|
||||||
|
break
|
||||||
|
|
||||||
|
output_frame = np.zeros((OUT_HEIGHT, OUT_WIDTH, 3), dtype=np.uint8)
|
||||||
|
for face in data[frame_idx]["faces"]:
|
||||||
|
label = face.get("cluster_id", -1)
|
||||||
|
if label not in label_to_row:
|
||||||
|
continue
|
||||||
|
row = label_to_row[label]
|
||||||
|
x, y, w, h = face["bbox"]
|
||||||
|
crop = frame[y:y+h, x:x+w]
|
||||||
|
if crop.size == 0:
|
||||||
|
continue
|
||||||
|
resized = cv2.resize(crop, (FACE_CROP_WIDTH, FACE_CROP_HEIGHT))
|
||||||
|
y_offset = row * FACE_CROP_HEIGHT
|
||||||
|
output_frame[y_offset:y_offset+FACE_CROP_HEIGHT, :] = resized
|
||||||
|
|
||||||
|
writer.write(output_frame)
|
||||||
|
frame_idx += 1
|
||||||
|
|
||||||
|
cap.release()
|
||||||
|
writer.release()
|
||||||
|
print(f"✅ Exportiert: {out_path.name}")
|
||||||
|
|
||||||
|
print("🏁 Alle Grid-Videos fertig.")
|
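The row assignment above hinges on one idea: the centres of the same, mostly static speaker fall into one dense DBSCAN cluster, and the clusters sorted by mean y become grid rows. A minimal sketch of that step (not part of the commit), with made-up coordinates, the same eps and row-sorting logic as the script, and a smaller min_samples so five points are enough:

# Illustrative only: how DBSCAN turns recurring face centres into fixed rows.
import numpy as np
from collections import Counter
from sklearn.cluster import DBSCAN

centers = np.array([[300, 400], [305, 398], [900, 1200], [905, 1195], [310, 402]])
labels = DBSCAN(eps=80, min_samples=2).fit(centers).labels_   # e.g. [0, 0, 1, 1, 0]
keep = [lbl for lbl, _ in Counter(labels).most_common(4) if lbl != -1]
# Sort the kept clusters by their mean y so the topmost face ends up in row 0.
rows = {lbl: r for r, lbl in enumerate(sorted(keep, key=lambda l: centers[labels == l][:, 1].mean()))}
print(rows)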
75 src/reformat/old/preview_faces.py Normal file
@@ -0,0 +1,75 @@
import cv2
import json
from pathlib import Path
from tqdm import tqdm

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parents[1]  # ← goes from /src/reformat up to /BachlorArbeit

FACES_DIR = PROJECT_DIR / "data" / "face_data_combined"
INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips"
OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# === Walk through all *_faces.json files ===
face_files = sorted(FACES_DIR.glob("*_faces.json"))

for face_file in tqdm(face_files, desc="🔍 Erzeuge Vorschau mit Sprechererkennung"):
    clip_name = face_file.stem.replace("_faces", "") + ".mp4"
    input_path = INPUT_VIDEO_DIR / clip_name
    output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4")

    if not input_path.exists():
        print(f"❌ Clip nicht gefunden: {clip_name}")
        continue

    # Video setup
    cap = cv2.VideoCapture(str(input_path))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fps = fps if fps > 1 else 25  # fallback if FPS is reported as 0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*"avc1")  # more compatible than mp4v
    out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

    # Load face data
    data = json.loads(face_file.read_text())
    data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]}

    print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}")

    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        faces = data_by_frame.get(frame_idx, [])
        speaker_idx = None

        # Pick the speaker by mouth openness
        if faces and all("mouth_openness" in f for f in faces):
            mouth_vals = [f["mouth_openness"] for f in faces]
            if any(v > 0.01 for v in mouth_vals):  # simple activity threshold
                speaker_idx = mouth_vals.index(max(mouth_vals))

        for i, face in enumerate(faces):
            x, y, w, h = face["bbox"]
            color = (0, 255, 0) if i == speaker_idx else (255, 255, 255)
            label = f"Mouth: {face.get('mouth_openness', 0):.2f}"

            # Debug output (optional)
            print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}")

            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, label, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        out.write(frame)
        frame_idx += 1

    cap.release()
    out.release()
    print(f"✅ Vorschau exportiert: {output_path.name}")

print("🏁 Alle Vorschauvideos mit Sprecherkennung erstellt.")
92 src/reformat/old/track_faces.py Normal file
@@ -0,0 +1,92 @@
import cv2
import mediapipe as mp
import json
from pathlib import Path
from tqdm import tqdm

# === Settings ===
INPUT_DIR = Path(__file__).resolve().parents[2] / "output"
OUTPUT_DIR = Path(__file__).resolve().parent / "face_data"
OUTPUT_DIR.mkdir(exist_ok=True)
FRAME_SKIP = 1  # analyse every frame for maximum accuracy
PADDING = 30    # pixel padding around the face

mp_face_mesh = mp.solutions.face_mesh

# Extended (inner) lip landmarks
TOP_LIPS = [13, 78, 82]
BOTTOM_LIPS = [14, 87, 317]

def mouth_openness(landmarks, image_height):
    try:
        top_avg = sum([landmarks[i].y for i in TOP_LIPS]) / len(TOP_LIPS)
        bottom_avg = sum([landmarks[i].y for i in BOTTOM_LIPS]) / len(BOTTOM_LIPS)
        return abs(bottom_avg - top_avg)
    except:
        return 0.0

def process_video(path):
    cap = cv2.VideoCapture(str(path))
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    results = []

    with mp_face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=5,
        refine_landmarks=True,
        min_detection_confidence=0.6,
        min_tracking_confidence=0.6
    ) as face_mesh:

        frame_idx = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_idx % FRAME_SKIP != 0:
                frame_idx += 1
                continue

            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            output = face_mesh.process(rgb)

            faces = []
            if output.multi_face_landmarks:
                for landmarks in output.multi_face_landmarks:
                    mouth = mouth_openness(landmarks.landmark, height)

                    xs = [lm.x * width for lm in landmarks.landmark]
                    ys = [lm.y * height for lm in landmarks.landmark]
                    x1 = max(0, int(min(xs)) - PADDING)
                    y1 = max(0, int(min(ys)) - PADDING)
                    x2 = min(width, int(max(xs)) + PADDING)
                    y2 = min(height, int(max(ys)) + PADDING)
                    bbox = [x1, y1, x2 - x1, y2 - y1]

                    faces.append({
                        "bbox": bbox,
                        "mouth_openness": round(mouth, 4)
                    })

            results.append({
                "frame": frame_idx,
                "timestamp": round(frame_idx / fps, 2),
                "faces": faces
            })

            frame_idx += 1

    cap.release()
    out_path = OUTPUT_DIR / f"{path.stem}_faces.json"
    out_path.write_text(json.dumps(results, indent=2))
    print(f"✅ {path.name} verarbeitet → {out_path.name}")

# === Walk through all videos in the output/ folder
videos = list(INPUT_DIR.glob("*.mp4"))
print(f"🎬 {len(videos)} Videos gefunden in: {INPUT_DIR}")

for video in tqdm(videos):
    process_video(video)
206 src/reformat/old/track_faces_Yolo.py Normal file
@@ -0,0 +1,206 @@
#!/usr/bin/env python3
import argparse
import logging
import json
from pathlib import Path

import cv2
from ultralytics import YOLO
import mediapipe as mp

# === Paths and defaults ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt"

# Make sure the default output directory exists
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# === Lip landmarks ===
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]


def get_mouth_openness(landmarks, image_height):
    """
    Computes the mouth opening in pixels from normalised landmark coordinates.
    """
    top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
    bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
    return abs(bottom_avg - top_avg) * image_height


def iou(boxA, boxB):
    """Computes the intersection-over-union of two bounding boxes in (x, y, w, h) format."""
    ax1, ay1, aw, ah = boxA
    ax2, ay2 = ax1 + aw, ay1 + ah
    bx1, by1, bw, bh = boxB
    bx2, by2 = bx1 + bw, by1 + bh

    inter_x1 = max(ax1, bx1)
    inter_y1 = max(ay1, by1)
    inter_x2 = min(ax2, bx2)
    inter_y2 = min(ay2, by2)
    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)

    union_area = aw * ah + bw * bh - inter_area
    return inter_area / union_area if union_area > 0 else 0


def process_video(
    video_path: Path,
    output_path: Path,
    model: YOLO,
    face_mesh: mp.solutions.face_mesh.FaceMesh,
    conf_thresh: float,
    frame_skip: int,
    downscale: float,
):
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        logging.error(f"Kann Video nicht öffnen: {video_path}")
        return

    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)

    # Stream the JSON output
    with output_path.open('w', encoding='utf-8') as f_out:
        f_out.write('[\n')
        first = True
        frame_idx = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_skip > 1 and frame_idx % frame_skip != 0:
                frame_idx += 1
                continue

            if downscale != 1.0:
                frame = cv2.resize(frame, (width, height))

            detections = model(frame, verbose=False)[0]
            yolo_boxes = []
            for box in detections.boxes:
                conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf)
                if conf < conf_thresh:
                    continue
                coords = box.xyxy[0].cpu().numpy()
                x1, y1, x2, y2 = map(int, coords)
                yolo_boxes.append([x1, y1, x2 - x1, y2 - y1])

            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_result = face_mesh.process(rgb)
            mp_faces = []
            if mp_result.multi_face_landmarks:
                for landmarks in mp_result.multi_face_landmarks:
                    mouth_px = get_mouth_openness(landmarks.landmark, height)
                    xs = [lm.x * width for lm in landmarks.landmark]
                    ys = [lm.y * height for lm in landmarks.landmark]
                    x1, y1 = int(min(xs)), int(min(ys))
                    x2, y2 = int(max(xs)), int(max(ys))
                    mp_faces.append({
                        "bbox": [x1, y1, x2 - x1, y2 - y1],
                        "mouth_openness": round(mouth_px, 1)
                    })

            combined = []
            for yb in yolo_boxes:
                if mp_faces:
                    best = max(mp_faces, key=lambda m: iou(yb, m["bbox"]))
                    best_iou = iou(yb, best["bbox"])
                    mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0
                else:
                    mouth = 0.0

                x, y, w, h = yb
                cx, cy = x + w / 2, y + h / 2
                combined.append({
                    "bbox": yb,
                    "mouth_openness": round(mouth, 1),
                    "center": [round(cx, 1), round(cy, 1), w, h]
                })

            result = {
                "frame": frame_idx,
                "timestamp": round(frame_idx / fps, 3),
                "faces": combined
            }

            if not first:
                f_out.write(',\n')
            json.dump(result, f_out, ensure_ascii=False)
            first = False
            frame_idx += 1

        f_out.write('\n]')

    cap.release()
    logging.info(f"Verarbeitet: {video_path.name} → {output_path.name}")


def main():
    parser = argparse.ArgumentParser(
        description="Analyse von Videos: Gesichter und Mundöffnung erkennen"
    )
    parser.add_argument(
        "--input-dir", type=Path,
        default=DEFAULT_INPUT_DIR,
        help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})"
    )
    parser.add_argument(
        "--output-dir", type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})"
    )
    parser.add_argument(
        "--model", type=Path,
        default=DEFAULT_MODEL_PATH,
        help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})"
    )
    parser.add_argument(
        "--conf-thresh", type=float, default=0.5,
        help="Schwelle für YOLO-Confidence"
    )
    parser.add_argument(
        "--frame-skip", type=int, default=1,
        help="Nur jede n-te Frame verarbeiten"
    )
    parser.add_argument(
        "--downscale", type=float, default=1.0,
        help="Skalierungsfaktor für Frames"
    )
    args = parser.parse_args()

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    yolo = YOLO(str(args.model))
    face_mesh = mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=5,
        refine_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )

    for video_path in sorted(args.input_dir.glob("*.mp4")):
        out_path = args.output_dir / f"{video_path.stem}_faces.json"
        process_video(
            video_path,
            out_path,
            yolo,
            face_mesh,
            args.conf_thresh,
            args.frame_skip,
            args.downscale,
        )


if __name__ == "__main__":
    main()
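For reference, this is roughly how the YOLO-to-MediaPipe pairing inside process_video() behaves on a single frame. A small sketch with invented box values (format x, y, w, h), reusing the iou() helper defined above; only a mesh box that overlaps the YOLO detection by more than 0.2 IoU contributes its mouth value, otherwise the detection falls back to 0.0:

# Illustrative only, not part of the commit.
yolo_box = [100, 100, 200, 200]
mp_faces = [
    {"bbox": [110, 105, 190, 195], "mouth_openness": 12.4},  # mostly overlapping
    {"bbox": [600, 50, 150, 150], "mouth_openness": 3.1},    # a different person
]
best = max(mp_faces, key=lambda m: iou(yolo_box, m["bbox"]))
mouth = best["mouth_openness"] if iou(yolo_box, best["bbox"]) > 0.2 else 0.0
print(mouth)  # 12.4, because the first mesh box has IoU of roughly 0.93 with the YOLO box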
12 src/reformat/speaking.py Normal file
@@ -0,0 +1,12 @@
# src/speaking.py

TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]

def get_mouth_openness(landmarks, image_height):
    """
    Computes the mouth opening based on MediaPipe landmarks.
    """
    top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
    bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
    return abs(bottom_avg - top_avg) * image_height
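A quick sanity check of the helper above, assuming stand-in landmark objects (real callers pass the NormalizedLandmark list from MediaPipe FaceMesh); it mainly shows that the return value is a pixel gap scaled by image_height:

# Illustrative only; SimpleNamespace stands in for MediaPipe landmark objects.
from types import SimpleNamespace

landmarks = [SimpleNamespace(y=0.50) for _ in range(478)]  # 478 dummy landmarks
for i in TOP_LIPS:
    landmarks[i].y = 0.48
for i in BOTTOM_LIPS:
    landmarks[i].y = 0.52
print(get_mouth_openness(landmarks, image_height=1080))  # ≈ 43.2 px lip gap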
265 src/subtitles/add_subtitles.py Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
"""
|
||||||
|
add_subtitles.py — TikTok-Word-Caps mit OpenAI Whisper (CPU)
|
||||||
|
- läuft Ordner-weise über 9:16-Kurzclips
|
||||||
|
- transkribiert mit word_timestamps=True
|
||||||
|
- erzeugt ASS (ein Wort pro Zeile, Pop-Animation, Bottom-Center)
|
||||||
|
- brennt via ffmpeg in *_subtitled.mp4
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import traceback
|
||||||
|
import argparse
|
||||||
|
from typing import List, Tuple, Optional
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/subtitles/)
|
||||||
|
ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
from config import CROPPED_DIR, SUBTITLED_DIR # zentrale Pfade
|
||||||
|
|
||||||
|
# --- Stabil auf CPU (vermeidet MPS-Sparse-Fehler) ---
|
||||||
|
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
||||||
|
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||||
|
|
||||||
|
def log(*a): print("[LOG]", *a)
|
||||||
|
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
def has_audio_stream(video_path: str) -> bool:
|
||||||
|
cmd = ["ffprobe","-v","error","-select_streams","a","-show_entries","stream=index","-of","json",video_path]
|
||||||
|
try:
|
||||||
|
out = subprocess.check_output(cmd).decode("utf-8")
|
||||||
|
data = json.loads(out)
|
||||||
|
return bool(data.get("streams"))
|
||||||
|
except Exception:
|
||||||
|
return False
|
||||||
|
|
||||||
|
def load_whisper_cpu(model_name: str):
|
||||||
|
import whisper # openai-whisper
|
||||||
|
device = "cpu"
|
||||||
|
model = whisper.load_model(model_name, device=device)
|
||||||
|
fp16 = False
|
||||||
|
return model, device, fp16
|
||||||
|
|
||||||
|
def transcribe_words_whisper(model, media_path: str, language: Optional[str], fp16: bool) -> List[Tuple[float,float,str]]:
|
||||||
|
"""
|
||||||
|
Nutzt 'openai-whisper' mit word_timestamps=True.
|
||||||
|
Fallback: wenn 'words' fehlen, werden Segmenttexte approx. auf Wörter verteilt.
|
||||||
|
"""
|
||||||
|
result = model.transcribe(
|
||||||
|
media_path,
|
||||||
|
language=language,
|
||||||
|
task="transcribe",
|
||||||
|
word_timestamps=True,
|
||||||
|
condition_on_previous_text=False,
|
||||||
|
verbose=False,
|
||||||
|
fp16=fp16
|
||||||
|
)
|
||||||
|
words: List[Tuple[float,float,str]] = []
|
||||||
|
segs = result.get("segments", []) or []
|
||||||
|
for seg in segs:
|
||||||
|
wlist = seg.get("words")
|
||||||
|
if isinstance(wlist, list) and wlist and all(isinstance(w, dict) for w in wlist):
|
||||||
|
for w in wlist:
|
||||||
|
t = (w.get("word") or w.get("text") or "").strip()
|
||||||
|
if not t:
|
||||||
|
continue
|
||||||
|
ws = w.get("start"); we = w.get("end")
|
||||||
|
if ws is None or we is None:
|
||||||
|
continue
|
||||||
|
t = re.sub(r"\s+", " ", t)
|
||||||
|
if t:
|
||||||
|
words.append((float(ws), float(we), t))
|
||||||
|
else:
|
||||||
|
# Fallback: Segment auf Wörter aufteilen & Zeiten gleichmäßig verteilen
|
||||||
|
text = (seg.get("text") or "").strip()
|
||||||
|
if not text:
|
||||||
|
continue
|
||||||
|
seg_start = float(seg.get("start", 0.0))
|
||||||
|
seg_end = float(seg.get("end", seg_start))
|
||||||
|
toks = [w for w in re.split(r"(\s+)", text) if w.strip()]
|
||||||
|
if not toks or seg_end <= seg_start:
|
||||||
|
continue
|
||||||
|
dur = seg_end - seg_start
|
||||||
|
step = dur / len(toks)
|
||||||
|
for i, tok in enumerate(toks):
|
||||||
|
ws = seg_start + i * step
|
||||||
|
we = seg_start + (i+1) * step
|
||||||
|
words.append((ws, we, tok))
|
||||||
|
return words
|
||||||
|
|
||||||
|
def ass_time(t: float) -> str:
|
||||||
|
if t < 0: t = 0
|
||||||
|
h = int(t // 3600); m = int((t % 3600)//60); s = int(t % 60); cs = int(round((t - int(t))*100))
|
||||||
|
return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}"
|
||||||
|
|
||||||
|
def write_ass_words(words: List[Tuple[float,float,str]], ass_path: Path, font_size: int, margin_v: int, uppercase: bool):
|
||||||
|
"""
|
||||||
|
Ein Wort pro Zeile, ohne Überlappung:
|
||||||
|
- Ende = min(eigene Endzeit, Start nächstes Wort - 0.02)
|
||||||
|
- Pop-Animation 150ms, fette Outline, Bottom-Center (PlayResY=1920)
|
||||||
|
"""
|
||||||
|
header = f"""[Script Info]
|
||||||
|
ScriptType: v4.00+
|
||||||
|
Collisions: Normal
|
||||||
|
PlayResX: 1080
|
||||||
|
PlayResY: 1920
|
||||||
|
ScaledBorderAndShadow: yes
|
||||||
|
|
||||||
|
[V4+ Styles]
|
||||||
|
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
||||||
|
Style: WordCap,Inter,{font_size},&H00FFFFFF,&H00FFFFFF,&H00101010,&H64000000,1,0,0,0,100,100,0,0,1,6,0.8,2,80,80,{margin_v},1
|
||||||
|
|
||||||
|
[Events]
|
||||||
|
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
||||||
|
"""
|
||||||
|
# Zeiten glätten, damit immer nur ein Wort sichtbar ist
|
||||||
|
adjusted = []
|
||||||
|
for i, (s, e, t) in enumerate(words):
|
||||||
|
nstart = words[i+1][0] if i+1 < len(words) else e
|
||||||
|
new_end = min(e, nstart - 0.02) if nstart > s else e
|
||||||
|
if new_end <= s:
|
||||||
|
new_end = s + 0.06
|
||||||
|
adjusted.append((s, new_end, t))
|
||||||
|
|
||||||
|
with open(ass_path, "w", encoding="utf-8") as f:
|
||||||
|
f.write(header)
|
||||||
|
for s, e, t in adjusted:
|
||||||
|
st, en = ass_time(s), ass_time(e)
|
||||||
|
txt = t.upper() if uppercase else t
|
||||||
|
# \fs sichere Größe, \blur für weiche Outline, \fad Ein/Aus,
|
||||||
|
# \fscx135\fscy135 → Start groß, \t(...) schrumpft in 150ms auf 100% = Pop
|
||||||
|
overrides = r"\blur1\bord8\1c&H0000FFFF&\3c&H000000&\4c&H000000&\fad(50,20)\fscx135\fscy135\t(0,150,\fscx100\fscy100)"
|
||||||
|
f.write(f"Dialogue: 0,{st},{en},WordCap,,0,0,0,,{{{overrides}}}{txt}\n")
|
||||||
|
|
||||||
|
def ffmpeg_escape_for_subtitles(path: Path) -> str:
|
||||||
|
"""
|
||||||
|
Pfad für -vf subtitles=… escapen (für Leerzeichen, Doppelpunkte etc.).
|
||||||
|
ffmpeg erwartet Backslash-escaping für Filter-Argumente.
|
||||||
|
"""
|
||||||
|
s = str(path)
|
||||||
|
s = s.replace("\\", "\\\\")
|
||||||
|
s = s.replace(":", "\\:")
|
||||||
|
s = s.replace("'", "\\'")
|
||||||
|
s = s.replace(",", "\\,")
|
||||||
|
s = s.replace("[", "\\[")
|
||||||
|
s = s.replace("]", "\\]")
|
||||||
|
s = s.replace(";", "\\;")
|
||||||
|
s = s.replace("=", "\\=")
|
||||||
|
return s
|
||||||
|
|
||||||
|
def burn(video_in: Path, ass_file: Path, out_path: Path, crf=18, preset="medium") -> int:
|
||||||
|
vf = f"subtitles={ffmpeg_escape_for_subtitles(ass_file)}"
|
||||||
|
cmd = [
|
||||||
|
"ffmpeg","-y","-i",str(video_in),
|
||||||
|
"-vf", vf,
|
||||||
|
"-c:v","libx264","-preset",preset,"-crf",str(crf),
|
||||||
|
"-c:a","copy",
|
||||||
|
str(out_path)
|
||||||
|
]
|
||||||
|
log("FFmpeg:", " ".join(cmd))
|
||||||
|
return subprocess.call(cmd)
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
p = argparse.ArgumentParser(description="Brennt Word-Caps (ASS) via Whisper-Transkription in 9:16-Clips.")
|
||||||
|
p.add_argument("--clips_dir", type=Path, default=CROPPED_DIR, help=f"Quellordner (Default: {CROPPED_DIR})")
|
||||||
|
p.add_argument("--out_dir", type=Path, default=SUBTITLED_DIR, help=f"Zielordner (Default: {SUBTITLED_DIR})")
|
||||||
|
p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster (Default: *.mp4)")
|
||||||
|
p.add_argument("--limit", type=int, default=None, help="Nur die ersten N Clips verarbeiten")
|
||||||
|
p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "medium"), help="Whisper-Modell")
|
||||||
|
p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. de, en, None=Auto)")
|
||||||
|
p.add_argument("--uppercase", action="store_true", help="Text in Großbuchstaben rendern")
|
||||||
|
p.add_argument("--font_size", type=int, default=108, help="ASS-Fontgröße")
|
||||||
|
p.add_argument("--margin_v", type=int, default=320, help="ASS-MarginV (Abstand vom unteren Rand)")
|
||||||
|
p.add_argument("--crf", type=int, default=18, help="ffmpeg CRF (Qualität)")
|
||||||
|
p.add_argument("--preset", type=str, default="medium", help="ffmpeg Preset")
|
||||||
|
return p.parse_args()
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
|
||||||
|
clips_dir = args.clips_dir
|
||||||
|
output_dir = args.out_dir
|
||||||
|
ensure_dir(output_dir)
|
||||||
|
|
||||||
|
log("Starte TikTok Word-Caps (Whisper)")
|
||||||
|
log("CLIPS_DIR =", clips_dir)
|
||||||
|
log("OUTPUT_DIR =", output_dir)
|
||||||
|
|
||||||
|
clips: List[str] = []
|
||||||
|
for pat in (args.pattern,):
|
||||||
|
clips += glob.glob(str(clips_dir / pat))
|
||||||
|
clips.sort()
|
||||||
|
log(f"{len(clips)} Clips gefunden.")
|
||||||
|
if args.limit:
|
||||||
|
clips = clips[:args.limit]
|
||||||
|
log(f"LIMIT aktiv: {args.limit}")
|
||||||
|
|
||||||
|
if not clips:
|
||||||
|
log("Keine Clips gefunden. Pfad/Pattern checken.")
|
||||||
|
return
|
||||||
|
|
||||||
|
# Whisper laden (CPU)
|
||||||
|
try:
|
||||||
|
model, device, fp16 = load_whisper_cpu(args.model)
|
||||||
|
log(f"Whisper geladen: {args.model} auf {device} (fp16={fp16})")
|
||||||
|
log("Hinweis: Beim ersten Lauf kann das Modell nachgeladen werden.")
|
||||||
|
except Exception as e:
|
||||||
|
print("[ERROR] Whisper konnte nicht geladen werden:", e)
|
||||||
|
traceback.print_exc()
|
||||||
|
return
|
||||||
|
|
||||||
|
lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
|
||||||
|
|
||||||
|
for clip in clips:
|
||||||
|
base = os.path.basename(clip)
|
||||||
|
stem, _ = os.path.splitext(base)
|
||||||
|
log("="*60)
|
||||||
|
log("Clip:", base)
|
||||||
|
|
||||||
|
if not has_audio_stream(clip):
|
||||||
|
log("WARN: Keine Audio-Spur → übersprungen.")
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Transkription
|
||||||
|
try:
|
||||||
|
log("Transkription startet …")
|
||||||
|
words = transcribe_words_whisper(model, clip, language=lang, fp16=fp16)
|
||||||
|
log(f"Transkription fertig. {len(words)} Wörter.")
|
||||||
|
if not words:
|
||||||
|
log("WARN: 0 Wörter erkannt → übersprungen.")
|
||||||
|
continue
|
||||||
|
except Exception as e:
|
||||||
|
print("[ERROR] Transkription fehlgeschlagen:", e)
|
||||||
|
traceback.print_exc()
|
||||||
|
continue
|
||||||
|
|
||||||
|
# ASS erzeugen & brennen
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".ass", delete=False) as tmp:
|
||||||
|
ass_path = Path(tmp.name)
|
||||||
|
try:
|
||||||
|
log("Erzeuge ASS …")
|
||||||
|
write_ass_words(words, ass_path, font_size=args.font_size, margin_v=args.margin_v, uppercase=args.uppercase)
|
||||||
|
out_path = output_dir / f"{stem}_subtitled.mp4"
|
||||||
|
log("Brenne Untertitel …")
|
||||||
|
rc = burn(Path(clip), ass_path, out_path, crf=args.crf, preset=args.preset)
|
||||||
|
if rc == 0:
|
||||||
|
log("OK:", out_path)
|
||||||
|
else:
|
||||||
|
log("ERROR: ffmpeg fehlgeschlagen, code", rc)
|
||||||
|
finally:
|
||||||
|
try: ass_path.unlink(missing_ok=True)
|
||||||
|
except Exception: pass
|
||||||
|
|
||||||
|
log("Fertig.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
25 src/subtitles/run_subtitles.py Normal file
@@ -0,0 +1,25 @@
import os
import tempfile
from add_subtitles import process  # we reuse the logic from the big script

# ==== CONFIGURE HERE ====
VIDEO_PATH = "data/input.mp4"            # your video
TRANSCRIPT_PATH = "data/transcript.srt"  # or .json (Whisper)
OUTPUT_DIR = "data/output"               # folder for the results
CLIPS_PATH = None                        # optional: "data/clips.csv" or "data/clips.json"
CRF = 18
PRESET = "medium"
STYLE = r"\\bord4\\shad4\\outline3\\fs64\\b1\\1c&HFFFFFF&\\3c&H000000&\\4c&H000000&"
# ==========================

if __name__ == "__main__":
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    process(
        video_path=VIDEO_PATH,
        transcript_path=TRANSCRIPT_PATH,
        output_dir=OUTPUT_DIR,
        clips_path=CLIPS_PATH,
        crf=CRF,
        preset=PRESET,
        style_overrides=STYLE,
    )
100 src/text/cutClips.py Normal file
@@ -0,0 +1,100 @@
#!/usr/bin/env python3
# cutClips.py — exports clips from the first video found, or from a given file

from pathlib import Path
import sqlite3
import argparse
from moviepy.video.io.VideoFileClip import VideoFileClip
import sys

# ── Add the project root to sys.path
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))

from config import INPUT_DIR, RAW_CLIPS_DIR, DB_PATH


def parse_args():
    p = argparse.ArgumentParser(description="Exportiert Highlights aus dem Video gemäß SQLite-DB.")
    p.add_argument("--file", type=str, default=None,
                   help="Name der Input-Datei im INPUT_DIR. Wenn leer, wird das erste Video im Ordner verwendet.")
    p.add_argument("--limit", type=int, default=10,
                   help="Anzahl der zu exportierenden Clips (Default: 10)")
    p.add_argument("--order", type=str, choices=["score", "start"], default="score",
                   help="Sortierung: 'score' (score_total absteigend) oder 'start' (zeitlich).")
    return p.parse_args()


def find_first_video(directory: Path):
    """Looks for the first video file in the directory (mp4, mov, mkv)."""
    for ext in ("*.mp4", "*.mov", "*.mkv"):
        files = sorted(directory.glob(ext))
        if files:
            return files[0]
    return None


def main():
    args = parse_args()

    # === Determine the input video ===
    if args.file:
        input_video = INPUT_DIR / args.file
    else:
        input_video = find_first_video(INPUT_DIR)
        if not input_video:
            raise FileNotFoundError(f"🚫 Kein Video im Eingabeordner {INPUT_DIR} gefunden.")
        print(f"📂 Kein --file angegeben → verwende automatisch: {input_video.name}")

    if not input_video.exists():
        raise FileNotFoundError(f"🚫 Input-Video nicht gefunden: {input_video}")

    output_dir = RAW_CLIPS_DIR
    output_dir.mkdir(parents=True, exist_ok=True)

    # === Read the SQLite DB ===
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()

    order_clause = "ORDER BY score_total DESC" if args.order == "score" else "ORDER BY start ASC"
    cursor.execute(f"""
        SELECT start, end, text
        FROM highlights
        {order_clause}
        LIMIT ?
    """, (args.limit,))
    highlights = cursor.fetchall()

    if not highlights:
        print("⚠️ Keine Highlights in der Datenbank gefunden.")
        conn.close()
        return

    # === Load the video ===
    video = VideoFileClip(str(input_video))

    # === Cut the clips ===
    for i, (start, end, text) in enumerate(highlights, start=1):
        if start >= video.duration:
            print(f"⚠️ Clip {i} übersprungen – Startzeit {start:.2f}s liegt außerhalb der Videolänge ({video.duration:.2f}s).")
            continue

        end = min(end, video.duration)
        output_file = output_dir / f"highlight_{i}.mp4"
        print(f"🎬 Exportiere Clip {i}: {start:.2f}s – {end:.2f}s → {output_file.name}")

        try:
            clip = video.subclipped(start, end)
            clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac", logger=None)
            clip.close()
        except Exception as e:
            print(f"❌ Fehler beim Export von Clip {i}: {e}")

    # === Cleanup ===
    conn.close()
    video.close()
    print(f"✅ {len(highlights)} Clips exportiert nach {output_dir}")


if __name__ == "__main__":
    main()
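cutClips.py only reads start, end and text and sorts by score_total, so a stripped-down highlights table is enough for a dry run. A sketch under that assumption (the database name here is hypothetical, not the DB_PATH from config; the full schema is created by segment_transcript.py further below):

# Illustrative only: the minimal table cutClips.py can be pointed at for testing.
import sqlite3

conn = sqlite3.connect("clips_test.db")  # hypothetical test database
conn.execute("""CREATE TABLE IF NOT EXISTS highlights (
    start REAL, end REAL, text TEXT, score_total INTEGER)""")
conn.execute("INSERT INTO highlights VALUES (12.5, 47.0, 'example hook', 31)")
conn.commit()
conn.close()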
@ -2,44 +2,41 @@ import sqlite3
|
|||||||
import re
|
import re
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
from time import sleep
|
from time import sleep
|
||||||
|
from pathlib import Path
|
||||||
|
import os
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Projekt-Root einfügen (2 Ebenen hoch von src/* ausgehend)
|
||||||
|
ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
from config import DB_PATH
|
||||||
|
|
||||||
|
|
||||||
# === Einstellungen ===
|
|
||||||
DB_PATH = "clips_openai.db"
|
|
||||||
VIDEO_ID = "testVideoShort"
|
|
||||||
MAX_CLIPS = 5 # oder "all"
|
MAX_CLIPS = 5 # oder "all"
|
||||||
OPENAI_API_KEY = "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"
|
|
||||||
|
|
||||||
client = OpenAI(api_key=OPENAI_API_KEY)
|
# === OPENAI-CLIENT (API-Key aus Env) ===
|
||||||
|
if not os.getenv("OPENAI_API_KEY"):
|
||||||
|
raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
|
||||||
|
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||||
|
|
||||||
# === DB-Verbindung
|
# === DB-Verbindung
|
||||||
conn = sqlite3.connect(DB_PATH)
|
conn = sqlite3.connect(DB_PATH)
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
cursor.execute("DROP TABLE IF EXISTS highlights")
|
# === Unbewertete Highlights laden
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
CREATE TABLE highlights (
|
SELECT id, start, end, text FROM highlights
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
WHERE viralitaet IS NULL OR emotionalitaet IS NULL
|
||||||
file TEXT,
|
ORDER BY start
|
||||||
start REAL,
|
|
||||||
end REAL,
|
|
||||||
text TEXT,
|
|
||||||
viralitaet INTEGER,
|
|
||||||
emotionalitaet INTEGER,
|
|
||||||
witz INTEGER,
|
|
||||||
provokation INTEGER,
|
|
||||||
score_total INTEGER
|
|
||||||
)
|
|
||||||
""")
|
""")
|
||||||
conn.commit()
|
|
||||||
print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}")
|
|
||||||
|
|
||||||
# === Segmente laden
|
|
||||||
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
|
|
||||||
segments = cursor.fetchall()
|
segments = cursor.fetchall()
|
||||||
print(f"📥 {len(segments)} Segmente (Originaltext) geladen.")
|
print(f"📥 {len(segments)} unbewertete Highlights geladen.")
|
||||||
|
|
||||||
# === Bewertungsfunktion (GPT-4o)
|
# === Bewertungsfunktion (GPT-4o)
|
||||||
def analyse_segment(text, start, end):
|
def analyse_segment(clip_id, text, start, end):
|
||||||
print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s")
|
print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s")
|
||||||
|
|
||||||
prompt = f"""
|
prompt = f"""
|
||||||
@ -86,19 +83,19 @@ Provokation: [Zahl]
|
|||||||
if all(v is not None for v in values.values()):
|
if all(v is not None for v in values.values()):
|
||||||
total_score = sum(values.values())
|
total_score = sum(values.values())
|
||||||
cursor.execute("""
|
cursor.execute("""
|
||||||
INSERT INTO highlights (
|
UPDATE highlights SET
|
||||||
file, start, end, text,
|
viralitaet = ?, emotionalitaet = ?, witz = ?, provokation = ?, score_total = ?
|
||||||
viralitaet, emotionalitaet, witz, provokation, score_total
|
WHERE id = ?
|
||||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
|
||||||
""", (
|
""", (
|
||||||
VIDEO_ID, start, end, text.strip(),
|
|
||||||
values["viralitaet"], values["emotionalitaet"],
|
values["viralitaet"], values["emotionalitaet"],
|
||||||
values["witz"], values["provokation"],
|
values["witz"], values["provokation"],
|
||||||
total_score
|
total_score,
|
||||||
|
clip_id
|
||||||
))
|
))
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
"id": clip_id,
|
||||||
"start": start,
|
"start": start,
|
||||||
"end": end,
|
"end": end,
|
||||||
"text": text.strip(),
|
"text": text.strip(),
|
||||||
@ -113,8 +110,8 @@ Provokation: [Zahl]
|
|||||||
|
|
||||||
# === Clips bewerten
|
# === Clips bewerten
|
||||||
rated = []
|
rated = []
|
||||||
for start, end, text in segments:
|
for clip_id, start, end, text in segments:
|
||||||
result = analyse_segment(text, float(start), float(end))
|
result = analyse_segment(clip_id, text, float(start), float(end))
|
||||||
if result:
|
if result:
|
||||||
rated.append(result)
|
rated.append(result)
|
||||||
sleep(1.2) # Anti-Rate-Limit
|
sleep(1.2) # Anti-Rate-Limit
|
||||||
@ -123,7 +120,7 @@ for start, end, text in segments:
|
|||||||
rated.sort(key=lambda x: x["total"], reverse=True)
|
rated.sort(key=lambda x: x["total"], reverse=True)
|
||||||
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]
|
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]
|
||||||
|
|
||||||
print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}")
|
print(f"\n🎬 Beste {len(selected)} Highlights nach Bewertung:")
|
||||||
for clip in selected:
|
for clip in selected:
|
||||||
print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
|
print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
|
||||||
print(f"🎙️ {clip['text'][:200]}...")
|
print(f"🎙️ {clip['text'][:200]}...")
|
409 src/text/segment_transcript.py Normal file
@@ -0,0 +1,409 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# clip_selector_optimized.py — word-based text rebuild (no duplicates)
|
||||||
|
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import json
|
||||||
|
import sqlite3
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
from datetime import datetime
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from typing import List, Dict, Optional
|
||||||
|
|
||||||
|
from openai import OpenAI
|
||||||
|
|
||||||
|
# ── Projektwurzel in sys.path aufnehmen (dieses Skript kann z. B. unter src/text/ liegen)
|
||||||
|
ROOT = Path(__file__).resolve().parents[2]
|
||||||
|
sys.path.insert(0, str(ROOT))
|
||||||
|
|
||||||
|
from config import TRANSCRIPTS_DIR, DB_PATH # zentrale Pfade
|
||||||
|
|
||||||
|
LOG_DIR = ROOT / "logs"
|
||||||
|
LOG_DIR.mkdir(exist_ok=True, parents=True)
|
||||||
|
|
||||||
|
# === DEFAULTS (per CLI überschreibbar) ===
|
||||||
|
DEFAULT_BLOCK_DURATION = 300.0 # Sek. pro Block
|
||||||
|
DEFAULT_MIN_CLIP_LEN = 30.0 # konsistent mit Prompt
|
||||||
|
DEFAULT_MAX_CLIP_LEN = 90.0
|
||||||
|
|
||||||
|
# === OPENAI-CLIENT (API-Key aus Env) ===
|
||||||
|
API_KEY = os.getenv("OPENAI_API_KEY")
|
||||||
|
if not API_KEY:
|
||||||
|
raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
|
||||||
|
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # bei Bedarf überschreiben
|
||||||
|
client = OpenAI(api_key=API_KEY)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Hilfsfunktionen
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def log_text(filename: str, content: str) -> None:
|
||||||
|
(LOG_DIR / filename).write_text((content or "").strip(), encoding="utf-8")
|
||||||
|
|
||||||
|
def append_error_log(content: str) -> None:
|
||||||
|
with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
|
||||||
|
f.write(f"{datetime.now().isoformat()} {content}\n\n")
|
||||||
|
|
||||||
|
def extract_json(text: str) -> list:
|
||||||
|
"""Nur für Debug: versucht JSON-Array aus beliebigem Text zu extrahieren."""
|
||||||
|
txt = (text or "").strip()
|
||||||
|
txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.IGNORECASE | re.DOTALL)
|
||||||
|
m = re.search(r"\[\s*{.*?}\s*\]", txt, re.DOTALL)
|
||||||
|
if not m:
|
||||||
|
append_error_log(f"❌ Kein JSON-Array gefunden.\n{txt}")
|
||||||
|
return []
|
||||||
|
try:
|
||||||
|
return json.loads(m.group(0))
|
||||||
|
except Exception as e:
|
||||||
|
append_error_log(f"❌ JSON-Fehler: {e}\n{txt}")
|
||||||
|
return []
|
||||||
|
|
||||||
|
def get_json_snippets_for_clip(start: float, end: float, segment_json: List[Dict]) -> List[Dict]:
|
||||||
|
"""halb-offenes Fenster [start, end)"""
|
||||||
|
return [s for s in segment_json if not (float(s["end"]) <= start or float(s["start"]) >= end)]
|
||||||
|
|
||||||
|
def _norm_space(s: str) -> str:
|
||||||
|
return re.sub(r"\s+", " ", (s or "").strip())
|
||||||
|
|
||||||
|
def explode_segments_to_words(segments: List[Dict]) -> List[Dict]:
|
||||||
|
"""
|
||||||
|
Baut eine globale Wortliste. Bevorzugt echte 'words' aus JSON,
|
||||||
|
fällt ansonsten auf lineare Interpolation über Segmentdauer zurück.
|
||||||
|
Ausgabe-Items: {idx, mid, text}
|
||||||
|
"""
|
||||||
|
words = []
|
||||||
|
idx = 0
|
||||||
|
for seg in sorted(segments, key=lambda s: (float(s["start"]), float(s["end"]))):
|
||||||
|
s0, s1 = float(seg["start"]), float(seg["end"])
|
||||||
|
txt = (seg.get("text") or "").strip()
|
||||||
|
seg_words = seg.get("words") or []
|
||||||
|
if seg_words:
|
||||||
|
for w in seg_words:
|
||||||
|
t = (w.get("text") or w.get("word") or "").strip()
|
||||||
|
if not t:
|
||||||
|
continue
|
||||||
|
w0 = float(w["start"]); w1 = float(w["end"])
|
||||||
|
words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": t})
|
||||||
|
idx += 1
|
||||||
|
else:
|
||||||
|
toks = txt.split()
|
||||||
|
n = len(toks)
|
||||||
|
if n == 0:
|
||||||
|
continue
|
||||||
|
dur = max(1e-6, s1 - s0)
|
||||||
|
for i, tok in enumerate(toks):
|
||||||
|
w0 = s0 + (i / n) * dur
|
||||||
|
w1 = s0 + ((i + 1) / n) * dur
|
||||||
|
words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": tok})
|
||||||
|
idx += 1
|
||||||
|
return words
|
||||||
|
|
||||||
|
def build_text_strict_from_words(clip_start: float, clip_end: float, WORDS: List[Dict]) -> str:
|
||||||
|
"""Nimmt jedes Wort genau einmal, wenn mid ∈ [start, end)."""
|
||||||
|
sel = [w for w in WORDS if clip_start <= w["mid"] < clip_end]
|
||||||
|
sel.sort(key=lambda w: w["idx"])
|
||||||
|
return _norm_space(" ".join(w["text"] for w in sel))
|
||||||
|
|
||||||
|
def find_transcript_pair(base: Optional[str]) -> tuple[Path, Path, str]:
|
||||||
|
"""
|
||||||
|
Finde (timed.txt, segments.json) in TRANSCRIPTS_DIR.
|
||||||
|
- Wenn base übergeben: benutzt {base}_timed.txt und {base}_segments.json.
|
||||||
|
- Sonst: nimmt das lexikographisch erste *_timed.txt und leitet die JSON davon ab.
|
||||||
|
"""
|
||||||
|
if base:
|
||||||
|
txt = TRANSCRIPTS_DIR / f"{base}_timed.txt"
|
||||||
|
jsn = TRANSCRIPTS_DIR / f"{base}_segments.json"
|
||||||
|
if not txt.exists():
|
||||||
|
raise FileNotFoundError(f"Transkript nicht gefunden: {txt}")
|
||||||
|
if not jsn.exists():
|
||||||
|
raise FileNotFoundError(f"Segment-JSON nicht gefunden: {jsn}")
|
||||||
|
return txt, jsn, base
|
||||||
|
|
||||||
|
# auto-detect
|
||||||
|
candidates = sorted(TRANSCRIPTS_DIR.glob("*_timed.txt"))
|
||||||
|
if not candidates:
|
||||||
|
raise FileNotFoundError(f"Keine *_timed.txt in {TRANSCRIPTS_DIR} gefunden.")
|
||||||
|
txt = candidates[0]
|
||||||
|
stem = txt.stem.replace("_timed", "")
|
||||||
|
jsn = TRANSCRIPTS_DIR / f"{stem}_segments.json"
|
||||||
|
if not jsn.exists():
|
||||||
|
raise FileNotFoundError(f"Gefundenes TXT: {txt.name}, aber JSON fehlt: {jsn.name}")
|
||||||
|
return txt, jsn, stem
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# CLI
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def parse_args():
|
||||||
|
p = argparse.ArgumentParser(description="Selektiert Social-Media-taugliche Clips aus Transkripten (LLM-gestützt).")
|
||||||
|
p.add_argument("--base", type=str, default=None,
|
||||||
|
help="Basename der Transkriptdateien (z. B. 'testVideoShort' für *_timed.txt und *_segments.json).")
|
||||||
|
p.add_argument("--block", type=float, default=DEFAULT_BLOCK_DURATION, help="Blocklänge in Sekunden für die Prompt-Bildung.")
|
||||||
|
p.add_argument("--min", type=float, default=DEFAULT_MIN_CLIP_LEN, help="Minimale Clip-Länge (Sekunden).")
|
||||||
|
p.add_argument("--max", type=float, default=DEFAULT_MAX_CLIP_LEN, help="Maximale Clip-Länge (Sekunden).")
|
||||||
|
return p.parse_args()
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Main
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def main():
|
||||||
|
args = parse_args()
|
||||||
|
BLOCK_DURATION = float(args.block)
|
||||||
|
MIN_CLIP_LEN = float(args.min)
|
||||||
|
MAX_CLIP_LEN = float(args.max)
|
||||||
|
|
||||||
|
# --- Transkriptdateien finden ---
|
||||||
|
TRANSCRIPT_PATH, SEGMENT_JSON_PATH, base = find_transcript_pair(args.base)
|
||||||
|
print(f"📄 TXT : {TRANSCRIPT_PATH}")
|
||||||
|
print(f"🧾 JSON: {SEGMENT_JSON_PATH}")
|
||||||
|
|
||||||
|
# === TRANSKRIPT EINLESEN (TXT) -> NUR für Blockbildung & Promptanzeige ===
|
||||||
|
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
|
||||||
|
segments_txt: List[Dict] = []
|
||||||
|
for line in lines:
|
||||||
|
m = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(?:[A-Z_0-9]+:)?\s*(.*)", line)
|
||||||
|
if not m:
|
||||||
|
continue
|
||||||
|
start, end, text = m.groups()
|
||||||
|
start, end = float(start), float(end)
|
||||||
|
if end - start >= 2.0:
|
||||||
|
segments_txt.append({"start": start, "end": end, "text": (text or "").strip()})
|
||||||
|
|
||||||
|
if not segments_txt:
|
||||||
|
raise RuntimeError("🚫 Keine gültigen TXT-Segmente gefunden.")
|
||||||
|
print(f"✅ {len(segments_txt)} gültige TXT-Segmente geladen.")
|
||||||
|
|
||||||
|
# === TRANSKRIPT EINLESEN (JSON) -> Quelle für DB-Text/Wörter ===
|
||||||
|
segment_json_data = json.loads(SEGMENT_JSON_PATH.read_text(encoding="utf-8"))
|
||||||
|
if not isinstance(segment_json_data, list) or not segment_json_data:
|
||||||
|
raise RuntimeError("🚫 JSON-Segmente leer/ungültig.")
|
||||||
|
print(f"✅ {len(segment_json_data)} JSON-Segmente geladen.")
|
||||||
|
|
||||||
|
# Globale Wörterliste einmal berechnen (bevor wir Clips bilden)
|
||||||
|
WORDS = explode_segments_to_words(segment_json_data)
|
||||||
|
print(f"🔤 Globale Wörter im Korpus: {len(WORDS)}")
|
||||||
|
|
||||||
|
# === BLÖCKE BILDEN (aus TXT) ===
|
||||||
|
segments_txt.sort(key=lambda s: (s["start"], s["end"]))
|
||||||
|
blocks, current_block, current_start = [], [], 0.0
|
||||||
|
for seg in segments_txt:
|
||||||
|
if not current_block:
|
||||||
|
current_start = seg["start"]
|
||||||
|
# Blockwechsel, wenn Dauer überschritten
|
||||||
|
if seg["end"] - current_start > BLOCK_DURATION:
|
||||||
|
blocks.append(current_block)
|
||||||
|
current_block = []
|
||||||
|
current_start = seg["start"]
|
||||||
|
current_block.append(seg)
|
||||||
|
if current_block:
|
||||||
|
blocks.append(current_block)
|
||||||
|
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION:.0f}s).")
|
||||||
|
|
||||||
|
# === KI: CLIP-AUSWAHL ===
|
||||||
|
all_clips = []
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
|
||||||
|
for i, block in enumerate(blocks, start=1):
|
||||||
|
if not block:
|
||||||
|
continue
|
||||||
|
print(f"\n🤖 Sende Block {i}/{len(blocks)} an {OPENAI_MODEL} …")
|
||||||
|
block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
|
||||||
|
|
||||||
|
prompt = f"""
|
||||||
|
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Social Media Clips eignen.
|
||||||
|
Ein guter Clip:
|
||||||
|
- ist abgeschlossen und verständlich
|
||||||
|
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
|
||||||
|
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
|
||||||
|
- ist mindestens {MIN_CLIP_LEN:.0f} Sekunden lang
|
||||||
|
Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
|
||||||
|
|
||||||
|
Gib ein JSON-Objekt zurück im Format:
|
||||||
|
{{
|
||||||
|
"clips": [
|
||||||
|
{{
|
||||||
|
"start": float,
|
||||||
|
"end": float,
|
||||||
|
"summary": "Kurze Beschreibung des Inhalts"
|
||||||
|
}}
|
||||||
|
]
|
||||||
|
}}
|
||||||
|
|
||||||
|
TRANSKRIPT:
|
||||||
|
{block_text}
|
||||||
|
""".strip()
|
||||||
|
|
||||||
|
log_text(f"block_prompt_{i:02d}.txt", prompt)
|
||||||
|
|
||||||
|
# --- robuster API-Call mit Schema (Root=object) und kleinem Retry ---
|
||||||
|
import time as _time
|
||||||
|
clips = []
|
||||||
|
for attempt in range(3):
|
||||||
|
try:
|
||||||
|
resp = client.chat.completions.create(
|
||||||
|
model=OPENAI_MODEL,
|
||||||
|
messages=[{"role": "user", "content": prompt}],
|
||||||
|
response_format={
|
||||||
|
"type": "json_schema",
|
||||||
|
"json_schema": {
|
||||||
|
"name": "clips_payload",
|
||||||
|
"schema": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"clips": {
|
||||||
|
"type": "array",
|
||||||
|
"items": {
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"start": {"type": "number"},
|
||||||
|
"end": {"type": "number"},
|
||||||
|
"summary": {"type": "string"}
|
||||||
|
},
|
||||||
|
"required": ["start", "end", "summary"],
|
||||||
|
"additionalProperties": False
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"required": ["clips"],
|
||||||
|
"additionalProperties": False
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
msg = resp.choices[0].message
|
||||||
|
payload = getattr(msg, "parsed", None)
|
||||||
|
if payload is None:
|
||||||
|
payload = json.loads(msg.content)
|
||||||
|
|
||||||
|
clips = payload.get("clips", []) or []
|
||||||
|
|
||||||
|
try:
|
||||||
|
log_text(f"block_output_{i:02d}.txt", json.dumps(payload, ensure_ascii=False, indent=2))
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
break
|
||||||
|
except Exception as e:
|
||||||
|
if attempt == 2:
|
||||||
|
append_error_log(f"❌ OpenAI-Fehler Block {i}: {e}")
|
||||||
|
print(f"❌ Fehler bei Block {i}: {e}")
|
||||||
|
else:
|
||||||
|
_time.sleep(1.5 * (attempt + 1))
|
||||||
|
|
||||||
|
print(f"✅ {len(clips)} Clips empfangen in Block {i}")
|
||||||
|
|
||||||
|
# --- Clips filtern & clampen ---
|
||||||
|
for clip in clips:
|
||||||
|
try:
|
||||||
|
b_start, b_end = block[0]["start"], block[-1]["end"]
|
||||||
|
start = max(b_start, min(float(clip["start"]), b_end))
|
||||||
|
end = max(b_start, min(float(clip["end"]), b_end))
|
||||||
|
dur = end - start
|
||||||
|
if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
|
||||||
|
clip["start"] = start
|
||||||
|
clip["end"] = end
|
||||||
|
clip["duration"] = round(dur, 2)
|
||||||
|
all_clips.append(clip)
|
||||||
|
except Exception as e:
|
||||||
|
append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
|
||||||
|
|
||||||
|
elapsed = time.perf_counter() - t0
|
||||||
|
avg = elapsed / i
|
||||||
|
eta = max(0.0, avg * (len(blocks) - i))
|
||||||
|
print(f"⏱️ Geschätzte Restzeit: {eta:.1f} s")
|
||||||
|
|
||||||
|
# --- Duplikate entfernen (auf 2 Dezimalen) ---
|
||||||
|
dedup, seen = [], set()
|
||||||
|
for c in all_clips:
|
||||||
|
k = (round(c["start"], 2), round(c["end"], 2))
|
||||||
|
if k in seen:
|
||||||
|
continue
|
||||||
|
seen.add(k)
|
||||||
|
dedup.append(c)
|
||||||
|
all_clips = dedup
|
||||||
|
|
||||||
|
print(f"\n📈 Gesamtclips vor DB-Insert: {len(all_clips)}")
|
||||||
|
|
||||||
|
# === DB SPEICHERN ===
|
||||||
|
conn = sqlite3.connect(DB_PATH)
|
||||||
|
cur = conn.cursor()
|
||||||
|
|
||||||
|
cur.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS highlights (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
file TEXT,
|
||||||
|
start REAL,
|
||||||
|
end REAL,
|
||||||
|
duration REAL,
|
||||||
|
text TEXT,
|
||||||
|
summary TEXT,
|
||||||
|
json_raw TEXT,
|
||||||
|
viralitaet INTEGER,
|
||||||
|
emotionalitaet INTEGER,
|
||||||
|
witz INTEGER,
|
||||||
|
provokation INTEGER,
|
||||||
|
score_total INTEGER,
|
||||||
|
UNIQUE(file,start,end)
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
|
||||||
|
# --- Tabelle vor neuem Lauf komplett leeren ---
|
||||||
|
cur.execute("DELETE FROM highlights")
|
||||||
|
conn.commit() # Transaktion schließen, damit VACUUM außerhalb läuft
|
||||||
|
|
||||||
|
# VACUUM separat (optional)
|
||||||
|
try:
|
||||||
|
conn.execute("VACUUM") # oder: sqlite3.connect(DB_PATH).execute("VACUUM").close()
|
||||||
|
print("🧹 Alte Highlights gelöscht und Datenbank komprimiert.")
|
||||||
|
except sqlite3.OperationalError as e:
|
||||||
|
print(f"⚠️ VACUUM übersprungen: {e}")
|
||||||
|
|
||||||
|
inserted = 0
|
||||||
|
failed = 0
|
||||||
|
|
||||||
|
for clip in all_clips:
|
||||||
|
try:
|
||||||
|
start = float(clip["start"])
|
||||||
|
end = float(clip["end"])
|
||||||
|
duration = float(clip["duration"])
|
||||||
|
summary = (clip.get("summary") or "").strip()
|
||||||
|
|
||||||
|
if end <= start or start < 0:
|
||||||
|
raise ValueError("Ungültige Zeiten")
|
||||||
|
|
||||||
|
# JSON-Segmente (zur Nachvollziehbarkeit) + Wort-basierter Text (dopplerfrei)
|
||||||
|
json_snippets = get_json_snippets_for_clip(start, end, segment_json_data)
|
||||||
|
json_raw = json.dumps(json_snippets, ensure_ascii=False)
|
||||||
|
|
||||||
|
original_text = build_text_strict_from_words(start, end, WORDS)
|
||||||
|
|
||||||
|
cur.execute("""
|
||||||
|
INSERT OR IGNORE INTO highlights (
|
||||||
|
file, start, end, duration, text, summary, json_raw,
|
||||||
|
viralitaet, emotionalitaet, witz, provokation, score_total
|
||||||
|
)
|
||||||
|
VALUES (?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL, NULL, NULL)
|
||||||
|
""", (
|
||||||
|
# 'file' = Basename (z. B. testVideoShort)
|
||||||
|
Path(base).name,
|
||||||
|
start, end, duration,
|
||||||
|
original_text, summary, json_raw
|
||||||
|
))
|
||||||
|
if cur.rowcount > 0:
|
||||||
|
inserted += 1
|
||||||
|
except Exception as e:
|
||||||
|
failed += 1
|
||||||
|
append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
|
||||||
|
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
print("\n📊 Ergebnisse:")
|
||||||
|
print(f" ✅ Highlights gespeichert: {inserted}")
|
||||||
|
print(f" ❌ Fehlerhafte Clips: {failed}")
|
||||||
|
print(f"📁 Logs: {LOG_DIR.resolve()}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
276 src/text/transcription.py Normal file
@@ -0,0 +1,276 @@
|
|||||||
|
#!/usr/bin/env python3
# transcription_chunked_words.py — Whisper with word timestamps, duplicate-safe

import os
import sys
import json
import argparse
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Dict, Tuple, Optional

import ffmpeg
import whisper

# ── Add the project root to sys.path (this script lives under src/text/, for example)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))

from config import INPUT_DIR, TRANSCRIPTS_DIR  # central paths

# ──────────────────────────────────────────────────────────────────────────────
# Utilities
# ──────────────────────────────────────────────────────────────────────────────

def probe_duration(path: Path) -> float:
    """Determine the video duration in seconds (via ffmpeg.probe)."""
    try:
        meta = ffmpeg.probe(str(path))
    except ffmpeg.Error as e:
        raise RuntimeError(f"ffmpeg.probe fehlgeschlagen für {path}: {e.stderr.decode('utf-8', 'ignore') if hasattr(e, 'stderr') else e}") from e

    dur = meta.get("format", {}).get("duration")
    if dur is not None:
        return float(dur)

    cand = 0.0
    for s in meta.get("streams", []) or []:
        d = s.get("duration")
        if d:
            cand = max(cand, float(d))
    if cand > 0:
        return cand
    raise RuntimeError(f"Konnte Videodauer nicht bestimmen: {path}")

def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float, float]]:
    """Split [0, total] into overlapping intervals."""
    if chunk_seconds <= 0:
        return [(0.0, total)]
    s, out = 0.0, []
    while s < total:
        e = min(s + chunk_seconds, total)
        out.append((s, e))
        if e >= total:
            break
        s = max(0.0, e - overlap)
    return out

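To make the chunking concrete, an illustrative check (not part of the commit) of what make_chunks returns for a 150 s file with the script's defaults of 60 s chunks and 2 s overlap:

# Illustrative only: chunk layout for a 150 s file with the default 60 s / 2 s settings.
print(make_chunks(150.0, chunk_seconds=60.0, overlap=2.0))
# [(0.0, 60.0), (58.0, 118.0), (116.0, 150.0)]
# Each chunk starts 2 s before the previous one ends; main() later keeps only the
# middle of each chunk (the "center cut"), so segments at chunk borders are not duplicated.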
def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None:
    """Extract [start, end] as mono 16 kHz WAV."""
    (
        ffmpeg
        .input(str(src_video), ss=start, to=end)
        .output(
            str(out_wav),
            format="wav",
            acodec="pcm_s16le",
            ac=1,
            ar="16000",
            loglevel="error",
        )
        .overwrite_output()
        .run()
    )

def is_suspect(text: str) -> bool:
    """Heuristic: flag empty, looping, or otherwise dubious lines."""
    t = (text or "").strip().lower()
    if not t:
        return True
    words = t.split()
    if not words:
        return True
    counts = {w: words.count(w) for w in set(words)}
    most_common = max(counts.values())
    return most_common / len(words) > 0.6 or most_common > 20

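Two illustrative calls (not part of the commit) showing what the heuristic flags:

# Illustrative only: a looping line is flagged, ordinary speech is not.
print(is_suspect("ja ja ja ja ja ja ja ja ja ja"))              # True  (one word dominates > 60 %)
print(is_suspect("Heute sprechen wir über das neue Projekt."))  # False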
def merge_overlaps_keep_best(
    segments: List[Dict],
    max_gap: float = 0.15,
    min_dur: float = 0.30
) -> List[Dict]:
    """
    Sort by time and close small gaps. On overlap:
    - no text concatenation
    - keep the "better" segment (longer duration, then longer text)
    - words: taken from the "best" segment (if present)
    """
    cleaned = []
    for s in segments:
        s0 = float(s["start"]); s1 = float(s["end"])
        txt = (s.get("text") or "").strip()
        if s1 - s0 >= min_dur and txt:
            cleaned.append({
                "start": s0, "end": s1,
                "text": txt,
                "words": s.get("words", [])
            })
    if not cleaned:
        return []

    cleaned.sort(key=lambda x: (x["start"], x["end"]))
    out = [cleaned[0]]

    def score(x: Dict) -> tuple:
        return (x["end"] - x["start"], len(x.get("text", "")))

    for s in cleaned[1:]:
        m = out[-1]
        if s["start"] <= m["end"] + max_gap:
            best = s if score(s) > score(m) else m
            out[-1] = {
                "start": min(m["start"], s["start"]),
                "end": max(m["end"], s["end"]),
                "text": best["text"],
                "words": best.get("words", []),
            }
        else:
            out.append(s)
    return out

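A quick sanity check of the merge rule (illustrative, not part of the commit): overlapping segments collapse into one span and the "better" (longer) text wins, while a later, non-overlapping segment stays separate:

# Illustrative only: overlap keeps the better text, non-overlapping segments stay separate.
segs = [
    {"start": 0.0, "end": 2.0, "text": "hallo welt", "words": []},
    {"start": 1.9, "end": 4.5, "text": "hallo welt, wie geht es euch", "words": []},
    {"start": 6.0, "end": 7.0, "text": "nächstes thema", "words": []},
]
print(merge_overlaps_keep_best(segs))
# [{'start': 0.0, 'end': 4.5, 'text': 'hallo welt, wie geht es euch', 'words': []},
#  {'start': 6.0, 'end': 7.0, 'text': 'nächstes thema', 'words': []}]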
def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True):
    """Write _timed.txt, _suspect_lines.txt, and _segments.json."""
    out_dir.mkdir(parents=True, exist_ok=True)
    dash = "-" if ascii_dash else "–"

    out_txt = out_dir / f"{base.stem}_timed.txt"
    out_sus = out_dir / f"{base.stem}_suspect_lines.txt"
    out_json = out_dir / f"{base.stem}_segments.json"

    # TXT is for viewing only
    with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus:
        for s in segments:
            line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n"
            f_txt.write(line)
            if is_suspect(s["text"]):
                f_sus.write(line)

    # JSON for further processing (including words)
    with open(out_json, "w", encoding="utf-8") as f_json:
        json.dump(segments, f_json, ensure_ascii=False, indent=2)

    return out_txt, out_sus, out_json

def find_default_input() -> Optional[Path]:
    """Take the first video from INPUT_DIR if no --input was given."""
    exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3")
    for p in sorted(INPUT_DIR.iterdir()):
        if p.suffix.lower() in exts:
            return p
    return None

# ──────────────────────────────────────────────────────────────────────────────
# CLI
# ──────────────────────────────────────────────────────────────────────────────

def parse_args():
    p = argparse.ArgumentParser(
        description="Chunked Whisper Transcription mit Wortzeitstempeln & doppler-sicherem Stitching."
    )
    p.add_argument("--input", type=Path, default=None, help=f"Eingabevideo/-audio. Default: erstes File in {INPUT_DIR}")
    p.add_argument("--outdir", type=Path, default=None, help=f"Ausgabeverzeichnis. Default: {TRANSCRIPTS_DIR}")
    p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper-Modell (tiny/base/small/medium/large)")
    p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. 'de') oder leer/None für Auto-Detect")
    p.add_argument("--chunk", type=float, default=60.0, help="Chunk-Länge in Sekunden (0 = ganzes File)")
    p.add_argument("--overlap", type=float, default=2.0, help="Overlap in Sekunden")
    p.add_argument("--min-dur", type=float, default=0.30, help="Mindest-Segmentdauer (Sekunden)")
    p.add_argument("--max-gap", type=float, default=0.15, help="Maximaler Zeit-Gap für Merge (Sekunden)")
    p.add_argument("--fp16", action="store_true", help="fp16 aktivieren (nur sinnvoll mit GPU)")
    return p.parse_args()

# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────

def main():
    # Whisper cache (so models end up locally in the project)
    os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache"))

    args = parse_args()
    input_path = args.input or find_default_input()
    out_dir = args.outdir or TRANSCRIPTS_DIR

    print("📁 Projekt-Root:", ROOT)
    print("📄 Input:", input_path if input_path else "—")
    if not input_path or not input_path.exists():
        raise FileNotFoundError(f"Kein gültiges Eingabefile gefunden. Lege ein Video/Audio in {INPUT_DIR} oder nutze --input.")

    out_dir.mkdir(parents=True, exist_ok=True)

    duration = probe_duration(input_path)
    print(f"🎬 Dauer: {duration:.2f}s")

    chunks = make_chunks(duration, args.chunk, args.overlap)
    print(f"🔪 {len(chunks)} Chunks à {args.chunk:.1f}s mit {args.overlap:.1f}s Overlap")

    # Load Whisper
    print(f"🧠 Lade Whisper-Modell: {args.model}")
    try:
        model = whisper.load_model(args.model)
    except Exception as e:
        raise RuntimeError(f"Whisper-Modell '{args.model}' konnte nicht geladen werden. Installiert? (pip install openai-whisper)\n{e}") from e

    all_segments: List[Dict] = []
    with TemporaryDirectory() as tmpdir_str:
        tmpdir = Path(tmpdir_str)
        for i, (start, end) in enumerate(chunks, 1):
            print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s")
            wav = tmpdir / f"chunk_{i:03d}.wav"
            extract_audio_segment(input_path, start, end, wav)

            # Language: ''/none = auto-detect
            lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang

            # Transcribe with word timestamps and without cross-chunk context
            result = model.transcribe(
                str(wav),
                language=lang,
                fp16=args.fp16,
                word_timestamps=True,
                condition_on_previous_text=False,
                temperature=0,
                verbose=False,
            )

            # Center cut: keep only the middle part of each chunk (prevents boundary duplicates)
            keep_start = start if i == 1 else start + args.overlap / 2.0
            keep_end = end if i == len(chunks) else end - args.overlap / 2.0

            for seg in result.get("segments", []) or []:
                s0 = float(seg["start"]) + start
                s1 = float(seg["end"]) + start
                mid = (s0 + s1) / 2.0
                if not (keep_start <= mid < keep_end):
                    continue

                # Carry over words with absolute timestamps
                words = []
                for w in (seg.get("words") or []):
                    txt = (w.get("word") or w.get("text") or "").strip()
                    if not txt:
                        continue
                    words.append({
                        "start": float(w["start"]) + start,
                        "end": float(w["end"]) + start,
                        "text": txt
                    })

                all_segments.append({
                    "start": s0,
                    "end": s1,
                    "text": (seg.get("text") or "").strip(),
                    "words": words
                })

    print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …")
    merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur)
    print(f"✅ Gemergte Segmente: {len(merged)}")

    out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True)
    print(f"📝 TXT: {out_txt}")
    print(f"⚠️ SUSPECT: {out_sus}")
    print(f"💾 JSON: {out_json}")
    print("🎉 Fertig.")


if __name__ == "__main__":
    main()
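For downstream steps, a minimal sketch (not part of the commit) of how the generated <stem>_segments.json can be consumed; the path below is an assumed example:

import json
from pathlib import Path

# Illustrative only: read the segments JSON written by write_outputs().
seg_path = Path("data/transkripte/testVideoShort_segments.json")  # assumed example path
segments = json.loads(seg_path.read_text(encoding="utf-8"))
for s in segments[:3]:
    print(f"{s['start']:.2f}-{s['end']:.2f}s: {s['text']} ({len(s.get('words', []))} words)")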
88
src/text/transcription_with_speaker.py
Normal file
@@ -0,0 +1,88 @@
import os
import json
import ffmpeg
import whisper
import tempfile
import torch
from tqdm import tqdm
from pathlib import Path
from pyannote.audio import Pipeline

# === HUGGING FACE TOKEN (for pyannote) ===
HF_TOKEN = os.getenv("HF_TOKEN", "")  # read the token from the environment instead of hard-coding it

# === Torch optimization (optional) ===
torch.set_float32_matmul_precision("medium")

# === Settings ===
PROJECT_ROOT = Path(__file__).resolve().parents[2]
input_file = PROJECT_ROOT / "input" / "testVideoShort.mov"
output_dir = PROJECT_ROOT / "transkripte"
output_dir.mkdir(parents=True, exist_ok=True)

output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"

# === Convert video to audio ===
print("🎞️ Extrahiere Audio ...")
tmp_dir = Path(tempfile.mkdtemp())
wav_file = tmp_dir / "audio.wav"
ffmpeg.input(str(input_file)).output(
    str(wav_file),
    format="wav",
    acodec="pcm_s16le",
    ac=1,
    ar="16000",
    loglevel="error"
).overwrite_output().run()

# === Transcription with Whisper ===
print("🧠 Starte Transkription mit Whisper ...")
model = whisper.load_model("small")
result = model.transcribe(
    str(wav_file),
    language="de",
    fp16=False,
    word_timestamps=False,
    condition_on_previous_text=True,
    temperature=0,
    verbose=False
)
segments = result["segments"]

# === Diarization with pyannote ===
print("🗣️ Starte Sprecheranalyse mit Pyannote (das dauert jetzt etwas) ...")
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=HF_TOKEN
)
pipeline.to(torch.device("mps"))  # ⬅️ accelerate on the Apple GPU (requires Apple Silicon / MPS)

diarization = pipeline(str(wav_file))

# === Assign speakers ===
def assign_speakers_to_segments(segments, diarization):
    assigned = []
    for seg in tqdm(segments, desc="🎙️ Weise Sprecher zu"):
        speaker = "unknown"
        for turn, _, label in diarization.itertracks(yield_label=True):
            if turn.start <= seg["start"] <= turn.end:
                speaker = label
                break
        seg["speaker"] = speaker
        assigned.append(seg)
    return assigned

segments_with_speaker = assign_speakers_to_segments(segments, diarization)

# === Save as TXT ===
with open(output_txt, "w", encoding="utf-8") as f:
    for seg in segments_with_speaker:
        line = f"[{seg['start']:.2f} – {seg['end']:.2f}] {seg['speaker'].upper()}: {seg['text'].strip()}\n"
        f.write(line)

# === Save as JSON ===
with open(output_json, "w", encoding="utf-8") as f:
    json.dump(segments_with_speaker, f, ensure_ascii=False, indent=2)

print(f"✅ Transkript mit Sprecherinfos gespeichert unter:\n📄 {output_txt}\n📄 {output_json}")
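For orientation, the shape of one entry in the resulting <stem>_segments.json after speaker assignment (illustrative; the values are made up, and Whisper adds further fields such as id and tokens):

# Illustrative only: one segment after assign_speakers_to_segments().
example_segment = {
    "start": 12.34,           # seconds, from Whisper
    "end": 15.02,
    "text": " Das ist ein Beispielsatz.",
    "speaker": "SPEAKER_00",  # pyannote label, or "unknown" if no turn matched
}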
@@ -1 +0,0 @@
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb

BIN
transkripte/.DS_Store
vendored
Binary file not shown.

@@ -1 +0,0 @@
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243