cleanup: ignore text-clustering & whisper.cpp
parent: a9d700b20e
commit: 0c9b43af42
.gitignore (vendored): 105 changed lines
@@ -1,27 +1,108 @@
# IDE & Cache
# ─────────────────────────────
# IDEs & System Files
# ─────────────────────────────
.idea/
.vscode/
__pycache__/
*.pyc
.DS_Store
*.log

# Whisper Modelle & Cache
# ─────────────────────────────
# Cache / Modelle / Checkpoints
# ─────────────────────────────
whisper-cache/
models/
*.pt
*.onnx
*.bin
*.safetensors

# Output/Temp Files
# ─────────────────────────────
# Datenbank / temporäre Dateien
# ─────────────────────────────
*.db
*.sqlite
logs/
temp/
tmp/
*.tmp

# ─────────────────────────────
# Transkripte / KI-Zwischenausgaben
# ─────────────────────────────
/data/transkripte/
/transcripts/
/outputs/
/results/
*_segments.json
*_timed.txt
*_suspect_lines.txt

# ─────────────────────────────
# Video / Audio Outputs
# ─────────────────────────────
*.mp4
*.mov
*.db
*.mkv
*.wav
*.json
temp.*
logs/
*.webm
*.mp3

# Eingebettete Repos
# ─────────────────────────────
# Generierte Teil-/Ergebnis-Ordner
# ─────────────────────────────
/raw_clips/
/face_combined/
/face_crop_centers/
/cropped/
/subtitled/
/segments/
/highlight_clips/
/output/
/renders/
/exports/

# ─────────────────────────────
# Eingebettete Repos oder externe Module
# ─────────────────────────────
/whisper.cpp/
/text-clustering/
/venv/
/.env/
/.env.local
.envrc
.env.*

# ─────────────────────────────
# Backups / Sonstiges
# ─────────────────────────────
*.bak
*.old
*.orig
*.swp
*.zip
*.tar
*.gz

# IDE/System
.idea/
.DS_Store
__pycache__/
*.pyc

# Secrets/Umgebung
.env
config.py

# Große/ausgeleitete Daten
data/
transkripte/
whisper-cache/
models/
*.db
*.mp4 *.mov *.mkv *.wav *.mp3 *.webm
logs/ tmp/ temp/
# embedded / external
text-clustering/
whisper.cpp/

# Video-Rohmaterial
*.mov
.idea/.gitignore (generated, vendored): 8 lines removed
@@ -1,8 +0,0 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
.idea/BachlorArbeit.iml (generated): 11 lines removed
@@ -1,11 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
.idea/dataSources.xml (generated): 35 lines removed
@@ -1,35 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||
<data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
|
||||
<driver-ref>sqlite.xerial</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
|
||||
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
<libraries>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
|
||||
</library>
|
||||
</libraries>
|
||||
</data-source>
|
||||
<data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
|
||||
<driver-ref>sqlite.xerial</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
|
||||
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
<libraries>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
|
||||
</library>
|
||||
</libraries>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
.idea/inspectionProfiles/profiles_settings.xml (generated): 6 lines removed
@@ -1,6 +0,0 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
.idea/misc.xml (generated): 6 lines removed
@@ -1,6 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
|
||||
</component>
|
||||
</project>
|
.idea/modules.xml (generated): 8 lines removed
@@ -1,8 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
.idea/vcs.xml (generated): 9 lines removed
@@ -1,9 +0,0 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
README.md (new file): 250 lines added
@@ -0,0 +1,250 @@
# Bachelor Thesis – Pipeline: Automated Highlight Detection & 9:16 Reformatting

This repository contains a complete, script-based pipeline for turning long-form videos into social-media-ready 9:16 highlights – including transcription, LLM-assisted clip selection, face/mouth-activity analysis, auto-cropping, subtitles (word caps), and final export.

## Table of Contents
- [Features](#features)
- [Folder Structure](#folder-structure)
- [Requirements](#requirements)
- [Installation](#installation)
- [Quick Start (recommended workflow)](#quick-start-recommended-workflow)
- [Scripts & CLI](#scripts--cli)
- [Tips & Troubleshooting](#tips--troubleshooting)
- [Reproducibility](#reproducibility)
- [License / Acknowledgements](#license--acknowledgements)

---

## Features
- **Transcription with word-level timestamps (Whisper, chunked without duplicates at chunk boundaries)**
- **LLM-assisted clip selection** (virality/emotionality scores etc. stored in SQLite)
- **Face detection (YOLOv8-face) & mouth activity (MediaPipe)**
- **Stable 9:16 auto-cropping** (median + EMA, deadband, scene-cut detection, switch cooldown)
- **Word-cap subtitles** (ASS generated, burned in via ffmpeg)
- **Batch export of highlights** (MoviePy, length/boundary checks)

## Folder Structure
All paths are defined centrally in `config.py`:
```
PROJECT_ROOT/
├─ data/
│  ├─ input/                 # input video(s)
│  ├─ transkripte/           # Whisper outputs (*_segments.json, *_timed.txt ...)
│  ├─ segments/              # LLM clip selection, DB etc.
│  ├─ output/
│  │  └─ raw_clips/          # raw highlight clips (from cutClips.py)
│  ├─ face_data_combined/    # faces.json per clip (YOLO + MediaPipe)
│  └─ face_crop_centers/     # (optional) center lists
├─ output/
│  ├─ output_9x16_final/             # auto-cropped 9:16 videos
│  ├─ output_9x16_final_subbed_word/ # 9:16 with burned-in word caps
│  └─ debug/                 # debug previews/artifacts
├─ models/                   # YOLO weights (e.g. yolov8n-face.pt)
├─ whisper-cache/            # Whisper model cache
└─ src/... (optional, project-specific)
```
> On first run, `config.py` automatically creates any missing directories.
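`config.py` itself is not part of this commit. The following is only a minimal sketch of what it could look like, pieced together from the names the scripts actually import (`PROJECT_ROOT`, `INPUT_DIR`, `RAW_CLIPS_DIR`, `FACE_COMBINED_DIR`, `SEGMENTS_DIR`, `CROPPED_DIR`, `SUBTITLED_DIR`, `WHISPER_CACHE_DIR`) and the tree above; the exact values, and `DB_PATH` in particular, are assumptions:

```python
# config.py – minimal sketch (assumed layout, not the committed file)
from pathlib import Path

PROJECT_ROOT = Path(__file__).resolve().parent

# Inputs and intermediate data
INPUT_DIR         = PROJECT_ROOT / "data" / "input"
SEGMENTS_DIR      = PROJECT_ROOT / "data" / "segments"
RAW_CLIPS_DIR     = PROJECT_ROOT / "data" / "output" / "raw_clips"
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"

# Final outputs
CROPPED_DIR       = PROJECT_ROOT / "output" / "output_9x16_final"
SUBTITLED_DIR     = PROJECT_ROOT / "output" / "output_9x16_final_subbed_word"

# Cache and database (DB_PATH is an assumption based on the troubleshooting notes)
WHISPER_CACHE_DIR = PROJECT_ROOT / "whisper-cache"
DB_PATH           = PROJECT_ROOT / "data" / "clips_openai.db"

# Create missing directories on import, as noted in the README
for _d in (INPUT_DIR, SEGMENTS_DIR, RAW_CLIPS_DIR, FACE_COMBINED_DIR,
           CROPPED_DIR, SUBTITLED_DIR, WHISPER_CACHE_DIR):
    _d.mkdir(parents=True, exist_ok=True)
```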
## Requirements
**System tools**
- `ffmpeg` (incl. `ffprobe`) on the `PATH`

**Python**
- Python 3.10+ recommended
- Packages (example):
  `openai-whisper`, `torch`, `ffmpeg-python`, `ultralytics`, `opencv-python`, `mediapipe`, `moviepy`, `tqdm`, `numpy`, `regex`
- Optional/depending on code path: `pydub`, `scikit-image` (if used in extensions)

**Models & keys**
- **Whisper**: downloads models automatically into `whisper-cache/` (configurable via `WHISPER_MODEL`)
- **YOLOv8-face**: `models/yolov8n-face.pt` (or a larger model)
- **OpenAI API key** (for `segment_transcript.py` & `rateCluster.py`): `export OPENAI_API_KEY=...`
  - optionally set the default model via `export OPENAI_MODEL=gpt-4o`

## Installation
```bash
# 1) Python environment
python3 -m venv .venv
source .venv/bin/activate

# 2) System dependencies
# install ffmpeg (macOS: brew install ffmpeg, Ubuntu: apt install ffmpeg)

# 3) Python packages (example)
pip install --upgrade pip
pip install openai-whisper torch ffmpeg-python ultralytics opencv-python mediapipe moviepy tqdm numpy regex

# 4) Models/files
# YOLO weights:
# download yolov8n-face.pt → ./models/yolov8n-face.pt
# API key for the LLM:
export OPENAI_API_KEY="sk-..."
export OPENAI_MODEL="gpt-4o"
```
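Before the first run it can help to verify the environment. This is a minimal, illustrative check (the file name `check_env.py` is made up); it only imports the packages listed above and reports the compute device mentioned in the troubleshooting section:

```python
# check_env.py – quick sanity check of the environment (illustrative)
import shutil
import torch
import whisper, ultralytics, mediapipe, moviepy, cv2  # noqa: F401 – import check only

# ffmpeg/ffprobe must be on the PATH for cutting, muxing, and probing
assert shutil.which("ffmpeg") and shutil.which("ffprobe"), "ffmpeg/ffprobe not found on PATH"

# CUDA on NVIDIA, MPS on Apple Silicon, otherwise CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("compute device:", device)
```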
## Quick Start (recommended workflow)
1) **Provide the input**
   Place your long-form video in `data/input/` (e.g. `meinvideo.mp4`).

2) **Transcription (Whisper, chunked & duplicate-safe)**
```bash
python transcription.py --input data/input/meinvideo.mp4 --model small --lang de
```
   → produces `*_segments.json` + `*_timed.txt` in `data/transkripte/`.

3) **Select clips with the LLM & store them in the DB**
```bash
export OPENAI_API_KEY="..."; export OPENAI_MODEL="gpt-4o"
python segment_transcript.py --base meinvideo --block 60 --min 6.0 --max 30.0
```
   → writes the clips to SQLite (`data/clips_openai.db` or similar)
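If you want to verify what the selection step actually stored before cutting, a quick look into the SQLite file is enough. The snippet below is illustrative: it only assumes the database path from step 3 and makes no assumption about the table layout.

```python
# inspect_db.py – peek into the clip database (illustrative)
import sqlite3

conn = sqlite3.connect("data/clips_openai.db")  # path as used in step 3
cur = conn.cursor()

# List the tables the selection step created
cur.execute("SELECT name FROM sqlite_master WHERE type='table'")
tables = [row[0] for row in cur.fetchall()]
print("tables:", tables)

# Show the first few rows of each table
for table in tables:
    print(f"\n== {table} ==")
    for row in cur.execute(f"SELECT * FROM {table} LIMIT 3"):
        print(row)

conn.close()
```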
4) **Cut the highlights from the original video**
```bash
python cutClips.py --file meinvideo.mp4 --limit 10 --order score
```
   → exports `highlight_*.mp4` to `data/output/raw_clips/`

5) **Face detection + mouth activity**
```bash
python main_detect_faces.py --model models/yolov8n-face.pt --input-dir data/output/raw_clips --output-dir data/face_data_combined --frame-skip 1 --downscale 0.5
```

6) **Build per-frame targets (smooth centers/size)**
```bash
python make_segments.py --pattern "highlight_*.mp4" --fps 0 --smooth 7 --overwrite
```

7) **Apply the 9:16 auto-crop**
```bash
python main_apply_crop.py --pattern "highlight_*.mp4" --median 7 --ema 0.5 --deadband 16 --cut_detect --mux_audio --overwrite
```
   → finished 9:16 clips in `output/output_9x16_final/`

8) **Burn in word-cap subtitles (optional)**
```bash
python add_subtitles.py --clips_dir output/output_9x16_final --out_dir output/output_9x16_final_subbed_word --model small --limit 20
```
   → finished videos with burned-in word caps in `output/output_9x16_final_subbed_word/`

> 💡 Many parameters (window widths, deadband, detection thresholds, limits) can be adjusted via the CLI.
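The `main.py` added in this commit chains exactly these steps (transcription → segmentation → cutting → face detection → targets → crop → optional rating/subtitles). Its docstring shows the intended call, e.g. `python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o`, and flags such as `--no-rate`/`--no-subs` skip individual stages.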
## Scripts & CLI

### `transcription.py`
Chunked transcription with word-level timestamps.
```
--input PATH      # input video/audio (default: first file in data/input/)
--outdir PATH     # output directory (default: data/transkripte/)
--model NAME      # Whisper model (tiny/base/small/medium/large; env: WHISPER_MODEL)
--lang CODE       # language code (e.g. de) or empty/None for auto-detect
--chunk FLOAT     # chunk length in s (default 60)
--overlap FLOAT   # overlap in s (default 2.0)
--min-dur FLOAT   # minimum segment duration (s)
--max-gap FLOAT   # max. time gap when merging (s)
--fp16            # only useful with a GPU
```

### `segment_transcript.py`
LLM selection & storage in SQLite.
```
--base STR        # basename of the transcript files (e.g. 'meinvideo')
--block FLOAT     # block length in s for the prompt
--min FLOAT       # minimum clip length in s
--max FLOAT       # maximum clip length in s
# env: OPENAI_API_KEY, OPENAI_MODEL (e.g. gpt-4o)
```

### `cutClips.py`
Cuts the selected highlights into individual clips.
```
--file NAME            # name of the input file in data/input (default: first video)
--limit INT            # number of clips to export (default 10)
--order {score,start}  # sort order: score (descending) or start time
```

### `main_detect_faces.py`
YOLOv8-face + MediaPipe → `faces.json` per clip.
```
--input-dir PATH     # default: data/output/raw_clips
--output-dir PATH    # default: data/face_data_combined
--model PATH         # YOLOv8-face weights (default: models/yolov8n-face.pt)
--conf-thresh FLOAT  # default 0.35
--frame-skip INT     # e.g. 1 = every frame, 2 = every second frame ...
--downscale FLOAT    # frame downscale before YOLO (0..1, e.g. 0.5)
--expansion FLOAT    # margin, pass 1 (relative)
--expansion2 FLOAT   # margin, pass 2 (relative)
--min-crop INT       # minimum crop side length (px)
--faces-upscale INT  # min. edge length for FaceMesh (small crops are upscaled)
--imgsz INT          # YOLO input size (default 448)
--max-det INT        # max. detections per frame
--use-refine         # enable MediaPipe refine_landmarks
```
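The mouth-activity value comes from a helper `get_mouth_openness` imported from `src/reformat/speaking.py`, which is not part of this commit. A minimal sketch of such a measure, assuming the MediaPipe FaceMesh inner-lip landmark indices 13 (upper) and 14 (lower), could look like this:

```python
# speaking.py – sketch of a mouth-openness measure (assumed implementation)
UPPER_LIP_IDX = 13  # MediaPipe FaceMesh: inner upper lip
LOWER_LIP_IDX = 14  # MediaPipe FaceMesh: inner lower lip

def get_mouth_openness(landmarks, crop_height: int) -> float:
    """Vertical inner-lip distance in pixels of the analyzed crop.

    `landmarks` is the list of normalized FaceMesh landmarks (x, y in 0..1);
    multiplying the normalized distance by the crop height converts it to pixels.
    """
    upper = landmarks[UPPER_LIP_IDX]
    lower = landmarks[LOWER_LIP_IDX]
    return abs(lower.y - upper.y) * crop_height
```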
### `make_segments.py`
Builds `*_target_by_frame.json` (center + side length per frame) from the face/center data.
```
--pattern STR    # file pattern in raw_clips (default: highlight_*.mp4)
--fps FLOAT      # force FPS (0 = read from video)
--smooth INT     # moving-average window width (odd)
--overwrite      # overwrite existing target_by_frame.json
```
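`make_segments.py` is not shown in this commit; the `--smooth` option suggests a centered moving average over the per-frame crop centers. A small sketch of that smoothing step (the function name is illustrative):

```python
# Sketch: centered moving average over per-frame crop centers (illustrative helper)
def smooth_centers(centers: list[tuple[float, float]], window: int = 7) -> list[tuple[float, float]]:
    """Smooth (cx, cy) per frame with a centered moving average.

    `window` should be odd (as the CLI help suggests); near the clip edges the
    window simply shrinks to the available frames.
    """
    half = window // 2
    smoothed = []
    for i in range(len(centers)):
        lo = max(0, i - half)
        hi = min(len(centers), i + half + 1)
        xs = [c[0] for c in centers[lo:hi]]
        ys = [c[1] for c in centers[lo:hi]]
        smoothed.append((sum(xs) / len(xs), sum(ys) / len(ys)))
    return smoothed
```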
### `main_apply_crop.py`
Applies the 9:16 crop with smoothing and scene-cut handling.
```
--pattern STR        # file pattern in raw_clips (default: *.mp4)
--out_w INT          # output width (default 1080)
--out_h INT          # output height (default 1920)
--zoom_pad FLOAT     # zoom pad (0..1)
--median INT         # median window (>=3, odd)
--ema FLOAT          # EMA alpha (0..1)
--deadband FLOAT     # deadband in pixels
--switch_cd INT      # cooldown frames after a track switch
--cut_detect         # enable scene-cut detection
--cut_corr FLOAT     # correlation threshold (0..1)
--cut_cd INT         # cooldown frames after a cut
--mux_audio          # mux in the original audio
--debug              # show debug overlay
--debug_scale FLOAT  # render the debug preview scaled
--overwrite          # overwrite existing outputs
```

### `add_subtitles.py`
Generates word caps with Whisper & burns them in.
```
--clips_dir PATH  # source (default: output/output_9x16_final)
--out_dir PATH    # target (default: output/output_9x16_final_subbed_word)
--pattern STR     # e.g. *.mp4
--limit INT       # only the first N clips
--model NAME      # Whisper model (tiny/base/small/medium/large)
--lang CODE       # language code or auto
```
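`add_subtitles.py` is not included in this commit. The workflow it describes (Whisper word timestamps → ASS events → ffmpeg burn-in) can be sketched roughly as follows; file names and the ASS styling values are placeholders, not the committed implementation:

```python
# Sketch: word caps from Whisper word timestamps, burned in via ffmpeg (illustrative)
import subprocess
import whisper

def ass_time(t: float) -> str:
    """Format seconds as ASS time (H:MM:SS.cc)."""
    h, rem = divmod(t, 3600)
    m, s = divmod(rem, 60)
    return f"{int(h)}:{int(m):02d}:{s:05.2f}"

model = whisper.load_model("small")
result = model.transcribe("clip_9x16.mp4", word_timestamps=True, language="de")

events = []
for seg in result["segments"]:
    for w in seg.get("words", []):
        text = w["word"].strip().upper()
        events.append(f"Dialogue: 0,{ass_time(w['start'])},{ass_time(w['end'])},Caps,,0,0,0,,{text}")

header = (
    "[Script Info]\nScriptType: v4.00+\nPlayResX: 1080\nPlayResY: 1920\n\n"
    "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, Alignment\n"
    "Style: Caps,Arial,96,&H00FFFFFF,5\n\n"
    "[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n"
)
with open("words.ass", "w", encoding="utf-8") as f:
    f.write(header + "\n".join(events))

# Burn the subtitles in with ffmpeg (re-encodes video, copies audio)
subprocess.run(["ffmpeg", "-y", "-i", "clip_9x16.mp4", "-vf", "ass=words.ass",
                "-c:a", "copy", "subbed.mp4"], check=True)
```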
### `rateCluster.py` (optional)
Has the LLM add scores (virality, emotion, humor, provocation) to the stored clips.
> Set the default model via `OPENAI_MODEL` (e.g. `gpt-4o`).
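The committed `rateCluster.py` is not shown in this diff. A rough sketch of what such a scoring pass could look like; the table name `segments`, the `score_total` column, and the prompt wording are assumptions:

```python
# Sketch: let the LLM score stored clips and write the scores back (illustrative)
import json
import os
import sqlite3
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
model = os.getenv("OPENAI_MODEL", "gpt-4o")

conn = sqlite3.connect("data/clips_openai.db")
cur = conn.cursor()
try:
    cur.execute("ALTER TABLE segments ADD COLUMN score_total REAL")  # assumed column
except sqlite3.OperationalError:
    pass  # column already exists

for clip_id, text in cur.execute("SELECT id, text FROM segments").fetchall():
    prompt = (
        "Rate the following clip text from 0-10 for virality, emotion, humor and provocation. "
        'Reply as JSON: {"virality": x, "emotion": x, "humor": x, "provocation": x}\n\n' + text
    )
    resp = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0,
        response_format={"type": "json_object"},
    )
    scores = json.loads(resp.choices[0].message.content)
    conn.execute("UPDATE segments SET score_total = ? WHERE id = ?",
                 (sum(scores.values()), clip_id))

conn.commit()
conn.close()
```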
---

## Tips & Troubleshooting
- **Models/performance**
  - CPU-only works (Whisper/YOLO are just slower). On Apple Silicon **MPS** is used automatically; on NVIDIA **CUDA**.
  - `--frame-skip` and `--downscale` in `main_detect_faces.py` speed up face detection considerably.
- **Check ffmpeg muxing** (`main_apply_crop.py --mux_audio`): if the audio is missing, check the `ffmpeg` installation and the return code in the log.
- **Missing files**
  - No input? → check `data/input/`.
  - Missing transcript pairs? → `*_timed.txt` and `*_segments.json` must exist (from `transcription.py`).
  - Missing faces? → is the path to `models/yolov8n-face.pt` correct?
- **Database**
  - Highlights are stored in SQLite (see `config.py`: `DB_PATH`). For repeated runs a `DELETE FROM highlights; VACUUM;` can be useful (see the snippet after this list).
- **Cache/directories**
  - The Whisper cache goes via `XDG_CACHE_HOME` → `whisper-cache/` next to the project. Keep an eye on disk space.
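A small helper for that reset, assuming the `highlights` table and the `DB_PATH` constant referenced above:

```python
# reset_highlights.py – clear previously selected highlights before a re-run (illustrative)
import sqlite3
from config import DB_PATH  # assumes DB_PATH as referenced in the note above

conn = sqlite3.connect(DB_PATH)
conn.execute("DELETE FROM highlights")  # drop old selections
conn.commit()
conn.execute("VACUUM")                  # reclaim the freed space
conn.close()
print("highlights table cleared")
```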
## Reproducibility
- Create a `requirements.txt` with exact versions (a freeze of your working environment).
- Document the **model versions** used (YOLO weights, Whisper model size, OPENAI_MODEL).
- Pin random seeds where necessary (mostly deterministic here thanks to the external models/libraries).

## License / Acknowledgements
- Uses **OpenAI Whisper**, **Ultralytics YOLOv8**, **MediaPipe**, **OpenCV**, **MoviePy**, **ffmpeg**.
- Observe the respective licenses of these libraries.
@@ -1,38 +0,0 @@
|
||||
from moviepy.video.io.VideoFileClip import VideoFileClip
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
|
||||
# === Setup ===
|
||||
input_video = Path("input/testVideoShort.mov")
|
||||
output_dir = Path("output")
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# === SQLite DB lesen ===
|
||||
db_path = "clips_openai.db"
|
||||
conn = sqlite3.connect(db_path)
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Nur die Top 10 Clips mit höchstem score_total
|
||||
cursor.execute("""
|
||||
SELECT start, end, text
|
||||
FROM highlights
|
||||
ORDER BY score_total DESC
|
||||
LIMIT 10
|
||||
""")
|
||||
highlights = cursor.fetchall()
|
||||
|
||||
# === Video laden ===
|
||||
video = VideoFileClip(str(input_video))
|
||||
|
||||
# === Clips schneiden ===
|
||||
for i, (start, end, text) in enumerate(highlights):
|
||||
output_file = output_dir / f"highlight_{i+1}.mp4"
|
||||
end = min(end, video.duration) # Sicherstellen, dass das Ende nicht über das Video hinausgeht
|
||||
print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
|
||||
clip = video.subclipped(start, end)
|
||||
clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")
|
||||
|
||||
# === Cleanup ===
|
||||
conn.close()
|
||||
video.close()
|
||||
print("✅ Top 10 Clips exportiert.")
|
@@ -1,196 +0,0 @@
|
||||
import json
|
||||
import sqlite3
|
||||
import re
|
||||
from pathlib import Path
|
||||
from openai import OpenAI
|
||||
from datetime import datetime
|
||||
import time
|
||||
import nltk
|
||||
|
||||
nltk.download("punkt")
|
||||
|
||||
# === SETTINGS ===
|
||||
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
|
||||
DB_PATH = Path("clips_openai.db")
|
||||
LOG_DIR = Path("logs")
|
||||
LOG_DIR.mkdir(exist_ok=True)
|
||||
BLOCK_DURATION = 300
|
||||
MIN_CLIP_LEN = 5
|
||||
MAX_CLIP_LEN = 90
|
||||
|
||||
client = OpenAI(api_key="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")
|
||||
|
||||
# === HILFSFUNKTIONEN ===
|
||||
def log_text(filename, content):
|
||||
(LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")
|
||||
|
||||
def append_error_log(content):
|
||||
with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
|
||||
f.write(content + "\n\n")
|
||||
|
||||
def extract_json(text):
|
||||
match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
|
||||
if match:
|
||||
try:
|
||||
return json.loads(match.group())
|
||||
except Exception as e:
|
||||
append_error_log(f"❌ JSON-Fehler: {e}\n{text}")
|
||||
return []
|
||||
|
||||
def get_original_text(clip, segments, debug=False):
|
||||
texts = []
|
||||
used_segments = []
|
||||
for s in segments:
|
||||
# Überschneidung: Segment und Clip teilen sich Zeit
|
||||
if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
|
||||
texts.append(s["text"])
|
||||
used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
|
||||
if debug:
|
||||
print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" +
|
||||
"\n".join(used_segments))
|
||||
return " ".join(texts).strip()
|
||||
|
||||
# === TRANSKRIPT EINLESEN ===
|
||||
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
|
||||
segments = []
|
||||
for line in lines:
|
||||
match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
|
||||
if match:
|
||||
start, end, text = match.groups()
|
||||
start = float(start)
|
||||
end = float(end)
|
||||
if end - start >= 2.0:
|
||||
segments.append({"start": start, "end": end, "text": text.strip()})
|
||||
|
||||
if not segments:
|
||||
raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
|
||||
print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.")
|
||||
|
||||
# === BLÖCKE BILDEN
|
||||
blocks = []
|
||||
current_block = []
|
||||
current_start = 0.0
|
||||
for seg in segments:
|
||||
if seg["end"] - current_start > BLOCK_DURATION:
|
||||
blocks.append(current_block)
|
||||
current_block = []
|
||||
current_start = seg["start"]
|
||||
current_block.append(seg)
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
|
||||
|
||||
# === KI: CLIP-AUSWAHL
|
||||
all_clips = []
|
||||
start_time = time.perf_counter()
|
||||
|
||||
for i, block in enumerate(blocks):
|
||||
if not block:
|
||||
continue
|
||||
|
||||
print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")
|
||||
|
||||
block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
|
||||
prompt = f"""
|
||||
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
|
||||
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.
|
||||
|
||||
Ein guter Clip:
|
||||
- ist abgeschlossen und verständlich
|
||||
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
|
||||
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
|
||||
- ist **mindestens 30 Sekunden lang**
|
||||
|
||||
Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
|
||||
|
||||
Gib ein valides JSON-Array zurück im Format:
|
||||
[
|
||||
{{
|
||||
"start": float,
|
||||
"end": float,
|
||||
"summary": "Kurze Beschreibung des Inhalts"
|
||||
}}
|
||||
]
|
||||
|
||||
TRANSKRIPT:
|
||||
{block_text}
|
||||
"""
|
||||
log_text(f"block_prompt_{i+1}.txt", prompt)
|
||||
|
||||
try:
|
||||
response = client.chat.completions.create(
|
||||
model="gpt-4o",
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
temperature=0.4
|
||||
)
|
||||
raw = response.choices[0].message.content
|
||||
log_text(f"block_output_{i+1}.txt", raw)
|
||||
clips = extract_json(raw)
|
||||
|
||||
print(f"✅ {len(clips)} Clips empfangen in Block {i+1}")
|
||||
|
||||
for clip in clips:
|
||||
try:
|
||||
dur = float(clip["end"]) - float(clip["start"])
|
||||
if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
|
||||
clip["duration"] = round(dur, 2)
|
||||
all_clips.append(clip)
|
||||
except Exception as e:
|
||||
append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
|
||||
|
||||
print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")
|
||||
|
||||
# ETA berechnen
|
||||
elapsed = time.perf_counter() - start_time
|
||||
avg_time = elapsed / (i + 1)
|
||||
eta = avg_time * (len(blocks) - (i + 1))
|
||||
print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
|
||||
|
||||
except Exception as e:
|
||||
append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
|
||||
print(f"❌ Fehler bei Block {i+1}: {e}")
|
||||
|
||||
# === DB SPEICHERN
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cur = conn.cursor()
|
||||
cur.execute("DROP TABLE IF EXISTS segments")
|
||||
cur.execute("""
|
||||
CREATE TABLE segments (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file TEXT,
|
||||
start REAL,
|
||||
end REAL,
|
||||
duration REAL,
|
||||
text TEXT,
|
||||
summary TEXT
|
||||
)
|
||||
""")
|
||||
|
||||
inserted = 0
|
||||
failed = 0
|
||||
for clip in all_clips:
|
||||
try:
|
||||
start = float(clip["start"])
|
||||
end = float(clip["end"])
|
||||
duration = float(clip["duration"])
|
||||
summary = clip.get("summary", "")
|
||||
# debug=True für print aller Segment-Texte pro Clip
|
||||
original_text = get_original_text(clip, segments, debug=False)
|
||||
if end <= start or start < 0:
|
||||
raise ValueError("Ungültige Zeiten")
|
||||
cur.execute(
|
||||
"INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
|
||||
(TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
|
||||
)
|
||||
inserted += 1
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
print("\n📊 Ergebnisse:")
|
||||
print(f" ✅ Clips gespeichert: {inserted}")
|
||||
print(f" ❌ Fehlerhafte Clips: {failed}")
|
||||
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")
|
@@ -1,108 +0,0 @@
|
||||
# transcription_chunked.py
|
||||
import whisper
|
||||
from pathlib import Path
|
||||
import os
|
||||
import json
|
||||
import ffmpeg
|
||||
import tempfile
|
||||
|
||||
# === Einstellungen ===
|
||||
input_file = Path("input/testVideoShort.mov")
|
||||
output_dir = Path("transkripte")
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
output_txt = output_dir / f"{input_file.stem}_timed.txt"
|
||||
output_json = output_dir / f"{input_file.stem}_segments.json"
|
||||
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"
|
||||
|
||||
CHUNKS = 4 # Anzahl Chunks (anpassen!)
|
||||
OVERLAP = 2.0 # Sekunden Überlappung
|
||||
|
||||
os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")
|
||||
|
||||
probe = ffmpeg.probe(str(input_file))
|
||||
duration = float(probe["format"]["duration"])
|
||||
print(f"🎥 Videolänge: {duration:.2f} Sekunden")
|
||||
|
||||
def extract_audio_chunk(start_time, duration, output_path):
|
||||
ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
|
||||
str(output_path),
|
||||
format="wav",
|
||||
acodec="pcm_s16le",
|
||||
ac=1,
|
||||
ar="16000",
|
||||
loglevel="error"
|
||||
).overwrite_output().run()
|
||||
|
||||
def is_suspect(text):
|
||||
words = text.strip().lower().split()
|
||||
if not words:
|
||||
return True
|
||||
most_common = max([words.count(w) for w in set(words)])
|
||||
return most_common / len(words) > 0.6 or most_common > 20
|
||||
|
||||
tmp_dir = Path(tempfile.mkdtemp())
|
||||
all_segments = []
|
||||
|
||||
print(f"✂️ Teile Audio in {CHUNKS} Chunks ...")
|
||||
for i in range(CHUNKS):
|
||||
chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
|
||||
chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
|
||||
chunk_dur = chunk_end - chunk_start
|
||||
chunk_file = tmp_dir / f"chunk_{i}.wav"
|
||||
print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
|
||||
extract_audio_chunk(chunk_start, chunk_dur, chunk_file)
|
||||
|
||||
print(f"🧠 Transkribiere Chunk {i+1} ...")
|
||||
model = whisper.load_model("small") # Wechsel zu "medium" oder "large" falls gewünscht
|
||||
result = model.transcribe(
|
||||
str(chunk_file),
|
||||
language="de",
|
||||
fp16=False,
|
||||
word_timestamps=False,
|
||||
condition_on_previous_text=True,
|
||||
temperature=0,
|
||||
verbose=False
|
||||
)
|
||||
|
||||
segments = result["segments"]
|
||||
# Zeitversatz für den aktuellen Chunk hinzufügen
|
||||
offset = chunk_start
|
||||
for seg in segments:
|
||||
seg["start"] += offset
|
||||
seg["end"] += offset
|
||||
all_segments.extend(segments)
|
||||
|
||||
# === Sortiere und filtere doppelte/überlappende Segmente
|
||||
all_segments.sort(key=lambda x: x["start"])
|
||||
|
||||
def segment_hash(seg):
|
||||
return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())
|
||||
|
||||
unique_segments = []
|
||||
seen = set()
|
||||
for seg in all_segments:
|
||||
h = segment_hash(seg)
|
||||
if h not in seen:
|
||||
seen.add(h)
|
||||
unique_segments.append(seg)
|
||||
|
||||
print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.")
|
||||
|
||||
with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
|
||||
for seg in unique_segments:
|
||||
start = seg["start"]
|
||||
end = seg["end"]
|
||||
text = seg["text"].strip()
|
||||
line = f"[{start:.2f} – {end:.2f}] {text}\n"
|
||||
f.write(line) # IMMER ins Haupttranskript!
|
||||
if is_suspect(text):
|
||||
f_sus.write(line)
|
||||
|
||||
|
||||
print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}")
|
||||
print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}")
|
||||
|
||||
with open(output_json, "w", encoding="utf-8") as f:
|
||||
json.dump(unique_segments, f, ensure_ascii=False, indent=2)
|
||||
print(f"💾 Segmentdaten gespeichert unter: {output_json}")
|
main.py (new file): 233 lines added
@@ -0,0 +1,233 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Run the full Bachelor pipeline end-to-end with timing, errors, and flexible flags.
|
||||
|
||||
Steps:
|
||||
1) transcription.py → Whisper transcripts (segments + timed words)
|
||||
2) segment_transcript.py → LLM selects highlight candidates → SQLite
|
||||
3) cutClips.py → export highlight_*.mp4 (raw clips)
|
||||
4) main_detect_faces.py → YOLO + MediaPipe → faces.json per clip
|
||||
5) make_segments.py → *_target_by_frame.json (center+side per frame)
|
||||
6) main_apply_crop.py → 9:16 crop with smoothing & optional audio mux
|
||||
7) rateCluster.py → (optional) LLM scoring (virality, emotion, ...)
|
||||
8) add_subtitles.py → (optional) word-cap subtitles burned in
|
||||
|
||||
Usage examples:
|
||||
python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o
|
||||
python main.py --no-rate --no-subs
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import subprocess
|
||||
import time
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# --- Import project config ---
|
||||
try:
|
||||
from config import (
|
||||
PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
|
||||
WHISPER_CACHE_DIR
|
||||
)
|
||||
except Exception:
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent
|
||||
sys.path.insert(0, str(PROJECT_ROOT))
|
||||
from config import (
|
||||
PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR,
|
||||
WHISPER_CACHE_DIR
|
||||
)
|
||||
|
||||
LOGS_DIR = PROJECT_ROOT / "logs"
|
||||
LOGS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# --- korrekte Pfade zu den Skripten ---
|
||||
SCRIPTS = {
|
||||
"transcription": str(PROJECT_ROOT / "src" / "text" / "transcription.py"),
|
||||
"segment_transcript": str(PROJECT_ROOT / "src" / "text" / "segment_transcript.py"),
|
||||
"cutClips": str(PROJECT_ROOT / "src" / "text" / "cutClips.py"),
|
||||
"detect_faces": str(PROJECT_ROOT / "src" / "reformat" / "main_detect_faces.py"),
|
||||
"make_segments": str(PROJECT_ROOT / "src" / "reformat" / "make_segments.py"),
|
||||
"apply_crop": str(PROJECT_ROOT / "src" / "reformat" / "main_apply_crop.py"),
|
||||
"rateCluster": str(PROJECT_ROOT / "src" / "text" / "rateCluster.py"),
|
||||
"add_subtitles": str(PROJECT_ROOT / "src" / "subtitles" / "add_subtitles.py"),
|
||||
}
|
||||
|
||||
def shlex_join(cmd):
|
||||
return " ".join(str(c) for c in cmd)
|
||||
|
||||
def run_step(cmd: list[str], name: str, env: dict[str, str] | None = None) -> float:
|
||||
"""Run a subprocess step, raise on error, return duration in seconds."""
|
||||
t0 = time.perf_counter()
|
||||
print(f"\n===== {name} =====")
|
||||
print(" ", shlex_join(cmd))
|
||||
cp = subprocess.run(cmd, env=env)
|
||||
dt = time.perf_counter() - t0
|
||||
if cp.returncode != 0:
|
||||
print(f"❌ Fehler in {name} (Exit {cp.returncode}) nach {dt:.2f}s")
|
||||
print(" → Prüfe das Logfile oben für Details und stelle sicher, dass Abhängigkeiten installiert sind:")
|
||||
print(" - ffmpeg/ffprobe im PATH")
|
||||
print(" - Python-Pakete: openai-whisper, torch, ffmpeg-python, ultralytics, opencv-python, mediapipe, moviepy, tqdm, numpy")
|
||||
print(" - OPENAI_API_KEY gesetzt (für LLM-Schritte)")
|
||||
raise SystemExit(cp.returncode)
|
||||
print(f"✅ {name} in {dt:.2f}s")
|
||||
return dt
|
||||
|
||||
def infer_base_from_input(input_path: Path) -> str:
|
||||
return input_path.stem
|
||||
|
||||
def default_input() -> Path | None:
|
||||
if not INPUT_DIR.exists():
|
||||
return None
|
||||
for p in sorted(INPUT_DIR.iterdir()):
|
||||
if p.suffix.lower() in {".mp4", ".mov", ".mkv", ".m4v", ".mp3", ".wav"}:
|
||||
return p
|
||||
return None
|
||||
|
||||
def main():
|
||||
ap = argparse.ArgumentParser(description="Bachelor Pipeline Runner")
|
||||
ap.add_argument("--input", type=str, default=None, help="Pfad zu Eingabedatei (Default: erstes File in data/input)")
|
||||
ap.add_argument("--limit", type=int, default=10, help="Anzahl Highlights (cutClips)")
|
||||
ap.add_argument("--whisper-model", type=str, default=os.getenv("WHISPER_MODEL", "small"))
|
||||
ap.add_argument("--lang", type=str, default=None, help="Sprachcode (z. B. de)")
|
||||
ap.add_argument("--openai-model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o"))
|
||||
ap.add_argument("--pattern", type=str, default="highlight_*.mp4")
|
||||
ap.add_argument("--overwrite", action="store_true")
|
||||
ap.add_argument("--no-rate", action="store_true")
|
||||
ap.add_argument("--no-subs", action="store_true")
|
||||
ap.add_argument("--no-detect", action="store_true")
|
||||
ap.add_argument("--no-make", action="store_true")
|
||||
ap.add_argument("--no-apply", action="store_true")
|
||||
ap.add_argument("--logfile", type=str, default=None)
|
||||
args = ap.parse_args()
|
||||
|
||||
os.chdir(PROJECT_ROOT)
|
||||
|
||||
env = os.environ.copy()
|
||||
env.setdefault("OPENAI_MODEL", args.openai_model)
|
||||
env.setdefault("XDG_CACHE_HOME", str(WHISPER_CACHE_DIR))
|
||||
|
||||
if not env.get("OPENAI_API_KEY"):
|
||||
print("⚠️ OPENAI_API_KEY ist nicht gesetzt – LLM-Schritte könnten fehlschlagen.")
|
||||
|
||||
# Input-Datei bestimmen
|
||||
if args.input:
|
||||
input_path = Path(args.input)
|
||||
if not input_path.is_file():
|
||||
candidate = INPUT_DIR / args.input
|
||||
if candidate.is_file():
|
||||
input_path = candidate
|
||||
else:
|
||||
raise SystemExit(f"Input nicht gefunden: {args.input}")
|
||||
else:
|
||||
picked = default_input()
|
||||
if not picked:
|
||||
raise SystemExit(f"Kein Input in {INPUT_DIR} gefunden. Bitte --input setzen.")
|
||||
input_path = picked
|
||||
|
||||
base = infer_base_from_input(input_path)
|
||||
print(f"📥 Input: {input_path}")
|
||||
print(f"🔤 Whisper: {args.whisper_model} | 🌐 LLM: {env.get('OPENAI_MODEL')}")
|
||||
print(f"🧩 Base: {base}")
|
||||
|
||||
# Logfile
|
||||
if args.logfile:
|
||||
log_path = Path(args.logfile)
|
||||
else:
|
||||
log_path = LOGS_DIR / f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
|
||||
|
||||
# Tee: schreibe in Datei UND Konsole
|
||||
try:
|
||||
log_fh = open(log_path, "w", encoding="utf-8")
|
||||
class _Tee:
|
||||
def __init__(self, *streams): self.streams = streams
|
||||
def write(self, data):
|
||||
for s in self.streams:
|
||||
try: s.write(data); s.flush()
|
||||
except Exception: pass
|
||||
def flush(self):
|
||||
for s in self.streams:
|
||||
try: s.flush()
|
||||
except Exception: pass
|
||||
sys.stdout = _Tee(sys.__stdout__, log_fh)
|
||||
sys.stderr = _Tee(sys.__stderr__, log_fh)
|
||||
print(f"📝 Logfile: {log_path}")
|
||||
except Exception:
|
||||
print(f"⚠️ Konnte Logfile nicht initialisieren: {log_path}")
|
||||
|
||||
durations = []
|
||||
started = datetime.now()
|
||||
print(f"🚀 Start: {started:%Y-%m-%d %H:%M:%S}")
|
||||
|
||||
try:
|
||||
# 1) Transcription
|
||||
t_args = [sys.executable, SCRIPTS["transcription"], "--input", str(input_path), "--model", args.whisper_model]
|
||||
if args.lang: t_args += ["--lang", args.lang]
|
||||
durations.append(("Transcription", run_step(t_args, "Transcription", env=env)))
|
||||
|
||||
# 2) LLM Segmentierung
|
||||
st_args = [sys.executable, SCRIPTS["segment_transcript"], "--base", base]
|
||||
durations.append(("Segment Transcript", run_step(st_args, "Segment Transcript", env=env)))
|
||||
|
||||
# 3) Highlights schneiden
|
||||
cut_filename = input_path.name
|
||||
cc_args = [sys.executable, SCRIPTS["cutClips"], "--file", cut_filename, "--limit", str(args.limit)]
|
||||
durations.append(("Cut Clips", run_step(cc_args, "Cut Clips", env=env)))
|
||||
|
||||
# 4) Faces
|
||||
if not args.no_detect:
|
||||
df_args = [sys.executable, SCRIPTS["detect_faces"]]
|
||||
durations.append(("Detect Faces", run_step(df_args, "Detect Faces", env=env)))
|
||||
else:
|
||||
print("⏭️ Detect Faces übersprungen.")
|
||||
|
||||
# 5) Make Targets
|
||||
if not args.no_make:
|
||||
ms_args = [sys.executable, SCRIPTS["make_segments"], "--pattern", args.pattern]
|
||||
durations.append(("Make Targets", run_step(ms_args, "Make Targets", env=env)))
|
||||
else:
|
||||
print("⏭️ Make Targets übersprungen.")
|
||||
|
||||
# 6) Crop
|
||||
if not args.no_apply:
|
||||
ac_args = [sys.executable, SCRIPTS["apply_crop"], "--pattern", args.pattern, "--mux_audio"]
|
||||
if args.overwrite: ac_args.append("--overwrite")
|
||||
durations.append(("Apply Crop", run_step(ac_args, "Apply Crop", env=env)))
|
||||
else:
|
||||
print("⏭️ Apply Crop übersprungen.")
|
||||
|
||||
# 7) Bewertung
|
||||
if not args.no_rate:
|
||||
rc_args = [sys.executable, SCRIPTS["rateCluster"]]
|
||||
durations.append(("Rate Clusters", run_step(rc_args, "Rate Clusters", env=env)))
|
||||
else:
|
||||
print("⏭️ Rate Clusters übersprungen.")
|
||||
|
||||
# 8) Untertitel
|
||||
if not args.no_subs:
|
||||
as_args = [sys.executable, SCRIPTS["add_subtitles"]]
|
||||
durations.append(("Subtitles", run_step(as_args, "Subtitles", env=env)))
|
||||
else:
|
||||
print("⏭️ Subtitles übersprungen.")
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\n⛔ Abgebrochen (Ctrl+C).")
|
||||
finally:
|
||||
finished = datetime.now()
|
||||
total = sum(dt for _, dt in durations)
|
||||
print("\n======================== ZUSAMMENFASSUNG ============================")
|
||||
for name, dt in durations:
|
||||
print(f"✅ {name:<24} {dt:7.2f}s")
|
||||
print("---------------------------------------------------------------------")
|
||||
print(f"⏱️ Gesamtdauer: {total:.2f}s")
|
||||
print(f"🕒 Start : {started:%Y-%m-%d %H:%M:%S}")
|
||||
print(f"🕒 Ende : {finished:%Y-%m-%d %H:%M:%S}")
|
||||
print(f"📂 Output:")
|
||||
print(f" Raw Clips : {RAW_CLIPS_DIR}")
|
||||
print(f" 9:16 : {CROPPED_DIR}")
|
||||
print(f" Subbed : {SUBTITLED_DIR}")
|
||||
print("=====================================================================")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
src/main.py (new file): 43 lines added
@@ -0,0 +1,43 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Einfaches Master-Skript, das alle Unter-Skripte nacheinander startet – ohne Argumente.
|
||||
"""
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Reihenfolge der auszuführenden Skripte (relativer Pfad)
|
||||
SCRIPTS = [
|
||||
"text/transcription.py",
|
||||
"text/segment_transcript.py",
|
||||
"text/rateCluster.py",
|
||||
"text/cutClips.py",
|
||||
"reformat/track_faces_Yolo.py",
|
||||
"reformat/detect_speaking_faces.py",
|
||||
"reformat/crop_to_speaker.py",
|
||||
]
|
||||
|
||||
|
||||
def run_script(script_path: str):
|
||||
"""
|
||||
Führt ein Python-Skript ohne weitere Argumente aus.
|
||||
"""
|
||||
print(f"🔄 Running: {script_path}")
|
||||
full_path = Path(__file__).parent / script_path
|
||||
try:
|
||||
subprocess.check_call([sys.executable, str(full_path)])
|
||||
print(f"✔️ {script_path} erfolgreich abgeschlossen.\n")
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"❌ Fehler in {script_path}: Rückgabecode {e.returncode}")
|
||||
sys.exit(e.returncode)
|
||||
|
||||
|
||||
def main():
|
||||
print("\n=== Starte komplette Podcast-Pipeline ===\n")
|
||||
for script in SCRIPTS:
|
||||
run_script(script)
|
||||
print("✅ Alle Schritte erfolgreich abgeschlossen.")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
src/reformat/main_apply_crop.py (new file): 315 lines added
@@ -0,0 +1,315 @@
|
||||
#!/usr/bin/env python3
|
||||
# src/reformat/new/main_apply_crop.py
|
||||
from __future__ import annotations
|
||||
import logging, json, math, subprocess, argparse
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, List, Dict, Any
|
||||
from collections import deque
|
||||
import sys
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# ── Projektwurzel importierbar machen
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR, CROPPED_DIR
|
||||
|
||||
# ==== Defaults (per CLI überschreibbar) ======================================
|
||||
OUT_W_DEFAULT, OUT_H_DEFAULT = 1080, 1920 # 9:16
|
||||
DEBUG_SCALE_DEFAULT = 0.6
|
||||
MEDIAN_WIN_DEFAULT = 5
|
||||
EMA_ALPHA_DEFAULT = 0.22
|
||||
DEADBAND_PX_DEFAULT = 8.0
|
||||
SWITCH_COOLDOWN_FR_DEFAULT = 12
|
||||
ZOOM_PAD_FRAC_DEFAULT = 0.10
|
||||
|
||||
USE_CUT_DETECT_DEFAULT = True
|
||||
CUT_CORR_THRESH_DEFAULT = 0.65
|
||||
CUT_COOLDOWN_DEFAULT = 6
|
||||
|
||||
MUX_AUDIO_DEFAULT = True
|
||||
FFMPEG_BIN = "ffmpeg"
|
||||
# ============================================================================
|
||||
|
||||
def clamp(v, lo, hi): return max(lo, min(hi, v))
|
||||
|
||||
def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int,
|
||||
out_w: int, out_h: int, zoom_pad_frac: float) -> tuple[int,int,int,int]:
|
||||
"""9:16 (out_w:out_h) Crop um (cx,cy) — ohne Squeeze, mit Zoom-Pad, im Bild gehalten."""
|
||||
target_ar = out_w / out_h
|
||||
src_ar = src_w / src_h
|
||||
if src_ar >= target_ar:
|
||||
base_h = src_h
|
||||
base_w = int(round(base_h * target_ar))
|
||||
else:
|
||||
base_w = src_w
|
||||
base_h = int(round(base_w / target_ar))
|
||||
|
||||
desired_scale = 1.0 + zoom_pad_frac
|
||||
s = min(desired_scale, src_w / base_w, src_h / base_h)
|
||||
w = int(round(base_w * s))
|
||||
h = int(round(base_h * s))
|
||||
half_w, half_h = w // 2, h // 2
|
||||
|
||||
cx = clamp(cx, half_w, src_w - half_w)
|
||||
cy = clamp(cy, half_h, src_h - half_h)
|
||||
x = int(round(cx - half_w))
|
||||
y = int(round(cy - half_h))
|
||||
return x, y, w, h
|
||||
|
||||
def draw_center(img, pt, color, label=None):
|
||||
if pt is None: return
|
||||
x, y = int(pt[0]), int(pt[1])
|
||||
cv2.circle(img, (x, y), 6, color, -1)
|
||||
if label:
|
||||
cv2.putText(img, label, (x + 8, y - 8),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
|
||||
|
||||
def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
|
||||
a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
|
||||
b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
|
||||
ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
|
||||
hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
|
||||
cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
|
||||
return float((cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + 1.0)/2.0)
|
||||
|
||||
def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
|
||||
cmd = [
|
||||
FFMPEG_BIN, "-y",
|
||||
"-i", str(src_video),
|
||||
"-i", str(silent_video),
|
||||
"-map", "1:v:0",
|
||||
"-map", "0:a:0?",
|
||||
"-c:v", "copy",
|
||||
"-c:a", "aac", "-b:a", "192k",
|
||||
"-shortest",
|
||||
str(out_video),
|
||||
]
|
||||
subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
|
||||
def load_faces(name: str) -> List[Dict[str,Any]]:
|
||||
p = FACE_COMBINED_DIR / f"{name}_faces.json"
|
||||
if not p.exists(): return []
|
||||
return json.loads(p.read_text(encoding="utf-8"))
|
||||
|
||||
def load_target_map_or_segments(name: str, total_frames: int) -> List[Optional[int] | Dict]:
|
||||
"""
|
||||
Bevorzugt *_target_by_frame.json (Liste Dicts mit t,cx,cy,w,h).
|
||||
Fallback: *_segments.json (pro Frame Track-ID).
|
||||
Gibt Liste gleicher Länge wie total_frames zurück.
|
||||
"""
|
||||
map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
|
||||
if map_p.exists():
|
||||
target = json.loads(map_p.read_text(encoding="utf-8"))
|
||||
# Falls es Dicts sind (cx,cy,w,h pro frame), einfach zurückgeben:
|
||||
if target and isinstance(target[0], dict):
|
||||
if len(target) < total_frames:
|
||||
last = target[-1] if target else {"t":0,"cx":0.5,"cy":0.5,"w":0.6,"h":0.6}
|
||||
target += [last] * (total_frames - len(target))
|
||||
return target[:total_frames]
|
||||
# Falls numerische IDs drin wären, fällt es unten durch auf segs-Logik
|
||||
seg_p = SEGMENTS_DIR / f"{name}_segments.json"
|
||||
if seg_p.exists():
|
||||
segs = json.loads(seg_p.read_text(encoding="utf-8"))
|
||||
target_tid = [None]*total_frames
|
||||
for s in segs:
|
||||
a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
|
||||
for t in range(max(0,a), min(total_frames, b+1)):
|
||||
target_tid[t] = tid
|
||||
return target_tid
|
||||
return [None]*total_frames
|
||||
|
||||
def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
|
||||
if target_tid is None:
|
||||
return fallback
|
||||
faces = faces_frame.get("faces", [])
|
||||
for f in faces:
|
||||
if int(f.get("track_id", -1)) == int(target_tid):
|
||||
x,y,w,h = f.get("bbox", [None,None,None,None])
|
||||
if None not in (x,y,w,h):
|
||||
return (float(x + w/2), float(y + h/2))
|
||||
return fallback
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="Apply 9:16 Auto-Crop auf Rohclips mit Face-/Target-Daten.")
|
||||
p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: *.mp4)")
|
||||
p.add_argument("--out_w", type=int, default=OUT_W_DEFAULT, help="Output-Breite (Default: 1080)")
|
||||
p.add_argument("--out_h", type=int, default=OUT_H_DEFAULT, help="Output-Höhe (Default: 1920)")
|
||||
p.add_argument("--zoom_pad", type=float, default=ZOOM_PAD_FRAC_DEFAULT, help="Zoom-Pad (0..1, Default 0.10)")
|
||||
p.add_argument("--median", type=int, default=MEDIAN_WIN_DEFAULT, help="Median-Fenster (ungerade, >=3)")
|
||||
p.add_argument("--ema", type=float, default=EMA_ALPHA_DEFAULT, help="EMA-Alpha (0..1)")
|
||||
p.add_argument("--deadband", type=float, default=DEADBAND_PX_DEFAULT, help="Totband in Pixel")
|
||||
p.add_argument("--switch_cd", type=int, default=SWITCH_COOLDOWN_FR_DEFAULT, help="Cooldown-Frames nach Trackwechsel")
|
||||
p.add_argument("--cut_detect", action="store_true", default=USE_CUT_DETECT_DEFAULT, help="Szenenschnitt-Erkennung aktivieren")
|
||||
p.add_argument("--cut_corr", type=float, default=CUT_CORR_THRESH_DEFAULT, help="Korrelation-Schwelle (0..1)")
|
||||
p.add_argument("--cut_cd", type=int, default=CUT_COOLDOWN_DEFAULT, help="Cooldown-Frames nach Cut")
|
||||
p.add_argument("--mux_audio", action="store_true", default=MUX_AUDIO_DEFAULT, help="Audio vom Original muxen")
|
||||
p.add_argument("--debug", action="store_true", help="Debug-Overlay anzeigen (langsam)")
|
||||
p.add_argument("--debug_scale", type=float, default=DEBUG_SCALE_DEFAULT, help="Skalierung Debug-Preview")
|
||||
p.add_argument("--overwrite", action="store_true", help="Existierende Outputs überschreiben")
|
||||
return p.parse_args()
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
OUT_DIR = CROPPED_DIR
|
||||
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||
clips = sorted(list(RAW_CLIPS_DIR.glob(args.pattern)))
|
||||
if not clips:
|
||||
print(f"⚠️ Keine Clips in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'")
|
||||
return
|
||||
|
||||
print(f"🔎 {len(clips)} Clips gefunden …")
|
||||
for video_path in clips:
|
||||
name = video_path.stem
|
||||
out_path = OUT_DIR / f"{name}_9x16.mp4"
|
||||
if out_path.exists() and not args.overwrite:
|
||||
print(f"⏭️ Skip (existiert): {out_path.name}")
|
||||
continue
|
||||
|
||||
# Video öffnen
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
if not cap.isOpened():
|
||||
print(f"❌ Kann Video nicht öffnen: {video_path.name}")
|
||||
continue
|
||||
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
|
||||
# Face/Target laden
|
||||
faces_all = load_faces(name)
|
||||
if faces_all and len(faces_all) < total:
|
||||
faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
|
||||
target_by_frame = load_target_map_or_segments(name, total)
|
||||
|
||||
# Writer vorbereiten
|
||||
writer = cv2.VideoWriter(str(out_path),
|
||||
cv2.VideoWriter_fourcc(*"mp4v"),
|
||||
fps, (args.out_w, args.out_h))
|
||||
|
||||
median_buf = deque(maxlen=max(3, args.median if args.median % 2 else args.median+1))
|
||||
ema_center: Optional[Tuple[float,float]] = None
|
||||
last_center: Optional[Tuple[float,float]] = (width/2, height/2)
|
||||
switch_cooldown = 0
|
||||
|
||||
prev_small = None
|
||||
cut_cd = 0
|
||||
|
||||
print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
|
||||
|
||||
for t in range(total):
|
||||
ret, frame = cap.read()
|
||||
if not ret: break
|
||||
|
||||
# Ziel bestimmen:
|
||||
desired = None
|
||||
tgt = target_by_frame[t] if t < len(target_by_frame) else None
|
||||
|
||||
# Fall A: target_by_frame.json mit direkten Zentren (Dict)
|
||||
if isinstance(tgt, dict) and all(k in tgt for k in ("cx","cy","w","h")):
|
||||
desired = (float(tgt["cx"])*width, float(tgt["cy"])*height)
|
||||
else:
|
||||
# Fall B: numerische Track-ID
|
||||
target_tid = tgt if tgt is None or isinstance(tgt, (int, float)) else None
|
||||
faces_fr = faces_all[t] if (faces_all and t < len(faces_all)) else {"faces":[]}
|
||||
desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
|
||||
|
||||
# Szenenschnitt?
|
||||
if args.cut_detect:
|
||||
small = cv2.resize(frame, (128, 72))
|
||||
if prev_small is not None:
|
||||
corr = scene_corr(prev_small, small)
|
||||
if corr < args.cut_corr:
|
||||
ema_center = desired
|
||||
last_center = desired
|
||||
switch_cooldown = args.switch_cd
|
||||
cut_cd = args.cut_cd
|
||||
prev_small = small
|
||||
|
||||
# Median-Filter
|
||||
median_buf.append(desired)
|
||||
if len(median_buf) >= 3:
|
||||
xs = sorted(p[0] for p in median_buf)
|
||||
ys = sorted(p[1] for p in median_buf)
|
||||
m = len(median_buf)//2
|
||||
desired_f = (xs[m], ys[m])
|
||||
else:
|
||||
desired_f = desired
|
||||
|
||||
# Trackwechsel erkennen (nur bei Track-IDs sauber möglich)
|
||||
if t > 0:
|
||||
prev_tgt = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
|
||||
else:
|
||||
prev_tgt = tgt
|
||||
is_switch = (not isinstance(tgt, dict)) and (tgt != prev_tgt)
|
||||
|
||||
if ema_center is None:
|
||||
ema_center = desired_f
|
||||
if last_center is None:
|
||||
last_center = desired_f
|
||||
|
||||
if is_switch:
|
||||
ema_center = desired_f
|
||||
last_center = desired_f
|
||||
switch_cooldown = args.switch_cd
|
||||
else:
|
||||
dx = desired_f[0] - ema_center[0]
|
||||
dy = desired_f[1] - ema_center[1]
|
||||
dist = math.hypot(dx, dy)
|
||||
if cut_cd > 0:
|
||||
ema_center = desired_f
|
||||
cut_cd -= 1
|
||||
else:
|
||||
if dist > args.deadband:
|
||||
ema_center = (ema_center[0] + dx*args.ema,
|
||||
ema_center[1] + dy*args.ema)
|
||||
|
||||
last_center = desired_f
|
||||
|
||||
# 9:16 Crop anwenden
|
||||
x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height,
|
||||
args.out_w, args.out_h, args.zoom_pad)
|
||||
cropped = frame[y:y+h, x:x+w]
|
||||
if cropped.size == 0: cropped = frame
|
||||
final = cv2.resize(cropped, (args.out_w, args.out_h), interpolation=cv2.INTER_AREA)
|
||||
writer.write(final)
|
||||
|
||||
if args.debug:
|
||||
dbg = frame.copy()
|
||||
cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
|
||||
draw_center(dbg, desired, (128,128,255), "desired")
|
||||
draw_center(dbg, desired_f, (255,255, 0), "median")
|
||||
draw_center(dbg, ema_center, ( 0,255,255), "ema")
|
||||
cv2.putText(dbg, f"t={t+1}/{total}", (12, height-14),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
|
||||
disp = cv2.resize(dbg, (int(width*args.debug_scale), int(height*args.debug_scale)))
|
||||
cv2.imshow("Apply Debug", disp)
|
||||
if cv2.waitKey(1) & 0xFF == ord("q"):
|
||||
print("🛑 Abgebrochen (q).")
|
||||
break
|
||||
|
||||
writer.release()
|
||||
cap.release()
|
||||
|
||||
# Audio muxen?
|
||||
if args.mux_audio:
|
||||
tmp = out_path.with_suffix(".tmp.mp4")
|
||||
try:
|
||||
out_path.rename(tmp)
|
||||
mux_audio_from_source(video_path, tmp, out_path)
|
||||
finally:
|
||||
if tmp.exists():
|
||||
try: tmp.unlink()
|
||||
except: pass
|
||||
print(f"✅ Fertig (mit Audio): {out_path.name}")
|
||||
else:
|
||||
print(f"✅ Fertig: {out_path.name}")
|
||||
|
||||
if args.debug:
|
||||
cv2.destroyAllWindows()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
src/reformat/main_detect_faces.py (new file): 335 lines added
@@ -0,0 +1,335 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
Face-Detection + Mouth-Openness (YOLOv8-face + MediaPipe)
|
||||
- liest Rohclips aus RAW_CLIPS_DIR
|
||||
- schreibt pro Video eine faces.json in FACE_COMBINED_DIR
|
||||
- optionaler Fortschrittsbalken (tqdm)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from contextlib import nullcontext
|
||||
from typing import List, Dict, Any
|
||||
from src.reformat.speaking import get_mouth_openness
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
from ultralytics import YOLO
|
||||
import mediapipe as mp
|
||||
import sys
|
||||
|
||||
# ── Projekt-Root + zentrale Pfade laden
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR # zentrale Verzeichnisse
|
||||
|
||||
# Fortschritt hübsch, wenn verfügbar
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
_HAS_TQDM = True
|
||||
except Exception:
|
||||
_HAS_TQDM = False
|
||||
|
||||
# ---------- Performance Tweaks ----------
|
||||
torch.set_float32_matmul_precision("high")
|
||||
cv2.setUseOptimized(True)
|
||||
|
||||
# ---------- Hilfsfunktionen ----------
|
||||
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
|
||||
cx = (x1 + x2) * 0.5
|
||||
cy = (y1 + y2) * 0.5
|
||||
w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
|
||||
h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
|
||||
side = max(w, h, float(min_crop))
|
||||
half = side * 0.5
|
||||
|
||||
sx1 = int(max(0, round(cx - half)))
|
||||
sy1 = int(max(0, round(cy - half)))
|
||||
sx2 = int(min(W, round(cx + half)))
|
||||
sy2 = int(min(H, round(cy + half)))
|
||||
|
||||
side_w = max(0, sx2 - sx1)
|
||||
side_h = max(0, sy2 - sy1)
|
||||
side = max(2, min(side_w, side_h))
|
||||
sx2 = sx1 + side
|
||||
sy2 = sy1 + side
|
||||
return sx1, sy1, sx2, sy2
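A quick sanity check of the crop geometry (illustrative numbers, not from the commit):
# make_square_crop(400, 300, 500, 400, 1920, 1080, margin_scale=0.4, min_crop=160)
# expands the 100x100 box by 1 + 2*0.4 = 1.8 to a 180-pixel square around (450, 350)
# and returns (360, 260, 540, 440).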
|
||||
|
||||
|
||||
def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
|
||||
if not lm_lists:
|
||||
return None
|
||||
cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
|
||||
best, best_d = None, 1e12
|
||||
for lms in lm_lists:
|
||||
xs = [p.x * crop_w for p in lms.landmark]
|
||||
ys = [p.y * crop_h for p in lms.landmark]
|
||||
cx = sum(xs) / len(xs)
|
||||
cy = sum(ys) / len(ys)
|
||||
d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
|
||||
if d < best_d:
|
||||
best, best_d = lms, d
|
||||
return best
|
||||
|
||||
|
||||
def run_mesh(face_mesh, crop_bgr, upscale_if_small):
|
||||
if crop_bgr.size == 0:
|
||||
return None, 0.0
|
||||
ch, cw = crop_bgr.shape[:2]
|
||||
if max(ch, cw) < upscale_if_small:
|
||||
scale = float(upscale_if_small) / max(ch, cw)
|
||||
new_w = max(1, int(round(cw * scale)))
|
||||
new_h = max(1, int(round(ch * scale)))
|
||||
crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||||
ch, cw = new_h, new_w
|
||||
rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
|
||||
res = face_mesh.process(rgb)
|
||||
if not res.multi_face_landmarks:
|
||||
return None, 0.0
|
||||
chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
|
||||
if chosen is None:
|
||||
return None, 0.0
|
||||
mo = get_mouth_openness(chosen.landmark, ch)
|
||||
return chosen, float(mo)
|
||||
|
||||
# ---------- Kernprozess ----------
|
||||
def process_video(video_path: Path,
|
||||
output_path: Path,
|
||||
model: YOLO,
|
||||
face_mesh,
|
||||
conf_thresh: float,
|
||||
frame_skip: int,
|
||||
downscale: float,
|
||||
expansion_1: float,
|
||||
expansion_2: float,
|
||||
min_crop: int,
|
||||
faces_upscale: int,
|
||||
imgsz: int,
|
||||
device: str,
|
||||
max_det: int):
|
||||
print(f"🎬 Starte Detection: {video_path.name}")
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
if not cap.isOpened():
|
||||
logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
|
||||
return
|
||||
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
total_to_process = None
|
||||
if total_frames_raw > 0:
|
||||
total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
|
||||
|
||||
scaled_w = max(1, int(round(orig_w * downscale)))
|
||||
scaled_h = max(1, int(round(orig_h * downscale)))
|
||||
|
||||
data: List[Dict[str, Any]] = []
|
||||
frame_idx = 0
|
||||
processed_frames = 0
|
||||
|
||||
sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
|
||||
sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
|
||||
|
||||
autocast_ctx = (
|
||||
torch.autocast(device_type=device, dtype=torch.float16)
|
||||
if device in ("mps", "cuda") else nullcontext()
|
||||
)
|
||||
|
||||
bar = None
|
||||
start_t = time.time()
|
||||
if _HAS_TQDM and total_to_process:
|
||||
bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
|
||||
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
if frame_skip > 1 and (frame_idx % frame_skip != 0):
|
||||
frame_idx += 1
|
||||
continue
|
||||
|
||||
frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
|
||||
|
||||
with torch.no_grad():
|
||||
with autocast_ctx:
|
||||
# Ultralytics 8 API: __call__ statt .predict() (beide funktionieren)
|
||||
result = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
|
||||
conf=conf_thresh, iou=0.5, max_det=max_det)
|
||||
detections = result[0]
|
||||
|
||||
faces = []
|
||||
for i in range(len(detections.boxes)):
|
||||
box = detections.boxes[i]
|
||||
conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
|
||||
if conf < conf_thresh:
|
||||
continue
|
||||
x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
|
||||
if downscale != 1.0:
|
||||
x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
|
||||
x1 = max(0.0, min(x1, orig_w - 1))
|
||||
y1 = max(0.0, min(y1, orig_h - 1))
|
||||
x2 = max(0.0, min(x2, orig_w - 1))
|
||||
y2 = max(0.0, min(y2, orig_h - 1))
|
||||
|
||||
w = max(1.0, x2 - x1)
|
||||
h = max(1.0, y2 - y1)
|
||||
cx = x1 + w / 2.0
|
||||
cy = y1 + h / 2.0
|
||||
|
||||
# Pass 1
|
||||
sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
|
||||
if sx2 - sx1 < 4 or sy2 - sy1 < 4:
|
||||
continue
|
||||
face_crop = frame[sy1:sy2, sx1:sx2]
|
||||
_, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
|
||||
|
||||
# Pass 2 nur wenn nötig
|
||||
if mouth_open == 0.0:
|
||||
sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
|
||||
if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
|
||||
face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
|
||||
_, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
|
||||
|
||||
faces.append({
|
||||
"bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
|
||||
"conf": round(conf, 3),
|
||||
"center": [round(cx, 1), round(cy, 1)],
|
||||
"mouth_openness": round(float(mouth_open), 3)
|
||||
})
|
||||
|
||||
data.append({
|
||||
"frame": frame_idx,
|
||||
"timestamp": round(frame_idx / fps, 3),
|
||||
"W": orig_w,
|
||||
"H": orig_h,
|
||||
"faces": faces
|
||||
})
|
||||
frame_idx += 1
|
||||
processed_frames += 1
|
||||
|
||||
# Fortschritt
|
||||
if bar is not None:
|
||||
bar.update(1)
|
||||
else:
|
||||
if processed_frames % 30 == 0:
|
||||
elapsed = time.time() - start_t
|
||||
rate = processed_frames / max(1e-6, elapsed) # frames/sec
|
||||
if total_to_process:
|
||||
remaining = max(0, total_to_process - processed_frames)
|
||||
eta_sec = remaining / max(1e-6, rate)
|
||||
print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
|
||||
f"({processed_frames/total_to_process*100:.1f}%) "
|
||||
f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
|
||||
else:
|
||||
print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
|
||||
|
||||
cap.release()
|
||||
if bar is not None:
|
||||
bar.close()
|
||||
|
||||
# schön formatiertes JSON
|
||||
output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
print(f"✅ Faces gespeichert: {output_path.name}")
|
||||
|
||||
# ---------- CLI ----------
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="YOLOv8-face + MediaPipe FaceMesh → faces.json pro Clip")
|
||||
# Verzeichnisse (Default aus config.py)
|
||||
p.add_argument("--input-dir", type=Path, default=RAW_CLIPS_DIR, help=f"Rohclips (Default: {RAW_CLIPS_DIR})")
|
||||
p.add_argument("--output-dir", type=Path, default=FACE_COMBINED_DIR, help=f"Zielordner (Default: {FACE_COMBINED_DIR})")
|
||||
# Modell
|
||||
p.add_argument("--model", type=Path, default=ROOT / "models" / "yolov8n-face.pt",
|
||||
help="Pfad zum YOLOv8-face Modell (.pt)")
|
||||
# Optimierte Defaults
|
||||
p.add_argument("--conf-thresh", type=float, default=0.35)
|
||||
p.add_argument("--frame-skip", type=int, default=1, help="Nur jeden n-ten Frame verarbeiten")
|
||||
p.add_argument("--downscale", type=float, default=0.5, help="Eingangsframe auf Faktor verkleinern (0..1)")
|
||||
p.add_argument("--expansion", type=float, default=0.4, help="Crop-Margin Pass 1 (relativ)")
|
||||
p.add_argument("--expansion2", type=float, default=0.8, help="Crop-Margin Pass 2 (relativ)")
|
||||
p.add_argument("--min-crop", type=int, default=160, help="Minimaler Croprand in Pixeln (quadratisch)")
|
||||
p.add_argument("--faces-upscale", type=int, default=192, help="Minimale Kantenlänge für FaceMesh (bei kleineren Crops upscalen)")
|
||||
p.add_argument("--imgsz", type=int, default=448)
|
||||
p.add_argument("--max-det", type=int, default=20)
|
||||
p.add_argument("--use-refine", action="store_true", default=False, help="MediaPipe mit refine_landmarks")
|
||||
return p.parse_args()
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# YOLO Modell & Device
|
||||
yolo = YOLO(str(args.model))
|
||||
if torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
elif torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
else:
|
||||
device = "cpu"
|
||||
yolo.to(device)
|
||||
print(f"🖥️ Inference-Device: {device}")
|
||||
|
||||
# Warmup
|
||||
try:
|
||||
with torch.no_grad():
|
||||
dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
|
||||
_ = yolo(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Eingabedateien anzeigen
|
||||
videos = sorted([*args.input_dir.glob("*.mp4"), *args.input_dir.glob("*.mov"), *args.input_dir.glob("*.mkv")])
|
||||
print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
|
||||
if not videos:
|
||||
print("⚠️ Keine passenden Videos gefunden.")
|
||||
return
|
||||
print("📁 Dateien:")
|
||||
for p in videos:
|
||||
print(" →", p.name)
|
||||
|
||||
outer = None
|
||||
if _HAS_TQDM:
|
||||
outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
|
||||
|
||||
with mp.solutions.face_mesh.FaceMesh(
|
||||
static_image_mode=False,
|
||||
max_num_faces=10,
|
||||
refine_landmarks=args.use_refine,
|
||||
min_detection_confidence=0.5,
|
||||
min_tracking_confidence=0.5
|
||||
) as face_mesh:
|
||||
for vid in videos:
|
||||
out = args.output_dir / f"{vid.stem}_faces.json"
|
||||
process_video(
|
||||
video_path=vid,
|
||||
output_path=out,
|
||||
model=yolo,
|
||||
face_mesh=face_mesh,
|
||||
conf_thresh=args.conf_thresh,
|
||||
frame_skip=args.frame_skip,
|
||||
downscale=args.downscale,
|
||||
expansion_1=args.expansion,
|
||||
expansion_2=args.expansion2,
|
||||
min_crop=args.min_crop,
|
||||
faces_upscale=args.faces_upscale,
|
||||
imgsz=args.imgsz,
|
||||
device=device,
|
||||
max_det=args.max_det
|
||||
)
|
||||
if outer is not None:
|
||||
outer.update(1)
|
||||
|
||||
if outer is not None:
|
||||
outer.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
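For reference, each record that process_video appends and writes to {clip}_faces.json has this shape (values are illustrative):

[
  {
    "frame": 0,
    "timestamp": 0.0,
    "W": 1920,
    "H": 1080,
    "faces": [
      {"bbox": [812, 240, 180, 180], "conf": 0.91, "center": [902.0, 330.0], "mouth_openness": 4.2}
    ]
  }
]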
|
78 src/reformat/main_track_faces.py (Normal file)
@@ -0,0 +1,78 @@
|
||||
#!/usr/bin/env python3
|
||||
import logging, json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
import sys
|
||||
|
||||
# Projekt-Root verfügbar machen
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import FACE_COMBINED_DIR, FACE_CROP_CENTERS # ggf. SEGMENTS_DIR, wenn du dorthin schreibst
|
||||
|
||||
|
||||
def iou(boxA, boxB):
|
||||
xA = max(boxA[0], boxB[0])
|
||||
yA = max(boxA[1], boxB[1])
|
||||
xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
|
||||
yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
|
||||
interW, interH = max(0, xB-xA), max(0, yB-yA)
|
||||
inter = interW * interH
|
||||
union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
|
||||
return inter/union if union > 0 else 0.0
|
||||
|
||||
def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
|
||||
next_id = 0
|
||||
last_boxes = {} # track_id -> bbox
|
||||
for frame in faces_all:
|
||||
new_boxes = {}
|
||||
for face in frame["faces"]:
|
||||
box = face["bbox"]
|
||||
# match gegen bestehende
|
||||
best_id, best_iou = None, 0.0
|
||||
for tid, prev_box in last_boxes.items():
|
||||
ov = iou(box, prev_box)
|
||||
if ov > best_iou:
|
||||
best_id, best_iou = tid, ov
|
||||
if best_iou > iou_thresh:
|
||||
face["track_id"] = best_id
|
||||
new_boxes[best_id] = box
|
||||
else:
|
||||
face["track_id"] = next_id
|
||||
new_boxes[next_id] = box
|
||||
next_id += 1
|
||||
last_boxes = new_boxes
|
||||
return faces_all
|
||||
|
||||
def main():
|
||||
# Eingabe: erkannte Gesichter/Tracks
|
||||
FACE_DIR = FACE_COMBINED_DIR
|
||||
# Ausgabe: z. B. berechnete Center pro Frame
|
||||
OUT_DIR = FACE_CROP_CENTERS
|
||||
OUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
for f in FACE_DIR.glob("*_faces.json"):
|
||||
try:
|
||||
faces_all = json.loads(f.read_text(encoding="utf-8"))
|
||||
except Exception as e:
|
||||
print(f"❌ Fehler beim Laden {f.name}: {e}")
|
||||
continue
|
||||
|
||||
tracked = track_faces(faces_all)
|
||||
f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
|
||||
print(f"✅ Track-IDs ergänzt: {f.name}")
|
||||
|
||||
# zusätzlich centers.json (dominant = höchster mouth_openness pro Frame)
|
||||
centers = []
|
||||
for fr in tracked:
|
||||
if fr["faces"]:
|
||||
best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
|
||||
centers.append([best["center"][0], best["center"][1]])
|
||||
else:
|
||||
centers.append([fr["W"]/2, fr["H"]/2])
|
||||
centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json")
|
||||
centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
|
||||
print(f"📝 Centers gespeichert: {centers_path.name}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
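A small worked example of the IoU matcher (illustrative boxes): for boxA = [0, 0, 100, 100] and boxB = [50, 0, 100, 100] the intersection is 50 * 100 = 5000 and the union is 10000 + 10000 - 5000 = 15000, so iou(boxA, boxB) ≈ 0.33 — just above the default iou_thresh of 0.3, so the face would keep its previous track_id.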
|
306 src/reformat/make_segments.py (Normal file)
@@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env python3
|
||||
# make_segments.py — erzeugt pro Highlight eine Zielspur (target_by_frame.json) fürs Cropping
|
||||
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Optional, Tuple
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/reformat/)
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS, SEGMENTS_DIR
|
||||
|
||||
try:
|
||||
from moviepy.video.io.VideoFileClip import VideoFileClip
|
||||
MOVIEPY_OK = True
|
||||
except Exception:
|
||||
MOVIEPY_OK = False
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Hilfsstrukturen
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
@dataclass
|
||||
class FaceDet:
|
||||
t: float # Sekunden
|
||||
cx: float # Zentrum x (0..1)
|
||||
cy: float # Zentrum y (0..1)
|
||||
w: float # Breite rel. (0..1)
|
||||
h: float # Höhe rel. (0..1)
|
||||
track_id: Optional[int] = None
|
||||
mouth_prob: Optional[float] = None
|
||||
|
||||
def moving_average(xs: List[float], win: int) -> List[float]:
|
||||
if win <= 1 or len(xs) <= 2:
|
||||
return xs[:]
|
||||
# ungerade Fensterbreite erzwingen
|
||||
win = win if win % 2 == 1 else win + 1
|
||||
r = win // 2
|
||||
out = []
|
||||
for i in range(len(xs)):
|
||||
a = max(0, i - r)
|
||||
b = min(len(xs), i + r + 1)
|
||||
out.append(sum(xs[a:b]) / (b - a))
|
||||
return out
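A short usage note for the smoothing helper:
# moving_average([0, 0, 10, 0, 0], 3) -> [0.0, 10/3, 10/3, 10/3, 0.0]
# (the window shrinks at the edges, so there are no padding artefacts)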
|
||||
|
||||
def clamp01(x: float) -> float:
|
||||
return max(0.0, min(1.0, x))
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Lesen möglicher Eingabeformate (robust, schema-tolerant)
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def _parse_face_like(obj: Dict, t: float, W: float | None = None, H: float | None = None) -> FaceDet:
|
||||
"""
|
||||
Erwartet entweder:
|
||||
- bbox=[x,y,w,h] in Pixel → wird via W,H auf 0..1 normiert
|
||||
- oder bereits normierte Felder cx,cy,w,h in 0..1
|
||||
Optional: track_id, mouth_prob / mouth_open / talking_prob
|
||||
"""
|
||||
if "bbox" in obj and isinstance(obj["bbox"], (list, tuple)) and len(obj["bbox"]) >= 4:
|
||||
x, y, w, h = [float(v) for v in obj["bbox"][:4]]
|
||||
if W and H and W > 0 and H > 0:
|
||||
cx = (x + w * 0.5) / W
|
||||
cy = (y + h * 0.5) / H
|
||||
w = w / W
|
||||
h = h / H
|
||||
else:
|
||||
# Falls Maße fehlen: best effort, danach clampen
|
||||
cx = x + w * 0.5
|
||||
cy = y + h * 0.5
|
||||
cx, cy = clamp01(cx), clamp01(cy)
|
||||
w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
|
||||
else:
|
||||
cx = float(obj.get("cx", 0.5))
|
||||
cy = float(obj.get("cy", 0.5))
|
||||
w = float(obj.get("w", 0.3))
|
||||
h = float(obj.get("h", 0.3))
|
||||
cx, cy = clamp01(cx), clamp01(cy)
|
||||
w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h))
|
||||
|
||||
track_id = obj.get("track_id")
|
||||
mouth_prob = obj.get("mouth_prob") or obj.get("mouth_open") or obj.get("talking_prob")
|
||||
mouth_prob = None if mouth_prob is None else float(mouth_prob)
|
||||
|
||||
return FaceDet(t=t, cx=cx, cy=cy, w=w, h=h, track_id=track_id, mouth_prob=mouth_prob)
|
||||
|
||||
|
||||
def load_faces_or_centers(stem: str, fps_hint: float | None = None) -> List[FaceDet]:
|
||||
"""
|
||||
Lädt die beste verfügbare Gesichts/Center-Quelle für ein Highlight.
|
||||
Suchreihenfolge:
|
||||
1) FACE_COMBINED_DIR/{stem}_faces.json (Liste von Frames mit 'faces')
|
||||
2) FACE_CROP_CENTERS/{stem}_centers.json
|
||||
- akzeptiert entweder [[cx,cy], ...] oder [{t,cx,cy,w,h}, ...]
|
||||
"""
|
||||
candidates = [
|
||||
(FACE_COMBINED_DIR / f"{stem}_faces.json", "faces"),
|
||||
(FACE_CROP_CENTERS / f"{stem}_centers.json", "centers"),
|
||||
]
|
||||
path = kind = None
|
||||
for p, k in candidates:
|
||||
if p.exists():
|
||||
path, kind = p, k
|
||||
break
|
||||
|
||||
if path is None:
|
||||
print(f"⚠️ Keine Face/Centers-Datei gefunden für {stem}. Fallback später → (0.5,0.5).")
|
||||
return []
|
||||
|
||||
try:
|
||||
raw = path.read_text(encoding="utf-8")
|
||||
data = json.loads(raw)
|
||||
except Exception as e:
|
||||
print(f"❌ Konnte {path.name} nicht lesen: {e}")
|
||||
return []
|
||||
|
||||
dets: List[FaceDet] = []
|
||||
|
||||
# 1) Liste von Frames: [{ "W":..,"H":..,"timestamp"/"t":.., "faces":[...] }, ...]
|
||||
if isinstance(data, list) and data and isinstance(data[0], dict) and "faces" in data[0]:
|
||||
for fr in data:
|
||||
W = float(fr.get("W") or 0.0)
|
||||
H = float(fr.get("H") or 0.0)
|
||||
t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
|
||||
for f in fr.get("faces", []):
|
||||
dets.append(_parse_face_like(f, t, W, H))
|
||||
|
||||
# 2) Dict mit "frames": [...]
|
||||
elif isinstance(data, dict) and "frames" in data:
|
||||
for fr in data["frames"]:
|
||||
W = float(fr.get("W") or 0.0)
|
||||
H = float(fr.get("H") or 0.0)
|
||||
t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0)
|
||||
for f in fr.get("faces", []):
|
||||
dets.append(_parse_face_like(f, t, W, H))
|
||||
|
||||
# 3) centers.json als Liste von Listen: [[cx,cy], ...]
|
||||
elif isinstance(data, list) and data and isinstance(data[0], (list, tuple)) and len(data[0]) >= 2:
|
||||
fps = float(fps_hint or 25.0)
|
||||
for i, pair in enumerate(data):
|
||||
cx, cy = float(pair[0]), float(pair[1])
|
||||
dets.append(FaceDet(t=i / fps, cx=clamp01(cx), cy=clamp01(cy), w=0.6, h=0.6))
|
||||
|
||||
# 4) Liste von Dicts mit evtl. bereits normierten Feldern
|
||||
elif isinstance(data, list) and data and isinstance(data[0], dict):
|
||||
for item in data:
|
||||
t = float(item.get("t") or item.get("time") or 0.0)
|
||||
dets.append(_parse_face_like(item, t))
|
||||
|
||||
else:
|
||||
print(f"⚠️ Unbekanntes JSON-Format in {path.name}.")
|
||||
return []
|
||||
|
||||
# filtern & sortieren
|
||||
dets = [d for d in dets if 0.0 <= d.cx <= 1.0 and 0.0 <= d.cy <= 1.0]
|
||||
dets.sort(key=lambda d: d.t)
|
||||
print(f"✅ {len(dets)} Detektionen aus {path.name} ({kind}).")
|
||||
return dets
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Zielspur berechnen
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def build_target_by_frame(
|
||||
faces: List[FaceDet],
|
||||
duration: float,
|
||||
fps: float,
|
||||
smooth_win: int = 7
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Wählt pro Frame eine Zielposition (cx,cy,w,h).
|
||||
Heuristik:
|
||||
- bevorzuge Gesicht mit höchster mouth_prob (wenn vorhanden),
|
||||
- sonst größtes Bounding-Box-Areal (w*h),
|
||||
- halte IDs stabil (nicht zu häufige Sprünge).
|
||||
Anschließend leichte Glättung (Moving Average) der Zentren/Größen.
|
||||
"""
|
||||
if fps <= 0:
|
||||
fps = 25.0
|
||||
total_frames = max(1, int(round(duration * fps)))
|
||||
if not faces:
|
||||
# Fallback: center track
|
||||
return [{"frame": i, "t": round(i / fps, 4), "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6} for i in range(total_frames)]
|
||||
|
||||
frame_targets: List[Tuple[float, float, float, float]] = [] # (cx, cy, w, h)
|
||||
last_track: Optional[int] = None
|
||||
|
||||
# lineare Suche über faces (bei Bedarf später bucketisieren)
|
||||
for i in range(total_frames):
|
||||
t = i / fps
|
||||
lo, hi = t - 1.0 / fps, t + 1.0 / fps
|
||||
|
||||
cand: List[FaceDet] = [d for d in faces if lo <= d.t <= hi]
|
||||
if not cand:
|
||||
# Nimm den zeitlich nächsten
|
||||
nearest = min(faces, key=lambda d: abs(d.t - t))
|
||||
cand = [nearest]
|
||||
|
||||
def score(d: FaceDet) -> Tuple[float, float, float]:
|
||||
mouth = -1.0 if d.mouth_prob is None else float(d.mouth_prob) # None schlechter als 0
|
||||
area = float(d.w) * float(d.h)
|
||||
stable = 1.0 if (last_track is not None and d.track_id == last_track) else 0.0
|
||||
return (mouth, area, stable)
|
||||
|
||||
cand.sort(key=score, reverse=True)
|
||||
best = cand[0]
|
||||
if best.track_id is not None:
|
||||
last_track = best.track_id
|
||||
frame_targets.append((best.cx, best.cy, best.w, best.h))
|
||||
|
||||
# Glätten
|
||||
cxs = moving_average([c for c, _, _, _ in frame_targets], smooth_win)
|
||||
cys = moving_average([c for _, c, _, _ in frame_targets], smooth_win)
|
||||
ws = moving_average([w for *_, w, _ in frame_targets], max(3, smooth_win // 2))
|
||||
hs = moving_average([h for *_, _, h in frame_targets], max(3, smooth_win // 2))
|
||||
|
||||
out = []
|
||||
for i, (cx, cy, w, h) in enumerate(zip(cxs, cys, ws, hs)):
|
||||
t = i / fps
|
||||
out.append({
|
||||
"frame": i,
|
||||
"t": round(t, 4),
|
||||
"cx": round(clamp01(cx), 4),
|
||||
"cy": round(clamp01(cy), 4),
|
||||
"w": round(max(0.05, min(1.0, w)), 4),
|
||||
"h": round(max(0.05, min(1.0, h)), 4),
|
||||
})
|
||||
return out
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# I/O
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def write_target_json(stem: str, target: List[Dict]) -> Path:
|
||||
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
out_path = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
|
||||
out_path.write_text(json.dumps(target, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
return out_path
|
||||
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# CLI / Main
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="Erzeugt target_by_frame.json aus Face/Center-Detektionen für Cropping.")
|
||||
p.add_argument("--pattern", type=str, default="highlight_*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: highlight_*.mp4)")
|
||||
p.add_argument("--fps", type=float, default=0.0, help="FPS erzwingen (0 = aus Video lesen).")
|
||||
p.add_argument("--smooth", type=int, default=7, help="Fensterbreite für Moving-Average-Glättung (ungerade).")
|
||||
p.add_argument("--overwrite", action="store_true", help="Existierende target_by_frame.json überschreiben.")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def main():
|
||||
if not MOVIEPY_OK:
|
||||
raise RuntimeError("moviepy ist nicht installiert. Bitte `pip install moviepy` ausführen.")
|
||||
|
||||
args = parse_args()
|
||||
|
||||
vids = sorted(RAW_CLIPS_DIR.glob(args.pattern))
|
||||
if not vids:
|
||||
print(f"⚠️ Keine Rohclips gefunden in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'.")
|
||||
return
|
||||
|
||||
print(f"🔎 Finde {len(vids)} Clips …")
|
||||
|
||||
for vid in vids:
|
||||
stem = vid.stem # z. B. highlight_3
|
||||
out_json = SEGMENTS_DIR / f"{stem}_target_by_frame.json"
|
||||
if out_json.exists() and not args.overwrite:
|
||||
print(f"⏭️ {out_json.name} existiert bereits – überspringe (nutze --overwrite zum Ersetzen).")
|
||||
continue
|
||||
|
||||
# Video-Metadaten
|
||||
try:
|
||||
with VideoFileClip(str(vid)) as V:
|
||||
duration = float(V.duration or 0.0)
|
||||
fps = float(args.fps or (V.fps or 25.0))
|
||||
except Exception as e:
|
||||
print(f"❌ Kann Video {vid.name} nicht öffnen: {e} – Fallback duration/fps (10s/25fps).")
|
||||
duration, fps = 10.0, (args.fps or 25.0)
|
||||
|
||||
# Face/Centers laden (fps_hint durchreichen, wichtig für centers-Listen)
|
||||
faces = load_faces_or_centers(stem, fps_hint=fps)
|
||||
|
||||
# Zielspur bauen
|
||||
target = build_target_by_frame(faces, duration=duration, fps=fps, smooth_win=args.smooth)
|
||||
|
||||
# Schreiben
|
||||
out = write_target_json(stem, target)
|
||||
print(f"💾 geschrieben: {out}")
|
||||
|
||||
print("🎉 Fertig.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
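For orientation, the per-frame records written by write_target_json look like this at 25 fps (illustrative values):

[
  {"frame": 0, "t": 0.0, "cx": 0.52, "cy": 0.41, "w": 0.6, "h": 0.6},
  {"frame": 1, "t": 0.04, "cx": 0.52, "cy": 0.41, "w": 0.6, "h": 0.6}
]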
|
118 src/reformat/new/analyze_mouth_activity.py (Normal file)
@@ -0,0 +1,118 @@
|
||||
#!/usr/bin/env python3
|
||||
# src/reformat/new/analyze_mouth_activity.py
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Tuple, Optional
|
||||
|
||||
# OpenAI optional; aktuell nicht genutzt (Flag fehlt bewusst)
|
||||
# from openai import OpenAI
|
||||
|
||||
# === HARTE DEFAULTS: einfach Play drücken ===
|
||||
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
|
||||
RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
|
||||
FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||
TIMED_DIR = PROJECT_ROOT / "data" / "transkripte"
|
||||
CENTERS_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
|
||||
|
||||
def parse_timed_file(path: Path) -> List[Tuple[float, float]]:
|
||||
"""
|
||||
Erwartet Zeilen wie:
|
||||
[00:00.00 - 00:05.20] Text...
|
||||
Gibt Liste [(start_sec, end_sec)] zurück. Falls keine Treffer: leere Liste.
|
||||
"""
|
||||
import re
|
||||
rx = re.compile(r"\[(\d+):(\d+)\.(\d+)\s*-\s*(\d+):(\d+)\.(\d+)\]")
|
||||
segs = []
|
||||
try:
|
||||
for line in path.read_text(encoding="utf-8").splitlines():
|
||||
m = rx.search(line)
|
||||
if not m:
|
||||
continue
|
||||
smin, ssec, sms, emin, esec, ems = map(int, m.groups())
|
||||
start = smin * 60 + ssec + sms / 100.0
|
||||
end = emin * 60 + esec + ems / 100.0
|
||||
if end > start:
|
||||
segs.append((start, end))
|
||||
except FileNotFoundError:
|
||||
pass
|
||||
return segs
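For example (illustrative input), the line "[00:01.50 - 00:04.20] Hallo zusammen" parses to (1.5, 4.2): minutes * 60 + seconds + hundredths / 100 on each side of the dash.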
|
||||
|
||||
def select_speaker_center(faces: List[Dict[str, Any]]) -> Tuple[float, float]:
|
||||
"""Priorität: mouth_openness, Fallback: größte Fläche; sonst Bildmitte."""
|
||||
if not faces:
|
||||
return (960.0, 540.0)
|
||||
def area(f):
|
||||
bx = f.get("bbox",[0,0,0,0]); return float(bx[2]*bx[3])
|
||||
best = max(
|
||||
faces,
|
||||
key=lambda f: (float(f.get("mouth_openness", 0.0)), area(f))
|
||||
)
|
||||
x, y, w, h = best["bbox"]
|
||||
return (x + w/2.0, y + h/2.0)
|
||||
|
||||
def load_json(path: Path):
|
||||
import json
|
||||
return json.loads(path.read_text(encoding="utf-8"))
|
||||
|
||||
def save_json(obj, path: Path):
|
||||
import json
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8")
|
||||
|
||||
def process_one(base_name: str) -> bool:
|
||||
faces_path = FACES_DIR / f"{base_name}_faces.json"
|
||||
timed_path = TIMED_DIR / f"{base_name}_timed.txt"
|
||||
centers_path = CENTERS_DIR / f"{base_name}_centers.json"
|
||||
|
||||
if not faces_path.exists():
|
||||
logging.warning("Skip %-18s | Faces fehlen: %s", base_name, faces_path)
|
||||
return False
|
||||
if centers_path.exists():
|
||||
logging.info("Skip %-18s | Centers existieren schon: %s", base_name, centers_path.name)
|
||||
return True
|
||||
|
||||
try:
|
||||
face_data: List[Dict[str, Any]] = load_json(faces_path)
|
||||
except Exception as e:
|
||||
logging.error("Fehler beim Lesen von %s: %s", faces_path, e)
|
||||
return False
|
||||
|
||||
segments = parse_timed_file(timed_path)
|
||||
if not segments:
|
||||
logging.warning("[%s] Keine Segmente erkannt oder Datei fehlt: %s", base_name, timed_path.name)
|
||||
|
||||
centers: List[List[float]] = []
|
||||
for entry in face_data:
|
||||
faces = entry.get("faces", [])
|
||||
cx, cy = select_speaker_center(faces)
|
||||
centers.append([float(cx), float(cy)])
|
||||
|
||||
save_json(centers, centers_path)
|
||||
logging.info("OK %-18s | Centers gespeichert: %s (frames=%d)", base_name, centers_path.name, len(centers))
|
||||
return True
|
||||
|
||||
def main():
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s %(levelname)s: %(message)s",
|
||||
level=logging.INFO
|
||||
)
|
||||
|
||||
if not RAW_DIR.exists():
|
||||
logging.error("RAW_DIR existiert nicht: %s", RAW_DIR)
|
||||
return
|
||||
|
||||
clips = sorted(RAW_DIR.glob("*.mp4"))
|
||||
if not clips:
|
||||
logging.warning("Keine Clips gefunden in %s", RAW_DIR)
|
||||
return
|
||||
|
||||
logging.info("Analyze (mouth) Batch-Mode: %d Clips", len(clips))
|
||||
ok = 0
|
||||
for clip in clips:
|
||||
base = clip.stem
|
||||
if process_one(base):
|
||||
ok += 1
|
||||
logging.info("Fertig. %d/%d Clips verarbeitet.", ok, len(clips))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
294 src/reformat/new/main_apply_crop.py (Normal file)
@@ -0,0 +1,294 @@
|
||||
#!/usr/bin/env python3
|
||||
# src/reformat/new/main_apply_crop.py
|
||||
from __future__ import annotations
|
||||
import logging, json, math, subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional, Tuple, List, Dict, Any
|
||||
from collections import deque
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
|
||||
# ==== Pfade =================================================================
|
||||
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
|
||||
INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
|
||||
FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments"
|
||||
OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
OUT_W, OUT_H = 1080, 1920
|
||||
TARGET_AR = OUT_W / OUT_H # 0.5625
|
||||
|
||||
# ==== Debug =================================================================
|
||||
DEBUG_MODE = False
|
||||
DEBUG_SCALE = 0.6
|
||||
DRAW_GUIDES = True
|
||||
|
||||
# ==== Smooth / Switch =======================================================
|
||||
MEDIAN_WIN = 5
|
||||
EMA_ALPHA = 0.22
|
||||
DEADBAND_PX = 8.0
|
||||
SWITCH_COOLDOWN_FRAMES = 12 # kurze Ruhe nach Segmentwechsel
|
||||
ZOOM_PAD_FRAC = 0.10
|
||||
|
||||
# ==== Scene-Cut-Erkennung ===================================================
|
||||
USE_CUT_DETECT = True
|
||||
CUT_CORR_THRESH = 0.65
|
||||
CUT_COOLDOWN = 6
|
||||
|
||||
# ==== Audio-Mux =============================================================
|
||||
MUX_AUDIO = True
|
||||
FFMPEG_BIN = "ffmpeg"
|
||||
# ============================================================================
|
||||
|
||||
def clamp(v, lo, hi): return max(lo, min(hi, v))
|
||||
|
||||
def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int) -> tuple[int,int,int,int]:
|
||||
"""
|
||||
Liefert ein 9:16-Croprechteck (x,y,w,h) um (cx,cy).
|
||||
- AR bleibt IMMER exakt 9:16 (kein Squeeze)
|
||||
- ZOOM_PAD_FRAC wirkt als uniformer Scale auf Breite und Höhe
|
||||
- Rechteck bleibt vollständig im Bild
|
||||
"""
|
||||
src_ar = src_w / src_h
|
||||
|
||||
if src_ar >= TARGET_AR:
|
||||
base_h = src_h
|
||||
base_w = int(round(base_h * TARGET_AR))
|
||||
else:
|
||||
base_w = src_w
|
||||
base_h = int(round(base_w / TARGET_AR))
|
||||
|
||||
desired_scale = 1.0 + ZOOM_PAD_FRAC
|
||||
max_scale_w = src_w / base_w
|
||||
max_scale_h = src_h / base_h
|
||||
s = min(desired_scale, max_scale_w, max_scale_h)
|
||||
|
||||
w = int(round(base_w * s))
|
||||
h = int(round(base_h * s))
|
||||
|
||||
half_w, half_h = w // 2, h // 2
|
||||
|
||||
cx = clamp(cx, half_w, src_w - half_w)
|
||||
cy = clamp(cy, half_h, src_h - half_h)
|
||||
|
||||
x = int(round(cx - half_w))
|
||||
y = int(round(cy - half_h))
|
||||
return x, y, w, h
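A worked example with illustrative numbers: a 1920x1080 source has aspect ratio 1.78 >= TARGET_AR, so the base window is 608x1080 (608 = round(1080 * 9/16)); ZOOM_PAD_FRAC would scale it by 1.10, but max_scale_h = 1080/1080 caps s at 1.0, and cx is clamped into [304, 1616] — e.g. compute_crop_rect(200, 540, 1920, 1080) returns (0, 0, 608, 1080).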
|
||||
|
||||
def draw_center(img, pt, color, label=None):
|
||||
if pt is None: return
|
||||
x, y = int(pt[0]), int(pt[1])
|
||||
cv2.circle(img, (x, y), 6, color, -1)
|
||||
if label:
|
||||
cv2.putText(img, label, (x + 8, y - 8),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA)
|
||||
|
||||
def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float:
|
||||
a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV)
|
||||
b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV)
|
||||
ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256])
|
||||
hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256])
|
||||
cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX)
|
||||
corr = cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL)
|
||||
return float((corr + 1.0)/2.0)
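cv2.compareHist with HISTCMP_CORREL yields a correlation in [-1, 1]; the (corr + 1) / 2 mapping sends identical frames to 1.0, so CUT_CORR_THRESH = 0.65 corresponds to a raw correlation of about 0.3.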
|
||||
|
||||
def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path):
|
||||
cmd = [
|
||||
FFMPEG_BIN, "-y",
|
||||
"-i", str(src_video),
|
||||
"-i", str(silent_video),
|
||||
"-map", "1:v:0",
|
||||
"-map", "0:a:0?",
|
||||
"-c:v", "copy",
|
||||
"-c:a", "aac", "-b:a", "192k",
|
||||
"-shortest",
|
||||
str(out_video),
|
||||
]
|
||||
subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
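In effect the helper runs: ffmpeg -y -i SRC -i SILENT -map 1:v:0 -map 0:a:0? -c:v copy -c:a aac -b:a 192k -shortest OUT — the video stream is copied from the re-cropped file, while the audio track (if any) is taken from the original clip and re-encoded to AAC.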
|
||||
|
||||
def load_faces(name: str) -> List[Dict[str,Any]]:
|
||||
p = FACE_COMBINED_DIR / f"{name}_faces.json"
|
||||
return json.loads(p.read_text(encoding="utf-8"))
|
||||
|
||||
def load_segments(name: str, total_frames: int) -> List[Optional[int]]:
|
||||
seg_p = SEGMENTS_DIR / f"{name}_segments.json"
|
||||
map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json"
|
||||
if map_p.exists():
|
||||
target = json.loads(map_p.read_text(encoding="utf-8"))
|
||||
if len(target) < total_frames:
|
||||
target += [target[-1] if target else None] * (total_frames - len(target))
|
||||
return target[:total_frames]
|
||||
if seg_p.exists():
|
||||
segs = json.loads(seg_p.read_text(encoding="utf-8"))
|
||||
target = [None]*total_frames
|
||||
for s in segs:
|
||||
a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"]
|
||||
for t in range(max(0,a), min(total_frames, b+1)):
|
||||
target[t] = tid
|
||||
return target
|
||||
return [None]*total_frames
|
||||
|
||||
def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]:
|
||||
if target_tid is None:
|
||||
return fallback
|
||||
faces = faces_frame.get("faces", [])
|
||||
for f in faces:
|
||||
if int(f.get("track_id", -1)) == int(target_tid):
|
||||
x,y,w,h = f.get("bbox", [None,None,None,None])
|
||||
if None not in (x,y,w,h):
|
||||
return (float(x + w/2), float(y + h/2))
|
||||
return fallback
|
||||
|
||||
def main():
|
||||
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||
clips = sorted(list(INPUT_VIDEO_DIR.glob("*.mp4")) + list(INPUT_VIDEO_DIR.glob("*.mov")))
|
||||
if not clips:
|
||||
print(f"⚠️ Keine Clips in {INPUT_VIDEO_DIR}")
|
||||
return
|
||||
|
||||
for video_path in clips:
|
||||
name = video_path.stem
|
||||
faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
|
||||
if not faces_path.exists():
|
||||
print(f"⏭️ Skip (keine Faces): {faces_path.name}")
|
||||
continue
|
||||
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
if not cap.isOpened():
|
||||
print(f"❌ Kann Video nicht öffnen: {video_path.name}")
|
||||
continue
|
||||
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
|
||||
faces_all = load_faces(name)
|
||||
if len(faces_all) < total:
|
||||
faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all))
|
||||
|
||||
target_by_frame = load_segments(name, total)
|
||||
|
||||
out_path = OUTPUT_DIR / f"{name}_9x16.mp4"
|
||||
if out_path.exists():
|
||||
print(f"⏭️ Skip: Output existiert bereits → {out_path.name}")
|
||||
cap.release()
|
||||
continue
|
||||
|
||||
writer = cv2.VideoWriter(
|
||||
str(out_path),
|
||||
cv2.VideoWriter_fourcc(*"mp4v"),
|
||||
fps,
|
||||
(OUT_W, OUT_H)
|
||||
)
|
||||
|
||||
median_buf = deque(maxlen=max(3, MEDIAN_WIN if MEDIAN_WIN % 2 else MEDIAN_WIN+1))
|
||||
ema_center: Optional[Tuple[float,float]] = None
|
||||
last_center: Optional[Tuple[float,float]] = (width/2, height/2)
|
||||
switch_cooldown = 0
|
||||
|
||||
prev_small = None
|
||||
cut_cd = 0
|
||||
|
||||
print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}")
|
||||
|
||||
for t in range(total):
|
||||
ret, frame = cap.read()
|
||||
if not ret: break
|
||||
|
||||
target_tid = target_by_frame[t] if t < len(target_by_frame) else None
|
||||
faces_fr = faces_all[t] if t < len(faces_all) else {"faces":[]}
|
||||
desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2))
|
||||
|
||||
if USE_CUT_DETECT:
|
||||
small = cv2.resize(frame, (128, 72))
|
||||
if prev_small is not None:
|
||||
corr = scene_corr(prev_small, small)
|
||||
if corr < CUT_CORR_THRESH:
|
||||
ema_center = desired
|
||||
last_center = desired
|
||||
switch_cooldown = SWITCH_COOLDOWN_FRAMES
|
||||
cut_cd = CUT_COOLDOWN
|
||||
prev_small = small
|
||||
|
||||
median_buf.append(desired)
|
||||
if len(median_buf) >= 3:
|
||||
xs = sorted(p[0] for p in median_buf)
|
||||
ys = sorted(p[1] for p in median_buf)
|
||||
m = len(median_buf)//2
|
||||
desired_f = (xs[m], ys[m])
|
||||
else:
|
||||
desired_f = desired
|
||||
|
||||
if t > 0:
|
||||
prev_tid = target_by_frame[t-1] if t-1 < len(target_by_frame) else None
|
||||
else:
|
||||
prev_tid = target_tid
|
||||
|
||||
if ema_center is None:
|
||||
ema_center = desired_f
|
||||
if last_center is None:
|
||||
last_center = desired_f
|
||||
|
||||
if target_tid != prev_tid:
|
||||
ema_center = desired_f
|
||||
last_center = desired_f
|
||||
switch_cooldown = SWITCH_COOLDOWN_FRAMES
|
||||
else:
|
||||
dx = desired_f[0] - ema_center[0]
|
||||
dy = desired_f[1] - ema_center[1]
|
||||
dist = math.hypot(dx, dy)
|
||||
if cut_cd > 0:
|
||||
ema_center = desired_f
|
||||
cut_cd -= 1
|
||||
else:
|
||||
if dist > DEADBAND_PX:
|
||||
ema_center = (ema_center[0] + dx*EMA_ALPHA,
|
||||
ema_center[1] + dy*EMA_ALPHA)
|
||||
|
||||
last_center = desired_f
|
||||
|
||||
# neuer 9:16 Crop
|
||||
x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height)
|
||||
cropped = frame[y:y+h, x:x+w]
|
||||
if cropped.size == 0: cropped = frame
|
||||
final = cv2.resize(cropped, (OUT_W, OUT_H), interpolation=cv2.INTER_AREA)
|
||||
writer.write(final)
|
||||
|
||||
if DEBUG_MODE:
|
||||
dbg = frame.copy()
|
||||
cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2)
|
||||
if DRAW_GUIDES:
|
||||
draw_center(dbg, desired, (128,128,255), "desired")
|
||||
draw_center(dbg, desired_f, (255,255, 0), "median")
|
||||
draw_center(dbg, ema_center, ( 0,255,255), "ema")
|
||||
cv2.putText(dbg, f"t={t+1}/{total} tid={target_tid}",
|
||||
(12, height-14), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA)
|
||||
disp = cv2.resize(dbg, (int(width*DEBUG_SCALE), int(height*DEBUG_SCALE)))
|
||||
cv2.imshow("Apply Debug", disp)
|
||||
if cv2.waitKey(1) & 0xFF == ord("q"):
|
||||
print("🛑 Abgebrochen (q).")
|
||||
break
|
||||
|
||||
writer.release()
|
||||
cap.release()
|
||||
|
||||
if MUX_AUDIO:
|
||||
tmp = out_path.with_suffix(".tmp.mp4")
|
||||
try:
|
||||
out_path.rename(tmp)
|
||||
mux_audio_from_source(video_path, tmp, out_path)
|
||||
finally:
|
||||
if tmp.exists():
|
||||
try: tmp.unlink()
|
||||
except OSError: pass
|
||||
print(f"✅ Fertig (mit Audio): {out_path.name}")
|
||||
else:
|
||||
print(f"✅ Fertig: {out_path.name}")
|
||||
|
||||
if DEBUG_MODE:
|
||||
cv2.destroyAllWindows()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
319 src/reformat/new/main_detect_faces.py (Normal file)
@@ -0,0 +1,319 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
import time
|
||||
from pathlib import Path
|
||||
from contextlib import nullcontext
|
||||
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
from ultralytics import YOLO
|
||||
import mediapipe as mp
|
||||
|
||||
# Fortschritt hübsch, wenn verfügbar
|
||||
try:
|
||||
from tqdm import tqdm
|
||||
_HAS_TQDM = True
|
||||
except Exception:
|
||||
_HAS_TQDM = False
|
||||
|
||||
from src.reformat.new.speaking import get_mouth_openness
|
||||
|
||||
# ---------- Performance Tweaks ----------
|
||||
torch.set_float32_matmul_precision("high")
|
||||
cv2.setUseOptimized(True)
|
||||
|
||||
# ---------- Hilfsfunktionen ----------
|
||||
|
||||
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
|
||||
cx = (x1 + x2) * 0.5
|
||||
cy = (y1 + y2) * 0.5
|
||||
w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
|
||||
h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
|
||||
side = max(w, h, float(min_crop))
|
||||
half = side * 0.5
|
||||
|
||||
sx1 = int(max(0, round(cx - half)))
|
||||
sy1 = int(max(0, round(cy - half)))
|
||||
sx2 = int(min(W, round(cx + half)))
|
||||
sy2 = int(min(H, round(cy + half)))
|
||||
|
||||
side_w = max(0, sx2 - sx1)
|
||||
side_h = max(0, sy2 - sy1)
|
||||
side = max(2, min(side_w, side_h))
|
||||
sx2 = sx1 + side
|
||||
sy2 = sy1 + side
|
||||
return sx1, sy1, sx2, sy2
|
||||
|
||||
|
||||
def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
|
||||
if not lm_lists:
|
||||
return None
|
||||
cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
|
||||
best, best_d = None, 1e12
|
||||
for lms in lm_lists:
|
||||
xs = [p.x * crop_w for p in lms.landmark]
|
||||
ys = [p.y * crop_h for p in lms.landmark]
|
||||
cx = sum(xs) / len(xs)
|
||||
cy = sum(ys) / len(ys)
|
||||
d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
|
||||
if d < best_d:
|
||||
best, best_d = lms, d
|
||||
return best
|
||||
|
||||
|
||||
def run_mesh(face_mesh, crop_bgr, upscale_if_small):
|
||||
if crop_bgr.size == 0:
|
||||
return None, 0.0
|
||||
ch, cw = crop_bgr.shape[:2]
|
||||
if max(ch, cw) < upscale_if_small:
|
||||
scale = float(upscale_if_small) / max(ch, cw)
|
||||
new_w = max(1, int(round(cw * scale)))
|
||||
new_h = max(1, int(round(ch * scale)))
|
||||
crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
|
||||
ch, cw = new_h, new_w
|
||||
rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
|
||||
res = face_mesh.process(rgb)
|
||||
if not res.multi_face_landmarks:
|
||||
return None, 0.0
|
||||
chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
|
||||
if chosen is None:
|
||||
return None, 0.0
|
||||
mo = get_mouth_openness(chosen.landmark, ch)
|
||||
return chosen, float(mo)
|
||||
|
||||
# ---------- Kernprozess ----------
|
||||
|
||||
def process_video(video_path: Path,
|
||||
output_path: Path,
|
||||
model: YOLO,
|
||||
face_mesh,
|
||||
conf_thresh: float,
|
||||
frame_skip: int,
|
||||
downscale: float,
|
||||
expansion_1: float,
|
||||
expansion_2: float,
|
||||
min_crop: int,
|
||||
faces_upscale: int,
|
||||
imgsz: int,
|
||||
device: str,
|
||||
max_det: int):
|
||||
print(f"🎬 Starte Detection: {video_path.name}")
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
if not cap.isOpened():
|
||||
logging.error(f"❌ Kann Video nicht öffnen: {video_path}")
|
||||
return
|
||||
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
|
||||
# Wenn frame_skip > 1, reduziert sich die tatsächlich verarbeitete Anzahl
|
||||
total_to_process = None
|
||||
if total_frames_raw > 0:
|
||||
total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)
|
||||
|
||||
scaled_w = max(1, int(round(orig_w * downscale)))
|
||||
scaled_h = max(1, int(round(orig_h * downscale)))
|
||||
|
||||
data = []
|
||||
frame_idx = 0
|
||||
processed_frames = 0
|
||||
|
||||
sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
|
||||
sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0
|
||||
|
||||
autocast_ctx = (
|
||||
torch.autocast(device_type=device, dtype=torch.float16)
|
||||
if device in ("mps", "cuda") else nullcontext()
|
||||
)
|
||||
|
||||
# Fortschrittsbalken pro Video
|
||||
bar = None
|
||||
start_t = time.time()
|
||||
if _HAS_TQDM:
|
||||
bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)
|
||||
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
if frame_skip > 1 and (frame_idx % frame_skip != 0):
|
||||
frame_idx += 1
|
||||
continue
|
||||
|
||||
frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)
|
||||
|
||||
with torch.no_grad():
|
||||
with autocast_ctx:
|
||||
detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
|
||||
conf=conf_thresh, iou=0.5, max_det=max_det)[0]
|
||||
|
||||
faces = []
|
||||
for i in range(len(detections.boxes)):
|
||||
box = detections.boxes[i]
|
||||
conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
|
||||
if conf < conf_thresh:
|
||||
continue
|
||||
x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
|
||||
if downscale != 1.0:
|
||||
x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
|
||||
x1 = max(0.0, min(x1, orig_w - 1))
|
||||
y1 = max(0.0, min(y1, orig_h - 1))
|
||||
x2 = max(0.0, min(x2, orig_w - 1))
|
||||
y2 = max(0.0, min(y2, orig_h - 1))
|
||||
|
||||
w = max(1.0, x2 - x1)
|
||||
h = max(1.0, y2 - y1)
|
||||
cx = x1 + w / 2.0
|
||||
cy = y1 + h / 2.0
|
||||
|
||||
# Pass 1
|
||||
sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
|
||||
if sx2 - sx1 < 4 or sy2 - sy1 < 4:
|
||||
continue
|
||||
face_crop = frame[sy1:sy2, sx1:sx2]
|
||||
_, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)
|
||||
|
||||
# Pass 2 nur wenn nötig
|
||||
if mouth_open == 0.0:
|
||||
sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
|
||||
if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
|
||||
face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
|
||||
_, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)
|
||||
|
||||
faces.append({
|
||||
"bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
|
||||
"conf": round(conf, 3),
|
||||
"center": [round(cx, 1), round(cy, 1)],
|
||||
"mouth_openness": round(float(mouth_open), 3)
|
||||
})
|
||||
|
||||
data.append({
|
||||
"frame": frame_idx,
|
||||
"timestamp": round(frame_idx / fps, 3),
|
||||
"W": orig_w,
|
||||
"H": orig_h,
|
||||
"faces": faces
|
||||
})
|
||||
frame_idx += 1
|
||||
processed_frames += 1
|
||||
|
||||
# Fortschritt aktualisieren
|
||||
if _HAS_TQDM:
|
||||
bar.update(1)
|
||||
else:
|
||||
# leichter Fallback: ETA Ausgabe alle 30 verarbeitete Frames
|
||||
if processed_frames % 30 == 0:
|
||||
elapsed = time.time() - start_t
|
||||
rate = processed_frames / max(1e-6, elapsed) # frames/sec
|
||||
if total_to_process:
|
||||
remaining = max(0, total_to_process - processed_frames)
|
||||
eta_sec = remaining / max(1e-6, rate)
|
||||
print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
|
||||
f"({processed_frames/total_to_process*100:.1f}%) "
|
||||
f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
|
||||
else:
|
||||
print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")
|
||||
|
||||
cap.release()
|
||||
if _HAS_TQDM and bar is not None:
|
||||
bar.close()
|
||||
|
||||
output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
|
||||
print(f"✅ Faces gespeichert: {output_path.name}")
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
# Verzeichnisse
|
||||
parser.add_argument("--input-dir", type=Path,
|
||||
default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips"))
|
||||
parser.add_argument("--output-dir", type=Path,
|
||||
default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined"))
|
||||
parser.add_argument("--model", type=Path,
|
||||
default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt"))
|
||||
# Optimierte Defaults (keine Presets nötig)
|
||||
parser.add_argument("--conf-thresh", type=float, default=0.35)
|
||||
parser.add_argument("--frame-skip", type=int, default=1)
|
||||
parser.add_argument("--downscale", type=float, default=0.5)
|
||||
parser.add_argument("--expansion", type=float, default=0.4)
|
||||
parser.add_argument("--expansion2", type=float, default=0.8)
|
||||
parser.add_argument("--min-crop", type=int, default=160)
|
||||
parser.add_argument("--faces-upscale", type=int, default=192)
|
||||
parser.add_argument("--imgsz", type=int, default=448)
|
||||
parser.add_argument("--max-det", type=int, default=20)
|
||||
parser.add_argument("--use-refine", action="store_true", default=False)
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Model & Device
|
||||
yolo = YOLO(str(args.model))
|
||||
if torch.backends.mps.is_available():
|
||||
device = "mps"
|
||||
elif torch.cuda.is_available():
|
||||
device = "cuda"
|
||||
else:
|
||||
device = "cpu"
|
||||
yolo.to(device)
|
||||
print(f"🖥️ Inference-Device: {device}")
|
||||
|
||||
# Warmup (reduziert Anlaufschwankungen)
|
||||
try:
|
||||
with torch.no_grad():
|
||||
dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
|
||||
_ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Liste der Videos (für Gesamt-Fortschritt)
|
||||
videos = sorted(args.input_dir.glob("*.mp4"))
|
||||
print(f"🔍 Input-Ordner: {args.input_dir.resolve()}")
|
||||
print("📁 Dateien:")
|
||||
for p in sorted(args.input_dir.glob("*")):
|
||||
print(" →", p.name)
|
||||
|
||||
# Gesamt-Fortschrittsbalken pro Datei
|
||||
outer = None
|
||||
if _HAS_TQDM:
|
||||
outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False)
|
||||
|
||||
with mp.solutions.face_mesh.FaceMesh(
|
||||
static_image_mode=False,
|
||||
max_num_faces=10,
|
||||
refine_landmarks=args.use_refine,
|
||||
min_detection_confidence=0.5,
|
||||
min_tracking_confidence=0.5
|
||||
) as face_mesh:
|
||||
for vid in videos:
|
||||
out = args.output_dir / f"{vid.stem}_faces.json"
|
||||
process_video(
|
||||
video_path=vid,
|
||||
output_path=out,
|
||||
model=yolo,
|
||||
face_mesh=face_mesh,
|
||||
conf_thresh=args.conf_thresh,
|
||||
frame_skip=args.frame_skip,
|
||||
downscale=args.downscale,
|
||||
expansion_1=args.expansion,
|
||||
expansion_2=args.expansion2,
|
||||
min_crop=args.min_crop,
|
||||
faces_upscale=args.faces_upscale,
|
||||
imgsz=args.imgsz,
|
||||
device=device,
|
||||
max_det=args.max_det
|
||||
)
|
||||
if _HAS_TQDM and outer is not None:
|
||||
outer.update(1)
|
||||
|
||||
if _HAS_TQDM and outer is not None:
|
||||
outer.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
67 src/reformat/new/main_track_faces.py (Normal file)
@@ -0,0 +1,67 @@
|
||||
#!/usr/bin/env python3
|
||||
import logging, json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any
|
||||
|
||||
def iou(boxA, boxB):
|
||||
xA = max(boxA[0], boxB[0])
|
||||
yA = max(boxA[1], boxB[1])
|
||||
xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2])
|
||||
yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3])
|
||||
interW, interH = max(0, xB-xA), max(0, yB-yA)
|
||||
inter = interW * interH
|
||||
union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter
|
||||
return inter/union if union > 0 else 0.0
|
||||
|
||||
def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3):
|
||||
next_id = 0
|
||||
last_boxes = {} # track_id -> bbox
|
||||
for frame in faces_all:
|
||||
new_boxes = {}
|
||||
for face in frame["faces"]:
|
||||
box = face["bbox"]
|
||||
# match gegen bestehende
|
||||
best_id, best_iou = None, 0.0
|
||||
for tid, prev_box in last_boxes.items():
|
||||
ov = iou(box, prev_box)
|
||||
if ov > best_iou:
|
||||
best_id, best_iou = tid, ov
|
||||
if best_iou > iou_thresh:
|
||||
face["track_id"] = best_id
|
||||
new_boxes[best_id] = box
|
||||
else:
|
||||
face["track_id"] = next_id
|
||||
new_boxes[next_id] = box
|
||||
next_id += 1
|
||||
last_boxes = new_boxes
|
||||
return faces_all
|
||||
|
||||
def main():
|
||||
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
|
||||
FACE_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||
|
||||
for f in FACE_DIR.glob("*_faces.json"):
|
||||
try:
|
||||
faces_all = json.loads(f.read_text(encoding="utf-8"))
|
||||
except Exception as e:
|
||||
print(f"❌ Fehler beim Laden {f.name}: {e}")
|
||||
continue
|
||||
|
||||
tracked = track_faces(faces_all)
|
||||
f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8")
|
||||
print(f"✅ Track-IDs ergänzt: {f.name}")
|
||||
|
||||
# zusätzlich centers.json (dominant = höchster mouth_openness pro Frame)
|
||||
centers = []
|
||||
for fr in tracked:
|
||||
if fr["faces"]:
|
||||
best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0))
|
||||
centers.append([best["center"][0], best["center"][1]])
|
||||
else:
|
||||
centers.append([fr["W"]/2, fr["H"]/2])
|
||||
centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json")
|
||||
centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8")
|
||||
print(f"📝 Centers gespeichert: {centers_path.name}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
179 src/reformat/new/make_segments.py (Normal file)
@@ -0,0 +1,179 @@
|
||||
#!/usr/bin/env python3
|
||||
# src/reformat/new/make_segments.py
|
||||
from __future__ import annotations
|
||||
import json, math
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
# ==== Pfade (an dein Projekt angepasst) =====================================
|
||||
PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit")
|
||||
RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" # Videos
|
||||
FACE_COMBINED_DIR= PROJECT_ROOT / "data" / "face_data_combined" # *_faces.json
|
||||
SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" # Output
|
||||
SEGMENTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
# ===========================================================================
|
||||
|
||||
# === Segment-Parameter ===
|
||||
WIN_SEC = 1.2 # Fensterlänge
|
||||
STRIDE_SEC = 0.6 # Schrittweite
|
||||
HYSTERESIS_FACTOR = 1.25 # neuer Sprecher muss +25% besser sein
|
||||
MIN_SEG_SEC = 1.0 # kürzere Segmente werden an Nachbarn gemerged
|
||||
CONF_MIN = 0.35 # Sichtbarkeits-Threshold
|
||||
AREA_CAP_FRAC = 0.12 # ab 12% Framefläche kappen wir den Flächenbonus
|
||||
|
||||
@dataclass
|
||||
class Segment:
|
||||
start_f: int
|
||||
end_f: int
|
||||
track_id: Optional[int]
|
||||
|
||||
def robust_minmax(vals, p_lo=5, p_hi=95):
|
||||
v = np.array(vals, dtype=float)
|
||||
lo, hi = np.percentile(v, [p_lo, p_hi])
|
||||
if hi <= lo: hi = lo + 1e-6
|
||||
return float(lo), float(hi)
|
||||
|
||||
def score_face(face: Dict[str,Any], W: int, H: int, cx: float, cy: float,
|
||||
lo: float, hi: float) -> float:
|
||||
# Mundaktivität robust normalisieren
|
||||
mo = float(face.get("mouth_openness", 0.0))
|
||||
mo = (mo - lo) / (hi - lo + 1e-9)
|
||||
mo = float(np.clip(mo, 0.0, 1.0))
|
||||
|
||||
x, y, w, h = map(float, face.get("bbox", [0,0,0,0]))
|
||||
conf = float(face.get("conf", 1.0))
|
||||
if conf < CONF_MIN or w <= 5 or h <= 5: # sehr kleine/unsichere Gesichter raus
|
||||
return 0.0
|
||||
|
||||
area = (w*h) / (W*H + 1e-9)
|
||||
size_w = min(1.0, area / AREA_CAP_FRAC) # Flächengewicht
|
||||
fx = x + w/2; fy = y + h/2
|
||||
dist = math.hypot(fx - cx, fy - cy) / math.hypot(W/2, H/2)
|
||||
center_w = max(0.0, 1.0 - dist**2) # Mitte leicht bevorzugen
|
||||
|
||||
# MO dominiert, Fläche und Mitte geben Stabilität
|
||||
return mo * (0.6 + 0.3*size_w + 0.1*center_w)
|
||||
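# Worked example with illustrative values: a face with high normalized mouth activity that
# covers about 6% of the frame (size_w ≈ 0.5) and sits exactly at the frame centre
# (center_w = 1.0) scores roughly 0.8 * (0.6 + 0.3*0.5 + 0.1*1.0) ≈ 0.68.
_example_face = {"bbox": [784, 364, 352, 352], "mouth_openness": 0.8, "conf": 0.9}
print(score_face(_example_face, W=1920, H=1080, cx=960.0, cy=540.0, lo=0.0, hi=1.0))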
|
||||
def build_segments_for_clip(faces_per_frame: List[Dict[str,Any]], fps: float) -> Tuple[List[Segment], List[Optional[int]]]:
|
||||
T = len(faces_per_frame)
|
||||
if T == 0:
|
||||
return [], []
|
||||
|
||||
# Framegröße
|
||||
W = faces_per_frame[0].get("W") or faces_per_frame[0].get("width")
|
||||
H = faces_per_frame[0].get("H") or faces_per_frame[0].get("height")
|
||||
if not W or not H:
|
||||
# Versuch, aus BBox-Max abzuleiten (Fallback)
|
||||
max_w = max((f["bbox"][0]+f["bbox"][2]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1920
|
||||
max_h = max((f["bbox"][1]+f["bbox"][3]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1080
|
||||
W, H = int(max_w), int(max_h)
|
||||
|
||||
# Mundwerte für robuste Normierung sammeln
|
||||
all_mo = [float(f.get("mouth_openness", 0.0))
|
||||
for fr in faces_per_frame for f in fr.get("faces", [])]
|
||||
lo, hi = robust_minmax(all_mo) if all_mo else (0.0, 1.0)
|
||||
|
||||
win = max(1, int(round(WIN_SEC * fps)))
|
||||
stride = max(1, int(round(STRIDE_SEC * fps)))
|
||||
minseg = max(1, int(round(MIN_SEG_SEC * fps)))
|
||||
|
||||
chosen_by_frame: List[Optional[int]] = [None]*T
|
||||
last_track: Optional[int] = None
|
||||
|
||||
for start in range(0, T, stride):
|
||||
end = min(T, start + win)
|
||||
sums: Dict[int, float] = {}
|
||||
for t in range(start, end):
|
||||
faces = faces_per_frame[t].get("faces", [])
|
||||
if not faces: continue
|
||||
for face in faces:
|
||||
tid = face.get("track_id")
|
||||
if tid is None:
|
||||
continue
|
||||
s = score_face(face, W, H, W/2, H/2, lo, hi)
|
||||
if s <= 0:
|
||||
continue
|
||||
tid = int(tid)
|
||||
sums[tid] = sums.get(tid, 0.0) + s
|
||||
|
||||
if not sums:
|
||||
chosen = last_track
|
||||
else:
|
||||
best_tid, best_val = max(sums.items(), key=lambda kv: kv[1])
|
||||
if last_track is None:
|
||||
chosen = best_tid
|
||||
else:
|
||||
prev_val = sums.get(last_track, 0.0)
|
||||
chosen = best_tid if best_val > prev_val * HYSTERESIS_FACTOR else last_track
|
||||
|
||||
for t in range(start, end):
|
||||
chosen_by_frame[t] = chosen
|
||||
last_track = chosen
|
||||
|
||||
# Lücken auffüllen
|
||||
for t in range(T):
|
||||
if chosen_by_frame[t] is None:
|
||||
chosen_by_frame[t] = last_track
|
||||
|
||||
# Segmente bauen
|
||||
segs: List[Segment] = []
|
||||
cur = chosen_by_frame[0]
|
||||
seg_start = 0
|
||||
for t in range(1, T):
|
||||
if chosen_by_frame[t] != cur:
|
||||
segs.append(Segment(seg_start, t-1, cur))
|
||||
cur = chosen_by_frame[t]
|
||||
seg_start = t
|
||||
segs.append(Segment(seg_start, T-1, cur))
|
||||
|
||||
# Mindestlänge: zu kurze an vorheriges mergen
|
||||
out: List[Segment] = []
|
||||
for s in segs:
|
||||
if out and (s.end_f - s.start_f + 1) < minseg:
|
||||
out[-1].end_f = s.end_f
|
||||
else:
|
||||
out.append(s)
|
||||
|
||||
return out, chosen_by_frame
|
||||
|
||||
def main():
|
||||
clips = sorted(list(RAW_DIR.glob("*.mp4")) + list(RAW_DIR.glob("*.mov")))
|
||||
if not clips:
|
||||
print(f"⚠️ Keine Videos in {RAW_DIR}")
|
||||
return
|
||||
|
||||
for vid in clips:
|
||||
name = vid.stem
|
||||
faces_path = FACE_COMBINED_DIR / f"{name}_faces.json"
|
||||
if not faces_path.exists():
|
||||
print(f"⏭️ Skip (keine Faces): {faces_path.name}")
|
||||
continue
|
||||
|
||||
# FPS vom Video
|
||||
cap = cv2.VideoCapture(str(vid))
|
||||
if not cap.isOpened():
|
||||
print(f"❌ Kann Video nicht öffnen: {vid.name}")
|
||||
continue
|
||||
fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
|
||||
cap.release()
|
||||
|
||||
try:
|
||||
faces_per_frame = json.loads(faces_path.read_text(encoding="utf-8"))
|
||||
except Exception as e:
|
||||
print(f"❌ Fehler beim Lesen {faces_path.name}: {e}")
|
||||
continue
|
||||
|
||||
segs, chosen = build_segments_for_clip(faces_per_frame, fps)
|
||||
|
||||
seg_out = SEGMENTS_DIR / f"{name}_segments.json"
|
||||
map_out = SEGMENTS_DIR / f"{name}_target_by_frame.json"
|
||||
seg_out.write_text(json.dumps([s.__dict__ for s in segs], ensure_ascii=False), encoding="utf-8")
|
||||
map_out.write_text(json.dumps(chosen, ensure_ascii=False), encoding="utf-8")
|
||||
|
||||
print(f"✅ Segmente erzeugt: {seg_out.name} ({len(segs)} Segmente)")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
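# Illustrative sketch for consuming the output written above: converting the frame-indexed
# segments back to seconds. The clip name and fps are placeholders for your own data.
import json
from pathlib import Path

_seg_path = Path("data/segments/testVideoShort_segments.json")  # hypothetical file name
_fps = 25.0
for _seg in json.loads(_seg_path.read_text(encoding="utf-8")):
    _start_s = _seg["start_f"] / _fps
    _end_s = (_seg["end_f"] + 1) / _fps  # end_f is inclusive in the Segment dataclass
    print(f"track {_seg['track_id']}: {_start_s:.2f}s - {_end_s:.2f}s")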
58  src/reformat/new/smart_speaker_tracker.py  Normal file
@@ -0,0 +1,58 @@
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
from .tracking import FaceTracker
|
||||
|
||||
class SmartSpeakerTracker:
|
||||
def __init__(self):
|
||||
self.face_tracker = FaceTracker()
|
||||
self.movement_per_id: Dict[int, float] = {}
|
||||
self.prev_openness: Dict[int, float] = {}
|
||||
self.confirmation_counter: Dict[int, int] = {}
|
||||
self.speaker_threshold = 3.0 # wie viel Lippenbewegung braucht es mind.
|
||||
self.decay_factor = 0.9 # wie schnell "verblasst" die Bewegung
|
||||
self.speaker_confirm_frames = 25 # wie viele Frames muss ein Sprecher dominieren
|
||||
self.speaker_id: Optional[int] = None
|
||||
|
||||
def update(self, faces: List[Dict]) -> Tuple[float, float]:
|
||||
if not faces:
|
||||
return self.face_tracker.update([])
|
||||
|
||||
# Lippenbewegung analysieren
|
||||
for face in faces:
|
||||
id = face.get("id")
|
||||
openness = face.get("mouth_openness", 0.0)
|
||||
prev = self.prev_openness.get(id, openness)
|
||||
movement = abs(openness - prev)
|
||||
|
||||
# Bewegung aufaddieren mit Decay
|
||||
old_score = self.movement_per_id.get(id, 0.0) * self.decay_factor
|
||||
self.movement_per_id[id] = old_score + movement
|
||||
self.prev_openness[id] = openness
|
||||
|
||||
# Finde ID mit größter Bewegung
|
||||
if self.movement_per_id:
|
||||
top_id = max(self.movement_per_id, key=self.movement_per_id.get)
|
||||
top_movement = self.movement_per_id[top_id]
|
||||
|
||||
if top_movement >= self.speaker_threshold:
|
||||
self.confirmation_counter[top_id] = self.confirmation_counter.get(top_id, 0) + 1
|
||||
# Andere runterzählen
|
||||
for other_id in self.confirmation_counter:
|
||||
if other_id != top_id:
|
||||
self.confirmation_counter[other_id] = max(0, self.confirmation_counter[other_id] - 1)
|
||||
|
||||
# Wenn lange genug bestätigt, neuer Sprecher
|
||||
if self.confirmation_counter[top_id] >= self.speaker_confirm_frames:
|
||||
self.speaker_id = top_id
|
||||
else:
|
||||
# Wenn keiner über der Schwelle → kein neuer Sprecher
|
||||
self.confirmation_counter = {k: max(0, v - 1) for k, v in self.confirmation_counter.items()}
|
||||
|
||||
# Sprecher vorhanden → dahin zentrieren
|
||||
if self.speaker_id is not None:
|
||||
for face in faces:
|
||||
if face.get("id") == self.speaker_id:
|
||||
return tuple(face["center"])
|
||||
|
||||
# Fallback: stabiler Durchschnitt
|
||||
centers = [tuple(face["center"]) for face in faces]
|
||||
return self.face_tracker.update(centers)
|
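# Minimal usage sketch (assumes the package-relative import of FaceTracker above resolves,
# i.e. this module runs as part of the src.reformat.new package). Face dicts are mocked:
# id 1 keeps its lips moving, id 2 stays almost still.
tracker = SmartSpeakerTracker()
for frame_idx in range(60):
    faces = [
        {"id": 1, "center": (500.0, 400.0), "mouth_openness": 5.0 if frame_idx % 2 else 0.0},
        {"id": 2, "center": (1400.0, 400.0), "mouth_openness": 0.1},
    ]
    cx, cy = tracker.update(faces)
# After ~25 confirmed frames the tracker should lock onto id 1 and return its center.
print(tracker.speaker_id, (cx, cy))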
67  src/reformat/new/speaker_crop_from_segments.py  Normal file
@@ -0,0 +1,67 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List, Dict
|
||||
|
||||
# === Pfade ===
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[2]
|
||||
|
||||
FACES_PATH = PROJECT_ROOT / "data" / "face_data_combined" / "testVideoShort_faces.json"
|
||||
SEGMENTS_PATH = PROJECT_ROOT / "data" / "transkripte" / "testVideoShort_segments.json"
|
||||
OUTPUT_PATH = PROJECT_ROOT / "data" / "face_crop_centers" / "testVideoShort_centers.json"
|
||||
|
||||
FPS = 25 # Muss zur Framerate deines Videos passen
|
||||
|
||||
# === Dateien laden ===
|
||||
with open(FACES_PATH) as f:
|
||||
face_data = json.load(f)
|
||||
|
||||
with open(SEGMENTS_PATH) as f:
|
||||
segments = json.load(f)
|
||||
|
||||
# === Zentrierungen pro Frame bestimmen ===
|
||||
frame_centers: List[List[float]] = []
|
||||
|
||||
for segment in segments:
|
||||
start_sec = segment["start"]
|
||||
end_sec = segment["end"]
|
||||
start_f = int(start_sec * FPS)
|
||||
end_f = int(end_sec * FPS)
|
||||
|
||||
# Lippenbewegung pro ID in diesem Segment aufaddieren
|
||||
movement: Dict[int, float] = {}
|
||||
count: Dict[int, int] = {}
|
||||
|
||||
for f in range(start_f, min(end_f, len(face_data))):
|
||||
for face in face_data[f]["faces"]:
|
||||
id = face.get("id")
|
||||
openness = face.get("mouth_openness", 0.0)
|
||||
movement[id] = movement.get(id, 0.0) + openness
|
||||
count[id] = count.get(id, 0) + 1
|
||||
|
||||
# Durchschnitt berechnen
|
||||
avg_movement = {id: movement[id] / count[id] for id in movement if count[id] > 0}
|
||||
if not avg_movement:
|
||||
speaker_id = None
|
||||
else:
|
||||
speaker_id = max(avg_movement, key=avg_movement.get)
|
||||
|
||||
# Für jedes Frame in diesem Segment den Sprecher zentrieren
|
||||
for f in range(start_f, min(end_f, len(face_data))):
|
||||
faces = face_data[f].get("faces", [])
|
||||
center = [960.0, 540.0] # Fallback
|
||||
|
||||
if speaker_id is not None:
|
||||
for face in faces:
|
||||
if face.get("id") == speaker_id:
|
||||
center = face["center"][:2]
|
||||
break
|
||||
|
||||
frame_centers.append([round(center[0], 2), round(center[1], 2)])
|
||||
|
||||
# === Ergebnis speichern ===
|
||||
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(OUTPUT_PATH, "w") as f:
|
||||
json.dump(frame_centers, f, indent=2)
|
||||
|
||||
print(f"✅ Zentrierung auf Sprecher für {len(frame_centers)} Frames gespeichert unter:\n{OUTPUT_PATH}")
|
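# FPS is hard-coded to 25 above; a small sketch (assuming OpenCV is available) that reads the
# actual frame rate of the matching raw clip instead:
import cv2
_video_path = PROJECT_ROOT / "data" / "output" / "raw_clips" / "testVideoShort.mp4"
_cap = cv2.VideoCapture(str(_video_path))
_fps = _cap.get(cv2.CAP_PROP_FPS) or 25.0  # fall back to 25 if it cannot be read
_cap.release()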
84  src/reformat/new/tracking.py  Normal file
@@ -0,0 +1,84 @@
|
||||
from typing import List, Tuple, Optional
|
||||
|
||||
|
||||
class FaceTracker:
|
||||
def __init__(
|
||||
self,
|
||||
dist_threshold: float = 200.0,
|
||||
switch_frames: int = 5,
|
||||
panning_window: int = 10,
|
||||
panning_threshold: float = 40.0,
|
||||
smooth_window: int = 3,
|
||||
scene_jump_threshold: float = 400.0
|
||||
):
|
||||
self.dist_threshold = dist_threshold
|
||||
self.switch_frames = switch_frames
|
||||
self.panning_window = panning_window
|
||||
self.panning_threshold = panning_threshold
|
||||
self.smooth_window = smooth_window
|
||||
self.scene_jump_threshold = scene_jump_threshold
|
||||
|
||||
self.current_center: Tuple[float, float] = (960.0, 540.0) # Default Mitte (bei 1920x1080)
|
||||
self.raw_center: Tuple[float, float] = self.current_center
|
||||
self.prev_center: Tuple[float, float] = self.current_center
|
||||
self.prev_raw: Tuple[float, float] = self.current_center
|
||||
self.candidate_center: Optional[Tuple[float, float]] = None
|
||||
self.switch_counter = 0
|
||||
|
||||
self.recent_raw_centers: List[Tuple[float, float]] = []
|
||||
self.recent_final_centers: List[Tuple[float, float]] = []
|
||||
|
||||
def update(self, candidates: List[Tuple[float, float]]) -> Tuple[float, float]:
|
||||
if not candidates:
|
||||
# kein Gesicht → verwende alten Wert
|
||||
self.recent_raw_centers.append(self.raw_center)
|
||||
self.recent_final_centers.append(self.current_center)
|
||||
return self.current_center
|
||||
|
||||
# nehme das Gesicht, das am nächsten zur vorherigen Position ist
|
||||
new_center = min(candidates, key=lambda pt: self._distance(self.prev_center, pt))
|
||||
self.raw_center = new_center
|
||||
self.recent_raw_centers.append(new_center)
|
||||
|
||||
dist = self._distance(self.prev_raw, new_center)
|
||||
if dist > self.scene_jump_threshold:
|
||||
self.current_center = new_center
|
||||
self.prev_center = new_center
|
||||
self.prev_raw = new_center
|
||||
self._smooth_reset()
|
||||
return self.current_center
|
||||
|
||||
if dist > self.dist_threshold:
|
||||
if self.candidate_center != new_center:
|
||||
self.candidate_center = new_center
|
||||
self.switch_counter = 1
|
||||
else:
|
||||
self.switch_counter += 1
|
||||
if self.switch_counter >= self.switch_frames:
|
||||
self.prev_center = self.current_center
|
||||
self.current_center = new_center
|
||||
self.prev_raw = new_center
|
||||
self.switch_counter = 0
|
||||
else:
|
||||
self.switch_counter = 0
|
||||
self.prev_raw = new_center
|
||||
|
||||
# Smoothes Nachziehen
|
||||
smoothed = self._moving_average(self.current_center, new_center, self.smooth_window)
|
||||
self.prev_center = self.current_center
|
||||
self.current_center = smoothed
|
||||
self.recent_final_centers.append(smoothed)
|
||||
|
||||
return smoothed
|
||||
|
||||
def _moving_average(self, old, new, factor):
|
||||
x = (old[0] * (factor - 1) + new[0]) / factor
|
||||
y = (old[1] * (factor - 1) + new[1]) / factor
|
||||
return (x, y)
|
||||
|
||||
def _distance(self, pt1, pt2):
|
||||
return ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5
|
||||
|
||||
def _smooth_reset(self):
|
||||
self.recent_raw_centers.clear()
|
||||
self.recent_final_centers.clear()
|
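# Usage sketch with synthetic candidate centers (one center per detected face and frame):
tracker = FaceTracker()
for candidates in [
    [(800.0, 500.0)],                    # single face
    [(805.0, 502.0), (1500.0, 400.0)],   # second face appears; the nearer one is kept
    [],                                  # detection dropout: previous center is reused
    [(1500.0, 400.0)],                   # jump > scene_jump_threshold: snapped immediately
]:
    cx, cy = tracker.update(candidates)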
129  src/reformat/new/utils.py  Normal file
@@ -0,0 +1,129 @@
|
||||
# src/utils.py
|
||||
from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Tuple
|
||||
|
||||
try:
|
||||
import cv2
|
||||
except Exception:
|
||||
cv2 = None  # allows importing this module without OpenCV (e.g. for plain unit tests)
|
||||
|
||||
# --- Logging ---------------------------------------------------------------
|
||||
|
||||
def setup_logging(debug: bool = False) -> None:
|
||||
level = logging.DEBUG if debug else logging.INFO
|
||||
logging.basicConfig(
|
||||
level=level,
|
||||
format="%(asctime)s | %(levelname)s | %(message)s",
|
||||
)
|
||||
|
||||
# --- Mathe/Helpers ---------------------------------------------------------
|
||||
|
||||
def clamp(v: float, lo: float, hi: float) -> float:
|
||||
return max(lo, min(hi, v))
|
||||
|
||||
def compute_crop_width(orig_w: int, orig_h: int, out_w: int = 1080, out_h: int = 1920) -> int:
|
||||
# Für 9:16 Ziel: Breite = (9/16) * orig_h, standardmäßig 1080x1920
|
||||
return int((out_w / out_h) * orig_h)
|
||||
|
||||
def iou(boxA, boxB) -> float:
|
||||
"""Berechnet Intersection-over-Union zweier Bounding-Boxes."""
|
||||
ax1, ay1, aw, ah = boxA
|
||||
ax2, ay2 = ax1 + aw, ay1 + ah
|
||||
bx1, by1, bw, bh = boxB
|
||||
bx2, by2 = bx1 + bw, by1 + bh
|
||||
|
||||
inter_x1 = max(ax1, bx1)
|
||||
inter_y1 = max(ay1, by1)
|
||||
inter_x2 = min(ax2, bx2)
|
||||
inter_y2 = min(ay2, by2)
|
||||
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
|
||||
|
||||
union_area = aw * ah + bw * bh - inter_area
|
||||
return inter_area / union_area if union_area > 0 else 0
|
||||
|
||||
# --- IO --------------------------------------------------------------------
|
||||
|
||||
def load_json(path: Path) -> Any:
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"Datei fehlt: {path}")
|
||||
with open(path, "r", encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
def save_json(obj: Any, path: Path) -> None:
|
||||
path.parent.mkdir(parents=True, exist_ok=True)
|
||||
with open(path, "w", encoding="utf-8") as f:
|
||||
json.dump(obj, f, ensure_ascii=False, indent=2)
|
||||
|
||||
def ensure_exists(path: Path, what: str = "Datei/Ordner") -> None:
|
||||
if not path.exists():
|
||||
raise FileNotFoundError(f"{what} nicht gefunden: {path}")
|
||||
|
||||
# --- Video / Pfade ---------------------------------------------------------
|
||||
|
||||
def get_fps(video_path: Path, fallback: float = 25.0) -> float:
|
||||
if cv2 is None:
|
||||
logging.warning("OpenCV nicht verfügbar – nutze FPS-Fallback %.2f", fallback)
|
||||
return fallback
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
cap.release()
|
||||
if not fps or fps <= 1e-3:
|
||||
logging.warning("Konnte FPS nicht lesen – nutze Fallback %.2f", fallback)
|
||||
return fallback
|
||||
return float(fps)
|
||||
|
||||
def project_root_from(file: Path) -> Path:
|
||||
# utils.py lives under src/reformat/new/, so the project root is three parents up:
|
||||
return file.resolve().parents[3]
|
||||
|
||||
def resolve_paths(project_root: Path, base_name: str) -> Dict[str, Path]:
|
||||
data = project_root / "data"
|
||||
return {
|
||||
"timed_path": data / "transkripte" / f"{base_name}_timed.txt",
|
||||
"segments_path":data / "transkripte" / f"{base_name}_segments.json",
|
||||
"faces_path": data / "face_data_combined" / f"{base_name}_faces.json",
|
||||
"centers_path": data / "face_crop_centers" / f"{base_name}_centers.json",
|
||||
"video_path": data / "output" / "raw_clips" / f"{base_name}.mp4",
|
||||
"out_9x16_dir": project_root / "output" / "output_9x16_final",
|
||||
"face_debug_dir": project_root / "output" / "debug" / "faces",
|
||||
}
|
||||
|
||||
def require_api_key(env_name: str = "OPENAI_API_KEY") -> str:
|
||||
key = os.getenv(env_name)
|
||||
if not key:
|
||||
raise RuntimeError(
|
||||
f"Umgebungsvariable {env_name} fehlt. "
|
||||
f"Exportiere sie z.B.: export {env_name}='sk-...'")
|
||||
return key
|
||||
|
||||
# --- Simple smoothing for centers ------------------------------------------
|
||||
|
||||
from typing import List, Optional
|
||||
|
||||
class CenterSmoother:
|
||||
"""Glättet Zentren mit Moving Average und optionaler Jump-Erkennung."""
|
||||
def __init__(self, window: int = 7, jump_thresh: float = 120.0):
|
||||
self.window = window
|
||||
self.jump_thresh = jump_thresh
|
||||
self.buffer: List[Tuple[float, float]] = []
|
||||
self.prev: Optional[Tuple[float, float]] = None
|
||||
|
||||
def push(self, cx: float, cy: float) -> Tuple[float, float]:
|
||||
if self.prev is not None:
|
||||
dx = abs(cx - self.prev[0]) + abs(cy - self.prev[1])
|
||||
if dx > self.jump_thresh:
|
||||
# harter Cut: reset buffer
|
||||
self.buffer.clear()
|
||||
|
||||
self.buffer.append((cx, cy))
|
||||
if len(self.buffer) > self.window:
|
||||
self.buffer.pop(0)
|
||||
|
||||
avgx = sum(p[0] for p in self.buffer) / len(self.buffer)
|
||||
avgy = sum(p[1] for p in self.buffer) / len(self.buffer)
|
||||
self.prev = (avgx, avgy)
|
||||
return self.prev
|
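# Usage sketch for CenterSmoother with made-up centers; the last point simulates a hard cut,
# which clears the buffer so the new position is not smeared across the cut.
smoother = CenterSmoother(window=7, jump_thresh=120.0)
smoothed = [smoother.push(cx, cy) for cx, cy in [(800, 500), (804, 498), (806, 503), (1400, 420)]]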
235  src/reformat/old/analyze_crop_position.py  Normal file
@@ -0,0 +1,235 @@
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import random
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
|
||||
class FaceTracker:
|
||||
def __init__(
|
||||
self,
|
||||
dist_threshold: float,
|
||||
switch_frames: int,
|
||||
panning_window: int,
|
||||
panning_threshold: float,
|
||||
smooth_window: int,
|
||||
scene_jump_threshold: float,
|
||||
):
|
||||
self.dist_threshold = dist_threshold
|
||||
self.switch_frames = switch_frames
|
||||
self.panning_window = panning_window
|
||||
self.panning_threshold = panning_threshold
|
||||
self.smooth_window = smooth_window
|
||||
self.scene_jump_threshold = scene_jump_threshold
|
||||
|
||||
self.current_center: Tuple[float, float] = (960.0, 540.0)
|
||||
self.raw_center: Tuple[float, float] = self.current_center
|
||||
self.prev_center: Tuple[float, float] = self.current_center
|
||||
self.prev_raw: Tuple[float, float] = self.current_center
|
||||
self.candidate_center: Optional[Tuple[float, float]] = None
|
||||
self.switch_counter: int = 0
|
||||
self.last_speaker_set: bool = False
|
||||
self.random_center: Optional[Tuple[float, float]] = None
|
||||
|
||||
self.panning_buffer: List[float] = []
|
||||
self.smooth_buffer: List[Tuple[float, float]] = []
|
||||
|
||||
def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]:
|
||||
valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None]
|
||||
all_faces = [f for f in faces if f.get("center")]
|
||||
|
||||
# Speaker tracking
|
||||
if valid_faces:
|
||||
self._update_speaker(valid_faces)
|
||||
else:
|
||||
self._retain_or_random_center(all_faces)
|
||||
|
||||
# Panning detection
|
||||
is_panning = self._detect_panning()
|
||||
|
||||
# Smooth / moving average
|
||||
center = self._smooth_center()
|
||||
|
||||
return (int(center[0]), int(center[1])), is_panning
|
||||
|
||||
def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None:
|
||||
best = max(valid_faces, key=lambda x: x["mouth_openness"])
|
||||
cx, cy, *_ = best["center"]
|
||||
new_center = (cx, cy)
|
||||
|
||||
dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1])
|
||||
if dist < self.dist_threshold:
|
||||
self.raw_center = new_center
|
||||
self.candidate_center = None
|
||||
self.switch_counter = 0
|
||||
else:
|
||||
if (
|
||||
self.candidate_center is None
|
||||
or math.hypot(
|
||||
new_center[0] - self.candidate_center[0], new_center[1] - self.candidate_center[1]
|
||||
)
|
||||
> self.dist_threshold
|
||||
):
|
||||
self.candidate_center = new_center
|
||||
self.switch_counter = 1
|
||||
else:
|
||||
self.switch_counter += 1
|
||||
|
||||
if self.switch_counter >= self.switch_frames:
|
||||
self.raw_center = self.candidate_center # type: ignore
|
||||
self.candidate_center = None
|
||||
self.switch_counter = 0
|
||||
|
||||
self.random_center = None
|
||||
self.last_speaker_set = True
|
||||
|
||||
def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None:
|
||||
if self.last_speaker_set:
|
||||
# keep previous raw_center
|
||||
pass
|
||||
elif self.random_center is not None:
|
||||
self.raw_center = self.random_center
|
||||
elif all_faces:
|
||||
f = random.choice(all_faces)
|
||||
cx, cy, *_ = f["center"]
|
||||
self.random_center = (cx, cy)
|
||||
self.raw_center = self.random_center
|
||||
|
||||
def _detect_panning(self) -> bool:
|
||||
dx = self.raw_center[0] - self.prev_raw[0]
|
||||
self.panning_buffer.append(dx)
|
||||
if len(self.panning_buffer) > self.panning_window:
|
||||
self.panning_buffer.pop(0)
|
||||
avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer)
|
||||
is_panning = avg_dx > self.panning_threshold
|
||||
self.prev_raw = self.raw_center
|
||||
return is_panning
|
||||
|
||||
def _smooth_center(self) -> Tuple[float, float]:
|
||||
sudden_jump = (
|
||||
math.hypot(
|
||||
self.raw_center[0] - self.prev_center[0],
|
||||
self.raw_center[1] - self.prev_center[1],
|
||||
)
|
||||
> self.scene_jump_threshold
|
||||
)
|
||||
if not sudden_jump:
|
||||
self.smooth_buffer.append(self.raw_center)
|
||||
if len(self.smooth_buffer) > self.smooth_window:
|
||||
self.smooth_buffer.pop(0)
|
||||
avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer)
|
||||
avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer)
|
||||
center = (avg_x, avg_y)
|
||||
else:
|
||||
center = self.raw_center
|
||||
self.smooth_buffer.clear()
|
||||
|
||||
self.prev_center = center
|
||||
return center
|
||||
|
||||
|
||||
def parse_args() -> argparse.Namespace:
|
||||
script_dir = Path(__file__).resolve().parent
|
||||
project_root = script_dir.parents[1]
|
||||
default_input = project_root / "data" / "face_data_combined"
|
||||
default_output = project_root / "data" / "face_crop_centers"
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Track and smooth face crop centers based on mouth openness."
|
||||
)
|
||||
parser.add_argument(
|
||||
"-i", "--input-dir", type=Path,
|
||||
default=default_input,
|
||||
help=f"Directory containing *_faces.json files (default: {default_input})"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-o", "--output-dir", type=Path,
|
||||
default=default_output,
|
||||
help=f"Directory to save *_centers.json files (default: {default_output})"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--dist-threshold", type=float, default=30.0,
|
||||
help="Pixel distance threshold to switch speaker"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--switch-frames", type=int, default=20,
|
||||
help="Number of consecutive frames required to confirm speaker switch"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--panning-window", type=int, default=30,
|
||||
help="Frame window size for panning detection"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--panning-threshold", type=float, default=3.0,
|
||||
help="Average dx threshold for panning detection"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--smooth-window", type=int, default=8,
|
||||
help="Moving average window for smoothing"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--scene-jump-threshold", type=float, default=300.0,
|
||||
help="Jump threshold to detect scene cuts"
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def setup_logging() -> None:
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s %(levelname)s: %(message)s",
|
||||
level=logging.INFO,
|
||||
)
|
||||
|
||||
|
||||
def main() -> None:
|
||||
setup_logging()
|
||||
args = parse_args()
|
||||
|
||||
input_dir: Path = args.input_dir.resolve()
|
||||
output_dir: Path = args.output_dir.resolve()
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
tracker = FaceTracker(
|
||||
dist_threshold=args.dist_threshold,
|
||||
switch_frames=args.switch_frames,
|
||||
panning_window=args.panning_window,
|
||||
panning_threshold=args.panning_threshold,
|
||||
smooth_window=args.smooth_window,
|
||||
scene_jump_threshold=args.scene_jump_threshold,
|
||||
)
|
||||
|
||||
json_files = sorted(input_dir.glob("*_faces.json"))
|
||||
if not json_files:
|
||||
logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir)
|
||||
return
|
||||
|
||||
logging.info("Gefundene Dateien: %d", len(json_files))
|
||||
|
||||
for json_path in json_files:
|
||||
logging.info("Verarbeite %s", json_path.name)
|
||||
try:
|
||||
frames_data = json.loads(json_path.read_text())
|
||||
except json.JSONDecodeError as e:
|
||||
logging.error("JSON-Fehler in %s: %s", json_path.name, e)
|
||||
continue
|
||||
|
||||
out_data: List[Dict[str, Any]] = []
|
||||
for frame_idx, frame in enumerate(frames_data):
|
||||
faces = frame.get("faces", [])
|
||||
center, is_panning = tracker.process_frame(faces)
|
||||
out_data.append({
|
||||
"frame": frame_idx,
|
||||
"center": [center[0], center[1]],
|
||||
"panning": is_panning,
|
||||
})
|
||||
|
||||
out_path = output_dir / f"{json_path.stem.replace('_faces', '')}_centers.json"
|
||||
with out_path.open("w") as f:
|
||||
json.dump(out_data, f, indent=2)
|
||||
logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
180  src/reformat/old/crop_to_speaker.py  Normal file
@@ -0,0 +1,180 @@
|
||||
import json
|
||||
import cv2
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
# === Pfade & globale Settings ===
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||
|
||||
INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
|
||||
INPUT_CENTER_DIR = PROJECT_ROOT / "data" / "face_crop_centers"
|
||||
INPUT_FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||
OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
OUT_W, OUT_H = 1080, 1920
|
||||
|
||||
DEBUG_MODE = True
|
||||
DEBUG_SCALE = 0.75
|
||||
# Ab welcher Offenheit wir "Bewegung" annehmen
|
||||
DEBUG_MOUTH_THRESHOLD = 0.02
|
||||
|
||||
# === Hilfsfunktionen ===
|
||||
def clamp(v, lo, hi):
|
||||
return max(lo, min(hi, v))
|
||||
|
||||
def compute_crop_width(orig_w, orig_h):
|
||||
return int((OUT_W / OUT_H) * orig_h)
|
||||
|
||||
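# Worked example with illustrative numbers for a 1920x1080 source: the 9:16 crop keeps the
# full height, and the crop window is clamped so it never leaves the frame.
_orig_w, _orig_h = 1920, 1080
_crop_w = compute_crop_width(_orig_w, _orig_h)             # int((1080/1920) * 1080) = 607
_cx = 1700                                                 # speaker near the right edge
_x0 = clamp(int(_cx - _crop_w / 2), 0, _orig_w - _crop_w)  # 1396 -> clamped to 1313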
# === Verarbeitung ===
|
||||
for center_path in sorted(INPUT_CENTER_DIR.glob("*_centers.json")):
|
||||
stem = center_path.stem.replace("_centers", "")
|
||||
video_path = INPUT_VIDEO_DIR / f"{stem}.mp4"
|
||||
faces_path = INPUT_FACES_DIR / f"{stem}_faces.json"
|
||||
|
||||
if not video_path.exists():
|
||||
print(f"⚠️ Video fehlt: {stem}.mp4")
|
||||
continue
|
||||
if not faces_path.exists():
|
||||
print(f"⚠️ Gesichtsdaten fehlen: {stem}_faces.json")
|
||||
continue
|
||||
|
||||
centers_data = json.loads(center_path.read_text())
|
||||
faces_data = json.loads(faces_path.read_text())
|
||||
|
||||
# Debug-Liste pro Video anlegen
|
||||
if DEBUG_MODE:
|
||||
debug_results: list = []
|
||||
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
crop_w = compute_crop_width(orig_w, orig_h)
|
||||
crop_h = orig_h
|
||||
|
||||
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
||||
temp_vid = OUTPUT_DIR / f"{stem}_cropped.mp4"
|
||||
out_vid = cv2.VideoWriter(str(temp_vid), fourcc, fps, (OUT_W, OUT_H))
|
||||
if not out_vid.isOpened():
|
||||
print(f"❌ Kann nicht schreiben: {temp_vid.name}")
|
||||
continue
|
||||
|
||||
if DEBUG_MODE:
|
||||
cv2.namedWindow("Debug", cv2.WINDOW_NORMAL)
|
||||
|
||||
frame_idx = 0
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if not ret or frame_idx >= len(centers_data):
|
||||
break
|
||||
|
||||
# Crop-Infos
|
||||
info = centers_data[frame_idx]
|
||||
cx, cy = info["center"]
|
||||
is_panning = info.get("panning", False)
|
||||
if is_panning:
|
||||
cx = orig_w // 2
|
||||
|
||||
x0 = int(cx - crop_w / 2)
|
||||
x0 = clamp(x0, 0, orig_w - crop_w)
|
||||
y0 = 0
|
||||
|
||||
# Ausschneiden + Resize
|
||||
crop = frame[y0:y0+crop_h, x0:x0+crop_w]
|
||||
if crop.shape[1] != crop_w or crop.shape[0] != crop_h:
|
||||
crop = cv2.copyMakeBorder(
|
||||
crop, 0, crop_h - crop.shape[0],
|
||||
0, crop_w - crop.shape[1],
|
||||
cv2.BORDER_CONSTANT, value=[0, 0, 0]
|
||||
)
|
||||
out_frame = cv2.resize(crop, (OUT_W, OUT_H), interpolation=cv2.INTER_LINEAR)
|
||||
out_vid.write(out_frame)
|
||||
|
||||
if DEBUG_MODE:
|
||||
debug_frame = frame.copy()
|
||||
frame_faces = faces_data[frame_idx].get("faces", [])
|
||||
|
||||
# Build debug entry for this frame
|
||||
dbg_faces = []
|
||||
for f in frame_faces:
|
||||
# center und Offenheit
|
||||
cx_f, cy_f = map(int, f["center"][:2])
|
||||
openness = f.get("mouth_openness", 0.0)
|
||||
moving = openness > DEBUG_MOUTH_THRESHOLD
|
||||
dbg_faces.append({
|
||||
"center": [cx_f, cy_f],
|
||||
"mouth_openness": openness,
|
||||
"mouth_moving": moving
|
||||
})
|
||||
|
||||
# Anzeige im Debug-Fenster
|
||||
cv2.circle(debug_frame, (cx_f, cy_f), 4, (180, 180, 180), -1)
|
||||
cv2.putText(
|
||||
debug_frame,
|
||||
f"{round(openness,2)}",
|
||||
(cx_f + 6, cy_f - 6),
|
||||
cv2.FONT_HERSHEY_SIMPLEX,
|
||||
0.5,
|
||||
(255, 255, 255),
|
||||
1,
|
||||
cv2.LINE_AA
|
||||
)
|
||||
# roter Punkt, wenn Bewegung
|
||||
color = (0,0,255) if moving else (0,255,255)
|
||||
cv2.circle(debug_frame, (cx_f, cy_f), 6, color, 1)
|
||||
|
||||
debug_results.append({
|
||||
"frame": frame_idx,
|
||||
"faces": dbg_faces
|
||||
})
|
||||
|
||||
# Haupt-Center & Crop-Rahmen
|
||||
cv2.circle(debug_frame, (int(cx), int(cy)), 18, (0, 255, 0), 2)
|
||||
cv2.rectangle(debug_frame, (x0, 0), (x0 + crop_w, crop_h), (0, 0, 255), 2)
|
||||
|
||||
dbg = cv2.resize(
|
||||
debug_frame,
|
||||
(int(orig_w * DEBUG_SCALE), int(orig_h * DEBUG_SCALE))
|
||||
)
|
||||
cv2.imshow("Debug", dbg)
|
||||
if cv2.waitKey(1) & 0xFF == ord('q'):
|
||||
break
|
||||
|
||||
frame_idx += 1
|
||||
|
||||
cap.release()
|
||||
out_vid.release()
|
||||
if DEBUG_MODE:
|
||||
cv2.destroyAllWindows()
|
||||
|
||||
# Audio extrahieren & muxen
|
||||
audio_tmp = OUTPUT_DIR / f"{stem}_audio.aac"
|
||||
final_vid = OUTPUT_DIR / f"{stem}.mp4"
|
||||
try:
|
||||
subprocess.run(
|
||||
["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "copy", str(audio_tmp)],
|
||||
check=True
|
||||
)
|
||||
subprocess.run(
|
||||
["ffmpeg", "-y", "-i", str(temp_vid), "-i", str(audio_tmp),
|
||||
"-c:v", "copy", "-c:a", "aac", "-b:a", "128k", str(final_vid)],
|
||||
check=True
|
||||
)
|
||||
finally:
|
||||
try: temp_vid.unlink()
|
||||
except OSError: pass
|
||||
try: audio_tmp.unlink()
|
||||
except OSError: pass
|
||||
|
||||
# Debug-JSON schreiben
|
||||
if DEBUG_MODE:
|
||||
dbg_path = OUTPUT_DIR / f"{stem}_debug.json"
|
||||
with dbg_path.open("w") as f:
|
||||
json.dump(debug_results, f, indent=2)
|
||||
print(f"🛠️ Debug-Daten: {dbg_path.name}")
|
||||
|
||||
print(f"✅ Finales Video: {final_vid.name}")
|
||||
|
||||
print("\n🏁 Alle Videos fertig in:", OUTPUT_DIR.resolve())
|
126  src/reformat/old/detect_speaking_faces.py  Normal file
@@ -0,0 +1,126 @@
|
||||
import json
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
import numpy as np
|
||||
|
||||
# === Einstellungen ===
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||
INPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||
OUTPUT_PATH = INPUT_DIR / "dominant_faces.json"
|
||||
|
||||
SEGMENT_LENGTH = 2.0 # Länge jedes Segments in Sekunden
|
||||
MOUTH_THRESHOLD = 0.01 # minimale Mundöffnung, um einen Sprecher zu zählen
|
||||
SMOOTH_WINDOW = 5 # Fenstergröße (in Segmenten) für Moving Average
|
||||
|
||||
def analyze_clip_timed(path):
|
||||
# 1) JSON einlesen
|
||||
try:
|
||||
data = json.loads(path.read_text())
|
||||
except Exception as e:
|
||||
print(f"❌ Fehler beim Lesen von {path.name}: {e}")
|
||||
return None
|
||||
|
||||
# 2) Nur valide Frames verwenden
|
||||
frames = [d for d in data if "timestamp" in d and isinstance(d.get("faces"), list)]
|
||||
if not frames:
|
||||
print(f"⚠️ Keine validen Frames in {path.name}")
|
||||
return None
|
||||
|
||||
frames.sort(key=lambda x: x["timestamp"])
|
||||
max_time = frames[-1]["timestamp"]
|
||||
|
||||
# 3) Segmente erzeugen und dominanten Sprecher per Segment finden
|
||||
segments = []
|
||||
t = 0.0
|
||||
while t < max_time:
|
||||
t_end = t + SEGMENT_LENGTH
|
||||
face_scores = defaultdict(list) # mouth_openness pro bbox
|
||||
face_boxes = defaultdict(list) # raw bbox pro bbox-key
|
||||
face_centers = defaultdict(list) # center [cx,cy,w,h] pro bbox-key
|
||||
|
||||
# alle Frames durchsuchen, die in dieses Segment fallen
|
||||
for f in frames:
|
||||
ts = f["timestamp"]
|
||||
if t <= ts < t_end:
|
||||
for face in f["faces"]:
|
||||
bbox = face["bbox"] # [x,y,w,h]
|
||||
score = face.get("mouth_openness", 0.0)
|
||||
center = face.get("center", None) # [cx,cy,w,h]
|
||||
key = tuple(bbox)
|
||||
|
||||
if score >= MOUTH_THRESHOLD and center is not None:
|
||||
face_scores[key].append(score)
|
||||
face_boxes[key].append(bbox)
|
||||
face_centers[key].append(center)
|
||||
|
||||
if face_scores:
|
||||
# den Key mit dem höchsten Durchschnittsscore wählen
|
||||
avg_scores = {k: np.mean(v) for k, v in face_scores.items()}
|
||||
dominant_key = max(avg_scores, key=avg_scores.get)
|
||||
|
||||
# mittlere Bounding‑Box und mittleres Center berechnen
|
||||
avg_bbox = np.mean(face_boxes[dominant_key], axis=0).astype(int).tolist()
|
||||
avg_center = np.mean(face_centers[dominant_key], axis=0).tolist() # [cx,cy,w,h]
|
||||
|
||||
segments.append({
|
||||
"start": round(t, 2),
|
||||
"end": round(t_end if t_end < max_time else max_time, 2),
|
||||
"bbox": avg_bbox,
|
||||
"center": [float(avg_center[0]), float(avg_center[1]), float(avg_center[2]), float(avg_center[3])]
|
||||
})
|
||||
|
||||
t += SEGMENT_LENGTH
|
||||
|
||||
if not segments:
|
||||
print(f"⚠️ Keine Segmente für Clip {path.name}")
|
||||
return None
|
||||
|
||||
# 4) Glätten der Segment‑Zentren mit Moving Average
|
||||
seg_centers = [s["center"] for s in segments] # Liste von [cx,cy,w,h]
|
||||
sm_centers = []
|
||||
n = len(seg_centers)
|
||||
half = SMOOTH_WINDOW // 2
|
||||
|
||||
for i in range(n):
|
||||
start = max(0, i - half)
|
||||
end = min(n, i + half + 1)
|
||||
window = seg_centers[start:end]
|
||||
avg = np.mean(window, axis=0) # ergibt [cx,cy,w,h]
|
||||
sm_centers.append(avg.tolist())
|
||||
|
||||
# 5) Ausgabe des geglätteten Pfades in die Konsole
|
||||
print(f"\n🔄 Smoothed path für Clip {path.stem}:")
|
||||
for i, s in enumerate(segments):
|
||||
cx, cy, w, h = sm_centers[i]
|
||||
print(f" Segment {i} [{s['start']}–{s['end']}s]: "
|
||||
f"center=({cx:.1f}, {cy:.1f}), size=({w:.1f}×{h:.1f})")
|
||||
|
||||
# 6) Neue Felder für Ausgabe‑JSON bauen
|
||||
sm_segments = []
|
||||
for i, s in enumerate(segments):
|
||||
cx, cy, w, h = sm_centers[i]
|
||||
x0 = int(cx - w/2)
|
||||
y0 = int(cy - h/2)
|
||||
sm_segments.append({
|
||||
"start": s["start"],
|
||||
"end": s["end"],
|
||||
"bbox": [x0, y0, int(w), int(h)]
|
||||
})
|
||||
|
||||
return {
|
||||
"clip": path.stem.replace("_faces", "") + ".mp4",
|
||||
"segments": sm_segments
|
||||
}
|
||||
|
||||
|
||||
# === Hauptschleife über alle Clips ===
|
||||
results = []
|
||||
for json_file in sorted(INPUT_DIR.glob("*_faces.json")):
|
||||
out = analyze_clip_timed(json_file)
|
||||
if out:
|
||||
results.append(out)
|
||||
|
||||
OUTPUT_PATH.write_text(json.dumps(results, indent=2))
|
||||
print(f"\n✅ Analyse abgeschlossen – {len(results)} Clips erkannt.")
|
||||
print(f"📄 Gespeichert in: {OUTPUT_PATH.resolve()}")
|
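# Toy example of the centered moving average used above (SMOOTH_WINDOW = 5 -> half-window 2):
# the outlier at index 2 is pulled toward its neighbours instead of yanking the crop path.
import numpy as np
_seg_centers = [[100, 200, 300, 300], [110, 205, 300, 300], [400, 210, 300, 300],
                [120, 215, 300, 300], [125, 220, 300, 300]]
_half = 2
_smoothed = [np.mean(_seg_centers[max(0, i - _half):min(len(_seg_centers), i + _half + 1)], axis=0).tolist()
             for i in range(len(_seg_centers))]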
114  src/reformat/old/grid_faces_from_yolo.py  Normal file
@@ -0,0 +1,114 @@
|
||||
import json
|
||||
import cv2
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
from collections import defaultdict, Counter
|
||||
from sklearn.cluster import DBSCAN
|
||||
|
||||
# === Einstellungen ===
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
VIDEO_DIR = SCRIPT_DIR.parents[1] / "output"
|
||||
FACE_JSON_DIR = SCRIPT_DIR / "face_data_yolo"
|
||||
OUTPUT_DIR = SCRIPT_DIR.parents[1] / "output_stacked_faces"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
OUT_WIDTH = 1080
|
||||
OUT_HEIGHT = 1920
|
||||
GRID_ROWS = 4
|
||||
FACE_CROP_HEIGHT = OUT_HEIGHT // GRID_ROWS
|
||||
FACE_CROP_WIDTH = OUT_WIDTH
|
||||
|
||||
# === Hilfsfunktion
|
||||
def bbox_center(bbox):
|
||||
x, y, w, h = bbox
|
||||
return int(x + w // 2), int(y + h // 2)
|
||||
|
||||
# === Hauptverarbeitung ===
|
||||
for json_path in tqdm(sorted(FACE_JSON_DIR.glob("*_faces.json")), desc="🔍 Erzeuge Grid-Clips"):
|
||||
video_name = json_path.stem.replace("_faces", "") + ".mp4"
|
||||
video_path = VIDEO_DIR / video_name
|
||||
if not video_path.exists():
|
||||
print(f"❌ Video nicht gefunden: {video_name}")
|
||||
continue
|
||||
|
||||
data = json.loads(json_path.read_text())
|
||||
|
||||
# === Alle Gesichtszentren sammeln
|
||||
all_faces = []
|
||||
for frame in data:
|
||||
for face in frame["faces"]:
|
||||
center = bbox_center(face["bbox"])
|
||||
all_faces.append((center, face["bbox"]))
|
||||
|
||||
if not all_faces:
|
||||
print(f"⚠️ Keine Gesichter erkannt in {video_name}")
|
||||
continue
|
||||
|
||||
# === Clustern
|
||||
coords = [pos for pos, _ in all_faces]
|
||||
clustering = DBSCAN(eps=80, min_samples=5).fit(coords)
|
||||
cluster_labels = clustering.labels_
|
||||
label_counts = Counter(cluster_labels)
|
||||
most_common_labels = [lbl for lbl, _ in label_counts.most_common(GRID_ROWS) if lbl != -1]
|
||||
|
||||
if not most_common_labels:
|
||||
print(f"⚠️ Keine gültigen Cluster in {video_name}")
|
||||
continue
|
||||
|
||||
# === Zuordnung: cluster_id → feste Zeile
|
||||
cluster_faces = defaultdict(list)
|
||||
for (_, bbox), label in zip(all_faces, cluster_labels):
|
||||
if label in most_common_labels:
|
||||
cluster_faces[label].append(bbox)
|
||||
|
||||
def cluster_y(label):
|
||||
return np.mean([bbox[1] for bbox in cluster_faces[label]])
|
||||
|
||||
sorted_labels = sorted(most_common_labels, key=cluster_y)
|
||||
label_to_row = {lbl: idx for idx, lbl in enumerate(sorted_labels)}
|
||||
|
||||
# === cluster_id zu jedem Gesicht hinzufügen
|
||||
for frame in data:
|
||||
for face in frame["faces"]:
|
||||
center = bbox_center(face["bbox"])
|
||||
distances = [np.linalg.norm(np.array(center) - np.array(c)) for c in coords]
|
||||
nearest = np.argmin(distances)
|
||||
label = cluster_labels[nearest]
|
||||
face["cluster_id"] = label
|
||||
|
||||
# === Video verarbeiten
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
out_path = OUTPUT_DIR / video_name.replace(".mp4", "_stacked.mp4")
|
||||
fourcc = cv2.VideoWriter_fourcc(*"mp4v")
|
||||
writer = cv2.VideoWriter(str(out_path), fourcc, fps, (OUT_WIDTH, OUT_HEIGHT))
|
||||
|
||||
frame_idx = 0
|
||||
while cap.isOpened():
|
||||
ret, frame = cap.read()
|
||||
if not ret or frame_idx >= len(data):
|
||||
break
|
||||
|
||||
output_frame = np.zeros((OUT_HEIGHT, OUT_WIDTH, 3), dtype=np.uint8)
|
||||
for face in data[frame_idx]["faces"]:
|
||||
label = face.get("cluster_id", -1)
|
||||
if label not in label_to_row:
|
||||
continue
|
||||
row = label_to_row[label]
|
||||
x, y, w, h = face["bbox"]
|
||||
crop = frame[y:y+h, x:x+w]
|
||||
if crop.size == 0:
|
||||
continue
|
||||
resized = cv2.resize(crop, (FACE_CROP_WIDTH, FACE_CROP_HEIGHT))
|
||||
y_offset = row * FACE_CROP_HEIGHT
|
||||
output_frame[y_offset:y_offset+FACE_CROP_HEIGHT, :] = resized
|
||||
|
||||
writer.write(output_frame)
|
||||
frame_idx += 1
|
||||
|
||||
cap.release()
|
||||
writer.release()
|
||||
print(f"✅ Exportiert: {out_path.name}")
|
||||
|
||||
print("🏁 Alle Grid-Videos fertig.")
|
75  src/reformat/old/preview_faces.py  Normal file
@@ -0,0 +1,75 @@
|
||||
import cv2
|
||||
import json
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_DIR = SCRIPT_DIR.parents[1] # ← geht von /src/reformat zu /BachlorArbeit
|
||||
|
||||
FACES_DIR = PROJECT_DIR / "data" / "face_data_combined"
|
||||
INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips"
|
||||
OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces"
|
||||
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# === Alle *_faces.json Dateien durchgehen ===
|
||||
face_files = sorted(FACES_DIR.glob("*_faces.json"))
|
||||
|
||||
for face_file in tqdm(face_files, desc="🔍 Erzeuge Vorschau mit Sprechererkennung"):
|
||||
clip_name = face_file.stem.replace("_faces", "") + ".mp4"
|
||||
input_path = INPUT_VIDEO_DIR / clip_name
|
||||
output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4")
|
||||
|
||||
if not input_path.exists():
|
||||
print(f"❌ Clip nicht gefunden: {clip_name}")
|
||||
continue
|
||||
|
||||
# Video-Setup
|
||||
cap = cv2.VideoCapture(str(input_path))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
fps = fps if fps > 1 else 25 # fallback falls FPS = 0
|
||||
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
fourcc = cv2.VideoWriter_fourcc(*"avc1") # Kompatibler als mp4v
|
||||
out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))
|
||||
|
||||
# Gesichts-Daten laden
|
||||
data = json.loads(face_file.read_text())
|
||||
data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]}
|
||||
|
||||
print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}")
|
||||
|
||||
frame_idx = 0
|
||||
while cap.isOpened():
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
faces = data_by_frame.get(frame_idx, [])
|
||||
speaker_idx = None
|
||||
|
||||
# Sprecher anhand Mundöffnung
|
||||
if faces and all("mouth_openness" in f for f in faces):
|
||||
mouth_vals = [f["mouth_openness"] for f in faces]
|
||||
if any(v > 0.01 for v in mouth_vals): # einfache Aktivitäts-Schwelle
|
||||
speaker_idx = mouth_vals.index(max(mouth_vals))
|
||||
|
||||
for i, face in enumerate(faces):
|
||||
x, y, w, h = face["bbox"]
|
||||
color = (0, 255, 0) if i == speaker_idx else (255, 255, 255)
|
||||
label = f"Mouth: {face.get('mouth_openness', 0):.2f}"
|
||||
|
||||
# Debug-Ausgabe (optional)
|
||||
print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}")
|
||||
|
||||
cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
|
||||
cv2.putText(frame, label, (x, y - 10),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)
|
||||
|
||||
out.write(frame)
|
||||
frame_idx += 1
|
||||
|
||||
cap.release()
|
||||
out.release()
|
||||
print(f"✅ Vorschau exportiert: {output_path.name}")
|
||||
|
||||
print("🏁 Alle Vorschauvideos mit Sprecherkennung erstellt.")
|
92  src/reformat/old/track_faces.py  Normal file
@@ -0,0 +1,92 @@
|
||||
import cv2
|
||||
import mediapipe as mp
|
||||
import json
|
||||
from pathlib import Path
|
||||
from tqdm import tqdm
|
||||
|
||||
# === Einstellungen ===
|
||||
INPUT_DIR = Path(__file__).resolve().parents[2] / "output"
|
||||
OUTPUT_DIR = Path(__file__).resolve().parent / "face_data"
|
||||
OUTPUT_DIR.mkdir(exist_ok=True)
|
||||
FRAME_SKIP = 1 # analysiere jeden Frame für maximale Genauigkeit
|
||||
PADDING = 30 # Pixel Padding um Gesicht
|
||||
|
||||
mp_face_mesh = mp.solutions.face_mesh
|
||||
|
||||
# Erweiterte Lippen-Landmarks (innen)
|
||||
TOP_LIPS = [13, 78, 82]
|
||||
BOTTOM_LIPS = [14, 87, 317]
|
||||
|
||||
def mouth_openness(landmarks, image_height):
|
||||
try:
|
||||
top_avg = sum([landmarks[i].y for i in TOP_LIPS]) / len(TOP_LIPS)
|
||||
bottom_avg = sum([landmarks[i].y for i in BOTTOM_LIPS]) / len(BOTTOM_LIPS)
|
||||
return abs(bottom_avg - top_avg)
|
||||
except Exception:
|
||||
return 0.0
|
||||
|
||||
def process_video(path):
|
||||
cap = cv2.VideoCapture(str(path))
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
||||
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
||||
results = []
|
||||
|
||||
with mp_face_mesh.FaceMesh(
|
||||
static_image_mode=False,
|
||||
max_num_faces=5,
|
||||
refine_landmarks=True,
|
||||
min_detection_confidence=0.6,
|
||||
min_tracking_confidence=0.6
|
||||
) as face_mesh:
|
||||
|
||||
frame_idx = 0
|
||||
while cap.isOpened():
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
|
||||
if frame_idx % FRAME_SKIP != 0:
|
||||
frame_idx += 1
|
||||
continue
|
||||
|
||||
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
output = face_mesh.process(rgb)
|
||||
|
||||
faces = []
|
||||
if output.multi_face_landmarks:
|
||||
for landmarks in output.multi_face_landmarks:
|
||||
mouth = mouth_openness(landmarks.landmark, height)
|
||||
|
||||
xs = [lm.x * width for lm in landmarks.landmark]
|
||||
ys = [lm.y * height for lm in landmarks.landmark]
|
||||
x1 = max(0, int(min(xs)) - PADDING)
|
||||
y1 = max(0, int(min(ys)) - PADDING)
|
||||
x2 = min(width, int(max(xs)) + PADDING)
|
||||
y2 = min(height, int(max(ys)) + PADDING)
|
||||
bbox = [x1, y1, x2 - x1, y2 - y1]
|
||||
|
||||
faces.append({
|
||||
"bbox": bbox,
|
||||
"mouth_openness": round(mouth, 4)
|
||||
})
|
||||
|
||||
results.append({
|
||||
"frame": frame_idx,
|
||||
"timestamp": round(frame_idx / fps, 2),
|
||||
"faces": faces
|
||||
})
|
||||
|
||||
frame_idx += 1
|
||||
|
||||
cap.release()
|
||||
out_path = OUTPUT_DIR / f"{path.stem}_faces.json"
|
||||
out_path.write_text(json.dumps(results, indent=2))
|
||||
print(f"✅ {path.name} verarbeitet → {out_path.name}")
|
||||
|
||||
# === Alle Videos im output/ Ordner durchgehen
|
||||
videos = list(INPUT_DIR.glob("*.mp4"))
|
||||
print(f"🎬 {len(videos)} Videos gefunden in: {INPUT_DIR}")
|
||||
|
||||
for video in tqdm(videos):
|
||||
process_video(video)
|
206  src/reformat/old/track_faces_Yolo.py  Normal file
@@ -0,0 +1,206 @@
|
||||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import logging
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import cv2
|
||||
from ultralytics import YOLO
|
||||
import mediapipe as mp
|
||||
|
||||
# === Pfade und Standardwerte ===
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
PROJECT_ROOT = SCRIPT_DIR.parents[1]
|
||||
DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
|
||||
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
|
||||
DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt"
|
||||
|
||||
# Stelle sicher, dass das Standard-Output-Verzeichnis existiert
|
||||
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# === Landmarks für Lippen ===
|
||||
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
|
||||
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
|
||||
|
||||
|
||||
def get_mouth_openness(landmarks, image_height):
|
||||
"""
|
||||
Berechnet die Mundöffnung in Pixeln basierend auf normierten Landmark-Koordinaten.
|
||||
"""
|
||||
top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
|
||||
bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
|
||||
return abs(bottom_avg - top_avg) * image_height
|
||||
|
||||
|
||||
def iou(boxA, boxB):
|
||||
"""Berechnet Intersection-over-Union zweier Bounding-Boxes im Format (x, y, w, h)."""
|
||||
ax1, ay1, aw, ah = boxA
|
||||
ax2, ay2 = ax1 + aw, ay1 + ah
|
||||
bx1, by1, bw, bh = boxB
|
||||
bx2, by2 = bx1 + bw, by1 + bh
|
||||
|
||||
inter_x1 = max(ax1, bx1)
|
||||
inter_y1 = max(ay1, by1)
|
||||
inter_x2 = min(ax2, bx2)
|
||||
inter_y2 = min(ay2, by2)
|
||||
inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
|
||||
|
||||
union_area = aw * ah + bw * bh - inter_area
|
||||
return inter_area / union_area if union_area > 0 else 0
|
||||
|
||||
|
||||
def process_video(
|
||||
video_path: Path,
|
||||
output_path: Path,
|
||||
model: YOLO,
|
||||
face_mesh: mp.solutions.face_mesh.FaceMesh,
|
||||
conf_thresh: float,
|
||||
frame_skip: int,
|
||||
downscale: float,
|
||||
):
|
||||
cap = cv2.VideoCapture(str(video_path))
|
||||
if not cap.isOpened():
|
||||
logging.error(f"Kann Video nicht öffnen: {video_path}")
|
||||
return
|
||||
|
||||
fps = cap.get(cv2.CAP_PROP_FPS)
|
||||
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
|
||||
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)
|
||||
|
||||
# JSON-Ausgabe mit Streaming
|
||||
with output_path.open('w', encoding='utf-8') as f_out:
|
||||
f_out.write('[\n')
|
||||
first = True
|
||||
frame_idx = 0
|
||||
|
||||
while True:
|
||||
ret, frame = cap.read()
|
||||
if not ret:
|
||||
break
|
||||
if frame_skip > 1 and frame_idx % frame_skip != 0:
|
||||
frame_idx += 1
|
||||
continue
|
||||
|
||||
if downscale != 1.0:
|
||||
frame = cv2.resize(frame, (width, height))
|
||||
|
||||
detections = model(frame, verbose=False)[0]
|
||||
yolo_boxes = []
|
||||
for box in detections.boxes:
|
||||
conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf)
|
||||
if conf < conf_thresh:
|
||||
continue
|
||||
coords = box.xyxy[0].cpu().numpy()
|
||||
x1, y1, x2, y2 = map(int, coords)
|
||||
yolo_boxes.append([x1, y1, x2 - x1, y2 - y1])
|
||||
|
||||
rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
|
||||
mp_result = face_mesh.process(rgb)
|
||||
mp_faces = []
|
||||
if mp_result.multi_face_landmarks:
|
||||
for landmarks in mp_result.multi_face_landmarks:
|
||||
mouth_px = get_mouth_openness(landmarks.landmark, height)
|
||||
xs = [lm.x * width for lm in landmarks.landmark]
|
||||
ys = [lm.y * height for lm in landmarks.landmark]
|
||||
x1, y1 = int(min(xs)), int(min(ys))
|
||||
x2, y2 = int(max(xs)), int(max(ys))
|
||||
mp_faces.append({
|
||||
"bbox": [x1, y1, x2 - x1, y2 - y1],
|
||||
"mouth_openness": round(mouth_px, 1)
|
||||
})
|
||||
|
||||
combined = []
|
||||
for yb in yolo_boxes:
|
||||
if mp_faces:
|
||||
best = max(mp_faces, key=lambda m: iou(yb, m["bbox"]))
|
||||
best_iou = iou(yb, best["bbox"])
|
||||
mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0
|
||||
else:
|
||||
mouth = 0.0
|
||||
|
||||
x, y, w, h = yb
|
||||
cx, cy = x + w / 2, y + h / 2
|
||||
combined.append({
|
||||
"bbox": yb,
|
||||
"mouth_openness": round(mouth, 1),
|
||||
"center": [round(cx, 1), round(cy, 1), w, h]
|
||||
})
|
||||
|
||||
result = {
|
||||
"frame": frame_idx,
|
||||
"timestamp": round(frame_idx / fps, 3),
|
||||
"faces": combined
|
||||
}
|
||||
|
||||
if not first:
|
||||
f_out.write(',\n')
|
||||
json.dump(result, f_out, ensure_ascii=False)
|
||||
first = False
|
||||
frame_idx += 1
|
||||
|
||||
f_out.write('\n]')
|
||||
|
||||
cap.release()
|
||||
logging.info(f"Verarbeitet: {video_path.name} → {output_path.name}")
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Analyse von Videos: Gesichter und Mundöffnung erkennen"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--input-dir", type=Path,
|
||||
default=DEFAULT_INPUT_DIR,
|
||||
help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir", type=Path,
|
||||
default=DEFAULT_OUTPUT_DIR,
|
||||
help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model", type=Path,
|
||||
default=DEFAULT_MODEL_PATH,
|
||||
help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--conf-thresh", type=float, default=0.5,
|
||||
help="Schwelle für YOLO-Confidence"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--frame-skip", type=int, default=1,
|
||||
help="Nur jede n-te Frame verarbeiten"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--downscale", type=float, default=1.0,
|
||||
help="Skalierungsfaktor für Frames"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
|
||||
args.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
yolo = YOLO(str(args.model))
|
||||
face_mesh = mp.solutions.face_mesh.FaceMesh(
|
||||
static_image_mode=False,
|
||||
max_num_faces=5,
|
||||
refine_landmarks=True,
|
||||
min_detection_confidence=0.5,
|
||||
min_tracking_confidence=0.5
|
||||
)
|
||||
|
||||
for video_path in sorted(args.input_dir.glob("*.mp4")):
|
||||
out_path = args.output_dir / f"{video_path.stem}_faces.json"
|
||||
process_video(
|
||||
video_path,
|
||||
out_path,
|
||||
yolo,
|
||||
face_mesh,
|
||||
args.conf_thresh,
|
||||
args.frame_skip,
|
||||
args.downscale,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
12  src/reformat/speaking.py  Normal file
@@ -0,0 +1,12 @@
|
||||
# src/speaking.py
|
||||
|
||||
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
|
||||
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
|
||||
|
||||
def get_mouth_openness(landmarks, image_height):
|
||||
"""
|
||||
Berechnet die Mundöffnung basierend auf MediaPipe-Landmarks.
|
||||
"""
|
||||
top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
|
||||
bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
|
||||
return abs(bottom_avg - top_avg) * image_height
|
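# Quick check with mocked landmarks (illustrative only; real input comes from MediaPipe):
# only the indices listed in TOP_LIPS / BOTTOM_LIPS influence the result.
from types import SimpleNamespace
_landmarks = [SimpleNamespace(x=0.5, y=0.50) for _ in range(478)]
for _i in TOP_LIPS:
    _landmarks[_i].y = 0.48
for _i in BOTTOM_LIPS:
    _landmarks[_i].y = 0.52
print(get_mouth_openness(_landmarks, image_height=1080))  # (0.52 - 0.48) * 1080 = 43.2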
265  src/subtitles/add_subtitles.py  Normal file
@@ -0,0 +1,265 @@
|
||||
#!/usr/bin/env python3
|
||||
# -*- coding: utf-8 -*-
|
||||
"""
|
||||
add_subtitles.py — TikTok-Word-Caps mit OpenAI Whisper (CPU)
|
||||
- läuft Ordner-weise über 9:16-Kurzclips
|
||||
- transkribiert mit word_timestamps=True
|
||||
- erzeugt ASS (ein Wort pro Zeile, Pop-Animation, Bottom-Center)
|
||||
- brennt via ffmpeg in *_subtitled.mp4
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import glob
|
||||
import json
|
||||
import subprocess
|
||||
import tempfile
|
||||
import traceback
|
||||
import argparse
|
||||
from typing import List, Tuple, Optional
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/subtitles/)
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import CROPPED_DIR, SUBTITLED_DIR # zentrale Pfade
|
||||
|
||||
# --- Stabil auf CPU (vermeidet MPS-Sparse-Fehler) ---
|
||||
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1"
|
||||
os.environ["CUDA_VISIBLE_DEVICES"] = ""
|
||||
|
||||
def log(*a): print("[LOG]", *a)
|
||||
def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
def has_audio_stream(video_path: str) -> bool:
|
||||
cmd = ["ffprobe","-v","error","-select_streams","a","-show_entries","stream=index","-of","json",video_path]
|
||||
try:
|
||||
out = subprocess.check_output(cmd).decode("utf-8")
|
||||
data = json.loads(out)
|
||||
return bool(data.get("streams"))
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
def load_whisper_cpu(model_name: str):
|
||||
import whisper # openai-whisper
|
||||
device = "cpu"
|
||||
model = whisper.load_model(model_name, device=device)
|
||||
fp16 = False
|
||||
return model, device, fp16
|
||||
|
||||
def transcribe_words_whisper(model, media_path: str, language: Optional[str], fp16: bool) -> List[Tuple[float,float,str]]:
|
||||
"""
|
||||
Nutzt 'openai-whisper' mit word_timestamps=True.
|
||||
Fallback: wenn 'words' fehlen, werden Segmenttexte approx. auf Wörter verteilt.
|
||||
"""
|
||||
result = model.transcribe(
|
||||
media_path,
|
||||
language=language,
|
||||
task="transcribe",
|
||||
word_timestamps=True,
|
||||
condition_on_previous_text=False,
|
||||
verbose=False,
|
||||
fp16=fp16
|
||||
)
|
||||
words: List[Tuple[float,float,str]] = []
|
||||
segs = result.get("segments", []) or []
|
||||
for seg in segs:
|
||||
wlist = seg.get("words")
|
||||
if isinstance(wlist, list) and wlist and all(isinstance(w, dict) for w in wlist):
|
||||
for w in wlist:
|
||||
t = (w.get("word") or w.get("text") or "").strip()
|
||||
if not t:
|
||||
continue
|
||||
ws = w.get("start"); we = w.get("end")
|
||||
if ws is None or we is None:
|
||||
continue
|
||||
t = re.sub(r"\s+", " ", t)
|
||||
if t:
|
||||
words.append((float(ws), float(we), t))
|
||||
else:
|
||||
# Fallback: Segment auf Wörter aufteilen & Zeiten gleichmäßig verteilen
|
||||
text = (seg.get("text") or "").strip()
|
||||
if not text:
|
||||
continue
|
||||
seg_start = float(seg.get("start", 0.0))
|
||||
seg_end = float(seg.get("end", seg_start))
|
||||
toks = [w for w in re.split(r"(\s+)", text) if w.strip()]
|
||||
if not toks or seg_end <= seg_start:
|
||||
continue
|
||||
dur = seg_end - seg_start
|
||||
step = dur / len(toks)
|
||||
for i, tok in enumerate(toks):
|
||||
ws = seg_start + i * step
|
||||
we = seg_start + (i+1) * step
|
||||
words.append((ws, we, tok))
|
||||
return words
|
||||
|
||||
def ass_time(t: float) -> str:
|
||||
if t < 0: t = 0
|
||||
h = int(t // 3600); m = int((t % 3600)//60); s = int(t % 60); cs = int(round((t - int(t))*100))
|
||||
return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}"
|
||||
|
||||
def write_ass_words(words: List[Tuple[float,float,str]], ass_path: Path, font_size: int, margin_v: int, uppercase: bool):
|
||||
"""
|
||||
Ein Wort pro Zeile, ohne Überlappung:
|
||||
- Ende = min(eigene Endzeit, Start nächstes Wort - 0.02)
|
||||
- Pop-Animation 150ms, fette Outline, Bottom-Center (PlayResY=1920)
|
||||
"""
|
||||
header = f"""[Script Info]
|
||||
ScriptType: v4.00+
|
||||
Collisions: Normal
|
||||
PlayResX: 1080
|
||||
PlayResY: 1920
|
||||
ScaledBorderAndShadow: yes
|
||||
|
||||
[V4+ Styles]
|
||||
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
|
||||
Style: WordCap,Inter,{font_size},&H00FFFFFF,&H00FFFFFF,&H00101010,&H64000000,1,0,0,0,100,100,0,0,1,6,0.8,2,80,80,{margin_v},1
|
||||
|
||||
[Events]
|
||||
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
|
||||
"""
|
||||
# Zeiten glätten, damit immer nur ein Wort sichtbar ist
|
||||
adjusted = []
|
||||
for i, (s, e, t) in enumerate(words):
|
||||
nstart = words[i+1][0] if i+1 < len(words) else e
|
||||
new_end = min(e, nstart - 0.02) if nstart > s else e
|
||||
if new_end <= s:
|
||||
new_end = s + 0.06
|
||||
adjusted.append((s, new_end, t))
|
||||
|
||||
with open(ass_path, "w", encoding="utf-8") as f:
|
||||
f.write(header)
|
||||
for s, e, t in adjusted:
|
||||
st, en = ass_time(s), ass_time(e)
|
||||
txt = t.upper() if uppercase else t
|
||||
# \fs sichere Größe, \blur für weiche Outline, \fad Ein/Aus,
|
||||
# \fscx135\fscy135 → Start groß, \t(...) schrumpft in 150ms auf 100% = Pop
|
||||
overrides = r"\blur1\bord8\1c&H0000FFFF&\3c&H000000&\4c&H000000&\fad(50,20)\fscx135\fscy135\t(0,150,\fscx100\fscy100)"
|
||||
f.write(f"Dialogue: 0,{st},{en},WordCap,,0,0,0,,{{{overrides}}}{txt}\n")
|
||||
|
||||
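# Worked example of the smoothing above (invented timings): the consecutive words
#   [(0.00, 0.80, "HALLO"), (0.70, 1.40, "WELT")]
# become
#   [(0.00, 0.68, "HALLO"), (0.70, 1.40, "WELT")]
# because the first word's end is clamped to the next word's start minus 0.02 s,
# so at no point are two word captions visible at the same time.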
def ffmpeg_escape_for_subtitles(path: Path) -> str:
|
||||
"""
|
||||
Pfad für -vf subtitles=… escapen (für Leerzeichen, Doppelpunkte etc.).
|
||||
ffmpeg erwartet Backslash-escaping für Filter-Argumente.
|
||||
"""
|
||||
s = str(path)
|
||||
s = s.replace("\\", "\\\\")
|
||||
s = s.replace(":", "\\:")
|
||||
s = s.replace("'", "\\'")
|
||||
s = s.replace(",", "\\,")
|
||||
s = s.replace("[", "\\[")
|
||||
s = s.replace("]", "\\]")
|
||||
s = s.replace(";", "\\;")
|
||||
s = s.replace("=", "\\=")
|
||||
return s
|
||||
|
||||
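# Illustrative result of the escaping above (invented path):
#   "/tmp/clip: v1,final.ass"  ->  "/tmp/clip\: v1\,final.ass"
# i.e. ':' and ',' are backslash-escaped so the -vf subtitles=... filter string is not split apart.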
def burn(video_in: Path, ass_file: Path, out_path: Path, crf=18, preset="medium") -> int:
|
||||
vf = f"subtitles={ffmpeg_escape_for_subtitles(ass_file)}"
|
||||
cmd = [
|
||||
"ffmpeg","-y","-i",str(video_in),
|
||||
"-vf", vf,
|
||||
"-c:v","libx264","-preset",preset,"-crf",str(crf),
|
||||
"-c:a","copy",
|
||||
str(out_path)
|
||||
]
|
||||
log("FFmpeg:", " ".join(cmd))
|
||||
return subprocess.call(cmd)
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="Brennt Word-Caps (ASS) via Whisper-Transkription in 9:16-Clips.")
|
||||
p.add_argument("--clips_dir", type=Path, default=CROPPED_DIR, help=f"Quellordner (Default: {CROPPED_DIR})")
|
||||
p.add_argument("--out_dir", type=Path, default=SUBTITLED_DIR, help=f"Zielordner (Default: {SUBTITLED_DIR})")
|
||||
p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster (Default: *.mp4)")
|
||||
p.add_argument("--limit", type=int, default=None, help="Nur die ersten N Clips verarbeiten")
|
||||
p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "medium"), help="Whisper-Modell")
|
||||
p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. de, en, None=Auto)")
|
||||
p.add_argument("--uppercase", action="store_true", help="Text in Großbuchstaben rendern")
|
||||
p.add_argument("--font_size", type=int, default=108, help="ASS-Fontgröße")
|
||||
p.add_argument("--margin_v", type=int, default=320, help="ASS-MarginV (Abstand vom unteren Rand)")
|
||||
p.add_argument("--crf", type=int, default=18, help="ffmpeg CRF (Qualität)")
|
||||
p.add_argument("--preset", type=str, default="medium", help="ffmpeg Preset")
|
||||
return p.parse_args()
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
clips_dir = args.clips_dir
|
||||
output_dir = args.out_dir
|
||||
ensure_dir(output_dir)
|
||||
|
||||
log("Starte TikTok Word-Caps (Whisper)")
|
||||
log("CLIPS_DIR =", clips_dir)
|
||||
log("OUTPUT_DIR =", output_dir)
|
||||
|
||||
clips: List[str] = []
|
||||
for pat in (args.pattern,):
|
||||
clips += glob.glob(str(clips_dir / pat))
|
||||
clips.sort()
|
||||
log(f"{len(clips)} Clips gefunden.")
|
||||
if args.limit:
|
||||
clips = clips[:args.limit]
|
||||
log(f"LIMIT aktiv: {args.limit}")
|
||||
|
||||
if not clips:
|
||||
log("Keine Clips gefunden. Pfad/Pattern checken.")
|
||||
return
|
||||
|
||||
# Whisper laden (CPU)
|
||||
try:
|
||||
model, device, fp16 = load_whisper_cpu(args.model)
|
||||
log(f"Whisper geladen: {args.model} auf {device} (fp16={fp16})")
|
||||
log("Hinweis: Beim ersten Lauf kann das Modell nachgeladen werden.")
|
||||
except Exception as e:
|
||||
print("[ERROR] Whisper konnte nicht geladen werden:", e)
|
||||
traceback.print_exc()
|
||||
return
|
||||
|
||||
lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
|
||||
|
||||
for clip in clips:
|
||||
base = os.path.basename(clip)
|
||||
stem, _ = os.path.splitext(base)
|
||||
log("="*60)
|
||||
log("Clip:", base)
|
||||
|
||||
if not has_audio_stream(clip):
|
||||
log("WARN: Keine Audio-Spur → übersprungen.")
|
||||
continue
|
||||
|
||||
# Transkription
|
||||
try:
|
||||
log("Transkription startet …")
|
||||
words = transcribe_words_whisper(model, clip, language=lang, fp16=fp16)
|
||||
log(f"Transkription fertig. {len(words)} Wörter.")
|
||||
if not words:
|
||||
log("WARN: 0 Wörter erkannt → übersprungen.")
|
||||
continue
|
||||
except Exception as e:
|
||||
print("[ERROR] Transkription fehlgeschlagen:", e)
|
||||
traceback.print_exc()
|
||||
continue
|
||||
|
||||
# ASS erzeugen & brennen
|
||||
with tempfile.NamedTemporaryFile(suffix=".ass", delete=False) as tmp:
|
||||
ass_path = Path(tmp.name)
|
||||
try:
|
||||
log("Erzeuge ASS …")
|
||||
write_ass_words(words, ass_path, font_size=args.font_size, margin_v=args.margin_v, uppercase=args.uppercase)
|
||||
out_path = output_dir / f"{stem}_subtitled.mp4"
|
||||
log("Brenne Untertitel …")
|
||||
rc = burn(Path(clip), ass_path, out_path, crf=args.crf, preset=args.preset)
|
||||
if rc == 0:
|
||||
log("OK:", out_path)
|
||||
else:
|
||||
log("ERROR: ffmpeg fehlgeschlagen, code", rc)
|
||||
finally:
|
||||
try: ass_path.unlink(missing_ok=True)
|
||||
except Exception: pass
|
||||
|
||||
log("Fertig.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
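A typical invocation of this script might look like the following (model, language, and limit are placeholders; the directories default to CROPPED_DIR and SUBTITLED_DIR from config.py):

python src/subtitles/add_subtitles.py --model small --lang de --uppercase --limit 2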
25
src/subtitles/run_subtitles.py
Normal file
@ -0,0 +1,25 @@
|
||||
import os
|
||||
import tempfile
|
||||
from add_subtitles import process # wir nutzen die Logik aus dem großen Skript
|
||||
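# Note (editorial): add_subtitles.py as committed exposes main()/parse_args() rather than a
# process() function, so this import assumes a process() wrapper exists or is added separately.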
|
||||
# ==== HIER EINSTELLEN ====
|
||||
VIDEO_PATH = "data/input.mp4" # Dein Video
|
||||
TRANSCRIPT_PATH = "data/transcript.srt" # Oder .json (Whisper)
|
||||
OUTPUT_DIR = "data/output" # Ordner für Ergebnisse
|
||||
CLIPS_PATH = None # Optional: "data/clips.csv" oder "data/clips.json"
|
||||
CRF = 18
|
||||
PRESET = "medium"
|
||||
STYLE = r"\\bord4\\shad4\\outline3\\fs64\\b1\\1c&HFFFFFF&\\3c&H000000&\\4c&H000000&"
|
||||
# ==========================
|
||||
|
||||
if __name__ == "__main__":
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
process(
|
||||
video_path=VIDEO_PATH,
|
||||
transcript_path=TRANSCRIPT_PATH,
|
||||
output_dir=OUTPUT_DIR,
|
||||
clips_path=CLIPS_PATH,
|
||||
crf=CRF,
|
||||
preset=PRESET,
|
||||
style_overrides=STYLE,
|
||||
)
|
100
src/text/cutClips.py
Normal file
@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env python3
|
||||
# cutClips.py — exportiert Clips aus dem ersten gefundenen Video oder aus angegebener Datei
|
||||
|
||||
from pathlib import Path
|
||||
import sqlite3
|
||||
import argparse
|
||||
from moviepy.video.io.VideoFileClip import VideoFileClip
|
||||
import sys
|
||||
|
||||
# ── Projektwurzel in sys.path aufnehmen
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import INPUT_DIR, RAW_CLIPS_DIR, DB_PATH
|
||||
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="Exportiert Highlights aus dem Video gemäß SQLite-DB.")
|
||||
p.add_argument("--file", type=str, default=None,
|
||||
help="Name der Input-Datei im INPUT_DIR. Wenn leer, wird das erste Video im Ordner verwendet.")
|
||||
p.add_argument("--limit", type=int, default=10,
|
||||
help="Anzahl der zu exportierenden Clips (Default: 10)")
|
||||
p.add_argument("--order", type=str, choices=["score", "start"], default="score",
|
||||
help="Sortierung: 'score' (score_total absteigend) oder 'start' (zeitlich).")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def find_first_video(directory: Path):
|
||||
"""Suche nach der ersten Videodatei im Verzeichnis (mp4, mov, mkv)."""
|
||||
for ext in ("*.mp4", "*.mov", "*.mkv"):
|
||||
files = sorted(directory.glob(ext))
|
||||
if files:
|
||||
return files[0]
|
||||
return None
|
||||
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
|
||||
# === Eingabevideo bestimmen ===
|
||||
if args.file:
|
||||
input_video = INPUT_DIR / args.file
|
||||
else:
|
||||
input_video = find_first_video(INPUT_DIR)
|
||||
if not input_video:
|
||||
raise FileNotFoundError(f"🚫 Kein Video im Eingabeordner {INPUT_DIR} gefunden.")
|
||||
print(f"📂 Kein --file angegeben → verwende automatisch: {input_video.name}")
|
||||
|
||||
if not input_video.exists():
|
||||
raise FileNotFoundError(f"🚫 Input-Video nicht gefunden: {input_video}")
|
||||
|
||||
output_dir = RAW_CLIPS_DIR
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# === SQLite DB lesen ===
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
order_clause = "ORDER BY score_total DESC" if args.order == "score" else "ORDER BY start ASC"
|
||||
cursor.execute(f"""
|
||||
SELECT start, end, text
|
||||
FROM highlights
|
||||
{order_clause}
|
||||
LIMIT ?
|
||||
""", (args.limit,))
|
||||
highlights = cursor.fetchall()
|
||||
|
||||
if not highlights:
|
||||
print("⚠️ Keine Highlights in der Datenbank gefunden.")
|
||||
conn.close()
|
||||
return
|
||||
|
||||
# === Video laden ===
|
||||
video = VideoFileClip(str(input_video))
|
||||
|
||||
# === Clips schneiden ===
|
||||
for i, (start, end, text) in enumerate(highlights, start=1):
|
||||
if start >= video.duration:
|
||||
print(f"⚠️ Clip {i} übersprungen – Startzeit {start:.2f}s liegt außerhalb der Videolänge ({video.duration:.2f}s).")
|
||||
continue
|
||||
|
||||
end = min(end, video.duration)
|
||||
output_file = output_dir / f"highlight_{i}.mp4"
|
||||
print(f"🎬 Exportiere Clip {i}: {start:.2f}s – {end:.2f}s → {output_file.name}")
|
||||
|
||||
try:
|
||||
clip = video.subclipped(start, end)
|
||||
clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac", logger=None)
|
||||
clip.close()
|
||||
except Exception as e:
|
||||
print(f"❌ Fehler beim Export von Clip {i}: {e}")
|
||||
|
||||
# === Cleanup ===
|
||||
conn.close()
|
||||
video.close()
|
||||
print(f"✅ {len(highlights)} Clips exportiert nach {output_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
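A small sketch (assuming the highlights schema created by segment_transcript.py and a placeholder database path) for inspecting the rows this script will export:

import sqlite3

conn = sqlite3.connect("clips.db")  # placeholder; the script itself uses DB_PATH from config.py
rows = conn.execute(
    "SELECT start, end, text FROM highlights ORDER BY score_total DESC LIMIT 3"
)
for start, end, text in rows:
    print(f"{start:7.2f}s – {end:7.2f}s  {text[:60]}…")
conn.close()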
@ -2,44 +2,41 @@ import sqlite3
|
||||
import re
|
||||
from openai import OpenAI
|
||||
from time import sleep
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
from pathlib import Path
|
||||
import sys
|
||||
|
||||
# Projekt-Root einfügen (2 Ebenen hoch von src/* ausgehend)
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import DB_PATH
|
||||
|
||||
|
||||
# === Einstellungen ===
|
||||
DB_PATH = "clips_openai.db"
|
||||
VIDEO_ID = "testVideoShort"
|
||||
MAX_CLIPS = 5 # oder "all"
|
||||
OPENAI_API_KEY = "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"
|
||||
|
||||
client = OpenAI(api_key=OPENAI_API_KEY)
|
||||
# === OPENAI-CLIENT (API-Key aus Env) ===
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
|
||||
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
# === DB-Verbindung
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cursor = conn.cursor()
|
||||
|
||||
cursor.execute("DROP TABLE IF EXISTS highlights")
|
||||
# === Unbewertete Highlights laden
|
||||
cursor.execute("""
|
||||
CREATE TABLE highlights (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file TEXT,
|
||||
start REAL,
|
||||
end REAL,
|
||||
text TEXT,
|
||||
viralitaet INTEGER,
|
||||
emotionalitaet INTEGER,
|
||||
witz INTEGER,
|
||||
provokation INTEGER,
|
||||
score_total INTEGER
|
||||
)
|
||||
SELECT id, start, end, text FROM highlights
|
||||
WHERE viralitaet IS NULL OR emotionalitaet IS NULL
|
||||
ORDER BY start
|
||||
""")
|
||||
conn.commit()
|
||||
print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}")
|
||||
|
||||
# === Segmente laden
|
||||
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
|
||||
segments = cursor.fetchall()
|
||||
print(f"📥 {len(segments)} Segmente (Originaltext) geladen.")
|
||||
print(f"📥 {len(segments)} unbewertete Highlights geladen.")
|
||||
|
||||
# === Bewertungsfunktion (GPT-4o)
|
||||
def analyse_segment(text, start, end):
|
||||
def analyse_segment(clip_id, text, start, end):
|
||||
print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s")
|
||||
|
||||
prompt = f"""
|
||||
@ -86,19 +83,19 @@ Provokation: [Zahl]
|
||||
if all(v is not None for v in values.values()):
|
||||
total_score = sum(values.values())
|
||||
cursor.execute("""
|
||||
INSERT INTO highlights (
|
||||
file, start, end, text,
|
||||
viralitaet, emotionalitaet, witz, provokation, score_total
|
||||
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
UPDATE highlights SET
|
||||
viralitaet = ?, emotionalitaet = ?, witz = ?, provokation = ?, score_total = ?
|
||||
WHERE id = ?
|
||||
""", (
|
||||
VIDEO_ID, start, end, text.strip(),
|
||||
values["viralitaet"], values["emotionalitaet"],
|
||||
values["witz"], values["provokation"],
|
||||
total_score
|
||||
total_score,
|
||||
clip_id
|
||||
))
|
||||
conn.commit()
|
||||
|
||||
return {
|
||||
"id": clip_id,
|
||||
"start": start,
|
||||
"end": end,
|
||||
"text": text.strip(),
|
||||
@ -113,8 +110,8 @@ Provokation: [Zahl]
|
||||
|
||||
# === Clips bewerten
|
||||
rated = []
|
||||
for start, end, text in segments:
|
||||
result = analyse_segment(text, float(start), float(end))
|
||||
for clip_id, start, end, text in segments:
|
||||
result = analyse_segment(clip_id, text, float(start), float(end))
|
||||
if result:
|
||||
rated.append(result)
|
||||
sleep(1.2) # Anti-Rate-Limit
|
||||
@ -123,7 +120,7 @@ for start, end, text in segments:
|
||||
rated.sort(key=lambda x: x["total"], reverse=True)
|
||||
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]
|
||||
|
||||
print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}")
|
||||
print(f"\n🎬 Beste {len(selected)} Highlights nach Bewertung:")
|
||||
for clip in selected:
|
||||
print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
|
||||
print(f"🎙️ {clip['text'][:200]}...")
|
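The rating script expects the model to answer with one line per criterion, such as "Viralität: 7". A hedged sketch of that parsing step (the exact answer format is an assumption derived from the prompt template shown in the diff):

import re

def parse_scores(answer: str) -> dict:
    """Maps 'Viralität: 7' style lines to the column names used in the highlights table."""
    keys = {"viralität": "viralitaet", "emotionalität": "emotionalitaet",
            "witz": "witz", "provokation": "provokation"}
    values = {v: None for v in keys.values()}
    for line in answer.splitlines():
        m = re.match(r"\s*([A-Za-zÄÖÜäöü]+)\s*:\s*(\d+)", line)
        if m and m.group(1).lower() in keys:
            values[keys[m.group(1).lower()]] = int(m.group(2))
    return values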
409
src/text/segment_transcript.py
Normal file
@ -0,0 +1,409 @@
|
||||
#!/usr/bin/env python3
|
||||
# clip_selector_optimized.py — word-based text rebuild (no duplicates)
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import sqlite3
|
||||
import time
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import argparse
|
||||
import sys
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
from openai import OpenAI
|
||||
|
||||
# ── Projektwurzel in sys.path aufnehmen (dieses Skript kann z. B. unter src/text/ liegen)
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import TRANSCRIPTS_DIR, DB_PATH # zentrale Pfade
|
||||
|
||||
LOG_DIR = ROOT / "logs"
|
||||
LOG_DIR.mkdir(exist_ok=True, parents=True)
|
||||
|
||||
# === DEFAULTS (per CLI überschreibbar) ===
|
||||
DEFAULT_BLOCK_DURATION = 300.0 # Sek. pro Block
|
||||
DEFAULT_MIN_CLIP_LEN = 30.0 # konsistent mit Prompt
|
||||
DEFAULT_MAX_CLIP_LEN = 90.0
|
||||
|
||||
# === OPENAI-CLIENT (API-Key aus Env) ===
|
||||
API_KEY = os.getenv("OPENAI_API_KEY")
|
||||
if not API_KEY:
|
||||
raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung")
|
||||
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # bei Bedarf überschreiben
|
||||
client = OpenAI(api_key=API_KEY)
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Hilfsfunktionen
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def log_text(filename: str, content: str) -> None:
|
||||
(LOG_DIR / filename).write_text((content or "").strip(), encoding="utf-8")
|
||||
|
||||
def append_error_log(content: str) -> None:
|
||||
with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
|
||||
f.write(f"{datetime.now().isoformat()} {content}\n\n")
|
||||
|
||||
def extract_json(text: str) -> list:
|
||||
"""Nur für Debug: versucht JSON-Array aus beliebigem Text zu extrahieren."""
|
||||
txt = (text or "").strip()
|
||||
txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.IGNORECASE | re.DOTALL)
|
||||
m = re.search(r"\[\s*{.*?}\s*\]", txt, re.DOTALL)
|
||||
if not m:
|
||||
append_error_log(f"❌ Kein JSON-Array gefunden.\n{txt}")
|
||||
return []
|
||||
try:
|
||||
return json.loads(m.group(0))
|
||||
except Exception as e:
|
||||
append_error_log(f"❌ JSON-Fehler: {e}\n{txt}")
|
||||
return []
|
||||
|
||||
def get_json_snippets_for_clip(start: float, end: float, segment_json: List[Dict]) -> List[Dict]:
|
||||
"""halb-offenes Fenster [start, end)"""
|
||||
return [s for s in segment_json if not (float(s["end"]) <= start or float(s["start"]) >= end)]
|
||||
|
||||
def _norm_space(s: str) -> str:
|
||||
return re.sub(r"\s+", " ", (s or "").strip())
|
||||
|
||||
def explode_segments_to_words(segments: List[Dict]) -> List[Dict]:
|
||||
"""
|
||||
Baut eine globale Wortliste. Bevorzugt echte 'words' aus JSON,
|
||||
fällt ansonsten auf lineare Interpolation über Segmentdauer zurück.
|
||||
Ausgabe-Items: {idx, mid, text}
|
||||
"""
|
||||
words = []
|
||||
idx = 0
|
||||
for seg in sorted(segments, key=lambda s: (float(s["start"]), float(s["end"]))):
|
||||
s0, s1 = float(seg["start"]), float(seg["end"])
|
||||
txt = (seg.get("text") or "").strip()
|
||||
seg_words = seg.get("words") or []
|
||||
if seg_words:
|
||||
for w in seg_words:
|
||||
t = (w.get("text") or w.get("word") or "").strip()
|
||||
if not t:
|
||||
continue
|
||||
w0 = float(w["start"]); w1 = float(w["end"])
|
||||
words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": t})
|
||||
idx += 1
|
||||
else:
|
||||
toks = txt.split()
|
||||
n = len(toks)
|
||||
if n == 0:
|
||||
continue
|
||||
dur = max(1e-6, s1 - s0)
|
||||
for i, tok in enumerate(toks):
|
||||
w0 = s0 + (i / n) * dur
|
||||
w1 = s0 + ((i + 1) / n) * dur
|
||||
words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": tok})
|
||||
idx += 1
|
||||
return words
|
||||
|
||||
def build_text_strict_from_words(clip_start: float, clip_end: float, WORDS: List[Dict]) -> str:
|
||||
"""Nimmt jedes Wort genau einmal, wenn mid ∈ [start, end)."""
|
||||
sel = [w for w in WORDS if clip_start <= w["mid"] < clip_end]
|
||||
sel.sort(key=lambda w: w["idx"])
|
||||
return _norm_space(" ".join(w["text"] for w in sel))
|
||||
|
||||
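# Worked example for the two helpers above (invented numbers): a word with start=12.30 and
# end=12.70 has mid=12.50; for a clip window [10.0, 12.5) the test 10.0 <= 12.50 < 12.5 is
# False, so the word is counted in the next window starting at 12.5 instead; the half-open
# interval keeps each word in exactly one clip.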
def find_transcript_pair(base: Optional[str]) -> tuple[Path, Path, str]:
|
||||
"""
|
||||
Finde (timed.txt, segments.json) in TRANSCRIPTS_DIR.
|
||||
- Wenn base übergeben: benutzt {base}_timed.txt und {base}_segments.json.
|
||||
- Sonst: nimmt das lexikographisch erste *_timed.txt und leitet die JSON davon ab.
|
||||
"""
|
||||
if base:
|
||||
txt = TRANSCRIPTS_DIR / f"{base}_timed.txt"
|
||||
jsn = TRANSCRIPTS_DIR / f"{base}_segments.json"
|
||||
if not txt.exists():
|
||||
raise FileNotFoundError(f"Transkript nicht gefunden: {txt}")
|
||||
if not jsn.exists():
|
||||
raise FileNotFoundError(f"Segment-JSON nicht gefunden: {jsn}")
|
||||
return txt, jsn, base
|
||||
|
||||
# auto-detect
|
||||
candidates = sorted(TRANSCRIPTS_DIR.glob("*_timed.txt"))
|
||||
if not candidates:
|
||||
raise FileNotFoundError(f"Keine *_timed.txt in {TRANSCRIPTS_DIR} gefunden.")
|
||||
txt = candidates[0]
|
||||
stem = txt.stem.replace("_timed", "")
|
||||
jsn = TRANSCRIPTS_DIR / f"{stem}_segments.json"
|
||||
if not jsn.exists():
|
||||
raise FileNotFoundError(f"Gefundenes TXT: {txt.name}, aber JSON fehlt: {jsn.name}")
|
||||
return txt, jsn, stem
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# CLI
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="Selektiert Social-Media-taugliche Clips aus Transkripten (LLM-gestützt).")
|
||||
p.add_argument("--base", type=str, default=None,
|
||||
help="Basename der Transkriptdateien (z. B. 'testVideoShort' für *_timed.txt und *_segments.json).")
|
||||
p.add_argument("--block", type=float, default=DEFAULT_BLOCK_DURATION, help="Blocklänge in Sekunden für die Prompt-Bildung.")
|
||||
p.add_argument("--min", type=float, default=DEFAULT_MIN_CLIP_LEN, help="Minimale Clip-Länge (Sekunden).")
|
||||
p.add_argument("--max", type=float, default=DEFAULT_MAX_CLIP_LEN, help="Maximale Clip-Länge (Sekunden).")
|
||||
return p.parse_args()
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
args = parse_args()
|
||||
BLOCK_DURATION = float(args.block)
|
||||
MIN_CLIP_LEN = float(args.min)
|
||||
MAX_CLIP_LEN = float(args.max)
|
||||
|
||||
# --- Transkriptdateien finden ---
|
||||
TRANSCRIPT_PATH, SEGMENT_JSON_PATH, base = find_transcript_pair(args.base)
|
||||
print(f"📄 TXT : {TRANSCRIPT_PATH}")
|
||||
print(f"🧾 JSON: {SEGMENT_JSON_PATH}")
|
||||
|
||||
# === TRANSKRIPT EINLESEN (TXT) -> NUR für Blockbildung & Promptanzeige ===
|
||||
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
|
||||
segments_txt: List[Dict] = []
|
||||
for line in lines:
|
||||
m = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(?:[A-Z_0-9]+:)?\s*(.*)", line)
|
||||
if not m:
|
||||
continue
|
||||
start, end, text = m.groups()
|
||||
start, end = float(start), float(end)
|
||||
if end - start >= 2.0:
|
||||
segments_txt.append({"start": start, "end": end, "text": (text or "").strip()})
|
||||
|
||||
if not segments_txt:
|
||||
raise RuntimeError("🚫 Keine gültigen TXT-Segmente gefunden.")
|
||||
print(f"✅ {len(segments_txt)} gültige TXT-Segmente geladen.")
|
||||
|
||||
# === TRANSKRIPT EINLESEN (JSON) -> Quelle für DB-Text/Wörter ===
|
||||
segment_json_data = json.loads(SEGMENT_JSON_PATH.read_text(encoding="utf-8"))
|
||||
if not isinstance(segment_json_data, list) or not segment_json_data:
|
||||
raise RuntimeError("🚫 JSON-Segmente leer/ungültig.")
|
||||
print(f"✅ {len(segment_json_data)} JSON-Segmente geladen.")
|
||||
|
||||
# Globale Wörterliste einmal berechnen (bevor wir Clips bilden)
|
||||
WORDS = explode_segments_to_words(segment_json_data)
|
||||
print(f"🔤 Globale Wörter im Korpus: {len(WORDS)}")
|
||||
|
||||
# === BLÖCKE BILDEN (aus TXT) ===
|
||||
segments_txt.sort(key=lambda s: (s["start"], s["end"]))
|
||||
blocks, current_block, current_start = [], [], 0.0
|
||||
for seg in segments_txt:
|
||||
if not current_block:
|
||||
current_start = seg["start"]
|
||||
# Blockwechsel, wenn Dauer überschritten
|
||||
if seg["end"] - current_start > BLOCK_DURATION:
|
||||
blocks.append(current_block)
|
||||
current_block = []
|
||||
current_start = seg["start"]
|
||||
current_block.append(seg)
|
||||
if current_block:
|
||||
blocks.append(current_block)
|
||||
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION:.0f}s).")
|
||||
|
||||
# === KI: CLIP-AUSWAHL ===
|
||||
all_clips = []
|
||||
t0 = time.perf_counter()
|
||||
|
||||
for i, block in enumerate(blocks, start=1):
|
||||
if not block:
|
||||
continue
|
||||
print(f"\n🤖 Sende Block {i}/{len(blocks)} an {OPENAI_MODEL} …")
|
||||
block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
|
||||
|
||||
prompt = f"""
|
||||
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Social Media Clips eignen.
|
||||
Ein guter Clip:
|
||||
- ist abgeschlossen und verständlich
|
||||
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
|
||||
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
|
||||
- ist mindestens {MIN_CLIP_LEN:.0f} Sekunden lang
|
||||
Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.
|
||||
|
||||
Gib ein JSON-Objekt zurück im Format:
|
||||
{{
|
||||
"clips": [
|
||||
{{
|
||||
"start": float,
|
||||
"end": float,
|
||||
"summary": "Kurze Beschreibung des Inhalts"
|
||||
}}
|
||||
]
|
||||
}}
|
||||
|
||||
TRANSKRIPT:
|
||||
{block_text}
|
||||
""".strip()
|
||||
|
||||
log_text(f"block_prompt_{i:02d}.txt", prompt)
|
||||
|
||||
# --- robuster API-Call mit Schema (Root=object) und kleinem Retry ---
|
||||
import time as _time
|
||||
clips = []
|
||||
for attempt in range(3):
|
||||
try:
|
||||
resp = client.chat.completions.create(
|
||||
model=OPENAI_MODEL,
|
||||
messages=[{"role": "user", "content": prompt}],
|
||||
response_format={
|
||||
"type": "json_schema",
|
||||
"json_schema": {
|
||||
"name": "clips_payload",
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"clips": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"start": {"type": "number"},
|
||||
"end": {"type": "number"},
|
||||
"summary": {"type": "string"}
|
||||
},
|
||||
"required": ["start", "end", "summary"],
|
||||
"additionalProperties": False
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["clips"],
|
||||
"additionalProperties": False
|
||||
}
|
||||
}
|
||||
}
|
||||
)
|
||||
msg = resp.choices[0].message
|
||||
payload = getattr(msg, "parsed", None)
|
||||
if payload is None:
|
||||
payload = json.loads(msg.content)
|
||||
|
||||
clips = payload.get("clips", []) or []
|
||||
|
||||
try:
|
||||
log_text(f"block_output_{i:02d}.txt", json.dumps(payload, ensure_ascii=False, indent=2))
|
||||
except Exception:
|
||||
pass
|
||||
break
|
||||
except Exception as e:
|
||||
if attempt == 2:
|
||||
append_error_log(f"❌ OpenAI-Fehler Block {i}: {e}")
|
||||
print(f"❌ Fehler bei Block {i}: {e}")
|
||||
else:
|
||||
_time.sleep(1.5 * (attempt + 1))
|
||||
|
||||
print(f"✅ {len(clips)} Clips empfangen in Block {i}")
|
||||
|
||||
# --- Clips filtern & clampen ---
|
||||
for clip in clips:
|
||||
try:
|
||||
b_start, b_end = block[0]["start"], block[-1]["end"]
|
||||
start = max(b_start, min(float(clip["start"]), b_end))
|
||||
end = max(b_start, min(float(clip["end"]), b_end))
|
||||
dur = end - start
|
||||
if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
|
||||
clip["start"] = start
|
||||
clip["end"] = end
|
||||
clip["duration"] = round(dur, 2)
|
||||
all_clips.append(clip)
|
||||
except Exception as e:
|
||||
append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
|
||||
|
||||
elapsed = time.perf_counter() - t0
|
||||
avg = elapsed / i
|
||||
eta = max(0.0, avg * (len(blocks) - i))
|
||||
print(f"⏱️ Geschätzte Restzeit: {eta:.1f} s")
|
||||
|
||||
# --- Duplikate entfernen (auf 2 Dezimalen) ---
|
||||
dedup, seen = [], set()
|
||||
for c in all_clips:
|
||||
k = (round(c["start"], 2), round(c["end"], 2))
|
||||
if k in seen:
|
||||
continue
|
||||
seen.add(k)
|
||||
dedup.append(c)
|
||||
all_clips = dedup
|
||||
|
||||
print(f"\n📈 Gesamtclips vor DB-Insert: {len(all_clips)}")
|
||||
|
||||
# === DB SPEICHERN ===
|
||||
conn = sqlite3.connect(DB_PATH)
|
||||
cur = conn.cursor()
|
||||
|
||||
cur.execute("""
|
||||
CREATE TABLE IF NOT EXISTS highlights (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
file TEXT,
|
||||
start REAL,
|
||||
end REAL,
|
||||
duration REAL,
|
||||
text TEXT,
|
||||
summary TEXT,
|
||||
json_raw TEXT,
|
||||
viralitaet INTEGER,
|
||||
emotionalitaet INTEGER,
|
||||
witz INTEGER,
|
||||
provokation INTEGER,
|
||||
score_total INTEGER,
|
||||
UNIQUE(file,start,end)
|
||||
)
|
||||
""")
|
||||
|
||||
# --- Tabelle vor neuem Lauf komplett leeren ---
|
||||
cur.execute("DELETE FROM highlights")
|
||||
conn.commit() # Transaktion schließen, damit VACUUM außerhalb läuft
|
||||
|
||||
# VACUUM separat (optional)
|
||||
try:
|
||||
conn.execute("VACUUM") # oder: sqlite3.connect(DB_PATH).execute("VACUUM").close()
|
||||
print("🧹 Alte Highlights gelöscht und Datenbank komprimiert.")
|
||||
except sqlite3.OperationalError as e:
|
||||
print(f"⚠️ VACUUM übersprungen: {e}")
|
||||
|
||||
inserted = 0
|
||||
failed = 0
|
||||
|
||||
for clip in all_clips:
|
||||
try:
|
||||
start = float(clip["start"])
|
||||
end = float(clip["end"])
|
||||
duration = float(clip["duration"])
|
||||
summary = (clip.get("summary") or "").strip()
|
||||
|
||||
if end <= start or start < 0:
|
||||
raise ValueError("Ungültige Zeiten")
|
||||
|
||||
# JSON-Segmente (zur Nachvollziehbarkeit) + Wort-basierter Text (dopplerfrei)
|
||||
json_snippets = get_json_snippets_for_clip(start, end, segment_json_data)
|
||||
json_raw = json.dumps(json_snippets, ensure_ascii=False)
|
||||
|
||||
original_text = build_text_strict_from_words(start, end, WORDS)
|
||||
|
||||
cur.execute("""
|
||||
INSERT OR IGNORE INTO highlights (
|
||||
file, start, end, duration, text, summary, json_raw,
|
||||
viralitaet, emotionalitaet, witz, provokation, score_total
|
||||
)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL, NULL, NULL)
|
||||
""", (
|
||||
# 'file' = Basename (z. B. testVideoShort)
|
||||
Path(base).name,
|
||||
start, end, duration,
|
||||
original_text, summary, json_raw
|
||||
))
|
||||
if cur.rowcount > 0:
|
||||
inserted += 1
|
||||
except Exception as e:
|
||||
failed += 1
|
||||
append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
|
||||
|
||||
conn.commit()
|
||||
conn.close()
|
||||
|
||||
print("\n📊 Ergebnisse:")
|
||||
print(f" ✅ Highlights gespeichert: {inserted}")
|
||||
print(f" ❌ Fehlerhafte Clips: {failed}")
|
||||
print(f"📁 Logs: {LOG_DIR.resolve()}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
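For reference, the json_schema response_format above constrains the model output to a payload of the following shape (numbers and summaries invented):

example_payload = {
    "clips": [
        {"start": 312.4, "end": 371.9, "summary": "Abgeschlossene Anekdote mit Pointe"},
        {"start": 512.0, "end": 548.5, "summary": "Emotionaler Moment mit klarer Auflösung"},
    ]
}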
276
src/text/transcription.py
Normal file
@ -0,0 +1,276 @@
|
||||
#!/usr/bin/env python3
|
||||
# transcription_chunked_words.py — Whisper mit Wortzeitstempeln, doppler-sicher
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import List, Dict, Tuple, Optional
|
||||
|
||||
import ffmpeg
|
||||
import whisper
|
||||
|
||||
# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/text/)
|
||||
ROOT = Path(__file__).resolve().parents[2]
|
||||
sys.path.insert(0, str(ROOT))
|
||||
|
||||
from config import INPUT_DIR, TRANSCRIPTS_DIR # zentrale Pfade
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Utilities
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def probe_duration(path: Path) -> float:
|
||||
"""Ermittle die Videodauer in Sekunden (ffmpeg.probe)."""
|
||||
try:
|
||||
meta = ffmpeg.probe(str(path))
|
||||
except ffmpeg.Error as e:
|
||||
raise RuntimeError(f"ffmpeg.probe fehlgeschlagen für {path}: {e.stderr.decode('utf-8','ignore') if hasattr(e, 'stderr') else e}") from e
|
||||
|
||||
dur = meta.get("format", {}).get("duration")
|
||||
if dur is not None:
|
||||
return float(dur)
|
||||
|
||||
cand = 0.0
|
||||
for s in meta.get("streams", []) or []:
|
||||
d = s.get("duration")
|
||||
if d:
|
||||
cand = max(cand, float(d))
|
||||
if cand > 0:
|
||||
return cand
|
||||
raise RuntimeError(f"Konnte Videodauer nicht bestimmen: {path}")
|
||||
|
||||
def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float,float]]:
|
||||
"""Zerteile [0,total] in überlappende Intervalle."""
|
||||
if chunk_seconds <= 0:
|
||||
return [(0.0, total)]
|
||||
s, out = 0.0, []
|
||||
while s < total:
|
||||
e = min(s + chunk_seconds, total)
|
||||
out.append((s, e))
|
||||
if e >= total:
|
||||
break
|
||||
s = max(0.0, e - overlap)
|
||||
return out
|
||||
|
||||
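# Example of the chunking above (invented duration): make_chunks(130.0, 60.0, 2.0)
#   -> [(0.0, 60.0), (58.0, 118.0), (116.0, 130.0)]
# every chunk after the first starts 'overlap' seconds before its predecessor ended.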
def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None:
|
||||
"""Extrahiere [start,end] als Mono-16kHz-WAV."""
|
||||
(
|
||||
ffmpeg
|
||||
.input(str(src_video), ss=start, to=end)
|
||||
.output(
|
||||
str(out_wav),
|
||||
format="wav",
|
||||
acodec="pcm_s16le",
|
||||
ac=1,
|
||||
ar="16000",
|
||||
loglevel="error",
|
||||
)
|
||||
.overwrite_output()
|
||||
.run()
|
||||
)
|
||||
|
||||
def is_suspect(text: str) -> bool:
|
||||
"""Heuristik: leere/loopende/zweifelhafte Zeilen markieren."""
|
||||
t = (text or "").strip().lower()
|
||||
if not t:
|
||||
return True
|
||||
words = t.split()
|
||||
if not words:
|
||||
return True
|
||||
counts = {w: words.count(w) for w in set(words)}
|
||||
most_common = max(counts.values())
|
||||
return most_common / len(words) > 0.6 or most_common > 20
|
||||
|
||||
def merge_overlaps_keep_best(
|
||||
segments: List[Dict],
|
||||
max_gap: float = 0.15,
|
||||
min_dur: float = 0.30
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Zeitlich sortieren, kleine Gaps schließen. Bei Überlappung:
|
||||
- keine Text-Konkatenation
|
||||
- behalte das "bessere" Segment (längere Dauer, dann längerer Text)
|
||||
- words: vom "best" übernehmen (falls vorhanden)
|
||||
"""
|
||||
cleaned = []
|
||||
for s in segments:
|
||||
s0 = float(s["start"]); s1 = float(s["end"])
|
||||
txt = (s.get("text") or "").strip()
|
||||
if s1 - s0 >= min_dur and txt:
|
||||
cleaned.append({
|
||||
"start": s0, "end": s1,
|
||||
"text": txt,
|
||||
"words": s.get("words", [])
|
||||
})
|
||||
if not cleaned:
|
||||
return []
|
||||
|
||||
cleaned.sort(key=lambda x: (x["start"], x["end"]))
|
||||
out = [cleaned[0]]
|
||||
|
||||
def score(x: Dict) -> tuple:
|
||||
return (x["end"] - x["start"], len(x.get("text", "")))
|
||||
|
||||
for s in cleaned[1:]:
|
||||
m = out[-1]
|
||||
if s["start"] <= m["end"] + max_gap:
|
||||
best = s if score(s) > score(m) else m
|
||||
out[-1] = {
|
||||
"start": min(m["start"], s["start"]),
|
||||
"end": max(m["end"], s["end"]),
|
||||
"text": best["text"],
|
||||
"words": best.get("words", []),
|
||||
}
|
||||
else:
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
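# Illustrative merge (invented segments): (10.0, 14.0, "kurzer Satz") and
# (13.9, 19.0, "deutlich längerer Satz mit mehr Text") overlap within max_gap, so they
# collapse into one segment spanning (10.0, 19.0) that keeps the second segment's text
# and words, because it scores higher (longer duration, then longer text); no text is concatenated.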
def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True):
|
||||
"""Schreibe _timed.txt, _suspect_lines.txt und _segments.json."""
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
dash = "-" if ascii_dash else "–"
|
||||
|
||||
out_txt = out_dir / f"{base.stem}_timed.txt"
|
||||
out_sus = out_dir / f"{base.stem}_suspect_lines.txt"
|
||||
out_json = out_dir / f"{base.stem}_segments.json"
|
||||
|
||||
# TXT nur zur Ansicht
|
||||
with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus:
|
||||
for s in segments:
|
||||
line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n"
|
||||
f_txt.write(line)
|
||||
if is_suspect(s["text"]):
|
||||
f_sus.write(line)
|
||||
|
||||
# JSON für die Weiterverarbeitung (inkl. words)
|
||||
with open(out_json, "w", encoding="utf-8") as f_json:
|
||||
json.dump(segments, f_json, ensure_ascii=False, indent=2)
|
||||
|
||||
return out_txt, out_sus, out_json
|
||||
|
||||
def find_default_input() -> Optional[Path]:
|
||||
"""Nimm das erste Video aus INPUT_DIR, falls kein --input übergeben wurde."""
|
||||
exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3")
|
||||
for p in sorted(INPUT_DIR.iterdir()):
|
||||
if p.suffix.lower() in exts:
|
||||
return p
|
||||
return None
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# CLI
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(
|
||||
description="Chunked Whisper Transcription mit Wortzeitstempeln & doppler-sicherem Stitching."
|
||||
)
|
||||
p.add_argument("--input", type=Path, default=None, help=f"Eingabevideo/-audio. Default: erstes File in {INPUT_DIR}")
|
||||
p.add_argument("--outdir", type=Path, default=None, help=f"Ausgabeverzeichnis. Default: {TRANSCRIPTS_DIR}")
|
||||
p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper-Modell (tiny/base/small/medium/large)")
|
||||
p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. 'de') oder leer/None für Auto-Detect")
|
||||
p.add_argument("--chunk", type=float, default=60.0, help="Chunk-Länge in Sekunden (0 = ganzes File)")
|
||||
p.add_argument("--overlap", type=float, default=2.0, help="Overlap in Sekunden")
|
||||
p.add_argument("--min-dur", type=float, default=0.30, help="Mindest-Segmentdauer (Sekunden)")
|
||||
p.add_argument("--max-gap", type=float, default=0.15, help="Maximaler Zeit-Gap für Merge (Sekunden)")
|
||||
p.add_argument("--fp16", action="store_true", help="fp16 aktivieren (nur sinnvoll mit GPU)")
|
||||
return p.parse_args()
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Main
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
|
||||
# Whisper-Cache (damit Modelle lokal landen)
|
||||
os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache"))
|
||||
|
||||
args = parse_args()
|
||||
input_path = args.input or find_default_input()
|
||||
out_dir = args.outdir or TRANSCRIPTS_DIR
|
||||
|
||||
print("📁 Projekt-Root:", ROOT)
|
||||
print("📄 Input:", input_path if input_path else "—")
|
||||
if not input_path or not input_path.exists():
|
||||
raise FileNotFoundError(f"Kein gültiges Eingabefile gefunden. Lege ein Video/Audio in {INPUT_DIR} oder nutze --input.")
|
||||
|
||||
out_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
duration = probe_duration(input_path)
|
||||
print(f"🎬 Dauer: {duration:.2f}s")
|
||||
|
||||
chunks = make_chunks(duration, args.chunk, args.overlap)
|
||||
print(f"🔪 {len(chunks)} Chunks à {args.chunk:.1f}s mit {args.overlap:.1f}s Overlap")
|
||||
|
||||
# Whisper laden
|
||||
print(f"🧠 Lade Whisper-Modell: {args.model}")
|
||||
try:
|
||||
model = whisper.load_model(args.model)
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"Whisper-Modell '{args.model}' konnte nicht geladen werden. Installiert? (pip install openai-whisper)\n{e}") from e
|
||||
|
||||
all_segments: List[Dict] = []
|
||||
with TemporaryDirectory() as tmpdir_str:
|
||||
tmpdir = Path(tmpdir_str)
|
||||
for i, (start, end) in enumerate(chunks, 1):
|
||||
print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s")
|
||||
wav = tmpdir / f"chunk_{i:03d}.wav"
|
||||
extract_audio_segment(input_path, start, end, wav)
|
||||
|
||||
# Sprache: ''/none = Auto-Detect
|
||||
lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang
|
||||
|
||||
# Transkribieren mit Wortzeiten, ohne Cross-Chunk-Kontext
|
||||
result = model.transcribe(
|
||||
str(wav),
|
||||
language=lang,
|
||||
fp16=args.fp16,
|
||||
word_timestamps=True,
|
||||
condition_on_previous_text=False,
|
||||
temperature=0,
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
# Center-Cut: nur Mittelteil behalten (verhindert Grenz-Doppler)
|
||||
keep_start = start if i == 1 else start + args.overlap / 2.0
|
||||
keep_end = end if i == len(chunks) else end - args.overlap / 2.0
|
||||
|
||||
for seg in result.get("segments", []) or []:
|
||||
s0 = float(seg["start"]) + start
|
||||
s1 = float(seg["end"]) + start
|
||||
mid = (s0 + s1) / 2.0
|
||||
if not (keep_start <= mid < keep_end):
|
||||
continue
|
||||
|
||||
# Wörter mit absoluten Zeiten übernehmen
|
||||
words = []
|
||||
for w in (seg.get("words") or []):
|
||||
txt = (w.get("word") or w.get("text") or "").strip()
|
||||
if not txt:
|
||||
continue
|
||||
words.append({
|
||||
"start": float(w["start"]) + start,
|
||||
"end": float(w["end"]) + start,
|
||||
"text": txt
|
||||
})
|
||||
|
||||
all_segments.append({
|
||||
"start": s0,
|
||||
"end": s1,
|
||||
"text": (seg.get("text") or "").strip(),
|
||||
"words": words
|
||||
})
|
||||
|
||||
print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …")
|
||||
merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur)
|
||||
print(f"✅ Gemergte Segmente: {len(merged)}")
|
||||
|
||||
out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True)
|
||||
print(f"📝 TXT: {out_txt}")
|
||||
print(f"⚠️ SUSPECT: {out_sus}")
|
||||
print(f"💾 JSON: {out_json}")
|
||||
print("🎉 Fertig.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
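A typical run of the chunked transcription might look like this (the input path and model name are placeholders; without --input the first file in INPUT_DIR is used):

python src/text/transcription.py --input input/talk.mp4 --model small --chunk 60 --overlap 2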
88
src/text/transcription_with_speaker.py
Normal file
@ -0,0 +1,88 @@
|
||||
import os
|
||||
import json
|
||||
import ffmpeg
|
||||
import whisper
|
||||
import tempfile
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from pathlib import Path
|
||||
from pyannote.audio import Pipeline
|
||||
|
||||
# === HUGGING FACE TOKEN (für pyannote) ===
|
||||
HF_TOKEN = "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"
|
||||
|
||||
# === Torch Optimierung (optional) ===
|
||||
torch.set_float32_matmul_precision("medium")
|
||||
|
||||
# === Einstellungen ===
|
||||
PROJECT_ROOT = Path(__file__).resolve().parents[2]
|
||||
input_file = PROJECT_ROOT / "input" / "testVideoShort.mov"
|
||||
output_dir = PROJECT_ROOT / "transkripte"
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
output_txt = output_dir / f"{input_file.stem}_timed.txt"
|
||||
output_json = output_dir / f"{input_file.stem}_segments.json"
|
||||
|
||||
# === Video in Audio konvertieren ===
|
||||
print("🎞️ Extrahiere Audio ...")
|
||||
tmp_dir = Path(tempfile.mkdtemp())
|
||||
wav_file = tmp_dir / "audio.wav"
|
||||
ffmpeg.input(str(input_file)).output(
|
||||
str(wav_file),
|
||||
format="wav",
|
||||
acodec="pcm_s16le",
|
||||
ac=1,
|
||||
ar="16000",
|
||||
loglevel="error"
|
||||
).overwrite_output().run()
|
||||
|
||||
# === Transkription mit Whisper ===
|
||||
print("🧠 Starte Transkription mit Whisper ...")
|
||||
model = whisper.load_model("small")
|
||||
result = model.transcribe(
|
||||
str(wav_file),
|
||||
language="de",
|
||||
fp16=False,
|
||||
word_timestamps=False,
|
||||
condition_on_previous_text=True,
|
||||
temperature=0,
|
||||
verbose=False
|
||||
)
|
||||
segments = result["segments"]
|
||||
|
||||
# === Diarisation mit Pyannote ===
|
||||
print("🗣️ Starte Sprecheranalyse mit Pyannote (das dauert jetzt etwas) ...")
|
||||
pipeline = Pipeline.from_pretrained(
|
||||
"pyannote/speaker-diarization-3.1",
|
||||
use_auth_token=HF_TOKEN
|
||||
)
|
||||
pipeline.to(torch.device("mps")) # ⬅️ Apple GPU beschleunigen
|
||||
|
||||
diarization = pipeline(str(wav_file))
|
||||
|
||||
# === Sprecher zuordnen ===
|
||||
def assign_speakers_to_segments(segments, diarization):
|
||||
assigned = []
|
||||
for seg in tqdm(segments, desc="🎙️ Weise Sprecher zu"):
|
||||
speaker = "unknown"
|
||||
for turn, _, label in diarization.itertracks(yield_label=True):
|
||||
if turn.start <= seg["start"] <= turn.end:
|
||||
speaker = label
|
||||
break
|
||||
seg["speaker"] = speaker
|
||||
assigned.append(seg)
|
||||
return assigned
|
||||
|
||||
segments_with_speaker = assign_speakers_to_segments(segments, diarization)
|
||||
|
||||
# === Speichern als TXT
|
||||
with open(output_txt, "w", encoding="utf-8") as f:
|
||||
for seg in segments_with_speaker:
|
||||
line = f"[{seg['start']:.2f} – {seg['end']:.2f}] {seg['speaker'].upper()}: {seg['text'].strip()}\n"
|
||||
f.write(line)
|
||||
|
||||
# === Speichern als JSON
|
||||
with open(output_json, "w", encoding="utf-8") as f:
|
||||
json.dump(segments_with_speaker, f, ensure_ascii=False, indent=2)
|
||||
|
||||
print(f"✅ Transkript mit Sprecherinfos gespeichert unter:\n📄 {output_txt}\n📄 {output_json}")
|
@ -1 +0,0 @@
|
||||
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb
|
BIN
transkripte/.DS_Store
vendored
Binary file not shown.
@ -1 +0,0 @@
|
||||
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243
|