diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index e90ef19..0000000 Binary files a/.DS_Store and /dev/null differ diff --git a/.gitignore b/.gitignore index 382d186..8b0731e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,27 +1,108 @@ -# IDE & Cache +# ───────────────────────────── +# IDEs & System Files +# ───────────────────────────── .idea/ +.vscode/ __pycache__/ *.pyc .DS_Store +*.log -# Whisper Modelle & Cache +# ───────────────────────────── +# Cache / Modelle / Checkpoints +# ───────────────────────────── whisper-cache/ models/ *.pt +*.onnx +*.bin +*.safetensors -# Output/Temp Files +# ───────────────────────────── +# Datenbank / temporäre Dateien +# ───────────────────────────── +*.db +*.sqlite +logs/ +temp/ +tmp/ +*.tmp + +# ───────────────────────────── +# Transkripte / KI-Zwischenausgaben +# ───────────────────────────── +/data/transkripte/ +/transcripts/ +/outputs/ +/results/ +*_segments.json +*_timed.txt +*_suspect_lines.txt + +# ───────────────────────────── +# Video / Audio Outputs +# ───────────────────────────── *.mp4 *.mov -*.db +*.mkv *.wav -*.json -temp.* -logs/ +*.webm +*.mp3 -# Eingebettete Repos +# ───────────────────────────── +# Generierte Teil-/Ergebnis-Ordner +# ───────────────────────────── +/raw_clips/ +/face_combined/ +/face_crop_centers/ +/cropped/ +/subtitled/ +/segments/ +/highlight_clips/ +/output/ +/renders/ +/exports/ + +# ───────────────────────────── +# Eingebettete Repos oder externe Module +# ───────────────────────────── +/whisper.cpp/ +/text-clustering/ +/venv/ +/.env/ +/.env.local +.envrc +.env.* + +# ───────────────────────────── +# Backups / Sonstiges +# ───────────────────────────── +*.bak +*.old +*.orig +*.swp +*.zip +*.tar +*.gz + +# IDE/System +.idea/ +.DS_Store +__pycache__/ +*.pyc + +# Secrets/Umgebung +.env +config.py + +# Große/ausgeleitete Daten +data/ +transkripte/ +whisper-cache/ +models/ +*.db +*.mp4 *.mov *.mkv *.wav *.mp3 *.webm +logs/ tmp/ temp/ +# embedded / external text-clustering/ whisper.cpp/ - -# Video-Rohmaterial -*.mov - diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b8..0000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/BachlorArbeit.iml b/.idea/BachlorArbeit.iml deleted file mode 100644 index 106b3db..0000000 --- a/.idea/BachlorArbeit.iml +++ /dev/null @@ -1,11 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/dataSources.xml b/.idea/dataSources.xml deleted file mode 100644 index 34586b5..0000000 --- a/.idea/dataSources.xml +++ /dev/null @@ -1,35 +0,0 @@ - - - - - sqlite.xerial - true - org.sqlite.JDBC - jdbc:sqlite:$PROJECT_DIR$/segments.db - $ProjectFileDir$ - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar - - - - - sqlite.xerial - true - org.sqlite.JDBC - jdbc:sqlite:$PROJECT_DIR$/clips_openai.db - $ProjectFileDir$ - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar - - - file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar - - - - - \ No newline at end of file diff --git 
a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml deleted file mode 100644 index 105ce2d..0000000 --- a/.idea/inspectionProfiles/profiles_settings.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 1733c19..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 5be715f..0000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 6bdb7e2..0000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - - - - \ No newline at end of file diff --git a/README.md b/README.md index e69de29..fa355d2 100644 --- a/README.md +++ b/README.md @@ -0,0 +1,250 @@ +# Bachelorarbeit – Pipeline: Automatisierte Highlight-Erkennung & 9:16-Aufbereitung + +Diese Repository enthält eine vollständige, skriptbasierte Pipeline, um aus Langvideos automatisch Social‑Media‑taugliche 9:16‑Highlights zu erzeugen – inkl. Transkription, KI‑gestützter Clip‑Selektion, Gesichts‑/Mundaktivitätsanalyse, Auto‑Cropping, Untertitel (Word‑Caps) und finalem Export. + +## Inhaltsverzeichnis +- [Features](#features) +- [Ordnerstruktur](#ordnerstruktur) +- [Voraussetzungen](#voraussetzungen) +- [Installation](#installation) +- [Schnellstart (empfohlener Workflow)](#schnellstart-empfohlener-workflow) +- [Skripte & CLI](#skripte--cli) +- [Tipps & Troubleshooting](#tipps--troubleshooting) +- [Reproduzierbarkeit](#reproduzierbarkeit) +- [Lizenz / Danksagung](#lizenz--danksagung) + +--- + +## Features +- **Transkription mit Wort‑Zeitstempeln (Whisper, chunked ohne Grenz‑Doppler)** +- **LLM‑gestützte Clip‑Selektion** (Viralität/Emotionalität etc. in SQLite gespeichert) +- **Face‑Detection (YOLOv8‑face) & Mundaktivität (MediaPipe)** +- **Stabiles 9:16‑Auto‑Cropping** (Median + EMA, Deadband, Szenenschnitt‑Erkennung, Switch‑Cooldown) +- **Word‑Caps Untertitel** (ASS generiert, per ffmpeg eingebrannt) +- **Batch‑Export der Highlights** (MoviePy, Längen‑/Grenz‑Checks) + +## Ordnerstruktur +Die Pfade werden zentral in `config.py` definiert: +``` +PROJECT_ROOT/ +├─ data/ +│ ├─ input/ # Eingabevideo(s) +│ ├─ transkripte/ # Whisper-Outputs (*_segments.json, *_timed.txt ...) +│ ├─ segments/ # LLM-Clip-Auswahl, DB etc. +│ ├─ output/ +│ │ └─ raw_clips/ # Roh-Highlight-Clips (aus cutClips.py) +│ ├─ face_data_combined/ # faces.json je Clip (YOLO + MediaPipe) +│ └─ face_crop_centers/ # (optional) Center-Listen +├─ output/ +│ ├─ output_9x16_final/ # Auto-cropped 9:16 Videos +│ ├─ output_9x16_final_subbed_word/ # 9:16 mit eingebrannten Word-Caps +│ └─ debug/ # Debug-Previews/Artefakte +├─ models/ # YOLO-Weights (z. B. yolov8n-face.pt) +├─ whisper-cache/ # Whisper Modell-Cache +└─ src/... (optional projektspezifisch) +``` +> Beim ersten Start legt `config.py` fehlende Verzeichnisse automatisch an. + +## Voraussetzungen +**System‑Tools** +- `ffmpeg` (inkl. 
`ffprobe`) im `PATH` + +**Python** +- Python 3.10+ empfohlen +- Pakete (Beispiel): + `openai-whisper`, `torch`, `ffmpeg-python`, `ultralytics`, `opencv-python`, `mediapipe`, `moviepy`, `tqdm`, `numpy`, `regex` +- Optional/abhängig vom Codepfad: `pydub`, `scikit-image` (falls in Erweiterungen verwendet) + +**Modelle & Keys** +- **Whisper**: lädt Modelle automatisch in `whisper-cache/` (steuerbar via `WHISPER_MODEL`) +- **YOLOv8‑face**: `models/yolov8n-face.pt` (oder größeres Modell) +- **OpenAI API Key** (für `segment_transcript.py` & `rateCluster.py`): `export OPENAI_API_KEY=...` + - Default‑Modell ggf. per `export OPENAI_MODEL=gpt-4o` setzen + +## Installation +```bash +# 1) Python-Umgebung +python3 -m venv .venv +source .venv/bin/activate + +# 2) Systemabhängigkeiten +# ffmpeg installieren (Mac: brew install ffmpeg, Ubuntu: apt install ffmpeg) + +# 3) Python-Pakete (Beispiel) +pip install --upgrade pip +pip install openai-whisper torch ffmpeg-python ultralytics opencv-python mediapipe moviepy tqdm numpy regex + +# 4) Modelle/Dateien +# YOLO-Weights: +# Download yolov8n-face.pt → ./models/yolov8n-face.pt +# API Key für LLM: +export OPENAI_API_KEY="sk-..." +export OPENAI_MODEL="gpt-4o" +``` + +## Schnellstart (empfohlener Workflow) +1) **Eingabe ablegen** + Lege dein Langvideo in `data/input/` (z. B. `meinvideo.mp4`). + +2) **Transkription (Whisper, chunked & doppler-sicher)** +```bash +python transcription.py --input data/input/meinvideo.mp4 --model small --lang de +``` + → erzeugt `*_segments.json` + `*_timed.txt` in `data/transkripte/`. + +3) **Clips mit LLM selektieren & in DB speichern** +```bash +export OPENAI_API_KEY="..."; export OPENAI_MODEL="gpt-4o" +python segment_transcript.py --base meinvideo --block 60 --min 6.0 --max 30.0 +``` + → schreibt Clips in SQLite (`data/clips_openai.db` o. ä.) + +4) **Highlights aus dem Originalvideo schneiden** +```bash +python cutClips.py --file meinvideo.mp4 --limit 10 --order score +``` + → exportiert `highlight_*.mp4` nach `data/output/raw_clips/` + +5) **Face‑Detection + Mundaktivität** +```bash +python main_detect_faces.py --model models/yolov8n-face.pt --input-dir data/output/raw_clips --output-dir data/face_data_combined --frame-skip 1 --downscale 0.5 +``` + +6) **Targets je Frame bauen (Zentren/Größe glätten)** +```bash +python make_segments.py --pattern "highlight_*.mp4" --fps 0 --smooth 7 --overwrite +``` + +7) **9:16 Auto‑Crop anwenden** +```bash +python main_apply_crop.py --pattern "highlight_*.mp4" --median 7 --ema 0.5 --deadband 16 --cut_detect --mux_audio --overwrite +``` + → fertige 9:16‑Clips in `output/output_9x16_final/` + +8) **Word‑Caps Untertitel einbrennen (optional)** +```bash +python add_subtitles.py --clips_dir output/output_9x16_final --out_dir output/output_9x16_final_subbed_word --model small --limit 20 +``` + → fertige Videos mit eingebrannten Word‑Caps in `output/output_9x16_final_subbed_word/` + +> 💡 Du kannst viele Parameter (Fensterbreiten, Deadband, Erkennungsschwellen, Limits) über die CLI anpassen. + +## Skripte & CLI +### `transcription.py` +Chunked‑Transkription mit Wortzeitstempeln. +``` +--input PATH # Eingabevideo/-audio (Default: erstes File in data/input/) +--outdir PATH # Ausgabeverzeichnis (Default: data/transkripte/) +--model NAME # Whisper-Modell (tiny/base/small/medium/large; env: WHISPER_MODEL) +--lang CODE # Sprachcode (z. B. 
de) oder leer/None für Auto-Detect +--chunk FLOAT # Chunk-Länge in s (Default 60) +--overlap FLOAT # Überlappung in s (Default 2.0) +--min-dur FLOAT # Mindest-Segmentdauer (s) +--max-gap FLOAT # Max. Zeit-Gap beim Mergen (s) +--fp16 # Nur sinnvoll mit GPU +``` + +### `segment_transcript.py` +LLM‑Selektion & Speichern in SQLite. +``` +--base STR # Basename der Transkriptdateien (z. B. 'meinvideo') +--block FLOAT # Blocklänge s für den Prompt +--min FLOAT # minimale Clip-Länge s +--max FLOAT # maximale Clip-Länge s +# env: OPENAI_API_KEY, OPENAI_MODEL (z. B. gpt-4o) +``` + +### `cutClips.py` +Schneidet ausgewählte Highlights als Einzelclips. +``` +--file NAME # Name der Input-Datei in data/input (Default: erstes Video) +--limit INT # Anzahl zu exportierender Clips (Default 10) +--order {score,start} # Sortierung: Score (absteigend) oder Startzeit +``` + +### `main_detect_faces.py` +YOLOv8‑face + MediaPipe → `faces.json` pro Clip. +``` +--input-dir PATH # Default: data/output/raw_clips +--output-dir PATH # Default: data/face_data_combined +--model PATH # YOLOv8-face Weights (Default: models/yolov8n-face.pt) +--conf-thresh FLOAT # Default 0.35 +--frame-skip INT # z. B. 1 = jeden Frame, 2 = jeden von zwei ... +--downscale FLOAT # Frame-Downscale vor YOLO (0..1, z. B. 0.5) +--expansion FLOAT # Margin Pass 1 (relativ) +--expansion2 FLOAT # Margin Pass 2 (relativ) +--min-crop INT # minimale Croplänge (px) +--faces-upscale INT # min. Kantenlänge für FaceMesh (kleine Crops hochskalieren) +--imgsz INT # YOLO input size (Default 448) +--max-det INT # Max Detects / Frame +--use-refine # MediaPipe refine_landmarks aktivieren +``` + +### `make_segments.py` +Erzeugt `*_target_by_frame.json` (Zentren+Side pro Frame) aus Face/Center‑Daten. +``` +--pattern STR # Dateimuster in raw_clips (Default: highlight_*.mp4) +--fps FLOAT # FPS erzwingen (0 = aus Video lesen) +--smooth INT # MA-Fensterbreite (ungerade) +--overwrite # bestehende target_by_frame.json überschreiben +``` + +### `main_apply_crop.py` +Wendet 9:16‑Crop mit Glättung/Szenenschnitt an. +``` +--pattern STR # Dateimuster in raw_clips (Default: *.mp4) +--out_w INT # Output-Breite (Default 1080) +--out_h INT # Output-Höhe (Default 1920) +--zoom_pad FLOAT # Zoom-Pad (0..1) +--median INT # Median-Fenster (>=3, ungerade) +--ema FLOAT # EMA-Alpha (0..1) +--deadband FLOAT # Totband in Pixel +--switch_cd INT # Cooldown-Frames nach Trackwechsel +--cut_detect # Szenenschnitt-Erkennung aktivieren +--cut_corr FLOAT # Schwellwert Korrelation (0..1) +--cut_cd INT # Cooldown-Frames nach Cut +--mux_audio # Original-Audio unterlegen +--debug # Debug-Overlay anzeigen +--debug_scale FLOAT # Debug-Preview skaliert rendern +--overwrite # vorhandene Ausgaben überschreiben +``` + +### `add_subtitles.py` +Generiert Word‑Caps mit Whisper & brennt sie ein. +``` +--clips_dir PATH # Quelle (Default: output/output_9x16_final) +--out_dir PATH # Ziel (Default: output/output_9x16_final_subbed_word) +--pattern STR # z. B. *.mp4 +--limit INT # Nur die ersten N Clips +--model NAME # Whisper-Modell (tiny/base/small/medium/large) +--lang CODE # Sprachcode oder Auto +``` + +### `rateCluster.py` (optional) +Lässt LLM Scores (Viralität, Emotion, Humor, Provokation) nachtragen. +> Modelliere Standard‑Modell via `OPENAI_MODEL` (z. B. `gpt-4o`). + +--- + +## Tipps & Troubleshooting +- **Modelle/Performance** + - CPU‑only ist möglich (Whisper/YOLO langsamer). Auf Apple Silicon wird automatisch **MPS** genutzt; auf NVIDIA **CUDA**. 
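    Zur Einordnung eine minimale Skizze der Geräteauswahl (setzt nur ein installiertes `torch` voraus und entspricht sinngemäß der Logik in `main_detect_faces.py`):

    ```python
    import torch

    def pick_device() -> str:
        # Bevorzugt Apple-GPU (MPS), dann NVIDIA (CUDA), sonst CPU.
        if torch.backends.mps.is_available():
            return "mps"
        if torch.cuda.is_available():
            return "cuda"
        return "cpu"
    ```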
+ - `--frame-skip` und `--downscale` in `main_detect_faces.py` beschleunigen die Face‑Detection deutlich. +- **ffmpeg‑Muxing prüfen** (`main_apply_crop.py --mux_audio`): Falls Ton fehlt, `ffmpeg`-Installation checken. Rückgabecode im Log prüfen. +- **Fehlende Dateien** + - Kein Input? → `data/input/` prüfen. + - Fehlende Transkript‑Paare? → `*_timed.txt` und `*_segments.json` müssen existieren (aus `transcription.py`). + - Fehlende Faces? → Pfad zu `models/yolov8n-face.pt` korrekt? +- **Datenbank** + - Highlights liegen in SQLite (siehe `config.py`: `DB_PATH`). Bei Wiederholungen kann ein `DELETE FROM highlights; VACUUM;` sinnvoll sein. +- **Cache/Verzeichnisse** + - Whisper‑Cache via `XDG_CACHE_HOME` → `whisper-cache/` neben dem Projekt. Speicherplatz beachten. + +## Reproduzierbarkeit +- Lege eine `requirements.txt` mit exakten Versionen an (Freeze deiner funktionierenden Umgebung). +- Dokumentiere verwendete **Modell‑Versionsstände** (YOLO Weights, Whisper‑Modellgröße, OPENAI_MODEL). +- Fixiere Random‑Seeds, falls nötig (hier meist deterministisch durch externe Modelle/Bibliotheken). + +## Lizenz / Danksagung +- Verwendung von **OpenAI Whisper**, **Ultralytics YOLOv8**, **MediaPipe**, **OpenCV**, **MoviePy**, **ffmpeg**. +- Die jeweiligen Lizenzen der Bibliotheken beachten. diff --git a/code/text/cutClips.py b/code/text/cutClips.py deleted file mode 100644 index a58331e..0000000 --- a/code/text/cutClips.py +++ /dev/null @@ -1,38 +0,0 @@ -from moviepy.video.io.VideoFileClip import VideoFileClip -from pathlib import Path -import sqlite3 - -# === Setup === -input_video = Path("input/testVideoShort.mov") -output_dir = Path("output") -output_dir.mkdir(parents=True, exist_ok=True) - -# === SQLite DB lesen === -db_path = "clips_openai.db" -conn = sqlite3.connect(db_path) -cursor = conn.cursor() - -# Nur die Top 10 Clips mit höchstem score_total -cursor.execute(""" - SELECT start, end, text - FROM highlights - ORDER BY score_total DESC - LIMIT 10 -""") -highlights = cursor.fetchall() - -# === Video laden === -video = VideoFileClip(str(input_video)) - -# === Clips schneiden === -for i, (start, end, text) in enumerate(highlights): - output_file = output_dir / f"highlight_{i+1}.mp4" - end = min(end, video.duration) # Sicherstellen, dass das Ende nicht über das Video hinausgeht - print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}") - clip = video.subclipped(start, end) - clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac") - -# === Cleanup === -conn.close() -video.close() -print("✅ Top 10 Clips exportiert.") diff --git a/code/text/segment_transcript.py b/code/text/segment_transcript.py deleted file mode 100644 index d8eba8b..0000000 --- a/code/text/segment_transcript.py +++ /dev/null @@ -1,196 +0,0 @@ -import json -import sqlite3 -import re -from pathlib import Path -from openai import OpenAI -from datetime import datetime -import time -import nltk - -nltk.download("punkt") - -# === SETTINGS === -TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt") -DB_PATH = Path("clips_openai.db") -LOG_DIR = Path("logs") -LOG_DIR.mkdir(exist_ok=True) -BLOCK_DURATION = 300 -MIN_CLIP_LEN = 5 -MAX_CLIP_LEN = 90 - -client = OpenAI(api_key="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") - -# === HILFSFUNKTIONEN === -def log_text(filename, content): - (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8") - -def 
append_error_log(content): - with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f: - f.write(content + "\n\n") - -def extract_json(text): - match = re.search(r"\[.*\]", text.strip(), re.DOTALL) - if match: - try: - return json.loads(match.group()) - except Exception as e: - append_error_log(f"❌ JSON-Fehler: {e}\n{text}") - return [] - -def get_original_text(clip, segments, debug=False): - texts = [] - used_segments = [] - for s in segments: - # Überschneidung: Segment und Clip teilen sich Zeit - if not (s["end"] < clip["start"] or s["start"] > clip["end"]): - texts.append(s["text"]) - used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}") - if debug: - print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" + - "\n".join(used_segments)) - return " ".join(texts).strip() - -# === TRANSKRIPT EINLESEN === -lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines() -segments = [] -for line in lines: - match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line) - if match: - start, end, text = match.groups() - start = float(start) - end = float(end) - if end - start >= 2.0: - segments.append({"start": start, "end": end, "text": text.strip()}) - -if not segments: - raise RuntimeError("🚫 Keine gültigen Segmente gefunden.") -print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.") - -# === BLÖCKE BILDEN -blocks = [] -current_block = [] -current_start = 0.0 -for seg in segments: - if seg["end"] - current_start > BLOCK_DURATION: - blocks.append(current_block) - current_block = [] - current_start = seg["start"] - current_block.append(seg) -if current_block: - blocks.append(current_block) -print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).") - -# === KI: CLIP-AUSWAHL -all_clips = [] -start_time = time.perf_counter() - -for i, block in enumerate(blocks): - if not block: - continue - - print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...") - - block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block]) - prompt = f""" -Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen. -Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann. - -Ein guter Clip: -- ist abgeschlossen und verständlich -- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment -- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline -- ist **mindestens 30 Sekunden lang** - -Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden. 
- -Gib ein valides JSON-Array zurück im Format: -[ - {{ - "start": float, - "end": float, - "summary": "Kurze Beschreibung des Inhalts" - }} -] - -TRANSKRIPT: -{block_text} -""" - log_text(f"block_prompt_{i+1}.txt", prompt) - - try: - response = client.chat.completions.create( - model="gpt-4o", - messages=[{"role": "user", "content": prompt}], - temperature=0.4 - ) - raw = response.choices[0].message.content - log_text(f"block_output_{i+1}.txt", raw) - clips = extract_json(raw) - - print(f"✅ {len(clips)} Clips empfangen in Block {i+1}") - - for clip in clips: - try: - dur = float(clip["end"]) - float(clip["start"]) - if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN: - clip["duration"] = round(dur, 2) - all_clips.append(clip) - except Exception as e: - append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}") - - print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}") - - # ETA berechnen - elapsed = time.perf_counter() - start_time - avg_time = elapsed / (i + 1) - eta = avg_time * (len(blocks) - (i + 1)) - print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden") - - except Exception as e: - append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}") - print(f"❌ Fehler bei Block {i+1}: {e}") - -# === DB SPEICHERN -conn = sqlite3.connect(DB_PATH) -cur = conn.cursor() -cur.execute("DROP TABLE IF EXISTS segments") -cur.execute(""" -CREATE TABLE segments ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file TEXT, - start REAL, - end REAL, - duration REAL, - text TEXT, - summary TEXT -) -""") - -inserted = 0 -failed = 0 -for clip in all_clips: - try: - start = float(clip["start"]) - end = float(clip["end"]) - duration = float(clip["duration"]) - summary = clip.get("summary", "") - # debug=True für print aller Segment-Texte pro Clip - original_text = get_original_text(clip, segments, debug=False) - if end <= start or start < 0: - raise ValueError("Ungültige Zeiten") - cur.execute( - "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)", - (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip()) - ) - inserted += 1 - except Exception as e: - failed += 1 - append_error_log(f"❌ DB-Fehler: {clip}\n{e}") - -conn.commit() -conn.close() - -print("\n📊 Ergebnisse:") -print(f" ✅ Clips gespeichert: {inserted}") -print(f" ❌ Fehlerhafte Clips: {failed}") -print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}") diff --git a/code/text/transcription.py b/code/text/transcription.py deleted file mode 100644 index 82ee81d..0000000 --- a/code/text/transcription.py +++ /dev/null @@ -1,108 +0,0 @@ -# transcription_chunked.py -import whisper -from pathlib import Path -import os -import json -import ffmpeg -import tempfile - -# === Einstellungen === -input_file = Path("input/testVideoShort.mov") -output_dir = Path("transkripte") -output_dir.mkdir(parents=True, exist_ok=True) - -output_txt = output_dir / f"{input_file.stem}_timed.txt" -output_json = output_dir / f"{input_file.stem}_segments.json" -suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt" - -CHUNKS = 4 # Anzahl Chunks (anpassen!) 
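# Hinweis zur Chunk-Logik: Jeder Chunk deckt rund duration/CHUNKS Sekunden ab und wird an den Rändern um
# OVERLAP Sekunden erweitert (der Start erst ab dem zweiten Chunk). Nach der Transkription werden die
# Segmentzeiten um den jeweiligen chunk_start verschoben; Duplikate aus den Überlappungsbereichen werden
# später über den Hash (gerundeter Start, gerundetes Ende, Text) entfernt.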
-OVERLAP = 2.0 # Sekunden Überlappung - -os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache") - -probe = ffmpeg.probe(str(input_file)) -duration = float(probe["format"]["duration"]) -print(f"🎥 Videolänge: {duration:.2f} Sekunden") - -def extract_audio_chunk(start_time, duration, output_path): - ffmpeg.input(str(input_file), ss=start_time, t=duration).output( - str(output_path), - format="wav", - acodec="pcm_s16le", - ac=1, - ar="16000", - loglevel="error" - ).overwrite_output().run() - -def is_suspect(text): - words = text.strip().lower().split() - if not words: - return True - most_common = max([words.count(w) for w in set(words)]) - return most_common / len(words) > 0.6 or most_common > 20 - -tmp_dir = Path(tempfile.mkdtemp()) -all_segments = [] - -print(f"✂️ Teile Audio in {CHUNKS} Chunks ...") -for i in range(CHUNKS): - chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0) - chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP) - chunk_dur = chunk_end - chunk_start - chunk_file = tmp_dir / f"chunk_{i}.wav" - print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s") - extract_audio_chunk(chunk_start, chunk_dur, chunk_file) - - print(f"🧠 Transkribiere Chunk {i+1} ...") - model = whisper.load_model("small") # Wechsel zu "medium" oder "large" falls gewünscht - result = model.transcribe( - str(chunk_file), - language="de", - fp16=False, - word_timestamps=False, - condition_on_previous_text=True, - temperature=0, - verbose=False - ) - - segments = result["segments"] - # Zeitversatz für den aktuellen Chunk hinzufügen - offset = chunk_start - for seg in segments: - seg["start"] += offset - seg["end"] += offset - all_segments.extend(segments) - -# === Sortiere und filtere doppelte/überlappende Segmente -all_segments.sort(key=lambda x: x["start"]) - -def segment_hash(seg): - return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower()) - -unique_segments = [] -seen = set() -for seg in all_segments: - h = segment_hash(seg) - if h not in seen: - seen.add(h) - unique_segments.append(seg) - -print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.") - -with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus: - for seg in unique_segments: - start = seg["start"] - end = seg["end"] - text = seg["text"].strip() - line = f"[{start:.2f} – {end:.2f}] {text}\n" - f.write(line) # IMMER ins Haupttranskript! - if is_suspect(text): - f_sus.write(line) - - -print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}") -print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}") - -with open(output_json, "w", encoding="utf-8") as f: - json.dump(unique_segments, f, ensure_ascii=False, indent=2) -print(f"💾 Segmentdaten gespeichert unter: {output_json}") diff --git a/main.py b/main.py new file mode 100644 index 0000000..5bbd77c --- /dev/null +++ b/main.py @@ -0,0 +1,233 @@ +#!/usr/bin/env python3 +"""Run the full Bachelor pipeline end-to-end with timing, errors, and flexible flags. 
+ +Steps: + 1) transcription.py → Whisper transcripts (segments + timed words) + 2) segment_transcript.py → LLM selects highlight candidates → SQLite + 3) cutClips.py → export highlight_*.mp4 (raw clips) + 4) main_detect_faces.py → YOLO + MediaPipe → faces.json per clip + 5) make_segments.py → *_target_by_frame.json (center+side per frame) + 6) main_apply_crop.py → 9:16 crop with smoothing & optional audio mux + 7) rateCluster.py → (optional) LLM scoring (virality, emotion, ...) + 8) add_subtitles.py → (optional) word-cap subtitles burned in + +Usage examples: + python main.py --input data/input/meinvideo.mp4 --limit 10 --openai-model gpt-4o + python main.py --no-rate --no-subs +""" + +from __future__ import annotations +import argparse +import os +import sys +import subprocess +import time +from datetime import datetime +from pathlib import Path + +# --- Import project config --- +try: + from config import ( + PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR, + WHISPER_CACHE_DIR + ) +except Exception: + PROJECT_ROOT = Path(__file__).resolve().parent + sys.path.insert(0, str(PROJECT_ROOT)) + from config import ( + PROJECT_ROOT, INPUT_DIR, RAW_CLIPS_DIR, CROPPED_DIR, SUBTITLED_DIR, + WHISPER_CACHE_DIR + ) + +LOGS_DIR = PROJECT_ROOT / "logs" +LOGS_DIR.mkdir(parents=True, exist_ok=True) + +# --- korrekte Pfade zu den Skripten --- +SCRIPTS = { + "transcription": str(PROJECT_ROOT / "src" / "text" / "transcription.py"), + "segment_transcript": str(PROJECT_ROOT / "src" / "text" / "segment_transcript.py"), + "cutClips": str(PROJECT_ROOT / "src" / "text" / "cutClips.py"), + "detect_faces": str(PROJECT_ROOT / "src" / "reformat" / "main_detect_faces.py"), + "make_segments": str(PROJECT_ROOT / "src" / "reformat" / "make_segments.py"), + "apply_crop": str(PROJECT_ROOT / "src" / "reformat" / "main_apply_crop.py"), + "rateCluster": str(PROJECT_ROOT / "src" / "text" / "rateCluster.py"), + "add_subtitles": str(PROJECT_ROOT / "src" / "subtitles" / "add_subtitles.py"), +} + +def shlex_join(cmd): + return " ".join(str(c) for c in cmd) + +def run_step(cmd: list[str], name: str, env: dict[str, str] | None = None) -> float: + """Run a subprocess step, raise on error, return duration in seconds.""" + t0 = time.perf_counter() + print(f"\n===== {name} =====") + print(" ", shlex_join(cmd)) + cp = subprocess.run(cmd, env=env) + dt = time.perf_counter() - t0 + if cp.returncode != 0: + print(f"❌ Fehler in {name} (Exit {cp.returncode}) nach {dt:.2f}s") + print(" → Prüfe das Logfile oben für Details und stelle sicher, dass Abhängigkeiten installiert sind:") + print(" - ffmpeg/ffprobe im PATH") + print(" - Python-Pakete: openai-whisper, torch, ffmpeg-python, ultralytics, opencv-python, mediapipe, moviepy, tqdm, numpy") + print(" - OPENAI_API_KEY gesetzt (für LLM-Schritte)") + raise SystemExit(cp.returncode) + print(f"✅ {name} in {dt:.2f}s") + return dt + +def infer_base_from_input(input_path: Path) -> str: + return input_path.stem + +def default_input() -> Path | None: + if not INPUT_DIR.exists(): + return None + for p in sorted(INPUT_DIR.iterdir()): + if p.suffix.lower() in {".mp4", ".mov", ".mkv", ".m4v", ".mp3", ".wav"}: + return p + return None + +def main(): + ap = argparse.ArgumentParser(description="Bachelor Pipeline Runner") + ap.add_argument("--input", type=str, default=None, help="Pfad zu Eingabedatei (Default: erstes File in data/input)") + ap.add_argument("--limit", type=int, default=10, help="Anzahl Highlights (cutClips)") + ap.add_argument("--whisper-model", type=str, 
default=os.getenv("WHISPER_MODEL", "small")) + ap.add_argument("--lang", type=str, default=None, help="Sprachcode (z. B. de)") + ap.add_argument("--openai-model", type=str, default=os.getenv("OPENAI_MODEL", "gpt-4o")) + ap.add_argument("--pattern", type=str, default="highlight_*.mp4") + ap.add_argument("--overwrite", action="store_true") + ap.add_argument("--no-rate", action="store_true") + ap.add_argument("--no-subs", action="store_true") + ap.add_argument("--no-detect", action="store_true") + ap.add_argument("--no-make", action="store_true") + ap.add_argument("--no-apply", action="store_true") + ap.add_argument("--logfile", type=str, default=None) + args = ap.parse_args() + + os.chdir(PROJECT_ROOT) + + env = os.environ.copy() + env.setdefault("OPENAI_MODEL", args.openai_model) + env.setdefault("XDG_CACHE_HOME", str(WHISPER_CACHE_DIR)) + + if not env.get("OPENAI_API_KEY"): + print("⚠️ OPENAI_API_KEY ist nicht gesetzt – LLM-Schritte könnten fehlschlagen.") + + # Input-Datei bestimmen + if args.input: + input_path = Path(args.input) + if not input_path.is_file(): + candidate = INPUT_DIR / args.input + if candidate.is_file(): + input_path = candidate + else: + raise SystemExit(f"Input nicht gefunden: {args.input}") + else: + picked = default_input() + if not picked: + raise SystemExit(f"Kein Input in {INPUT_DIR} gefunden. Bitte --input setzen.") + input_path = picked + + base = infer_base_from_input(input_path) + print(f"📥 Input: {input_path}") + print(f"🔤 Whisper: {args.whisper_model} | 🌐 LLM: {env.get('OPENAI_MODEL')}") + print(f"🧩 Base: {base}") + + # Logfile + if args.logfile: + log_path = Path(args.logfile) + else: + log_path = LOGS_DIR / f"run_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log" + + # Tee: schreibe in Datei UND Konsole + try: + log_fh = open(log_path, "w", encoding="utf-8") + class _Tee: + def __init__(self, *streams): self.streams = streams + def write(self, data): + for s in self.streams: + try: s.write(data); s.flush() + except Exception: pass + def flush(self): + for s in self.streams: + try: s.flush() + except Exception: pass + sys.stdout = _Tee(sys.__stdout__, log_fh) + sys.stderr = _Tee(sys.__stderr__, log_fh) + print(f"📝 Logfile: {log_path}") + except Exception: + print(f"⚠️ Konnte Logfile nicht initialisieren: {log_path}") + + durations = [] + started = datetime.now() + print(f"🚀 Start: {started:%Y-%m-%d %H:%M:%S}") + + try: + # 1) Transcription + t_args = [sys.executable, SCRIPTS["transcription"], "--input", str(input_path), "--model", args.whisper_model] + if args.lang: t_args += ["--lang", args.lang] + durations.append(("Transcription", run_step(t_args, "Transcription", env=env))) + + # 2) LLM Segmentierung + st_args = [sys.executable, SCRIPTS["segment_transcript"], "--base", base] + durations.append(("Segment Transcript", run_step(st_args, "Segment Transcript", env=env))) + + # 3) Highlights schneiden + cut_filename = input_path.name + cc_args = [sys.executable, SCRIPTS["cutClips"], "--file", cut_filename, "--limit", str(args.limit)] + durations.append(("Cut Clips", run_step(cc_args, "Cut Clips", env=env))) + + # 4) Faces + if not args.no_detect: + df_args = [sys.executable, SCRIPTS["detect_faces"]] + durations.append(("Detect Faces", run_step(df_args, "Detect Faces", env=env))) + else: + print("⏭️ Detect Faces übersprungen.") + + # 5) Make Targets + if not args.no_make: + ms_args = [sys.executable, SCRIPTS["make_segments"], "--pattern", args.pattern] + durations.append(("Make Targets", run_step(ms_args, "Make Targets", env=env))) + else: + print("⏭️ Make 
Targets übersprungen.") + + # 6) Crop + if not args.no_apply: + ac_args = [sys.executable, SCRIPTS["apply_crop"], "--pattern", args.pattern, "--mux_audio"] + if args.overwrite: ac_args.append("--overwrite") + durations.append(("Apply Crop", run_step(ac_args, "Apply Crop", env=env))) + else: + print("⏭️ Apply Crop übersprungen.") + + # 7) Bewertung + if not args.no_rate: + rc_args = [sys.executable, SCRIPTS["rateCluster"]] + durations.append(("Rate Clusters", run_step(rc_args, "Rate Clusters", env=env))) + else: + print("⏭️ Rate Clusters übersprungen.") + + # 8) Untertitel + if not args.no_subs: + as_args = [sys.executable, SCRIPTS["add_subtitles"]] + durations.append(("Subtitles", run_step(as_args, "Subtitles", env=env))) + else: + print("⏭️ Subtitles übersprungen.") + + except KeyboardInterrupt: + print("\n⛔ Abgebrochen (Ctrl+C).") + finally: + finished = datetime.now() + total = sum(dt for _, dt in durations) + print("\n======================== ZUSAMMENFASSUNG ============================") + for name, dt in durations: + print(f"✅ {name:<24} {dt:7.2f}s") + print("---------------------------------------------------------------------") + print(f"⏱️ Gesamtdauer: {total:.2f}s") + print(f"🕒 Start : {started:%Y-%m-%d %H:%M:%S}") + print(f"🕒 Ende : {finished:%Y-%m-%d %H:%M:%S}") + print(f"📂 Output:") + print(f" Raw Clips : {RAW_CLIPS_DIR}") + print(f" 9:16 : {CROPPED_DIR}") + print(f" Subbed : {SUBTITLED_DIR}") + print("=====================================================================") + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/src/main.py b/src/main.py new file mode 100644 index 0000000..422cc64 --- /dev/null +++ b/src/main.py @@ -0,0 +1,43 @@ +#!/usr/bin/env python3 +""" +Einfaches Master-Skript, das alle Unter-Skripte nacheinander startet – ohne Argumente. +""" +import subprocess +import sys +from pathlib import Path + +# Reihenfolge der auszuführenden Skripte (relativer Pfad) +SCRIPTS = [ + "text/transcription.py", + "text/segment_transcript.py", + "text/rateCluster.py", + "text/cutClips.py", + "reformat/track_faces_Yolo.py", + "reformat/detect_speaking_faces.py", + "reformat/crop_to_speaker.py", +] + + +def run_script(script_path: str): + """ + Führt ein Python-Skript ohne weitere Argumente aus. 
+ """ + print(f"🔄 Running: {script_path}") + full_path = Path(__file__).parent / script_path + try: + subprocess.check_call([sys.executable, str(full_path)]) + print(f"✔️ {script_path} erfolgreich abgeschlossen.\n") + except subprocess.CalledProcessError as e: + print(f"❌ Fehler in {script_path}: Rückgabecode {e.returncode}") + sys.exit(e.returncode) + + +def main(): + print("\n=== Starte komplette Podcast-Pipeline ===\n") + for script in SCRIPTS: + run_script(script) + print("✅ Alle Schritte erfolgreich abgeschlossen.") + + +if __name__ == '__main__': + main() diff --git a/src/reformat/main_apply_crop.py b/src/reformat/main_apply_crop.py new file mode 100644 index 0000000..da71e4f --- /dev/null +++ b/src/reformat/main_apply_crop.py @@ -0,0 +1,315 @@ +#!/usr/bin/env python3 +# src/reformat/new/main_apply_crop.py +from __future__ import annotations +import logging, json, math, subprocess, argparse +from pathlib import Path +from typing import Optional, Tuple, List, Dict, Any +from collections import deque +import sys + +import cv2 +import numpy as np + +# ── Projektwurzel importierbar machen +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) +from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, SEGMENTS_DIR, CROPPED_DIR + +# ==== Defaults (per CLI überschreibbar) ====================================== +OUT_W_DEFAULT, OUT_H_DEFAULT = 1080, 1920 # 9:16 +DEBUG_SCALE_DEFAULT = 0.6 +MEDIAN_WIN_DEFAULT = 5 +EMA_ALPHA_DEFAULT = 0.22 +DEADBAND_PX_DEFAULT = 8.0 +SWITCH_COOLDOWN_FR_DEFAULT = 12 +ZOOM_PAD_FRAC_DEFAULT = 0.10 + +USE_CUT_DETECT_DEFAULT = True +CUT_CORR_THRESH_DEFAULT = 0.65 +CUT_COOLDOWN_DEFAULT = 6 + +MUX_AUDIO_DEFAULT = True +FFMPEG_BIN = "ffmpeg" +# ============================================================================ + +def clamp(v, lo, hi): return max(lo, min(hi, v)) + +def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int, + out_w: int, out_h: int, zoom_pad_frac: float) -> tuple[int,int,int,int]: + """9:16 (out_w:out_h) Crop um (cx,cy) — ohne Squeeze, mit Zoom-Pad, im Bild gehalten.""" + target_ar = out_w / out_h + src_ar = src_w / src_h + if src_ar >= target_ar: + base_h = src_h + base_w = int(round(base_h * target_ar)) + else: + base_w = src_w + base_h = int(round(base_w / target_ar)) + + desired_scale = 1.0 + zoom_pad_frac + s = min(desired_scale, src_w / base_w, src_h / base_h) + w = int(round(base_w * s)) + h = int(round(base_h * s)) + half_w, half_h = w // 2, h // 2 + + cx = clamp(cx, half_w, src_w - half_w) + cy = clamp(cy, half_h, src_h - half_h) + x = int(round(cx - half_w)) + y = int(round(cy - half_h)) + return x, y, w, h + +def draw_center(img, pt, color, label=None): + if pt is None: return + x, y = int(pt[0]), int(pt[1]) + cv2.circle(img, (x, y), 6, color, -1) + if label: + cv2.putText(img, label, (x + 8, y - 8), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA) + +def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float: + a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV) + b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV) + ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256]) + hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256]) + cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX) + return float((cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + 1.0)/2.0) + +def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path): + cmd = [ + FFMPEG_BIN, "-y", + "-i", str(src_video), + "-i", str(silent_video), + "-map", "1:v:0", + "-map", "0:a:0?", + "-c:v", 
"copy", + "-c:a", "aac", "-b:a", "192k", + "-shortest", + str(out_video), + ] + subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + +def load_faces(name: str) -> List[Dict[str,Any]]: + p = FACE_COMBINED_DIR / f"{name}_faces.json" + if not p.exists(): return [] + return json.loads(p.read_text(encoding="utf-8")) + +def load_target_map_or_segments(name: str, total_frames: int) -> List[Optional[int] | Dict]: + """ + Bevorzugt *_target_by_frame.json (Liste Dicts mit t,cx,cy,w,h). + Fallback: *_segments.json (pro Frame Track-ID). + Gibt Liste gleicher Länge wie total_frames zurück. + """ + map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json" + if map_p.exists(): + target = json.loads(map_p.read_text(encoding="utf-8")) + # Falls es Dicts sind (cx,cy,w,h pro frame), einfach zurückgeben: + if target and isinstance(target[0], dict): + if len(target) < total_frames: + last = target[-1] if target else {"t":0,"cx":0.5,"cy":0.5,"w":0.6,"h":0.6} + target += [last] * (total_frames - len(target)) + return target[:total_frames] + # Falls numerische IDs drin wären, fällt es unten durch auf segs-Logik + seg_p = SEGMENTS_DIR / f"{name}_segments.json" + if seg_p.exists(): + segs = json.loads(seg_p.read_text(encoding="utf-8")) + target_tid = [None]*total_frames + for s in segs: + a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"] + for t in range(max(0,a), min(total_frames, b+1)): + target_tid[t] = tid + return target_tid + return [None]*total_frames + +def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]: + if target_tid is None: + return fallback + faces = faces_frame.get("faces", []) + for f in faces: + if int(f.get("track_id", -1)) == int(target_tid): + x,y,w,h = f.get("bbox", [None,None,None,None]) + if None not in (x,y,w,h): + return (float(x + w/2), float(y + h/2)) + return fallback + +def parse_args(): + p = argparse.ArgumentParser(description="Apply 9:16 Auto-Crop auf Rohclips mit Face-/Target-Daten.") + p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: *.mp4)") + p.add_argument("--out_w", type=int, default=OUT_W_DEFAULT, help="Output-Breite (Default: 1080)") + p.add_argument("--out_h", type=int, default=OUT_H_DEFAULT, help="Output-Höhe (Default: 1920)") + p.add_argument("--zoom_pad", type=float, default=ZOOM_PAD_FRAC_DEFAULT, help="Zoom-Pad (0..1, Default 0.10)") + p.add_argument("--median", type=int, default=MEDIAN_WIN_DEFAULT, help="Median-Fenster (ungerade, >=3)") + p.add_argument("--ema", type=float, default=EMA_ALPHA_DEFAULT, help="EMA-Alpha (0..1)") + p.add_argument("--deadband", type=float, default=DEADBAND_PX_DEFAULT, help="Totband in Pixel") + p.add_argument("--switch_cd", type=int, default=SWITCH_COOLDOWN_FR_DEFAULT, help="Cooldown-Frames nach Trackwechsel") + p.add_argument("--cut_detect", action="store_true", default=USE_CUT_DETECT_DEFAULT, help="Szenenschnitt-Erkennung aktivieren") + p.add_argument("--cut_corr", type=float, default=CUT_CORR_THRESH_DEFAULT, help="Korrelation-Schwelle (0..1)") + p.add_argument("--cut_cd", type=int, default=CUT_COOLDOWN_DEFAULT, help="Cooldown-Frames nach Cut") + p.add_argument("--mux_audio", action="store_true", default=MUX_AUDIO_DEFAULT, help="Audio vom Original muxen") + p.add_argument("--debug", action="store_true", help="Debug-Overlay anzeigen (langsam)") + p.add_argument("--debug_scale", type=float, default=DEBUG_SCALE_DEFAULT, help="Skalierung Debug-Preview") + 
p.add_argument("--overwrite", action="store_true", help="Existierende Outputs überschreiben") + return p.parse_args() + +def main(): + args = parse_args() + OUT_DIR = CROPPED_DIR + OUT_DIR.mkdir(parents=True, exist_ok=True) + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + clips = sorted(list(RAW_CLIPS_DIR.glob(args.pattern))) + if not clips: + print(f"⚠️ Keine Clips in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'") + return + + print(f"🔎 {len(clips)} Clips gefunden …") + for video_path in clips: + name = video_path.stem + out_path = OUT_DIR / f"{name}_9x16.mp4" + if out_path.exists() and not args.overwrite: + print(f"⏭️ Skip (existiert): {out_path.name}") + continue + + # Video öffnen + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + print(f"❌ Kann Video nicht öffnen: {video_path.name}") + continue + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + # Face/Target laden + faces_all = load_faces(name) + if faces_all and len(faces_all) < total: + faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all)) + target_by_frame = load_target_map_or_segments(name, total) + + # Writer vorbereiten + writer = cv2.VideoWriter(str(out_path), + cv2.VideoWriter_fourcc(*"mp4v"), + fps, (args.out_w, args.out_h)) + + median_buf = deque(maxlen=max(3, args.median if args.median % 2 else args.median+1)) + ema_center: Optional[Tuple[float,float]] = None + last_center: Optional[Tuple[float,float]] = (width/2, height/2) + switch_cooldown = 0 + + prev_small = None + cut_cd = 0 + + print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}") + + for t in range(total): + ret, frame = cap.read() + if not ret: break + + # Ziel bestimmen: + desired = None + tgt = target_by_frame[t] if t < len(target_by_frame) else None + + # Fall A: target_by_frame.json mit direkten Zentren (Dict) + if isinstance(tgt, dict) and all(k in tgt for k in ("cx","cy","w","h")): + desired = (float(tgt["cx"])*width, float(tgt["cy"])*height) + else: + # Fall B: numerische Track-ID + target_tid = tgt if tgt is None or isinstance(tgt, (int, float)) else None + faces_fr = faces_all[t] if (faces_all and t < len(faces_all)) else {"faces":[]} + desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2)) + + # Szenenschnitt? 
+ if args.cut_detect: + small = cv2.resize(frame, (128, 72)) + if prev_small is not None: + corr = scene_corr(prev_small, small) + if corr < args.cut_corr: + ema_center = desired + last_center = desired + switch_cooldown = args.switch_cd + cut_cd = args.cut_cd + prev_small = small + + # Median-Filter + median_buf.append(desired) + if len(median_buf) >= 3: + xs = sorted(p[0] for p in median_buf) + ys = sorted(p[1] for p in median_buf) + m = len(median_buf)//2 + desired_f = (xs[m], ys[m]) + else: + desired_f = desired + + # Trackwechsel erkennen (nur bei Track-IDs sauber möglich) + if t > 0: + prev_tgt = target_by_frame[t-1] if t-1 < len(target_by_frame) else None + else: + prev_tgt = tgt + is_switch = (not isinstance(tgt, dict)) and (tgt != prev_tgt) + + if ema_center is None: + ema_center = desired_f + if last_center is None: + last_center = desired_f + + if is_switch: + ema_center = desired_f + last_center = desired_f + switch_cooldown = args.switch_cd + else: + dx = desired_f[0] - ema_center[0] + dy = desired_f[1] - ema_center[1] + dist = math.hypot(dx, dy) + if cut_cd > 0: + ema_center = desired_f + cut_cd -= 1 + else: + if dist > args.deadband: + ema_center = (ema_center[0] + dx*args.ema, + ema_center[1] + dy*args.ema) + + last_center = desired_f + + # 9:16 Crop anwenden + x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height, + args.out_w, args.out_h, args.zoom_pad) + cropped = frame[y:y+h, x:x+w] + if cropped.size == 0: cropped = frame + final = cv2.resize(cropped, (args.out_w, args.out_h), interpolation=cv2.INTER_AREA) + writer.write(final) + + if args.debug: + dbg = frame.copy() + cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2) + draw_center(dbg, desired, (128,128,255), "desired") + draw_center(dbg, desired_f, (255,255, 0), "median") + draw_center(dbg, ema_center, ( 0,255,255), "ema") + cv2.putText(dbg, f"t={t+1}/{total}", (12, height-14), + cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA) + disp = cv2.resize(dbg, (int(width*args.debug_scale), int(height*args.debug_scale))) + cv2.imshow("Apply Debug", disp) + if cv2.waitKey(1) & 0xFF == ord("q"): + print("🛑 Abgebrochen (q).") + break + + writer.release() + cap.release() + + # Audio muxen? 
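        # Ablauf: Das stumm geschriebene 9:16-Video wird nach *.tmp.mp4 umbenannt; ffmpeg übernimmt daraus die
        # Videospur per Stream-Copy (-map 1:v:0, -c:v copy) und legt, falls vorhanden, die Audiospur des
        # Original-Clips als AAC darunter (-map 0:a:0?, -shortest). Anschließend wird die tmp-Datei gelöscht.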
+ if args.mux_audio: + tmp = out_path.with_suffix(".tmp.mp4") + try: + out_path.rename(tmp) + mux_audio_from_source(video_path, tmp, out_path) + finally: + if tmp.exists(): + try: tmp.unlink() + except: pass + print(f"✅ Fertig (mit Audio): {out_path.name}") + else: + print(f"✅ Fertig: {out_path.name}") + + if args.debug: + cv2.destroyAllWindows() + +if __name__ == "__main__": + main() diff --git a/src/reformat/main_detect_faces.py b/src/reformat/main_detect_faces.py new file mode 100644 index 0000000..44f0300 --- /dev/null +++ b/src/reformat/main_detect_faces.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Face-Detection + Mouth-Openness (YOLOv8-face + MediaPipe) +- liest Rohclips aus RAW_CLIPS_DIR +- schreibt pro Video eine faces.json in FACE_COMBINED_DIR +- optionaler Fortschrittsbalken (tqdm) +""" + +from __future__ import annotations +import argparse +import logging +import json +import time +from pathlib import Path +from contextlib import nullcontext +from typing import List, Dict, Any +from src.reformat.speaking import get_mouth_openness + +import cv2 +import numpy as np +import torch +from ultralytics import YOLO +import mediapipe as mp +import sys + +# ── Projekt-Root + zentrale Pfade laden +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) +from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR # zentrale Verzeichnisse + +# Fortschritt hübsch, wenn verfügbar +try: + from tqdm import tqdm + _HAS_TQDM = True +except Exception: + _HAS_TQDM = False + +# ---------- Performance Tweaks ---------- +torch.set_float32_matmul_precision("high") +cv2.setUseOptimized(True) + +# ---------- Hilfsfunktionen ---------- +def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop): + cx = (x1 + x2) * 0.5 + cy = (y1 + y2) * 0.5 + w = (x2 - x1) * (1.0 + 2.0 * margin_scale) + h = (y2 - y1) * (1.0 + 2.0 * margin_scale) + side = max(w, h, float(min_crop)) + half = side * 0.5 + + sx1 = int(max(0, round(cx - half))) + sy1 = int(max(0, round(cy - half))) + sx2 = int(min(W, round(cx + half))) + sy2 = int(min(H, round(cy + half))) + + side_w = max(0, sx2 - sx1) + side_h = max(0, sy2 - sy1) + side = max(2, min(side_w, side_h)) + sx2 = sx1 + side + sy2 = sy1 + side + return sx1, sy1, sx2, sy2 + + +def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h): + if not lm_lists: + return None + cx_t, cy_t = crop_w * 0.5, crop_h * 0.5 + best, best_d = None, 1e12 + for lms in lm_lists: + xs = [p.x * crop_w for p in lms.landmark] + ys = [p.y * crop_h for p in lms.landmark] + cx = sum(xs) / len(xs) + cy = sum(ys) / len(ys) + d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2 + if d < best_d: + best, best_d = lms, d + return best + + +def run_mesh(face_mesh, crop_bgr, upscale_if_small): + if crop_bgr.size == 0: + return None, 0.0 + ch, cw = crop_bgr.shape[:2] + if max(ch, cw) < upscale_if_small: + scale = float(upscale_if_small) / max(ch, cw) + new_w = max(1, int(round(cw * scale))) + new_h = max(1, int(round(ch * scale))) + crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + ch, cw = new_h, new_w + rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + res = face_mesh.process(rgb) + if not res.multi_face_landmarks: + return None, 0.0 + chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch) + if chosen is None: + return None, 0.0 + mo = get_mouth_openness(chosen.landmark, ch) + return chosen, float(mo) + +# ---------- Kernprozess ---------- +def process_video(video_path: Path, + output_path: Path, + model: YOLO, + face_mesh, + 
conf_thresh: float, + frame_skip: int, + downscale: float, + expansion_1: float, + expansion_2: float, + min_crop: int, + faces_upscale: int, + imgsz: int, + device: str, + max_det: int): + print(f"🎬 Starte Detection: {video_path.name}") + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + logging.error(f"❌ Kann Video nicht öffnen: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + total_to_process = None + if total_frames_raw > 0: + total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip) + + scaled_w = max(1, int(round(orig_w * downscale))) + scaled_h = max(1, int(round(orig_h * downscale))) + + data: List[Dict[str, Any]] = [] + frame_idx = 0 + processed_frames = 0 + + sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0 + sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0 + + autocast_ctx = ( + torch.autocast(device_type=device, dtype=torch.float16) + if device in ("mps", "cuda") else nullcontext() + ) + + bar = None + start_t = time.time() + if _HAS_TQDM and total_to_process: + bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True) + + while True: + ret, frame = cap.read() + if not ret: + break + + if frame_skip > 1 and (frame_idx % frame_skip != 0): + frame_idx += 1 + continue + + frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA) + + with torch.no_grad(): + with autocast_ctx: + # Ultralytics 8 API: __call__ statt .predict() (beide funktionieren) + result = model(frame_infer, imgsz=imgsz, device=device, verbose=False, + conf=conf_thresh, iou=0.5, max_det=max_det) + detections = result[0] + + faces = [] + for i in range(len(detections.boxes)): + box = detections.boxes[i] + conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf) + if conf < conf_thresh: + continue + x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()] + if downscale != 1.0: + x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy + x1 = max(0.0, min(x1, orig_w - 1)) + y1 = max(0.0, min(y1, orig_h - 1)) + x2 = max(0.0, min(x2, orig_w - 1)) + y2 = max(0.0, min(y2, orig_h - 1)) + + w = max(1.0, x2 - x1) + h = max(1.0, y2 - y1) + cx = x1 + w / 2.0 + cy = y1 + h / 2.0 + + # Pass 1 + sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop) + if sx2 - sx1 < 4 or sy2 - sy1 < 4: + continue + face_crop = frame[sy1:sy2, sx1:sx2] + _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale) + + # Pass 2 nur wenn nötig + if mouth_open == 0.0: + sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop) + if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4: + face_crop_b = frame[sy1b:sy2b, sx1b:sx2b] + _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale) + + faces.append({ + "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))], + "conf": round(conf, 3), + "center": [round(cx, 1), round(cy, 1)], + "mouth_openness": round(float(mouth_open), 3) + }) + + data.append({ + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 3), + "W": orig_w, + "H": orig_h, + "faces": faces + }) + frame_idx += 1 + processed_frames += 1 + + # Fortschritt + if bar is not None: + bar.update(1) + else: + if processed_frames % 30 == 0: + elapsed = time.time() - start_t + rate = processed_frames / max(1e-6, elapsed) # frames/sec + if 
total_to_process: + remaining = max(0, total_to_process - processed_frames) + eta_sec = remaining / max(1e-6, rate) + print(f"[{video_path.name}] {processed_frames}/{total_to_process} " + f"({processed_frames/total_to_process*100:.1f}%) " + f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min") + else: + print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s") + + cap.release() + if bar is not None: + bar.close() + + # schön formatiertes JSON + output_path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8") + print(f"✅ Faces gespeichert: {output_path.name}") + +# ---------- CLI ---------- +def parse_args(): + p = argparse.ArgumentParser(description="YOLOv8-face + MediaPipe FaceMesh → faces.json pro Clip") + # Verzeichnisse (Default aus config.py) + p.add_argument("--input-dir", type=Path, default=RAW_CLIPS_DIR, help=f"Rohclips (Default: {RAW_CLIPS_DIR})") + p.add_argument("--output-dir", type=Path, default=FACE_COMBINED_DIR, help=f"Zielordner (Default: {FACE_COMBINED_DIR})") + # Modell + p.add_argument("--model", type=Path, default=ROOT / "models" / "yolov8n-face.pt", + help="Pfad zum YOLOv8-face Modell (.pt)") + # Optimierte Defaults + p.add_argument("--conf-thresh", type=float, default=0.35) + p.add_argument("--frame-skip", type=int, default=1, help="Nur jeden n-ten Frame verarbeiten") + p.add_argument("--downscale", type=float, default=0.5, help="Eingangsframe auf Faktor verkleinern (0..1)") + p.add_argument("--expansion", type=float, default=0.4, help="Crop-Margin Pass 1 (relativ)") + p.add_argument("--expansion2", type=float, default=0.8, help="Crop-Margin Pass 2 (relativ)") + p.add_argument("--min-crop", type=int, default=160, help="Minimaler Croprand in Pixeln (quadratisch)") + p.add_argument("--faces-upscale", type=int, default=192, help="Minimale Kantenlänge für FaceMesh (bei kleineren Crops upscalen)") + p.add_argument("--imgsz", type=int, default=448) + p.add_argument("--max-det", type=int, default=20) + p.add_argument("--use-refine", action="store_true", default=False, help="MediaPipe mit refine_landmarks") + return p.parse_args() + +def main(): + args = parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + args.output_dir.mkdir(parents=True, exist_ok=True) + + # YOLO Modell & Device + yolo = YOLO(str(args.model)) + if torch.backends.mps.is_available(): + device = "mps" + elif torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + yolo.to(device) + print(f"🖥️ Inference-Device: {device}") + + # Warmup + try: + with torch.no_grad(): + dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8) + _ = yolo(source=[dummy], imgsz=args.imgsz, verbose=False, device=device) + except Exception: + pass + + # Eingabedateien anzeigen + videos = sorted([*args.input_dir.glob("*.mp4"), *args.input_dir.glob("*.mov"), *args.input_dir.glob("*.mkv")]) + print(f"🔍 Input-Ordner: {args.input_dir.resolve()}") + if not videos: + print("⚠️ Keine passenden Videos gefunden.") + return + print("📁 Dateien:") + for p in videos: + print(" →", p.name) + + outer = None + if _HAS_TQDM: + outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False) + + with mp.solutions.face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=10, + refine_landmarks=args.use_refine, + min_detection_confidence=0.5, + min_tracking_confidence=0.5 + ) as face_mesh: + for vid in videos: + out = args.output_dir / f"{vid.stem}_faces.json" + process_video( + video_path=vid, + output_path=out, + model=yolo, + 
face_mesh=face_mesh, + conf_thresh=args.conf_thresh, + frame_skip=args.frame_skip, + downscale=args.downscale, + expansion_1=args.expansion, + expansion_2=args.expansion2, + min_crop=args.min_crop, + faces_upscale=args.faces_upscale, + imgsz=args.imgsz, + device=device, + max_det=args.max_det + ) + if outer is not None: + outer.update(1) + + if outer is not None: + outer.close() + +if __name__ == "__main__": + main() diff --git a/src/reformat/main_track_faces.py b/src/reformat/main_track_faces.py new file mode 100644 index 0000000..258bf02 --- /dev/null +++ b/src/reformat/main_track_faces.py @@ -0,0 +1,78 @@ +#!/usr/bin/env python3 +import logging, json +from pathlib import Path +from typing import List, Dict, Any +import sys + +# Projekt-Root verfügbar machen +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import FACE_COMBINED_DIR, FACE_CROP_CENTERS # ggf. SEGMENTS_DIR, wenn du dorthin schreibst + + +def iou(boxA, boxB): + xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2]) + yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3]) + interW, interH = max(0, xB-xA), max(0, yB-yA) + inter = interW * interH + union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter + return inter/union if union > 0 else 0.0 + +def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3): + next_id = 0 + last_boxes = {} # track_id -> bbox + for frame in faces_all: + new_boxes = {} + for face in frame["faces"]: + box = face["bbox"] + # match gegen bestehende + best_id, best_iou = None, 0.0 + for tid, prev_box in last_boxes.items(): + ov = iou(box, prev_box) + if ov > best_iou: + best_id, best_iou = tid, ov + if best_iou > iou_thresh: + face["track_id"] = best_id + new_boxes[best_id] = box + else: + face["track_id"] = next_id + new_boxes[next_id] = box + next_id += 1 + last_boxes = new_boxes + return faces_all + +def main(): + # Eingabe: erkannte Gesichter/Tracks + FACE_DIR = FACE_COMBINED_DIR + # Ausgabe: z. B. 
berechnete Center pro Frame + OUT_DIR = FACE_CROP_CENTERS + OUT_DIR.mkdir(parents=True, exist_ok=True) + + for f in FACE_DIR.glob("*_faces.json"): + try: + faces_all = json.loads(f.read_text(encoding="utf-8")) + except Exception as e: + print(f"❌ Fehler beim Laden {f.name}: {e}") + continue + + tracked = track_faces(faces_all) + f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8") + print(f"✅ Track-IDs ergänzt: {f.name}") + + # zusätzlich centers.json (dominant = höchster mouth_openness pro Frame) + centers = [] + for fr in tracked: + if fr["faces"]: + best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0)) + centers.append([best["center"][0], best["center"][1]]) + else: + centers.append([fr["W"]/2, fr["H"]/2]) + centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json") + centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8") + print(f"📝 Centers gespeichert: {centers_path.name}") + +if __name__ == "__main__": + main() diff --git a/src/reformat/make_segments.py b/src/reformat/make_segments.py new file mode 100644 index 0000000..1c438f5 --- /dev/null +++ b/src/reformat/make_segments.py @@ -0,0 +1,306 @@ +#!/usr/bin/env python3 +# make_segments.py — erzeugt pro Highlight eine Zielspur (target_by_frame.json) fürs Cropping + +from __future__ import annotations +import json +import argparse +from dataclasses import dataclass +from typing import List, Dict, Optional, Tuple +from pathlib import Path +import sys + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. unter src/reformat/) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import RAW_CLIPS_DIR, FACE_COMBINED_DIR, FACE_CROP_CENTERS, SEGMENTS_DIR + +try: + from moviepy.video.io.VideoFileClip import VideoFileClip + MOVIEPY_OK = True +except Exception: + MOVIEPY_OK = False + + +# ────────────────────────────────────────────────────────────────────────────── +# Hilfsstrukturen +# ────────────────────────────────────────────────────────────────────────────── + +@dataclass +class FaceDet: + t: float # Sekunden + cx: float # Zentrum x (0..1) + cy: float # Zentrum y (0..1) + w: float # Breite rel. (0..1) + h: float # Höhe rel. 
(0..1) + track_id: Optional[int] = None + mouth_prob: Optional[float] = None + +def moving_average(xs: List[float], win: int) -> List[float]: + if win <= 1 or len(xs) <= 2: + return xs[:] + # ungerade Fensterbreite erzwingen + win = win if win % 2 == 1 else win + 1 + r = win // 2 + out = [] + for i in range(len(xs)): + a = max(0, i - r) + b = min(len(xs), i + r + 1) + out.append(sum(xs[a:b]) / (b - a)) + return out + +def clamp01(x: float) -> float: + return max(0.0, min(1.0, x)) + + +# ────────────────────────────────────────────────────────────────────────────── +# Lesen möglicher Eingabeformate (robust, schema-tolerant) +# ────────────────────────────────────────────────────────────────────────────── + +def _parse_face_like(obj: Dict, t: float, W: float | None = None, H: float | None = None) -> FaceDet: + """ + Erwartet entweder: + - bbox=[x,y,w,h] in Pixel → wird via W,H auf 0..1 normiert + - oder bereits normierte Felder cx,cy,w,h in 0..1 + Optional: track_id, mouth_prob / mouth_open / talking_prob + """ + if "bbox" in obj and isinstance(obj["bbox"], (list, tuple)) and len(obj["bbox"]) >= 4: + x, y, w, h = [float(v) for v in obj["bbox"][:4]] + if W and H and W > 0 and H > 0: + cx = (x + w * 0.5) / W + cy = (y + h * 0.5) / H + w = w / W + h = h / H + else: + # Falls Maße fehlen: best effort, danach clampen + cx = x + w * 0.5 + cy = y + h * 0.5 + cx, cy = clamp01(cx), clamp01(cy) + w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h)) + else: + cx = float(obj.get("cx", 0.5)) + cy = float(obj.get("cy", 0.5)) + w = float(obj.get("w", 0.3)) + h = float(obj.get("h", 0.3)) + cx, cy = clamp01(cx), clamp01(cy) + w, h = max(0.0, min(1.0, w)), max(0.0, min(1.0, h)) + + track_id = obj.get("track_id") + mouth_prob = obj.get("mouth_prob") or obj.get("mouth_open") or obj.get("talking_prob") + mouth_prob = None if mouth_prob is None else float(mouth_prob) + + return FaceDet(t=t, cx=cx, cy=cy, w=w, h=h, track_id=track_id, mouth_prob=mouth_prob) + + +def load_faces_or_centers(stem: str, fps_hint: float | None = None) -> List[FaceDet]: + """ + Lädt die beste verfügbare Gesichts/Center-Quelle für ein Highlight. + Suchreihenfolge: + 1) FACE_COMBINED_DIR/{stem}_faces.json (Liste von Frames mit 'faces') + 2) FACE_CROP_CENTERS/{stem}_centers.json + - akzeptiert entweder [[cx,cy], ...] oder [{t,cx,cy,w,h}, ...] + """ + candidates = [ + (FACE_COMBINED_DIR / f"{stem}_faces.json", "faces"), + (FACE_CROP_CENTERS / f"{stem}_centers.json", "centers"), + ] + path = kind = None + for p, k in candidates: + if p.exists(): + path, kind = p, k + break + + if path is None: + print(f"⚠️ Keine Face/Centers-Datei gefunden für {stem}. Fallback später → (0.5,0.5).") + return [] + + try: + raw = path.read_text(encoding="utf-8") + data = json.loads(raw) + except Exception as e: + print(f"❌ Konnte {path.name} nicht lesen: {e}") + return [] + + dets: List[FaceDet] = [] + + # 1) Liste von Frames: [{ "W":..,"H":..,"timestamp"/"t":.., "faces":[...] }, ...] + if isinstance(data, list) and data and isinstance(data[0], dict) and "faces" in data[0]: + for fr in data: + W = float(fr.get("W") or 0.0) + H = float(fr.get("H") or 0.0) + t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0) + for f in fr.get("faces", []): + dets.append(_parse_face_like(f, t, W, H)) + + # 2) Dict mit "frames": [...] 
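+ #    (same per-frame schema as case 1, only wrapped in a top-level {"frames": [...]} object)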
+ elif isinstance(data, dict) and "frames" in data: + for fr in data["frames"]: + W = float(fr.get("W") or 0.0) + H = float(fr.get("H") or 0.0) + t = float(fr.get("t") or fr.get("timestamp") or fr.get("time") or 0.0) + for f in fr.get("faces", []): + dets.append(_parse_face_like(f, t, W, H)) + + # 3) centers.json als Liste von Listen: [[cx,cy], ...] + elif isinstance(data, list) and data and isinstance(data[0], (list, tuple)) and len(data[0]) >= 2: + fps = float(fps_hint or 25.0) + for i, pair in enumerate(data): + cx, cy = float(pair[0]), float(pair[1]) + dets.append(FaceDet(t=i / fps, cx=clamp01(cx), cy=clamp01(cy), w=0.6, h=0.6)) + + # 4) Liste von Dicts mit evtl. bereits normierten Feldern + elif isinstance(data, list) and data and isinstance(data[0], dict): + for item in data: + t = float(item.get("t") or item.get("time") or 0.0) + dets.append(_parse_face_like(item, t)) + + else: + print(f"⚠️ Unbekanntes JSON-Format in {path.name}.") + return [] + + # filtern & sortieren + dets = [d for d in dets if 0.0 <= d.cx <= 1.0 and 0.0 <= d.cy <= 1.0] + dets.sort(key=lambda d: d.t) + print(f"✅ {len(dets)} Detektionen aus {path.name} ({kind}).") + return dets + + +# ────────────────────────────────────────────────────────────────────────────── +# Zielspur berechnen +# ────────────────────────────────────────────────────────────────────────────── + +def build_target_by_frame( + faces: List[FaceDet], + duration: float, + fps: float, + smooth_win: int = 7 +) -> List[Dict]: + """ + Wählt pro Frame eine Zielposition (cx,cy,w,h). + Heuristik: + - bevorzuge Gesicht mit höchster mouth_prob (wenn vorhanden), + - sonst größtes Bounding-Box-Areal (w*h), + - halte IDs stabil (nicht zu häufige Sprünge). + Anschließend leichte Glättung (Moving Average) der Zentren/Größen. 
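+     Per frame, candidates are all detections within ±1/fps of the frame time
+     (or the temporally nearest detection if that window is empty); they are ranked
+     by the tuple (mouth_prob, box area, "same track as previous frame") and the
+     best one wins. Centers are smoothed with `smooth_win`, sizes with the shorter
+     window max(3, smooth_win // 2).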
+ """ + if fps <= 0: + fps = 25.0 + total_frames = max(1, int(round(duration * fps))) + if not faces: + # Fallback: center track + return [{"frame": i, "t": round(i / fps, 4), "cx": 0.5, "cy": 0.5, "w": 0.6, "h": 0.6} for i in range(total_frames)] + + frame_targets: List[Tuple[float, float, float, float]] = [] # (cx, cy, w, h) + last_track: Optional[int] = None + + # lineare Suche über faces (bei Bedarf später bucketisieren) + for i in range(total_frames): + t = i / fps + lo, hi = t - 1.0 / fps, t + 1.0 / fps + + cand: List[FaceDet] = [d for d in faces if lo <= d.t <= hi] + if not cand: + # Nimm den zeitlich nächsten + nearest = min(faces, key=lambda d: abs(d.t - t)) + cand = [nearest] + + def score(d: FaceDet) -> Tuple[float, float, float]: + mouth = -1.0 if d.mouth_prob is None else float(d.mouth_prob) # None schlechter als 0 + area = float(d.w) * float(d.h) + stable = 1.0 if (last_track is not None and d.track_id == last_track) else 0.0 + return (mouth, area, stable) + + cand.sort(key=score, reverse=True) + best = cand[0] + if best.track_id is not None: + last_track = best.track_id + frame_targets.append((best.cx, best.cy, best.w, best.h)) + + # Glätten + cxs = moving_average([c for c, _, _, _ in frame_targets], smooth_win) + cys = moving_average([c for _, c, _, _ in frame_targets], smooth_win) + ws = moving_average([w for *_, w, _ in frame_targets], max(3, smooth_win // 2)) + hs = moving_average([h for *_, _, h in frame_targets], max(3, smooth_win // 2)) + + out = [] + for i, (cx, cy, w, h) in enumerate(zip(cxs, cys, ws, hs)): + t = i / fps + out.append({ + "frame": i, + "t": round(t, 4), + "cx": round(clamp01(cx), 4), + "cy": round(clamp01(cy), 4), + "w": round(max(0.05, min(1.0, w)), 4), + "h": round(max(0.05, min(1.0, h)), 4), + }) + return out + + +# ────────────────────────────────────────────────────────────────────────────── +# I/O +# ────────────────────────────────────────────────────────────────────────────── + +def write_target_json(stem: str, target: List[Dict]) -> Path: + SEGMENTS_DIR.mkdir(parents=True, exist_ok=True) + out_path = SEGMENTS_DIR / f"{stem}_target_by_frame.json" + out_path.write_text(json.dumps(target, ensure_ascii=False, indent=2), encoding="utf-8") + return out_path + + +# ────────────────────────────────────────────────────────────────────────────── +# CLI / Main +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser(description="Erzeugt target_by_frame.json aus Face/Center-Detektionen für Cropping.") + p.add_argument("--pattern", type=str, default="highlight_*.mp4", help="Dateimuster in RAW_CLIPS_DIR (Default: highlight_*.mp4)") + p.add_argument("--fps", type=float, default=0.0, help="FPS erzwingen (0 = aus Video lesen).") + p.add_argument("--smooth", type=int, default=7, help="Fensterbreite für Moving-Average-Glättung (ungerade).") + p.add_argument("--overwrite", action="store_true", help="Existierende target_by_frame.json überschreiben.") + return p.parse_args() + + +def main(): + if not MOVIEPY_OK: + raise RuntimeError("moviepy ist nicht installiert. Bitte `pip install moviepy` ausführen.") + + args = parse_args() + + vids = sorted(RAW_CLIPS_DIR.glob(args.pattern)) + if not vids: + print(f"⚠️ Keine Rohclips gefunden in {RAW_CLIPS_DIR} mit Pattern '{args.pattern}'.") + return + + print(f"🔎 Finde {len(vids)} Clips …") + + for vid in vids: + stem = vid.stem # z. B. 
highlight_3 + out_json = SEGMENTS_DIR / f"{stem}_target_by_frame.json" + if out_json.exists() and not args.overwrite: + print(f"⏭️ {out_json.name} existiert bereits – überspringe (nutze --overwrite zum Ersetzen).") + continue + + # Video-Metadaten + try: + with VideoFileClip(str(vid)) as V: + duration = float(V.duration or 0.0) + fps = float(args.fps or (V.fps or 25.0)) + except Exception as e: + print(f"❌ Kann Video {vid.name} nicht öffnen: {e} – Fallback duration/fps (10s/25fps).") + duration, fps = 10.0, (args.fps or 25.0) + + # Face/Centers laden (fps_hint durchreichen, wichtig für centers-Listen) + faces = load_faces_or_centers(stem, fps_hint=fps) + + # Zielspur bauen + target = build_target_by_frame(faces, duration=duration, fps=fps, smooth_win=args.smooth) + + # Schreiben + out = write_target_json(stem, target) + print(f"💾 geschrieben: {out}") + + print("🎉 Fertig.") + + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/analyze_mouth_activity.py b/src/reformat/new/analyze_mouth_activity.py new file mode 100644 index 0000000..41f71e4 --- /dev/null +++ b/src/reformat/new/analyze_mouth_activity.py @@ -0,0 +1,118 @@ +#!/usr/bin/env python3 +# src/reformat/new/analyze_mouth_activity.py +import logging +from pathlib import Path +from typing import List, Dict, Any, Tuple, Optional + +# OpenAI optional; aktuell nicht genutzt (Flag fehlt bewusst) +# from openai import OpenAI + +# === HARTE DEFAULTS: einfach Play drücken === +PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") +RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined" +TIMED_DIR = PROJECT_ROOT / "data" / "transkripte" +CENTERS_DIR = PROJECT_ROOT / "data" / "face_crop_centers" + +def parse_timed_file(path: Path) -> List[Tuple[float, float]]: + """ + Erwartet Zeilen wie: + [00:00.00 - 00:05.20] Text... + Gibt Liste [(start_sec, end_sec)] zurück. Falls keine Treffer: leere Liste. 
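+     Example: "[00:01.50 - 00:04.25] Hallo" → (1.5, 4.25); the digits after the
+     dot are read as hundredths of a second (sms / 100.0).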
+ """ + import re + rx = re.compile(r"\[(\d+):(\d+)\.(\d+)\s*-\s*(\d+):(\d+)\.(\d+)\]") + segs = [] + try: + for line in path.read_text(encoding="utf-8").splitlines(): + m = rx.search(line) + if not m: + continue + smin, ssec, sms, emin, esec, ems = map(int, m.groups()) + start = smin * 60 + ssec + sms / 100.0 + end = emin * 60 + esec + ems / 100.0 + if end > start: + segs.append((start, end)) + except FileNotFoundError: + pass + return segs + +def select_speaker_center(faces: List[Dict[str, Any]]) -> Tuple[float, float]: + """Priorität: mouth_openness, Fallback: größte Fläche; sonst Bildmitte.""" + if not faces: + return (960.0, 540.0) + def area(f): + bx = f.get("bbox",[0,0,0,0]); return float(bx[2]*bx[3]) + best = max( + faces, + key=lambda f: (float(f.get("mouth_openness", 0.0)), area(f)) + ) + x, y, w, h = best["bbox"] + return (x + w/2.0, y + h/2.0) + +def load_json(path: Path): + import json + return json.loads(path.read_text(encoding="utf-8")) + +def save_json(obj, path: Path): + import json + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(json.dumps(obj, ensure_ascii=False, indent=2), encoding="utf-8") + +def process_one(base_name: str) -> bool: + faces_path = FACES_DIR / f"{base_name}_faces.json" + timed_path = TIMED_DIR / f"{base_name}_timed.txt" + centers_path = CENTERS_DIR / f"{base_name}_centers.json" + + if not faces_path.exists(): + logging.warning("Skip %-18s | Faces fehlen: %s", base_name, faces_path) + return False + if centers_path.exists(): + logging.info("Skip %-18s | Centers existieren schon: %s", base_name, centers_path.name) + return True + + try: + face_data: List[Dict[str, Any]] = load_json(faces_path) + except Exception as e: + logging.error("Fehler beim Lesen von %s: %s", faces_path, e) + return False + + segments = parse_timed_file(timed_path) + if not segments: + logging.warning("[%s] Keine Segmente erkannt oder Datei fehlt: %s", base_name, timed_path.name) + + centers: List[List[float]] = [] + for entry in face_data: + faces = entry.get("faces", []) + cx, cy = select_speaker_center(faces) + centers.append([float(cx), float(cy)]) + + save_json(centers, centers_path) + logging.info("OK %-18s | Centers gespeichert: %s (frames=%d)", base_name, centers_path.name, len(centers)) + return True + +def main(): + logging.basicConfig( + format="%(asctime)s %(levelname)s: %(message)s", + level=logging.INFO + ) + + if not RAW_DIR.exists(): + logging.error("RAW_DIR existiert nicht: %s", RAW_DIR) + return + + clips = sorted(RAW_DIR.glob("*.mp4")) + if not clips: + logging.warning("Keine Clips gefunden in %s", RAW_DIR) + return + + logging.info("Analyze (mouth) Batch-Mode: %d Clips", len(clips)) + ok = 0 + for clip in clips: + base = clip.stem + if process_one(base): + ok += 1 + logging.info("Fertig. 
%d/%d Clips verarbeitet.", ok, len(clips)) + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/main_apply_crop.py b/src/reformat/new/main_apply_crop.py new file mode 100644 index 0000000..cf90a2c --- /dev/null +++ b/src/reformat/new/main_apply_crop.py @@ -0,0 +1,294 @@ +#!/usr/bin/env python3 +# src/reformat/new/main_apply_crop.py +from __future__ import annotations +import logging, json, math, subprocess +from pathlib import Path +from typing import Optional, Tuple, List, Dict, Any +from collections import deque + +import cv2 +import numpy as np + +# ==== Pfade ================================================================= +PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") +INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +FACE_COMBINED_DIR = PROJECT_ROOT / "data" / "face_data_combined" +SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" +OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +OUT_W, OUT_H = 1080, 1920 +TARGET_AR = OUT_W / OUT_H # 0.5625 + +# ==== Debug ================================================================= +DEBUG_MODE = False +DEBUG_SCALE = 0.6 +DRAW_GUIDES = True + +# ==== Smooth / Switch ======================================================= +MEDIAN_WIN = 5 +EMA_ALPHA = 0.22 +DEADBAND_PX = 8.0 +SWITCH_COOLDOWN_FRAMES = 12 # kurze Ruhe nach Segmentwechsel +ZOOM_PAD_FRAC = 0.10 + +# ==== Scene-Cut-Erkennung =================================================== +USE_CUT_DETECT = True +CUT_CORR_THRESH = 0.65 +CUT_COOLDOWN = 6 + +# ==== Audio-Mux ============================================================= +MUX_AUDIO = True +FFMPEG_BIN = "ffmpeg" +# ============================================================================ + +def clamp(v, lo, hi): return max(lo, min(hi, v)) + +def compute_crop_rect(cx: float, cy: float, src_w: int, src_h: int) -> tuple[int,int,int,int]: + """ + Liefert ein 9:16-Croprechteck (x,y,w,h) um (cx,cy). 
+ - AR bleibt IMMER exakt 9:16 (kein Squeeze) + - ZOOM_PAD_FRAC wirkt als uniformer Scale auf Breite und Höhe + - Rechteck bleibt vollständig im Bild + """ + src_ar = src_w / src_h + + if src_ar >= TARGET_AR: + base_h = src_h + base_w = int(round(base_h * TARGET_AR)) + else: + base_w = src_w + base_h = int(round(base_w / TARGET_AR)) + + desired_scale = 1.0 + ZOOM_PAD_FRAC + max_scale_w = src_w / base_w + max_scale_h = src_h / base_h + s = min(desired_scale, max_scale_w, max_scale_h) + + w = int(round(base_w * s)) + h = int(round(base_h * s)) + + half_w, half_h = w // 2, h // 2 + + cx = clamp(cx, half_w, src_w - half_w) + cy = clamp(cy, half_h, src_h - half_h) + + x = int(round(cx - half_w)) + y = int(round(cy - half_h)) + return x, y, w, h + +def draw_center(img, pt, color, label=None): + if pt is None: return + x, y = int(pt[0]), int(pt[1]) + cv2.circle(img, (x, y), 6, color, -1) + if label: + cv2.putText(img, label, (x + 8, y - 8), + cv2.FONT_HERSHEY_SIMPLEX, 0.55, color, 2, cv2.LINE_AA) + +def scene_corr(a_small: np.ndarray, b_small: np.ndarray) -> float: + a_hsv = cv2.cvtColor(a_small, cv2.COLOR_BGR2HSV) + b_hsv = cv2.cvtColor(b_small, cv2.COLOR_BGR2HSV) + ha = cv2.calcHist([a_hsv],[0,1],None,[50,50],[0,180,0,256]) + hb = cv2.calcHist([b_hsv],[0,1],None,[50,50],[0,180,0,256]) + cv2.normalize(ha,ha,0,1,cv2.NORM_MINMAX); cv2.normalize(hb,hb,0,1,cv2.NORM_MINMAX) + corr = cv2.compareHist(ha, hb, cv2.HISTCMP_CORREL) + return float((corr + 1.0)/2.0) + +def mux_audio_from_source(src_video: Path, silent_video: Path, out_video: Path): + cmd = [ + FFMPEG_BIN, "-y", + "-i", str(src_video), + "-i", str(silent_video), + "-map", "1:v:0", + "-map", "0:a:0?", + "-c:v", "copy", + "-c:a", "aac", "-b:a", "192k", + "-shortest", + str(out_video), + ] + subprocess.run(cmd, check=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + +def load_faces(name: str) -> List[Dict[str,Any]]: + p = FACE_COMBINED_DIR / f"{name}_faces.json" + return json.loads(p.read_text(encoding="utf-8")) + +def load_segments(name: str, total_frames: int) -> List[Optional[int]]: + seg_p = SEGMENTS_DIR / f"{name}_segments.json" + map_p = SEGMENTS_DIR / f"{name}_target_by_frame.json" + if map_p.exists(): + target = json.loads(map_p.read_text(encoding="utf-8")) + if len(target) < total_frames: + target += [target[-1] if target else None] * (total_frames - len(target)) + return target[:total_frames] + if seg_p.exists(): + segs = json.loads(seg_p.read_text(encoding="utf-8")) + target = [None]*total_frames + for s in segs: + a, b, tid = int(s["start_f"]), int(s["end_f"]), s["track_id"] + for t in range(max(0,a), min(total_frames, b+1)): + target[t] = tid + return target + return [None]*total_frames + +def find_center_for_track(faces_frame: Dict[str,Any], target_tid: Optional[int], fallback: Tuple[float,float]) -> Tuple[float,float]: + if target_tid is None: + return fallback + faces = faces_frame.get("faces", []) + for f in faces: + if int(f.get("track_id", -1)) == int(target_tid): + x,y,w,h = f.get("bbox", [None,None,None,None]) + if None not in (x,y,w,h): + return (float(x + w/2), float(y + h/2)) + return fallback + +def main(): + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + clips = sorted(list(INPUT_VIDEO_DIR.glob("*.mp4")) + list(INPUT_VIDEO_DIR.glob("*.mov"))) + if not clips: + print(f"⚠️ Keine Clips in {INPUT_VIDEO_DIR}") + return + + for video_path in clips: + name = video_path.stem + faces_path = FACE_COMBINED_DIR / f"{name}_faces.json" + if not faces_path.exists(): + print(f"⏭️ Skip 
(keine Faces): {faces_path.name}") + continue + + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + print(f"❌ Kann Video nicht öffnen: {video_path.name}") + continue + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + + faces_all = load_faces(name) + if len(faces_all) < total: + faces_all += [ {"faces": [], "W": width, "H": height} ] * (total - len(faces_all)) + + target_by_frame = load_segments(name, total) + + out_path = OUTPUT_DIR / f"{name}_9x16.mp4" + if out_path.exists(): + print(f"⏭️ Skip: Output existiert bereits → {out_path.name}") + cap.release() + continue + + writer = cv2.VideoWriter( + str(out_path), + cv2.VideoWriter_fourcc(*"mp4v"), + fps, + (OUT_W, OUT_H) + ) + + median_buf = deque(maxlen=max(3, MEDIAN_WIN if MEDIAN_WIN % 2 else MEDIAN_WIN+1)) + ema_center: Optional[Tuple[float,float]] = None + last_center: Optional[Tuple[float,float]] = (width/2, height/2) + switch_cooldown = 0 + + prev_small = None + cut_cd = 0 + + print(f"🎞️ Apply: {name} src={width}x{height} fps={fps:.2f} frames={total}") + + for t in range(total): + ret, frame = cap.read() + if not ret: break + + target_tid = target_by_frame[t] if t < len(target_by_frame) else None + faces_fr = faces_all[t] if t < len(faces_all) else {"faces":[]} + desired = find_center_for_track(faces_fr, target_tid, last_center or (width/2, height/2)) + + if USE_CUT_DETECT: + small = cv2.resize(frame, (128, 72)) + if prev_small is not None: + corr = scene_corr(prev_small, small) + if corr < CUT_CORR_THRESH: + ema_center = desired + last_center = desired + switch_cooldown = SWITCH_COOLDOWN_FRAMES + cut_cd = CUT_COOLDOWN + prev_small = small + + median_buf.append(desired) + if len(median_buf) >= 3: + xs = sorted(p[0] for p in median_buf) + ys = sorted(p[1] for p in median_buf) + m = len(median_buf)//2 + desired_f = (xs[m], ys[m]) + else: + desired_f = desired + + if t > 0: + prev_tid = target_by_frame[t-1] if t-1 < len(target_by_frame) else None + else: + prev_tid = target_tid + + if ema_center is None: + ema_center = desired_f + if last_center is None: + last_center = desired_f + + if target_tid != prev_tid: + ema_center = desired_f + last_center = desired_f + switch_cooldown = SWITCH_COOLDOWN_FRAMES + else: + dx = desired_f[0] - ema_center[0] + dy = desired_f[1] - ema_center[1] + dist = math.hypot(dx, dy) + if cut_cd > 0: + ema_center = desired_f + cut_cd -= 1 + else: + if dist > DEADBAND_PX: + ema_center = (ema_center[0] + dx*EMA_ALPHA, + ema_center[1] + dy*EMA_ALPHA) + + last_center = desired_f + + # neuer 9:16 Crop + x, y, w, h = compute_crop_rect(ema_center[0], ema_center[1], width, height) + cropped = frame[y:y+h, x:x+w] + if cropped.size == 0: cropped = frame + final = cv2.resize(cropped, (OUT_W, OUT_H), interpolation=cv2.INTER_AREA) + writer.write(final) + + if DEBUG_MODE: + dbg = frame.copy() + cv2.rectangle(dbg, (x, y), (x+w, y+h), (0, 0, 255), 2) + if DRAW_GUIDES: + draw_center(dbg, desired, (128,128,255), "desired") + draw_center(dbg, desired_f, (255,255, 0), "median") + draw_center(dbg, ema_center, ( 0,255,255), "ema") + cv2.putText(dbg, f"t={t+1}/{total} tid={target_tid}", + (12, height-14), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (20,220,20), 2, cv2.LINE_AA) + disp = cv2.resize(dbg, (int(width*DEBUG_SCALE), int(height*DEBUG_SCALE))) + cv2.imshow("Apply Debug", disp) + if cv2.waitKey(1) & 0xFF == ord("q"): + print("🛑 Abgebrochen (q).") + break + + writer.release() + 
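+         # The cv2.VideoWriter output is video-only; below, the rendered 9:16 stream is
+         # stream-copied and the audio track is taken from the original raw clip and
+         # re-encoded to AAC via ffmpeg (see mux_audio_from_source above).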
cap.release() + + if MUX_AUDIO: + tmp = out_path.with_suffix(".tmp.mp4") + try: + out_path.rename(tmp) + mux_audio_from_source(video_path, tmp, out_path) + finally: + if tmp.exists(): + try: tmp.unlink() + except: pass + print(f"✅ Fertig (mit Audio): {out_path.name}") + else: + print(f"✅ Fertig: {out_path.name}") + + if DEBUG_MODE: + cv2.destroyAllWindows() + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/main_detect_faces.py b/src/reformat/new/main_detect_faces.py new file mode 100644 index 0000000..12094ec --- /dev/null +++ b/src/reformat/new/main_detect_faces.py @@ -0,0 +1,319 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +import argparse +import logging +import json +import time +from pathlib import Path +from contextlib import nullcontext + +import cv2 +import numpy as np +import torch +from ultralytics import YOLO +import mediapipe as mp + +# Fortschritt hübsch, wenn verfügbar +try: + from tqdm import tqdm + _HAS_TQDM = True +except Exception: + _HAS_TQDM = False + +from src.reformat.new.speaking import get_mouth_openness + +# ---------- Performance Tweaks ---------- +torch.set_float32_matmul_precision("high") +cv2.setUseOptimized(True) + +# ---------- Hilfsfunktionen ---------- + +def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop): + cx = (x1 + x2) * 0.5 + cy = (y1 + y2) * 0.5 + w = (x2 - x1) * (1.0 + 2.0 * margin_scale) + h = (y2 - y1) * (1.0 + 2.0 * margin_scale) + side = max(w, h, float(min_crop)) + half = side * 0.5 + + sx1 = int(max(0, round(cx - half))) + sy1 = int(max(0, round(cy - half))) + sx2 = int(min(W, round(cx + half))) + sy2 = int(min(H, round(cy + half))) + + side_w = max(0, sx2 - sx1) + side_h = max(0, sy2 - sy1) + side = max(2, min(side_w, side_h)) + sx2 = sx1 + side + sy2 = sy1 + side + return sx1, sy1, sx2, sy2 + + +def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h): + if not lm_lists: + return None + cx_t, cy_t = crop_w * 0.5, crop_h * 0.5 + best, best_d = None, 1e12 + for lms in lm_lists: + xs = [p.x * crop_w for p in lms.landmark] + ys = [p.y * crop_h for p in lms.landmark] + cx = sum(xs) / len(xs) + cy = sum(ys) / len(ys) + d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2 + if d < best_d: + best, best_d = lms, d + return best + + +def run_mesh(face_mesh, crop_bgr, upscale_if_small): + if crop_bgr.size == 0: + return None, 0.0 + ch, cw = crop_bgr.shape[:2] + if max(ch, cw) < upscale_if_small: + scale = float(upscale_if_small) / max(ch, cw) + new_w = max(1, int(round(cw * scale))) + new_h = max(1, int(round(ch * scale))) + crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC) + ch, cw = new_h, new_w + rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB) + res = face_mesh.process(rgb) + if not res.multi_face_landmarks: + return None, 0.0 + chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch) + if chosen is None: + return None, 0.0 + mo = get_mouth_openness(chosen.landmark, ch) + return chosen, float(mo) + +# ---------- Kernprozess ---------- + +def process_video(video_path: Path, + output_path: Path, + model: YOLO, + face_mesh, + conf_thresh: float, + frame_skip: int, + downscale: float, + expansion_1: float, + expansion_2: float, + min_crop: int, + faces_upscale: int, + imgsz: int, + device: str, + max_det: int): + print(f"🎬 Starte Detection: {video_path.name}") + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + logging.error(f"❌ Kann Video nicht öffnen: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + orig_w = 
int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + # Wenn frame_skip > 1, reduziert sich die tatsächlich verarbeitete Anzahl + total_to_process = None + if total_frames_raw > 0: + total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip) + + scaled_w = max(1, int(round(orig_w * downscale))) + scaled_h = max(1, int(round(orig_h * downscale))) + + data = [] + frame_idx = 0 + processed_frames = 0 + + sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0 + sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0 + + autocast_ctx = ( + torch.autocast(device_type=device, dtype=torch.float16) + if device in ("mps", "cuda") else nullcontext() + ) + + # Fortschrittsbalken pro Video + bar = None + start_t = time.time() + if _HAS_TQDM: + bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True) + + while True: + ret, frame = cap.read() + if not ret: + break + + if frame_skip > 1 and (frame_idx % frame_skip != 0): + frame_idx += 1 + continue + + frame_infer = frame if downscale == 1.0 else cv2.resize(frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA) + + with torch.no_grad(): + with autocast_ctx: + detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False, + conf=conf_thresh, iou=0.5, max_det=max_det)[0] + + faces = [] + for i in range(len(detections.boxes)): + box = detections.boxes[i] + conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf) + if conf < conf_thresh: + continue + x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()] + if downscale != 1.0: + x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy + x1 = max(0.0, min(x1, orig_w - 1)) + y1 = max(0.0, min(y1, orig_h - 1)) + x2 = max(0.0, min(x2, orig_w - 1)) + y2 = max(0.0, min(y2, orig_h - 1)) + + w = max(1.0, x2 - x1) + h = max(1.0, y2 - y1) + cx = x1 + w / 2.0 + cy = y1 + h / 2.0 + + # Pass 1 + sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop) + if sx2 - sx1 < 4 or sy2 - sy1 < 4: + continue + face_crop = frame[sy1:sy2, sx1:sx2] + _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale) + + # Pass 2 nur wenn nötig + if mouth_open == 0.0: + sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop) + if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4: + face_crop_b = frame[sy1b:sy2b, sx1b:sx2b] + _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale) + + faces.append({ + "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))], + "conf": round(conf, 3), + "center": [round(cx, 1), round(cy, 1)], + "mouth_openness": round(float(mouth_open), 3) + }) + + data.append({ + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 3), + "W": orig_w, + "H": orig_h, + "faces": faces + }) + frame_idx += 1 + processed_frames += 1 + + # Fortschritt aktualisieren + if _HAS_TQDM: + bar.update(1) + else: + # leichter Fallback: ETA Ausgabe alle 30 verarbeitete Frames + if processed_frames % 30 == 0: + elapsed = time.time() - start_t + rate = processed_frames / max(1e-6, elapsed) # frames/sec + if total_to_process: + remaining = max(0, total_to_process - processed_frames) + eta_sec = remaining / max(1e-6, rate) + print(f"[{video_path.name}] {processed_frames}/{total_to_process} " + f"({processed_frames/total_to_process*100:.1f}%) " + f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min") + else: + print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s") + + 
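+     # End of frame loop: release the capture, close the progress bar, then write all
+     # per-frame detections to the per-clip *_faces.json file (compact JSON, ensure_ascii=False).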
cap.release() + if _HAS_TQDM and bar is not None: + bar.close() + + output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8") + print(f"✅ Faces gespeichert: {output_path.name}") + +def main(): + parser = argparse.ArgumentParser() + # Verzeichnisse + parser.add_argument("--input-dir", type=Path, + default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips")) + parser.add_argument("--output-dir", type=Path, + default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined")) + parser.add_argument("--model", type=Path, + default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt")) + # Optimierte Defaults (keine Presets nötig) + parser.add_argument("--conf-thresh", type=float, default=0.35) + parser.add_argument("--frame-skip", type=int, default=1) + parser.add_argument("--downscale", type=float, default=0.5) + parser.add_argument("--expansion", type=float, default=0.4) + parser.add_argument("--expansion2", type=float, default=0.8) + parser.add_argument("--min-crop", type=int, default=160) + parser.add_argument("--faces-upscale", type=int, default=192) + parser.add_argument("--imgsz", type=int, default=448) + parser.add_argument("--max-det", type=int, default=20) + parser.add_argument("--use-refine", action="store_true", default=False) + args = parser.parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + args.output_dir.mkdir(parents=True, exist_ok=True) + + # Model & Device + yolo = YOLO(str(args.model)) + if torch.backends.mps.is_available(): + device = "mps" + elif torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + yolo.to(device) + print(f"🖥️ Inference-Device: {device}") + + # Warmup (reduziert Anlaufschwankungen) + try: + with torch.no_grad(): + dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8) + _ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device) + except Exception: + pass + + # Liste der Videos (für Gesamt-Fortschritt) + videos = sorted(args.input_dir.glob("*.mp4")) + print(f"🔍 Input-Ordner: {args.input_dir.resolve()}") + print("📁 Dateien:") + for p in sorted(args.input_dir.glob("*")): + print(" →", p.name) + + # Gesamt-Fortschrittsbalken pro Datei + outer = None + if _HAS_TQDM: + outer = tqdm(total=len(videos), desc="Gesamt", unit="vid", leave=False) + + with mp.solutions.face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=10, + refine_landmarks=args.use_refine, + min_detection_confidence=0.5, + min_tracking_confidence=0.5 + ) as face_mesh: + for vid in videos: + out = args.output_dir / f"{vid.stem}_faces.json" + process_video( + video_path=vid, + output_path=out, + model=yolo, + face_mesh=face_mesh, + conf_thresh=args.conf_thresh, + frame_skip=args.frame_skip, + downscale=args.downscale, + expansion_1=args.expansion, + expansion_2=args.expansion2, + min_crop=args.min_crop, + faces_upscale=args.faces_upscale, + imgsz=args.imgsz, + device=device, + max_det=args.max_det + ) + if _HAS_TQDM and outer is not None: + outer.update(1) + + if _HAS_TQDM and outer is not None: + outer.close() + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/main_track_faces.py b/src/reformat/new/main_track_faces.py new file mode 100644 index 0000000..53d7347 --- /dev/null +++ b/src/reformat/new/main_track_faces.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python3 +import logging, json +from pathlib import Path +from typing import List, Dict, Any + +def iou(boxA, boxB): + 
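+     """Intersection-over-Union of two [x, y, w, h] boxes; returns 0.0 if the union is empty."""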
xA = max(boxA[0], boxB[0]) + yA = max(boxA[1], boxB[1]) + xB = min(boxA[0]+boxA[2], boxB[0]+boxB[2]) + yB = min(boxA[1]+boxA[3], boxB[1]+boxB[3]) + interW, interH = max(0, xB-xA), max(0, yB-yA) + inter = interW * interH + union = boxA[2]*boxA[3] + boxB[2]*boxB[3] - inter + return inter/union if union > 0 else 0.0 + +def track_faces(faces_all: List[Dict[str,Any]], iou_thresh=0.3): + next_id = 0 + last_boxes = {} # track_id -> bbox + for frame in faces_all: + new_boxes = {} + for face in frame["faces"]: + box = face["bbox"] + # match gegen bestehende + best_id, best_iou = None, 0.0 + for tid, prev_box in last_boxes.items(): + ov = iou(box, prev_box) + if ov > best_iou: + best_id, best_iou = tid, ov + if best_iou > iou_thresh: + face["track_id"] = best_id + new_boxes[best_id] = box + else: + face["track_id"] = next_id + new_boxes[next_id] = box + next_id += 1 + last_boxes = new_boxes + return faces_all + +def main(): + PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") + FACE_DIR = PROJECT_ROOT / "data" / "face_data_combined" + + for f in FACE_DIR.glob("*_faces.json"): + try: + faces_all = json.loads(f.read_text(encoding="utf-8")) + except Exception as e: + print(f"❌ Fehler beim Laden {f.name}: {e}") + continue + + tracked = track_faces(faces_all) + f.write_text(json.dumps(tracked, ensure_ascii=False), encoding="utf-8") + print(f"✅ Track-IDs ergänzt: {f.name}") + + # zusätzlich centers.json (dominant = höchster mouth_openness pro Frame) + centers = [] + for fr in tracked: + if fr["faces"]: + best = max(fr["faces"], key=lambda ff: ff.get("mouth_openness", 0.0)) + centers.append([best["center"][0], best["center"][1]]) + else: + centers.append([fr["W"]/2, fr["H"]/2]) + centers_path = f.with_name(f.stem.replace("_faces","_centers")+".json") + centers_path.write_text(json.dumps(centers, ensure_ascii=False), encoding="utf-8") + print(f"📝 Centers gespeichert: {centers_path.name}") + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/make_segments.py b/src/reformat/new/make_segments.py new file mode 100644 index 0000000..c661485 --- /dev/null +++ b/src/reformat/new/make_segments.py @@ -0,0 +1,179 @@ +#!/usr/bin/env python3 +# src/reformat/new/make_segments.py +from __future__ import annotations +import json, math +from pathlib import Path +from dataclasses import dataclass +from typing import List, Dict, Any, Optional +import numpy as np +import cv2 + +# ==== Pfade (an dein Projekt angepasst) ===================================== +PROJECT_ROOT = Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit") +RAW_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" # Videos +FACE_COMBINED_DIR= PROJECT_ROOT / "data" / "face_data_combined" # *_faces.json +SEGMENTS_DIR = PROJECT_ROOT / "data" / "segments" # Output +SEGMENTS_DIR.mkdir(parents=True, exist_ok=True) +# =========================================================================== + +# === Segment-Parameter === +WIN_SEC = 1.2 # Fensterlänge +STRIDE_SEC = 0.6 # Schrittweite +HYSTERESIS_FACTOR = 1.25 # neuer Sprecher muss +25% besser sein +MIN_SEG_SEC = 1.0 # kürzere Segmente werden an Nachbarn gemerged +CONF_MIN = 0.35 # Sichtbarkeits-Threshold +AREA_CAP_FRAC = 0.12 # ab 12% Framefläche kappen wir den Flächenbonus + +@dataclass +class Segment: + start_f: int + end_f: int + track_id: Optional[int] + +def robust_minmax(vals, p_lo=5, p_hi=95): + v = np.array(vals, dtype=float) + lo, hi = np.percentile(v, [p_lo, p_hi]) + if hi <= lo: hi = lo + 1e-6 + return float(lo), float(hi) + +def score_face(face: 
Dict[str,Any], W: int, H: int, cx: float, cy: float, + lo: float, hi: float) -> float: + # Mundaktivität robust normalisieren + mo = float(face.get("mouth_openness", 0.0)) + mo = (mo - lo) / (hi - lo + 1e-9) + mo = float(np.clip(mo, 0.0, 1.0)) + + x, y, w, h = map(float, face.get("bbox", [0,0,0,0])) + conf = float(face.get("conf", 1.0)) + if conf < CONF_MIN or w <= 5 or h <= 5: # sehr kleine/unsichere Gesichter raus + return 0.0 + + area = (w*h) / (W*H + 1e-9) + size_w = min(1.0, area / AREA_CAP_FRAC) # Flächengewicht + fx = x + w/2; fy = y + h/2 + dist = math.hypot(fx - cx, fy - cy) / math.hypot(W/2, H/2) + center_w = max(0.0, 1.0 - dist**2) # Mitte leicht bevorzugen + + # MO dominiert, Fläche und Mitte geben Stabilität + return mo * (0.6 + 0.3*size_w + 0.1*center_w) + +def build_segments_for_clip(faces_per_frame: List[Dict[str,Any]], fps: float) -> (List[Segment], List[Optional[int]]): + T = len(faces_per_frame) + if T == 0: + return [], [] + + # Framegröße + W = faces_per_frame[0].get("W") or faces_per_frame[0].get("width") + H = faces_per_frame[0].get("H") or faces_per_frame[0].get("height") + if not W or not H: + # Versuch, aus BBox-Max abzuleiten (Fallback) + max_w = max((f["bbox"][0]+f["bbox"][2]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1920 + max_h = max((f["bbox"][1]+f["bbox"][3]) for fr in faces_per_frame for f in fr.get("faces", []) if "bbox" in f) if any(fr.get("faces") for fr in faces_per_frame) else 1080 + W, H = int(max_w), int(max_h) + + # Mundwerte für robuste Normierung sammeln + all_mo = [float(f.get("mouth_openness", 0.0)) + for fr in faces_per_frame for f in fr.get("faces", [])] + lo, hi = robust_minmax(all_mo) if all_mo else (0.0, 1.0) + + win = max(1, int(round(WIN_SEC * fps))) + stride = max(1, int(round(STRIDE_SEC * fps))) + minseg = max(1, int(round(MIN_SEG_SEC * fps))) + + chosen_by_frame: List[Optional[int]] = [None]*T + last_track: Optional[int] = None + + for start in range(0, T, stride): + end = min(T, start + win) + sums: Dict[int, float] = {} + for t in range(start, end): + faces = faces_per_frame[t].get("faces", []) + if not faces: continue + for face in faces: + tid = face.get("track_id") + if tid is None: + continue + s = score_face(face, W, H, W/2, H/2, lo, hi) + if s <= 0: + continue + tid = int(tid) + sums[tid] = sums.get(tid, 0.0) + s + + if not sums: + chosen = last_track + else: + best_tid, best_val = max(sums.items(), key=lambda kv: kv[1]) + if last_track is None: + chosen = best_tid + else: + prev_val = sums.get(last_track, 0.0) + chosen = best_tid if best_val > prev_val * HYSTERESIS_FACTOR else last_track + + for t in range(start, end): + chosen_by_frame[t] = chosen + last_track = chosen + + # Lücken auffüllen + for t in range(T): + if chosen_by_frame[t] is None: + chosen_by_frame[t] = last_track + + # Segmente bauen + segs: List[Segment] = [] + cur = chosen_by_frame[0] + seg_start = 0 + for t in range(1, T): + if chosen_by_frame[t] != cur: + segs.append(Segment(seg_start, t-1, cur)) + cur = chosen_by_frame[t] + seg_start = t + segs.append(Segment(seg_start, T-1, cur)) + + # Mindestlänge: zu kurze an vorheriges mergen + out: List[Segment] = [] + for s in segs: + if out and (s.end_f - s.start_f + 1) < minseg: + out[-1].end_f = s.end_f + else: + out.append(s) + + return out, chosen_by_frame + +def main(): + clips = sorted(list(RAW_DIR.glob("*.mp4")) + list(RAW_DIR.glob("*.mov"))) + if not clips: + print(f"⚠️ Keine Videos in {RAW_DIR}") + return + + for vid in 
clips: + name = vid.stem + faces_path = FACE_COMBINED_DIR / f"{name}_faces.json" + if not faces_path.exists(): + print(f"⏭️ Skip (keine Faces): {faces_path.name}") + continue + + # FPS vom Video + cap = cv2.VideoCapture(str(vid)) + if not cap.isOpened(): + print(f"❌ Kann Video nicht öffnen: {vid.name}") + continue + fps = cap.get(cv2.CAP_PROP_FPS) or 25.0 + cap.release() + + try: + faces_per_frame = json.loads(faces_path.read_text(encoding="utf-8")) + except Exception as e: + print(f"❌ Fehler beim Lesen {faces_path.name}: {e}") + continue + + segs, chosen = build_segments_for_clip(faces_per_frame, fps) + + seg_out = SEGMENTS_DIR / f"{name}_segments.json" + map_out = SEGMENTS_DIR / f"{name}_target_by_frame.json" + seg_out.write_text(json.dumps([s.__dict__ for s in segs], ensure_ascii=False), encoding="utf-8") + map_out.write_text(json.dumps(chosen, ensure_ascii=False), encoding="utf-8") + + print(f"✅ Segmente erzeugt: {seg_out.name} ({len(segs)} Segmente)") + +if __name__ == "__main__": + main() diff --git a/src/reformat/new/smart_speaker_tracker.py b/src/reformat/new/smart_speaker_tracker.py new file mode 100644 index 0000000..5875c54 --- /dev/null +++ b/src/reformat/new/smart_speaker_tracker.py @@ -0,0 +1,58 @@ +from typing import Dict, List, Tuple, Optional +from .tracking import FaceTracker + +class SmartSpeakerTracker: + def __init__(self): + self.face_tracker = FaceTracker() + self.movement_per_id: Dict[int, float] = {} + self.prev_openness: Dict[int, float] = {} + self.confirmation_counter: Dict[int, int] = {} + self.speaker_threshold = 3.0 # wie viel Lippenbewegung braucht es mind. + self.decay_factor = 0.9 # wie schnell "verblasst" die Bewegung + self.speaker_confirm_frames = 25 # wie viele Frames muss ein Sprecher dominieren + self.speaker_id: Optional[int] = None + + def update(self, faces: List[Dict]) -> Tuple[float, float]: + if not faces: + return self.face_tracker.update([]) + + # Lippenbewegung analysieren + for face in faces: + id = face.get("id") + openness = face.get("mouth_openness", 0.0) + prev = self.prev_openness.get(id, openness) + movement = abs(openness - prev) + + # Bewegung aufaddieren mit Decay + old_score = self.movement_per_id.get(id, 0.0) * self.decay_factor + self.movement_per_id[id] = old_score + movement + self.prev_openness[id] = openness + + # Finde ID mit größter Bewegung + if self.movement_per_id: + top_id = max(self.movement_per_id, key=self.movement_per_id.get) + top_movement = self.movement_per_id[top_id] + + if top_movement >= self.speaker_threshold: + self.confirmation_counter[top_id] = self.confirmation_counter.get(top_id, 0) + 1 + # Andere runterzählen + for other_id in self.confirmation_counter: + if other_id != top_id: + self.confirmation_counter[other_id] = max(0, self.confirmation_counter[other_id] - 1) + + # Wenn lange genug bestätigt, neuer Sprecher + if self.confirmation_counter[top_id] >= self.speaker_confirm_frames: + self.speaker_id = top_id + else: + # Wenn keiner über der Schwelle → kein neuer Sprecher + self.confirmation_counter = {k: max(0, v - 1) for k, v in self.confirmation_counter.items()} + + # Sprecher vorhanden → dahin zentrieren + if self.speaker_id is not None: + for face in faces: + if face.get("id") == self.speaker_id: + return tuple(face["center"]) + + # Fallback: stabiler Durchschnitt + centers = [tuple(face["center"]) for face in faces] + return self.face_tracker.update(centers) diff --git a/src/reformat/new/speaker_crop_from_segments.py b/src/reformat/new/speaker_crop_from_segments.py new file mode 100644 index 
0000000..5d25c3f --- /dev/null +++ b/src/reformat/new/speaker_crop_from_segments.py @@ -0,0 +1,67 @@ +import json +from pathlib import Path +from typing import List, Dict + +# === Pfade === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[2] + +FACES_PATH = PROJECT_ROOT / "data" / "face_data_combined" / "testVideoShort_faces.json" +SEGMENTS_PATH = PROJECT_ROOT / "data" / "transkripte" / "testVideoShort_segments.json" +OUTPUT_PATH = PROJECT_ROOT / "data" / "face_crop_centers" / "testVideoShort_centers.json" + +FPS = 25 # Muss zur Framerate deines Videos passen + +# === Dateien laden === +with open(FACES_PATH) as f: + face_data = json.load(f) + +with open(SEGMENTS_PATH) as f: + segments = json.load(f) + +# === Zentrierungen pro Frame bestimmen === +frame_centers: List[List[float]] = [] + +for segment in segments: + start_sec = segment["start"] + end_sec = segment["end"] + start_f = int(start_sec * FPS) + end_f = int(end_sec * FPS) + + # Lippenbewegung pro ID in diesem Segment aufaddieren + movement: Dict[int, float] = {} + count: Dict[int, int] = {} + + for f in range(start_f, min(end_f, len(face_data))): + for face in face_data[f]["faces"]: + id = face.get("id") + openness = face.get("mouth_openness", 0.0) + movement[id] = movement.get(id, 0.0) + openness + count[id] = count.get(id, 0) + 1 + + # Durchschnitt berechnen + avg_movement = {id: movement[id] / count[id] for id in movement if count[id] > 0} + if not avg_movement: + speaker_id = None + else: + speaker_id = max(avg_movement, key=avg_movement.get) + + # Für jedes Frame in diesem Segment den Sprecher zentrieren + for f in range(start_f, min(end_f, len(face_data))): + faces = face_data[f].get("faces", []) + center = [960.0, 540.0] # Fallback + + if speaker_id is not None: + for face in faces: + if face.get("id") == speaker_id: + center = face["center"][:2] + break + + frame_centers.append([round(center[0], 2), round(center[1], 2)]) + +# === Ergebnis speichern === +OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True) +with open(OUTPUT_PATH, "w") as f: + json.dump(frame_centers, f, indent=2) + +print(f"✅ Zentrierung auf Sprecher für {len(frame_centers)} Frames gespeichert unter:\n{OUTPUT_PATH}") diff --git a/src/reformat/new/tracking.py b/src/reformat/new/tracking.py new file mode 100644 index 0000000..838b6c6 --- /dev/null +++ b/src/reformat/new/tracking.py @@ -0,0 +1,84 @@ +from typing import List, Tuple, Optional + + +class FaceTracker: + def __init__( + self, + dist_threshold: float = 200.0, + switch_frames: int = 5, + panning_window: int = 10, + panning_threshold: float = 40.0, + smooth_window: int = 3, + scene_jump_threshold: float = 400.0 + ): + self.dist_threshold = dist_threshold + self.switch_frames = switch_frames + self.panning_window = panning_window + self.panning_threshold = panning_threshold + self.smooth_window = smooth_window + self.scene_jump_threshold = scene_jump_threshold + + self.current_center: Tuple[float, float] = (960.0, 540.0) # Default Mitte (bei 1920x1080) + self.raw_center: Tuple[float, float] = self.current_center + self.prev_center: Tuple[float, float] = self.current_center + self.prev_raw: Tuple[float, float] = self.current_center + self.candidate_center: Optional[Tuple[float, float]] = None + self.switch_counter = 0 + + self.recent_raw_centers: List[Tuple[float, float]] = [] + self.recent_final_centers: List[Tuple[float, float]] = [] + + def update(self, candidates: List[Tuple[float, float]]) -> Tuple[float, float]: + if not candidates: + # kein Gesicht → verwende alten 
Wert + self.recent_raw_centers.append(self.raw_center) + self.recent_final_centers.append(self.current_center) + return self.current_center + + # nehme das Gesicht, das am nächsten zur vorherigen Position ist + new_center = min(candidates, key=lambda pt: self._distance(self.prev_center, pt)) + self.raw_center = new_center + self.recent_raw_centers.append(new_center) + + dist = self._distance(self.prev_raw, new_center) + if dist > self.scene_jump_threshold: + self.current_center = new_center + self.prev_center = new_center + self.prev_raw = new_center + self._smooth_reset() + return self.current_center + + if dist > self.dist_threshold: + if self.candidate_center != new_center: + self.candidate_center = new_center + self.switch_counter = 1 + else: + self.switch_counter += 1 + if self.switch_counter >= self.switch_frames: + self.prev_center = self.current_center + self.current_center = new_center + self.prev_raw = new_center + self.switch_counter = 0 + else: + self.switch_counter = 0 + self.prev_raw = new_center + + # Smoothes Nachziehen + smoothed = self._moving_average(self.current_center, new_center, self.smooth_window) + self.prev_center = self.current_center + self.current_center = smoothed + self.recent_final_centers.append(smoothed) + + return smoothed + + def _moving_average(self, old, new, factor): + x = (old[0] * (factor - 1) + new[0]) / factor + y = (old[1] * (factor - 1) + new[1]) / factor + return (x, y) + + def _distance(self, pt1, pt2): + return ((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2) ** 0.5 + + def _smooth_reset(self): + self.recent_raw_centers.clear() + self.recent_final_centers.clear() diff --git a/src/reformat/new/utils.py b/src/reformat/new/utils.py new file mode 100644 index 0000000..0ea37c6 --- /dev/null +++ b/src/reformat/new/utils.py @@ -0,0 +1,129 @@ +# src/utils.py +from __future__ import annotations +import json +import logging +import os +from pathlib import Path +from typing import Any, Dict, Tuple + +try: + import cv2 +except Exception: + cv2 = None # erlaubt Import ohne OpenCV (z.B. 
beim reinen Testen) + +# --- Logging --------------------------------------------------------------- + +def setup_logging(debug: bool = False) -> None: + level = logging.DEBUG if debug else logging.INFO + logging.basicConfig( + level=level, + format="%(asctime)s | %(levelname)s | %(message)s", + ) + +# --- Mathe/Helpers --------------------------------------------------------- + +def clamp(v: float, lo: float, hi: float) -> float: + return max(lo, min(hi, v)) + +def compute_crop_width(orig_w: int, orig_h: int, out_w: int = 1080, out_h: int = 1920) -> int: + # Für 9:16 Ziel: Breite = (9/16) * orig_h, standardmäßig 1080x1920 + return int((out_w / out_h) * orig_h) + +def iou(boxA, boxB) -> float: + """Berechnet Intersection-over-Union zweier Bounding-Boxes.""" + ax1, ay1, aw, ah = boxA + ax2, ay2 = ax1 + aw, ay1 + ah + bx1, by1, bw, bh = boxB + bx2, by2 = bx1 + bw, by1 + bh + + inter_x1 = max(ax1, bx1) + inter_y1 = max(ay1, by1) + inter_x2 = min(ax2, bx2) + inter_y2 = min(ay2, by2) + inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) + + union_area = aw * ah + bw * bh - inter_area + return inter_area / union_area if union_area > 0 else 0 + +# --- IO -------------------------------------------------------------------- + +def load_json(path: Path) -> Any: + if not path.exists(): + raise FileNotFoundError(f"Datei fehlt: {path}") + with open(path, "r", encoding="utf-8") as f: + return json.load(f) + +def save_json(obj: Any, path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as f: + json.dump(obj, f, ensure_ascii=False, indent=2) + +def ensure_exists(path: Path, what: str = "Datei/Ordner") -> None: + if not path.exists(): + raise FileNotFoundError(f"{what} nicht gefunden: {path}") + +# --- Video / Pfade --------------------------------------------------------- + +def get_fps(video_path: Path, fallback: float = 25.0) -> float: + if cv2 is None: + logging.warning("OpenCV nicht verfügbar – nutze FPS-Fallback %.2f", fallback) + return fallback + cap = cv2.VideoCapture(str(video_path)) + fps = cap.get(5) # cv2.CAP_PROP_FPS + cap.release() + if not fps or fps <= 1e-3: + logging.warning("Konnte FPS nicht lesen – nutze Fallback %.2f", fallback) + return fallback + return float(fps) + +def project_root_from(file: Path) -> Path: + # Dein Projekt nutzt häufig parents[2]; kapseln: + return file.resolve().parents[3] + +def resolve_paths(project_root: Path, base_name: str) -> Dict[str, Path]: + data = project_root / "data" + return { + "timed_path": data / "transkripte" / f"{base_name}_timed.txt", + "segments_path":data / "transkripte" / f"{base_name}_segments.json", + "faces_path": data / "face_data_combined" / f"{base_name}_faces.json", + "centers_path": data / "face_crop_centers" / f"{base_name}_centers.json", + "video_path": data / "output" / "raw_clips" / f"{base_name}.mp4", + "out_9x16_dir": project_root / "output" / "output_9x16_final", + "face_debug_dir": project_root / "output" / "debug" / "faces", + } + +def require_api_key(env_name: str = "OPENAI_API_KEY") -> str: + key = os.getenv(env_name) + if not key: + raise RuntimeError( + f"Umgebungsvariable {env_name} fehlt. 
" + f"Exportiere sie z.B.: export {env_name}='sk-...'") + return key + +# --- Simple smoothing for centers ------------------------------------------ + +from typing import List, Optional + +class CenterSmoother: + """Glättet Zentren mit Moving Average und optionaler Jump-Erkennung.""" + def __init__(self, window: int = 7, jump_thresh: float = 120.0): + self.window = window + self.jump_thresh = jump_thresh + self.buffer: List[Tuple[float, float]] = [] + self.prev: Optional[Tuple[float, float]] = None + + def push(self, cx: float, cy: float) -> Tuple[float, float]: + if self.prev is not None: + dx = abs(cx - self.prev[0]) + abs(cy - self.prev[1]) + if dx > self.jump_thresh: + # harter Cut: reset buffer + self.buffer.clear() + + self.buffer.append((cx, cy)) + if len(self.buffer) > self.window: + self.buffer.pop(0) + + avgx = sum(p[0] for p in self.buffer) / len(self.buffer) + avgy = sum(p[1] for p in self.buffer) / len(self.buffer) + self.prev = (avgx, avgy) + return self.prev diff --git a/src/reformat/old/analyze_crop_position.py b/src/reformat/old/analyze_crop_position.py new file mode 100644 index 0000000..33a832a --- /dev/null +++ b/src/reformat/old/analyze_crop_position.py @@ -0,0 +1,235 @@ +import argparse +import json +import logging +import math +import random +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple + + +class FaceTracker: + def __init__( + self, + dist_threshold: float, + switch_frames: int, + panning_window: int, + panning_threshold: float, + smooth_window: int, + scene_jump_threshold: float, + ): + self.dist_threshold = dist_threshold + self.switch_frames = switch_frames + self.panning_window = panning_window + self.panning_threshold = panning_threshold + self.smooth_window = smooth_window + self.scene_jump_threshold = scene_jump_threshold + + self.current_center: Tuple[float, float] = (960.0, 540.0) + self.raw_center: Tuple[float, float] = self.current_center + self.prev_center: Tuple[float, float] = self.current_center + self.prev_raw: Tuple[float, float] = self.current_center + self.candidate_center: Optional[Tuple[float, float]] = None + self.switch_counter: int = 0 + self.last_speaker_set: bool = False + self.random_center: Optional[Tuple[float, float]] = None + + self.panning_buffer: List[float] = [] + self.smooth_buffer: List[Tuple[float, float]] = [] + + def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]: + valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None] + all_faces = [f for f in faces if f.get("center")] + + # Speaker tracking + if valid_faces: + self._update_speaker(valid_faces) + else: + self._retain_or_random_center(all_faces) + + # Panning detection + is_panning = self._detect_panning() + + # Smooth / moving average + center = self._smooth_center() + + return (int(center[0]), int(center[1])), is_panning + + def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None: + best = max(valid_faces, key=lambda x: x["mouth_openness"]) + cx, cy, *_ = best["center"] + new_center = (cx, cy) + + dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1]) + if dist < self.dist_threshold: + self.raw_center = new_center + self.candidate_center = None + self.switch_counter = 0 + else: + if ( + self.candidate_center is None + or math.hypot( + new_center[0] - self.candidate_center[0], new_center[1] - self.candidate_center[1] + ) + > self.dist_threshold + ): + self.candidate_center = new_center + self.switch_counter = 1 + else: + 
self.switch_counter += 1 + + if self.switch_counter >= self.switch_frames: + self.raw_center = self.candidate_center # type: ignore + self.candidate_center = None + self.switch_counter = 0 + + self.random_center = None + self.last_speaker_set = True + + def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None: + if self.last_speaker_set: + # keep previous raw_center + pass + elif self.random_center is not None: + self.raw_center = self.random_center + elif all_faces: + f = random.choice(all_faces) + cx, cy, *_ = f["center"] + self.random_center = (cx, cy) + self.raw_center = self.random_center + + def _detect_panning(self) -> bool: + dx = self.raw_center[0] - self.prev_raw[0] + self.panning_buffer.append(dx) + if len(self.panning_buffer) > self.panning_window: + self.panning_buffer.pop(0) + avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer) + is_panning = avg_dx > self.panning_threshold + self.prev_raw = self.raw_center + return is_panning + + def _smooth_center(self) -> Tuple[float, float]: + sudden_jump = ( + math.hypot( + self.raw_center[0] - self.prev_center[0], + self.raw_center[1] - self.prev_center[1], + ) + > self.scene_jump_threshold + ) + if not sudden_jump: + self.smooth_buffer.append(self.raw_center) + if len(self.smooth_buffer) > self.smooth_window: + self.smooth_buffer.pop(0) + avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer) + avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer) + center = (avg_x, avg_y) + else: + center = self.raw_center + self.smooth_buffer.clear() + + self.prev_center = center + return center + + +def parse_args() -> argparse.Namespace: + script_dir = Path(__file__).resolve().parent + project_root = script_dir.parents[1] + default_input = project_root / "data" / "face_data_combined" + default_output = project_root / "data" / "face_crop_centers" + + parser = argparse.ArgumentParser( + description="Track and smooth face crop centers based on mouth openness." 
+ ) + parser.add_argument( + "-i", "--input-dir", type=Path, + default=default_input, + help=f"Directory containing *_faces.json files (default: {default_input})" + ) + parser.add_argument( + "-o", "--output-dir", type=Path, + default=default_output, + help=f"Directory to save *_centers.json files (default: {default_output})" + ) + parser.add_argument( + "--dist-threshold", type=float, default=30.0, + help="Pixel distance threshold to switch speaker" + ) + parser.add_argument( + "--switch-frames", type=int, default=20, + help="Number of consecutive frames required to confirm speaker switch" + ) + parser.add_argument( + "--panning-window", type=int, default=30, + help="Frame window size for panning detection" + ) + parser.add_argument( + "--panning-threshold", type=float, default=3.0, + help="Average dx threshold for panning detection" + ) + parser.add_argument( + "--smooth-window", type=int, default=8, + help="Moving average window for smoothing" + ) + parser.add_argument( + "--scene-jump-threshold", type=float, default=300.0, + help="Jump threshold to detect scene cuts" + ) + return parser.parse_args() + + +def setup_logging() -> None: + logging.basicConfig( + format="%(asctime)s %(levelname)s: %(message)s", + level=logging.INFO, + ) + + +def main() -> None: + setup_logging() + args = parse_args() + + input_dir: Path = args.input_dir.resolve() + output_dir: Path = args.output_dir.resolve() + output_dir.mkdir(parents=True, exist_ok=True) + + tracker = FaceTracker( + dist_threshold=args.dist_threshold, + switch_frames=args.switch_frames, + panning_window=args.panning_window, + panning_threshold=args.panning_threshold, + smooth_window=args.smooth_window, + scene_jump_threshold=args.scene_jump_threshold, + ) + + json_files = sorted(input_dir.glob("*_faces.json")) + if not json_files: + logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir) + return + + logging.info("Gefundene Dateien: %d", len(json_files)) + + for json_path in json_files: + logging.info("Verarbeite %s", json_path.name) + try: + frames_data = json.loads(json_path.read_text()) + except json.JSONDecodeError as e: + logging.error("JSON-Fehler in %s: %s", json_path.name, e) + continue + + out_data: List[Dict[str, Any]] = [] + for frame_idx, frame in enumerate(frames_data): + faces = frame.get("faces", []) + center, is_panning = tracker.process_frame(faces) + out_data.append({ + "frame": frame_idx, + "center": [center[0], center[1]], + "panning": is_panning, + }) + + out_path = output_dir / f"{json_path.stem.replace('_faces', '')}_centers.json" + with out_path.open("w") as f: + json.dump(out_data, f, indent=2) + logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data)) + + +if __name__ == "__main__": + main() diff --git a/src/reformat/old/crop_to_speaker.py b/src/reformat/old/crop_to_speaker.py new file mode 100644 index 0000000..553bbbd --- /dev/null +++ b/src/reformat/old/crop_to_speaker.py @@ -0,0 +1,180 @@ +import json +import cv2 +import subprocess +from pathlib import Path + +# === Pfade & globale Settings === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] + +INPUT_VIDEO_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +INPUT_CENTER_DIR = PROJECT_ROOT / "data" / "face_crop_centers" +INPUT_FACES_DIR = PROJECT_ROOT / "data" / "face_data_combined" +OUTPUT_DIR = PROJECT_ROOT / "output" / "output_9x16_final" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +OUT_W, OUT_H = 1080, 1920 + +DEBUG_MODE = True +DEBUG_SCALE = 0.75 +# Ab welcher Offenheit wir 
"Bewegung" annehmen +DEBUG_MOUTH_THRESHOLD = 0.02 + +# === Hilfsfunktionen === +def clamp(v, lo, hi): + return max(lo, min(hi, v)) + +def compute_crop_width(orig_w, orig_h): + return int((OUT_W / OUT_H) * orig_h) + +# === Verarbeitung === +for center_path in sorted(INPUT_CENTER_DIR.glob("*_centers.json")): + stem = center_path.stem.replace("_centers", "") + video_path = INPUT_VIDEO_DIR / f"{stem}.mp4" + faces_path = INPUT_FACES_DIR / f"{stem}_faces.json" + + if not video_path.exists(): + print(f"⚠️ Video fehlt: {stem}.mp4") + continue + if not faces_path.exists(): + print(f"⚠️ Gesichtsdaten fehlen: {stem}_faces.json") + continue + + centers_data = json.loads(center_path.read_text()) + faces_data = json.loads(faces_path.read_text()) + + # Debug-Liste pro Video anlegen + if DEBUG_MODE: + debug_results: list = [] + + cap = cv2.VideoCapture(str(video_path)) + fps = cap.get(cv2.CAP_PROP_FPS) + orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + crop_w = compute_crop_width(orig_w, orig_h) + crop_h = orig_h + + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + temp_vid = OUTPUT_DIR / f"{stem}_cropped.mp4" + out_vid = cv2.VideoWriter(str(temp_vid), fourcc, fps, (OUT_W, OUT_H)) + if not out_vid.isOpened(): + print(f"❌ Kann nicht schreiben: {temp_vid.name}") + continue + + if DEBUG_MODE: + cv2.namedWindow("Debug", cv2.WINDOW_NORMAL) + + frame_idx = 0 + while True: + ret, frame = cap.read() + if not ret or frame_idx >= len(centers_data): + break + + # Crop-Infos + info = centers_data[frame_idx] + cx, cy = info["center"] + is_panning = info.get("panning", False) + if is_panning: + cx = orig_w // 2 + + x0 = int(cx - crop_w / 2) + x0 = clamp(x0, 0, orig_w - crop_w) + y0 = 0 + + # Ausschneiden + Resize + crop = frame[y0:y0+crop_h, x0:x0+crop_w] + if crop.shape[1] != crop_w or crop.shape[0] != crop_h: + crop = cv2.copyMakeBorder( + crop, 0, crop_h - crop.shape[0], + 0, crop_w - crop.shape[1], + cv2.BORDER_CONSTANT, value=[0, 0, 0] + ) + out_frame = cv2.resize(crop, (OUT_W, OUT_H), interpolation=cv2.INTER_LINEAR) + out_vid.write(out_frame) + + if DEBUG_MODE: + debug_frame = frame.copy() + frame_faces = faces_data[frame_idx].get("faces", []) + + # Build debug entry for this frame + dbg_faces = [] + for f in frame_faces: + # center und Offenheit + cx_f, cy_f = map(int, f["center"][:2]) + openness = f.get("mouth_openness", 0.0) + moving = openness > DEBUG_MOUTH_THRESHOLD + dbg_faces.append({ + "center": [cx_f, cy_f], + "mouth_openness": openness, + "mouth_moving": moving + }) + + # Anzeige im Debug-Fenster + cv2.circle(debug_frame, (cx_f, cy_f), 4, (180, 180, 180), -1) + cv2.putText( + debug_frame, + f"{round(openness,2)}", + (cx_f + 6, cy_f - 6), + cv2.FONT_HERSHEY_SIMPLEX, + 0.5, + (255, 255, 255), + 1, + cv2.LINE_AA + ) + # roter Punkt, wenn Bewegung + color = (0,0,255) if moving else (0,255,255) + cv2.circle(debug_frame, (cx_f, cy_f), 6, color, 1) + + debug_results.append({ + "frame": frame_idx, + "faces": dbg_faces + }) + + # Haupt-Center & Crop-Rahmen + cv2.circle(debug_frame, (int(cx), int(cy)), 18, (0, 255, 0), 2) + cv2.rectangle(debug_frame, (x0, 0), (x0 + crop_w, crop_h), (0, 0, 255), 2) + + dbg = cv2.resize( + debug_frame, + (int(orig_w * DEBUG_SCALE), int(orig_h * DEBUG_SCALE)) + ) + cv2.imshow("Debug", dbg) + if cv2.waitKey(1) & 0xFF == ord('q'): + break + + frame_idx += 1 + + cap.release() + out_vid.release() + if DEBUG_MODE: + cv2.destroyAllWindows() + + # Audio extrahieren & muxen + audio_tmp = OUTPUT_DIR / f"{stem}_audio.aac" + final_vid = OUTPUT_DIR 
/ f"{stem}.mp4" + try: + subprocess.run( + ["ffmpeg", "-y", "-i", str(video_path), "-vn", "-acodec", "copy", str(audio_tmp)], + check=True + ) + subprocess.run( + ["ffmpeg", "-y", "-i", str(temp_vid), "-i", str(audio_tmp), + "-c:v", "copy", "-c:a", "aac", "-b:a", "128k", str(final_vid)], + check=True + ) + finally: + try: temp_vid.unlink() + except: pass + try: audio_tmp.unlink() + except: pass + + # Debug-JSON schreiben + if DEBUG_MODE: + dbg_path = OUTPUT_DIR / f"{stem}_debug.json" + with dbg_path.open("w") as f: + json.dump(debug_results, f, indent=2) + print(f"🛠️ Debug-Daten: {dbg_path.name}") + + print(f"✅ Finales Video: {final_vid.name}") + +print("\n🏁 Alle Videos fertig in:", OUTPUT_DIR.resolve()) diff --git a/src/reformat/old/detect_speaking_faces.py b/src/reformat/old/detect_speaking_faces.py new file mode 100644 index 0000000..f439d30 --- /dev/null +++ b/src/reformat/old/detect_speaking_faces.py @@ -0,0 +1,126 @@ +import json +from pathlib import Path +from collections import defaultdict +import numpy as np + +# === Einstellungen === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] +INPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined" +OUTPUT_PATH = INPUT_DIR / "dominant_faces.json" + +SEGMENT_LENGTH = 2.0 # Länge jedes Segments in Sekunden +MOUTH_THRESHOLD = 0.01 # minimale Mundöffnung, um einen Sprecher zu zählen +SMOOTH_WINDOW = 5 # Fenstergröße (in Segmenten) für Moving Average + +def analyze_clip_timed(path): + # 1) JSON einlesen + try: + data = json.loads(path.read_text()) + except Exception as e: + print(f"❌ Fehler beim Lesen von {path.name}: {e}") + return None + + # 2) Nur valide Frames verwenden + frames = [d for d in data if "timestamp" in d and isinstance(d.get("faces"), list)] + if not frames: + print(f"⚠️ Keine validen Frames in {path.name}") + return None + + frames.sort(key=lambda x: x["timestamp"]) + max_time = frames[-1]["timestamp"] + + # 3) Segmente erzeugen und dominanten Sprecher per Segment finden + segments = [] + t = 0.0 + while t < max_time: + t_end = t + SEGMENT_LENGTH + face_scores = defaultdict(list) # mouth_openness pro bbox + face_boxes = defaultdict(list) # raw bbox pro bbox-key + face_centers = defaultdict(list) # center [cx,cy,w,h] pro bbox-key + + # alle Frames durchsuchen, die in dieses Segment fallen + for f in frames: + ts = f["timestamp"] + if t <= ts < t_end: + for face in f["faces"]: + bbox = face["bbox"] # [x,y,w,h] + score = face.get("mouth_openness", 0.0) + center = face.get("center", None) # [cx,cy,w,h] + key = tuple(bbox) + + if score >= MOUTH_THRESHOLD and center is not None: + face_scores[key].append(score) + face_boxes[key].append(bbox) + face_centers[key].append(center) + + if face_scores: + # den Key mit dem höchsten Durchschnittsscore wählen + avg_scores = {k: np.mean(v) for k, v in face_scores.items()} + dominant_key = max(avg_scores, key=avg_scores.get) + + # mittlere Bounding‑Box und mittleres Center berechnen + avg_bbox = np.mean(face_boxes[dominant_key], axis=0).astype(int).tolist() + avg_center = np.mean(face_centers[dominant_key], axis=0).tolist() # [cx,cy,w,h] + + segments.append({ + "start": round(t, 2), + "end": round(t_end if t_end < max_time else max_time, 2), + "bbox": avg_bbox, + "center": [float(avg_center[0]), float(avg_center[1]), float(avg_center[2]), float(avg_center[3])] + }) + + t += SEGMENT_LENGTH + + if not segments: + print(f"⚠️ Keine Segmente für Clip {path.name}") + return None + + # 4) Glätten der Segment‑Zentren mit Moving Average + seg_centers = [s["center"] for 
s in segments] # Liste von [cx,cy,w,h] + sm_centers = [] + n = len(seg_centers) + half = SMOOTH_WINDOW // 2 + + for i in range(n): + start = max(0, i - half) + end = min(n, i + half + 1) + window = seg_centers[start:end] + avg = np.mean(window, axis=0) # ergibt [cx,cy,w,h] + sm_centers.append(avg.tolist()) + + # 5) Ausgabe des geglätteten Pfades in die Konsole + print(f"\n🔄 Smoothed path für Clip {path.stem}:") + for i, s in enumerate(segments): + cx, cy, w, h = sm_centers[i] + print(f" Segment {i} [{s['start']}–{s['end']}s]: " + f"center=({cx:.1f}, {cy:.1f}), size=({w:.1f}×{h:.1f})") + + # 6) Neue Felder für Ausgabe‑JSON bauen + sm_segments = [] + for i, s in enumerate(segments): + cx, cy, w, h = sm_centers[i] + x0 = int(cx - w/2) + y0 = int(cy - h/2) + sm_segments.append({ + "start": s["start"], + "end": s["end"], + "bbox": [x0, y0, int(w), int(h)] + }) + + return { + "clip": path.stem.replace("_faces", "") + ".mp4", + "segments": sm_segments + } + + +# === Hauptschleife über alle Clips === +results = [] +for json_file in sorted(INPUT_DIR.glob("*_faces.json")): + out = analyze_clip_timed(json_file) + if out: + results.append(out) + +OUTPUT_PATH.write_text(json.dumps(results, indent=2)) +print(f"\n✅ Analyse abgeschlossen – {len(results)} Clips erkannt.") +print(f"📄 Gespeichert in: {OUTPUT_PATH.resolve()}") diff --git a/src/reformat/old/grid_faces_from_yolo.py b/src/reformat/old/grid_faces_from_yolo.py new file mode 100644 index 0000000..b9de56f --- /dev/null +++ b/src/reformat/old/grid_faces_from_yolo.py @@ -0,0 +1,114 @@ +import json +import cv2 +import numpy as np +from pathlib import Path +from tqdm import tqdm +from collections import defaultdict, Counter +from sklearn.cluster import DBSCAN + +# === Einstellungen === +SCRIPT_DIR = Path(__file__).resolve().parent +VIDEO_DIR = SCRIPT_DIR.parents[1] / "output" +FACE_JSON_DIR = SCRIPT_DIR / "face_data_yolo" +OUTPUT_DIR = SCRIPT_DIR.parents[1] / "output_stacked_faces" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +OUT_WIDTH = 1080 +OUT_HEIGHT = 1920 +GRID_ROWS = 4 +FACE_CROP_HEIGHT = OUT_HEIGHT // GRID_ROWS +FACE_CROP_WIDTH = OUT_WIDTH + +# === Hilfsfunktion +def bbox_center(bbox): + x, y, w, h = bbox + return int(x + w // 2), int(y + h // 2) + +# === Hauptverarbeitung === +for json_path in tqdm(sorted(FACE_JSON_DIR.glob("*_faces.json")), desc="🔍 Erzeuge Grid-Clips"): + video_name = json_path.stem.replace("_faces", "") + ".mp4" + video_path = VIDEO_DIR / video_name + if not video_path.exists(): + print(f"❌ Video nicht gefunden: {video_name}") + continue + + data = json.loads(json_path.read_text()) + + # === Alle Gesichtszentren sammeln + all_faces = [] + for frame in data: + for face in frame["faces"]: + center = bbox_center(face["bbox"]) + all_faces.append((center, face["bbox"])) + + if not all_faces: + print(f"⚠️ Keine Gesichter erkannt in {video_name}") + continue + + # === Clustern + coords = [pos for pos, _ in all_faces] + clustering = DBSCAN(eps=80, min_samples=5).fit(coords) + cluster_labels = clustering.labels_ + label_counts = Counter(cluster_labels) + most_common_labels = [lbl for lbl, _ in label_counts.most_common(GRID_ROWS) if lbl != -1] + + if not most_common_labels: + print(f"⚠️ Keine gültigen Cluster in {video_name}") + continue + + # === Zuordnung: cluster_id → feste Zeile + cluster_faces = defaultdict(list) + for (_, bbox), label in zip(all_faces, cluster_labels): + if label in most_common_labels: + cluster_faces[label].append(bbox) + + def cluster_y(label): + return np.mean([bbox[1] for bbox in cluster_faces[label]]) + 
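
The clustering step just above drives the row assignment that follows: DBSCAN groups every face center observed across the clip, the GRID_ROWS largest clusters are kept, and each cluster is pinned to one output row ordered top-to-bottom by its mean y position. A minimal, self-contained sketch of that idea with synthetic centers (hypothetical values, not data from the pipeline):

```python
# Illustration only: synthetic face centers, hypothetical values (not pipeline data).
import numpy as np
from collections import Counter
from sklearn.cluster import DBSCAN

centers = np.array(
    [[400, 300]] * 30 +      # person A, stable position upper-left
    [[1500, 320]] * 30 +     # person B, stable position upper-right
    [[900, 900]]             # single stray detection -> DBSCAN noise (-1)
)

labels = DBSCAN(eps=80, min_samples=5).fit(centers).labels_
counts = Counter(int(l) for l in labels if l != -1)
top_labels = [lbl for lbl, _ in counts.most_common(2)]   # keep the GRID_ROWS biggest clusters

# Order clusters by mean y so the upper face ends up in row 0.
mean_y = {lbl: centers[labels == lbl][:, 1].mean() for lbl in top_labels}
label_to_row = {lbl: row for row, lbl in enumerate(sorted(top_labels, key=mean_y.get))}
print(label_to_row)   # e.g. {0: 0, 1: 1} -> cluster with the smaller mean y gets row 0
```
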
+ sorted_labels = sorted(most_common_labels, key=cluster_y) + label_to_row = {lbl: idx for idx, lbl in enumerate(sorted_labels)} + + # === cluster_id zu jedem Gesicht hinzufügen + for frame in data: + for face in frame["faces"]: + center = bbox_center(face["bbox"]) + distances = [np.linalg.norm(np.array(center) - np.array(c)) for c in coords] + nearest = np.argmin(distances) + label = cluster_labels[nearest] + face["cluster_id"] = label + + # === Video verarbeiten + cap = cv2.VideoCapture(str(video_path)) + fps = cap.get(cv2.CAP_PROP_FPS) + out_path = OUTPUT_DIR / video_name.replace(".mp4", "_stacked.mp4") + fourcc = cv2.VideoWriter_fourcc(*"mp4v") + writer = cv2.VideoWriter(str(out_path), fourcc, fps, (OUT_WIDTH, OUT_HEIGHT)) + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret or frame_idx >= len(data): + break + + output_frame = np.zeros((OUT_HEIGHT, OUT_WIDTH, 3), dtype=np.uint8) + for face in data[frame_idx]["faces"]: + label = face.get("cluster_id", -1) + if label not in label_to_row: + continue + row = label_to_row[label] + x, y, w, h = face["bbox"] + crop = frame[y:y+h, x:x+w] + if crop.size == 0: + continue + resized = cv2.resize(crop, (FACE_CROP_WIDTH, FACE_CROP_HEIGHT)) + y_offset = row * FACE_CROP_HEIGHT + output_frame[y_offset:y_offset+FACE_CROP_HEIGHT, :] = resized + + writer.write(output_frame) + frame_idx += 1 + + cap.release() + writer.release() + print(f"✅ Exportiert: {out_path.name}") + +print("🏁 Alle Grid-Videos fertig.") diff --git a/src/reformat/old/preview_faces.py b/src/reformat/old/preview_faces.py new file mode 100644 index 0000000..dc777fc --- /dev/null +++ b/src/reformat/old/preview_faces.py @@ -0,0 +1,75 @@ +import cv2 +import json +from pathlib import Path +from tqdm import tqdm + +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_DIR = SCRIPT_DIR.parents[1] # ← geht von /src/reformat zu /BachlorArbeit + +FACES_DIR = PROJECT_DIR / "data" / "face_data_combined" +INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips" +OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces" +OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# === Alle *_faces.json Dateien durchgehen === +face_files = sorted(FACES_DIR.glob("*_faces.json")) + +for face_file in tqdm(face_files, desc="🔍 Erzeuge Vorschau mit Sprechererkennung"): + clip_name = face_file.stem.replace("_faces", "") + ".mp4" + input_path = INPUT_VIDEO_DIR / clip_name + output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4") + + if not input_path.exists(): + print(f"❌ Clip nicht gefunden: {clip_name}") + continue + + # Video-Setup + cap = cv2.VideoCapture(str(input_path)) + fps = cap.get(cv2.CAP_PROP_FPS) + fps = fps if fps > 1 else 25 # fallback falls FPS = 0 + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + fourcc = cv2.VideoWriter_fourcc(*"avc1") # Kompatibler als mp4v + out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height)) + + # Gesichts-Daten laden + data = json.loads(face_file.read_text()) + data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]} + + print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}") + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + faces = data_by_frame.get(frame_idx, []) + speaker_idx = None + + # Sprecher anhand Mundöffnung + if faces and all("mouth_openness" in f for f in faces): + mouth_vals = [f["mouth_openness"] for f in faces] + if any(v > 0.01 for v in mouth_vals): # einfache Aktivitäts-Schwelle + 
speaker_idx = mouth_vals.index(max(mouth_vals)) + + for i, face in enumerate(faces): + x, y, w, h = face["bbox"] + color = (0, 255, 0) if i == speaker_idx else (255, 255, 255) + label = f"Mouth: {face.get('mouth_openness', 0):.2f}" + + # Debug-Ausgabe (optional) + print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}") + + cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2) + cv2.putText(frame, label, (x, y - 10), + cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2) + + out.write(frame) + frame_idx += 1 + + cap.release() + out.release() + print(f"✅ Vorschau exportiert: {output_path.name}") + +print("🏁 Alle Vorschauvideos mit Sprecherkennung erstellt.") diff --git a/src/reformat/old/track_faces.py b/src/reformat/old/track_faces.py new file mode 100644 index 0000000..f335069 --- /dev/null +++ b/src/reformat/old/track_faces.py @@ -0,0 +1,92 @@ +import cv2 +import mediapipe as mp +import json +from pathlib import Path +from tqdm import tqdm + +# === Einstellungen === +INPUT_DIR = Path(__file__).resolve().parents[2] / "output" +OUTPUT_DIR = Path(__file__).resolve().parent / "face_data" +OUTPUT_DIR.mkdir(exist_ok=True) +FRAME_SKIP = 1 # analysiere jeden Frame für maximale Genauigkeit +PADDING = 30 # Pixel Padding um Gesicht + +mp_face_mesh = mp.solutions.face_mesh + +# Erweiterte Lippen-Landmarks (innen) +TOP_LIPS = [13, 78, 82] +BOTTOM_LIPS = [14, 87, 317] + +def mouth_openness(landmarks, image_height): + try: + top_avg = sum([landmarks[i].y for i in TOP_LIPS]) / len(TOP_LIPS) + bottom_avg = sum([landmarks[i].y for i in BOTTOM_LIPS]) / len(BOTTOM_LIPS) + return abs(bottom_avg - top_avg) + except: + return 0.0 + +def process_video(path): + cap = cv2.VideoCapture(str(path)) + fps = cap.get(cv2.CAP_PROP_FPS) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)) + results = [] + + with mp_face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=5, + refine_landmarks=True, + min_detection_confidence=0.6, + min_tracking_confidence=0.6 + ) as face_mesh: + + frame_idx = 0 + while cap.isOpened(): + ret, frame = cap.read() + if not ret: + break + + if frame_idx % FRAME_SKIP != 0: + frame_idx += 1 + continue + + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + output = face_mesh.process(rgb) + + faces = [] + if output.multi_face_landmarks: + for landmarks in output.multi_face_landmarks: + mouth = mouth_openness(landmarks.landmark, height) + + xs = [lm.x * width for lm in landmarks.landmark] + ys = [lm.y * height for lm in landmarks.landmark] + x1 = max(0, int(min(xs)) - PADDING) + y1 = max(0, int(min(ys)) - PADDING) + x2 = min(width, int(max(xs)) + PADDING) + y2 = min(height, int(max(ys)) + PADDING) + bbox = [x1, y1, x2 - x1, y2 - y1] + + faces.append({ + "bbox": bbox, + "mouth_openness": round(mouth, 4) + }) + + results.append({ + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 2), + "faces": faces + }) + + frame_idx += 1 + + cap.release() + out_path = OUTPUT_DIR / f"{path.stem}_faces.json" + out_path.write_text(json.dumps(results, indent=2)) + print(f"✅ {path.name} verarbeitet → {out_path.name}") + +# === Alle Videos im output/ Ordner durchgehen +videos = list(INPUT_DIR.glob("*.mp4")) +print(f"🎬 {len(videos)} Videos gefunden in: {INPUT_DIR}") + +for video in tqdm(videos): + process_video(video) diff --git a/src/reformat/old/track_faces_Yolo.py b/src/reformat/old/track_faces_Yolo.py new file mode 100644 index 0000000..d7f5d1f --- /dev/null +++ b/src/reformat/old/track_faces_Yolo.py @@ -0,0 +1,206 @@ +#!/usr/bin/env 
python3 +import argparse +import logging +import json +from pathlib import Path + +import cv2 +from ultralytics import YOLO +import mediapipe as mp + +# === Pfade und Standardwerte === +SCRIPT_DIR = Path(__file__).resolve().parent +PROJECT_ROOT = SCRIPT_DIR.parents[1] +DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips" +DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined" +DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt" + +# Stelle sicher, dass das Standard-Output-Verzeichnis existiert +DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + +# === Landmarks für Lippen === +TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409] +BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291] + + +def get_mouth_openness(landmarks, image_height): + """ + Berechnet die Mundöffnung in Pixeln basierend auf normierten Landmark-Koordinaten. + """ + top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS) + bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS) + return abs(bottom_avg - top_avg) * image_height + + +def iou(boxA, boxB): + """Berechnet Intersection-over-Union zweier Bounding-Boxes im Format (x, y, w, h).""" + ax1, ay1, aw, ah = boxA + ax2, ay2 = ax1 + aw, ay1 + ah + bx1, by1, bw, bh = boxB + bx2, by2 = bx1 + bw, by1 + bh + + inter_x1 = max(ax1, bx1) + inter_y1 = max(ay1, by1) + inter_x2 = min(ax2, bx2) + inter_y2 = min(ay2, by2) + inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) + + union_area = aw * ah + bw * bh - inter_area + return inter_area / union_area if union_area > 0 else 0 + + +def process_video( + video_path: Path, + output_path: Path, + model: YOLO, + face_mesh: mp.solutions.face_mesh.FaceMesh, + conf_thresh: float, + frame_skip: int, + downscale: float, +): + cap = cv2.VideoCapture(str(video_path)) + if not cap.isOpened(): + logging.error(f"Kann Video nicht öffnen: {video_path}") + return + + fps = cap.get(cv2.CAP_PROP_FPS) + width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale) + height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale) + + # JSON-Ausgabe mit Streaming + with output_path.open('w', encoding='utf-8') as f_out: + f_out.write('[\n') + first = True + frame_idx = 0 + + while True: + ret, frame = cap.read() + if not ret: + break + if frame_skip > 1 and frame_idx % frame_skip != 0: + frame_idx += 1 + continue + + if downscale != 1.0: + frame = cv2.resize(frame, (width, height)) + + detections = model(frame, verbose=False)[0] + yolo_boxes = [] + for box in detections.boxes: + conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf) + if conf < conf_thresh: + continue + coords = box.xyxy[0].cpu().numpy() + x1, y1, x2, y2 = map(int, coords) + yolo_boxes.append([x1, y1, x2 - x1, y2 - y1]) + + rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) + mp_result = face_mesh.process(rgb) + mp_faces = [] + if mp_result.multi_face_landmarks: + for landmarks in mp_result.multi_face_landmarks: + mouth_px = get_mouth_openness(landmarks.landmark, height) + xs = [lm.x * width for lm in landmarks.landmark] + ys = [lm.y * height for lm in landmarks.landmark] + x1, y1 = int(min(xs)), int(min(ys)) + x2, y2 = int(max(xs)), int(max(ys)) + mp_faces.append({ + "bbox": [x1, y1, x2 - x1, y2 - y1], + "mouth_openness": round(mouth_px, 1) + }) + + combined = [] + for yb in yolo_boxes: + if mp_faces: + best = max(mp_faces, key=lambda m: iou(yb, m["bbox"])) + best_iou = iou(yb, best["bbox"]) + mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0 + else: + mouth = 0.0 + + x, y, 
w, h = yb + cx, cy = x + w / 2, y + h / 2 + combined.append({ + "bbox": yb, + "mouth_openness": round(mouth, 1), + "center": [round(cx, 1), round(cy, 1), w, h] + }) + + result = { + "frame": frame_idx, + "timestamp": round(frame_idx / fps, 3), + "faces": combined + } + + if not first: + f_out.write(',\n') + json.dump(result, f_out, ensure_ascii=False) + first = False + frame_idx += 1 + + f_out.write('\n]') + + cap.release() + logging.info(f"Verarbeitet: {video_path.name} → {output_path.name}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyse von Videos: Gesichter und Mundöffnung erkennen" + ) + parser.add_argument( + "--input-dir", type=Path, + default=DEFAULT_INPUT_DIR, + help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})" + ) + parser.add_argument( + "--output-dir", type=Path, + default=DEFAULT_OUTPUT_DIR, + help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})" + ) + parser.add_argument( + "--model", type=Path, + default=DEFAULT_MODEL_PATH, + help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})" + ) + parser.add_argument( + "--conf-thresh", type=float, default=0.5, + help="Schwelle für YOLO-Confidence" + ) + parser.add_argument( + "--frame-skip", type=int, default=1, + help="Nur jede n-te Frame verarbeiten" + ) + parser.add_argument( + "--downscale", type=float, default=1.0, + help="Skalierungsfaktor für Frames" + ) + args = parser.parse_args() + + logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO) + args.output_dir.mkdir(parents=True, exist_ok=True) + + yolo = YOLO(str(args.model)) + face_mesh = mp.solutions.face_mesh.FaceMesh( + static_image_mode=False, + max_num_faces=5, + refine_landmarks=True, + min_detection_confidence=0.5, + min_tracking_confidence=0.5 + ) + + for video_path in sorted(args.input_dir.glob("*.mp4")): + out_path = args.output_dir / f"{video_path.stem}_faces.json" + process_video( + video_path, + out_path, + yolo, + face_mesh, + args.conf_thresh, + args.frame_skip, + args.downscale, + ) + + +if __name__ == "__main__": + main() diff --git a/src/reformat/speaking.py b/src/reformat/speaking.py new file mode 100644 index 0000000..4d7b83c --- /dev/null +++ b/src/reformat/speaking.py @@ -0,0 +1,12 @@ +# src/speaking.py + +TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409] +BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291] + +def get_mouth_openness(landmarks, image_height): + """ + Berechnet die Mundöffnung basierend auf MediaPipe-Landmarks. + """ + top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS) + bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS) + return abs(bottom_avg - top_avg) * image_height diff --git a/src/subtitles/add_subtitles.py b/src/subtitles/add_subtitles.py new file mode 100644 index 0000000..2f3448c --- /dev/null +++ b/src/subtitles/add_subtitles.py @@ -0,0 +1,265 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +add_subtitles.py — TikTok-Word-Caps mit OpenAI Whisper (CPU) +- läuft Ordner-weise über 9:16-Kurzclips +- transkribiert mit word_timestamps=True +- erzeugt ASS (ein Wort pro Zeile, Pop-Animation, Bottom-Center) +- brennt via ffmpeg in *_subtitled.mp4 +""" + +import os +import re +import glob +import json +import subprocess +import tempfile +import traceback +import argparse +from typing import List, Tuple, Optional +from pathlib import Path +import sys + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. 
unter src/subtitles/) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import CROPPED_DIR, SUBTITLED_DIR # zentrale Pfade + +# --- Stabil auf CPU (vermeidet MPS-Sparse-Fehler) --- +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +os.environ["CUDA_VISIBLE_DEVICES"] = "" + +def log(*a): print("[LOG]", *a) +def ensure_dir(p: Path): p.mkdir(parents=True, exist_ok=True) + +def has_audio_stream(video_path: str) -> bool: + cmd = ["ffprobe","-v","error","-select_streams","a","-show_entries","stream=index","-of","json",video_path] + try: + out = subprocess.check_output(cmd).decode("utf-8") + data = json.loads(out) + return bool(data.get("streams")) + except Exception: + return False + +def load_whisper_cpu(model_name: str): + import whisper # openai-whisper + device = "cpu" + model = whisper.load_model(model_name, device=device) + fp16 = False + return model, device, fp16 + +def transcribe_words_whisper(model, media_path: str, language: Optional[str], fp16: bool) -> List[Tuple[float,float,str]]: + """ + Nutzt 'openai-whisper' mit word_timestamps=True. + Fallback: wenn 'words' fehlen, werden Segmenttexte approx. auf Wörter verteilt. + """ + result = model.transcribe( + media_path, + language=language, + task="transcribe", + word_timestamps=True, + condition_on_previous_text=False, + verbose=False, + fp16=fp16 + ) + words: List[Tuple[float,float,str]] = [] + segs = result.get("segments", []) or [] + for seg in segs: + wlist = seg.get("words") + if isinstance(wlist, list) and wlist and all(isinstance(w, dict) for w in wlist): + for w in wlist: + t = (w.get("word") or w.get("text") or "").strip() + if not t: + continue + ws = w.get("start"); we = w.get("end") + if ws is None or we is None: + continue + t = re.sub(r"\s+", " ", t) + if t: + words.append((float(ws), float(we), t)) + else: + # Fallback: Segment auf Wörter aufteilen & Zeiten gleichmäßig verteilen + text = (seg.get("text") or "").strip() + if not text: + continue + seg_start = float(seg.get("start", 0.0)) + seg_end = float(seg.get("end", seg_start)) + toks = [w for w in re.split(r"(\s+)", text) if w.strip()] + if not toks or seg_end <= seg_start: + continue + dur = seg_end - seg_start + step = dur / len(toks) + for i, tok in enumerate(toks): + ws = seg_start + i * step + we = seg_start + (i+1) * step + words.append((ws, we, tok)) + return words + +def ass_time(t: float) -> str: + if t < 0: t = 0 + h = int(t // 3600); m = int((t % 3600)//60); s = int(t % 60); cs = int(round((t - int(t))*100)) + return f"{h:d}:{m:02d}:{s:02d}.{cs:02d}" + +def write_ass_words(words: List[Tuple[float,float,str]], ass_path: Path, font_size: int, margin_v: int, uppercase: bool): + """ + Ein Wort pro Zeile, ohne Überlappung: + - Ende = min(eigene Endzeit, Start nächstes Wort - 0.02) + - Pop-Animation 150ms, fette Outline, Bottom-Center (PlayResY=1920) + """ + header = f"""[Script Info] +ScriptType: v4.00+ +Collisions: Normal +PlayResX: 1080 +PlayResY: 1920 +ScaledBorderAndShadow: yes + +[V4+ Styles] +Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding +Style: WordCap,Inter,{font_size},&H00FFFFFF,&H00FFFFFF,&H00101010,&H64000000,1,0,0,0,100,100,0,0,1,6,0.8,2,80,80,{margin_v},1 + +[Events] +Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text +""" + # Zeiten glätten, damit immer nur ein Wort sichtbar ist + adjusted = 
[] + for i, (s, e, t) in enumerate(words): + nstart = words[i+1][0] if i+1 < len(words) else e + new_end = min(e, nstart - 0.02) if nstart > s else e + if new_end <= s: + new_end = s + 0.06 + adjusted.append((s, new_end, t)) + + with open(ass_path, "w", encoding="utf-8") as f: + f.write(header) + for s, e, t in adjusted: + st, en = ass_time(s), ass_time(e) + txt = t.upper() if uppercase else t + # \fs sichere Größe, \blur für weiche Outline, \fad Ein/Aus, + # \fscx135\fscy135 → Start groß, \t(...) schrumpft in 150ms auf 100% = Pop + overrides = r"\blur1\bord8\1c&H0000FFFF&\3c&H000000&\4c&H000000&\fad(50,20)\fscx135\fscy135\t(0,150,\fscx100\fscy100)" + f.write(f"Dialogue: 0,{st},{en},WordCap,,0,0,0,,{{{overrides}}}{txt}\n") + +def ffmpeg_escape_for_subtitles(path: Path) -> str: + """ + Pfad für -vf subtitles=… escapen (für Leerzeichen, Doppelpunkte etc.). + ffmpeg erwartet Backslash-escaping für Filter-Argumente. + """ + s = str(path) + s = s.replace("\\", "\\\\") + s = s.replace(":", "\\:") + s = s.replace("'", "\\'") + s = s.replace(",", "\\,") + s = s.replace("[", "\\[") + s = s.replace("]", "\\]") + s = s.replace(";", "\\;") + s = s.replace("=", "\\=") + return s + +def burn(video_in: Path, ass_file: Path, out_path: Path, crf=18, preset="medium") -> int: + vf = f"subtitles={ffmpeg_escape_for_subtitles(ass_file)}" + cmd = [ + "ffmpeg","-y","-i",str(video_in), + "-vf", vf, + "-c:v","libx264","-preset",preset,"-crf",str(crf), + "-c:a","copy", + str(out_path) + ] + log("FFmpeg:", " ".join(cmd)) + return subprocess.call(cmd) + +def parse_args(): + p = argparse.ArgumentParser(description="Brennt Word-Caps (ASS) via Whisper-Transkription in 9:16-Clips.") + p.add_argument("--clips_dir", type=Path, default=CROPPED_DIR, help=f"Quellordner (Default: {CROPPED_DIR})") + p.add_argument("--out_dir", type=Path, default=SUBTITLED_DIR, help=f"Zielordner (Default: {SUBTITLED_DIR})") + p.add_argument("--pattern", type=str, default="*.mp4", help="Dateimuster (Default: *.mp4)") + p.add_argument("--limit", type=int, default=None, help="Nur die ersten N Clips verarbeiten") + p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "medium"), help="Whisper-Modell") + p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. de, en, None=Auto)") + p.add_argument("--uppercase", action="store_true", help="Text in Großbuchstaben rendern") + p.add_argument("--font_size", type=int, default=108, help="ASS-Fontgröße") + p.add_argument("--margin_v", type=int, default=320, help="ASS-MarginV (Abstand vom unteren Rand)") + p.add_argument("--crf", type=int, default=18, help="ffmpeg CRF (Qualität)") + p.add_argument("--preset", type=str, default="medium", help="ffmpeg Preset") + return p.parse_args() + +def main(): + args = parse_args() + + clips_dir = args.clips_dir + output_dir = args.out_dir + ensure_dir(output_dir) + + log("Starte TikTok Word-Caps (Whisper)") + log("CLIPS_DIR =", clips_dir) + log("OUTPUT_DIR =", output_dir) + + clips: List[str] = [] + for pat in (args.pattern,): + clips += glob.glob(str(clips_dir / pat)) + clips.sort() + log(f"{len(clips)} Clips gefunden.") + if args.limit: + clips = clips[:args.limit] + log(f"LIMIT aktiv: {args.limit}") + + if not clips: + log("Keine Clips gefunden. 
Pfad/Pattern checken.") + return + + # Whisper laden (CPU) + try: + model, device, fp16 = load_whisper_cpu(args.model) + log(f"Whisper geladen: {args.model} auf {device} (fp16={fp16})") + log("Hinweis: Beim ersten Lauf kann das Modell nachgeladen werden.") + except Exception as e: + print("[ERROR] Whisper konnte nicht geladen werden:", e) + traceback.print_exc() + return + + lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang + + for clip in clips: + base = os.path.basename(clip) + stem, _ = os.path.splitext(base) + log("="*60) + log("Clip:", base) + + if not has_audio_stream(clip): + log("WARN: Keine Audio-Spur → übersprungen.") + continue + + # Transkription + try: + log("Transkription startet …") + words = transcribe_words_whisper(model, clip, language=lang, fp16=fp16) + log(f"Transkription fertig. {len(words)} Wörter.") + if not words: + log("WARN: 0 Wörter erkannt → übersprungen.") + continue + except Exception as e: + print("[ERROR] Transkription fehlgeschlagen:", e) + traceback.print_exc() + continue + + # ASS erzeugen & brennen + with tempfile.NamedTemporaryFile(suffix=".ass", delete=False) as tmp: + ass_path = Path(tmp.name) + try: + log("Erzeuge ASS …") + write_ass_words(words, ass_path, font_size=args.font_size, margin_v=args.margin_v, uppercase=args.uppercase) + out_path = output_dir / f"{stem}_subtitled.mp4" + log("Brenne Untertitel …") + rc = burn(Path(clip), ass_path, out_path, crf=args.crf, preset=args.preset) + if rc == 0: + log("OK:", out_path) + else: + log("ERROR: ffmpeg fehlgeschlagen, code", rc) + finally: + try: ass_path.unlink(missing_ok=True) + except Exception: pass + + log("Fertig.") + +if __name__ == "__main__": + main() diff --git a/src/subtitles/run_subtitles.py b/src/subtitles/run_subtitles.py new file mode 100644 index 0000000..1ce2f40 --- /dev/null +++ b/src/subtitles/run_subtitles.py @@ -0,0 +1,25 @@ +import os +import tempfile +from add_subtitles import process # wir nutzen die Logik aus dem großen Skript + +# ==== HIER EINSTELLEN ==== +VIDEO_PATH = "data/input.mp4" # Dein Video +TRANSCRIPT_PATH = "data/transcript.srt" # Oder .json (Whisper) +OUTPUT_DIR = "data/output" # Ordner für Ergebnisse +CLIPS_PATH = None # Optional: "data/clips.csv" oder "data/clips.json" +CRF = 18 +PRESET = "medium" +STYLE = r"\\bord4\\shad4\\outline3\\fs64\\b1\\1c&HFFFFFF&\\3c&H000000&\\4c&H000000&" +# ========================== + +if __name__ == "__main__": + os.makedirs(OUTPUT_DIR, exist_ok=True) + process( + video_path=VIDEO_PATH, + transcript_path=TRANSCRIPT_PATH, + output_dir=OUTPUT_DIR, + clips_path=CLIPS_PATH, + crf=CRF, + preset=PRESET, + style_overrides=STYLE, + ) diff --git a/src/text/cutClips.py b/src/text/cutClips.py new file mode 100644 index 0000000..ae314b8 --- /dev/null +++ b/src/text/cutClips.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +# cutClips.py — exportiert Clips aus dem ersten gefundenen Video oder aus angegebener Datei + +from pathlib import Path +import sqlite3 +import argparse +from moviepy.video.io.VideoFileClip import VideoFileClip +import sys + +# ── Projektwurzel in sys.path aufnehmen +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import INPUT_DIR, RAW_CLIPS_DIR, DB_PATH + + +def parse_args(): + p = argparse.ArgumentParser(description="Exportiert Highlights aus dem Video gemäß SQLite-DB.") + p.add_argument("--file", type=str, default=None, + help="Name der Input-Datei im INPUT_DIR. 
Wenn leer, wird das erste Video im Ordner verwendet.") + p.add_argument("--limit", type=int, default=10, + help="Anzahl der zu exportierenden Clips (Default: 10)") + p.add_argument("--order", type=str, choices=["score", "start"], default="score", + help="Sortierung: 'score' (score_total absteigend) oder 'start' (zeitlich).") + return p.parse_args() + + +def find_first_video(directory: Path): + """Suche nach der ersten Videodatei im Verzeichnis (mp4, mov, mkv).""" + for ext in ("*.mp4", "*.mov", "*.mkv"): + files = sorted(directory.glob(ext)) + if files: + return files[0] + return None + + +def main(): + args = parse_args() + + # === Eingabevideo bestimmen === + if args.file: + input_video = INPUT_DIR / args.file + else: + input_video = find_first_video(INPUT_DIR) + if not input_video: + raise FileNotFoundError(f"🚫 Kein Video im Eingabeordner {INPUT_DIR} gefunden.") + print(f"📂 Kein --file angegeben → verwende automatisch: {input_video.name}") + + if not input_video.exists(): + raise FileNotFoundError(f"🚫 Input-Video nicht gefunden: {input_video}") + + output_dir = RAW_CLIPS_DIR + output_dir.mkdir(parents=True, exist_ok=True) + + # === SQLite DB lesen === + conn = sqlite3.connect(DB_PATH) + cursor = conn.cursor() + + order_clause = "ORDER BY score_total DESC" if args.order == "score" else "ORDER BY start ASC" + cursor.execute(f""" + SELECT start, end, text + FROM highlights + {order_clause} + LIMIT ? + """, (args.limit,)) + highlights = cursor.fetchall() + + if not highlights: + print("⚠️ Keine Highlights in der Datenbank gefunden.") + conn.close() + return + + # === Video laden === + video = VideoFileClip(str(input_video)) + + # === Clips schneiden === + for i, (start, end, text) in enumerate(highlights, start=1): + if start >= video.duration: + print(f"⚠️ Clip {i} übersprungen – Startzeit {start:.2f}s liegt außerhalb der Videolänge ({video.duration:.2f}s).") + continue + + end = min(end, video.duration) + output_file = output_dir / f"highlight_{i}.mp4" + print(f"🎬 Exportiere Clip {i}: {start:.2f}s – {end:.2f}s → {output_file.name}") + + try: + clip = video.subclipped(start, end) + clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac", logger=None) + clip.close() + except Exception as e: + print(f"❌ Fehler beim Export von Clip {i}: {e}") + + # === Cleanup === + conn.close() + video.close() + print(f"✅ {len(highlights)} Clips exportiert nach {output_dir}") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/code/text/rateCluster.py b/src/text/rateCluster.py similarity index 69% rename from code/text/rateCluster.py rename to src/text/rateCluster.py index 0c9cf07..fe5e2f6 100644 --- a/code/text/rateCluster.py +++ b/src/text/rateCluster.py @@ -2,44 +2,41 @@ import sqlite3 import re from openai import OpenAI from time import sleep +from pathlib import Path +import os + +from pathlib import Path +import sys + +# Projekt-Root einfügen (2 Ebenen hoch von src/* ausgehend) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import DB_PATH + -# === Einstellungen === -DB_PATH = "clips_openai.db" -VIDEO_ID = "testVideoShort" MAX_CLIPS = 5 # oder "all" -OPENAI_API_KEY = "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" -client = OpenAI(api_key=OPENAI_API_KEY) +# === OPENAI-CLIENT (API-Key aus Env) === +if not os.getenv("OPENAI_API_KEY"): + raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung") 
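
With this change the API key is read from the environment instead of being hard-coded; note that the previously hard-coded key is still visible in the removed line (and in Git history) and should be rotated. A minimal sketch of one way to supply the key locally, assuming the optional python-dotenv package, which is not part of the project's stated requirements:

```python
# Sketch: load the key from a local .env file; python-dotenv is an assumption here,
# `export OPENAI_API_KEY='sk-...'` in the shell before running the scripts works too.
import os
from dotenv import load_dotenv
from openai import OpenAI

load_dotenv()                      # picks up a .env in the working directory, if present
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError("OPENAI_API_KEY fehlt in der Umgebung")
client = OpenAI(api_key=api_key)
```
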
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) # === DB-Verbindung conn = sqlite3.connect(DB_PATH) cursor = conn.cursor() -cursor.execute("DROP TABLE IF EXISTS highlights") +# === Unbewertete Highlights laden cursor.execute(""" -CREATE TABLE highlights ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - file TEXT, - start REAL, - end REAL, - text TEXT, - viralitaet INTEGER, - emotionalitaet INTEGER, - witz INTEGER, - provokation INTEGER, - score_total INTEGER -) + SELECT id, start, end, text FROM highlights + WHERE viralitaet IS NULL OR emotionalitaet IS NULL + ORDER BY start """) -conn.commit() -print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}") - -# === Segmente laden -cursor.execute("SELECT start, end, text FROM segments ORDER BY start") segments = cursor.fetchall() -print(f"📥 {len(segments)} Segmente (Originaltext) geladen.") +print(f"📥 {len(segments)} unbewertete Highlights geladen.") # === Bewertungsfunktion (GPT-4o) -def analyse_segment(text, start, end): +def analyse_segment(clip_id, text, start, end): print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s") prompt = f""" @@ -86,19 +83,19 @@ Provokation: [Zahl] if all(v is not None for v in values.values()): total_score = sum(values.values()) cursor.execute(""" - INSERT INTO highlights ( - file, start, end, text, - viralitaet, emotionalitaet, witz, provokation, score_total - ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + UPDATE highlights SET + viralitaet = ?, emotionalitaet = ?, witz = ?, provokation = ?, score_total = ? + WHERE id = ? """, ( - VIDEO_ID, start, end, text.strip(), values["viralitaet"], values["emotionalitaet"], values["witz"], values["provokation"], - total_score + total_score, + clip_id )) conn.commit() return { + "id": clip_id, "start": start, "end": end, "text": text.strip(), @@ -113,8 +110,8 @@ Provokation: [Zahl] # === Clips bewerten rated = [] -for start, end, text in segments: - result = analyse_segment(text, float(start), float(end)) +for clip_id, start, end, text in segments: + result = analyse_segment(clip_id, text, float(start), float(end)) if result: rated.append(result) sleep(1.2) # Anti-Rate-Limit @@ -123,7 +120,7 @@ for start, end, text in segments: rated.sort(key=lambda x: x["total"], reverse=True) selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)] -print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}") +print(f"\n🎬 Beste {len(selected)} Highlights nach Bewertung:") for clip in selected: print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s") print(f"🎙️ {clip['text'][:200]}...") diff --git a/src/text/segment_transcript.py b/src/text/segment_transcript.py new file mode 100644 index 0000000..7e8e577 --- /dev/null +++ b/src/text/segment_transcript.py @@ -0,0 +1,409 @@ +#!/usr/bin/env python3 +# clip_selector_optimized.py — word-based text rebuild (no duplicates) + +import os +import re +import json +import sqlite3 +import time +from pathlib import Path +from datetime import datetime +import argparse +import sys +from typing import List, Dict, Optional + +from openai import OpenAI + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript kann z. B. unter src/text/ liegen) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import TRANSCRIPTS_DIR, DB_PATH # zentrale Pfade + +LOG_DIR = ROOT / "logs" +LOG_DIR.mkdir(exist_ok=True, parents=True) + +# === DEFAULTS (per CLI überschreibbar) === +DEFAULT_BLOCK_DURATION = 300.0 # Sek. 
pro Block +DEFAULT_MIN_CLIP_LEN = 30.0 # konsistent mit Prompt +DEFAULT_MAX_CLIP_LEN = 90.0 + +# === OPENAI-CLIENT (API-Key aus Env) === +API_KEY = os.getenv("OPENAI_API_KEY") +if not API_KEY: + raise RuntimeError("🚫 OPENAI_API_KEY fehlt in der Umgebung") +OPENAI_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # bei Bedarf überschreiben +client = OpenAI(api_key=API_KEY) + +# ────────────────────────────────────────────────────────────────────────────── +# Hilfsfunktionen +# ────────────────────────────────────────────────────────────────────────────── + +def log_text(filename: str, content: str) -> None: + (LOG_DIR / filename).write_text((content or "").strip(), encoding="utf-8") + +def append_error_log(content: str) -> None: + with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f: + f.write(f"{datetime.now().isoformat()} {content}\n\n") + +def extract_json(text: str) -> list: + """Nur für Debug: versucht JSON-Array aus beliebigem Text zu extrahieren.""" + txt = (text or "").strip() + txt = re.sub(r"^```(?:json)?\s*|\s*```$", "", txt, flags=re.IGNORECASE | re.DOTALL) + m = re.search(r"\[\s*{.*?}\s*\]", txt, re.DOTALL) + if not m: + append_error_log(f"❌ Kein JSON-Array gefunden.\n{txt}") + return [] + try: + return json.loads(m.group(0)) + except Exception as e: + append_error_log(f"❌ JSON-Fehler: {e}\n{txt}") + return [] + +def get_json_snippets_for_clip(start: float, end: float, segment_json: List[Dict]) -> List[Dict]: + """halb-offenes Fenster [start, end)""" + return [s for s in segment_json if not (float(s["end"]) <= start or float(s["start"]) >= end)] + +def _norm_space(s: str) -> str: + return re.sub(r"\s+", " ", (s or "").strip()) + +def explode_segments_to_words(segments: List[Dict]) -> List[Dict]: + """ + Baut eine globale Wortliste. Bevorzugt echte 'words' aus JSON, + fällt ansonsten auf lineare Interpolation über Segmentdauer zurück. + Ausgabe-Items: {idx, mid, text} + """ + words = [] + idx = 0 + for seg in sorted(segments, key=lambda s: (float(s["start"]), float(s["end"]))): + s0, s1 = float(seg["start"]), float(seg["end"]) + txt = (seg.get("text") or "").strip() + seg_words = seg.get("words") or [] + if seg_words: + for w in seg_words: + t = (w.get("text") or w.get("word") or "").strip() + if not t: + continue + w0 = float(w["start"]); w1 = float(w["end"]) + words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": t}) + idx += 1 + else: + toks = txt.split() + n = len(toks) + if n == 0: + continue + dur = max(1e-6, s1 - s0) + for i, tok in enumerate(toks): + w0 = s0 + (i / n) * dur + w1 = s0 + ((i + 1) / n) * dur + words.append({"idx": idx, "mid": round((w0 + w1) / 2.0, 4), "text": tok}) + idx += 1 + return words + +def build_text_strict_from_words(clip_start: float, clip_end: float, WORDS: List[Dict]) -> str: + """Nimmt jedes Wort genau einmal, wenn mid ∈ [start, end).""" + sel = [w for w in WORDS if clip_start <= w["mid"] < clip_end] + sel.sort(key=lambda w: w["idx"]) + return _norm_space(" ".join(w["text"] for w in sel)) + +def find_transcript_pair(base: Optional[str]) -> tuple[Path, Path, str]: + """ + Finde (timed.txt, segments.json) in TRANSCRIPTS_DIR. + - Wenn base übergeben: benutzt {base}_timed.txt und {base}_segments.json. + - Sonst: nimmt das lexikographisch erste *_timed.txt und leitet die JSON davon ab. 
+ """ + if base: + txt = TRANSCRIPTS_DIR / f"{base}_timed.txt" + jsn = TRANSCRIPTS_DIR / f"{base}_segments.json" + if not txt.exists(): + raise FileNotFoundError(f"Transkript nicht gefunden: {txt}") + if not jsn.exists(): + raise FileNotFoundError(f"Segment-JSON nicht gefunden: {jsn}") + return txt, jsn, base + + # auto-detect + candidates = sorted(TRANSCRIPTS_DIR.glob("*_timed.txt")) + if not candidates: + raise FileNotFoundError(f"Keine *_timed.txt in {TRANSCRIPTS_DIR} gefunden.") + txt = candidates[0] + stem = txt.stem.replace("_timed", "") + jsn = TRANSCRIPTS_DIR / f"{stem}_segments.json" + if not jsn.exists(): + raise FileNotFoundError(f"Gefundenes TXT: {txt.name}, aber JSON fehlt: {jsn.name}") + return txt, jsn, stem + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser(description="Selektiert Social-Media-taugliche Clips aus Transkripten (LLM-gestützt).") + p.add_argument("--base", type=str, default=None, + help="Basename der Transkriptdateien (z. B. 'testVideoShort' für *_timed.txt und *_segments.json).") + p.add_argument("--block", type=float, default=DEFAULT_BLOCK_DURATION, help="Blocklänge in Sekunden für die Prompt-Bildung.") + p.add_argument("--min", type=float, default=DEFAULT_MIN_CLIP_LEN, help="Minimale Clip-Länge (Sekunden).") + p.add_argument("--max", type=float, default=DEFAULT_MAX_CLIP_LEN, help="Maximale Clip-Länge (Sekunden).") + return p.parse_args() + +# ────────────────────────────────────────────────────────────────────────────── +# Main +# ────────────────────────────────────────────────────────────────────────────── + +def main(): + args = parse_args() + BLOCK_DURATION = float(args.block) + MIN_CLIP_LEN = float(args.min) + MAX_CLIP_LEN = float(args.max) + + # --- Transkriptdateien finden --- + TRANSCRIPT_PATH, SEGMENT_JSON_PATH, base = find_transcript_pair(args.base) + print(f"📄 TXT : {TRANSCRIPT_PATH}") + print(f"🧾 JSON: {SEGMENT_JSON_PATH}") + + # === TRANSKRIPT EINLESEN (TXT) -> NUR für Blockbildung & Promptanzeige === + lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines() + segments_txt: List[Dict] = [] + for line in lines: + m = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(?:[A-Z_0-9]+:)?\s*(.*)", line) + if not m: + continue + start, end, text = m.groups() + start, end = float(start), float(end) + if end - start >= 2.0: + segments_txt.append({"start": start, "end": end, "text": (text or "").strip()}) + + if not segments_txt: + raise RuntimeError("🚫 Keine gültigen TXT-Segmente gefunden.") + print(f"✅ {len(segments_txt)} gültige TXT-Segmente geladen.") + + # === TRANSKRIPT EINLESEN (JSON) -> Quelle für DB-Text/Wörter === + segment_json_data = json.loads(SEGMENT_JSON_PATH.read_text(encoding="utf-8")) + if not isinstance(segment_json_data, list) or not segment_json_data: + raise RuntimeError("🚫 JSON-Segmente leer/ungültig.") + print(f"✅ {len(segment_json_data)} JSON-Segmente geladen.") + + # Globale Wörterliste einmal berechnen (bevor wir Clips bilden) + WORDS = explode_segments_to_words(segment_json_data) + print(f"🔤 Globale Wörter im Korpus: {len(WORDS)}") + + # === BLÖCKE BILDEN (aus TXT) === + segments_txt.sort(key=lambda s: (s["start"], s["end"])) + blocks, current_block, current_start = [], [], 0.0 + for seg in segments_txt: + if not current_block: + current_start = seg["start"] + # Blockwechsel, wenn Dauer überschritten + if seg["end"] - current_start > 
BLOCK_DURATION: + blocks.append(current_block) + current_block = [] + current_start = seg["start"] + current_block.append(seg) + if current_block: + blocks.append(current_block) + print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION:.0f}s).") + + # === KI: CLIP-AUSWAHL === + all_clips = [] + t0 = time.perf_counter() + + for i, block in enumerate(blocks, start=1): + if not block: + continue + print(f"\n🤖 Sende Block {i}/{len(blocks)} an {OPENAI_MODEL} …") + block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block]) + + prompt = f""" +Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Social Media Clips eignen. +Ein guter Clip: +- ist abgeschlossen und verständlich +- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment +- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline +- ist mindestens {MIN_CLIP_LEN:.0f} Sekunden lang +Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden. + +Gib ein JSON-Objekt zurück im Format: +{{ + "clips": [ + {{ + "start": float, + "end": float, + "summary": "Kurze Beschreibung des Inhalts" + }} + ] +}} + +TRANSKRIPT: +{block_text} +""".strip() + + log_text(f"block_prompt_{i:02d}.txt", prompt) + + # --- robuster API-Call mit Schema (Root=object) und kleinem Retry --- + import time as _time + clips = [] + for attempt in range(3): + try: + resp = client.chat.completions.create( + model=OPENAI_MODEL, + messages=[{"role": "user", "content": prompt}], + response_format={ + "type": "json_schema", + "json_schema": { + "name": "clips_payload", + "schema": { + "type": "object", + "properties": { + "clips": { + "type": "array", + "items": { + "type": "object", + "properties": { + "start": {"type": "number"}, + "end": {"type": "number"}, + "summary": {"type": "string"} + }, + "required": ["start", "end", "summary"], + "additionalProperties": False + } + } + }, + "required": ["clips"], + "additionalProperties": False + } + } + } + ) + msg = resp.choices[0].message + payload = getattr(msg, "parsed", None) + if payload is None: + payload = json.loads(msg.content) + + clips = payload.get("clips", []) or [] + + try: + log_text(f"block_output_{i:02d}.txt", json.dumps(payload, ensure_ascii=False, indent=2)) + except Exception: + pass + break + except Exception as e: + if attempt == 2: + append_error_log(f"❌ OpenAI-Fehler Block {i}: {e}") + print(f"❌ Fehler bei Block {i}: {e}") + else: + _time.sleep(1.5 * (attempt + 1)) + + print(f"✅ {len(clips)} Clips empfangen in Block {i}") + + # --- Clips filtern & clampen --- + for clip in clips: + try: + b_start, b_end = block[0]["start"], block[-1]["end"] + start = max(b_start, min(float(clip["start"]), b_end)) + end = max(b_start, min(float(clip["end"]), b_end)) + dur = end - start + if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN: + clip["start"] = start + clip["end"] = end + clip["duration"] = round(dur, 2) + all_clips.append(clip) + except Exception as e: + append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}") + + elapsed = time.perf_counter() - t0 + avg = elapsed / i + eta = max(0.0, avg * (len(blocks) - i)) + print(f"⏱️ Geschätzte Restzeit: {eta:.1f} s") + + # --- Duplikate entfernen (auf 2 Dezimalen) --- + dedup, seen = [], set() + for c in all_clips: + k = (round(c["start"], 2), round(c["end"], 2)) + if k in seen: + continue + seen.add(k) + dedup.append(c) + all_clips = dedup + + print(f"\n📈 Gesamtclips vor DB-Insert: {len(all_clips)}") + + # === DB SPEICHERN === 
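
Before the rows are inserted below, the text stored per highlight is rebuilt from the global word list via `build_text_strict_from_words`: a word belongs to a clip exactly when its midpoint lies in the half-open window [start, end), so no word is counted twice at clip or chunk boundaries. A tiny illustration with made-up timings:

```python
# Illustration only — hypothetical word midpoints, not data from the pipeline.
WORDS = [
    {"idx": 0, "mid": 9.8,  "text": "wirklich"},
    {"idx": 1, "mid": 10.1, "text": "verrückte"},
    {"idx": 2, "mid": 10.6, "text": "Geschichte"},
    {"idx": 3, "mid": 12.0, "text": "übrigens"},
]

def build_text(start, end, words):
    sel = [w for w in words if start <= w["mid"] < end]   # half-open: [start, end)
    return " ".join(w["text"] for w in sorted(sel, key=lambda w: w["idx"]))

print(build_text(10.0, 12.0, WORDS))  # -> "verrückte Geschichte"  (12.0 itself is excluded)
print(build_text(12.0, 14.0, WORDS))  # -> "übrigens"              (no duplicate at the boundary)
```
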
+ conn = sqlite3.connect(DB_PATH) + cur = conn.cursor() + + cur.execute(""" + CREATE TABLE IF NOT EXISTS highlights ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + file TEXT, + start REAL, + end REAL, + duration REAL, + text TEXT, + summary TEXT, + json_raw TEXT, + viralitaet INTEGER, + emotionalitaet INTEGER, + witz INTEGER, + provokation INTEGER, + score_total INTEGER, + UNIQUE(file,start,end) + ) + """) + + # --- Tabelle vor neuem Lauf komplett leeren --- + cur.execute("DELETE FROM highlights") + conn.commit() # Transaktion schließen, damit VACUUM außerhalb läuft + + # VACUUM separat (optional) + try: + conn.execute("VACUUM") # oder: sqlite3.connect(DB_PATH).execute("VACUUM").close() + print("🧹 Alte Highlights gelöscht und Datenbank komprimiert.") + except sqlite3.OperationalError as e: + print(f"⚠️ VACUUM übersprungen: {e}") + + inserted = 0 + failed = 0 + + for clip in all_clips: + try: + start = float(clip["start"]) + end = float(clip["end"]) + duration = float(clip["duration"]) + summary = (clip.get("summary") or "").strip() + + if end <= start or start < 0: + raise ValueError("Ungültige Zeiten") + + # JSON-Segmente (zur Nachvollziehbarkeit) + Wort-basierter Text (dopplerfrei) + json_snippets = get_json_snippets_for_clip(start, end, segment_json_data) + json_raw = json.dumps(json_snippets, ensure_ascii=False) + + original_text = build_text_strict_from_words(start, end, WORDS) + + cur.execute(""" + INSERT OR IGNORE INTO highlights ( + file, start, end, duration, text, summary, json_raw, + viralitaet, emotionalitaet, witz, provokation, score_total + ) + VALUES (?, ?, ?, ?, ?, ?, ?, NULL, NULL, NULL, NULL, NULL) + """, ( + # 'file' = Basename (z. B. testVideoShort) + Path(base).name, + start, end, duration, + original_text, summary, json_raw + )) + if cur.rowcount > 0: + inserted += 1 + except Exception as e: + failed += 1 + append_error_log(f"❌ DB-Fehler: {clip}\n{e}") + + conn.commit() + conn.close() + + print("\n📊 Ergebnisse:") + print(f" ✅ Highlights gespeichert: {inserted}") + print(f" ❌ Fehlerhafte Clips: {failed}") + print(f"📁 Logs: {LOG_DIR.resolve()}") + +if __name__ == "__main__": + main() diff --git a/src/text/transcription.py b/src/text/transcription.py new file mode 100644 index 0000000..0c8ee69 --- /dev/null +++ b/src/text/transcription.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +# transcription_chunked_words.py — Whisper mit Wortzeitstempeln, doppler-sicher + +import os +import sys +import json +import argparse +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import List, Dict, Tuple, Optional + +import ffmpeg +import whisper + +# ── Projektwurzel in sys.path aufnehmen (dieses Skript liegt z. B. 
unter src/text/) +ROOT = Path(__file__).resolve().parents[2] +sys.path.insert(0, str(ROOT)) + +from config import INPUT_DIR, TRANSCRIPTS_DIR # zentrale Pfade + +# ────────────────────────────────────────────────────────────────────────────── +# Utilities +# ────────────────────────────────────────────────────────────────────────────── + +def probe_duration(path: Path) -> float: + """Ermittle die Videodauer in Sekunden (ffmpeg.probe).""" + try: + meta = ffmpeg.probe(str(path)) + except ffmpeg.Error as e: + raise RuntimeError(f"ffmpeg.probe fehlgeschlagen für {path}: {e.stderr.decode('utf-8','ignore') if hasattr(e, 'stderr') else e}") from e + + dur = meta.get("format", {}).get("duration") + if dur is not None: + return float(dur) + + cand = 0.0 + for s in meta.get("streams", []) or []: + d = s.get("duration") + if d: + cand = max(cand, float(d)) + if cand > 0: + return cand + raise RuntimeError(f"Konnte Videodauer nicht bestimmen: {path}") + +def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float,float]]: + """Zerteile [0,total] in überlappende Intervalle.""" + if chunk_seconds <= 0: + return [(0.0, total)] + s, out = 0.0, [] + while s < total: + e = min(s + chunk_seconds, total) + out.append((s, e)) + if e >= total: + break + s = max(0.0, e - overlap) + return out + +def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None: + """Extrahiere [start,end] als Mono-16kHz-WAV.""" + ( + ffmpeg + .input(str(src_video), ss=start, to=end) + .output( + str(out_wav), + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="error", + ) + .overwrite_output() + .run() + ) + +def is_suspect(text: str) -> bool: + """Heuristik: leere/loopende/zweifelhafte Zeilen markieren.""" + t = (text or "").strip().lower() + if not t: + return True + words = t.split() + if not words: + return True + counts = {w: words.count(w) for w in set(words)} + most_common = max(counts.values()) + return most_common / len(words) > 0.6 or most_common > 20 + +def merge_overlaps_keep_best( + segments: List[Dict], + max_gap: float = 0.15, + min_dur: float = 0.30 +) -> List[Dict]: + """ + Zeitlich sortieren, kleine Gaps schließen. 
Bei Überlappung: + - keine Text-Konkatenation + - behalte das "bessere" Segment (längere Dauer, dann längerer Text) + - words: vom "best" übernehmen (falls vorhanden) + """ + cleaned = [] + for s in segments: + s0 = float(s["start"]); s1 = float(s["end"]) + txt = (s.get("text") or "").strip() + if s1 - s0 >= min_dur and txt: + cleaned.append({ + "start": s0, "end": s1, + "text": txt, + "words": s.get("words", []) + }) + if not cleaned: + return [] + + cleaned.sort(key=lambda x: (x["start"], x["end"])) + out = [cleaned[0]] + + def score(x: Dict) -> tuple: + return (x["end"] - x["start"], len(x.get("text", ""))) + + for s in cleaned[1:]: + m = out[-1] + if s["start"] <= m["end"] + max_gap: + best = s if score(s) > score(m) else m + out[-1] = { + "start": min(m["start"], s["start"]), + "end": max(m["end"], s["end"]), + "text": best["text"], + "words": best.get("words", []), + } + else: + out.append(s) + return out + +def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True): + """Schreibe _timed.txt, _suspect_lines.txt und _segments.json.""" + out_dir.mkdir(parents=True, exist_ok=True) + dash = "-" if ascii_dash else "–" + + out_txt = out_dir / f"{base.stem}_timed.txt" + out_sus = out_dir / f"{base.stem}_suspect_lines.txt" + out_json = out_dir / f"{base.stem}_segments.json" + + # TXT nur zur Ansicht + with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus: + for s in segments: + line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n" + f_txt.write(line) + if is_suspect(s["text"]): + f_sus.write(line) + + # JSON für die Weiterverarbeitung (inkl. words) + with open(out_json, "w", encoding="utf-8") as f_json: + json.dump(segments, f_json, ensure_ascii=False, indent=2) + + return out_txt, out_sus, out_json + +def find_default_input() -> Optional[Path]: + """Nimm das erste Video aus INPUT_DIR, falls kein --input übergeben wurde.""" + exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3") + for p in sorted(INPUT_DIR.iterdir()): + if p.suffix.lower() in exts: + return p + return None + +# ────────────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────────────── + +def parse_args(): + p = argparse.ArgumentParser( + description="Chunked Whisper Transcription mit Wortzeitstempeln & doppler-sicherem Stitching." + ) + p.add_argument("--input", type=Path, default=None, help=f"Eingabevideo/-audio. Default: erstes File in {INPUT_DIR}") + p.add_argument("--outdir", type=Path, default=None, help=f"Ausgabeverzeichnis. Default: {TRANSCRIPTS_DIR}") + p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper-Modell (tiny/base/small/medium/large)") + p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Sprachcode (z. B. 
'de') oder leer/None für Auto-Detect") + p.add_argument("--chunk", type=float, default=60.0, help="Chunk-Länge in Sekunden (0 = ganzes File)") + p.add_argument("--overlap", type=float, default=2.0, help="Overlap in Sekunden") + p.add_argument("--min-dur", type=float, default=0.30, help="Mindest-Segmentdauer (Sekunden)") + p.add_argument("--max-gap", type=float, default=0.15, help="Maximaler Zeit-Gap für Merge (Sekunden)") + p.add_argument("--fp16", action="store_true", help="fp16 aktivieren (nur sinnvoll mit GPU)") + return p.parse_args() + +# ────────────────────────────────────────────────────────────────────────────── +# Main +# ────────────────────────────────────────────────────────────────────────────── + +def main(): + # Whisper-Cache (damit Modelle lokal landen) + os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache")) + + args = parse_args() + input_path = args.input or find_default_input() + out_dir = args.outdir or TRANSCRIPTS_DIR + + print("📁 Projekt-Root:", ROOT) + print("📄 Input:", input_path if input_path else "—") + if not input_path or not input_path.exists(): + raise FileNotFoundError(f"Kein gültiges Eingabefile gefunden. Lege ein Video/Audio in {INPUT_DIR} oder nutze --input.") + + out_dir.mkdir(parents=True, exist_ok=True) + + duration = probe_duration(input_path) + print(f"🎬 Dauer: {duration:.2f}s") + + chunks = make_chunks(duration, args.chunk, args.overlap) + print(f"🔪 {len(chunks)} Chunks à {args.chunk:.1f}s mit {args.overlap:.1f}s Overlap") + + # Whisper laden + print(f"🧠 Lade Whisper-Modell: {args.model}") + try: + model = whisper.load_model(args.model) + except Exception as e: + raise RuntimeError(f"Whisper-Modell '{args.model}' konnte nicht geladen werden. Installiert? (pip install openai-whisper)\n{e}") from e + + all_segments: List[Dict] = [] + with TemporaryDirectory() as tmpdir_str: + tmpdir = Path(tmpdir_str) + for i, (start, end) in enumerate(chunks, 1): + print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s") + wav = tmpdir / f"chunk_{i:03d}.wav" + extract_audio_segment(input_path, start, end, wav) + + # Sprache: ''/none = Auto-Detect + lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang + + # Transkribieren mit Wortzeiten, ohne Cross-Chunk-Kontext + result = model.transcribe( + str(wav), + language=lang, + fp16=args.fp16, + word_timestamps=True, + condition_on_previous_text=False, + temperature=0, + verbose=False, + ) + + # Center-Cut: nur Mittelteil behalten (verhindert Grenz-Doppler) + keep_start = start if i == 1 else start + args.overlap / 2.0 + keep_end = end if i == len(chunks) else end - args.overlap / 2.0 + + for seg in result.get("segments", []) or []: + s0 = float(seg["start"]) + start + s1 = float(seg["end"]) + start + mid = (s0 + s1) / 2.0 + if not (keep_start <= mid < keep_end): + continue + + # Wörter mit absoluten Zeiten übernehmen + words = [] + for w in (seg.get("words") or []): + txt = (w.get("word") or w.get("text") or "").strip() + if not txt: + continue + words.append({ + "start": float(w["start"]) + start, + "end": float(w["end"]) + start, + "text": txt + }) + + all_segments.append({ + "start": s0, + "end": s1, + "text": (seg.get("text") or "").strip(), + "words": words + }) + + print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …") + merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur) + print(f"✅ Gemergte Segmente: {len(merged)}") + + out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True) + 
print(f"📝 TXT: {out_txt}") + print(f"⚠️ SUSPECT: {out_sus}") + print(f"💾 JSON: {out_json}") + print("🎉 Fertig.") + +if __name__ == "__main__": + main() diff --git a/src/text/transcription_with_speaker.py b/src/text/transcription_with_speaker.py new file mode 100644 index 0000000..e5912b7 --- /dev/null +++ b/src/text/transcription_with_speaker.py @@ -0,0 +1,88 @@ +import os +import json +import ffmpeg +import whisper +import tempfile +import torch +from tqdm import tqdm +from pathlib import Path +from pyannote.audio import Pipeline + +# === HUGGING FACE TOKEN (für pyannote) === +HF_TOKEN = "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" + +# === Torch Optimierung (optional) === +torch.set_float32_matmul_precision("medium") + +# === Einstellungen === +PROJECT_ROOT = Path(__file__).resolve().parents[2] +input_file = PROJECT_ROOT / "input" / "testVideoShort.mov" +output_dir = PROJECT_ROOT / "transkripte" +output_dir.mkdir(parents=True, exist_ok=True) + +output_txt = output_dir / f"{input_file.stem}_timed.txt" +output_json = output_dir / f"{input_file.stem}_segments.json" + +# === Video in Audio konvertieren === +print("🎞️ Extrahiere Audio ...") +tmp_dir = Path(tempfile.mkdtemp()) +wav_file = tmp_dir / "audio.wav" +ffmpeg.input(str(input_file)).output( + str(wav_file), + format="wav", + acodec="pcm_s16le", + ac=1, + ar="16000", + loglevel="error" +).overwrite_output().run() + +# === Transkription mit Whisper === +print("🧠 Starte Transkription mit Whisper ...") +model = whisper.load_model("small") +result = model.transcribe( + str(wav_file), + language="de", + fp16=False, + word_timestamps=False, + condition_on_previous_text=True, + temperature=0, + verbose=False +) +segments = result["segments"] + +# === Diarisation mit Pyannote === +print("🗣️ Starte Sprecheranalyse mit Pyannote (das dauert jetzt etwas) ...") +pipeline = Pipeline.from_pretrained( + "pyannote/speaker-diarization-3.1", + use_auth_token=HF_TOKEN +) +pipeline.to(torch.device("mps")) # ⬅️ Apple GPU beschleunigen + +diarization = pipeline(str(wav_file)) + +# === Sprecher zuordnen === +def assign_speakers_to_segments(segments, diarization): + assigned = [] + for seg in tqdm(segments, desc="🎙️ Weise Sprecher zu"): + speaker = "unknown" + for turn, _, label in diarization.itertracks(yield_label=True): + if turn.start <= seg["start"] <= turn.end: + speaker = label + break + seg["speaker"] = speaker + assigned.append(seg) + return assigned + +segments_with_speaker = assign_speakers_to_segments(segments, diarization) + +# === Speichern als TXT +with open(output_txt, "w", encoding="utf-8") as f: + for seg in segments_with_speaker: + line = f"[{seg['start']:.2f} – {seg['end']:.2f}] {seg['speaker'].upper()}: {seg['text'].strip()}\n" + f.write(line) + +# === Speichern als JSON +with open(output_json, "w", encoding="utf-8") as f: + json.dump(segments_with_speaker, f, ensure_ascii=False, indent=2) + +print(f"✅ Transkript mit Sprecherinfos gespeichert unter:\n📄 {output_txt}\n📄 {output_json}") diff --git a/text-clustering b/text-clustering deleted file mode 160000 index 7815f8b..0000000 --- a/text-clustering +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb diff --git a/transkripte/.DS_Store b/transkripte/.DS_Store deleted file mode 100644 index 1a1bbf7..0000000 Binary files a/transkripte/.DS_Store and /dev/null differ diff --git a/whisper.cpp b/whisper.cpp deleted file mode 160000 index 2e310b8..0000000 --- a/whisper.cpp +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243