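# Renders one preview video per clip: every detected face gets a bounding box,
# and the face judged to be the active speaker (widest mouth opening above a
# simple threshold) is drawn in green.
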
import cv2
import json
from pathlib import Path
from tqdm import tqdm

SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_DIR = SCRIPT_DIR.parents[1]  # walks up from /src/reformat to /BachlorArbeit

FACES_DIR = PROJECT_DIR / "data" / "face_data_combined"
INPUT_VIDEO_DIR = PROJECT_DIR / "data" / "output" / "raw_clips"
OUTPUT_DIR = PROJECT_DIR / "output" / "output_preview_faces"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# === Iterate over all *_faces.json files ===
face_files = sorted(FACES_DIR.glob("*_faces.json"))

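# Each "<clip>_faces.json" is expected to have a matching "<clip>.mp4" in INPUT_VIDEO_DIR.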
for face_file in tqdm(face_files, desc="🔍 Generating previews with speaker detection"):
    clip_name = face_file.stem.replace("_faces", "") + ".mp4"
    input_path = INPUT_VIDEO_DIR / clip_name
    output_path = OUTPUT_DIR / clip_name.replace(".mp4", "_preview_faces.mp4")

    if not input_path.exists():
        print(f"❌ Clip not found: {clip_name}")
        continue

    # Video setup
    cap = cv2.VideoCapture(str(input_path))
    fps = cap.get(cv2.CAP_PROP_FPS)
    fps = fps if fps > 1 else 25  # fallback if FPS is reported as 0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*"avc1")  # more widely compatible than mp4v
    out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

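    # Assumed face-record layout (inferred from the accesses below, not from a
    # documented schema):
    # [{"frame": 0, "faces": [{"bbox": [x, y, w, h], "mouth_openness": 0.12}, ...]}, ...]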
    # Load the face data and index it by frame number (frames without faces are dropped)
    data = json.loads(face_file.read_text())
    data_by_frame = {d["frame"]: d["faces"] for d in data if d["faces"]}

print(f"🔢 Frames mit Gesichtern: {len(data_by_frame)}")
|
|
|
|
    frame_idx = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break

        faces = data_by_frame.get(frame_idx, [])
        speaker_idx = None

        # Pick the speaker by mouth openness: the widest-open mouth wins,
        # but only if at least one face clears the activity threshold
        if faces and all("mouth_openness" in f for f in faces):
            mouth_vals = [f["mouth_openness"] for f in faces]
            if any(v > 0.01 for v in mouth_vals):  # simple activity threshold
                speaker_idx = mouth_vals.index(max(mouth_vals))

        for i, face in enumerate(faces):
            x, y, w, h = face["bbox"]
            color = (0, 255, 0) if i == speaker_idx else (255, 255, 255)  # green = speaker
            label = f"Mouth: {face.get('mouth_openness', 0):.2f}"

            # Debug output (optional; prints one line per face per frame)
            print(f"Frame {frame_idx} | Face {i} | BBox: ({x},{y},{w},{h}) | Speaker: {speaker_idx}")

            cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
            cv2.putText(frame, label, (x, y - 10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        out.write(frame)
        frame_idx += 1

    cap.release()
    out.release()
    print(f"✅ Preview exported: {output_path.name}")

print("🏁 Alle Vorschauvideos mit Sprecherkennung erstellt.")
|