#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Detect faces in video clips with a YOLOv8 face model, estimate per-face mouth
openness via MediaPipe FaceMesh, and write per-frame results to JSON."""

import argparse
import logging
import json
import time
from pathlib import Path
from contextlib import nullcontext

import cv2
import numpy as np
import torch
from ultralytics import YOLO
import mediapipe as mp

# Pretty progress bars if tqdm is available
try:
    from tqdm import tqdm
    _HAS_TQDM = True
except Exception:
    _HAS_TQDM = False

from src.reformat.new.speaking import get_mouth_openness

# ---------- Performance tweaks ----------
torch.set_float32_matmul_precision("high")
cv2.setUseOptimized(True)


# ---------- Helper functions ----------
def make_square_crop(x1, y1, x2, y2, W, H, margin_scale, min_crop):
    """Expand a bbox by margin_scale, make it square (at least min_crop), and clamp it to the image."""
    cx = (x1 + x2) * 0.5
    cy = (y1 + y2) * 0.5
    w = (x2 - x1) * (1.0 + 2.0 * margin_scale)
    h = (y2 - y1) * (1.0 + 2.0 * margin_scale)
    side = max(w, h, float(min_crop))
    half = side * 0.5
    sx1 = int(max(0, round(cx - half)))
    sy1 = int(max(0, round(cy - half)))
    sx2 = int(min(W, round(cx + half)))
    sy2 = int(min(H, round(cy + half)))
    side_w = max(0, sx2 - sx1)
    side_h = max(0, sy2 - sy1)
    side = max(2, min(side_w, side_h))
    sx2 = sx1 + side
    sy2 = sy1 + side
    return sx1, sy1, sx2, sy2


def pick_landmarks_near_crop_center(lm_lists, crop_w, crop_h):
    """Return the landmark set whose centroid lies closest to the crop center."""
    if not lm_lists:
        return None
    cx_t, cy_t = crop_w * 0.5, crop_h * 0.5
    best, best_d = None, 1e12
    for lms in lm_lists:
        xs = [p.x * crop_w for p in lms.landmark]
        ys = [p.y * crop_h for p in lms.landmark]
        cx = sum(xs) / len(xs)
        cy = sum(ys) / len(ys)
        d = (cx - cx_t) ** 2 + (cy - cy_t) ** 2
        if d < best_d:
            best, best_d = lms, d
    return best


def run_mesh(face_mesh, crop_bgr, upscale_if_small):
    """Run FaceMesh on a BGR crop (upscaled if smaller than upscale_if_small) and
    return (landmarks, mouth_openness); (None, 0.0) if no face is found."""
    if crop_bgr.size == 0:
        return None, 0.0
    ch, cw = crop_bgr.shape[:2]
    if max(ch, cw) < upscale_if_small:
        scale = float(upscale_if_small) / max(ch, cw)
        new_w = max(1, int(round(cw * scale)))
        new_h = max(1, int(round(ch * scale)))
        crop_bgr = cv2.resize(crop_bgr, (new_w, new_h), interpolation=cv2.INTER_CUBIC)
        ch, cw = new_h, new_w
    rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
    res = face_mesh.process(rgb)
    if not res.multi_face_landmarks:
        return None, 0.0
    chosen = pick_landmarks_near_crop_center(res.multi_face_landmarks, cw, ch)
    if chosen is None:
        return None, 0.0
    mo = get_mouth_openness(chosen.landmark, ch)
    return chosen, float(mo)


# ---------- Core processing ----------
def process_video(video_path: Path, output_path: Path, model: YOLO, face_mesh,
                  conf_thresh: float, frame_skip: int, downscale: float,
                  expansion_1: float, expansion_2: float, min_crop: int,
                  faces_upscale: int, imgsz: int, device: str, max_det: int):
    """Detect faces per frame with YOLO, estimate mouth openness with FaceMesh,
    and write the per-frame results for one video to output_path as JSON."""
    print(f"🎬 Starting detection: {video_path.name}")
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        logging.error(f"❌ Cannot open video: {video_path}")
        return

    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    orig_w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    total_frames_raw = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    # With frame_skip > 1, the number of frames actually processed shrinks accordingly
    total_to_process = None
    if total_frames_raw > 0:
        total_to_process = (total_frames_raw + (frame_skip - 1)) // max(1, frame_skip)

    scaled_w = max(1, int(round(orig_w * downscale)))
    scaled_h = max(1, int(round(orig_h * downscale)))

    data = []
    frame_idx = 0
    processed_frames = 0
    sx = (orig_w / scaled_w) if downscale != 1.0 else 1.0
    sy = (orig_h / scaled_h) if downscale != 1.0 else 1.0

    autocast_ctx = (
        torch.autocast(device_type=device, dtype=torch.float16)
        if device in ("mps", "cuda") else nullcontext()
    )

    # Per-video progress bar
    bar = None
    start_t = time.time()
    if _HAS_TQDM:
        bar = tqdm(total=total_to_process, desc=f"{video_path.name}", unit="f", leave=True)

    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if frame_skip > 1 and (frame_idx % frame_skip != 0):
            frame_idx += 1
            continue

        frame_infer = frame if downscale == 1.0 else cv2.resize(
            frame, (scaled_w, scaled_h), interpolation=cv2.INTER_AREA)

        with torch.no_grad():
            with autocast_ctx:
                detections = model(frame_infer, imgsz=imgsz, device=device, verbose=False,
                                   conf=conf_thresh, iou=0.5, max_det=max_det)[0]

        faces = []
        for i in range(len(detections.boxes)):
            box = detections.boxes[i]
            conf = float(box.conf[0]) if hasattr(box.conf, "__len__") else float(box.conf)
            if conf < conf_thresh:
                continue

            x1, y1, x2, y2 = [float(v) for v in box.xyxy[0].tolist()]
            if downscale != 1.0:
                x1 *= sx; x2 *= sx; y1 *= sy; y2 *= sy
            x1 = max(0.0, min(x1, orig_w - 1))
            y1 = max(0.0, min(y1, orig_h - 1))
            x2 = max(0.0, min(x2, orig_w - 1))
            y2 = max(0.0, min(y2, orig_h - 1))
            w = max(1.0, x2 - x1)
            h = max(1.0, y2 - y1)
            cx = x1 + w / 2.0
            cy = y1 + h / 2.0

            # Pass 1: tight square crop with the smaller expansion
            sx1, sy1, sx2, sy2 = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_1, min_crop)
            if sx2 - sx1 < 4 or sy2 - sy1 < 4:
                continue
            face_crop = frame[sy1:sy2, sx1:sx2]
            _, mouth_open = run_mesh(face_mesh, face_crop, faces_upscale)

            # Pass 2 only if needed: retry with the larger expansion
            if mouth_open == 0.0:
                sx1b, sy1b, sx2b, sy2b = make_square_crop(x1, y1, x2, y2, orig_w, orig_h, expansion_2, min_crop)
                if (sx2b - sx1b) >= 4 and (sy2b - sy1b) >= 4:
                    face_crop_b = frame[sy1b:sy2b, sx1b:sx2b]
                    _, mouth_open = run_mesh(face_mesh, face_crop_b, faces_upscale)

            faces.append({
                "bbox": [int(round(x1)), int(round(y1)), int(round(w)), int(round(h))],
                "conf": round(conf, 3),
                "center": [round(cx, 1), round(cy, 1)],
                "mouth_openness": round(float(mouth_open), 3)
            })

        data.append({
            "frame": frame_idx,
            "timestamp": round(frame_idx / fps, 3),
            "W": orig_w,
            "H": orig_h,
            "faces": faces
        })

        frame_idx += 1
        processed_frames += 1

        # Update progress
        if _HAS_TQDM:
            bar.update(1)
        else:
            # Lightweight fallback: print an ETA every 30 processed frames
            if processed_frames % 30 == 0:
                elapsed = time.time() - start_t
                rate = processed_frames / max(1e-6, elapsed)  # frames/sec
                if total_to_process:
                    remaining = max(0, total_to_process - processed_frames)
                    eta_sec = remaining / max(1e-6, rate)
                    print(f"[{video_path.name}] {processed_frames}/{total_to_process} "
                          f"({processed_frames/total_to_process*100:.1f}%) "
                          f"— {rate:.1f} f/s — ETA {eta_sec/60:.1f} min")
                else:
                    print(f"[{video_path.name}] {processed_frames} frames — {rate:.1f} f/s")

    cap.release()
    if _HAS_TQDM and bar is not None:
        bar.close()

    output_path.write_text(json.dumps(data, ensure_ascii=False), encoding="utf-8")
    print(f"✅ Faces saved: {output_path.name}")


def main():
    parser = argparse.ArgumentParser()
    # Directories
    parser.add_argument("--input-dir", type=Path,
                        default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/output/raw_clips"))
    parser.add_argument("--output-dir", type=Path,
                        default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/data/face_data_combined"))
    parser.add_argument("--model", type=Path,
                        default=Path("/Users/juppkerschbaum/PycharmProjects/BachlorArbeit/models/yolov8n-face.pt"))
    # Tuned defaults (no presets needed)
    parser.add_argument("--conf-thresh", type=float, default=0.35)
    parser.add_argument("--frame-skip", type=int, default=1)
    parser.add_argument("--downscale", type=float, default=0.5)
    parser.add_argument("--expansion", type=float, default=0.4)
    parser.add_argument("--expansion2", type=float, default=0.8)
    parser.add_argument("--min-crop", type=int, default=160)
    parser.add_argument("--faces-upscale", type=int, default=192)
    parser.add_argument("--imgsz", type=int, default=448)
    parser.add_argument("--max-det", type=int, default=20)
    parser.add_argument("--use-refine", action="store_true", default=False)
    args = parser.parse_args()

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    # Model & device
    yolo = YOLO(str(args.model))
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    yolo.to(device)
    print(f"🖥️ Inference device: {device}")

    # Warm-up (reduces startup jitter)
    try:
        with torch.no_grad():
            dummy = np.zeros((args.imgsz, args.imgsz, 3), dtype=np.uint8)
            _ = yolo.predict(source=[dummy], imgsz=args.imgsz, verbose=False, device=device)
    except Exception:
        pass

    # List of videos (for the overall progress bar)
    videos = sorted(args.input_dir.glob("*.mp4"))
    print(f"🔍 Input folder: {args.input_dir.resolve()}")
    print("📁 Files:")
    for p in sorted(args.input_dir.glob("*")):
        print(" →", p.name)

    # Overall progress bar, one tick per video
    outer = None
    if _HAS_TQDM:
        outer = tqdm(total=len(videos), desc="Overall", unit="vid", leave=False)

    with mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=10,
        refine_landmarks=args.use_refine,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    ) as face_mesh:
        for vid in videos:
            out = args.output_dir / f"{vid.stem}_faces.json"
            process_video(
                video_path=vid,
                output_path=out,
                model=yolo,
                face_mesh=face_mesh,
                conf_thresh=args.conf_thresh,
                frame_skip=args.frame_skip,
                downscale=args.downscale,
                expansion_1=args.expansion,
                expansion_2=args.expansion2,
                min_crop=args.min_crop,
                faces_upscale=args.faces_upscale,
                imgsz=args.imgsz,
                device=device,
                max_det=args.max_det
            )
            if _HAS_TQDM and outer is not None:
                outer.update(1)

    if _HAS_TQDM and outer is not None:
        outer.close()


if __name__ == "__main__":
    main()
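
# Example invocation (a sketch, not part of the original project): the script
# filename "detect_faces.py" is an assumption, and the relative paths simply
# mirror the absolute defaults defined above.
#
#   python detect_faces.py \
#       --input-dir data/output/raw_clips \
#       --output-dir data/face_data_combined \
#       --model models/yolov8n-face.pt \
#       --frame-skip 2 --downscale 0.5 --imgsz 448
#
# Each <clip>.mp4 in --input-dir produces <clip>_faces.json in --output-dir,
# holding per-frame face bboxes, confidences, centers, and mouth openness.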