#!/usr/bin/env python3
import argparse
import logging
import json
from pathlib import Path

import cv2
from ultralytics import YOLO
import mediapipe as mp

# === Paths and defaults ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt"

# Make sure the default output directory exists
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# === Lip landmarks (MediaPipe Face Mesh indices) ===
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]


def get_mouth_openness(landmarks, image_height):
    """Compute the mouth opening in pixels from normalized landmark coordinates."""
    top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
    bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
    return abs(bottom_avg - top_avg) * image_height


def iou(boxA, boxB):
    """Compute the intersection-over-union of two bounding boxes in (x, y, w, h) format."""
    ax1, ay1, aw, ah = boxA
    ax2, ay2 = ax1 + aw, ay1 + ah
    bx1, by1, bw, bh = boxB
    bx2, by2 = bx1 + bw, by1 + bh
    inter_x1 = max(ax1, bx1)
    inter_y1 = max(ay1, by1)
    inter_x2 = min(ax2, bx2)
    inter_y2 = min(ay2, by2)
    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    union_area = aw * ah + bw * bh - inter_area
    return inter_area / union_area if union_area > 0 else 0


def process_video(
    video_path: Path,
    output_path: Path,
    model: YOLO,
    face_mesh: mp.solutions.face_mesh.FaceMesh,
    conf_thresh: float,
    frame_skip: int,
    downscale: float,
):
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        logging.error(f"Cannot open video: {video_path}")
        return

    # Some containers report 0 fps; fall back to an assumed 25 fps to avoid
    # a division by zero in the timestamp calculation below.
    fps = cap.get(cv2.CAP_PROP_FPS) or 25.0
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)

    # Stream the JSON output frame by frame instead of buffering everything in memory
    with output_path.open('w', encoding='utf-8') as f_out:
        f_out.write('[\n')
        first = True
        frame_idx = 0

        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_skip > 1 and frame_idx % frame_skip != 0:
                frame_idx += 1
                continue
            if downscale != 1.0:
                frame = cv2.resize(frame, (width, height))

            # YOLO face detection
            detections = model(frame, verbose=False)[0]
            yolo_boxes = []
            for box in detections.boxes:
                conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf)
                if conf < conf_thresh:
                    continue
                coords = box.xyxy[0].cpu().numpy()
                x1, y1, x2, y2 = map(int, coords)
                yolo_boxes.append([x1, y1, x2 - x1, y2 - y1])

            # MediaPipe Face Mesh for landmarks and mouth openness
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_result = face_mesh.process(rgb)
            mp_faces = []
            if mp_result.multi_face_landmarks:
                for landmarks in mp_result.multi_face_landmarks:
                    mouth_px = get_mouth_openness(landmarks.landmark, height)
                    xs = [lm.x * width for lm in landmarks.landmark]
                    ys = [lm.y * height for lm in landmarks.landmark]
                    x1, y1 = int(min(xs)), int(min(ys))
                    x2, y2 = int(max(xs)), int(max(ys))
                    mp_faces.append({
                        "bbox": [x1, y1, x2 - x1, y2 - y1],
                        "mouth_openness": round(mouth_px, 1)
                    })

            # Match each YOLO box to the best-overlapping MediaPipe face via IoU
            combined = []
            for yb in yolo_boxes:
                if mp_faces:
                    best = max(mp_faces, key=lambda m: iou(yb, m["bbox"]))
                    best_iou = iou(yb, best["bbox"])
                    mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0
                else:
                    mouth = 0.0
                x, y, w, h = yb
                cx, cy = x + w / 2, y + h / 2
                combined.append({
                    "bbox": yb,
                    "mouth_openness": round(mouth, 1),
                    "center": [round(cx, 1), round(cy, 1), w, h]
                })

            result = {
                "frame": frame_idx,
                "timestamp": round(frame_idx / fps, 3),
                "faces": combined
            }
            if not first:
                f_out.write(',\n')
            json.dump(result, f_out, ensure_ascii=False)
            first = False
            frame_idx += 1

        f_out.write('\n]')

    cap.release()
    logging.info(f"Processed: {video_path.name} → {output_path.name}")


def main():
    parser = argparse.ArgumentParser(
        description="Analyze videos: detect faces and mouth openness"
    )
    parser.add_argument(
        "--input-dir", type=Path, default=DEFAULT_INPUT_DIR,
        help=f"Directory containing MP4 videos (default: {DEFAULT_INPUT_DIR})"
    )
    parser.add_argument(
        "--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR,
        help=f"Directory for the JSON results (default: {DEFAULT_OUTPUT_DIR})"
    )
    parser.add_argument(
        "--model", type=Path, default=DEFAULT_MODEL_PATH,
        help=f"Path to the YOLOv8 face model (.pt) (default: {DEFAULT_MODEL_PATH})"
    )
    parser.add_argument(
        "--conf-thresh", type=float, default=0.5,
        help="YOLO confidence threshold"
    )
    parser.add_argument(
        "--frame-skip", type=int, default=1,
        help="Process only every n-th frame"
    )
    parser.add_argument(
        "--downscale", type=float, default=1.0,
        help="Scaling factor applied to frames"
    )
    args = parser.parse_args()

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    yolo = YOLO(str(args.model))
    face_mesh = mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=5,
        refine_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )

    for video_path in sorted(args.input_dir.glob("*.mp4")):
        out_path = args.output_dir / f"{video_path.stem}_faces.json"
        process_video(
            video_path,
            out_path,
            yolo,
            face_mesh,
            args.conf_thresh,
            args.frame_skip,
            args.downscale,
        )


if __name__ == "__main__":
    main()
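
# Usage sketch (not part of the original script): the command below assumes the
# file is saved as analyze_faces.py — the filename is only illustrative — and
# that the default project layout defined at the top of this file exists.
#
#   python analyze_faces.py --frame-skip 2 --downscale 0.5 --conf-thresh 0.4
#
# Each <clip>.mp4 found in --input-dir then produces <clip>_faces.json in
# --output-dir, containing one JSON object per processed frame with per-face
# bounding boxes, centers, and mouth-openness values.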