#!/usr/bin/env python3
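"""
Detect faces in MP4 clips with a YOLOv8 face model, estimate per-face mouth
openness with MediaPipe Face Mesh, match the two detectors frame by frame via
IoU, and stream the results to one JSON file per clip.

Each output file is a JSON list of per-frame records of the form
    {"frame": int, "timestamp": float,
     "faces": [{"bbox": [x, y, w, h], "mouth_openness": float,
                "center": [cx, cy, w, h]}, ...]}
"""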
import argparse
import logging
import json
from pathlib import Path
import cv2
from ultralytics import YOLO
import mediapipe as mp
# === Paths and defaults ===
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DEFAULT_INPUT_DIR = PROJECT_ROOT / "data" / "output" / "raw_clips"
DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "data" / "face_data_combined"
DEFAULT_MODEL_PATH = PROJECT_ROOT / "models" / "yolov8n-face.pt"
# Ensure the default output directory exists
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
# === Lip landmarks (MediaPipe Face Mesh indices) ===
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
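# Mouth openness is taken as the vertical gap between the mean y-coordinate of
# the upper-lip and lower-lip landmark rows above, scaled to pixels.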
def get_mouth_openness(landmarks, image_height):
    """
    Computes the mouth opening in pixels from normalized landmark coordinates.
    """
    top_avg = sum(landmarks[i].y for i in TOP_LIPS) / len(TOP_LIPS)
    bottom_avg = sum(landmarks[i].y for i in BOTTOM_LIPS) / len(BOTTOM_LIPS)
    return abs(bottom_avg - top_avg) * image_height


def iou(boxA, boxB):
    """Computes the Intersection-over-Union of two bounding boxes in (x, y, w, h) format."""
    ax1, ay1, aw, ah = boxA
    ax2, ay2 = ax1 + aw, ay1 + ah
    bx1, by1, bw, bh = boxB
    bx2, by2 = bx1 + bw, by1 + bh
    inter_x1 = max(ax1, bx1)
    inter_y1 = max(ay1, by1)
    inter_x2 = min(ax2, bx2)
    inter_y2 = min(ay2, by2)
    inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1)
    union_area = aw * ah + bw * bh - inter_area
    return inter_area / union_area if union_area > 0 else 0


def process_video(
    video_path: Path,
    output_path: Path,
    model: YOLO,
    face_mesh: mp.solutions.face_mesh.FaceMesh,
    conf_thresh: float,
    frame_skip: int,
    downscale: float,
):
    cap = cv2.VideoCapture(str(video_path))
    if not cap.isOpened():
        logging.error(f"Cannot open video: {video_path}")
        return
    fps = cap.get(cv2.CAP_PROP_FPS)
    width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
    height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)
    # Stream the JSON output instead of collecting all frames in memory
    with output_path.open('w', encoding='utf-8') as f_out:
        f_out.write('[\n')
        first = True
        frame_idx = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_skip > 1 and frame_idx % frame_skip != 0:
                frame_idx += 1
                continue
            if downscale != 1.0:
                frame = cv2.resize(frame, (width, height))
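            # YOLO pass: collect face boxes above the confidence threshold as (x, y, w, h)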
            detections = model(frame, verbose=False)[0]
            yolo_boxes = []
            for box in detections.boxes:
                conf = float(box.conf[0] if hasattr(box.conf, '__getitem__') else box.conf)
                if conf < conf_thresh:
                    continue
                coords = box.xyxy[0].cpu().numpy()
                x1, y1, x2, y2 = map(int, coords)
                yolo_boxes.append([x1, y1, x2 - x1, y2 - y1])
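            # MediaPipe pass: Face Mesh landmarks yield a bounding box plus mouth openness per face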
            rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            mp_result = face_mesh.process(rgb)
            mp_faces = []
            if mp_result.multi_face_landmarks:
                for landmarks in mp_result.multi_face_landmarks:
                    mouth_px = get_mouth_openness(landmarks.landmark, height)
                    xs = [lm.x * width for lm in landmarks.landmark]
                    ys = [lm.y * height for lm in landmarks.landmark]
                    x1, y1 = int(min(xs)), int(min(ys))
                    x2, y2 = int(max(xs)), int(max(ys))
                    mp_faces.append({
                        "bbox": [x1, y1, x2 - x1, y2 - y1],
                        "mouth_openness": round(mouth_px, 1)
                    })
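            # Fusion: keep the YOLO boxes and attach the mouth openness of the best-matching
            # Face Mesh face (IoU > 0.2); otherwise report the mouth as closed (0.0)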
            combined = []
            for yb in yolo_boxes:
                if mp_faces:
                    best = max(mp_faces, key=lambda m: iou(yb, m["bbox"]))
                    best_iou = iou(yb, best["bbox"])
                    mouth = best["mouth_openness"] if best_iou > 0.2 else 0.0
                else:
                    mouth = 0.0
                x, y, w, h = yb
                cx, cy = x + w / 2, y + h / 2
                combined.append({
                    "bbox": yb,
                    "mouth_openness": round(mouth, 1),
                    "center": [round(cx, 1), round(cy, 1), w, h]
                })
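            # Write one JSON record per processed frame; the `first` flag prevents a comma
            # before the first record so the streamed output stays a valid JSON array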
            result = {
                "frame": frame_idx,
                "timestamp": round(frame_idx / fps, 3),
                "faces": combined
            }
            if not first:
                f_out.write(',\n')
            json.dump(result, f_out, ensure_ascii=False)
            first = False
            frame_idx += 1
        f_out.write('\n]')
    cap.release()
    logging.info(f"Processed: {video_path.name} -> {output_path.name}")


def main():
    parser = argparse.ArgumentParser(
        description="Video analysis: detect faces and mouth openness"
    )
    parser.add_argument(
        "--input-dir", type=Path,
        default=DEFAULT_INPUT_DIR,
        help=f"Directory containing MP4 videos (default: {DEFAULT_INPUT_DIR})"
    )
    parser.add_argument(
        "--output-dir", type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help=f"Directory for the JSON results (default: {DEFAULT_OUTPUT_DIR})"
    )
    parser.add_argument(
        "--model", type=Path,
        default=DEFAULT_MODEL_PATH,
        help=f"Path to the YOLOv8 face model (.pt) (default: {DEFAULT_MODEL_PATH})"
    )
    parser.add_argument(
        "--conf-thresh", type=float, default=0.5,
        help="Confidence threshold for YOLO detections"
    )
    parser.add_argument(
        "--frame-skip", type=int, default=1,
        help="Process only every n-th frame"
    )
    parser.add_argument(
        "--downscale", type=float, default=1.0,
        help="Scaling factor applied to each frame"
    )
    args = parser.parse_args()
    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)
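    # Load the YOLOv8 face detector once and share a single Face Mesh instance
    # (video mode, up to 5 faces) across all clips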
    yolo = YOLO(str(args.model))
    face_mesh = mp.solutions.face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=5,
        refine_landmarks=True,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5
    )
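    # Analyze every MP4 clip in the input directory; each clip gets a <stem>_faces.json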
    for video_path in sorted(args.input_dir.glob("*.mp4")):
        out_path = args.output_dir / f"{video_path.stem}_faces.json"
        process_video(
            video_path,
            out_path,
            yolo,
            face_mesh,
            args.conf_thresh,
            args.frame_skip,
            args.downscale,
        )


if __name__ == "__main__":
    main()