207 lines
6.6 KiB
Python
207 lines
6.6 KiB
Python
#!/usr/bin/env python3
|
|
import argparse
|
|
import logging
|
|
import json
|
|
from pathlib import Path
|
|
|
|
import cv2
|
|
from ultralytics import YOLO
|
|
import mediapipe as mp
|
|
|
|
# === Paths and default values ===
# All defaults are resolved relative to the project root, which is taken to be
# two directory levels above the folder that contains this script.
SCRIPT_DIR = Path(__file__).resolve().parent
PROJECT_ROOT = SCRIPT_DIR.parents[1]
DEFAULT_INPUT_DIR = PROJECT_ROOT.joinpath("data", "output", "raw_clips")
DEFAULT_OUTPUT_DIR = PROJECT_ROOT.joinpath("data", "face_data_combined")
DEFAULT_MODEL_PATH = PROJECT_ROOT.joinpath("models", "yolov8n-face.pt")

# Make sure the default output directory exists.
# NOTE: this runs at import time, not only when the script is executed.
DEFAULT_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# === Lip landmarks ===
# Indices into a face-mesh landmark list; get_mouth_openness() averages the
# y coordinate over each group to estimate the upper and lower lip position.
TOP_LIPS = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409]
BOTTOM_LIPS = [146, 91, 181, 84, 17, 314, 405, 321, 375, 291]
|
|
|
|
|
|
def get_mouth_openness(landmarks, image_height):
    """Return the mouth opening in pixels.

    The opening is the absolute difference between the mean ``y`` of the
    ``TOP_LIPS`` landmarks and the mean ``y`` of the ``BOTTOM_LIPS``
    landmarks, multiplied by ``image_height``.

    Args:
        landmarks: Indexable collection whose items expose a ``y`` attribute;
            must be indexable by every entry of ``TOP_LIPS`` and
            ``BOTTOM_LIPS``. The ``y`` values are treated as normalised
            coordinates (they are scaled by ``image_height``).
        image_height: Height in pixels used to scale the normalised distance.

    Returns:
        The non-negative lip distance in pixels.
    """

    def mean_y(indices):
        # Plain accumulation starting at 0 mirrors the built-in sum().
        total = 0
        for idx in indices:
            total += landmarks[idx].y
        return total / len(indices)

    upper_lip_y = mean_y(TOP_LIPS)
    lower_lip_y = mean_y(BOTTOM_LIPS)
    return abs(lower_lip_y - upper_lip_y) * image_height
|
|
|
|
|
|
def iou(boxA, boxB):
    """Return the intersection-over-union of two boxes.

    Args:
        boxA: First box as ``(x, y, w, h)``.
        boxB: Second box as ``(x, y, w, h)``.

    Returns:
        ``intersection / union`` of the two boxes, or ``0`` when the union
        area is not positive (e.g. both boxes are empty).
    """
    left_a, top_a, width_a, height_a = boxA
    left_b, top_b, width_b, height_b = boxB

    # Extent of the overlap along each axis; negative means "no overlap".
    overlap_w = min(left_a + width_a, left_b + width_b) - max(left_a, left_b)
    overlap_h = min(top_a + height_a, top_b + height_b) - max(top_a, top_b)
    overlap_area = max(0, overlap_w) * max(0, overlap_h)

    total_area = width_a * height_a + width_b * height_b - overlap_area
    if total_area > 0:
        return overlap_area / total_area
    return 0
|
|
|
|
|
|
def _detect_yolo_boxes(model, frame, conf_thresh):
    """Run the YOLO model on ``frame`` and return ``[x, y, w, h]`` boxes at or above ``conf_thresh``."""
    detections = model(frame, verbose=False)[0]
    boxes = []
    for box in detections.boxes:
        # ``box.conf`` is accepted both as an indexable value and as a scalar.
        conf = float(box.conf[0] if hasattr(box.conf, "__getitem__") else box.conf)
        if conf < conf_thresh:
            continue
        x1, y1, x2, y2 = map(int, box.xyxy[0].cpu().numpy())
        boxes.append([x1, y1, x2 - x1, y2 - y1])
    return boxes


def _detect_mesh_faces(face_mesh, frame, width, height):
    """Run FaceMesh on a BGR ``frame`` and return one ``{"bbox", "mouth_openness"}`` dict per face."""
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    mp_result = face_mesh.process(rgb)
    faces = []
    # ``multi_face_landmarks`` is falsy when no face was found.
    for landmarks in mp_result.multi_face_landmarks or []:
        points = landmarks.landmark
        mouth_px = get_mouth_openness(points, height)
        # Landmark coordinates are normalised; scale them to pixel space.
        xs = [lm.x * width for lm in points]
        ys = [lm.y * height for lm in points]
        x1, y1 = int(min(xs)), int(min(ys))
        x2, y2 = int(max(xs)), int(max(ys))
        faces.append({
            "bbox": [x1, y1, x2 - x1, y2 - y1],
            "mouth_openness": round(mouth_px, 1),
        })
    return faces


def _combine_detections(yolo_boxes, mp_faces, min_iou=0.2):
    """Attach to every YOLO box the mouth openness of the best-overlapping FaceMesh face."""
    combined = []
    for yb in yolo_boxes:
        mouth = 0.0
        if mp_faces:
            # Compute each IoU once and keep the face with the highest value.
            best_iou, best = max(
                ((iou(yb, face["bbox"]), face) for face in mp_faces),
                key=lambda pair: pair[0],
            )
            if best_iou > min_iou:
                mouth = best["mouth_openness"]

        x, y, w, h = yb
        cx, cy = x + w / 2, y + h / 2
        combined.append({
            "bbox": yb,
            "mouth_openness": round(mouth, 1),
            # Despite its name this entry holds centre x/y followed by w/h.
            "center": [round(cx, 1), round(cy, 1), w, h],
        })
    return combined


def process_video(
    video_path: Path,
    output_path: Path,
    model: YOLO,
    face_mesh: mp.solutions.face_mesh.FaceMesh,
    conf_thresh: float,
    frame_skip: int,
    downscale: float,
) -> None:
    """Detect faces and mouth openness in one video and stream the result to JSON.

    The output file is a JSON array with one object per processed frame:
    ``{"frame": int, "timestamp": float | null, "faces": [...]}``, where each
    face has ``bbox`` (``[x, y, w, h]``), ``mouth_openness`` (pixels) and
    ``center`` (``[cx, cy, w, h]``). Coordinates refer to the (optionally
    downscaled) frame.

    Args:
        video_path: Video file to read.
        output_path: JSON file to write (overwritten).
        model: YOLO face detector; called as ``model(frame, verbose=False)``.
        face_mesh: MediaPipe FaceMesh instance used for lip landmarks.
        conf_thresh: YOLO detections below this confidence are dropped.
        frame_skip: When greater than 1, only every ``frame_skip``-th frame is
            processed; values of 1 or less process every frame.
        downscale: Factor applied to the frame width and height before
            detection; ``1.0`` disables resizing.

    Returns:
        None. If the video cannot be opened or the downscaled size is not
        positive, an error is logged and no output file is written.
    """
    cap = cv2.VideoCapture(str(video_path))
    try:
        if not cap.isOpened():
            logging.error("Kann Video nicht öffnen: %s", video_path)
            return

        fps = cap.get(cv2.CAP_PROP_FPS)
        # Some containers report 0 (or NaN) FPS; dividing by it would abort the
        # run mid-file, so timestamps are written as null in that case.
        has_fps = fps > 0
        if not has_fps:
            logging.warning(
                "Ungültige FPS (%s) in %s – Zeitstempel werden als null geschrieben",
                fps,
                video_path,
            )

        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH) * downscale)
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT) * downscale)
        if downscale != 1.0 and (width <= 0 or height <= 0):
            # cv2.resize cannot produce an empty image; fail before writing anything.
            logging.error(
                "Ungültige Zielgröße %dx%d für %s (downscale=%s)",
                width,
                height,
                video_path,
                downscale,
            )
            return

        # Stream the JSON array entry by entry instead of building it in memory.
        with output_path.open('w', encoding='utf-8') as f_out:
            f_out.write('[\n')
            first = True
            frame_idx = 0

            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if frame_skip > 1 and frame_idx % frame_skip != 0:
                    frame_idx += 1
                    continue

                if downscale != 1.0:
                    frame = cv2.resize(frame, (width, height))

                yolo_boxes = _detect_yolo_boxes(model, frame, conf_thresh)
                mp_faces = _detect_mesh_faces(face_mesh, frame, width, height)

                result = {
                    "frame": frame_idx,
                    "timestamp": round(frame_idx / fps, 3) if has_fps else None,
                    "faces": _combine_detections(yolo_boxes, mp_faces),
                }

                if not first:
                    f_out.write(',\n')
                json.dump(result, f_out, ensure_ascii=False)
                first = False
                frame_idx += 1

            f_out.write('\n]')
    finally:
        # Release the capture on every exit path, including exceptions.
        cap.release()

    logging.info("Verarbeitet: %s → %s", video_path.name, output_path.name)
|
|
|
|
|
|
def main():
    """Parse the command line and analyse every ``*.mp4`` in the input directory.

    For each video a ``<stem>_faces.json`` file is written to the output
    directory. The YOLO model is loaded once; a fresh FaceMesh instance is
    created per video so that landmark tracking state from one clip does not
    carry over into the next, and so that the instance is closed afterwards.
    """
    parser = argparse.ArgumentParser(
        description="Analyse von Videos: Gesichter und Mundöffnung erkennen"
    )
    parser.add_argument(
        "--input-dir", type=Path,
        default=DEFAULT_INPUT_DIR,
        help=f"Verzeichnis mit MP4-Videos (standard: {DEFAULT_INPUT_DIR})"
    )
    parser.add_argument(
        "--output-dir", type=Path,
        default=DEFAULT_OUTPUT_DIR,
        help=f"Verzeichnis für JSON-Ergebnisse (standard: {DEFAULT_OUTPUT_DIR})"
    )
    parser.add_argument(
        "--model", type=Path,
        default=DEFAULT_MODEL_PATH,
        help=f"Pfad zum YOLOv8-Face-Modell (.pt) (standard: {DEFAULT_MODEL_PATH})"
    )
    parser.add_argument(
        "--conf-thresh", type=float, default=0.5,
        help="Schwelle für YOLO-Confidence"
    )
    parser.add_argument(
        "--frame-skip", type=int, default=1,
        help="Nur jede n-te Frame verarbeiten"
    )
    parser.add_argument(
        "--downscale", type=float, default=1.0,
        help="Skalierungsfaktor für Frames"
    )
    args = parser.parse_args()

    # A non-positive scale factor cannot yield a valid frame size.
    if args.downscale <= 0:
        parser.error("--downscale muss größer als 0 sein")

    logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
    args.output_dir.mkdir(parents=True, exist_ok=True)

    video_paths = sorted(args.input_dir.glob("*.mp4"))
    if not video_paths:
        # Tell the user why nothing happened; skip loading the model.
        logging.warning("Keine MP4-Videos gefunden in: %s", args.input_dir)
        return

    yolo = YOLO(str(args.model))

    for video_path in video_paths:
        out_path = args.output_dir / f"{video_path.stem}_faces.json"
        # static_image_mode=False makes FaceMesh track faces across frames, so
        # each video gets its own instance; the context manager closes it.
        with mp.solutions.face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=5,
            refine_landmarks=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        ) as face_mesh:
            process_video(
                video_path,
                out_path,
                yolo,
                face_mesh,
                args.conf_thresh,
                args.frame_skip,
                args.downscale,
            )
|
|
|
|
|
|
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|