bachlorarbeit/src/reformat/old/analyze_crop_position.py
2025-10-19 16:22:26 +02:00

236 lines
8.1 KiB
Python

import argparse
import json
import logging
import math
import random
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
class FaceTracker:
    """Follow the most likely speaker's face and produce a smoothed crop center.

    For every frame the tracker:

    1. picks the face with the largest ``mouth_openness`` as speaker candidate
       and only switches to a far-away candidate after it has been seen for
       ``switch_frames`` consecutive frames,
    2. flags panning when the mean absolute horizontal movement of the raw
       center over the last ``panning_window`` frames exceeds
       ``panning_threshold``,
    3. returns a moving average of the raw center over ``smooth_window``
       frames, bypassing (and resetting) the average when the raw center is
       further than ``scene_jump_threshold`` from the previous output.

    Args:
        dist_threshold: Pixel distance below which a detection counts as the
            same position as the current raw center (or current candidate).
        switch_frames: Consecutive frames a distant candidate must persist
            before the raw center moves to it.
        panning_window: Number of frames averaged for panning detection.
            Values below 1 are treated as 1.
        panning_threshold: Mean absolute x-movement per frame above which a
            frame is reported as panning.
        smooth_window: Number of frames in the moving average. Values below 1
            are treated as 1 (i.e. no smoothing).
        scene_jump_threshold: Distance between the raw center and the previous
            output above which smoothing is skipped for this frame.
        initial_center: Center used before any face has been seen. The default
            (960.0, 540.0) is the midpoint of a 1920x1080 frame.
    """

    def __init__(
        self,
        dist_threshold: float,
        switch_frames: int,
        panning_window: int,
        panning_threshold: float,
        smooth_window: int,
        scene_jump_threshold: float,
        initial_center: Tuple[float, float] = (960.0, 540.0),
    ):
        self.dist_threshold = dist_threshold
        self.switch_frames = switch_frames
        self.panning_window = panning_window
        self.panning_threshold = panning_threshold
        self.smooth_window = smooth_window
        self.scene_jump_threshold = scene_jump_threshold

        # Set once here; none of the methods in this class update it.
        self.current_center: Tuple[float, float] = (initial_center[0], initial_center[1])
        # Unsmoothed center the tracker currently follows.
        self.raw_center: Tuple[float, float] = self.current_center
        # Last value returned by _smooth_center (reference for jump detection).
        self.prev_center: Tuple[float, float] = self.current_center
        # Raw center of the previous frame (reference for panning detection).
        self.prev_raw: Tuple[float, float] = self.current_center

        # Pending speaker position and how many frames in a row it was seen.
        self.candidate_center: Optional[Tuple[float, float]] = None
        self.switch_counter: int = 0

        # True once any frame contained a face with mouth data.
        self.last_speaker_set: bool = False
        # Fallback face picked at random while no speaker has been seen yet.
        self.random_center: Optional[Tuple[float, float]] = None

        self.panning_buffer: List[float] = []
        self.smooth_buffer: List[Tuple[float, float]] = []

    @staticmethod
    def _effective_window(window: int) -> int:
        """Return *window* clamped to at least 1 so averages never divide by zero."""
        return max(1, window)

    def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]:
        """Update the tracker with one frame's detections.

        Args:
            faces: Face dicts. Only the ``"center"`` and ``"mouth_openness"``
                keys are read; the first two items of ``"center"`` are used as
                x and y.

        Returns:
            ``((x, y), is_panning)`` where x and y are the smoothed center
            truncated to ``int``.
        """
        with_center = [f for f in faces if f.get("center")]
        with_mouth = [f for f in with_center if f.get("mouth_openness") is not None]

        # Speaker tracking
        if with_mouth:
            self._update_speaker(with_mouth)
        else:
            self._retain_or_random_center(with_center)

        # Panning detection must run before smoothing: it compares raw centers.
        is_panning = self._detect_panning()

        # Smooth / moving average
        center = self._smooth_center()
        return (int(center[0]), int(center[1])), is_panning

    def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None:
        """Move the raw center towards the face with the widest open mouth.

        Nearby detections are followed immediately; distant ones must stay
        stable for ``switch_frames`` frames before they are accepted.
        """
        best = max(valid_faces, key=lambda face: face["mouth_openness"])
        cx, cy, *_ = best["center"]
        new_center = (cx, cy)

        dist = math.hypot(new_center[0] - self.raw_center[0], new_center[1] - self.raw_center[1])
        if dist < self.dist_threshold:
            # Same speaker moved slightly: follow and drop any pending switch.
            self.raw_center = new_center
            self.candidate_center = None
            self.switch_counter = 0
        else:
            candidate = self.candidate_center
            if (
                candidate is None
                or math.hypot(new_center[0] - candidate[0], new_center[1] - candidate[1])
                > self.dist_threshold
            ):
                # New (or different) distant position: restart the count.
                candidate = new_center
                self.switch_counter = 1
            else:
                self.switch_counter += 1
            self.candidate_center = candidate

            if self.switch_counter >= self.switch_frames:
                self.raw_center = candidate
                self.candidate_center = None
                self.switch_counter = 0

        # A real speaker was seen, so the random fallback is no longer needed.
        self.random_center = None
        self.last_speaker_set = True

    def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None:
        """Choose the raw center for a frame without any mouth data.

        Keeps the last speaker position if there ever was one; otherwise
        sticks to a previously chosen random face, or picks one at random
        from ``all_faces``. With no faces at all the raw center is unchanged.
        """
        if self.last_speaker_set:
            # keep previous raw_center
            return
        if self.random_center is not None:
            self.raw_center = self.random_center
            return
        if all_faces:
            face = random.choice(all_faces)
            cx, cy, *_ = face["center"]
            self.random_center = (cx, cy)
            self.raw_center = self.random_center

    def _detect_panning(self) -> bool:
        """Return True if the mean |dx| of the raw center exceeds the threshold."""
        dx = self.raw_center[0] - self.prev_raw[0]
        self.panning_buffer.append(dx)
        if len(self.panning_buffer) > self._effective_window(self.panning_window):
            self.panning_buffer.pop(0)

        # The buffer always holds at least the value appended above.
        avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer)
        is_panning = avg_dx > self.panning_threshold
        self.prev_raw = self.raw_center
        return is_panning

    def _smooth_center(self) -> Tuple[float, float]:
        """Return the moving-average center, or the raw center on a sudden jump."""
        jump_distance = math.hypot(
            self.raw_center[0] - self.prev_center[0],
            self.raw_center[1] - self.prev_center[1],
        )
        sudden_jump = jump_distance > self.scene_jump_threshold

        if not sudden_jump:
            self.smooth_buffer.append(self.raw_center)
            if len(self.smooth_buffer) > self._effective_window(self.smooth_window):
                self.smooth_buffer.pop(0)
            count = len(self.smooth_buffer)
            avg_x = sum(p[0] for p in self.smooth_buffer) / count
            avg_y = sum(p[1] for p in self.smooth_buffer) / count
            center = (avg_x, avg_y)
        else:
            # Jump straight to the new position and restart the average so the
            # old positions do not drag the crop back.
            center = self.raw_center
            self.smooth_buffer.clear()

        self.prev_center = center
        return center
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the crop-center tracker.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        Namespace with ``input_dir``, ``output_dir``, ``dist_threshold``,
        ``switch_frames``, ``panning_window``, ``panning_threshold``,
        ``smooth_window`` and ``scene_jump_threshold``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, including a
            window size below 1.
    """

    def positive_int(text: str) -> int:
        # Window sizes are used as divisors for averages further down the
        # pipeline, so reject 0 and negative values up front.
        try:
            value = int(text)
        except ValueError:
            raise argparse.ArgumentTypeError(f"invalid int value: {text!r}")
        if value < 1:
            raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
        return value

    # Default data directories are resolved relative to this script's location.
    script_dir = Path(__file__).resolve().parent
    project_root = script_dir.parents[1]
    default_input = project_root / "data" / "face_data_combined"
    default_output = project_root / "data" / "face_crop_centers"

    parser = argparse.ArgumentParser(
        description="Track and smooth face crop centers based on mouth openness."
    )
    parser.add_argument(
        "-i", "--input-dir", type=Path,
        default=default_input,
        help=f"Directory containing *_faces.json files (default: {default_input})"
    )
    parser.add_argument(
        "-o", "--output-dir", type=Path,
        default=default_output,
        help=f"Directory to save *_centers.json files (default: {default_output})"
    )
    parser.add_argument(
        "--dist-threshold", type=float, default=30.0,
        help="Pixel distance threshold to switch speaker"
    )
    parser.add_argument(
        "--switch-frames", type=int, default=20,
        help="Number of consecutive frames required to confirm speaker switch"
    )
    parser.add_argument(
        "--panning-window", type=positive_int, default=30,
        help="Frame window size for panning detection"
    )
    parser.add_argument(
        "--panning-threshold", type=float, default=3.0,
        help="Average dx threshold for panning detection"
    )
    parser.add_argument(
        "--smooth-window", type=positive_int, default=8,
        help="Moving average window for smoothing"
    )
    parser.add_argument(
        "--scene-jump-threshold", type=float, default=300.0,
        help="Jump threshold to detect scene cuts"
    )
    return parser.parse_args(argv)
def setup_logging() -> None:
    """Configure root logging at INFO level with a timestamped line format."""
    line_format = "%(asctime)s %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=line_format)
def main() -> None:
    """Convert every ``*_faces.json`` in the input dir into a ``*_centers.json``.

    Each input file is expected to hold a list of frame objects with an
    optional ``"faces"`` list. For every frame one output entry with the
    frame index, the smoothed crop center and the panning flag is written.
    Files that cannot be decoded as JSON are logged and skipped.
    """
    setup_logging()
    args = parse_args()

    input_dir: Path = args.input_dir.resolve()
    output_dir: Path = args.output_dir.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    json_files = sorted(input_dir.glob("*_faces.json"))
    if not json_files:
        logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir)
        return
    logging.info("Gefundene Dateien: %d", len(json_files))

    input_suffix = "_faces"
    for json_path in json_files:
        logging.info("Verarbeite %s", json_path.name)
        try:
            frames_data = json.loads(json_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logging.error("JSON-Fehler in %s: %s", json_path.name, e)
            continue

        # A fresh tracker per file: speaker state, smoothing and panning
        # buffers of one video must not influence the first frames of the next.
        tracker = FaceTracker(
            dist_threshold=args.dist_threshold,
            switch_frames=args.switch_frames,
            panning_window=args.panning_window,
            panning_threshold=args.panning_threshold,
            smooth_window=args.smooth_window,
            scene_jump_threshold=args.scene_jump_threshold,
        )

        out_data: List[Dict[str, Any]] = []
        for frame_idx, frame in enumerate(frames_data):
            # "faces" may be missing or explicitly null; both mean "no faces".
            faces = frame.get("faces") or []
            center, is_panning = tracker.process_frame(faces)
            out_data.append({
                "frame": frame_idx,
                "center": [center[0], center[1]],
                "panning": is_panning,
            })

        # Strip only the trailing "_faces" so names that contain the same
        # text elsewhere keep it.
        stem = json_path.stem
        base_name = stem[:-len(input_suffix)] if stem.endswith(input_suffix) else stem
        out_path = output_dir / f"{base_name}_centers.json"
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(out_data, f, indent=2)
        logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data))
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()