236 lines
8.1 KiB
Python
236 lines
8.1 KiB
Python
import argparse
|
|
import json
|
|
import logging
|
|
import math
|
|
import random
|
|
from pathlib import Path
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
|
|
class FaceTracker:
    """Follow the most likely speaker's face centre and smooth it over time.

    For every frame the tracker

    1. picks the face with the largest ``mouth_openness`` and decides whether
       to follow it immediately (it is close to the current raw centre) or
       only after it has persisted for ``switch_frames`` frames,
    2. flags panning from the mean absolute horizontal motion of the raw
       centre over the last ``panning_window`` frames, and
    3. returns a moving average of the raw centre, bypassing the average
       when the raw centre is far from the previously returned one.

    Args:
        dist_threshold: Distance (same units as the face centres) below
            which a detection counts as the face already being followed.
        switch_frames: Number of consecutive frames a distant candidate must
            persist before the tracker switches to it.
        panning_window: Number of recent horizontal deltas averaged for
            panning detection. Values below 1 are clamped to 1.
        panning_threshold: Mean absolute horizontal delta above which a
            frame is reported as panning.
        smooth_window: Number of recent raw centres averaged for the
            returned centre. Values below 1 are clamped to 1 (no smoothing).
        scene_jump_threshold: Distance between the raw centre and the
            previously returned centre above which smoothing is skipped and
            the smoothing history is discarded.
        initial_center: Centre reported until a face has been seen. The
            default ``(960.0, 540.0)`` is the value that used to be
            hard-coded (presumably the middle of a 1920x1080 frame).
    """

    def __init__(
        self,
        dist_threshold: float,
        switch_frames: int,
        panning_window: int,
        panning_threshold: float,
        smooth_window: int,
        scene_jump_threshold: float,
        initial_center: Tuple[float, float] = (960.0, 540.0),
    ):
        self.dist_threshold = dist_threshold
        self.switch_frames = switch_frames
        # A window below 1 would leave the buffers empty after trimming and
        # make the averages divide by zero, so treat it as "no averaging".
        self.panning_window = max(1, panning_window)
        self.panning_threshold = panning_threshold
        self.smooth_window = max(1, smooth_window)
        self.scene_jump_threshold = scene_jump_threshold

        start: Tuple[float, float] = (
            float(initial_center[0]),
            float(initial_center[1]),
        )
        # ``current_center`` is only ever set here; the tracker itself works
        # with ``raw_center`` / ``prev_center``. Kept for existing readers.
        self.current_center: Tuple[float, float] = start
        self.raw_center: Tuple[float, float] = start
        self.prev_center: Tuple[float, float] = start
        self.prev_raw: Tuple[float, float] = start
        self.candidate_center: Optional[Tuple[float, float]] = None
        self.switch_counter: int = 0
        self.last_speaker_set: bool = False
        self.random_center: Optional[Tuple[float, float]] = None

        self.panning_buffer: List[float] = []
        self.smooth_buffer: List[Tuple[float, float]] = []

    @staticmethod
    def _distance(a: Tuple[float, float], b: Tuple[float, float]) -> float:
        """Return the Euclidean distance between two 2-D points."""
        return math.hypot(a[0] - b[0], a[1] - b[1])

    def process_frame(self, faces: List[Dict[str, Any]]) -> Tuple[Tuple[int, int], bool]:
        """Advance the tracker by one frame.

        Args:
            faces: Face records for the frame. Records without a truthy
                ``"center"`` are ignored; records whose ``"mouth_openness"``
                is ``None`` or missing are not considered as speakers.

        Returns:
            ``((x, y), is_panning)`` where ``x`` and ``y`` are the smoothed
            centre truncated to ``int``.
        """
        valid_faces = [f for f in faces if f.get("center") and f.get("mouth_openness") is not None]
        all_faces = [f for f in faces if f.get("center")]

        # Speaker tracking
        if valid_faces:
            self._update_speaker(valid_faces)
        else:
            self._retain_or_random_center(all_faces)

        # Panning detection
        is_panning = self._detect_panning()

        # Smooth / moving average
        center = self._smooth_center()

        return (int(center[0]), int(center[1])), is_panning

    def _update_speaker(self, valid_faces: List[Dict[str, Any]]) -> None:
        """Update ``raw_center`` from the face with the widest open mouth."""
        best = max(valid_faces, key=lambda x: x["mouth_openness"])
        cx, cy, *_ = best["center"]
        new_center = (cx, cy)

        if self._distance(new_center, self.raw_center) < self.dist_threshold:
            # Same face as before: follow it and drop any pending switch.
            self.raw_center = new_center
            self.candidate_center = None
            self.switch_counter = 0
        else:
            if (
                self.candidate_center is None
                or self._distance(new_center, self.candidate_center) > self.dist_threshold
            ):
                # A new (or different) far-away candidate restarts the count.
                self.candidate_center = new_center
                self.switch_counter = 1
            else:
                self.switch_counter += 1

            if self.switch_counter >= self.switch_frames:
                # The candidate persisted long enough: switch to it.
                self.raw_center = self.candidate_center  # type: ignore
                self.candidate_center = None
                self.switch_counter = 0

        self.random_center = None
        self.last_speaker_set = True

    def _retain_or_random_center(self, all_faces: List[Dict[str, Any]]) -> None:
        """Choose a centre for a frame that has no usable speaker.

        Once any speaker has been tracked the last raw centre is kept.
        Before that, a randomly chosen visible face is picked once and
        reused on subsequent calls.
        """
        if self.last_speaker_set:
            # keep previous raw_center
            pass
        elif self.random_center is not None:
            self.raw_center = self.random_center
        elif all_faces:
            f = random.choice(all_faces)
            cx, cy, *_ = f["center"]
            self.random_center = (cx, cy)
            self.raw_center = self.random_center

    def _detect_panning(self) -> bool:
        """Return True when the recent mean absolute x-motion is too large."""
        dx = self.raw_center[0] - self.prev_raw[0]
        self.panning_buffer.append(dx)
        if len(self.panning_buffer) > self.panning_window:
            self.panning_buffer.pop(0)
        avg_dx = sum(abs(d) for d in self.panning_buffer) / len(self.panning_buffer)
        is_panning = avg_dx > self.panning_threshold
        self.prev_raw = self.raw_center
        return is_panning

    def _smooth_center(self) -> Tuple[float, float]:
        """Return the moving-average centre, snapping on a sudden jump."""
        sudden_jump = (
            self._distance(self.raw_center, self.prev_center) > self.scene_jump_threshold
        )
        if not sudden_jump:
            self.smooth_buffer.append(self.raw_center)
            if len(self.smooth_buffer) > self.smooth_window:
                self.smooth_buffer.pop(0)
            avg_x = sum(p[0] for p in self.smooth_buffer) / len(self.smooth_buffer)
            avg_y = sum(p[1] for p in self.smooth_buffer) / len(self.smooth_buffer)
            center = (avg_x, avg_y)
        else:
            # Jump straight to the new position and forget the old history
            # so it does not drag the average back.
            center = self.raw_center
            self.smooth_buffer.clear()

        self.prev_center = center
        return center
|
|
|
|
|
|
def parse_args(argv: Optional[List[str]] = None) -> argparse.Namespace:
    """Parse the command-line options of the face-centre tracking script.

    Default input and output directories are ``data/face_data_combined``
    and ``data/face_crop_centers`` under the directory two levels above the
    one containing this script.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the previous behaviour.

    Returns:
        The parsed namespace with ``input_dir``, ``output_dir``,
        ``dist_threshold``, ``switch_frames``, ``panning_window``,
        ``panning_threshold``, ``smooth_window`` and
        ``scene_jump_threshold``.
    """
    script_dir = Path(__file__).resolve().parent
    ancestors = script_dir.parents
    # A script sitting too close to the filesystem root has no second
    # ancestor; fall back to its own directory instead of raising
    # IndexError while merely computing defaults.
    project_root = ancestors[1] if len(ancestors) > 1 else script_dir
    default_input = project_root / "data" / "face_data_combined"
    default_output = project_root / "data" / "face_crop_centers"

    parser = argparse.ArgumentParser(
        description="Track and smooth face crop centers based on mouth openness."
    )
    # ``%(default)s`` lets argparse insert the default itself; formatting the
    # path into the help string would break on paths that contain ``%``.
    parser.add_argument(
        "-i", "--input-dir", type=Path,
        default=default_input,
        help="Directory containing *_faces.json files (default: %(default)s)",
    )
    parser.add_argument(
        "-o", "--output-dir", type=Path,
        default=default_output,
        help="Directory to save *_centers.json files (default: %(default)s)",
    )
    parser.add_argument(
        "--dist-threshold", type=float, default=30.0,
        help="Pixel distance threshold to switch speaker",
    )
    parser.add_argument(
        "--switch-frames", type=int, default=20,
        help="Number of consecutive frames required to confirm speaker switch",
    )
    parser.add_argument(
        "--panning-window", type=int, default=30,
        help="Frame window size for panning detection",
    )
    parser.add_argument(
        "--panning-threshold", type=float, default=3.0,
        help="Average dx threshold for panning detection",
    )
    parser.add_argument(
        "--smooth-window", type=int, default=8,
        help="Moving average window for smoothing",
    )
    parser.add_argument(
        "--scene-jump-threshold", type=float, default=300.0,
        help="Jump threshold to detect scene cuts",
    )
    return parser.parse_args(argv)
|
|
|
|
|
|
def setup_logging() -> None:
    """Configure root logging at INFO with ``time LEVEL: message`` lines."""
    log_format = "%(asctime)s %(levelname)s: %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
|
|
|
|
|
|
def main() -> None:
    """Convert every ``*_faces.json`` file into a ``*_centers.json`` file.

    Each input file is read as a sequence of frame records; the ``"faces"``
    entry of every frame is fed to a ``FaceTracker`` and the resulting
    centre and panning flag are written, one record per frame, to the
    output directory. Files that cannot be decoded or parsed are logged and
    skipped.
    """
    setup_logging()
    args = parse_args()

    input_dir: Path = args.input_dir.resolve()
    output_dir: Path = args.output_dir.resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    json_files = sorted(input_dir.glob("*_faces.json"))
    if not json_files:
        logging.error("Keine *_faces.json-Dateien gefunden in %s", input_dir)
        return

    logging.info("Gefundene Dateien: %d", len(json_files))

    suffix = "_faces"
    for json_path in json_files:
        logging.info("Verarbeite %s", json_path.name)
        try:
            # JSON is UTF-8 by specification; do not rely on the locale.
            frames_data = json.loads(json_path.read_text(encoding="utf-8"))
        except (json.JSONDecodeError, UnicodeDecodeError) as e:
            logging.error("JSON-Fehler in %s: %s", json_path.name, e)
            continue

        # A fresh tracker per file: smoothing, panning and speaker state of
        # one file must not influence the first frames of the next one.
        tracker = FaceTracker(
            dist_threshold=args.dist_threshold,
            switch_frames=args.switch_frames,
            panning_window=args.panning_window,
            panning_threshold=args.panning_threshold,
            smooth_window=args.smooth_window,
            scene_jump_threshold=args.scene_jump_threshold,
        )

        out_data: List[Dict[str, Any]] = []
        for frame_idx, frame in enumerate(frames_data):
            # ``"faces": null`` is treated like a frame without faces.
            faces = frame.get("faces") or []
            center, is_panning = tracker.process_frame(faces)
            out_data.append({
                "frame": frame_idx,
                "center": [center[0], center[1]],
                "panning": is_panning,
            })

        # Strip only the trailing "_faces"; str.replace would also remove
        # the same text from the middle of a name.
        stem = json_path.stem
        base_name = stem[: -len(suffix)] if stem.endswith(suffix) else stem
        out_path = output_dir / f"{base_name}_centers.json"
        with out_path.open("w", encoding="utf-8") as f:
            json.dump(out_data, f, indent=2)
        logging.info("Gespeichert: %s (%d Frames)", out_path.name, len(out_data))
|
|
|
|
|
|
# Run the batch conversion only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|