#!/usr/bin/env python3
# transcription_chunked_words.py — Whisper with word timestamps, duplicate-safe

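# A minimal example invocation (illustrative; the input path is an assumption,
# any file under INPUT_DIR works):
#
#   python src/text/transcription_chunked_words.py \
#       --input input/talk.mp4 --model small --lang de --chunk 60 --overlap 2
#
# Results land in TRANSCRIPTS_DIR as <stem>_timed.txt,
# <stem>_suspect_lines.txt and <stem>_segments.json.
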
import os
import sys
import json
import argparse
from pathlib import Path
from tempfile import TemporaryDirectory
from typing import List, Dict, Tuple, Optional

import ffmpeg
import whisper

# ── Add the project root to sys.path (this script lives e.g. under src/text/)
ROOT = Path(__file__).resolve().parents[2]
sys.path.insert(0, str(ROOT))

from config import INPUT_DIR, TRANSCRIPTS_DIR  # central paths

# ──────────────────────────────────────────────────────────────────────────────
# Utilities
# ──────────────────────────────────────────────────────────────────────────────

def probe_duration(path: Path) -> float:
    """Determine the media duration in seconds (via ffmpeg.probe)."""
    try:
        meta = ffmpeg.probe(str(path))
    except ffmpeg.Error as e:
        stderr = e.stderr.decode("utf-8", "ignore") if getattr(e, "stderr", None) else e
        raise RuntimeError(f"ffmpeg.probe failed for {path}: {stderr}") from e

    dur = meta.get("format", {}).get("duration")
    if dur is not None:
        return float(dur)

    cand = 0.0
    for s in meta.get("streams", []) or []:
        d = s.get("duration")
        if d:
            cand = max(cand, float(d))
    if cand > 0:
        return cand
    raise RuntimeError(f"Could not determine media duration: {path}")

def make_chunks(total: float, chunk_seconds: float, overlap: float) -> List[Tuple[float, float]]:
    """Split [0, total] into overlapping intervals."""
    if chunk_seconds <= 0:
        return [(0.0, total)]
    if overlap >= chunk_seconds:
        raise ValueError("overlap must be smaller than chunk_seconds (otherwise the loop cannot advance)")
    s, out = 0.0, []
    while s < total:
        e = min(s + chunk_seconds, total)
        out.append((s, e))
        if e >= total:
            break
        s = max(0.0, e - overlap)
    return out

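# Worked example: make_chunks(130.0, 60.0, 2.0) yields
# [(0.0, 60.0), (58.0, 118.0), (116.0, 130.0)]; each chunk restarts
# `overlap` seconds before the previous one ends.
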
def extract_audio_segment(src_video: Path, start: float, end: float, out_wav: Path) -> None:
    """Extract [start, end] as mono 16 kHz WAV."""
    (
        ffmpeg
        .input(str(src_video), ss=start, to=end)
        .output(
            str(out_wav),
            format="wav",
            acodec="pcm_s16le",
            ac=1,
            ar="16000",
            loglevel="error",
        )
        .overwrite_output()
        .run()
    )

def is_suspect(text: str) -> bool:
    """Heuristic: flag empty, looping, or otherwise dubious lines."""
    t = (text or "").strip().lower()
    if not t:
        return True
    words = t.split()
    if not words:
        return True
    counts = {w: words.count(w) for w in set(words)}
    most_common = max(counts.values())
    return most_common / len(words) > 0.6 or most_common > 20

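# e.g. is_suspect("uh uh uh uh uh") is True (one token makes up more than
# 60% of the line), while an ordinary sentence is False.
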
def merge_overlaps_keep_best(
    segments: List[Dict],
    max_gap: float = 0.15,
    min_dur: float = 0.30
) -> List[Dict]:
    """
    Sort by time, close small gaps. On overlap:
    - no text concatenation
    - keep the "better" segment (longer duration, then longer text)
    - words: taken from the "best" segment (if present)
    """
    cleaned = []
    for s in segments:
        s0 = float(s["start"]); s1 = float(s["end"])
        txt = (s.get("text") or "").strip()
        if s1 - s0 >= min_dur and txt:
            cleaned.append({
                "start": s0, "end": s1,
                "text": txt,
                "words": s.get("words", [])
            })
    if not cleaned:
        return []

    cleaned.sort(key=lambda x: (x["start"], x["end"]))
    out = [cleaned[0]]

    def score(x: Dict) -> tuple:
        return (x["end"] - x["start"], len(x.get("text", "")))

    for s in cleaned[1:]:
        m = out[-1]
        if s["start"] <= m["end"] + max_gap:
            best = s if score(s) > score(m) else m
            out[-1] = {
                "start": min(m["start"], s["start"]),
                "end": max(m["end"], s["end"]),
                "text": best["text"],
                "words": best.get("words", []),
            }
        else:
            out.append(s)
    return out

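# Worked example: the overlapping candidates
#   {"start": 10.0, "end": 14.0, "text": "hello world out there"}
#   {"start": 13.5, "end": 15.0, "text": "out there"}
# merge into a single 10.0-15.0 segment that keeps the first text, since
# score() prefers the longer duration (4.0 s vs. 1.5 s).
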
def write_outputs(base: Path, segments: List[Dict], out_dir: Path, ascii_dash: bool = True):
    """Write _timed.txt, _suspect_lines.txt, and _segments.json."""
    out_dir.mkdir(parents=True, exist_ok=True)
    dash = "-" if ascii_dash else "–"

    out_txt = out_dir / f"{base.stem}_timed.txt"
    out_sus = out_dir / f"{base.stem}_suspect_lines.txt"
    out_json = out_dir / f"{base.stem}_segments.json"

    # TXT is for human inspection only
    with open(out_txt, "w", encoding="utf-8") as f_txt, open(out_sus, "w", encoding="utf-8") as f_sus:
        for s in segments:
            line = f"[{s['start']:.2f} {dash} {s['end']:.2f}] {s['text']}\n"
            f_txt.write(line)
            if is_suspect(s["text"]):
                f_sus.write(line)

    # JSON for downstream processing (including words)
    with open(out_json, "w", encoding="utf-8") as f_json:
        json.dump(segments, f_json, ensure_ascii=False, indent=2)

    return out_txt, out_sus, out_json

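# The resulting _segments.json entries look like (illustrative values):
#   {"start": 12.34, "end": 15.01, "text": "…",
#    "words": [{"start": 12.34, "end": 12.71, "text": "…"}, …]}
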
def find_default_input() -> Optional[Path]:
    """Take the first video/audio file from INPUT_DIR if no --input was given."""
    exts = (".mp4", ".mov", ".mkv", ".m4v", ".wav", ".mp3")
    for p in sorted(INPUT_DIR.iterdir()):
        if p.suffix.lower() in exts:
            return p
    return None

# ──────────────────────────────────────────────────────────────────────────────
# CLI
# ──────────────────────────────────────────────────────────────────────────────

def parse_args():
    p = argparse.ArgumentParser(
        description="Chunked Whisper transcription with word timestamps & duplicate-safe stitching."
    )
    p.add_argument("--input", type=Path, default=None, help=f"Input video/audio. Default: first file in {INPUT_DIR}")
    p.add_argument("--outdir", type=Path, default=None, help=f"Output directory. Default: {TRANSCRIPTS_DIR}")
    p.add_argument("--model", type=str, default=os.getenv("WHISPER_MODEL", "small"), help="Whisper model (tiny/base/small/medium/large)")
    p.add_argument("--lang", type=str, default=os.getenv("LANGUAGE", "none"), help="Language code (e.g. 'de'), or empty/None for auto-detect")
    p.add_argument("--chunk", type=float, default=60.0, help="Chunk length in seconds (0 = whole file)")
    p.add_argument("--overlap", type=float, default=2.0, help="Overlap in seconds")
    p.add_argument("--min-dur", type=float, default=0.30, help="Minimum segment duration (seconds)")
    p.add_argument("--max-gap", type=float, default=0.15, help="Maximum time gap for merging (seconds)")
    p.add_argument("--fp16", action="store_true", help="Enable fp16 (only useful with a GPU)")
    return p.parse_args()

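# Note: defaults can also be supplied via the environment (WHISPER_MODEL,
# LANGUAGE). Beware that LANGUAGE is also a standard locale variable on
# Linux, so an inherited value like "en_US:en" would be passed to Whisper
# verbatim unless overridden with --lang.
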
# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────

def main():
    # Whisper cache (so models are stored locally in the project)
    os.environ.setdefault("XDG_CACHE_HOME", str(ROOT / "whisper-cache"))

    args = parse_args()
    input_path = args.input or find_default_input()
    out_dir = args.outdir or TRANSCRIPTS_DIR

print("📁 Projekt-Root:", ROOT)
|
||
print("📄 Input:", input_path if input_path else "—")
|
||
if not input_path or not input_path.exists():
|
||
raise FileNotFoundError(f"Kein gültiges Eingabefile gefunden. Lege ein Video/Audio in {INPUT_DIR} oder nutze --input.")
|
||
|
||
out_dir.mkdir(parents=True, exist_ok=True)
|
||
|
||
duration = probe_duration(input_path)
|
||
print(f"🎬 Dauer: {duration:.2f}s")
|
||
|
||
    chunks = make_chunks(duration, args.chunk, args.overlap)
    print(f"🔪 {len(chunks)} chunks of {args.chunk:.1f}s with {args.overlap:.1f}s overlap")

    # Load Whisper
    print(f"🧠 Loading Whisper model: {args.model}")
    try:
        model = whisper.load_model(args.model)
    except Exception as e:
        raise RuntimeError(f"Could not load Whisper model '{args.model}'. Is it installed? (pip install openai-whisper)\n{e}") from e

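    # Note: if --fp16 is set on a CPU-only machine, Whisper itself warns
    # ("FP16 is not supported on CPU; using FP32 instead") and falls back.
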
    all_segments: List[Dict] = []
    with TemporaryDirectory() as tmpdir_str:
        tmpdir = Path(tmpdir_str)
        for i, (start, end) in enumerate(chunks, 1):
            print(f"🔉 Chunk {i}/{len(chunks)}: {start:.2f}s - {end:.2f}s")
            wav = tmpdir / f"chunk_{i:03d}.wav"
            extract_audio_segment(input_path, start, end, wav)

            # Language: ''/none = auto-detect
            lang = None if str(args.lang).strip().lower() in {"", "none", "null"} else args.lang

            # Transcribe with word timestamps and without cross-chunk context
            result = model.transcribe(
                str(wav),
                language=lang,
                fp16=args.fp16,
                word_timestamps=True,
                condition_on_previous_text=False,
                temperature=0,
                verbose=False,
            )

            # Center cut: keep only the middle part (prevents duplicates at chunk borders)
            keep_start = start if i == 1 else start + args.overlap / 2.0
            keep_end = end if i == len(chunks) else end - args.overlap / 2.0
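            # Worked example (with --chunk 60 --overlap 2): chunk 2 spans
            # 58-118 s, but only segments whose midpoint falls in [59, 117)
            # survive; chunks 1 and 3 contribute the border regions instead.
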
            for seg in result.get("segments", []) or []:
                s0 = float(seg["start"]) + start
                s1 = float(seg["end"]) + start
                mid = (s0 + s1) / 2.0
                if not (keep_start <= mid < keep_end):
                    continue

                # Carry over words with absolute timestamps
                words = []
                for w in (seg.get("words") or []):
                    txt = (w.get("word") or w.get("text") or "").strip()
                    if not txt:
                        continue
                    words.append({
                        "start": float(w["start"]) + start,
                        "end": float(w["end"]) + start,
                        "text": txt
                    })

                all_segments.append({
                    "start": s0,
                    "end": s1,
                    "text": (seg.get("text") or "").strip(),
                    "words": words
                })

print(f"🧹 Roh-Segmente: {len(all_segments)} → merge & filter …")
|
||
merged = merge_overlaps_keep_best(all_segments, max_gap=args.max_gap, min_dur=args.min_dur)
|
||
print(f"✅ Gemergte Segmente: {len(merged)}")
|
||
|
||
out_txt, out_sus, out_json = write_outputs(input_path, merged, out_dir, ascii_dash=True)
|
||
print(f"📝 TXT: {out_txt}")
|
||
print(f"⚠️ SUSPECT: {out_sus}")
|
||
print(f"💾 JSON: {out_json}")
|
||
print("🎉 Fertig.")
|
||
|
||
if __name__ == "__main__":
    main()