# bachlorarbeit/segment_transcript.py
#
# Splits a timed transcript into ~5-minute blocks, asks GPT-4o for
# self-contained clip candidates, and stores the validated clips in a
# SQLite database.

import json
import os
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
from datetime import datetime
import time
import nltk

# NOTE(review): nltk/punkt appears unused in this file — confirm before removing.
# quiet=True suppresses the download progress output on every run.
nltk.download("punkt", quiet=True)

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300   # target block length (seconds) sent per GPT request
MIN_CLIP_LEN = 5       # accepted clip duration bounds (seconds)
MAX_CLIP_LEN = 90

# SECURITY: never hard-code API keys in source. The previously committed key
# is leaked and must be rotated; the key is now read from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# === HELPER FUNCTIONS ===
def log_text(filename, content):
    """Write stripped content to a UTF-8 file under LOG_DIR (overwrites)."""
    target = LOG_DIR / filename
    target.write_text(content.strip(), encoding="utf-8")
def append_error_log(content):
    """Append one blank-line-separated error entry to logs/errors.txt."""
    error_file = LOG_DIR / "errors.txt"
    with error_file.open("a", encoding="utf-8") as handle:
        handle.write(content + "\n\n")
def extract_json(text):
    """Pull the first bracketed JSON array out of a model response.

    Returns the parsed list, or [] when no array is present or parsing
    fails (parse failures are appended to the error log).
    """
    candidate = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if candidate is None:
        return []
    try:
        return json.loads(candidate.group())
    except Exception as exc:
        append_error_log(f"❌ JSON-Fehler: {exc}\n{text}")
        return []
def get_original_text(clip, segments, debug=False):
    """Join the text of every transcript segment overlapping the clip window."""
    overlapping = [
        s for s in segments
        if s["start"] <= clip["end"] and s["end"] >= clip["start"]
    ]
    if debug:
        listing = "\n".join(
            f"[{s['start']:.2f} {s['end']:.2f}] {s['text']}" for s in overlapping
        )
        print(f"\n🟢 Clip {clip['start']}{clip['end']} nutzt Segmente:\n" + listing)
    return " ".join(s["text"] for s in overlapping).strip()
# === READ TRANSCRIPT ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for raw_line in lines:
    m = re.match(r"\[(\d+(?:\.\d+)?)\s*[-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", raw_line)
    if not m:
        continue
    seg_start, seg_end, seg_text = m.groups()
    seg_start, seg_end = float(seg_start), float(seg_end)
    # drop segments shorter than two seconds
    if seg_end - seg_start < 2.0:
        continue
    segments.append({"start": seg_start, "end": seg_end, "text": seg_text.strip()})
if not segments:
    raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
print(f"{len(segments)} gültige Transkriptsegmente geladen.")
# === GROUP SEGMENTS INTO BLOCKS ===
blocks = []
pending = []
window_start = 0.0
for seg in segments:
    # segment would overflow the current window: close it, start a new one
    if seg["end"] - window_start > BLOCK_DURATION:
        blocks.append(pending)
        pending = []
        window_start = seg["start"]
    pending.append(seg)
if pending:
    blocks.append(pending)
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
# === AI: CLIP SELECTION ===
# Send each transcript block to GPT-4o and collect clip suggestions.
all_clips = []
start_time = time.perf_counter()
for i, block in enumerate(blocks):
    if not block:
        continue
    print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")
    # Re-serialize the block with timestamps so the model can only pick
    # from existing start/end times.
    block_text = "\n".join([f"[{s['start']} {s['end']}] {s['text']}" for s in block])
    # NOTE(review): "13" below looks like a mangled "1-3" (lost dash in a
    # Unicode-stripped copy) — confirm the intended clip count per block.
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 13 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.
Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**
Nutze ausschließlich die vorhandenen Start- und Endzeiten keine neuen erfinden.
Gib ein valides JSON-Array zurück im Format:
[
{{
"start": float,
"end": float,
"summary": "Kurze Beschreibung des Inhalts"
}}
]
TRANSKRIPT:
{block_text}
"""
    # Persist the exact prompt for post-hoc debugging.
    log_text(f"block_prompt_{i+1}.txt", prompt)
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)
        print(f"{len(clips)} Clips empfangen in Block {i+1}")
        # Keep only clips within the configured duration bounds; clips
        # outside the bounds are dropped silently (only malformed entries
        # reach the error log).
        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
        print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")
        # Estimate remaining time from the average per-block duration so far.
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
    except Exception as e:
        # API/transport failures: log and continue with the next block.
        append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
        print(f"❌ Fehler bei Block {i+1}: {e}")
# === SAVE TO DB ===
# Recreates the `segments` table and inserts every accepted clip with the
# transcript text reconstructed from the overlapping segments.
conn = sqlite3.connect(DB_PATH)
try:
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS segments")
    cur.execute("""
    CREATE TABLE segments (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file TEXT,
    start REAL,
    end REAL,
    duration REAL,
    text TEXT,
    summary TEXT
    )
    """)
    inserted = 0
    failed = 0
    for clip in all_clips:
        try:
            start = float(clip["start"])
            end = float(clip["end"])
            duration = float(clip["duration"])
            summary = clip.get("summary", "")
            # Validate times BEFORE scanning segments so invalid clips
            # don't pay for the text reconstruction.
            if end <= start or start < 0:
                raise ValueError("Ungültige Zeiten")
            # debug=True prints all segment texts per clip
            original_text = get_original_text(clip, segments, debug=False)
            cur.execute(
                "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
                (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
            )
            inserted += 1
        except Exception as e:
            failed += 1
            append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
    conn.commit()
finally:
    # Close the connection even if an unexpected error escapes the loop
    # (the original leaked the handle on failure).
    conn.close()
print("\n📊 Ergebnisse:")
print(f" ✅ Clips gespeichert: {inserted}")
print(f" ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")