# bachlorarbeit/segment_transcript.py
#
# Splits a timed transcript into ~5-minute blocks, asks GPT-4o for
# self-contained clip candidates, and stores the validated clips in a
# SQLite database.

import json
import os
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
from datetime import datetime
import time
import nltk

# NOTE(review): nltk/punkt appears unused in this file — confirm before removing.
# quiet=True suppresses the download progress output on every run.
nltk.download("punkt", quiet=True)

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300   # target block length (seconds) sent per GPT request
MIN_CLIP_LEN = 5       # accepted clip duration bounds (seconds)
MAX_CLIP_LEN = 90

# SECURITY: never hard-code API keys in source. The previously committed key
# is leaked and must be rotated; the key is now read from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
# === HELPER FUNCTIONS ===
def log_text(filename, content):
    """Write stripped content to a UTF-8 file under LOG_DIR (overwrites)."""
    target = LOG_DIR / filename
    target.write_text(content.strip(), encoding="utf-8")
def append_error_log(content):
    """Append one blank-line-separated error entry to logs/errors.txt."""
    error_file = LOG_DIR / "errors.txt"
    with error_file.open("a", encoding="utf-8") as handle:
        handle.write(content + "\n\n")
def extract_json(text):
    """Pull the first bracketed JSON array out of a model response.

    Returns the parsed list, or [] when no array is present or parsing
    fails (parse failures are appended to the error log).
    """
    candidate = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if candidate is None:
        return []
    try:
        return json.loads(candidate.group())
    except Exception as exc:
        append_error_log(f"❌ JSON-Fehler: {exc}\n{text}")
        return []
def get_original_text(clip, segments, debug=False):
    """Join the text of every transcript segment overlapping the clip window."""
    overlapping = [
        s for s in segments
        if s["start"] <= clip["end"] and s["end"] >= clip["start"]
    ]
    if debug:
        listing = "\n".join(
            f"[{s['start']:.2f} {s['end']:.2f}] {s['text']}" for s in overlapping
        )
        print(f"\n🟢 Clip {clip['start']}{clip['end']} nutzt Segmente:\n" + listing)
    return " ".join(s["text"] for s in overlapping).strip()
# === READ TRANSCRIPT ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for raw_line in lines:
    m = re.match(r"\[(\d+(?:\.\d+)?)\s*[-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", raw_line)
    if not m:
        continue
    seg_start, seg_end, seg_text = m.groups()
    seg_start, seg_end = float(seg_start), float(seg_end)
    # drop segments shorter than two seconds
    if seg_end - seg_start < 2.0:
        continue
    segments.append({"start": seg_start, "end": seg_end, "text": seg_text.strip()})
if not segments:
    raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
print(f"{len(segments)} gültige Transkriptsegmente geladen.")
# === GROUP SEGMENTS INTO BLOCKS ===
blocks = []
pending = []
window_start = 0.0
for seg in segments:
    # segment would overflow the current window: close it, start a new one
    if seg["end"] - window_start > BLOCK_DURATION:
        blocks.append(pending)
        pending = []
        window_start = seg["start"]
    pending.append(seg)
if pending:
    blocks.append(pending)
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
# === AI: CLIP SELECTION ===
# Send each transcript block to GPT-4o and collect clip suggestions.
all_clips = []
start_time = time.perf_counter()
for i, block in enumerate(blocks):
    if not block:
        continue
    print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")
    # Re-serialize the block with timestamps so the model can only pick
    # from existing start/end times.
    block_text = "\n".join([f"[{s['start']} {s['end']}] {s['text']}" for s in block])
    # NOTE(review): "13" below looks like a mangled "1-3" (lost dash in a
    # Unicode-stripped copy) — confirm the intended clip count per block.
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 13 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.
Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**
Nutze ausschließlich die vorhandenen Start- und Endzeiten keine neuen erfinden.
Gib ein valides JSON-Array zurück im Format:
[
{{
"start": float,
"end": float,
"summary": "Kurze Beschreibung des Inhalts"
}}
]
TRANSKRIPT:
{block_text}
"""
    # Persist the exact prompt for post-hoc debugging.
    log_text(f"block_prompt_{i+1}.txt", prompt)
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)
        print(f"{len(clips)} Clips empfangen in Block {i+1}")
        # Keep only clips within the configured duration bounds; clips
        # outside the bounds are dropped silently (only malformed entries
        # reach the error log).
        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
        print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")
        # Estimate remaining time from the average per-block duration so far.
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
    except Exception as e:
        # API/transport failures: log and continue with the next block.
        append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
        print(f"❌ Fehler bei Block {i+1}: {e}")
# === SAVE TO DB ===
# Recreates the `segments` table and inserts every accepted clip with the
# transcript text reconstructed from the overlapping segments.
conn = sqlite3.connect(DB_PATH)
try:
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS segments")
    cur.execute("""
    CREATE TABLE segments (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    file TEXT,
    start REAL,
    end REAL,
    duration REAL,
    text TEXT,
    summary TEXT
    )
    """)
    inserted = 0
    failed = 0
    for clip in all_clips:
        try:
            start = float(clip["start"])
            end = float(clip["end"])
            duration = float(clip["duration"])
            summary = clip.get("summary", "")
            # Validate times BEFORE scanning segments so invalid clips
            # don't pay for the text reconstruction.
            if end <= start or start < 0:
                raise ValueError("Ungültige Zeiten")
            # debug=True prints all segment texts per clip
            original_text = get_original_text(clip, segments, debug=False)
            cur.execute(
                "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
                (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
            )
            inserted += 1
        except Exception as e:
            failed += 1
            append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
    conn.commit()
finally:
    # Close the connection even if an unexpected error escapes the loop
    # (the original leaked the handle on failure).
    conn.close()
print("\n📊 Ergebnisse:")
print(f" ✅ Clips gespeichert: {inserted}")
print(f" ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")