"""Pick clip-worthy passages from a timed transcript with GPT-4o and store them in SQLite.

Pipeline (run as a script):
1. Parse ``[start – end] text`` lines from ``TRANSCRIPT_PATH`` into segments.
2. Group the segments into blocks spanning roughly ``BLOCK_DURATION`` seconds.
3. Send each block to the chat-completions API and parse the JSON clip list
   from the answer.
4. Keep clips whose duration lies in ``[MIN_CLIP_LEN, MAX_CLIP_LEN]`` and write
   them, together with the transcript text they cover, to ``DB_PATH``.

The OpenAI API key is read from the ``OPENAI_API_KEY`` environment variable;
it must never be committed to source control.
"""

import json
import os
import re
import sqlite3
import time
from datetime import datetime
from pathlib import Path

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
BLOCK_DURATION = 300  # seconds of transcript per request
MIN_CLIP_LEN = 5  # seconds
MAX_CLIP_LEN = 90  # seconds
MIN_SEGMENT_LEN = 2.0  # transcript segments shorter than this are ignored

# Matches "[12.5 – 17.0] some text"; accepts an en dash or a hyphen.
SEGMENT_RE = re.compile(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)")


# === HELPERS ===
def log_text(filename, content):
    """Write ``content`` (stripped) to ``LOG_DIR / filename``, replacing the file."""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")


def append_error_log(content):
    """Append ``content`` followed by a blank line to ``LOG_DIR / "errors.txt"``."""
    LOG_DIR.mkdir(parents=True, exist_ok=True)
    with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
        f.write(content + "\n\n")


def extract_json(text):
    """Return the JSON array embedded in ``text``.

    The span from the first ``[`` to the last ``]`` is parsed. Returns ``[]``
    when no such span exists or when it is not valid JSON (the latter is also
    recorded in the error log).
    """
    match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if match is None:
        return []
    try:
        return json.loads(match.group())
    except ValueError as e:  # json.JSONDecodeError is a ValueError
        append_error_log(f"❌ JSON-Fehler: {e}\n{text}")
        return []


def get_original_text(clip, segments, debug=False):
    """Return the space-joined text of all segments that overlap ``clip``.

    Args:
        clip: Mapping with ``"start"`` and ``"end"``; values may be numbers or
            numeric strings and are converted with ``float``.
        segments: Iterable of dicts with ``"start"``, ``"end"`` and ``"text"``.
        debug: When true, print the segments that were used.

    Raises:
        KeyError, TypeError, ValueError: if the clip times are missing or not
            convertible to float.
    """
    clip_start = float(clip["start"])
    clip_end = float(clip["end"])
    # Strict comparisons: a segment that only touches the clip boundary
    # (its end equals the clip start, or its start equals the clip end)
    # shares no time with the clip and must not contribute text.
    used = [s for s in segments if s["end"] > clip_start and s["start"] < clip_end]
    if debug:
        used_segments = [f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}" for s in used]
        print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" + "\n".join(used_segments))
    return " ".join(s["text"] for s in used).strip()


# === READ TRANSCRIPT ===
def parse_transcript(lines):
    """Parse ``[start – end] text`` lines into segment dicts.

    Lines that do not match the pattern and segments shorter than
    ``MIN_SEGMENT_LEN`` seconds are skipped.
    """
    segments = []
    for line in lines:
        match = SEGMENT_RE.match(line)
        if match is None:
            continue
        start = float(match.group(1))
        end = float(match.group(2))
        if end - start >= MIN_SEGMENT_LEN:
            segments.append({"start": start, "end": end, "text": match.group(3).strip()})
    return segments


# === BUILD BLOCKS ===
def build_blocks(segments, block_duration=BLOCK_DURATION):
    """Group consecutive segments into blocks of roughly ``block_duration`` seconds.

    A new block is started when a segment's end lies more than
    ``block_duration`` seconds after the current block's start. Empty blocks
    are never emitted, so the block count reflects real work.
    """
    blocks = []
    current_block = []
    current_start = 0.0
    for seg in segments:
        if seg["end"] - current_start > block_duration:
            # Guard: the very first segment may already exceed the window,
            # in which case there is nothing to flush yet.
            if current_block:
                blocks.append(current_block)
            current_block = []
            current_start = seg["start"]
        current_block.append(seg)
    if current_block:
        blocks.append(current_block)
    return blocks


# === AI: CLIP SELECTION ===
def build_prompt(block):
    """Return the user prompt for one transcript block."""
    block_text = "\n".join(f"[{s['start']} – {s['end']}] {s['text']}" for s in block)
    return f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.

Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**

Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.

Gib ein valides JSON-Array zurück im Format:
[
  {{
    "start": float,
    "end": float,
    "summary": "Kurze Beschreibung des Inhalts"
  }}
]

TRANSKRIPT:
{block_text}
"""


def filter_clips(clips):
    """Return the clips whose duration lies in ``[MIN_CLIP_LEN, MAX_CLIP_LEN]``.

    Each kept clip is a copy of the input with ``"start"``/``"end"`` converted
    to float and a ``"duration"`` key (rounded to 2 decimals) added. Entries
    that are malformed (not a mapping, missing or non-numeric times) are
    written to the error log and skipped.
    """
    kept = []
    for clip in clips:
        try:
            start = float(clip["start"])
            end = float(clip["end"])
            dur = end - start
            if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                kept.append({**clip, "start": start, "end": end, "duration": round(dur, 2)})
        except (KeyError, TypeError, ValueError) as e:
            append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
    return kept


def select_clips(client, blocks):
    """Ask the model for clips in every block and return all clips that pass the filter.

    ``client`` must provide ``chat.completions.create`` (the OpenAI SDK
    client). A failure in one block is logged and does not abort the run.
    """
    all_clips = []
    start_time = time.perf_counter()
    for i, block in enumerate(blocks):
        if not block:
            continue
        print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")
        prompt = build_prompt(block)
        log_text(f"block_prompt_{i+1}.txt", prompt)
        try:
            response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                temperature=0.4,
            )
            # The content may be None; treat that as an empty answer.
            raw = response.choices[0].message.content or ""
            log_text(f"block_output_{i+1}.txt", raw)
            clips = extract_json(raw)
            print(f"✅ {len(clips)} Clips empfangen in Block {i+1}")
            all_clips.extend(filter_clips(clips))
            print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")

            # Estimate the remaining time from the average time per block so far.
            elapsed = time.perf_counter() - start_time
            avg_time = elapsed / (i + 1)
            eta = avg_time * (len(blocks) - (i + 1))
            print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
        except Exception as e:  # best-effort boundary: keep going with the next block
            append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
            print(f"❌ Fehler bei Block {i+1}: {e}")
    return all_clips


# === SAVE TO DB ===
def save_clips(clips, segments, db_path=DB_PATH, source_name=TRANSCRIPT_PATH.stem):
    """Recreate the ``segments`` table in ``db_path`` and insert ``clips``.

    The existing table is dropped first, so every run replaces earlier
    results. Clips that fail validation or insertion are written to the error
    log and counted as failed.

    Returns:
        ``(inserted, failed)`` counts.
    """
    inserted = 0
    failed = 0
    conn = sqlite3.connect(db_path)
    try:
        with conn:  # commit on success, roll back if something unexpected escapes
            cur = conn.cursor()
            cur.execute("DROP TABLE IF EXISTS segments")
            cur.execute("""
                CREATE TABLE segments (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    file TEXT,
                    start REAL,
                    end REAL,
                    duration REAL,
                    text TEXT,
                    summary TEXT
                )
            """)
            for clip in clips:
                try:
                    start = float(clip["start"])
                    end = float(clip["end"])
                    duration = float(clip["duration"])
                    if end <= start or start < 0:
                        raise ValueError("Ungültige Zeiten")
                    # The model may return null or a non-string summary.
                    summary = str(clip.get("summary") or "").strip()
                    # Pass debug=True to print the segment texts used per clip.
                    original_text = get_original_text(clip, segments, debug=False)
                    cur.execute(
                        "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
                        (source_name, start, end, duration, original_text, summary),
                    )
                    inserted += 1
                except (KeyError, TypeError, ValueError, AttributeError, sqlite3.Error) as e:
                    failed += 1
                    append_error_log(f"❌ DB-Fehler: {clip}\n{e}")
    finally:
        conn.close()
    return inserted, failed


def main():
    """Run the full pipeline: parse the transcript, query the model, store the clips."""
    # Third-party imports are deferred to here so the helpers above can be
    # imported (and tested) without the SDKs installed and without network
    # side effects.
    import nltk
    from openai import OpenAI

    # NOTE(review): nothing in this file uses NLTK; the download is kept only
    # because the original script performed it at startup. Confirm and remove.
    nltk.download("punkt")

    LOG_DIR.mkdir(exist_ok=True)

    # Never hard-code the key: read it from the environment.
    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("🚫 Umgebungsvariable OPENAI_API_KEY ist nicht gesetzt.")
    client = OpenAI(api_key=api_key)

    lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
    segments = parse_transcript(lines)
    if not segments:
        raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
    print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.")

    blocks = build_blocks(segments)
    print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")

    all_clips = select_clips(client, blocks)

    inserted, failed = save_clips(all_clips, segments)

    print("\n📊 Ergebnisse:")
    print(f" ✅ Clips gespeichert: {inserted}")
    print(f" ❌ Fehlerhafte Clips: {failed}")
    print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")


if __name__ == "__main__":
    main()