197 lines
6.1 KiB
Python
197 lines
6.1 KiB
Python
import json
|
||
import sqlite3
|
||
import re
|
||
from pathlib import Path
|
||
from openai import OpenAI
|
||
from datetime import datetime
|
||
import time
|
||
import nltk
|
||
|
||
# Download the NLTK "punkt" resource every time the script starts.
# NOTE(review): nothing else in the visible script references nltk — confirm
# this download is still required.
nltk.download("punkt")
|
||
|
||
# === SETTINGS ===
# Timed transcript to analyse; usable lines have the form "[start – end] text".
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
# SQLite database that receives the selected clips.
DB_PATH = Path("clips_openai.db")
# Directory for prompt/response dumps and the error log; created if missing.
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
# Maximum time span (seconds) of one transcript block sent to the model.
BLOCK_DURATION = 300
# Accepted clip duration range in seconds; clips outside it are discarded.
MIN_CLIP_LEN = 5
MAX_CLIP_LEN = 90

# SECURITY: the API key must never live in source control. The OpenAI client
# reads it from the OPENAI_API_KEY environment variable when no api_key is
# passed and raises if the variable is missing. The key that was previously
# hard-coded here has to be treated as leaked and revoked.
client = OpenAI()
|
||
|
||
# === HELPER FUNCTIONS ===
|
||
def log_text(filename, content):
    """Write *content* to ``LOG_DIR / filename`` as UTF-8.

    Leading and trailing whitespace is stripped from *content* before it is
    written.
    """
    target = LOG_DIR / filename
    cleaned = content.strip()
    target.write_text(cleaned, encoding="utf-8")
|
||
|
||
def append_error_log(content):
    """Append *content*, followed by a blank line, to ``LOG_DIR / "errors.txt"``."""
    error_file = LOG_DIR / "errors.txt"
    entry = content + "\n\n"
    with open(error_file, mode="a", encoding="utf-8") as handle:
        handle.write(entry)
|
||
|
||
def extract_json(text):
    """Return the first JSON array embedded in *text*, or ``[]`` if none parses.

    Model responses often wrap the array in prose or a Markdown code fence.
    Every ``[`` is tried as a potential start of the array and decoded with
    ``JSONDecoder.raw_decode``, which stops at the end of the first complete
    JSON value. Brackets in surrounding text (for example "[1]" after the
    array, or a "[12.0 – 20.0]" timestamp before it) therefore no longer make
    the whole response unparsable.

    Args:
        text: Raw response text.

    Returns:
        The decoded list, or an empty list when *text* contains no ``[`` or no
        candidate position yields valid JSON. In the latter case the last
        decode error is written to the error log together with *text*.
    """
    candidate = text.find("[")
    if candidate == -1:
        return []

    decoder = json.JSONDecoder()
    last_error = None
    while candidate != -1:
        try:
            # A value starting at "[" can only decode to a list.
            parsed, _ = decoder.raw_decode(text, candidate)
        except json.JSONDecodeError as e:
            last_error = e
            candidate = text.find("[", candidate + 1)
        else:
            return parsed

    append_error_log(f"❌ JSON-Fehler: {last_error}\n{text}")
    return []
|
||
|
||
def get_original_text(clip, segments, debug=False):
    """Return the transcript text of all segments that overlap *clip* in time.

    A segment counts as overlapping only if it shares a non-zero amount of
    time with the clip. Segments that merely touch the clip boundary (ending
    exactly at the clip start or starting exactly at the clip end) are
    excluded; otherwise a clip built from existing segment boundaries would
    also pick up the text of both neighbouring segments.

    Args:
        clip: Mapping with "start" and "end"; values may be numbers or numeric
            strings and are converted with ``float``.
        segments: Iterable of mappings with numeric "start" and "end" and a
            "text" string.
        debug: When true, print the segments that were used for the clip.

    Returns:
        The segment texts joined by single spaces, stripped; an empty string
        if no segment overlaps.

    Raises:
        KeyError: If *clip* lacks "start" or "end".
        TypeError, ValueError: If the clip times cannot be converted to float.
    """
    clip_start = float(clip["start"])
    clip_end = float(clip["end"])

    overlapping = [
        s for s in segments
        if s["end"] > clip_start and s["start"] < clip_end
    ]

    if debug:
        used_segments = [
            f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}" for s in overlapping
        ]
        print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" +
              "\n".join(used_segments))

    return " ".join(s["text"] for s in overlapping).strip()
|
||
|
||
# === READ TRANSCRIPT ===
# Usable lines look like "[12.0 – 17.5] spoken text" (en dash or hyphen
# between the two times). Lines that do not match are ignored.
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segment_pattern = re.compile(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)")
segments = []
for raw_line in lines:
    parsed = segment_pattern.match(raw_line)
    if parsed is None:
        continue
    seg_start = float(parsed.group(1))
    seg_end = float(parsed.group(2))
    # Segments shorter than two seconds are skipped.
    if seg_end - seg_start < 2.0:
        continue
    segments.append({"start": seg_start, "end": seg_end, "text": parsed.group(3).strip()})

if not segments:
    raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")

print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.")
|
||
|
||
# === BUILD BLOCKS ===
# Group consecutive segments into blocks. A new block is started as soon as
# adding the next segment would stretch the current block beyond
# BLOCK_DURATION seconds, measured from the block's start time.
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
    if seg["end"] - current_start > BLOCK_DURATION:
        # Only close a block that actually holds segments. Without this guard
        # an empty block is recorded when the very first segment already ends
        # after BLOCK_DURATION, which skews block counts and numbering.
        if current_block:
            blocks.append(current_block)
        current_block = []
        current_start = seg["start"]
    current_block.append(seg)

if current_block:
    blocks.append(current_block)

print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
|
||
|
||
# === AI: CLIP SELECTION ===
# Send each transcript block to the model, parse the returned JSON array and
# keep every clip whose duration lies within [MIN_CLIP_LEN, MAX_CLIP_LEN].
# A failure in one block is logged and does not stop the remaining blocks.
all_clips = []
start_time = time.perf_counter()

for i, block in enumerate(blocks):
    if not block:
        continue

    print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")

    block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
    # NOTE(review): the prompt asks for clips of at least 30 seconds while the
    # filter below accepts anything from MIN_CLIP_LEN seconds — confirm which
    # limit is intended.
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.

Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**

Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.

Gib ein valides JSON-Array zurück im Format:
[
{{
"start": float,
"end": float,
"summary": "Kurze Beschreibung des Inhalts"
}}
]

TRANSKRIPT:
{block_text}
"""
    log_text(f"block_prompt_{i+1}.txt", prompt)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        # The API may return no text content (None), e.g. for a refusal.
        # Treat that as an empty answer instead of crashing in log_text and
        # reporting a misleading OpenAI error.
        raw = response.choices[0].message.content or ""
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)

        print(f"✅ {len(clips)} Clips empfangen in Block {i+1}")

        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                # Best effort: a malformed entry is logged and skipped.
                append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")

        print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")

        # Estimate the remaining time from the average time per block so far.
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")

    except Exception as e:
        append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
        print(f"❌ Fehler bei Block {i+1}: {e}")
|
||
|
||
# === SAVE TO DB ===
# The "segments" table is dropped and recreated on every run, so the database
# only ever contains the clips of the most recent run.
conn = sqlite3.connect(DB_PATH)
inserted = 0
failed = 0
try:
    cur = conn.cursor()
    cur.execute("DROP TABLE IF EXISTS segments")
    cur.execute("""
        CREATE TABLE segments (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            file TEXT,
            start REAL,
            end REAL,
            duration REAL,
            text TEXT,
            summary TEXT
        )
    """)

    for clip in all_clips:
        try:
            start = float(clip["start"])
            end = float(clip["end"])
            duration = float(clip["duration"])
            # Validate the times before doing any further work on the clip.
            if end <= start or start < 0:
                raise ValueError("Ungültige Zeiten")
            # The model may return null or a non-string summary; normalise it
            # so that an otherwise valid clip is not rejected.
            summary = str(clip.get("summary") or "")
            # Pass debug=True to print all segment texts used for each clip.
            original_text = get_original_text(clip, segments, debug=False)
            cur.execute(
                "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
                (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
            )
            inserted += 1
        except Exception as e:
            # Best effort: a failing clip is counted and logged, the rest
            # is still stored.
            failed += 1
            append_error_log(f"❌ DB-Fehler: {clip}\n{e}")

    conn.commit()
finally:
    # Release the connection even if table creation or the commit fails.
    conn.close()

print("\n📊 Ergebnisse:")
print(f"  ✅ Clips gespeichert: {inserted}")
print(f"  ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")
|