Initial state of the bachelor thesis
commit 84b29e1eaf
8  .idea/.gitignore  generated  vendored  Normal file
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
11  .idea/BachlorArbeit.iml  generated  Normal file
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
35  .idea/dataSources.xml  generated  Normal file
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
    <data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
  </component>
</project>
6  .idea/inspectionProfiles/profiles_settings.xml  generated  Normal file
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
6  .idea/misc.xml  generated  Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
  </component>
</project>
8  .idea/modules.xml  generated  Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
    </modules>
  </component>
</project>
6  .idea/sqldialects.xml  generated  Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="SqlDialectMappings">
    <file url="file://$PROJECT_DIR$/rateCluster.py" dialect="SQLite" />
  </component>
</project>
9  .idea/vcs.xml  generated  Normal file
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
  </component>
</project>
38  cutClips.py  Normal file
@@ -0,0 +1,38 @@
from moviepy.video.io.VideoFileClip import VideoFileClip
from pathlib import Path
import sqlite3

# === Setup ===
input_video = Path("input/testVideoShort.mov")
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# === Read the SQLite DB ===
db_path = "clips_openai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Only the top 10 clips with the highest score_total
cursor.execute("""
    SELECT start, end, text
    FROM highlights
    ORDER BY score_total DESC
    LIMIT 10
""")
highlights = cursor.fetchall()

# === Load the video ===
video = VideoFileClip(str(input_video))

# === Cut the clips ===
for i, (start, end, text) in enumerate(highlights):
    output_file = output_dir / f"highlight_{i+1}.mp4"
    end = min(end, video.duration)  # make sure the end does not run past the video
    print(f"🎬 Exporting clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
    clip = video.subclipped(start, end)  # subclipped() requires MoviePy >= 2.0 (it was subclip() in 1.x)
    clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")

# === Cleanup ===
conn.close()
video.close()
print("✅ Top 10 clips exported.")
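A minimal alternative sketch for the open/close handling above, under the same MoviePy 2.x assumption: contextlib.closing closes the SQLite connection, and the with block closes the video handle even if an export raises.

    # Sketch, not the committed script: same query and export, with guaranteed cleanup.
    import sqlite3
    from contextlib import closing
    from moviepy.video.io.VideoFileClip import VideoFileClip

    with closing(sqlite3.connect("clips_openai.db")) as conn, \
            VideoFileClip("input/testVideoShort.mov") as video:
        rows = conn.execute(
            "SELECT start, end, text FROM highlights ORDER BY score_total DESC LIMIT 10"
        ).fetchall()
        for i, (start, end, _text) in enumerate(rows):
            clip = video.subclipped(start, min(end, video.duration))
            clip.write_videofile(f"output/highlight_{i+1}.mp4", codec="libx264", audio_codec="aac")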
135  rateCluster.py  Normal file
@@ -0,0 +1,135 @@
import os
import sqlite3
import re
from openai import OpenAI
from time import sleep

# === Settings ===
DB_PATH = "clips_openai.db"
VIDEO_ID = "testVideoShort"
MAX_CLIPS = 5  # or "all"
# Never hardcode the API key (the original commit leaked a live key here);
# read it from the environment instead.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

client = OpenAI(api_key=OPENAI_API_KEY)
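# A common alternative (sketch only; the python-dotenv package is an assumption,
# it is not part of this commit): keep the key in a .env file that .gitignore excludes.
#
#     from dotenv import load_dotenv
#     load_dotenv()  # reads .env into os.environ
#     client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])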
# === DB connection
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

cursor.execute("DROP TABLE IF EXISTS highlights")
cursor.execute("""
    CREATE TABLE highlights (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        text TEXT,
        viralitaet INTEGER,
        emotionalitaet INTEGER,
        witz INTEGER,
        provokation INTEGER,
        score_total INTEGER
    )
""")
conn.commit()
print(f"🧹 Table 'highlights' recreated for: {VIDEO_ID}")

# === Load segments
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
segments = cursor.fetchall()
print(f"📥 {len(segments)} segments (original text) loaded.")

# === Rating function (GPT-4o)
def analyse_segment(text, start, end):
    print(f"\n🔎 Rating clip: {start:.2f}s – {end:.2f}s")

    # The prompt is deliberately German (the podcasts are German), and the parser
    # below matches the German labels, so do not translate it.
    prompt = f"""
Bewerte folgenden Podcast-Ausschnitt mit genau vier Zahlen zwischen 1 und 10. Achte darauf, dass der Ausschnitt abgeschlossen ist und als eigenständiger Clip funktioniert.

\"\"\"{text}\"\"\"

Dauer: {start:.2f} bis {end:.2f} Sekunden.

Antwortformat (bitte exakt einhalten, keine weiteren Kommentare):
Viralität: [Zahl]
Emotionalität: [Zahl]
Witz: [Zahl]
Provokation: [Zahl]
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        output = response.choices[0].message.content.strip()
        print(f"📤 GPT reply:\n{output}")

        values = {
            "viralitaet": None,
            "emotionalitaet": None,
            "witz": None,
            "provokation": None
        }

        # Normalize umlauts so "Viralität:" matches the dict keys,
        # then take the first number on each matching line.
        for line in output.splitlines():
            line = line.strip().lower().replace("ä", "ae")
            if line.startswith("viralitaet"):
                values["viralitaet"] = int(re.search(r"\d+", line).group())
            elif line.startswith("emotionalitaet"):
                values["emotionalitaet"] = int(re.search(r"\d+", line).group())
            elif line.startswith("witz"):
                values["witz"] = int(re.search(r"\d+", line).group())
            elif line.startswith("provokation"):
                values["provokation"] = int(re.search(r"\d+", line).group())

        if all(v is not None for v in values.values()):
            total_score = sum(values.values())
            cursor.execute("""
                INSERT INTO highlights (
                    file, start, end, text,
                    viralitaet, emotionalitaet, witz, provokation, score_total
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                VIDEO_ID, start, end, text.strip(),
                values["viralitaet"], values["emotionalitaet"],
                values["witz"], values["provokation"],
                total_score
            ))
            conn.commit()

            return {
                "start": start,
                "end": end,
                "text": text.strip(),
                "score": values,
                "total": total_score
            }
        else:
            raise ValueError("Incomplete rating")
    except Exception as e:
        print(f"⚠️ Error during GPT evaluation: {e}")
        return None

# === Rate the clips
rated = []
for start, end, text in segments:
    result = analyse_segment(text, float(start), float(end))
    if result:
        rated.append(result)
    sleep(1.2)  # avoid rate limits

# === Show the best clips
rated.sort(key=lambda x: x["total"], reverse=True)
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]

print(f"\n🎬 Best {len(selected)} highlights for: {VIDEO_ID}")
for clip in selected:
    print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
    print(f"🎙️ {clip['text'][:200]}...")
    print("📊 Rating:")
    for k, v in clip["score"].items():
        print(f"   {k.capitalize()}: {v}")
    print(f"   👉 Total: {clip['total']}")

conn.close()
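The four nearly identical startswith branches in analyse_segment can be collapsed into one table-driven loop. A minimal sketch (parse_scores is a hypothetical helper, not part of the commit; it assumes the reply keeps the "Label: Zahl" format the prompt demands, with umlauts normalized to "ae"):

    import re

    LABELS = ("viralitaet", "emotionalitaet", "witz", "provokation")

    def parse_scores(output: str) -> dict:
        # Map each label to the first integer on its line, e.g. "Viralität: 7" -> 7.
        values = {k: None for k in LABELS}
        for line in output.splitlines():
            line = line.strip().lower().replace("ä", "ae")
            for key in LABELS:
                if line.startswith(key):
                    match = re.search(r"\d+", line)
                    if match:
                        values[key] = int(match.group())
        return values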
196  segment_transcript.py  Normal file
@@ -0,0 +1,196 @@
import json
import os
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
import time

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300  # seconds of transcript per block sent to the model
MIN_CLIP_LEN = 5      # seconds; note this is looser than the 30 s minimum the prompt asks for
MAX_CLIP_LEN = 90     # seconds

# Never hardcode the API key (the original commit leaked one); read it from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# === HELPER FUNCTIONS ===
def log_text(filename, content):
    (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")

def append_error_log(content):
    with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
        f.write(content + "\n\n")

def extract_json(text):
    # Keep only the outermost [...] span; model replies often wrap it in prose.
    match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except Exception as e:
            append_error_log(f"❌ JSON error: {e}\n{text}")
    return []
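# Hypothetical example of what extract_json accepts: the reply may wrap the JSON
# array in prose or a code fence; the regex keeps only the [...] part.
_raw = 'Hier ist das Ergebnis:\n[{"start": 12.5, "end": 58.0, "summary": "Pointe über KI"}]'
assert extract_json(_raw) == [{"start": 12.5, "end": 58.0, "summary": "Pointe über KI"}]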
def get_original_text(clip, segments, debug=False):
    texts = []
    used_segments = []
    for s in segments:
        # Overlap: segment and clip share time (standard interval-overlap test)
        if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
            texts.append(s["text"])
            used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
    if debug:
        print(f"\n🟢 Clip {clip['start']}–{clip['end']} uses segments:\n" +
              "\n".join(used_segments))
    return " ".join(texts).strip()

# === READ THE TRANSCRIPT ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for line in lines:
    # Accepts lines like "[12.34 – 56.78] Ein Satz aus dem Podcast." (en dash or hyphen)
    match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
    if match:
        start, end, text = match.groups()
        start = float(start)
        end = float(end)
        if end - start >= 2.0:
            segments.append({"start": start, "end": end, "text": text.strip()})

if not segments:
    raise RuntimeError("🚫 No valid segments found.")
print(f"✅ {len(segments)} valid transcript segments loaded.")

# === BUILD BLOCKS
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
    if seg["end"] - current_start > BLOCK_DURATION:
        blocks.append(current_block)
        current_block = []
        current_start = seg["start"]
    current_block.append(seg)
if current_block:
    blocks.append(current_block)
print(f"🧱 {len(blocks)} blocks created (~{BLOCK_DURATION}s each).")

# === AI: CLIP SELECTION
all_clips = []
start_time = time.perf_counter()

for i, block in enumerate(blocks):
    if not block:
        continue

    print(f"\n🤖 Sending block {i+1}/{len(blocks)} to GPT-4o...")

    block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
    # Prompt deliberately kept in German to match the German transcripts.
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf, dass die Abschnitte abgeschlossen sind und als eigenständige Clips funktionieren.

Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**

Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.

Gib ein valides JSON-Array zurück im Format:
[
  {{
    "start": float,
    "end": float,
    "summary": "Kurze Beschreibung des Inhalts"
  }}
]

TRANSKRIPT:
{block_text}
"""
    log_text(f"block_prompt_{i+1}.txt", prompt)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)

        print(f"✅ {len(clips)} clips received in block {i+1}")

        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                append_error_log(f"⛔ Clip filter error: {clip}\n{e}")

        print(f"📈 Running clip total: {len(all_clips)}")

        # Estimate remaining time
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Estimated time remaining: {eta:.1f} seconds")

    except Exception as e:
        append_error_log(f"❌ OpenAI error in block {i+1}: {e}")
        print(f"❌ Error in block {i+1}: {e}")

# === SAVE TO DB
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS segments")
cur.execute("""
    CREATE TABLE segments (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        duration REAL,
        text TEXT,
        summary TEXT
    )
""")

inserted = 0
failed = 0
for clip in all_clips:
    try:
        start = float(clip["start"])
        end = float(clip["end"])
        duration = float(clip["duration"])
        summary = clip.get("summary", "")
        # debug=True prints every segment text used per clip
        original_text = get_original_text(clip, segments, debug=False)
        if end <= start or start < 0:
            raise ValueError("Invalid times")
        cur.execute(
            "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
            (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
        )
        inserted += 1
    except Exception as e:
        failed += 1
        append_error_log(f"❌ DB error: {clip}\n{e}")

conn.commit()
conn.close()

print("\n📊 Results:")
print(f"   ✅ Clips saved: {inserted}")
print(f"   ❌ Failed clips: {failed}")
print(f"📁 Logs saved in: {LOG_DIR.resolve()}")
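The segment lookup in get_original_text relies on the standard interval-overlap test. A self-contained sketch with hypothetical numbers (overlaps is an illustrative helper, not part of the commit):

    def overlaps(a_start: float, a_end: float, b_start: float, b_end: float) -> bool:
        # Two intervals overlap unless one ends before the other begins.
        return not (a_end < b_start or a_start > b_end)

    assert overlaps(10.0, 20.0, 18.0, 30.0)       # share 18–20
    assert not overlaps(10.0, 20.0, 20.5, 30.0)   # disjoint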
1  text-clustering  Submodule
@@ -0,0 +1 @@
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb
108  transcription.py  Normal file
@@ -0,0 +1,108 @@
# transcription.py: chunked Whisper transcription
import whisper
from pathlib import Path
import os
import json
import ffmpeg
import tempfile

# === Settings ===
input_file = Path("input/testVideoShort.mov")
output_dir = Path("transkripte")
output_dir.mkdir(parents=True, exist_ok=True)

output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"

CHUNKS = 4      # number of chunks (adjust!)
OVERLAP = 2.0   # seconds of overlap between chunks

# Redirect Whisper's model cache into the project; must be set before load_model().
os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")

probe = ffmpeg.probe(str(input_file))
duration = float(probe["format"]["duration"])
print(f"🎥 Video length: {duration:.2f} seconds")

def extract_audio_chunk(start_time, duration, output_path):
    # 16 kHz mono PCM WAV, the input format Whisper expects
    ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
        str(output_path),
        format="wav",
        acodec="pcm_s16le",
        ac=1,
        ar="16000",
        loglevel="error"
    ).overwrite_output().run()

def is_suspect(text):
    # Heuristic for Whisper hallucination loops: flag a line if one token makes up
    # more than 60% of its words or repeats more than 20 times.
    words = text.strip().lower().split()
    if not words:
        return True
    most_common = max([words.count(w) for w in set(words)])
    return most_common / len(words) > 0.6 or most_common > 20
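# Hypothetical examples of the heuristic above:
assert is_suspect("ja ja ja ja ja ja ja ja ja ja")          # one token dominates
assert not is_suspect("Das war ein spannendes Gespräch.")   # normal sentence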
tmp_dir = Path(tempfile.mkdtemp())
all_segments = []

# Load the model once instead of once per chunk; switch to "medium" or "large" if desired.
model = whisper.load_model("small")

print(f"✂️ Splitting audio into {CHUNKS} chunks ...")
for i in range(CHUNKS):
    chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
    chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
    chunk_dur = chunk_end - chunk_start
    chunk_file = tmp_dir / f"chunk_{i}.wav"
    print(f"🔉 Extracting chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
    extract_audio_chunk(chunk_start, chunk_dur, chunk_file)

    print(f"🧠 Transcribing chunk {i+1} ...")
    result = model.transcribe(
        str(chunk_file),
        language="de",
        fp16=False,
        word_timestamps=False,
        condition_on_previous_text=True,
        temperature=0,
        verbose=False
    )

    segments = result["segments"]
    # Shift timestamps by the chunk offset so they refer to the full video
    offset = chunk_start
    for seg in segments:
        seg["start"] += offset
        seg["end"] += offset
    all_segments.extend(segments)

# === Sort and filter duplicate/overlapping segments
all_segments.sort(key=lambda x: x["start"])

def segment_hash(seg):
    return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())

unique_segments = []
seen = set()
for seg in all_segments:
    h = segment_hash(seg)
    if h not in seen:
        seen.add(h)
        unique_segments.append(seg)

print(f"✅ {len(unique_segments)} unique segments transcribed in total.")

with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
    for seg in unique_segments:
        start = seg["start"]
        end = seg["end"]
        text = seg["text"].strip()
        line = f"[{start:.2f} – {end:.2f}] {text}\n"
        f.write(line)  # ALWAYS goes into the main transcript!
        if is_suspect(text):
            f_sus.write(line)

print(f"📝 Time-stamped transcript saved to: {output_txt}")
print(f"⚠️ Suspect lines saved to: {suspect_txt}")

with open(output_json, "w", encoding="utf-8") as f:
    json.dump(unique_segments, f, ensure_ascii=False, indent=2)
print(f"💾 Segment data saved to: {output_json}")
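For reference, how the chunk boundaries in the loop above work out, using hypothetical values duration = 600 s, CHUNKS = 4, OVERLAP = 2.0: each inner boundary is padded by the overlap so segments that straddle a cut are not lost.

    duration, CHUNKS, OVERLAP = 600.0, 4, 2.0
    for i in range(CHUNKS):
        chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
        chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
        print(f"chunk {i}: {chunk_start:.0f}s – {chunk_end:.0f}s")
    # chunk 0: 0s – 152s
    # chunk 1: 148s – 302s
    # chunk 2: 298s – 452s
    # chunk 3: 448s – 600s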
BIN  transkripte/.DS_Store  vendored  Normal file
Binary file not shown.
1  whisper.cpp  Submodule
@@ -0,0 +1 @@
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243