Initial state of the bachelor's thesis

This commit is contained in:
Jupp Kerschbaum 2025-06-16 12:29:08 +02:00
commit 84b29e1eaf
17 changed files with 568 additions and 0 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

8
.idea/.gitignore generated vendored Normal file

@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

11
.idea/BachlorArbeit.iml generated Normal file

@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>

35
.idea/dataSources.xml generated Normal file

@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
    <data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
  </component>
</project>

6
.idea/inspectionProfiles/profiles_settings.xml generated Normal file

@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>

6
.idea/misc.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
  </component>
</project>

8
.idea/modules.xml generated Normal file

@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
    </modules>
  </component>
</project>

6
.idea/sqldialects.xml generated Normal file

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="SqlDialectMappings">
    <file url="file://$PROJECT_DIR$/rateCluster.py" dialect="SQLite" />
  </component>
</project>

9
.idea/vcs.xml generated Normal file

@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
  </component>
</project>

0
README.md Normal file

38
cutClips.py Normal file

@@ -0,0 +1,38 @@
from moviepy.video.io.VideoFileClip import VideoFileClip
from pathlib import Path
import sqlite3

# === Setup ===
input_video = Path("input/testVideoShort.mov")
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# === Read SQLite DB ===
db_path = "clips_openai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Only the top 10 clips with the highest score_total
cursor.execute("""
    SELECT start, end, text
    FROM highlights
    ORDER BY score_total DESC
    LIMIT 10
""")
highlights = cursor.fetchall()

# === Load video ===
video = VideoFileClip(str(input_video))

# === Cut clips ===
for i, (start, end, text) in enumerate(highlights):
    output_file = output_dir / f"highlight_{i+1}.mp4"
    end = min(end, video.duration)  # make sure the end does not run past the video
    print(f"🎬 Exporting clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
    clip = video.subclipped(start, end)
    clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")

# === Cleanup ===
conn.close()
video.close()
print("✅ Top 10 clips exported.")

135
rateCluster.py Normal file

@@ -0,0 +1,135 @@
import sqlite3
import re
from openai import OpenAI
from time import sleep

# === Settings ===
DB_PATH = "clips_openai.db"
VIDEO_ID = "testVideoShort"
MAX_CLIPS = 5  # or "all"
# Never commit API keys to source control; the client reads OPENAI_API_KEY from the environment.
client = OpenAI()
# === DB connection
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

cursor.execute("DROP TABLE IF EXISTS highlights")
cursor.execute("""
    CREATE TABLE highlights (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        text TEXT,
        viralitaet INTEGER,
        emotionalitaet INTEGER,
        witz INTEGER,
        provokation INTEGER,
        score_total INTEGER
    )
""")
conn.commit()
print(f"🧹 Table 'highlights' recreated for: {VIDEO_ID}")

# === Load segments
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
segments = cursor.fetchall()
print(f"📥 {len(segments)} segments (original text) loaded.")
# === Rating function (GPT-4o)
def analyse_segment(text, start, end):
    print(f"\n🔎 Rating clip: {start:.2f}s – {end:.2f}s")
    # The prompt stays in German because the parser below matches the German
    # category names in the model's reply.
    prompt = f"""
Bewerte folgenden Podcast-Ausschnitt mit genau vier Zahlen zwischen 1 und 10. Achte darauf, dass es abgeschlossene Clips sind, die als eigenständige Clips funktionieren können.
\"\"\"{text}\"\"\"
Dauer: {start:.2f} bis {end:.2f} Sekunden.
Antwortformat (bitte exakt einhalten, keine weiteren Kommentare):
Viralität: [Zahl]
Emotionalität: [Zahl]
Witz: [Zahl]
Provokation: [Zahl]
"""
    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        output = response.choices[0].message.content.strip()
        print(f"📤 GPT reply:\n{output}")

        values = {
            "viralitaet": None,
            "emotionalitaet": None,
            "witz": None,
            "provokation": None
        }
        for line in output.splitlines():
            line = line.strip().lower().replace("ä", "ae")
            if line.startswith("viralitaet"):
                values["viralitaet"] = int(re.search(r"\d+", line).group())
            elif line.startswith("emotionalitaet"):
                values["emotionalitaet"] = int(re.search(r"\d+", line).group())
            elif line.startswith("witz"):
                values["witz"] = int(re.search(r"\d+", line).group())
            elif line.startswith("provokation"):
                values["provokation"] = int(re.search(r"\d+", line).group())

        if all(v is not None for v in values.values()):
            total_score = sum(values.values())
            cursor.execute("""
                INSERT INTO highlights (
                    file, start, end, text,
                    viralitaet, emotionalitaet, witz, provokation, score_total
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                VIDEO_ID, start, end, text.strip(),
                values["viralitaet"], values["emotionalitaet"],
                values["witz"], values["provokation"],
                total_score
            ))
            conn.commit()
            return {
                "start": start,
                "end": end,
                "text": text.strip(),
                "score": values,
                "total": total_score
            }
        else:
            raise ValueError("Incomplete rating")
    except Exception as e:
        print(f"⚠️ GPT evaluation error: {e}")
        return None
# === Rate clips
rated = []
for start, end, text in segments:
    result = analyse_segment(text, float(start), float(end))
    if result:
        rated.append(result)
    sleep(1.2)  # avoid rate limits

# === Show best clips
rated.sort(key=lambda x: x["total"], reverse=True)
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]
print(f"\n🎬 Best {len(selected)} highlights for: {VIDEO_ID}")
for clip in selected:
    print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
    print(f"🎙️ {clip['text'][:200]}...")
    print("📊 Rating:")
    for k, v in clip["score"].items():
        print(f"   {k.capitalize()}: {v}")
    print(f"   👉 Total: {clip['total']}")
conn.close()
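For reference, a minimal sketch of the reply shape the line parser above expects; the numbers are assumed example values, not real model output:

# Assumed example reply:
#   Viralität: 7
#   Emotionalität: 5
#   Witz: 8
#   Provokation: 3
# After lower-casing and the ä→ae fold, every line starts with one of the four
# keys, re.search(r"\d+", line) pulls the score, and score_total here is 23.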

196
segment_transcript.py Normal file

@@ -0,0 +1,196 @@
import json
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
import time

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300
MIN_CLIP_LEN = 5
MAX_CLIP_LEN = 90

# Never commit API keys to source control; the client reads OPENAI_API_KEY from the environment.
client = OpenAI()
# === HELPER FUNCTIONS ===
def log_text(filename, content):
    (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")

def append_error_log(content):
    with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
        f.write(content + "\n\n")

def extract_json(text):
    match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except Exception as e:
            append_error_log(f"❌ JSON error: {e}\n{text}")
    return []
def get_original_text(clip, segments, debug=False):
    texts = []
    used_segments = []
    for s in segments:
        # Overlap: segment and clip share time
        if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
            texts.append(s["text"])
            used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
    if debug:
        print(f"\n🟢 Clip {clip['start']} – {clip['end']} uses segments:\n" +
              "\n".join(used_segments))
    return " ".join(texts).strip()
# === READ TRANSCRIPT ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for line in lines:
    match = re.match(r"\[(\d+(?:\.\d+)?)\s*[-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
    if match:
        start, end, text = match.groups()
        start = float(start)
        end = float(end)
        if end - start >= 2.0:
            segments.append({"start": start, "end": end, "text": text.strip()})
if not segments:
    raise RuntimeError("🚫 No valid segments found.")
print(f"{len(segments)} valid transcript segments loaded.")
# === BUILD BLOCKS
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
    if seg["end"] - current_start > BLOCK_DURATION:
        blocks.append(current_block)
        current_block = []
        current_start = seg["start"]
    current_block.append(seg)
if current_block:
    blocks.append(current_block)
print(f"🧱 {len(blocks)} blocks created (~{BLOCK_DURATION}s each).")
# === AI: CLIP SELECTION
all_clips = []
start_time = time.perf_counter()
for i, block in enumerate(blocks):
    if not block:
        continue
    print(f"\n🤖 Sending block {i+1}/{len(blocks)} to GPT-4o...")
    block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 13 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.
Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**
Nutze ausschließlich die vorhandenen Start- und Endzeiten keine neuen erfinden.
Gib ein valides JSON-Array zurück im Format:
[
{{
"start": float,
"end": float,
"summary": "Kurze Beschreibung des Inhalts"
}}
]
TRANSKRIPT:
{block_text}
"""
log_text(f"block_prompt_{i+1}.txt", prompt)
try:
response = client.chat.completions.create(
model="gpt-4o",
messages=[{"role": "user", "content": prompt}],
temperature=0.4
)
raw = response.choices[0].message.content
log_text(f"block_output_{i+1}.txt", raw)
clips = extract_json(raw)
print(f"{len(clips)} Clips empfangen in Block {i+1}")
for clip in clips:
try:
dur = float(clip["end"]) - float(clip["start"])
if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
clip["duration"] = round(dur, 2)
all_clips.append(clip)
except Exception as e:
append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")
print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")
# ETA berechnen
elapsed = time.perf_counter() - start_time
avg_time = elapsed / (i + 1)
eta = avg_time * (len(blocks) - (i + 1))
print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")
except Exception as e:
append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
print(f"❌ Fehler bei Block {i+1}: {e}")
# === SAVE TO DB
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS segments")
cur.execute("""
    CREATE TABLE segments (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        duration REAL,
        text TEXT,
        summary TEXT
    )
""")
inserted = 0
failed = 0
for clip in all_clips:
    try:
        start = float(clip["start"])
        end = float(clip["end"])
        duration = float(clip["duration"])
        summary = clip.get("summary", "")
        # debug=True prints all segment texts per clip
        original_text = get_original_text(clip, segments, debug=False)
        if end <= start or start < 0:
            raise ValueError("Invalid times")
        cur.execute(
            "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
            (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
        )
        inserted += 1
    except Exception as e:
        failed += 1
        append_error_log(f"❌ DB error: {clip}\n{e}")
conn.commit()
conn.close()
print("\n📊 Results:")
print(f"   ✅ Clips saved: {inserted}")
print(f"   ❌ Failed clips: {failed}")
print(f"📁 Logs saved in: {LOG_DIR.resolve()}")

1
text-clustering Submodule

@@ -0,0 +1 @@
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb

108
transcription.py Normal file

@@ -0,0 +1,108 @@
import whisper
from pathlib import Path
import os
import json
import ffmpeg
import tempfile

# === Settings ===
input_file = Path("input/testVideoShort.mov")
output_dir = Path("transkripte")
output_dir.mkdir(parents=True, exist_ok=True)
output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"
CHUNKS = 4  # number of chunks (adjust!)
OVERLAP = 2.0  # seconds of overlap
os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")

probe = ffmpeg.probe(str(input_file))
duration = float(probe["format"]["duration"])
print(f"🎥 Video length: {duration:.2f} seconds")
def extract_audio_chunk(start_time, duration, output_path):
    ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
        str(output_path),
        format="wav",
        acodec="pcm_s16le",
        ac=1,
        ar="16000",
        loglevel="error"
    ).overwrite_output().run()
def is_suspect(text):
    # Flags lines that look like Whisper repetition loops: one word dominating the line.
    words = text.strip().lower().split()
    if not words:
        return True
    most_common = max(words.count(w) for w in set(words))
    return most_common / len(words) > 0.6 or most_common > 20
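A sketch of how the heuristic behaves on assumed example lines: repetition loops (a known Whisper failure mode) are flagged, normal speech is not.

# Assumed examples:
#   is_suspect("ja ja ja ja ja")                 -> True  (5/5 = 1.0 > 0.6)
#   is_suspect("das ist ein ganz normaler Satz") -> False (max share 1/6)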
tmp_dir = Path(tempfile.mkdtemp())
all_segments = []
# Load the model once instead of once per chunk; switch to "medium" or "large" if desired
model = whisper.load_model("small")
print(f"✂️ Splitting audio into {CHUNKS} chunks ...")
for i in range(CHUNKS):
    chunk_start = max(0.0, i * (duration / CHUNKS) - OVERLAP)
    chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
    chunk_dur = chunk_end - chunk_start
    chunk_file = tmp_dir / f"chunk_{i}.wav"
    print(f"🔉 Extracting chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
    extract_audio_chunk(chunk_start, chunk_dur, chunk_file)
    print(f"🧠 Transcribing chunk {i+1} ...")
    result = model.transcribe(
        str(chunk_file),
        language="de",
        fp16=False,
        word_timestamps=False,
        condition_on_previous_text=True,
        temperature=0,
        verbose=False
    )
    segments = result["segments"]
    # Add the time offset of the current chunk
    offset = chunk_start
    for seg in segments:
        seg["start"] += offset
        seg["end"] += offset
    all_segments.extend(segments)
# === Sort and filter duplicate/overlapping segments
all_segments.sort(key=lambda x: x["start"])

def segment_hash(seg):
    return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())

unique_segments = []
seen = set()
for seg in all_segments:
    h = segment_hash(seg)
    if h not in seen:
        seen.add(h)
        unique_segments.append(seg)
print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.")
with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
for seg in unique_segments:
start = seg["start"]
end = seg["end"]
text = seg["text"].strip()
line = f"[{start:.2f} {end:.2f}] {text}\n"
f.write(line) # IMMER ins Haupttranskript!
if is_suspect(text):
f_sus.write(line)
print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}")
print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}")
with open(output_json, "w", encoding="utf-8") as f:
json.dump(unique_segments, f, ensure_ascii=False, indent=2)
print(f"💾 Segmentdaten gespeichert unter: {output_json}")

BIN
transkripte/.DS_Store vendored Normal file

Binary file not shown.

1
whisper.cpp Submodule

@@ -0,0 +1 @@
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243