Initial state of the bachelor thesis
commit 84b29e1eaf
8  .idea/.gitignore  generated  vendored  Normal file
@@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml
11  .idea/BachlorArbeit.iml  generated  Normal file
@@ -0,0 +1,11 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
  <component name="NewModuleRootManager">
    <content url="file://$MODULE_DIR$">
      <sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
      <excludeFolder url="file://$MODULE_DIR$/.venv" />
    </content>
    <orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
    <orderEntry type="sourceFolder" forTests="false" />
  </component>
</module>
35  .idea/dataSources.xml  generated  Normal file
@@ -0,0 +1,35 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="DataSourceManagerImpl" format="xml" multifile-model="true">
    <data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
    <data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
      <driver-ref>sqlite.xerial</driver-ref>
      <synchronize>true</synchronize>
      <jdbc-driver>org.sqlite.JDBC</jdbc-driver>
      <jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
      <working-dir>$ProjectFileDir$</working-dir>
      <libraries>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
        </library>
        <library>
          <url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
        </library>
      </libraries>
    </data-source>
  </component>
</project>
6  .idea/inspectionProfiles/profiles_settings.xml  generated  Normal file
@@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
  <settings>
    <option name="USE_PROJECT_PROFILE" value="false" />
    <version value="1.0" />
  </settings>
</component>
6  .idea/misc.xml  generated  Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="Black">
    <option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
  </component>
</project>
8  .idea/modules.xml  generated  Normal file
@@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ProjectModuleManager">
    <modules>
      <module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
    </modules>
  </component>
</project>
6  .idea/sqldialects.xml  generated  Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="SqlDialectMappings">
    <file url="file://$PROJECT_DIR$/rateCluster.py" dialect="SQLite" />
  </component>
</project>
9  .idea/vcs.xml  generated  Normal file
@@ -0,0 +1,9 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
    <mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
  </component>
</project>
38  cutClips.py  Normal file
@@ -0,0 +1,38 @@
from moviepy.video.io.VideoFileClip import VideoFileClip
from pathlib import Path
import sqlite3

# === Setup ===
input_video = Path("input/testVideoShort.mov")
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# === Read the SQLite DB ===
db_path = "clips_openai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Only the top 10 clips with the highest score_total
cursor.execute("""
    SELECT start, end, text
    FROM highlights
    ORDER BY score_total DESC
    LIMIT 10
""")
highlights = cursor.fetchall()

# === Load the video ===
video = VideoFileClip(str(input_video))

# === Cut the clips ===
for i, (start, end, text) in enumerate(highlights):
    output_file = output_dir / f"highlight_{i+1}.mp4"
    end = min(end, video.duration)  # make sure the end does not run past the video
    print(f"🎬 Exporting clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
    clip = video.subclipped(start, end)  # subclipped() requires MoviePy >= 2.0 (it was subclip() in 1.x)
    clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")

# === Cleanup ===
conn.close()
video.close()
print("✅ Top 10 clips exported.")
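A minimal alternative sketch for the open/close handling above, under the same MoviePy 2.x assumption: contextlib.closing closes the SQLite connection, and the with block closes the video handle even if an export raises.

    # Sketch, not the committed script: same query and export, with guaranteed cleanup.
    import sqlite3
    from contextlib import closing
    from moviepy.video.io.VideoFileClip import VideoFileClip

    with closing(sqlite3.connect("clips_openai.db")) as conn, \
            VideoFileClip("input/testVideoShort.mov") as video:
        rows = conn.execute(
            "SELECT start, end, text FROM highlights ORDER BY score_total DESC LIMIT 10"
        ).fetchall()
        for i, (start, end, _text) in enumerate(rows):
            clip = video.subclipped(start, min(end, video.duration))
            clip.write_videofile(f"output/highlight_{i+1}.mp4", codec="libx264", audio_codec="aac")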
135  rateCluster.py  Normal file
@@ -0,0 +1,135 @@
import os
import sqlite3
import re
from openai import OpenAI
from time import sleep

# === Settings ===
DB_PATH = "clips_openai.db"
VIDEO_ID = "testVideoShort"
MAX_CLIPS = 5  # or "all"
# Never hardcode the API key (the original commit leaked a live key here);
# read it from the environment instead.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

client = OpenAI(api_key=OPENAI_API_KEY)
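# A common alternative (sketch only; the python-dotenv package is an assumption,
# it is not part of this commit): keep the key in a .env file that .gitignore excludes.
#
#     from dotenv import load_dotenv
#     load_dotenv()  # reads .env into os.environ
#     client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])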
# === DB connection
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

cursor.execute("DROP TABLE IF EXISTS highlights")
cursor.execute("""
    CREATE TABLE highlights (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        text TEXT,
        viralitaet INTEGER,
        emotionalitaet INTEGER,
        witz INTEGER,
        provokation INTEGER,
        score_total INTEGER
    )
""")
conn.commit()
print(f"🧹 Table 'highlights' recreated for: {VIDEO_ID}")

# === Load segments
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
segments = cursor.fetchall()
print(f"📥 {len(segments)} segments (original text) loaded.")

# === Rating function (GPT-4o)
def analyse_segment(text, start, end):
    print(f"\n🔎 Rating clip: {start:.2f}s – {end:.2f}s")

    # The prompt is deliberately German (the podcasts are German), and the parser
    # below matches the German labels, so do not translate it.
    prompt = f"""
Bewerte folgenden Podcast-Ausschnitt mit genau vier Zahlen zwischen 1 und 10. Achte darauf, dass der Ausschnitt abgeschlossen ist und als eigenständiger Clip funktioniert.

\"\"\"{text}\"\"\"

Dauer: {start:.2f} bis {end:.2f} Sekunden.

Antwortformat (bitte exakt einhalten, keine weiteren Kommentare):
Viralität: [Zahl]
Emotionalität: [Zahl]
Witz: [Zahl]
Provokation: [Zahl]
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        output = response.choices[0].message.content.strip()
        print(f"📤 GPT reply:\n{output}")

        values = {
            "viralitaet": None,
            "emotionalitaet": None,
            "witz": None,
            "provokation": None
        }

        # Normalize umlauts so "Viralität:" matches the dict keys,
        # then take the first number on each matching line.
        for line in output.splitlines():
            line = line.strip().lower().replace("ä", "ae")
            if line.startswith("viralitaet"):
                values["viralitaet"] = int(re.search(r"\d+", line).group())
            elif line.startswith("emotionalitaet"):
                values["emotionalitaet"] = int(re.search(r"\d+", line).group())
            elif line.startswith("witz"):
                values["witz"] = int(re.search(r"\d+", line).group())
            elif line.startswith("provokation"):
                values["provokation"] = int(re.search(r"\d+", line).group())

        if all(v is not None for v in values.values()):
            total_score = sum(values.values())
            cursor.execute("""
                INSERT INTO highlights (
                    file, start, end, text,
                    viralitaet, emotionalitaet, witz, provokation, score_total
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                VIDEO_ID, start, end, text.strip(),
                values["viralitaet"], values["emotionalitaet"],
                values["witz"], values["provokation"],
                total_score
            ))
            conn.commit()

            return {
                "start": start,
                "end": end,
                "text": text.strip(),
                "score": values,
                "total": total_score
            }
        else:
            raise ValueError("Incomplete rating")
    except Exception as e:
        print(f"⚠️ Error during GPT evaluation: {e}")
        return None

# === Rate the clips
rated = []
for start, end, text in segments:
    result = analyse_segment(text, float(start), float(end))
    if result:
        rated.append(result)
    sleep(1.2)  # avoid rate limits

# === Show the best clips
rated.sort(key=lambda x: x["total"], reverse=True)
selected = rated if MAX_CLIPS == "all" else rated[:int(MAX_CLIPS)]

print(f"\n🎬 Best {len(selected)} highlights for: {VIDEO_ID}")
for clip in selected:
    print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
    print(f"🎙️ {clip['text'][:200]}...")
    print("📊 Rating:")
    for k, v in clip["score"].items():
        print(f"   {k.capitalize()}: {v}")
    print(f"   👉 Total: {clip['total']}")

conn.close()
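The four nearly identical startswith branches in analyse_segment can be collapsed into one table-driven loop. A minimal sketch (parse_scores is a hypothetical helper, not part of the commit; it assumes the reply keeps the "Label: Zahl" format the prompt demands, with umlauts normalized to "ae"):

    import re

    LABELS = ("viralitaet", "emotionalitaet", "witz", "provokation")

    def parse_scores(output: str) -> dict:
        # Map each label to the first integer on its line, e.g. "Viralität: 7" -> 7.
        values = {k: None for k in LABELS}
        for line in output.splitlines():
            line = line.strip().lower().replace("ä", "ae")
            for key in LABELS:
                if line.startswith(key):
                    match = re.search(r"\d+", line)
                    if match:
                        values[key] = int(match.group())
        return values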
196  segment_transcript.py  Normal file
@@ -0,0 +1,196 @@
import json
import os
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
import time

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300  # seconds of transcript per block sent to the model
MIN_CLIP_LEN = 5      # seconds; note this is looser than the 30 s minimum the prompt asks for
MAX_CLIP_LEN = 90     # seconds

# Never hardcode the API key (the original commit leaked one); read it from the environment.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# === HELPER FUNCTIONS ===
def log_text(filename, content):
    (LOG_DIR / filename).write_text(content.strip(), encoding="utf-8")

def append_error_log(content):
    with (LOG_DIR / "errors.txt").open("a", encoding="utf-8") as f:
        f.write(content + "\n\n")

def extract_json(text):
    # Keep only the outermost [...] span; model replies often wrap it in prose.
    match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        except Exception as e:
            append_error_log(f"❌ JSON error: {e}\n{text}")
    return []
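# Hypothetical example of what extract_json accepts: the reply may wrap the JSON
# array in prose or a code fence; the regex keeps only the [...] part.
_raw = 'Hier ist das Ergebnis:\n[{"start": 12.5, "end": 58.0, "summary": "Pointe über KI"}]'
assert extract_json(_raw) == [{"start": 12.5, "end": 58.0, "summary": "Pointe über KI"}]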
def get_original_text(clip, segments, debug=False):
    texts = []
    used_segments = []
    for s in segments:
        # Overlap: segment and clip share time (standard interval-overlap test)
        if not (s["end"] < clip["start"] or s["start"] > clip["end"]):
            texts.append(s["text"])
            used_segments.append(f"[{s['start']:.2f} – {s['end']:.2f}] {s['text']}")
    if debug:
        print(f"\n🟢 Clip {clip['start']}–{clip['end']} uses segments:\n" +
              "\n".join(used_segments))
    return " ".join(texts).strip()

# === READ THE TRANSCRIPT ===
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for line in lines:
    # Accepts lines like "[12.34 – 56.78] Ein Satz aus dem Podcast." (en dash or hyphen)
    match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
    if match:
        start, end, text = match.groups()
        start = float(start)
        end = float(end)
        if end - start >= 2.0:
            segments.append({"start": start, "end": end, "text": text.strip()})

if not segments:
    raise RuntimeError("🚫 No valid segments found.")
print(f"✅ {len(segments)} valid transcript segments loaded.")

# === BUILD BLOCKS
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
    if seg["end"] - current_start > BLOCK_DURATION:
        blocks.append(current_block)
        current_block = []
        current_start = seg["start"]
    current_block.append(seg)
if current_block:
    blocks.append(current_block)
print(f"🧱 {len(blocks)} blocks created (~{BLOCK_DURATION}s each).")

# === AI: CLIP SELECTION
all_clips = []
start_time = time.perf_counter()

for i, block in enumerate(blocks):
    if not block:
        continue

    print(f"\n🤖 Sending block {i+1}/{len(blocks)} to GPT-4o...")

    block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
    # Prompt deliberately kept in German to match the German transcripts.
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf, dass die Abschnitte abgeschlossen sind und als eigenständige Clips funktionieren.

Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**

Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.

Gib ein valides JSON-Array zurück im Format:
[
  {{
    "start": float,
    "end": float,
    "summary": "Kurze Beschreibung des Inhalts"
  }}
]

TRANSKRIPT:
{block_text}
"""
    log_text(f"block_prompt_{i+1}.txt", prompt)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)

        print(f"✅ {len(clips)} clips received in block {i+1}")

        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                append_error_log(f"⛔ Clip filter error: {clip}\n{e}")

        print(f"📈 Running clip total: {len(all_clips)}")

        # Estimate remaining time
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Estimated time remaining: {eta:.1f} seconds")

    except Exception as e:
        append_error_log(f"❌ OpenAI error in block {i+1}: {e}")
        print(f"❌ Error in block {i+1}: {e}")

# === SAVE TO DB
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS segments")
cur.execute("""
    CREATE TABLE segments (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        duration REAL,
        text TEXT,
        summary TEXT
    )
""")

inserted = 0
failed = 0
for clip in all_clips:
    try:
        start = float(clip["start"])
        end = float(clip["end"])
        duration = float(clip["duration"])
        summary = clip.get("summary", "")
        # debug=True prints every segment text used per clip
        original_text = get_original_text(clip, segments, debug=False)
        if end <= start or start < 0:
            raise ValueError("Invalid times")
        cur.execute(
            "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
            (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
        )
        inserted += 1
    except Exception as e:
        failed += 1
        append_error_log(f"❌ DB error: {clip}\n{e}")

conn.commit()
conn.close()

print("\n📊 Results:")
print(f"   ✅ Clips saved: {inserted}")
print(f"   ❌ Failed clips: {failed}")
print(f"📁 Logs saved in: {LOG_DIR.resolve()}")
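The segment lookup in get_original_text relies on the standard interval-overlap test. A self-contained sketch with hypothetical numbers (overlaps is an illustrative helper, not part of the commit):

    def overlaps(a_start: float, a_end: float, b_start: float, b_end: float) -> bool:
        # Two intervals overlap unless one ends before the other begins.
        return not (a_end < b_start or a_start > b_end)

    assert overlaps(10.0, 20.0, 18.0, 30.0)       # share 18–20
    assert not overlaps(10.0, 20.0, 20.5, 30.0)   # disjoint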
1  text-clustering  Submodule
@@ -0,0 +1 @@
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb
108  transcription.py  Normal file
@@ -0,0 +1,108 @@
# transcription.py: chunked Whisper transcription
import whisper
from pathlib import Path
import os
import json
import ffmpeg
import tempfile

# === Settings ===
input_file = Path("input/testVideoShort.mov")
output_dir = Path("transkripte")
output_dir.mkdir(parents=True, exist_ok=True)

output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"

CHUNKS = 4      # number of chunks (adjust!)
OVERLAP = 2.0   # seconds of overlap between chunks

# Redirect Whisper's model cache into the project; must be set before load_model().
os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")

probe = ffmpeg.probe(str(input_file))
duration = float(probe["format"]["duration"])
print(f"🎥 Video length: {duration:.2f} seconds")

def extract_audio_chunk(start_time, duration, output_path):
    # 16 kHz mono PCM WAV, the input format Whisper expects
    ffmpeg.input(str(input_file), ss=start_time, t=duration).output(
        str(output_path),
        format="wav",
        acodec="pcm_s16le",
        ac=1,
        ar="16000",
        loglevel="error"
    ).overwrite_output().run()

def is_suspect(text):
    # Heuristic for Whisper hallucination loops: flag a line if one token makes up
    # more than 60% of its words or repeats more than 20 times.
    words = text.strip().lower().split()
    if not words:
        return True
    most_common = max([words.count(w) for w in set(words)])
    return most_common / len(words) > 0.6 or most_common > 20
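# Hypothetical examples of the heuristic above:
assert is_suspect("ja ja ja ja ja ja ja ja ja ja")          # one token dominates
assert not is_suspect("Das war ein spannendes Gespräch.")   # normal sentence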
tmp_dir = Path(tempfile.mkdtemp())
all_segments = []

# Load the model once instead of once per chunk; switch to "medium" or "large" if desired.
model = whisper.load_model("small")

print(f"✂️ Splitting audio into {CHUNKS} chunks ...")
for i in range(CHUNKS):
    chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
    chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
    chunk_dur = chunk_end - chunk_start
    chunk_file = tmp_dir / f"chunk_{i}.wav"
    print(f"🔉 Extracting chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
    extract_audio_chunk(chunk_start, chunk_dur, chunk_file)

    print(f"🧠 Transcribing chunk {i+1} ...")
    result = model.transcribe(
        str(chunk_file),
        language="de",
        fp16=False,
        word_timestamps=False,
        condition_on_previous_text=True,
        temperature=0,
        verbose=False
    )

    segments = result["segments"]
    # Shift timestamps by the chunk offset so they refer to the full video
    offset = chunk_start
    for seg in segments:
        seg["start"] += offset
        seg["end"] += offset
    all_segments.extend(segments)

# === Sort and filter duplicate/overlapping segments
all_segments.sort(key=lambda x: x["start"])

def segment_hash(seg):
    return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())

unique_segments = []
seen = set()
for seg in all_segments:
    h = segment_hash(seg)
    if h not in seen:
        seen.add(h)
        unique_segments.append(seg)

print(f"✅ {len(unique_segments)} unique segments transcribed in total.")

with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
    for seg in unique_segments:
        start = seg["start"]
        end = seg["end"]
        text = seg["text"].strip()
        line = f"[{start:.2f} – {end:.2f}] {text}\n"
        f.write(line)  # ALWAYS goes into the main transcript!
        if is_suspect(text):
            f_sus.write(line)

print(f"📝 Time-stamped transcript saved to: {output_txt}")
print(f"⚠️ Suspect lines saved to: {suspect_txt}")

with open(output_json, "w", encoding="utf-8") as f:
    json.dump(unique_segments, f, ensure_ascii=False, indent=2)
print(f"💾 Segment data saved to: {output_json}")
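For reference, how the chunk boundaries in the loop above work out, using hypothetical values duration = 600 s, CHUNKS = 4, OVERLAP = 2.0: each inner boundary is padded by the overlap so segments that straddle a cut are not lost.

    duration, CHUNKS, OVERLAP = 600.0, 4, 2.0
    for i in range(CHUNKS):
        chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
        chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
        print(f"chunk {i}: {chunk_start:.0f}s – {chunk_end:.0f}s")
    # chunk 0: 0s – 152s
    # chunk 1: 148s – 302s
    # chunk 2: 298s – 452s
    # chunk 3: 448s – 600s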
BIN  transkripte/.DS_Store  vendored  Normal file
Binary file not shown.
1  whisper.cpp  Submodule
@@ -0,0 +1 @@
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243