Initialer Stand der Bachelorarbeit
This commit is contained in:
commit
84b29e1eaf
8
.idea/.gitignore
generated
vendored
Normal file
8
.idea/.gitignore
generated
vendored
Normal file
@ -0,0 +1,8 @@
|
||||
# Default ignored files
|
||||
/shelf/
|
||||
/workspace.xml
|
||||
# Editor-based HTTP Client requests
|
||||
/httpRequests/
|
||||
# Datasource local storage ignored files
|
||||
/dataSources/
|
||||
/dataSources.local.xml
|
11
.idea/BachlorArbeit.iml
generated
Normal file
11
.idea/BachlorArbeit.iml
generated
Normal file
@ -0,0 +1,11 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<module type="PYTHON_MODULE" version="4">
|
||||
<component name="NewModuleRootManager">
|
||||
<content url="file://$MODULE_DIR$">
|
||||
<sourceFolder url="file://$MODULE_DIR$" isTestSource="false" />
|
||||
<excludeFolder url="file://$MODULE_DIR$/.venv" />
|
||||
</content>
|
||||
<orderEntry type="jdk" jdkName="Python 3.10 (BachlorArbeit)" jdkType="Python SDK" />
|
||||
<orderEntry type="sourceFolder" forTests="false" />
|
||||
</component>
|
||||
</module>
|
35
.idea/dataSources.xml
generated
Normal file
35
.idea/dataSources.xml
generated
Normal file
@ -0,0 +1,35 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="DataSourceManagerImpl" format="xml" multifile-model="true">
|
||||
<data-source source="LOCAL" name="segments" uuid="b474bded-3824-407e-9dc9-bcc11057235d">
|
||||
<driver-ref>sqlite.xerial</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
|
||||
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/segments.db</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
<libraries>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
|
||||
</library>
|
||||
</libraries>
|
||||
</data-source>
|
||||
<data-source source="LOCAL" name="clips_openai" uuid="50f21c9a-9baf-4dc5-9c33-fde0fd385e38">
|
||||
<driver-ref>sqlite.xerial</driver-ref>
|
||||
<synchronize>true</synchronize>
|
||||
<jdbc-driver>org.sqlite.JDBC</jdbc-driver>
|
||||
<jdbc-url>jdbc:sqlite:$PROJECT_DIR$/clips_openai.db</jdbc-url>
|
||||
<working-dir>$ProjectFileDir$</working-dir>
|
||||
<libraries>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/xerial/sqlite-jdbc/3.45.1.0/sqlite-jdbc-3.45.1.0.jar</url>
|
||||
</library>
|
||||
<library>
|
||||
<url>file://$APPLICATION_CONFIG_DIR$/jdbc-drivers/Xerial SQLiteJDBC/3.45.1/org/slf4j/slf4j-api/1.7.36/slf4j-api-1.7.36.jar</url>
|
||||
</library>
|
||||
</libraries>
|
||||
</data-source>
|
||||
</component>
|
||||
</project>
|
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
6
.idea/inspectionProfiles/profiles_settings.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<component name="InspectionProjectProfileManager">
|
||||
<settings>
|
||||
<option name="USE_PROJECT_PROFILE" value="false" />
|
||||
<version value="1.0" />
|
||||
</settings>
|
||||
</component>
|
6
.idea/misc.xml
generated
Normal file
6
.idea/misc.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="Black">
|
||||
<option name="sdkName" value="Python 3.10 (BachlorArbeit)" />
|
||||
</component>
|
||||
</project>
|
8
.idea/modules.xml
generated
Normal file
8
.idea/modules.xml
generated
Normal file
@ -0,0 +1,8 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="ProjectModuleManager">
|
||||
<modules>
|
||||
<module fileurl="file://$PROJECT_DIR$/.idea/BachlorArbeit.iml" filepath="$PROJECT_DIR$/.idea/BachlorArbeit.iml" />
|
||||
</modules>
|
||||
</component>
|
||||
</project>
|
6
.idea/sqldialects.xml
generated
Normal file
6
.idea/sqldialects.xml
generated
Normal file
@ -0,0 +1,6 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="SqlDialectMappings">
|
||||
<file url="file://$PROJECT_DIR$/rateCluster.py" dialect="SQLite" />
|
||||
</component>
|
||||
</project>
|
9
.idea/vcs.xml
generated
Normal file
9
.idea/vcs.xml
generated
Normal file
@ -0,0 +1,9 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<project version="4">
|
||||
<component name="VcsDirectoryMappings">
|
||||
<mapping directory="$PROJECT_DIR$/models/distiluse-base-multilingual-cased-v2" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/text-clustering" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/whisper.cpp" vcs="Git" />
|
||||
<mapping directory="$PROJECT_DIR$/whisper.cpp/whisper.cpp" vcs="Git" />
|
||||
</component>
|
||||
</project>
|
38
cutClips.py
Normal file
38
cutClips.py
Normal file
@ -0,0 +1,38 @@
|
||||
from moviepy.video.io.VideoFileClip import VideoFileClip
from pathlib import Path
import sqlite3

# === Setup ===
# Source video and target directory for the exported highlight clips.
input_video = Path("input/testVideoShort.mov")
output_dir = Path("output")
output_dir.mkdir(parents=True, exist_ok=True)

# === Read SQLite DB ===
db_path = "clips_openai.db"
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Only the top 10 clips with the highest score_total.
cursor.execute("""
    SELECT start, end, text
    FROM highlights
    ORDER BY score_total DESC
    LIMIT 10
""")
highlights = cursor.fetchall()

# === Load video ===
video = VideoFileClip(str(input_video))

# === Cut clips ===
for i, (start, end, text) in enumerate(highlights):
    output_file = output_dir / f"highlight_{i+1}.mp4"
    # Make sure the end does not run past the end of the video.
    end = min(end, video.duration)
    # ROBUSTNESS FIX: after clamping, a clip whose start lies at/after the
    # video end would yield start >= end and crash inside moviepy — skip it.
    if start >= end:
        continue
    print(f"🎬 Exportiere Clip {i+1}: {start:.2f}s – {end:.2f}s → {output_file.name}")
    clip = video.subclipped(start, end)
    clip.write_videofile(str(output_file), codec="libx264", audio_codec="aac")

# === Cleanup ===
conn.close()
video.close()
print("✅ Top 10 Clips exportiert.")
|
135
rateCluster.py
Normal file
135
rateCluster.py
Normal file
@ -0,0 +1,135 @@
|
||||
import os
import sqlite3
import re
from openai import OpenAI
from time import sleep

# === Settings ===
DB_PATH = "clips_openai.db"
VIDEO_ID = "testVideoShort"
MAX_CLIPS = 5  # or "all"

# SECURITY FIX: the previous revision hard-coded (and thereby leaked) a live
# OpenAI API key in source control. Read it from the environment instead —
# and revoke the leaked key.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

client = OpenAI(api_key=OPENAI_API_KEY)

# === DB connection ===
conn = sqlite3.connect(DB_PATH)
cursor = conn.cursor()

# Rebuild the highlights table from scratch for this run.
cursor.execute("DROP TABLE IF EXISTS highlights")
cursor.execute("""
    CREATE TABLE highlights (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        text TEXT,
        viralitaet INTEGER,
        emotionalitaet INTEGER,
        witz INTEGER,
        provokation INTEGER,
        score_total INTEGER
    )
""")
conn.commit()
print(f"🧹 Tabelle 'highlights' neu erstellt für: {VIDEO_ID}")

# === Load segments ===
cursor.execute("SELECT start, end, text FROM segments ORDER BY start")
segments = cursor.fetchall()
print(f"📥 {len(segments)} Segmente (Originaltext) geladen.")
|
||||
|
||||
# === Bewertungsfunktion (GPT-4o)
|
||||
def analyse_segment(text, start, end):
    """Score one transcript segment with GPT-4o and persist it.

    Sends *text* to the model, parses the four 1-10 ratings (Viralität,
    Emotionalität, Witz, Provokation) from the reply, inserts the rated clip
    into the ``highlights`` table and returns a result dict, or ``None`` when
    the model reply could not be parsed completely.

    Relies on the module-level ``client``, ``cursor``, ``conn`` and
    ``VIDEO_ID`` objects.
    """
    print(f"\n🔎 Bewerte Clip: {start:.2f}s – {end:.2f}s")

    prompt = f"""
Bewerte folgenden Podcast-Ausschnitt mit genau vier Zahlen zwischen 1 und 10. Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.

\"\"\"{text}\"\"\"

Dauer: {start:.2f} bis {end:.2f} Sekunden.

Antwortformat (bitte exakt einhalten, keine weiteren Kommentare):
Viralität: [Zahl]
Emotionalität: [Zahl]
Witz: [Zahl]
Provokation: [Zahl]
"""

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        output = response.choices[0].message.content.strip()
        print(f"📤 GPT-Antwort:\n{output}")

        values = {
            "viralitaet": None,
            "emotionalitaet": None,
            "witz": None,
            "provokation": None
        }

        for line in output.splitlines():
            # Normalize umlauts so e.g. "Viralität:" matches the ASCII keys.
            line = line.strip().lower().replace("ä", "ae")
            for key in values:
                if line.startswith(key):
                    # FIX: re.search can return None (e.g. "Witz: hoch");
                    # the old copy-pasted branches crashed with an
                    # AttributeError in that case. Leave the value as None
                    # so the completeness check below reports it cleanly.
                    number = re.search(r"\d+", line)
                    if number:
                        values[key] = int(number.group())
                    break

        if all(v is not None for v in values.values()):
            total_score = sum(values.values())
            cursor.execute("""
                INSERT INTO highlights (
                    file, start, end, text,
                    viralitaet, emotionalitaet, witz, provokation, score_total
                ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """, (
                VIDEO_ID, start, end, text.strip(),
                values["viralitaet"], values["emotionalitaet"],
                values["witz"], values["provokation"],
                total_score
            ))
            conn.commit()

            return {
                "start": start,
                "end": end,
                "text": text.strip(),
                "score": values,
                "total": total_score
            }
        else:
            raise ValueError("Unvollständige Bewertung")
    except Exception as e:
        print(f"⚠️ Fehler bei GPT-Auswertung: {e}")
        return None
|
||||
|
||||
# === Rate all clips ===
rated = []
for start, end, text in segments:
    scored = analyse_segment(text, float(start), float(end))
    if scored is not None:
        rated.append(scored)
    sleep(1.2)  # crude rate-limit guard between API calls

# === Show the best clips ===
selected = sorted(rated, key=lambda c: c["total"], reverse=True)
if MAX_CLIPS != "all":
    selected = selected[:int(MAX_CLIPS)]

print(f"\n🎬 Beste {len(selected)} Highlights für: {VIDEO_ID}")
for clip in selected:
    print(f"\n🚀 {clip['start']:.2f}s – {clip['end']:.2f}s")
    print(f"🎙️ {clip['text'][:200]}...")
    print("📊 Bewertung:")
    for name, score in clip["score"].items():
        print(f"   {name.capitalize()}: {score}")
    print(f"   👉 Gesamt: {clip['total']}")

conn.close()
|
196
segment_transcript.py
Normal file
196
segment_transcript.py
Normal file
@ -0,0 +1,196 @@
|
||||
import json
import os
import sqlite3
import re
from pathlib import Path
from openai import OpenAI
from datetime import datetime
import time
import nltk

# NOTE(review): nltk is not referenced anywhere else in this script — the
# punkt download looks like a leftover. Kept to preserve behavior; confirm
# and remove together with the import.
nltk.download("punkt")

# === SETTINGS ===
TRANSCRIPT_PATH = Path("transkripte/testVideoShort_timed.txt")
DB_PATH = Path("clips_openai.db")
LOG_DIR = Path("logs")
LOG_DIR.mkdir(exist_ok=True)
BLOCK_DURATION = 300  # seconds of transcript per GPT request block
MIN_CLIP_LEN = 5      # shortest accepted clip, in seconds
MAX_CLIP_LEN = 90     # longest accepted clip, in seconds

# SECURITY FIX: the previous revision hard-coded (and thereby leaked) a live
# OpenAI API key. Read it from the environment instead and revoke the old key.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
|
||||
|
||||
# === HILFSFUNKTIONEN ===
|
||||
def log_text(filename, content):
    """Write *content* (stripped of surrounding whitespace) to LOG_DIR/filename as UTF-8."""
    target = LOG_DIR / filename
    target.write_text(content.strip(), encoding="utf-8")
|
||||
|
||||
def append_error_log(content):
    """Append one error entry (terminated by a blank line) to logs/errors.txt."""
    error_file = LOG_DIR / "errors.txt"
    with error_file.open("a", encoding="utf-8") as log:
        log.write(content + "\n\n")
|
||||
|
||||
def extract_json(text):
    """Extract the first JSON array embedded in a model reply.

    Returns the parsed list, or ``[]`` when no ``[...]`` span is present or
    it does not parse as JSON (the parse error is appended to the error log).
    """
    match = re.search(r"\[.*\]", text.strip(), re.DOTALL)
    if match:
        try:
            return json.loads(match.group())
        # FIX: narrowed from a bare `except Exception` — json.loads raises
        # json.JSONDecodeError (a ValueError) for bad input; any other
        # exception would indicate a real bug and should surface.
        except ValueError as e:
            append_error_log(f"❌ JSON-Fehler: {e}\n{text}")
    return []
|
||||
|
||||
def get_original_text(clip, segments, debug=False):
    """Concatenate the text of every transcript segment overlapping *clip*.

    A segment counts as overlapping when its [start, end] interval shares any
    time with the clip's interval (touching endpoints included). With
    debug=True the used segments are printed for inspection.
    """
    texts = []
    used_segments = []
    for seg in segments:
        # Positive form of the overlap test: the two intervals share time.
        overlaps = seg["end"] >= clip["start"] and seg["start"] <= clip["end"]
        if overlaps:
            texts.append(seg["text"])
            used_segments.append(f"[{seg['start']:.2f} – {seg['end']:.2f}] {seg['text']}")
    if debug:
        print(f"\n🟢 Clip {clip['start']}–{clip['end']} nutzt Segmente:\n" +
              "\n".join(used_segments))
    return " ".join(texts).strip()
|
||||
|
||||
# === READ TRANSCRIPT ===
# Each transcript line looks like "[12.34 – 56.78] spoken text"; the dash
# between the timestamps may be an en dash or a plain hyphen.
lines = TRANSCRIPT_PATH.read_text(encoding="utf-8").splitlines()
segments = []
for line in lines:
    match = re.match(r"\[(\d+(?:\.\d+)?)\s*[–-]\s*(\d+(?:\.\d+)?)\]\s*(.*)", line)
    if match:
        start, end, text = match.groups()
        start = float(start)
        end = float(end)
        # Drop very short segments (< 2 s) — typically noise or fragments.
        if end - start >= 2.0:
            segments.append({"start": start, "end": end, "text": text.strip()})

if not segments:
    raise RuntimeError("🚫 Keine gültigen Segmente gefunden.")
print(f"✅ {len(segments)} gültige Transkriptsegmente geladen.")

# === BUILD BLOCKS ===
# Group consecutive segments into blocks of roughly BLOCK_DURATION seconds,
# so each GPT request sees a bounded slice of the transcript.
blocks = []
current_block = []
current_start = 0.0
for seg in segments:
    if seg["end"] - current_start > BLOCK_DURATION:
        blocks.append(current_block)
        current_block = []
        current_start = seg["start"]
    current_block.append(seg)
if current_block:
    blocks.append(current_block)
print(f"🧱 {len(blocks)} Blöcke erstellt (à ~{BLOCK_DURATION}s).")
|
||||
|
||||
# === AI: CLIP SELECTION ===
# Send each transcript block to GPT-4o and collect the suggested clips.
all_clips = []
start_time = time.perf_counter()

for i, block in enumerate(blocks):
    if not block:
        continue

    print(f"\n🤖 Sende Block {i+1}/{len(blocks)} an GPT-4o...")

    # One "[start – end] text" line per segment, exactly as shown to the model.
    block_text = "\n".join([f"[{s['start']} – {s['end']}] {s['text']}" for s in block])
    prompt = f"""
Du bekommst einen Transkriptblock mit Zeitangaben. Extrahiere daraus 1–3 besonders interessante Abschnitte, die sich als eigenständige Clips eignen.
Achte darauf das es abgeschlossene Clips sind und als eigenstaendiger Clip funktionieren kann.

Ein guter Clip:
- ist abgeschlossen und verständlich
- enthält eine Pointe, Erkenntnis oder einen emotionalen Moment
- wirkt wie ein Mini-Ausschnitt mit Anfang, Spannungsbogen, Auflösung oder Punchline
- ist **mindestens 30 Sekunden lang**

Nutze ausschließlich die vorhandenen Start- und Endzeiten – keine neuen erfinden.

Gib ein valides JSON-Array zurück im Format:
[
  {{
    "start": float,
    "end": float,
    "summary": "Kurze Beschreibung des Inhalts"
  }}
]

TRANSKRIPT:
{block_text}
"""
    # Persist the prompt (and, below, the raw reply) for post-hoc debugging.
    log_text(f"block_prompt_{i+1}.txt", prompt)

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.4
        )
        raw = response.choices[0].message.content
        log_text(f"block_output_{i+1}.txt", raw)
        clips = extract_json(raw)

        print(f"✅ {len(clips)} Clips empfangen in Block {i+1}")

        # Keep only clips whose duration lies in [MIN_CLIP_LEN, MAX_CLIP_LEN].
        for clip in clips:
            try:
                dur = float(clip["end"]) - float(clip["start"])
                if MIN_CLIP_LEN <= dur <= MAX_CLIP_LEN:
                    clip["duration"] = round(dur, 2)
                    all_clips.append(clip)
            except Exception as e:
                append_error_log(f"⛔ Clip-Filterfehler: {clip}\n{e}")

        print(f"📈 Aktueller Clip-Gesamtstand: {len(all_clips)}")

        # Estimate remaining time from the average seconds per processed block.
        elapsed = time.perf_counter() - start_time
        avg_time = elapsed / (i + 1)
        eta = avg_time * (len(blocks) - (i + 1))
        print(f"⏱️ Geschätzte Restzeit: {eta:.1f} Sekunden")

    except Exception as e:
        append_error_log(f"❌ OpenAI-Fehler Block {i+1}: {e}")
        print(f"❌ Fehler bei Block {i+1}: {e}")
||||
|
||||
# === SAVE TO DB ===
# Rebuild the `segments` table with the AI-selected clips, each carrying the
# original transcript text that overlaps its time range.
conn = sqlite3.connect(DB_PATH)
cur = conn.cursor()
cur.execute("DROP TABLE IF EXISTS segments")
cur.execute("""
    CREATE TABLE segments (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        file TEXT,
        start REAL,
        end REAL,
        duration REAL,
        text TEXT,
        summary TEXT
    )
""")

inserted = 0
failed = 0
for clip in all_clips:
    try:
        start = float(clip["start"])
        end = float(clip["end"])
        duration = float(clip["duration"])
        summary = clip.get("summary", "")
        # debug=True prints every segment text used for this clip
        original_text = get_original_text(clip, segments, debug=False)
        if end <= start or start < 0:
            raise ValueError("Ungültige Zeiten")
        cur.execute(
            "INSERT INTO segments (file, start, end, duration, text, summary) VALUES (?, ?, ?, ?, ?, ?)",
            (TRANSCRIPT_PATH.stem, start, end, duration, original_text, summary.strip())
        )
        inserted += 1
    except Exception as e:
        # Count but never abort on a bad clip; details go to the error log.
        failed += 1
        append_error_log(f"❌ DB-Fehler: {clip}\n{e}")

conn.commit()
conn.close()

print("\n📊 Ergebnisse:")
print(f"   ✅ Clips gespeichert: {inserted}")
print(f"   ❌ Fehlerhafte Clips: {failed}")
print(f"📁 Logs gespeichert in: {LOG_DIR.resolve()}")
1
text-clustering
Submodule
1
text-clustering
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 7815f8b37d91b75cf160ed3f0ec8550c0b58cabb
|
108
transcription.py
Normal file
108
transcription.py
Normal file
@ -0,0 +1,108 @@
|
||||
# transcription_chunked.py
# Chunked Whisper transcription: split the input video's audio into CHUNKS
# overlapping pieces, transcribe each piece, then merge the timed segments.
import whisper
from pathlib import Path
import os
import json
import ffmpeg
import tempfile

# === Settings ===
input_file = Path("input/testVideoShort.mov")
output_dir = Path("transkripte")
output_dir.mkdir(parents=True, exist_ok=True)

# Output artifacts: timed transcript, raw segment JSON, suspicious lines.
output_txt = output_dir / f"{input_file.stem}_timed.txt"
output_json = output_dir / f"{input_file.stem}_segments.json"
suspect_txt = output_dir / f"{input_file.stem}_suspect_lines.txt"

CHUNKS = 4  # number of chunks (adjust!)
OVERLAP = 2.0  # seconds of overlap between adjacent chunks

# Keep the Whisper model cache next to this script.
os.environ["XDG_CACHE_HOME"] = str(Path(__file__).parent / "whisper-cache")

# Probe the container for the total duration (needed to size the chunks).
probe = ffmpeg.probe(str(input_file))
duration = float(probe["format"]["duration"])
print(f"🎥 Videolänge: {duration:.2f} Sekunden")
|
||||
|
||||
def extract_audio_chunk(start_time, duration, output_path):
    """Extract a mono 16 kHz PCM WAV slice of the input video's audio track."""
    source = ffmpeg.input(str(input_file), ss=start_time, t=duration)
    sink = source.output(
        str(output_path),
        format="wav",
        acodec="pcm_s16le",
        ac=1,
        ar="16000",
        loglevel="error"
    )
    sink.overwrite_output().run()
|
||||
|
||||
def is_suspect(text):
    """Heuristically flag a transcript line as a likely Whisper hallucination.

    A line is suspect when it is empty or dominated by one repeated word:
    the most frequent word makes up more than 60% of all words, or occurs
    more than 20 times.
    """
    from collections import Counter  # local import: keeps the module header untouched

    words = text.strip().lower().split()
    if not words:
        return True
    # PERFORMANCE FIX: Counter is one O(n) pass; the previous
    # max(words.count(w) for w in set(words)) was O(n * unique).
    most_common = Counter(words).most_common(1)[0][1]
    return most_common / len(words) > 0.6 or most_common > 20
|
||||
|
||||
tmp_dir = Path(tempfile.mkdtemp())
all_segments = []

# PERFORMANCE FIX: load the Whisper model once up front — the previous code
# reloaded the full model from disk on every chunk iteration.
model = whisper.load_model("small")  # switch to "medium" or "large" if desired

print(f"✂️ Teile Audio in {CHUNKS} Chunks ...")
for i in range(CHUNKS):
    # Each chunk overlaps its neighbours by OVERLAP seconds so utterances
    # falling on a chunk boundary are not cut off.
    chunk_start = max(0, i * (duration / CHUNKS) - OVERLAP if i > 0 else 0)
    chunk_end = min(duration, (i + 1) * (duration / CHUNKS) + OVERLAP)
    chunk_dur = chunk_end - chunk_start
    chunk_file = tmp_dir / f"chunk_{i}.wav"
    print(f"🔉 Extrahiere Chunk {i+1}/{CHUNKS}: {chunk_start:.2f}s – {chunk_end:.2f}s")
    extract_audio_chunk(chunk_start, chunk_dur, chunk_file)

    print(f"🧠 Transkribiere Chunk {i+1} ...")
    result = model.transcribe(
        str(chunk_file),
        language="de",
        fp16=False,
        word_timestamps=False,
        condition_on_previous_text=True,
        temperature=0,
        verbose=False
    )

    segments = result["segments"]
    # Shift the chunk-relative timestamps into absolute video time.
    offset = chunk_start
    for seg in segments:
        seg["start"] += offset
        seg["end"] += offset
    all_segments.extend(segments)
|
||||
|
||||
# === Sort and drop duplicate/overlapping segments ===
# The chunk overlap makes the same utterance appear twice; duplicates are
# identified by a normalized (start, end, text) key.
all_segments.sort(key=lambda entry: entry["start"])


def segment_hash(seg):
    """Normalized identity key: times rounded to 2 decimals, text lowercased."""
    return (round(seg["start"], 2), round(seg["end"], 2), seg["text"].strip().lower())


seen = set()
unique_segments = []
for segment in all_segments:
    key = segment_hash(segment)
    if key in seen:
        continue
    seen.add(key)
    unique_segments.append(segment)

print(f"✅ Insgesamt {len(unique_segments)} einzigartige Segmente transkribiert.")
|
||||
|
||||
# === Write outputs ===
# Every segment goes into the main transcript; suspicious lines additionally
# land in the suspect file for manual review.
with open(output_txt, "w", encoding="utf-8") as f, open(suspect_txt, "w", encoding="utf-8") as f_sus:
    for seg in unique_segments:
        cleaned = seg["text"].strip()
        entry = f"[{seg['start']:.2f} – {seg['end']:.2f}] {cleaned}\n"
        f.write(entry)  # always written to the main transcript
        if is_suspect(cleaned):
            f_sus.write(entry)


print(f"📝 Zeitmarkiertes Transkript gespeichert unter: {output_txt}")
print(f"⚠️ Verdächtige Zeilen gespeichert unter: {suspect_txt}")

with open(output_json, "w", encoding="utf-8") as f:
    json.dump(unique_segments, f, ensure_ascii=False, indent=2)
print(f"💾 Segmentdaten gespeichert unter: {output_json}")
BIN
transkripte/.DS_Store
vendored
Normal file
BIN
transkripte/.DS_Store
vendored
Normal file
Binary file not shown.
1
whisper.cpp
Submodule
1
whisper.cpp
Submodule
@ -0,0 +1 @@
|
||||
Subproject commit 2e310b841e0b4e7cf00890b53411dd9f8578f243
|
Loading…
x
Reference in New Issue
Block a user