"""Cut a recording into one WAV clip per repeated phrase.

The audio file is transcribed with Whisper.  Adjacent transcript segments
whose texts are similar enough are treated as the same phrase spoken twice
(per the original notes: first by a male, then by a female speaker), and the
audio spanning both segments is exported as a single clip named after the
second segment's text.
"""

import os
import re
from difflib import SequenceMatcher

# ---------- CONFIG ----------
AUDIO_PATH = "015/SVSV015.MP3"
OUTPUT_DIR = "015/phrases"
MODEL_SIZE = "base"
LANG = "sv"
SIMILARITY_THRESHOLD = 0.85
# ----------------------------

# Upper bound on the file-name stem so that long transcript segments cannot
# exceed common file-system name limits.
MAX_STEM_LENGTH = 100

# Stem used when a phrase contains no characters usable in a file name.
FALLBACK_STEM = "unnamed"

# Every run of characters that is not a word character is collapsed to "_".
# For str patterns ``\w`` is already Unicode-aware, so the explicit Swedish
# letters are redundant; the pattern is kept as it was.
_UNSAFE_CHARS = re.compile(r"[^\wäöåÄÖÅ]+")


def phrase_filename(phrase_text, used_stems):
    """Return a unique ``.wav`` file name derived from *phrase_text*.

    Args:
        phrase_text: Transcript text to turn into a file name.
        used_stems: Set of stems already handed out during this run.  The
            stem chosen here is added to it.

    Returns:
        A file name ending in ``.wav``.  If the sanitized text is empty,
        ``FALLBACK_STEM`` is used; if the stem was already used, a numeric
        suffix (``_2``, ``_3``, ...) is appended so that an earlier clip is
        not overwritten.
    """
    stem = _UNSAFE_CHARS.sub("_", phrase_text).strip("_")
    # Truncate, then strip again in case the cut landed on a separator.
    stem = stem[:MAX_STEM_LENGTH].rstrip("_") or FALLBACK_STEM

    candidate = stem
    counter = 2
    while candidate in used_stems:
        candidate = f"{stem}_{counter}"
        counter += 1
    used_stems.add(candidate)
    return f"{candidate}.wav"


def iter_phrase_clips(segments, threshold=SIMILARITY_THRESHOLD):
    """Yield ``(filename, start, end)`` for every matching segment pair.

    Args:
        segments: Sequence of mappings with ``"text"``, ``"start"`` and
            ``"end"`` keys (the ``"segments"`` list of a Whisper result).
        threshold: Minimum ``SequenceMatcher`` ratio for two adjacent
            segments to count as the same phrase.

    Yields:
        The output file name plus the clip boundaries, in the same unit as
        the segments' ``"start"``/``"end"`` values.

    Progress is printed while iterating.  This is a generator so that the
    caller's own output stays interleaved with these messages.
    """
    used_stems = set()
    i = 0
    while i < len(segments) - 1:
        first = segments[i]
        second = segments[i + 1]
        text1 = first["text"].strip().lower()
        text2 = second["text"].strip().lower()
        print(text1)
        print(text2)

        ratio = SequenceMatcher(None, text1, text2).ratio()
        if ratio < threshold:
            print(f"⛔ Unterschiedliche Texte übersprungen:\n Mann: {text1}\n Frau: {text2}")
            # Advance by one only, so the second segment can still pair up
            # with the segment that follows it.
            i += 1
            continue

        # The second (female) phrase provides the file name.
        filename = phrase_filename(text2, used_stems)

        # The clip runs from the start of the first segment to the end of the
        # second one, but never past the start of the following segment.
        start = first["start"]
        if i + 2 < len(segments):
            end = min(second["end"], segments[i + 2]["start"])
        else:
            end = second["end"]

        yield filename, start, end
        i += 2


def main():
    """Transcribe ``AUDIO_PATH`` and export one clip per matched phrase."""
    # The third-party imports are deferred so that the helpers above can be
    # imported without Whisper or pydub being installed.
    import whisper
    from pydub import AudioSegment

    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load model & audio.
    model = whisper.load_model(MODEL_SIZE)
    result = model.transcribe(AUDIO_PATH, language=LANG, word_timestamps=False)
    audio = AudioSegment.from_file(AUDIO_PATH)

    for filename, start, end in iter_phrase_clips(result["segments"]):
        filepath = os.path.join(OUTPUT_DIR, filename)
        # The boundaries are scaled by 1000 before slicing the audio.
        clip = audio[int(start * 1000):int(end * 1000)]
        clip.export(filepath, format="wav")
        print(f"💾 Gespeichert: {filename} ({start:.2f}s – {end:.2f}s)")


if __name__ == "__main__":
    main()