from pyannote.audio import Pipeline
import whisper
from pydub import AudioSegment
import os
import re
import certifi

os.environ['SSL_CERT_FILE'] = certifi.where()

AUDIO_PATH = "SVSV011.MP3"
OUTPUT_DIR = "output_phrases"
LANGUAGE = "sv"

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load audio
audio = AudioSegment.from_file(AUDIO_PATH)

# 1) Load the diarization pipeline (requires a HuggingFace token;
#    pass use_auth_token=... to from_pretrained if the model is gated)
pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")

print("🔍 Starting diarization...")
diarization = pipeline(AUDIO_PATH)

# 2) Load the Whisper model
model = whisper.load_model("medium")

# 3) Collect the segments with their speaker labels
segments = []
for turn, _, speaker in diarization.itertracks(yield_label=True):
    segments.append({
        "start": turn.start,
        "end": turn.end,
        "speaker": speaker
    })

# 4) Sort segments by start time
segments = sorted(segments, key=lambda x: x["start"])

# 5) Build the male-female phrase pairs and cut them out.
# Adjust these once you know which speaker is male and which is female
# (note: pyannote assigns labels like "SPEAKER_00", "SPEAKER_01", ...,
# not "SPEAKER_0"/"SPEAKER_1").
male_speaker = "SPEAKER_00"
female_speaker = "SPEAKER_01"

i = 0
while i < len(segments) - 1:
    if segments[i]['speaker'] == male_speaker and segments[i + 1]['speaker'] == female_speaker:
        start_ms = int(segments[i]['start'] * 1000)
        end_ms = int(segments[i + 1]['end'] * 1000)
        clip = audio[start_ms:end_ms]

        # Transcribe only the female part
        female_audio = audio[int(segments[i + 1]['start'] * 1000):int(segments[i + 1]['end'] * 1000)]
        female_audio.export("temp_female.wav", format="wav")
        result = model.transcribe("temp_female.wav", language=LANGUAGE)
        female_text = result['text'].strip()

        # Build a filesystem-safe filename from the transcription
        filename = re.sub(r"[^\wäöåÄÖÅ ]+", "", female_text.lower()).strip().replace(" ", "_")
        filename = (filename[:60] or f"phrase_{i}") + ".wav"  # fall back if the transcription came back empty

        clip.export(os.path.join(OUTPUT_DIR, filename), format="wav")
        print(f"✅ Saved phrase: {filename}")
        i += 2
    else:
        i += 1

# Remove the scratch file left over from transcription
if os.path.exists("temp_female.wav"):
    os.remove("temp_female.wav")

print("🏁 All done!")
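
# ------------------------------------------------------------------
# Optional sanity check (a minimal sketch, not part of the pipeline):
# pyannote assigns its speaker labels arbitrarily, and which label
# belongs to the male voice varies per recording. Before trusting the
# male_speaker/female_speaker choices above, print the labels and turn
# boundaries actually found. Uses only the `diarization` object
# produced in step 1.
labels = sorted({speaker for _, _, speaker in diarization.itertracks(yield_label=True)})
print(f"ℹ️ Speaker labels found: {labels}")
for turn, _, speaker in diarization.itertracks(yield_label=True):
    print(f"  {speaker}: {turn.start:.1f}s – {turn.end:.1f}s")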