# Source listing metadata (as originally published): Python, 70 lines, 2.1 KiB.
import os
import re
import tempfile

import certifi
import whisper
from pyannote.audio import Pipeline
from pydub import AudioSegment

# --- Configuration ---------------------------------------------------------
AUDIO_PATH = "SVSV011.MP3"
OUTPUT_DIR = "output_phrases"
LANGUAGE = "sv"

# Adjust these once you know which diarization label is the male and which
# is the female voice. pyannote's speaker-diarization pipeline names its
# speakers with a zero-padded index ("SPEAKER_00", "SPEAKER_01", ...), so
# un-padded labels such as "SPEAKER_0" never match any segment.
MALE_SPEAKER = "SPEAKER_00"
FEMALE_SPEAKER = "SPEAKER_01"

# Maximum length of the transcript-derived part of an output file name.
MAX_STEM_LENGTH = 60

# Everything except word characters, Swedish umlauts and spaces is dropped
# from transcripts before they are used as file names.
_FILENAME_JUNK = re.compile(r"[^\wäöåÄÖÅ ]+")


def to_milliseconds(seconds):
    """Convert a time in seconds to whole milliseconds (truncating)."""
    return int(seconds * 1000)


def collect_segments(diarization):
    """Flatten a diarization result into a start-sorted list of dicts.

    Args:
        diarization: Object exposing ``itertracks(yield_label=True)`` that
            yields ``(turn, track, speaker)`` triples where ``turn`` has
            ``start`` and ``end`` attributes.

    Returns:
        A list of ``{"start", "end", "speaker"}`` dicts ordered by start time.
    """
    segments = [
        {"start": turn.start, "end": turn.end, "speaker": speaker}
        for turn, _, speaker in diarization.itertracks(yield_label=True)
    ]
    segments.sort(key=lambda seg: seg["start"])
    return segments


def pair_speaker_turns(segments, first_speaker, second_speaker):
    """Find adjacent segments where ``first_speaker`` is followed by ``second_speaker``.

    A segment that was consumed as part of a pair is not reused for the next
    pair; segments that do not start a pair are skipped one at a time.

    Args:
        segments: Start-sorted list of segment dicts with a ``"speaker"`` key.
        first_speaker: Label that must open the pair.
        second_speaker: Label that must directly follow.

    Returns:
        A list of ``(first_segment, second_segment)`` tuples in file order.
    """
    pairs = []
    index = 0
    while index < len(segments) - 1:
        current, following = segments[index], segments[index + 1]
        if current["speaker"] == first_speaker and following["speaker"] == second_speaker:
            pairs.append((current, following))
            index += 2
        else:
            index += 1
    return pairs


def slugify_phrase(text, max_length=MAX_STEM_LENGTH, fallback="phrase"):
    """Turn a transcript into a file-name stem.

    The text is lower-cased, punctuation is removed, spaces become
    underscores and the result is cut to ``max_length`` characters.
    Leading/trailing underscores are trimmed, and ``fallback`` is returned
    when nothing usable is left (e.g. an empty transcript), so the caller
    never ends up with a bare ``".wav"`` file name.
    """
    cleaned = _FILENAME_JUNK.sub("", text.lower()).strip()
    stem = cleaned.replace(" ", "_")[:max_length].strip("_")
    return stem or fallback


def unique_output_path(directory, stem, extension=".wav"):
    """Return a path in ``directory`` for ``stem`` that does not exist yet.

    If ``stem + extension`` is already taken, ``_2``, ``_3``, ... is appended
    to the stem so that repeated phrases do not overwrite earlier clips.
    """
    candidate = os.path.join(directory, stem + extension)
    counter = 2
    while os.path.exists(candidate):
        candidate = os.path.join(directory, f"{stem}_{counter}{extension}")
        counter += 1
    return candidate


def main():
    """Diarize AUDIO_PATH and export every male->female exchange as a WAV clip.

    Each clip spans from the start of the male turn to the end of the
    following female turn and is named after the Whisper transcript of the
    female part.

    Raises:
        RuntimeError: If the diarization pipeline could not be loaded.
    """
    # Use certifi's CA bundle for TLS; presumably needed so the model
    # downloads below can verify certificates on this machine.
    os.environ['SSL_CERT_FILE'] = certifi.where()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Load the audio.
    audio = AudioSegment.from_file(AUDIO_PATH)

    # 1) Load the diarization pipeline (requires a Hugging Face token).
    pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization")
    if pipeline is None:
        # NOTE(review): some pyannote.audio versions return None instead of
        # raising when the gated model cannot be downloaded.
        raise RuntimeError(
            "Diarization-Pipeline konnte nicht geladen werden "
            "(HuggingFace-Token / Modellzugriff prüfen)."
        )

    print("🔍 Diarization starten...")
    diarization = pipeline(AUDIO_PATH)

    # 2) Group the turns by speaker, sorted by start time, and
    # 3) look for male -> female exchanges.
    segments = collect_segments(diarization)
    pairs = pair_speaker_turns(segments, MALE_SPEAKER, FEMALE_SPEAKER)

    if not pairs:
        detected = sorted({seg["speaker"] for seg in segments})
        print(f"⚠️ Keine Phrasen gefunden (erkannte Sprecher: {detected})")
    else:
        # 4) Load Whisper only when there is something to transcribe.
        model = whisper.load_model("medium")

        # The intermediate WAV lives in a scratch directory that is removed
        # automatically, so no temp file is left in the working directory.
        with tempfile.TemporaryDirectory() as scratch_dir:
            temp_wav = os.path.join(scratch_dir, "temp_female.wav")

            for male_seg, female_seg in pairs:
                female_start_ms = to_milliseconds(female_seg["start"])
                end_ms = to_milliseconds(female_seg["end"])
                clip = audio[to_milliseconds(male_seg["start"]):end_ms]

                # Transcribe only the female part; it names the clip.
                audio[female_start_ms:end_ms].export(temp_wav, format="wav")
                result = model.transcribe(temp_wav, language=LANGUAGE)
                female_text = result['text'].strip()

                out_path = unique_output_path(OUTPUT_DIR, slugify_phrase(female_text))
                clip.export(out_path, format="wav")
                filename = os.path.basename(out_path)
                print(f"✅ Phrase gespeichert: {filename}")

    print("🏁 Alles fertig!")


if __name__ == "__main__":
    main()