import gradio as gr
import whisper
import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import subprocess
import wave
import contextlib
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import datetime
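# Assumed runtime dependencies (inferred from the imports above): gradio, openai-whisper,
# torch, pyannote.audio, scikit-learn, numpy, plus an ffmpeg binary on PATH for conversion.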
# Load models
device = torch.device("cpu") # Explicitly set device to CPU
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=device  # Ensure it uses CPU
)
audio_processor = Audio()
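# Note: the ECAPA-TDNN model loaded above produces 192-dimensional speaker embeddings,
# which is why the embeddings array further down is sized (len(segments), 192).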
# Function to process the audio file and extract transcript and diarization
def process_audio(audio_file, num_speakers, model_size="medium", language="English"):
    # With gr.Audio(type="filepath"), Gradio passes a path string, not a file object
    path = audio_file

    # Convert the audio to WAV if it is not already (requires ffmpeg on PATH)
    if not path.lower().endswith('.wav'):
        wav_path = "/tmp/uploaded_audio.wav"
        subprocess.call(['ffmpeg', '-i', path, wav_path, '-y'])
        path = wav_path
    # Load the Whisper model and transcribe, forwarding the language hint
    model = whisper.load_model(model_size)
    result = model.transcribe(path, language=language)
    segments = result["segments"]
    # Get audio duration
    with contextlib.closing(wave.open(path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        duration = frames / float(rate)
    # Function to generate a speaker embedding for one transcript segment
    def segment_embedding(segment):
        start = segment["start"]
        # Whisper can overshoot the end of the file, so clamp to the audio duration
        end = min(duration, segment["end"])
        clip = Segment(start, end)
        waveform, sample_rate = audio_processor.crop(path, clip)
        # Add a batch dimension before passing the waveform to the embedding model
        return embedding_model(waveform[None])

    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(segment)
    embeddings = np.nan_to_num(embeddings)
    # Cluster the segment embeddings into num_speakers groups; each cluster label is a speaker
    clustering = AgglomerativeClustering(n_clusters=int(num_speakers)).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
    # Format the transcript
    def format_time(secs):
        return str(datetime.timedelta(seconds=round(secs)))

    transcript = []
    for i, segment in enumerate(segments):
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            transcript.append(f"\n{segment['speaker']} {format_time(segment['start'])}")
        transcript.append(segment["text"].lstrip())  # Remove leading whitespace

    # Return the final transcript as a string
    return "\n".join(transcript)
# Gradio callback: thin wrapper around process_audio
def diarize(audio_file, num_speakers, model_size="medium"):
    return process_audio(audio_file, num_speakers, model_size)
# Gradio UI
interface = gr.Interface(
    fn=diarize,
    inputs=[
        gr.Audio(type="filepath", label="Upload Audio File"),  # 'filepath' hands a temp-file path to diarize
        gr.Number(label="Number of Speakers", value=2, precision=0),
        gr.Radio(["tiny", "base", "small", "medium", "large"], label="Model Size", value="medium")
    ],
    outputs=gr.Textbox(label="Transcript"),
    title="Speaker Diarization & Transcription",
    description="Upload an audio file, specify the number of speakers, and get a diarized transcript."
)
# Run the Gradio app
if __name__ == "__main__":
    interface.launch()
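# Example (hypothetical file path), bypassing the UI and calling the pipeline directly:
#   print(process_audio("/tmp/meeting.wav", num_speakers=2, model_size="base"))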