sachinsen1295 committed on
Commit 7f4f456 · verified · 1 Parent(s): 52f033a

Update app.py

Files changed (1)
  1. app.py +48 -38
app.py CHANGED
@@ -1,88 +1,98 @@
  import gradio as gr
  import whisper
  import torch
  from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
  from pyannote.audio import Audio
  from pyannote.core import Segment
  import subprocess
  import wave
- import numpy as np
  from sklearn.cluster import AgglomerativeClustering
- import os
  import datetime

  # Load models
- model_size = "medium"
- whisper_model = whisper.load_model(model_size)
  embedding_model = PretrainedSpeakerEmbedding(
      "speechbrain/spkrec-ecapa-voxceleb",
-     device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
  )
  audio_processor = Audio()

- def process_audio(file_path, num_speakers):
-     # Convert to WAV if necessary
-     if not file_path.endswith(".wav"):
-         wav_path = file_path.replace(file_path.split('.')[-1], 'wav')
-         subprocess.call(['ffmpeg', '-i', file_path, wav_path, '-y'])
-         file_path = wav_path

      # Get audio duration
-     with wave.open(file_path, 'r') as f:
          frames = f.getnframes()
          rate = f.getframerate()
          duration = frames / float(rate)

-     # Transcribe audio
-     result = whisper_model.transcribe(file_path)
-     segments = result["segments"]
-
-     # Generate speaker embeddings
-     embeddings = np.zeros(shape=(len(segments), 192))
-     for i, segment in enumerate(segments):
          start = segment["start"]
          end = min(duration, segment["end"])
          clip = Segment(start, end)
-         waveform, _ = audio_processor.crop(file_path, clip)
-         embeddings[i] = embedding_model(waveform[None])
      embeddings = np.nan_to_num(embeddings)

      # Perform clustering
      clustering = AgglomerativeClustering(n_clusters=num_speakers).fit(embeddings)
      labels = clustering.labels_
-     for i, segment in enumerate(segments):
-         segment["speaker"] = f"SPEAKER {labels[i] + 1}"

-     # Generate transcript
      transcript = []
-     for segment in segments:
-         speaker = segment["speaker"]
-         start_time = str(datetime.timedelta(seconds=round(segment["start"])))
-         text = segment["text"]
-         transcript.append(f"{speaker} ({start_time}): {text}")

-     # Clean up
-     os.remove(file_path)
      return "\n".join(transcript)

  # Gradio interface
- def diarize(audio_file, num_speakers):
-     file_path = "temp_audio.wav"
-     with open(file_path, "wb") as f:
-         f.write(audio_file.read())
-     return process_audio(file_path, num_speakers)

- # UI
  interface = gr.Interface(
      fn=diarize,
      inputs=[
-         gr.Audio(source="upload", type="file", label="Upload Audio File"),
          gr.Number(label="Number of Speakers", value=2, precision=0),
      ],
      outputs=gr.Textbox(label="Transcript"),
      title="Speaker Diarization & Transcription",
      description="Upload an audio file, specify the number of speakers, and get a diarized transcript."
  )

  if __name__ == "__main__":
      interface.launch()
 
  import gradio as gr
  import whisper
  import torch
+ import pyannote.audio
  from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
  from pyannote.audio import Audio
  from pyannote.core import Segment
  import subprocess
  import wave
+ import contextlib
  from sklearn.cluster import AgglomerativeClustering
+ import numpy as np
  import datetime

  # Load models
  embedding_model = PretrainedSpeakerEmbedding(
      "speechbrain/spkrec-ecapa-voxceleb",
+     device=torch.device("cpu")  # Use "cuda" if a GPU is available
  )
  audio_processor = Audio()

+ # Function to process the audio file and extract transcript and diarization
+ def process_audio(audio_file, num_speakers, model_size="medium", language="English"):
+     # Save the uploaded file to a path
+     path = "/tmp/uploaded_audio.wav"
+     with open(path, "wb") as f:
+         f.write(audio_file.read())
+
+     # Convert audio to WAV if it's not already
+     if path[-3:] != 'wav':
+         wav_path = path.replace(path.split('.')[-1], 'wav')
+         subprocess.call(['ffmpeg', '-i', path, wav_path, '-y'])
+         path = wav_path
+
+     # Load Whisper model
+     model = whisper.load_model(model_size)
+     result = model.transcribe(path)
+     segments = result["segments"]

      # Get audio duration
+     with contextlib.closing(wave.open(path, 'r')) as f:
          frames = f.getnframes()
          rate = f.getframerate()
          duration = frames / float(rate)

+     # Function to generate segment embeddings
+     def segment_embedding(segment):
          start = segment["start"]
          end = min(duration, segment["end"])
          clip = Segment(start, end)
+         waveform, sample_rate = audio_processor.crop(path, clip)
+         return embedding_model(waveform[None])
+
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(segment)
+
      embeddings = np.nan_to_num(embeddings)

      # Perform clustering
      clustering = AgglomerativeClustering(n_clusters=num_speakers).fit(embeddings)
      labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+     # Format the transcript
+     def time(secs):
+         return str(datetime.timedelta(seconds=round(secs)))

      transcript = []
+     for i, segment in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             transcript.append(f"\n{segment['speaker']} {time(segment['start'])}")
+         transcript.append(segment["text"][1:])  # Remove leading whitespace

+     # Return the final transcript as a string
      return "\n".join(transcript)

  # Gradio interface
+ def diarize(audio_file, num_speakers, model_size="medium"):
+     return process_audio(audio_file, num_speakers, model_size)

+ # Gradio UI
  interface = gr.Interface(
      fn=diarize,
      inputs=[
+         gr.Audio(type="file", label="Upload Audio File"),  # Removed 'source' argument
          gr.Number(label="Number of Speakers", value=2, precision=0),
+         gr.Radio(["tiny", "base", "small", "medium", "large"], label="Model Size", value="medium")
      ],
      outputs=gr.Textbox(label="Transcript"),
      title="Speaker Diarization & Transcription",
      description="Upload an audio file, specify the number of speakers, and get a diarized transcript."
  )

+ # Run the Gradio app
  if __name__ == "__main__":
      interface.launch()
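
For a quick local check outside the Gradio UI, a minimal sketch of how the updated process_audio could be exercised; the file name sample.wav, the speaker count of 2, and the "tiny" model size are assumptions for illustration, not part of the commit.

# Hypothetical local check: assumes app.py above is importable, sample.wav exists,
# and the dependencies imported at the top of app.py are installed.
from app import process_audio

if __name__ == "__main__":
    with open("sample.wav", "rb") as audio_file:
        # process_audio expects a file-like object with .read(), as Gradio supplies
        transcript = process_audio(audio_file, num_speakers=2, model_size="tiny")
        print(transcript)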