Bindu36 committed (verified)
Commit e5ed27b · 1 Parent(s): e640d9c

Create app.py

Files changed (1):
  1. app.py +80 -0
app.py ADDED
@@ -0,0 +1,80 @@
+ import gradio as gr
+ import whisper
+ import datetime
+ import torch
+ import subprocess
+ from pyannote.audio import Audio
+ from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+ from pyannote.core import Segment
+ import wave
+ import contextlib
+ from sklearn.cluster import AgglomerativeClustering
+ import numpy as np
+
+ # Load the Whisper speech-to-text model
+ model_size = "medium.en"
+ model = whisper.load_model(model_size)
+
+ audio = Audio()
+ embedding_model = PretrainedSpeakerEmbedding("speechbrain/spkrec-ecapa-voxceleb", device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
+
+ def transcribe_and_diarize(audio_file, num_speakers=2):
+     path = audio_file  # gr.Audio(type="filepath") passes the uploaded file's path as a string
+     # Convert the upload to WAV if necessary so the wave module can read it
+     if not path.lower().endswith('.wav'):
+         subprocess.call(['ffmpeg', '-i', path, 'audio.wav', '-y'])
+         path = 'audio.wav'
+
+     # Transcribe the audio with Whisper
+     result = model.transcribe(path)
+     segments = result["segments"]
+
+     # Get the audio duration
+     with contextlib.closing(wave.open(path, 'r')) as f:
+         frames = f.getnframes()
+         rate = f.getframerate()
+         duration = frames / float(rate)
+
+     # Extract a speaker embedding for a single transcript segment
+     def segment_embedding(segment):
+         start = segment["start"]
+         end = min(duration, segment["end"])  # Whisper can overshoot the end of the file
+         clip = Segment(start, end)
+         waveform, sample_rate = audio.crop(path, clip)
+         return embedding_model(waveform[None])
+
+     # Compute an embedding for every segment (ECAPA-TDNN embeddings are 192-dimensional)
+     embeddings = np.zeros(shape=(len(segments), 192))
+     for i, segment in enumerate(segments):
+         embeddings[i] = segment_embedding(segment)
+
+     embeddings = np.nan_to_num(embeddings)
+
+     # Cluster the segment embeddings into the requested number of speakers
+     clustering = AgglomerativeClustering(int(num_speakers)).fit(embeddings)
+     labels = clustering.labels_
+     for i in range(len(segments)):
+         segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+
+     # Build the speaker-labelled transcript
+     transcript = ""
+     for i, segment in enumerate(segments):
+         if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+             transcript += "\n" + segment["speaker"] + ' ' + str(datetime.timedelta(seconds=round(segment["start"]))) + '\n'
+         transcript += segment["text"].lstrip() + ' '
+     transcript += "\n\n"
+
+     return transcript
+
+ iface = gr.Interface(
+     fn=transcribe_and_diarize,
+     inputs=[
+         gr.Audio(source="upload", type="filepath"),
+         gr.Number(value=2, label="Number of Speakers")
+     ],
+     outputs="text",
+     title="Audio Transcription and Speaker Diarization",
+     description="Upload an audio file to get a transcription with speaker diarization."
+ )
+
+ iface.launch()
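
The speaker labels come from agglomerative clustering of the per-segment embeddings. A minimal sketch of that step in isolation, using toy 192-dimensional embeddings instead of real ECAPA outputs (the two artificial centroids and the random seed are assumptions for illustration, not part of the app):

# Sketch: how the clustering step assigns a speaker index to each segment.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

rng = np.random.default_rng(0)
# Two artificial "speakers": four embeddings drawn around each of two centroids
emb = np.vstack([rng.normal(0.0, 0.1, (4, 192)),
                 rng.normal(1.0, 0.1, (4, 192))])
labels = AgglomerativeClustering(n_clusters=2).fit(emb).labels_
print(labels)  # e.g. [0 0 0 0 1 1 1 1]; in app.py each label becomes "SPEAKER <n>"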