v3 of test
- app.py +72 -18
- requirements.txt +1 -1
app.py
CHANGED
@@ -5,35 +5,89 @@ import yt_dlp as youtube_dl
 import whisperx
 import tempfile
 import os
+import locale
+import whisper
+import datetime
+import subprocess
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+import wave
+import contextlib
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-BATCH_SIZE =
+BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-COMPUTE_TYPE = "
+COMPUTE_TYPE = "float16"
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
 
-… (1 deleted line, content not recoverable from the rendered diff)
+num_speakers = 2
+language = 'French'
+model_size = 'large'
+model_name = model_size
+
+def getpreferredencoding(do_setlocale=True):
+    return "UTF-8"
+
+locale.getpreferredencoding = getpreferredencoding
+embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device=torch.device("cpu"))
+model = whisper.load_model(model_size)
+audio = Audio()
+def segment_embedding(segment, duration, path):
+    start = segment["start"]
+    # Whisper overshoots the end timestamp in the last segment
+    end = min(duration, segment["end"])
+    clip = Segment(start, end)
+    waveform, sample_rate = audio.crop(path, clip)
+
+    # Convert waveform to single channel
+    waveform = waveform.mean(dim=0, keepdim=True)
+
+    return embedding_model(waveform.unsqueeze(0))
+
+def time(secs):
+    return datetime.timedelta(seconds=round(secs))
 
 @spaces.GPU
-def transcribe(
-    if
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
-… (13 deleted lines, content not recoverable from the rendered diff)
+def transcribe(path, task):
+    if path is None:
+    if path[-3:] != 'wav':
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            filepath = os.path.join(tmpdirname, "audio.wav")
+            subprocess.call(['ffmpeg', '-i', path, filepath, '-y'])
+            path = filepath
+    result = model.transcribe(path)
+    segments = result["segments"]
+    print(segments)
+    with contextlib.closing(wave.open(path, 'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        duration = frames / float(rate)
+
+    embeddings = np.zeros(shape=(len(segments), 192))
+    for i, segment in enumerate(segments):
+        embeddings[i] = segment_embedding(segment, duration=duration, path=path)
+    embeddings = np.nan_to_num(embeddings)
+    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+    labels = clustering.labels_
+    output_text = ""
+    for i in range(len(segments)):
+        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+    for (i, segment) in enumerate(segments):
+        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+            output_text += "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
+        output_text += segment["text"][1:] + ' '
     return output_text
 
+
+
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
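Review note on the new transcribe(): the converted file is created inside the `with tempfile.TemporaryDirectory() as tmpdirname:` block, but `path` is read by `model.transcribe(path)` only after the block exits, at which point the directory and the audio.wav inside it have already been deleted. A minimal sketch of one way to keep the converted file alive for the whole call; the `ensure_wav` helper name and the use of `tempfile.mkstemp` are assumptions for illustration, not part of this commit:

import os
import subprocess
import tempfile

def ensure_wav(path):
    # Hypothetical helper, not in this commit: convert the input to WAV
    # in a temp file that outlives the conversion step.
    if path[-3:] == 'wav':
        return path
    fd, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)  # ffmpeg writes to the path itself; we only needed the name
    # '-y' is placed before the output so ffmpeg overwrites without prompting.
    subprocess.call(['ffmpeg', '-y', '-i', path, wav_path])
    return wav_path

With this shape, transcribe() would call `path = ensure_wav(path)` and remain responsible for `os.remove(wav_path)` once transcription finishes.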
requirements.txt
CHANGED
@@ -11,4 +11,4 @@ transformers>=4.19.0
 yt-dlp
 more_itertools
 faster-whisper
-git+https://github.com/
+git+https://github.com/openai/whisper.git
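Dependency note: the new app.py also imports pyannote.audio, sklearn, and numpy, and downloads the speechbrain/spkrec-ecapa-voxceleb embedding model. Only lines 11-14 of requirements.txt are shown in this hunk, so these may already be pinned earlier in the file; if not, the Space would additionally need something like:

pyannote.audio
speechbrain
scikit-learn
numpy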