Ngoufack committed on
Commit 136ef53 · 1 Parent(s): aa45e11

v3 of test

Files changed (2)
  1. app.py +72 -18
  2. requirements.txt +1 -1
app.py CHANGED
@@ -5,35 +5,89 @@ import yt_dlp as youtube_dl
 import whisperx
 import tempfile
 import os
+import locale
+import whisper
+import datetime
+import subprocess
+import pyannote.audio
+from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
+from pyannote.audio import Audio
+from pyannote.core import Segment
+import wave
+import contextlib
+from sklearn.cluster import AgglomerativeClustering
+import numpy as np
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
-BATCH_SIZE = 4
+BATCH_SIZE = 8
 FILE_LIMIT_MB = 1000
-COMPUTE_TYPE = "float32"
+COMPUTE_TYPE = "float16"
 YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files
 
-model = whisperx.load_model("large-v2", device, compute_type=COMPUTE_TYPE)
+num_speakers = 2
+language = 'French'
+model_size = 'large'
+model_name = model_size
+
+def getpreferredencoding(do_setlocale=True):
+    return "UTF-8"
+
+locale.getpreferredencoding = getpreferredencoding
+embedding_model = PretrainedSpeakerEmbedding(
+    "speechbrain/spkrec-ecapa-voxceleb",
+    device=torch.device("cpu"))
+model = whisper.load_model(model_size)
+audio = Audio()
+def segment_embedding(segment, duration, path):
+    start = segment["start"]
+    # Whisper overshoots the end timestamp in the last segment
+    end = min(duration, segment["end"])
+    clip = Segment(start, end)
+    waveform, sample_rate = audio.crop(path, clip)
+
+    # Convert waveform to single channel
+    waveform = waveform.mean(dim=0, keepdim=True)
+
+    return embedding_model(waveform.unsqueeze(0))
+
+def time(secs):
+    return datetime.timedelta(seconds=round(secs))
 
 @spaces.GPU
-def transcribe(inputs, task):
-    if inputs is None:
+def transcribe(path, task):
+    if path is None:
         raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
 
-    audio = whisperx.load_audio(inputs)
-    result = model.transcribe(audio, batch_size=BATCH_SIZE)
-    model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
-    result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)
-    diarize_model = whisperx.DiarizationPipeline(use_auth_token=os.getenv("HF_TOKEN"), device=device)
-    diarize_segments = diarize_model(audio)
-    result = whisperx.assign_word_speakers(diarize_segments, result)
-    output_text = ""
-    for segment in result['segments']:
-        speaker = segment.get('speaker', 'Unknown Speaker')
-        text = segment['text']
-        output_text += f"{speaker}: {text}\n"
-
+    if path[-3:] != 'wav':
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            filepath = os.path.join(tmpdirname, "audio.wav")
+            subprocess.call(['ffmpeg', '-i', path, filepath, '-y'])
+            path = filepath
+    result = model.transcribe(path)
+    segments = result["segments"]
+    print(segments)
+    with contextlib.closing(wave.open(path, 'r')) as f:
+        frames = f.getnframes()
+        rate = f.getframerate()
+        duration = frames / float(rate)
+
+    embeddings = np.zeros(shape=(len(segments), 192))
+    for i, segment in enumerate(segments):
+        embeddings[i] = segment_embedding(segment, duration=duration, path=path)
+    embeddings = np.nan_to_num(embeddings)
+    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
+    labels = clustering.labels_
+    output_text = ""
+    for i in range(len(segments)):
+        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
+    for (i, segment) in enumerate(segments):
+        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
+            output_text += "\n" + segment["speaker"] + ' ' + str(time(segment["start"])) + '\n'
+        output_text += segment["text"][1:] + ' '
     return output_text
 
+
+
 def _return_yt_html_embed(yt_url):
     video_id = yt_url.split("?v=")[-1]
     return f'<center><iframe width="500" height="320" src="https://www.youtube.com/embed/{video_id}"></iframe></center>'
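
In short, this revision drops the WhisperX align/diarize pipeline and rebuilds speaker attribution from openai-whisper transcription, speechbrain ECAPA speaker embeddings (via pyannote.audio), and scikit-learn agglomerative clustering into num_speakers groups. The handler is still a Gradio/Spaces entry point (the @spaces.GPU decorator and gr.Error), but the interface wiring sits outside this hunk; the sketch below shows one plausible way to hook it up and is an assumption, not part of the commit, so component choices and labels are illustrative only.

# Minimal wiring sketch (assumption, not in this diff): expose transcribe() from
# the Space's UI. The real interface lives in the unchanged part of app.py.
import gradio as gr

demo = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(type="filepath", label="Audio file or recording"),
        gr.Radio(["transcribe", "translate"], value="transcribe", label="Task"),
    ],
    outputs=gr.Textbox(label="Speaker-labelled transcript"),
    title="Whisper + speaker clustering (v3 test)",
)

if __name__ == "__main__":
    demo.launch()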
requirements.txt CHANGED
@@ -11,4 +11,4 @@ transformers>=4.19.0
 yt-dlp
 more_itertools
 faster-whisper
-git+https://github.com/m-bain/whisperx.git
+git+https://github.com/openai/whisper.git
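
The dependency swap mirrors the code change: the whisperx repository is replaced by the upstream openai/whisper package. The new app.py also imports pyannote.audio, scikit-learn, and numpy; the visible hunk does not touch those, so they are presumably already covered by the requirements.txt lines not shown here. A quick smoke test for the environment (a sketch, not part of the commit) could look like:

# Smoke test (assumption: run inside the Space's environment, not part of the commit).
import whisper                                   # git+https://github.com/openai/whisper.git
from pyannote.audio import Audio                 # waveform cropping used by app.py
from sklearn.cluster import AgglomerativeClustering
import numpy as np

model = whisper.load_model("base")               # "base" keeps the check fast; app.py loads "large"
print(type(model).__name__, "loaded - dependencies resolve")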