Ngoufack committed
Commit 77b9813 · 1 Parent(s): 8a85641

hotfix 2.3

Files changed (2):
  1. app.py +17 -2
  2. requirements.txt +6 -1
app.py CHANGED
@@ -1,11 +1,17 @@
 import spaces
 import torch
+import torchaudio
 import gradio as gr
 import yt_dlp as youtube_dl
 from faster_whisper import WhisperModel
 from transformers.pipelines.audio_utils import ffmpeg_read
 import tempfile
 import os
+from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
+from pyannote.audio import Model
+from pyannote.core import Segment
+from transformers.pipelines.audio_utils import ffmpeg_read
+
 
 MODEL_NAME = "large-v3"
 BATCH_SIZE = 8
@@ -15,6 +21,9 @@ YT_LENGTH_LIMIT_S = 3600 # limit to 1 hour YouTube files
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if torch.cuda.is_available() else "int8")
 
+model_pyannote = Model.from_pretrained("pyannote/speaker-diarization")
+pipeline = SpeakerDiarization(model_pyannote)
+
 @spaces.GPU
 def transcribe(inputs, task):
     if inputs is None:
@@ -22,6 +31,13 @@ def transcribe(inputs, task):
 
     segments, _ = model.transcribe(inputs, task=task)
     text = " ".join([segment.text for segment in segments])
+    waveform, sample_rate = torchaudio.load(audio_path)
+    if sample_rate != 16000:
+        transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
+        waveform = transform(waveform)
+
+    diarization = pipeline({"uri": "audio", "audio": audio_path})
+    speaker_segments = []
     return text
 
 def _return_yt_html_embed(yt_url):
@@ -110,5 +126,4 @@ yt_transcribe = gr.Interface(
 with demo:
     gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])
 
-demo.queue().launch(ssr_mode=False)
-
+demo.queue().launch(ssr_mode=False)
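
As committed, the new transcribe() body references an audio_path that the function never defines (it receives inputs), builds the SpeakerDiarization pipeline from a bare Model checkpoint, and leaves speaker_segments empty before returning the plain transcript. Below is a minimal sketch of one way the wiring could be completed, assuming the stock pyannote Pipeline.from_pretrained API (which may require a Hugging Face auth token); the transcribe_with_speakers name and the largest-overlap speaker assignment are illustrative choices, not the committed logic:

# Illustrative sketch only — not the committed code. Assumes the stock
# pyannote Pipeline API; `audio_path` is a real parameter here, unlike
# in the committed transcribe(), and `speaker_segments` actually gets filled.
from faster_whisper import WhisperModel
from pyannote.audio import Pipeline

model = WhisperModel("large-v3", device="cpu", compute_type="int8")
diarizer = Pipeline.from_pretrained("pyannote/speaker-diarization")

def transcribe_with_speakers(audio_path, task="transcribe"):
    segments, _ = model.transcribe(audio_path, task=task)
    diarization = diarizer(audio_path)

    speaker_segments = []
    for seg in segments:
        # Assign each Whisper segment the diarization turn that
        # overlaps it the most (a simple illustrative heuristic).
        best_speaker, best_overlap = "UNKNOWN", 0.0
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            overlap = min(seg.end, turn.end) - max(seg.start, turn.start)
            if overlap > best_overlap:
                best_speaker, best_overlap = speaker, overlap
        speaker_segments.append((best_speaker, seg.text.strip()))

    return "\n".join(f"[{spk}] {txt}" for spk, txt in speaker_segments)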
 
requirements.txt CHANGED
@@ -4,4 +4,9 @@ torch
 torchvision
 torchaudio
 nemo_toolkit
-faster-whisper
+faster-whisper
+ctranslate2
+intervaltree
+srt
+torch
+https://github.com/pyannote/pyannote-audio/archive/refs/heads/develop.zip
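
The requirements hunk restores faster-whisper alongside ctranslate2 (the inference backend it runs on) and installs pyannote.audio from a snapshot of its develop branch. A quick, illustrative smoke test for an environment built from this file (not part of the commit):

# Illustrative environment check, not part of the commit: confirms the
# CTranslate2-backed Whisper stack and pyannote.audio import cleanly.
import ctranslate2
import faster_whisper
import pyannote.audio
import torch

print("ctranslate2:", ctranslate2.__version__)
print("faster-whisper:", faster_whisper.__version__)
print("pyannote.audio:", pyannote.audio.__version__)
print("CUDA available:", torch.cuda.is_available())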