Ngoufack commited on
Commit
e500a6a
·
1 Parent(s): 4e80514

mise à jour

Browse files
Files changed (2) hide show
  1. app.py +19 -12
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import spaces
2
  import torch
 
3
 
4
  import gradio as gr
5
  import yt_dlp as youtube_dl
@@ -9,7 +10,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
9
  import tempfile
10
  import os
11
 
12
- MODEL_NAME = "openai/whisper-medium"
13
  BATCH_SIZE = 8
14
  FILE_LIMIT_MB = 1000
15
  YT_LENGTH_LIMIT_S = 600 # limit to 10-minute YouTube files
@@ -23,14 +24,17 @@ pipe = pipeline(
23
  device=device,
24
  )
25
 
 
26
 
27
  @spaces.GPU
28
  def transcribe(inputs, task):
29
  if inputs is None:
30
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
31
-
32
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
33
- return text
 
 
34
 
35
 
36
  def _return_yt_html_embed(yt_url):
@@ -64,7 +68,10 @@ def download_yt_audio(yt_url, filename):
64
  file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
65
  raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
66
 
67
- ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}
 
 
 
68
 
69
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
70
  try:
@@ -77,17 +84,17 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
77
  html_embed_str = _return_yt_html_embed(yt_url)
78
 
79
  with tempfile.TemporaryDirectory() as tmpdirname:
80
- filepath = os.path.join(tmpdirname, "video.mp4")
81
  download_yt_audio(yt_url, filepath)
82
- with open(filepath, "rb") as f:
83
- inputs = f.read()
84
 
85
- inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
86
- inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
87
 
88
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
 
89
 
90
- return html_embed_str, text
91
 
92
 
93
  demo = gr.Blocks()
 
1
  import spaces
2
  import torch
3
+ from faster_whisper import WhisperModel
4
 
5
  import gradio as gr
6
  import yt_dlp as youtube_dl
 
10
  import tempfile
11
  import os
12
 
13
+ MODEL_NAME = "large-v3"
14
  BATCH_SIZE = 8
15
  FILE_LIMIT_MB = 1000
16
  YT_LENGTH_LIMIT_S = 600 # limit to 10-minute YouTube files
 
24
  device=device,
25
  )
26
 
27
+ model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if device == "cuda" else "int8")
28
 
29
@spaces.GPU
def transcribe(inputs, task):
    """Transcribe an uploaded or recorded audio file with faster-whisper.

    Parameters
    ----------
    inputs : str | None
        Path to the audio file supplied by the Gradio audio component;
        ``None`` when the user submitted nothing.
    task : str
        ``"transcribe"`` or ``"translate"``; forwarded to the model so the
        UI task selector is actually honoured.

    Returns
    -------
    str
        The concatenated transcription text.

    Raises
    ------
    gr.Error
        If no audio file was submitted.
    """
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
    # BUG FIX: the original called model.transcribe(input, ...) — the builtin
    # `input` function, not the `inputs` argument — which crashes at runtime.
    # Also: forward `task` (previously ignored), and drop `batch_size=`,
    # which WhisperModel.transcribe does not accept (batching needs
    # faster_whisper.BatchedInferencePipeline) — TODO confirm against the
    # pinned faster-whisper version.
    segments, _info = model.transcribe(
        inputs,
        task=task,
        beam_size=5,
        vad_filter=True,
        word_timestamps=False,
    )
    # `segments` is a lazy generator; joining consumes it and runs inference.
    return " ".join(segment.text for segment in segments)
38
 
39
 
40
  def _return_yt_html_embed(yt_url):
 
68
  file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
69
  raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
70
 
71
+ ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",'postprocessors': [{
72
+ 'key': 'FFmpegExtractAudio',
73
+ 'preferredcodec': 'mp3',
74
+ }]}
75
 
76
  with youtube_dl.YoutubeDL(ydl_opts) as ydl:
77
  try:
 
84
  html_embed_str = _return_yt_html_embed(yt_url)
85
 
86
  with tempfile.TemporaryDirectory() as tmpdirname:
87
+ filepath = os.path.join(tmpdirname, "video.mp3")
88
  download_yt_audio(yt_url, filepath)
 
 
89
 
90
+ #inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
91
+ #inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
92
 
93
+ #text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
94
+ segments, info = model.transcribe(filepath, beam_size=5,batch_size=BATCH_SIZE, vad_filter=True, word_timestamps=False)
95
+ transcription = " ".join([segment.text for segment in segments])
96
 
97
+ return html_embed_str, transcription
98
 
99
 
100
  demo = gr.Blocks()
requirements.txt CHANGED
@@ -3,4 +3,5 @@ yt-dlp
3
  torch
4
  torchvision
5
  torchaudio
6
- nemo_toolkit
 
 
3
  torch
4
  torchvision
5
  torchaudio
6
+ nemo_toolkit
7
+ faster-whisper