Ngoufack committed
Commit 8a85641 · Parent: 1cbc079

hotfix 2.2

Files changed (1):
  1. app.py +21 -53
app.py CHANGED
@@ -1,37 +1,28 @@
import spaces
import torch
-
import gradio as gr
import yt_dlp as youtube_dl
- from transformers import pipeline
+ from faster_whisper import WhisperModel
from transformers.pipelines.audio_utils import ffmpeg_read
-
import tempfile
import os

- MODEL_NAME = "openai/whisper-large-v3-turbo"
+ MODEL_NAME = "large-v3"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

- device = 0 if torch.cuda.is_available() else "cpu"
-
- pipe = pipeline(
-     task="automatic-speech-recognition",
-     model=MODEL_NAME,
-     chunk_length_s=30,
-     device=device,
- )
-
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = WhisperModel(MODEL_NAME, device=device, compute_type="float16" if torch.cuda.is_available() else "int8")

@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-
-     text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-     return text
-
+
+     segments, _ = model.transcribe(inputs, task=task)
+     text = " ".join([segment.text for segment in segments])
+     return text

def _return_yt_html_embed(yt_url):
    video_id = yt_url.split("?v=")[-1]
@@ -49,23 +40,11 @@ def download_yt_audio(yt_url, filename):
    except youtube_dl.utils.DownloadError as err:
        raise gr.Error(str(err))

-     file_length = info["duration_string"]
-     file_h_m_s = file_length.split(":")
-     file_h_m_s = [int(sub_length) for sub_length in file_h_m_s]
-
-     if len(file_h_m_s) == 1:
-         file_h_m_s.insert(0, 0)
-     if len(file_h_m_s) == 2:
-         file_h_m_s.insert(0, 0)
-     file_length_s = file_h_m_s[0] * 3600 + file_h_m_s[1] * 60 + file_h_m_s[2]
-
+     file_length_s = info["duration"]
    if file_length_s > YT_LENGTH_LIMIT_S:
-         yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
-         file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
-         raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")
+         raise gr.Error(f"Maximum YouTube length is {YT_LENGTH_LIMIT_S} seconds, got {file_length_s} seconds.")
-
-     ydl_opts = {"outtmpl": filename, "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best"}

+     ydl_opts = {"outtmpl": filename, "format": "bestaudio/best"}
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        try:
            ydl.download([yt_url])
@@ -73,7 +52,7 @@ def download_yt_audio(yt_url, filename):
            raise gr.Error(str(err))

@spaces.GPU
- def yt_transcribe(yt_url, task, max_filesize=75.0):
+ def yt_transcribe(yt_url, task):
    html_embed_str = _return_yt_html_embed(yt_url)

    with tempfile.TemporaryDirectory() as tmpdirname:
@@ -81,15 +60,13 @@ def yt_transcribe(yt_url, task, max_filesize=75.0):
        download_yt_audio(yt_url, filepath)
        with open(filepath, "rb") as f:
            inputs = f.read()
-
-         inputs = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)
-         inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
-
-         text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-
+
+         inputs = ffmpeg_read(inputs, 16000)  # convert to 16 kHz
+         segments, _ = model.transcribe(inputs, task=task)
+         text = " ".join([segment.text for segment in segments])
+
        return html_embed_str, text

-
demo = gr.Blocks(theme=gr.themes.Ocean())

mf_transcribe = gr.Interface(
@@ -101,9 +78,7 @@ mf_transcribe = gr.Interface(
    outputs="text",
    title="VerbaLend Demo 1 : Prototype",
    description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-         " of arbitrary length."
+         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses Faster Whisper"
    ),
    allow_flagging="never",
)
@@ -116,11 +91,7 @@ file_transcribe = gr.Interface(
    ],
    outputs="text",
    title="VerbaLend Demo 1 : Prototype",
-     description=(
-         "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
-         f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
-         " of arbitrary length."
-     ),
+     description="Transcribe uploaded audio files with Faster Whisper.",
    allow_flagging="never",
)

@@ -132,11 +103,7 @@ yt_transcribe = gr.Interface(
    ],
    outputs=["html", "text"],
    title="VerbaLend Demo 1 : Prototype",
-     description=(
-         "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
-         f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
-         " arbitrary length."
-     ),
+     description="Transcribe YouTube videos using Faster Whisper.",
    allow_flagging="never",
)

@@ -144,3 +111,4 @@ with demo:
    gr.TabbedInterface([mf_transcribe, file_transcribe, yt_transcribe], ["Microphone", "Audio file", "YouTube"])

demo.queue().launch(ssr_mode=False)
+
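
For reference, a minimal standalone sketch of the faster-whisper call path this hotfix switches to. It is not part of the commit: the model name and compute-type choices mirror the diff above, and the file name sample.wav is only a placeholder.

# Minimal sketch, assuming `pip install faster-whisper` and a local
# placeholder audio file sample.wav.
import torch
from faster_whisper import WhisperModel

device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if device == "cuda" else "int8"

# "large-v3" matches MODEL_NAME in the diff; smaller checkpoints also work.
model = WhisperModel("large-v3", device=device, compute_type=compute_type)

# transcribe() returns a lazy generator of segments plus metadata; joining the
# segment texts reproduces the plain-text output built in the Gradio handlers.
segments, info = model.transcribe("sample.wav", task="transcribe")
text = " ".join(segment.text for segment in segments)
print(info.language, text)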