cstr committed
Commit a8b126b · verified · 1 Parent(s): acd8816

Update app.py

Files changed (1)
  1. app.py +75 -18
app.py CHANGED
@@ -12,8 +12,20 @@ import yt_dlp
 
 logging.basicConfig(level=logging.INFO)
 
+# Clone and install faster-whisper from GitHub
+# (we should be able to do this in build.sh in a hf space)
+try:
+    subprocess.run(["git", "clone", "https://github.com/SYSTRAN/faster-whisper.git"], check=True)
+    subprocess.run(["pip", "install", "-e", "./faster-whisper"], check=True)
+except subprocess.CalledProcessError as e:
+    print(f"Error during faster-whisper installation: {e}")
+    sys.exit(1)
+
+# Add the faster-whisper directory to the Python path
 sys.path.append("./faster-whisper")
-from faster_whisper import WhisperModel, BatchedInferencePipeline
+
+from faster_whisper import WhisperModel
+from faster_whisper.transcribe import BatchedInferencePipeline
 
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
 
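Note on the startup install added above: cloning and `pip install -e` on every boot is slow, and the committed comment itself points at doing this in build.sh on a Hugging Face Space. A minimal sketch of an alternative, assuming network access at startup; the `importlib` guard and the `git+` pip URL are illustrative, not part of this commit:

```python
# Hypothetical variant: skip the clone when faster_whisper is already
# importable, and install straight from GitHub instead of a local checkout.
import importlib.util
import subprocess
import sys

if importlib.util.find_spec("faster_whisper") is None:
    subprocess.run(
        [sys.executable, "-m", "pip", "install",
         "git+https://github.com/SYSTRAN/faster-whisper.git"],
        check=True,
    )

from faster_whisper import WhisperModel  # noqa: E402
```

With a guard like this, restarts of an already-built Space skip the network round-trip entirely.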
@@ -143,10 +155,49 @@ def save_transcription(transcription):
         f.write(transcription)
     return file_path
 
-def transcribe_audio(input_source, batch_size, download_method, start_time=None, end_time=None, verbose=False):
+def transcribe_audio(input_source, model_choice, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
-        model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
-        batched_model = BatchedInferencePipeline(model=model)
+        if model_choice == "faster-whisper":
+            model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
+            batched_model = BatchedInferencePipeline(model=model)
+        elif model_choice == "primeline/whisper-large-v3-german":
+            model_id = "primeline/whisper-large-v3-german"
+            model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            )
+            model.to(device)
+            processor = AutoProcessor.from_pretrained(model_id)
+            pipe = pipeline(
+                "automatic-speech-recognition",
+                model=model,
+                tokenizer=processor.tokenizer,
+                feature_extractor=processor.feature_extractor,
+                max_new_tokens=128,
+                chunk_length_s=30,
+                batch_size=batch_size,
+                return_timestamps=True,
+                torch_dtype=torch_dtype,
+                device=device,
+            )
+        elif model_choice == "openai/whisper-large-v3":
+            model_id = "openai/whisper-large-v3"
+            model = AutoModelForSpeechSeq2Seq.from_pretrained(
+                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+            )
+            model.to(device)
+            processor = AutoProcessor.from_pretrained(model_id)
+            pipe = pipeline(
+                "automatic-speech-recognition",
+                model=model,
+                tokenizer=processor.tokenizer,
+                feature_extractor=processor.feature_extractor,
+                torch_dtype=torch_dtype,
+                device=device,
+            )
+        else:
+            raise ValueError("Invalid model choice")
+
+        # Rest of the code remains the same
 
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             audio_path = download_audio(input_source, download_method)
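The branching above rebuilds a model on every call to transcribe_audio, and it assumes AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, torch_dtype, and device are imported or defined elsewhere in app.py. A hedged sketch of loading each backend once and reusing it across Gradio requests; load_backend is an illustrative helper, not in this commit, and the two transformers branches are merged for brevity:

```python
# Illustrative: cache one backend per model choice so repeated requests
# don't re-download and re-initialize the model. Assumes WhisperModel,
# BatchedInferencePipeline, torch_dtype, and device from app.py's context.
from functools import lru_cache
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

@lru_cache(maxsize=None)
def load_backend(model_choice: str):
    if model_choice == "faster-whisper":
        model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32",
                             device="auto", compute_type="int8")
        return BatchedInferencePipeline(model=model)
    if model_choice in ("primeline/whisper-large-v3-german", "openai/whisper-large-v3"):
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_choice, torch_dtype=torch_dtype,
            low_cpu_mem_usage=True, use_safetensors=True,
        ).to(device)
        processor = AutoProcessor.from_pretrained(model_choice)
        return pipeline(
            "automatic-speech-recognition",
            model=model,
            tokenizer=processor.tokenizer,
            feature_extractor=processor.feature_extractor,
            chunk_length_s=30,
            return_timestamps=True,
            torch_dtype=torch_dtype,
            device=device,
        )
    raise ValueError(f"Invalid model choice: {model_choice}")
```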
@@ -160,19 +211,21 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
             trimmed_audio_path = trim_audio(audio_path, start_time or 0, end_time)
             audio_path = trimmed_audio_path
 
-        start_time_perf = time.time()
-        segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
-        end_time_perf = time.time()
+        if model_choice == "faster-whisper":
+            start_time_perf = time.time()
+            segments, info = batched_model.transcribe(audio_path, batch_size=batch_size, initial_prompt=None)
+            end_time_perf = time.time()
+        else:
+            start_time_perf = time.time()
+            result = pipe(audio_path)
+            segments = result["chunks"]
+            end_time_perf = time.time()
 
         transcription_time = end_time_perf - start_time_perf
-        real_time_factor = info.duration / transcription_time
        audio_file_size = os.path.getsize(audio_path) / (1024 * 1024)
 
         metrics_output = (
-            f"Language: {info.language}, Probability: {info.language_probability:.2f}\n"
-            f"Duration: {info.duration:.2f}s, Duration after VAD: {info.duration_after_vad:.2f}s\n"
             f"Transcription time: {transcription_time:.2f} seconds\n"
-            f"Real-time factor: {real_time_factor:.2f}x\n"
             f"Audio file size: {audio_file_size:.2f} MB\n"
         )
 
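One caveat on the timing above: in recent faster-whisper releases, transcribe returns segments lazily, so most decoding happens later, when the loop below consumes the generator, and the measured transcription_time can badly understate the real cost for that branch. A hedged sketch; the stopwatch helper is illustrative, not in this commit, and batched_model, audio_path, and batch_size are assumed from the surrounding code:

```python
import time
from contextlib import contextmanager

@contextmanager
def stopwatch(out: dict):
    # Minimal timing helper, not part of the committed app.py.
    start = time.time()
    try:
        yield
    finally:
        out["elapsed"] = time.time() - start

timing = {}
with stopwatch(timing):
    segments, info = batched_model.transcribe(audio_path, batch_size=batch_size)
    segments = list(segments)  # force the lazy generator inside the timed span
transcription_time = timing["elapsed"]
```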
@@ -182,7 +235,10 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
         transcription = ""
 
         for segment in segments:
-            transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
+            if model_choice == "faster-whisper":
+                transcription_segment = f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}\n"
+            else:
+                transcription_segment = f"[{segment['timestamp'][0]:.2f}s -> {segment['timestamp'][1]:.2f}s] {segment['text']}\n"
             transcription += transcription_segment
 
         if verbose:
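The per-segment branching could also be hoisted out of the loop; one sketch below, where iter_segments is an illustrative helper, not in the commit. Worth noting: with return_timestamps=True the transformers pipeline can leave the final chunk's end timestamp as None, which the committed :.2f formatting would not survive, so the helper guards for it:

```python
def iter_segments(segments, model_choice):
    """Normalize both backends' segment formats to (start, end, text)."""
    for seg in segments:
        if model_choice == "faster-whisper":
            yield seg.start, seg.end, seg.text
        else:
            start, end = seg["timestamp"]
            if end is None:  # transformers may leave the last chunk open-ended
                end = start
            yield start, end, seg["text"]

transcription = "".join(
    f"[{start:.2f}s -> {end:.2f}s] {text}\n"
    for start, end, text in iter_segments(segments, model_choice)
)
```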
@@ -205,14 +261,15 @@ def transcribe_audio(input_source, batch_size, download_method, start_time=None,
                 os.remove(trimmed_audio_path)
             except:
                 pass
-
+
 iface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
+        gr.Dropdown(choices=["faster-whisper", "primeline/whisper-large-v3-german", "openai/whisper-large-v3"], label="Model Choice", value="faster-whisper"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
-        gr.Number(label="Start Time (seconds)", value=0),
+        gr.Number(label="Start Time (seconds)", value=0),
         gr.Number(label="End Time (seconds)", value=0),
         gr.Checkbox(label="Verbose Output", value=False)
     ],
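Gradio binds inputs to the function's parameters positionally, so the new Model Choice dropdown must occupy the same slot as the new model_choice parameter, as it does here (second). An illustrative sanity check, not in app.py:

```python
import inspect

# The component order in inputs=[...] must mirror transcribe_audio's signature.
params = list(inspect.signature(transcribe_audio).parameters)
assert params[:4] == ["input_source", "model_choice", "batch_size", "download_method"]
```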
@@ -222,11 +279,11 @@ iface = gr.Interface(
         gr.File(label="Download Transcription")
     ],
     title="Multi-Model Transcription",
-    description="Transcribe audio using with Whisper.",
+    description="Transcribe audio using multiple models.",
     examples=[
-        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", 16, "yt-dlp", 0, None, False],
-        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", 16, "ffmpeg", 0, 300, True],
-        ["path/to/local/audio.mp3", 16, "yt-dlp", 60, 180, False]
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-whisper", 16, "yt-dlp", 0, None, False],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "primeline/whisper-large-v3-german", 16, "ffmpeg", 0, 300, True],
+        ["path/to/local/audio.mp3", "openai/whisper-large-v3", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False,
     live=True
 
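The updated example rows correctly gain the model-choice value in the second position, matching the reordered inputs. Separately, live=True re-runs transcription on every input change, which is expensive for long audio; an explicit submit (live=False, the Interface default) plus a request queue is a common alternative. Sketch only, not part of this commit:

```python
# queue() serializes long-running jobs; launch() starts the app as usual.
iface.queue()
iface.launch()
```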