cstr committed
Commit d680d0f
Parent: 87f3409

different models

Files changed (1): app.py (+39 -33)
app.py CHANGED
@@ -157,49 +157,48 @@ def save_transcription(transcription):
         f.write(transcription)
     return file_path
 
-def transcribe_audio(input_source, model_choice, batch_size, download_method, start_time=None, end_time=None, verbose=False):
+
+def get_model_options(pipeline_type):
+    if pipeline_type == "faster-batched":
+        return ["cstr/whisper-large-v3-turbo-int8_float32", "deepdml/faster-whisper-large-v3-turbo-ct2", "Systran/faster-whisper-large-v3", "GalaktischeGurke/primeline-whisper-large-v3-german-ct2"]
+    elif pipeline_type == "faster-sequenced":
+        return ["cstr/whisper-large-v3-turbo-int8_float32", "deepdml/faster-whisper-large-v3-turbo-ct2", "Systran/faster-whisper-large-v3", "GalaktischeGurke/primeline-whisper-large-v3-german-ct2"]
+    elif pipeline_type == "transformers":
+        return ["openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "primeline/whisper-large-v3-german"]
+    else:
+        return []
+
+def update_model_dropdown(pipeline_type):
+    return gr.Dropdown.update(choices=get_model_options(pipeline_type), value=get_model_options(pipeline_type)[0])
+
+def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
-        if model_choice == "faster-whisper":
-            model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32", device="auto", compute_type="int8")
-            batched_model = BatchedInferencePipeline(model=model)
-        elif model_choice == "primeline/whisper-large-v3-german":
-            model_id = "primeline/whisper-large-v3-german"
+        if pipeline_type == "faster-batched":
+            model = WhisperModel(model_id, device="auto", compute_type=dtype)
+            pipeline = BatchedInferencePipeline(model=model)
+        elif pipeline_type == "faster-sequenced":
+            model = WhisperModel(model_id)
+            pipeline = model.transcribe
+        elif pipeline_type == "transformers":
+            torch_dtype = torch.float16 if dtype == "float16" else torch.float32
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
             )
             model.to(device)
             processor = AutoProcessor.from_pretrained(model_id)
-            pipe = pipeline(
+            pipeline = pipeline(
                 "automatic-speech-recognition",
                 model=model,
                 tokenizer=processor.tokenizer,
                 feature_extractor=processor.feature_extractor,
-                max_new_tokens=128,
                 chunk_length_s=30,
                 batch_size=batch_size,
                 return_timestamps=True,
                 torch_dtype=torch_dtype,
                 device=device,
             )
-        elif model_choice == "openai/whisper-large-v3":
-            model_id = "openai/whisper-large-v3"
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-            )
-            model.to(device)
-            processor = AutoProcessor.from_pretrained(model_id)
-            pipe = pipeline(
-                "automatic-speech-recognition",
-                model=model,
-                tokenizer=processor.tokenizer,
-                feature_extractor=processor.feature_extractor,
-                torch_dtype=torch_dtype,
-                device=device,
-            )
         else:
-            raise ValueError("Invalid model choice")
-
-        # Rest of the code remains the same
+            raise ValueError("Invalid pipeline type")
 
         if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
             audio_path = download_audio(input_source, download_method)
@@ -268,7 +267,9 @@ iface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
-        gr.Dropdown(choices=["faster-whisper", "primeline/whisper-large-v3-german", "openai/whisper-large-v3"], label="Model Choice", value="faster-whisper"),
+        gr.Dropdown(choices=["faster-batched", "faster-sequenced", "transformers"], label="Pipeline Type", value="faster-batched"),
+        gr.Dropdown(label="Model", choices=get_model_options("faster-batched"), value=get_model_options("faster-batched")[0]),
+        gr.Dropdown(choices=["int8", "float16", "float32"], label="Data Type", value="int8"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
         gr.Number(label="Start Time (seconds)", value=0),
@@ -280,15 +281,20 @@ iface = gr.Interface(
         gr.Textbox(label="Transcription", lines=10),
         gr.File(label="Download Transcription")
     ],
-    title="Multi-Model Transcription",
-    description="Transcribe audio using multiple models.",
+    title="Multi-Pipeline Transcription",
+    description="Transcribe audio using multiple pipelines and models.",
     examples=[
-        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-whisper", 16, "yt-dlp", 0, None, False],
-        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "primeline/whisper-large-v3-german", 16, "ffmpeg", 0, 300, True],
-        ["path/to/local/audio.mp3", "openai/whisper-large-v3", 16, "yt-dlp", 60, 180, False]
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", 0, None, False],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
+        ["path/to/local/audio.mp3", "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, False]
    ],
     cache_examples=False,
     live=True
 )
 
-iface.launch()
+iface.launch()
+
+pipeline_type_dropdown = iface.inputs[1]
+model_dropdown = iface.inputs[2]
+
+pipeline_type_dropdown.change(update_model_dropdown, inputs=[pipeline_type_dropdown], outputs=[model_dropdown])
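
The code that actually consumes `pipeline` after the branch above falls outside the hunks shown. As a rough reference only, here is a minimal sketch of how the two faster-whisper pipeline types introduced by this commit are typically driven; the model name, audio path, and batch size are illustrative, not taken from app.py:

```python
from faster_whisper import WhisperModel, BatchedInferencePipeline

# Illustrative choices; in app.py these come from the Gradio inputs.
model = WhisperModel("cstr/whisper-large-v3-turbo-int8_float32",
                     device="auto", compute_type="int8")

# "faster-batched": wrap the model and pass batch_size at transcription time.
batched = BatchedInferencePipeline(model=model)
segments, info = batched.transcribe("audio.mp3", batch_size=16)

# "faster-sequenced": call the model's own transcribe method directly.
# segments, info = model.transcribe("audio.mp3")

for segment in segments:
    print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")
```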
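For comparison, the pipeline-type → model coupling that the new `update_model_dropdown` implements can also be expressed with the `gr.Blocks` API, wiring the `.change` event inside the layout context and returning `gr.update(...)` from the handler. This is only a sketch under the assumption of a recent Gradio release, not the code in this commit (which attaches the event to `iface.inputs` after `iface.launch()`); `get_model_options` is the function defined in the diff above:

```python
import gradio as gr

def update_model_dropdown(pipeline_type):
    options = get_model_options(pipeline_type)  # defined in app.py (see diff)
    # Returning gr.update(...) swaps the Model dropdown's choices in place.
    return gr.update(choices=options, value=options[0] if options else None)

with gr.Blocks() as demo:
    pipeline_type = gr.Dropdown(
        choices=["faster-batched", "faster-sequenced", "transformers"],
        value="faster-batched", label="Pipeline Type",
    )
    model = gr.Dropdown(
        choices=get_model_options("faster-batched"),
        value=get_model_options("faster-batched")[0], label="Model",
    )
    # Refresh the model list whenever the pipeline type changes.
    pipeline_type.change(update_model_dropdown, inputs=pipeline_type, outputs=model)

demo.launch()
```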