different models
app.py CHANGED
@@ -157,49 +157,48 @@ def save_transcription(transcription):
         f.write(transcription)
         return file_path
 
-def transcribe_audio(…):
+
+def get_model_options(pipeline_type):
+    if pipeline_type == "faster-batched":
+        return ["cstr/whisper-large-v3-turbo-int8_float32", "deepdml/faster-whisper-large-v3-turbo-ct2", "Systran/faster-whisper-large-v3", "GalaktischeGurke/primeline-whisper-large-v3-german-ct2"]
+    elif pipeline_type == "faster-sequenced":
+        return ["cstr/whisper-large-v3-turbo-int8_float32", "deepdml/faster-whisper-large-v3-turbo-ct2", "Systran/faster-whisper-large-v3", "GalaktischeGurke/primeline-whisper-large-v3-german-ct2"]
+    elif pipeline_type == "transformers":
+        return ["openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "primeline/whisper-large-v3-german"]
+    else:
+        return []
+
+def update_model_dropdown(pipeline_type):
+    return gr.Dropdown.update(choices=get_model_options(pipeline_type), value=get_model_options(pipeline_type)[0])
+
+def transcribe_audio(input_source, pipeline_type, model_id, dtype, batch_size, download_method, start_time=None, end_time=None, verbose=False):
     try:
-        if …
-            model = WhisperModel(…
-            …
-        elif …
-            …
+        if pipeline_type == "faster-batched":
+            model = WhisperModel(model_id, device="auto", compute_type=dtype)
+            pipeline = BatchedInferencePipeline(model=model)
+        elif pipeline_type == "faster-sequenced":
+            model = WhisperModel(model_id)
+            pipeline = model.transcribe
+        elif pipeline_type == "transformers":
+            torch_dtype = torch.float16 if dtype == "float16" else torch.float32
             model = AutoModelForSpeechSeq2Seq.from_pretrained(
                 model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
             )
             model.to(device)
             processor = AutoProcessor.from_pretrained(model_id)
-            pipe = pipeline(
+            pipeline = pipeline(
                 "automatic-speech-recognition",
                 model=model,
                 tokenizer=processor.tokenizer,
                 feature_extractor=processor.feature_extractor,
-                max_new_tokens=128,
                 chunk_length_s=30,
                 batch_size=batch_size,
                 return_timestamps=True,
                 torch_dtype=torch_dtype,
                 device=device,
             )
-        elif model_choice == "openai/whisper-large-v3":
-            model_id = "openai/whisper-large-v3"
-            model = AutoModelForSpeechSeq2Seq.from_pretrained(
-                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
-            )
-            model.to(device)
-            processor = AutoProcessor.from_pretrained(model_id)
-            pipe = pipeline(
-                "automatic-speech-recognition",
-                model=model,
-                tokenizer=processor.tokenizer,
-                feature_extractor=processor.feature_extractor,
-                torch_dtype=torch_dtype,
-                device=device,
-            )
         else:
-            raise ValueError("Invalid …
-
-        # Rest of the code remains the same
+            raise ValueError("Invalid pipeline type")
 
     if isinstance(input_source, str) and (input_source.startswith('http://') or input_source.startswith('https://')):
         audio_path = download_audio(input_source, download_method)
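Note: the hunk ends inside transcribe_audio before the three pipeline objects are invoked, and the branches leave `pipeline` holding different types (a BatchedInferencePipeline instance, the bound method model.transcribe, and a transformers pipeline). Also, `pipeline = pipeline(...)` in the transformers branch raises UnboundLocalError at runtime: assigning to `pipeline` anywhere in the function makes the name local, hiding the imported transformers factory on the right-hand side; a differently named variable (e.g. asr_pipeline) avoids this. A minimal sketch of a downstream call site handling all three branches (a hypothetical continuation, not the committed code; `audio_path` comes from download_audio below):

    if pipeline_type in ("faster-batched", "faster-sequenced"):
        if pipeline_type == "faster-batched":
            # a BatchedInferencePipeline is driven via its transcribe() method
            segments, info = pipeline.transcribe(audio_path, batch_size=batch_size)
        else:
            # here `pipeline` is the bound method model.transcribe
            segments, info = pipeline(audio_path)
        # faster-whisper yields Segment objects lazily; join them with timestamps
        transcription = "\n".join(
            f"[{s.start:.2f}s -> {s.end:.2f}s] {s.text.strip()}" for s in segments
        )
    else:
        # a transformers ASR pipeline returns {"text": ..., "chunks": [...]}
        transcription = pipeline(audio_path)["text"]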
@@ -268,7 +267,9 @@ iface = gr.Interface(
     fn=transcribe_audio,
     inputs=[
         gr.Textbox(label="Audio Source (Upload, URL, or YouTube URL)"),
-        gr.Dropdown(choices=["faster-…
+        gr.Dropdown(choices=["faster-batched", "faster-sequenced", "transformers"], label="Pipeline Type", value="faster-batched"),
+        gr.Dropdown(label="Model", choices=get_model_options("faster-batched"), value=get_model_options("faster-batched")[0]),
+        gr.Dropdown(choices=["int8", "float16", "float32"], label="Data Type", value="int8"),
         gr.Slider(minimum=1, maximum=32, step=1, value=16, label="Batch Size"),
         gr.Dropdown(choices=["yt-dlp", "pytube", "youtube-dl", "yt-dlp-alt", "ffmpeg", "aria2", "wget"], label="Download Method", value="yt-dlp"),
         gr.Number(label="Start Time (seconds)", value=0),
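Note: the new "Data Type" value is interpreted differently per pipeline type. The "faster-batched" branch passes it to CTranslate2 as compute_type (a quantization mode; hybrid modes such as "int8_float16" also exist, cf. the cstr/whisper-large-v3-turbo-int8_float32 checkpoint above), the "faster-sequenced" branch ignores it entirely (WhisperModel(model_id) uses the default compute type), and the transformers branch folds everything except "float16" to float32, so selecting "int8" there silently runs full precision:

    # faster-whisper: the string is forwarded to CTranslate2 as-is
    model = WhisperModel(model_id, device="auto", compute_type=dtype)  # "int8" | "float16" | "float32"

    # transformers: anything other than "float16" falls back to float32
    torch_dtype = torch.float16 if dtype == "float16" else torch.float32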
@@ -280,15 +281,20 @@ iface = gr.Interface(
         gr.Textbox(label="Transcription", lines=10),
         gr.File(label="Download Transcription")
     ],
-    title="Multi-…
-    description="Transcribe audio using multiple models.",
+    title="Multi-Pipeline Transcription",
+    description="Transcribe audio using multiple pipelines and models.",
     examples=[
-        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-whisper", 16, "yt-dlp", 0, None, False],
-        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "…
-        ["path/to/local/audio.mp3", "openai/whisper-large-v3", 16, "yt-dlp", 60, 180, False]
+        ["https://www.youtube.com/watch?v=daQ_hqA6HDo", "faster-batched", "cstr/whisper-large-v3-turbo-int8_float32", "int8", 16, "yt-dlp", 0, None, False],
+        ["https://mcdn.podbean.com/mf/web/dir5wty678b6g4vg/HoP_453_-_The_Price_is_Right_-_Law_and_Economics_in_the_Second_Scholastic5yxzh.mp3", "faster-sequenced", "deepdml/faster-whisper-large-v3-turbo-ct2", "float16", 1, "ffmpeg", 0, 300, True],
+        ["path/to/local/audio.mp3", "transformers", "openai/whisper-large-v3", "float16", 16, "yt-dlp", 60, 180, False]
     ],
     cache_examples=False,
     live=True
 )
 
-iface.launch()
+iface.launch()
+
+pipeline_type_dropdown = iface.inputs[1]
+model_dropdown = iface.inputs[2]
+
+pipeline_type_dropdown.change(update_model_dropdown, inputs=[pipeline_type_dropdown], outputs=[model_dropdown])
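Note: as committed, the dependent-dropdown wiring at the bottom likely never takes effect. iface.launch() blocks in a script, so the lines after it run only on shutdown, event listeners must be registered while the UI is being built (calling .change() outside a Blocks/Interface context does not attach to the running app), and gr.Dropdown.update was removed in Gradio 4.x in favor of gr.update. A sketch of the same dynamic dropdown built with gr.Blocks, assuming Gradio 4.x; get_model_options is the function added above, while the names demo and on_pipeline_change are illustrative, and the remaining transcription inputs/outputs are omitted for brevity:

    import gradio as gr

    with gr.Blocks(title="Multi-Pipeline Transcription") as demo:
        pipeline_type = gr.Dropdown(
            choices=["faster-batched", "faster-sequenced", "transformers"],
            label="Pipeline Type", value="faster-batched",
        )
        model = gr.Dropdown(
            choices=get_model_options("faster-batched"),
            value=get_model_options("faster-batched")[0],
            label="Model",
        )

        def on_pipeline_change(pipeline_type):
            options = get_model_options(pipeline_type)
            # Gradio 4.x: return gr.update(...) instead of gr.Dropdown.update(...)
            return gr.update(choices=options, value=options[0])

        # register the event while the Blocks context is open, before launch()
        pipeline_type.change(on_pipeline_change, inputs=pipeline_type, outputs=model)

    demo.launch()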