import concurrent.futures
import time

import gradio as gr
import spaces
import torch
from transformers import pipeline

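# The two Whisper checkpoints being compared.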
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_BASE = "openai/whisper-large-v3"

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = 0 if torch.cuda.is_available() else "cpu"

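# Build each ASR pipeline once at startup so every request reuses the loaded models.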
pipe_turbo = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_TURBO,
    chunk_length_s=30,
    device=device,
)

pipe_base = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME_BASE,
    chunk_length_s=30,
    device=device,
)

@spaces.GPU
def transcribe_turbo(audio):
    """Transcribe audio with the turbo model, returning the text and elapsed seconds."""
    start_time = time.time()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_turbo, elapsed_time


@spaces.GPU
def transcribe_base(audio):
    """Transcribe audio with the base large-v3 model, returning the text and elapsed seconds."""
    start_time = time.time()
    text_base = pipe_base(audio)["text"]
    elapsed_time = time.time() - start_time
    return text_base, elapsed_time

@spaces.GPU
def compare_transcriptions(audio):
    if audio is None:
        raise gr.Error("No audio file submitted! Please record an audio clip before submitting your request.")

    # Run both models concurrently so the total wait is bounded by the slower one.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_base = executor.submit(transcribe_base, audio)

        text_turbo, time_turbo = future_turbo.result()
        text_base, time_base = future_base.result()

    # Return flat values: Gradio maps one value to each of the four output components.
    return text_base, f"{time_base:.2f} seconds", text_turbo, f"{time_turbo:.2f} seconds"

css = """ |
|
h1 { |
|
text-align: center; |
|
display:block; |
|
} |
|
""" |
|
|
|
|
|
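# Assemble the interface: microphone input on top, side-by-side results below.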
with gr.Blocks(css=css) as demo:
    gr.Markdown("# Whisper large-v3-turbo vs Whisper large-v3")
    gr.Markdown(
        "This app compares transcription performance and processing time between "
        "OpenAI's Whisper large-v3-turbo and its base model, Whisper large-v3."
    )

    with gr.Column():
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")

        with gr.Row():
            with gr.Group():
                gr.Markdown("### 🐢 **Base model**")
                base_output = gr.Textbox(label="Transcription")
                base_time = gr.Textbox(label="Processing Time")
            with gr.Group():
                gr.Markdown("### ⚡ **Turbo model**")
                turbo_output = gr.Textbox(label="Transcription")
                turbo_time = gr.Textbox(label="Processing Time")

    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[base_output, base_time, turbo_output, turbo_time],
    )

demo.launch()