|
import torch |
|
import gradio as gr |
|
from transformers import pipeline |
|
import concurrent.futures |
|
import time |
|
|
|
|
|
# Hugging Face model ids for the two Whisper variants under comparison.
MODEL_NAME_TURBO = "openai/whisper-large-v3-turbo"
MODEL_NAME_STANDARD = "openai/whisper-large-v3"

# Pipeline device spec: CPU fallback unless a CUDA GPU is present
# (transformers accepts a GPU index, here the first GPU).
device = "cpu" if not torch.cuda.is_available() else 0
|
|
|
|
|
pipe_turbo = pipeline( |
|
task="automatic-speech-recognition", |
|
model=MODEL_NAME_TURBO, |
|
chunk_length_s=30, |
|
device=device, |
|
) |
|
|
|
pipe_standard = pipeline( |
|
task="automatic-speech-recognition", |
|
model=MODEL_NAME_STANDARD, |
|
chunk_length_s=30, |
|
device=device, |
|
) |
|
|
|
|
|
def transcribe_turbo(audio):
    """Transcribe *audio* with the turbo Whisper pipeline and time it.

    Parameters
    ----------
    audio : str
        Filepath of the recorded audio (Gradio ``type="filepath"``).

    Returns
    -------
    tuple[str, float]
        The transcription text and the elapsed wall time in seconds.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or
    # be skewed by NTP/system clock adjustments (time.time can be).
    start_time = time.perf_counter()
    text_turbo = pipe_turbo(audio)["text"]
    elapsed_time = time.perf_counter() - start_time
    return text_turbo, elapsed_time
|
|
|
|
|
def transcribe_standard(audio):
    """Transcribe *audio* with the standard Whisper pipeline and time it.

    Parameters
    ----------
    audio : str
        Filepath of the recorded audio (Gradio ``type="filepath"``).

    Returns
    -------
    tuple[str, float]
        The transcription text and the elapsed wall time in seconds.
    """
    # perf_counter is monotonic, so the measurement cannot go negative or
    # be skewed by NTP/system clock adjustments (time.time can be).
    start_time = time.perf_counter()
    text_standard = pipe_standard(audio)["text"]
    elapsed_time = time.perf_counter() - start_time
    return text_standard, elapsed_time
|
|
|
|
|
def compare_transcriptions(audio):
    """Run both models on *audio* concurrently and report text + timing.

    Parameters
    ----------
    audio : str | None
        Filepath of the recorded audio, or ``None`` when nothing was
        recorded before the button was pressed.

    Returns
    -------
    tuple[str, str, str, str]
        ``(standard_text, standard_time, turbo_text, turbo_time)`` — one
        flat value per Gradio output component, matching the ``outputs``
        order declared on the click handler.

    Raises
    ------
    gr.Error
        If no audio was submitted.
    """
    if audio is None:
        raise gr.Error("No audio file submitted! Please record an audio before submitting your request.")

    # Run both transcriptions in parallel threads; the GIL is released
    # during the heavy native inference work, so the two genuinely overlap.
    with concurrent.futures.ThreadPoolExecutor() as executor:
        future_turbo = executor.submit(transcribe_turbo, audio)
        future_standard = executor.submit(transcribe_standard, audio)

        text_turbo, time_turbo = future_turbo.result()
        text_standard, time_standard = future_standard.result()

    # BUG FIX: Gradio assigns one returned value per output component.
    # The original returned two nested 2-tuples for four outputs, so the
    # components received the wrong values; return a flat 4-tuple instead.
    return (
        text_standard,
        f"{time_standard:.2f} seconds",
        text_turbo,
        f"{time_turbo:.2f} seconds",
    )
|
|
|
# Custom CSS injected into the Blocks app: centers the H1 title rendered
# by the first gr.Markdown("# ...") call.
css = """
h1 {
text-align: center;
display:block;
}
"""
|
|
|
|
|
with gr.Blocks(css=css) as demo:
    # Header + one-line description of what the comparison does.
    gr.Markdown("# Whisper large-v3-turbo ...vs... Whisper large-v3")
    gr.Markdown(
        "This app compares the transcription performance and processing time "
        "between openAI 'Whisper large-v3' and 'Whisper large-v3-turbo' models"
    )

    with gr.Column():
        # Input area: microphone recorder plus the trigger button.
        with gr.Row():
            with gr.Group():
                audio_input = gr.Audio(sources=["microphone"], type="filepath")
                transcribe_button = gr.Button("Start transcription", variant="primary")

        # Result area: one group per model, each with text + timing boxes.
        # NOTE(review): the headings below contain what looks like
        # mis-encoded emoji ("π", "β‘") — preserved verbatim; confirm
        # intended characters with the author.
        with gr.Row():
            with gr.Row():
                with gr.Group():
                    gr.Markdown("### π **Standard model**")
                    standard_output = gr.Textbox(label="Transcription")
                    standard_time = gr.Textbox(label="Processing Time")
                with gr.Group():
                    gr.Markdown("### β‘ **Turbo model**")
                    turbo_output = gr.Textbox(label="Transcription")
                    turbo_time = gr.Textbox(label="Processing Time")

    # Wire the button: one output component per value the handler returns.
    transcribe_button.click(
        fn=compare_transcriptions,
        inputs=audio_input,
        outputs=[standard_output, standard_time, turbo_output, turbo_time],
    )

demo.launch()
|
|