import gradio as gr
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# Load the Whisper large-v2 processor and model once at startup.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
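
# Optional, not in the original snippet: move the model to a GPU when one is
# available; generation is much faster there. Inputs are moved to model.device
# inside transcribe_audio, so this also runs unchanged on CPU.
import torch

model.to("cuda" if torch.cuda.is_available() else "cpu")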

def transcribe_audio(audio_path: str) -> str:
    # Whisper expects 16 kHz mono audio; librosa decodes the uploaded file
    # and resamples it to that rate.
    audio_data, sampling_rate = librosa.load(audio_path, sr=16000)

    # Convert the waveform into log-mel spectrogram input features.
    input_features = processor(
        audio_data, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Clear any forced decoder ids so the model detects language and task.
    model.config.forced_decoder_ids = None

    # Move features to the model's device (a no-op on CPU) and generate.
    predicted_ids = model.generate(input_features.to(model.device))
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    return transcription[0]
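
# Alternative (an assumption, not part of the original snippet): the
# Transformers ASR pipeline wraps the same model and also chunks audio
# longer than 30 seconds. A minimal sketch:
#
#   from transformers import pipeline
#   asr = pipeline("automatic-speech-recognition",
#                  model="openai/whisper-large-v2", chunk_length_s=30)
#   text = asr(audio_path)["text"]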

# gr.inputs and gr.outputs were removed in modern Gradio; use the top-level
# components, and pass a file path rather than a raw file object.
audio_input = gr.Audio(type="filepath", label="Upload an audio file")
text_output = gr.Textbox(label="Transcription")

# capture_session no longer exists in Gradio, and live=True would re-run a
# large model on every input change, so both flags are dropped.
iface = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=text_output,
    title="Speech-to-Text using Whisper v2",
    description="Upload an audio file to transcribe it to text.",
    theme=gr.themes.Monochrome(),
)

iface.launch()
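
# Quick smoke test without the UI (the path is illustrative):
#
#   print(transcribe_audio("sample.wav"))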