# speech-to-text / app.py
# Author: micknikolic
# Commit: "Update app.py" (e2ab019)
import gradio as gr
import time
import io
import librosa
import torch
import soundfile as sf
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
# Resolve the target device and precision once so that the model and the
# pipeline always agree: half precision on a CUDA GPU, full precision on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

# Instantiating the model object.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path="openai/whisper-large-v3",
    torch_dtype=torch_dtype,
    use_safetensors=True,
)
# Move to the detected device instead of unconditionally to "cuda", which
# fails on hosts without a GPU.
model = model.to(device)

# Instantiating the processor object (provides the tokenizer and the feature extractor).
processor = AutoProcessor.from_pretrained(pretrained_model_name_or_path="openai/whisper-large-v3")

# Instantiating the transformers pipeline object.
# The already-loaded `model` is passed in; giving the checkpoint name here
# instead would make the pipeline load a second copy of the weights and leave
# the instance created above unused.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
)
#Defining speech-to-text function.
def convert(audio, state=""):
    """
    Transcribe a recording and append the text to the running transcript.

    This function is used as the callback of Gradio's Interface function.

    Parameters:
    - audio: the recording to transcribe; it is passed unchanged to the
      module-level `pipe`. The Interface in this file declares its audio input
      with type="filepath", so a file path is expected here. None is treated
      as "nothing recorded yet".
    - state: a string representing the accumulated text from previous
      conversions. None is treated as an empty transcript.

    Returns:
    - a (textbox_text, state) tuple. On success both items are the updated
      transcript. On failure the first item is an error message and the
      second is the transcript, unchanged.
    """
    # Function-scope import: the file's import block does not provide logging.
    import logging

    error_message = "Error processing audio: Please start recording!"

    # A missing state would make the string concatenation below fail.
    state = state or ""

    # Nothing to transcribe: answer immediately rather than sleeping and then
    # letting the pipeline raise on a None input.
    if audio is None:
        return error_message, state

    # Fixed 3-second pause kept from the original implementation.
    # NOTE(review): its purpose is not evident from this file - presumably it
    # throttles the live interface; confirm before removing.
    time.sleep(3)
    try:
        result = pipe(audio)
        transcribed_text = result['text']
        state += transcribed_text + " "
    except Exception:
        # UI boundary: keep the interface responsive, but record the real
        # cause instead of discarding it silently.
        logging.getLogger(__name__).exception("Speech-to-text conversion failed")
        return error_message, state
    return state, state
# Building the Gradio Interface.
# Microphone recording widget; type="filepath" means the recording reaches
# `convert` as a path to an audio file.
microphone_input = gr.Audio(
    type="filepath",
    sources="microphone",
    label="Please Record Your Speech Here!",
)

# The "state" entries on both sides carry the accumulated transcript from one
# call of `convert` to the next; the textbox shows the current transcript.
gr_interface = gr.Interface(
    title="Automatic Speech-to-Text",
    description="### Record your speech and watch it get converted to text!",
    fn=convert,
    inputs=[microphone_input, "state"],
    outputs=["textbox", "state"],
    live=True,
    theme="dark",
)

# Launching the app with the default launch settings.
gr_interface.launch()