|
import io
import logging
import time

import gradio as gr
import librosa
import soundfile as sf
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
|
|
|
|
|
|
|
# Load the Whisper checkpoint. Half precision is only used when a CUDA device
# is available; on CPU the weights stay in float32.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    pretrained_model_name_or_path="openai/whisper-large-v3",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    use_safetensors=True,
)

# Place the model on the GPU only when one exists, mirroring the dtype choice
# above, so the script also starts on CPU-only machines.
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
|
|
# Processor for the same checkpoint; its tokenizer and feature extractor are
# handed to the speech-recognition pipeline.
processor = AutoProcessor.from_pretrained(
    pretrained_model_name_or_path="openai/whisper-large-v3",
)
|
|
|
|
|
|
|
# Speech-recognition pipeline used by the UI callback.
# The already-loaded `model` object is passed in (instead of the checkpoint
# name) so the weights are not loaded a second time, and the device follows
# CUDA availability just like the dtype does.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=16,
    return_timestamps=True,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device="cuda" if torch.cuda.is_available() else "cpu",
)
|
|
|
|
|
|
|
def convert(audio, state=""):
    """Transcribe a recording and append the text to the running transcript.

    This function is used as the ``fn`` of the Gradio interface.

    Parameters:
    - audio: path to the recorded audio file (the interface uses
      ``type="filepath"``), or ``None`` when nothing has been recorded.
    - state: the text accumulated from previous conversions. ``None`` is
      treated as an empty transcript.

    Returns:
    A ``(textbox_text, new_state)`` tuple. On success both items are the
    updated transcript; otherwise the first item is a user-facing message
    and the second is the unchanged transcript.
    """
    # The state may arrive as None rather than the "" default; normalise it
    # so the concatenation below cannot raise TypeError.
    state = state or ""

    # Nothing recorded: answer directly instead of sending None to the model.
    if audio is None:
        return "Error processing audio: Please start recording!", state

    try:
        result = pipe(audio)
    except Exception:
        # UI boundary: keep the app responsive, but record the real cause
        # instead of silently discarding it.
        logging.getLogger(__name__).exception(
            "Transcription failed for %r", audio
        )
        return (
            "Error processing audio: transcription failed, please try again.",
            state,
        )

    state += result["text"] + " "
    return state, state
|
|
|
|
|
|
|
# Gradio UI: a microphone input plus a session-state slot feed `convert`, whose
# two return values go to the textbox and back into the state.
gr_interface = gr.Interface(
    fn=convert,
    title="Automatic Speech-to-Text",
    description="### Record your speech and watch it get converted to text!",
    inputs=[
        gr.Audio(
            label="Please Record Your Speech Here!",
            sources="microphone",
            type="filepath",
        ),
        "state",
    ],
    outputs=[
        "textbox",
        "state",
    ],
    # NOTE(review): confirm that "dark" resolves to the intended theme in the
    # installed Gradio version.
    theme="dark",
    live=True,
)

# Start the server only when run as a script, so importing this module does
# not launch the app as a side effect.
if __name__ == "__main__":
    gr_interface.launch()