Vinicius Oliveira
adicionando requirements e app
1b4bf5a
import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import os
import torch
# Hugging Face checkpoint id shared by the processor and the CTC model.
_CHECKPOINT = 'jonatasgrosman/wav2vec2-large-xlsr-53-portuguese'

# Load the pre-trained speech recognition model and its matching processor
# (the processor is bound to the name `tokenizer`).
tokenizer = Wav2Vec2Processor.from_pretrained(_CHECKPOINT)
model = Wav2Vec2ForCTC.from_pretrained(_CHECKPOINT)

# speech_recognition recognizer instance, created once at import time.
recognizer = sr.Recognizer()
def recognize_speech(audio_path):
    """Transcribe a recorded audio file to lower-case text.

    The clip is decoded with pydub, resampled to 16 kHz, downmixed to mono
    and run through the module-level Wav2Vec2 ``tokenizer`` / ``model`` pair
    using greedy (argmax) decoding of the model's logits.

    Args:
        audio_path: Path of the audio file to transcribe. ``None`` or an
            empty string is accepted (presumably what the UI sends when
            nothing was recorded -- verify against the Gradio version used).

    Returns:
        The lower-cased transcription as a plain string, or a
        human-readable message when there is no audio to transcribe or the
        file cannot be decoded.
    """
    if not audio_path:
        return "Could not understand the audio."

    # Local import: the file only imports AudioSegment from pydub at the top.
    from pydub.exceptions import CouldntDecodeError

    print(audio_path)
    try:
        clip = AudioSegment.from_file(audio_path)
    except (OSError, CouldntDecodeError) as e:
        return f"Could not read the audio file: {e}"

    # The model is fed 16 kHz audio. Downmix to mono as well, because
    # get_array_of_samples() interleaves the channels of multi-channel audio
    # and the interleaved stream would otherwise be read as one waveform.
    clip = clip.set_frame_rate(16000).set_channels(1)
    samples = clip.get_array_of_samples()
    if len(samples) == 0:
        return "Could not understand the audio."

    waveform = torch.tensor(samples, dtype=torch.float32)
    inputs = tokenizer(
        waveform.numpy(),
        sampling_rate=16000,
        return_tensors='pt',
        padding='longest',
    ).input_values

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(inputs).logits

    tokens = torch.argmax(logits, dim=-1)
    text = tokenizer.batch_decode(tokens)
    # batch_decode returns one string per batch item; a single clip was
    # submitted, so return that string rather than the repr of the list.
    return text[0].lower()
# Microphone capture component, configured with type="filepath".
# NOTE(review): gr.inputs is Gradio's legacy component namespace -- confirm
# that the pinned Gradio version still provides it.
_microphone_input = gr.inputs.Audio(
    label="Speak into the microphone...",
    type="filepath",
    source="microphone",
)

# Create the Gradio interface that wires the microphone to the recognizer.
audio_recognizer_interface = gr.Interface(
    title="Real-time Speech Recognition",
    outputs="text",
    inputs=_microphone_input,
    fn=recognize_speech,
)

# Run the interface
audio_recognizer_interface.launch()