Spaces:
Sleeping
Sleeping
import gradio as gr | |
import speech_recognition as sr | |
from pydub import AudioSegment | |
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor | |
import os | |
import torch | |
# Checkpoint identifier shared by the processor and the CTC model below.
MODEL_ID = 'jonatasgrosman/wav2vec2-large-xlsr-53-portuguese'

# Load the pre-trained processor (bound to the name `tokenizer`) and the
# Wav2Vec2 CTC model from the same checkpoint.
tokenizer = Wav2Vec2Processor.from_pretrained(MODEL_ID)
model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)

# SpeechRecognition recognizer instance.
recognizer = sr.Recognizer()
def recognize_speech(audio_path):
    """Transcribe an audio file to lowercase text with the Wav2Vec2 CTC model.

    Args:
        audio_path: Path to the audio file to transcribe. A falsy value
            (``None`` or ``""``) is treated as "no recording".

    Returns:
        The lowercase transcript as a plain string, or
        ``"Could not understand the audio."`` when there is no recording,
        the file cannot be decoded, or the clip contains no samples.
    """
    print(audio_path)
    if not audio_path:
        # Presumably what the UI sends when nothing was recorded; bail out
        # before handing an empty path to the decoder.
        return "Could not understand the audio."

    # Local import: brings the decoder's error type into scope without
    # changing the file's top-level import block.
    from pydub.exceptions import CouldntDecodeError

    try:
        clip = AudioSegment.from_file(audio_path)
    except CouldntDecodeError:
        return "Could not understand the audio."

    # Resample to the 16 kHz rate declared to the processor below, and
    # downmix to one channel: get_array_of_samples() interleaves channels,
    # so a stereo clip would otherwise be read as a single distorted signal.
    clip = clip.set_frame_rate(16000).set_channels(1)
    samples = torch.FloatTensor(clip.get_array_of_samples())
    if samples.numel() == 0:
        # Nothing to feed the model.
        return "Could not understand the audio."

    inputs = tokenizer(
        samples, sampling_rate=16000, return_tensors='pt', padding='longest'
    ).input_values

    # Inference only, so skip autograd bookkeeping.
    with torch.no_grad():
        logits = model(inputs).logits

    # Greedy CTC decoding: most likely token id at every frame.
    tokens = torch.argmax(logits, dim=-1)

    # batch_decode yields one string per batch item; a single clip was
    # passed, so return that string rather than the repr of the list.
    transcripts = tokenizer.batch_decode(tokens)
    return transcripts[0].lower()
# Microphone input component; type="filepath" is the setting under which
# recognize_speech receives a path to the recording.
microphone_input = gr.inputs.Audio(
    source="microphone",
    type="filepath",
    label="Speak into the microphone...",
)

# Build the Gradio interface around recognize_speech with a text output.
audio_recognizer_interface = gr.Interface(
    fn=recognize_speech,
    inputs=microphone_input,
    outputs="text",
    title="Real-time Speech Recognition",
)

# Run the interface
audio_recognizer_interface.launch()