# Hugging Face Spaces app — Hindi ASR demo.
# (Export captured the Space page header here: "Spaces: Runtime error".)
import os
import subprocess
import sys

# Pin the Gradio version this demo was written against. Using
# `sys.executable -m pip` guarantees the install targets the interpreter
# actually running this script (a bare `pip` on PATH may belong to a
# different environment), and passing the command as an argument list
# avoids shell-string quoting issues that `os.system` is prone to.
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "gradio"], check=False)
subprocess.run([sys.executable, "-m", "pip", "install", "gradio==3.5"], check=True)
# import gradio as gr | |
# import torch | |
# import librosa | |
# import soundfile | |
# import nemo.collections.asr as nemo_asr | |
# import tempfile | |
# import os | |
# import uuid | |
# import wget | |
# model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo' | |
# wget.download(model_url) | |
# SAMPLE_RATE = 16000 | |
# # Load pre-trained model | |
# model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("./Conformer-CTC-BPE-Large.nemo") | |
# model.change_decoding_strategy(None) | |
# model.eval() | |
# def process_audio_file(file_path): | |
# print(file_path) | |
# print(SAMPLE_RATE) | |
# # Load audio file | |
# data, sr = librosa.load(file_path, sr=SAMPLE_RATE) | |
# # # Resample if necessary | |
# # if sr != SAMPLE_RATE: | |
# # data = librosa.resample(data, sr, SAMPLE_RATE) | |
# # Convert to mono channel | |
# data = librosa.to_mono(data) | |
# return data | |
# def transcribe(audio): | |
# # Handle warning message | |
# # Process audio file | |
# sr, data = audio | |
# if sr != SAMPLE_RATE: | |
# data = librosa.resample(data, sr, SAMPLE_RATE) | |
# with tempfile.TemporaryDirectory() as tmpdir: | |
# # Save audio data to a temporary file | |
# audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav') | |
# soundfile.write(audio_path, data, SAMPLE_RATE) | |
# # Transcribe audio | |
# transcriptions = model.transcribe([audio_path]) | |
# # Extract best hypothesis if transcriptions form a tuple (from RNNT) | |
# if isinstance(transcriptions, tuple) and len(transcriptions) == 2: | |
# transcriptions = transcriptions[0] | |
# return warn_output + transcriptions[0] | |
# iface = gr.Interface( | |
# fn=transcribe, | |
# inputs=gr.Audio(sources=["microphone"]), | |
# outputs="textbox", | |
# title="NeMo Conformer Transducer Large - English", | |
# description="Demo for English speech recognition using Conformer Transducers", | |
# allow_flagging='never', | |
# ) | |
# iface.queue(max_size=10) | |
# iface.launch() | |
import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
import wget

# Vakyansh Hindi Conformer checkpoint.
# NOTE(review): the filename says "CTC" but it is loaded below with the
# RNNT/Transducer model class — confirm the checkpoint architecture really
# matches EncDecRNNTBPEModel.
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
MODEL_PATH = './Conformer-CTC-BPE-Large.nemo'

# Only download the (large) checkpoint when it is not already on disk, e.g.
# after a Space restart; the original unconditional wget.download re-fetched
# it on every startup.
if not os.path.exists(MODEL_PATH):
    wget.download(model_url, MODEL_PATH)

# Sample rate the acoustic model expects its input audio to be at.
SAMPLE_RATE = 16000

# Load the pre-trained model once at startup and switch it to inference mode.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from(MODEL_PATH)
model.change_decoding_strategy(None)  # keep the model's default decoding config
model.eval()
def process_audio_file(file_path):
    """Load an audio file as a mono waveform at SAMPLE_RATE.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    numpy.ndarray
        1-D float mono waveform resampled to SAMPLE_RATE (16 kHz).
    """
    # librosa.load resamples and downmixes to mono in a single call. The
    # original loaded at librosa's default 22050 Hz and then called
    # librosa.resample(data, sr, SAMPLE_RATE) with positional rates, which
    # raises TypeError on librosa >= 0.10 where orig_sr/target_sr are
    # keyword-only.
    data, _sr = librosa.load(file_path, sr=SAMPLE_RATE, mono=True)
    return data
def transcribe(microphone_audio, uploaded_audio):
    """Transcribe speech from a microphone recording or an uploaded file.

    Exactly one source is transcribed. When both are supplied the microphone
    recording wins and a warning is prepended to the returned transcript;
    when neither is supplied an error string is returned instead.
    """
    # Guard: nothing to transcribe at all.
    if not microphone_audio and not uploaded_audio:
        return "ERROR: You have to either use the microphone or upload an audio file"

    # Resolve which input to use (microphone takes precedence).
    warning = ""
    if microphone_audio and uploaded_audio:
        warning = ("WARNING: You've uploaded an audio file and used the microphone. "
                   "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n")
    source_path = microphone_audio if microphone_audio else uploaded_audio

    # Normalize the chosen file to a 16 kHz mono waveform.
    samples = process_audio_file(source_path)

    with tempfile.TemporaryDirectory() as scratch_dir:
        # NeMo's transcribe API takes file paths, so round-trip through a
        # uniquely named wav inside a throwaway directory.
        wav_path = os.path.join(scratch_dir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, samples, SAMPLE_RATE)
        hypotheses = model.transcribe([wav_path])

    # RNNT models return (best_hypotheses, all_hypotheses); keep the best.
    if isinstance(hypotheses, tuple) and len(hypotheses) == 2:
        hypotheses = hypotheses[0]
    return warning + hypotheses[0]
# Build and launch the Gradio UI. The original labelled this demo "English",
# but the checkpoint downloaded above is the Vakyansh *Hindi* Conformer, so
# the title/description are corrected to match. gr.Audio replaces the
# deprecated gr.inputs.Audio; Gradio 3.x inputs are optional by default, so
# the old optional=True flag is no longer needed.
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
        gr.Audio(source="upload", type="filepath", label="Audio file"),
    ],
    outputs="text",
    title="NeMo Conformer Transducer Large - Hindi",
    description="Demo for Hindi speech recognition using Conformer Transducers",
    allow_flagging='never',
)
iface.launch(enable_queue=True)