import os

# Force the Gradio version this Space was written against; the interface below uses the Gradio 3.x API
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.5")
import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
import wget
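# The acoustic model is Vakyansh's Hindi Conformer checkpoint, hosted on Google Cloud Storage.
# It is downloaded into the working directory and restored with NeMo below.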
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'

# Download the checkpoint only if it is not already present; wget.download would otherwise
# save a duplicate copy on every restart of the Space
if not os.path.exists("./Conformer-CTC-BPE-Large.nemo"):
    wget.download(model_url)
SAMPLE_RATE = 16000
# Load the pre-trained model. The .nemo filename indicates a Conformer CTC-BPE checkpoint,
# so the matching CTC class is used to restore it; default (greedy) CTC decoding is kept.
model = nemo_asr.models.EncDecCTCModelBPE.restore_from("./Conformer-CTC-BPE-Large.nemo")
model.eval()
def process_audio_file(file_path):
    # Load the audio at its native sample rate (librosa returns float32, mono by default)
    data, sr = librosa.load(file_path, sr=None)
    # Resample if necessary (librosa >= 0.10 requires keyword arguments here)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # Ensure a single (mono) channel
    data = librosa.to_mono(data)
    return data
def transcribe(microphone_audio, uploaded_audio):
    """Gradio callback: takes the microphone and upload inputs (file paths or None) and returns text."""
    # Warn when both inputs are provided; the microphone recording takes precedence
    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You have both recorded with the microphone and uploaded an audio file. "
                       "The microphone recording will be used and the uploaded file will be discarded.\n")
        audio_file = microphone_audio
    elif not microphone_audio and not uploaded_audio:
        return "ERROR: Please either record with the microphone or upload an audio file."
    elif microphone_audio:
        audio_file = microphone_audio
    else:
        audio_file = uploaded_audio

    # Resample/convert the input to 16 kHz mono
    audio_data = process_audio_file(audio_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Save the processed audio to a temporary WAV file for NeMo's file-based transcribe API
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)

        # Transcribe audio
        transcriptions = model.transcribe([audio_path])

    # RNNT models return a (best_hypotheses, all_hypotheses) tuple; CTC models return a plain list
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    return warn_output + transcriptions[0]
iface = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(source="microphone", type="filepath", label="Record with microphone"),
            gr.Audio(source="upload", type="filepath", label="Upload an audio file")],
    outputs="text",
    title="NeMo Conformer CTC Large - Hindi",
    description="Demo for Hindi speech recognition using a Conformer CTC model",
    allow_flagging="never",
)

iface.launch(enable_queue=True)
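# Optional local sanity check (a minimal sketch, assuming a 16 kHz Hindi speech clip on disk;
# "sample_hi.wav" below is a hypothetical file name, not part of this Space):
#
#     print(transcribe(None, "sample_hi.wav"))
#
# This calls the Gradio callback directly, which is handy for debugging the NeMo model
# without launching the web server.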