import os

# Workaround for Hugging Face Spaces: force the Gradio version this script targets
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.5")
import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import uuid
import wget
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
model_path = './Conformer-CTC-BPE-Large.nemo'
# Download the checkpoint only once; wget.download would otherwise write a
# duplicate copy (e.g. "Conformer-CTC-BPE-Large (1).nemo") on every restart
if not os.path.exists(model_path):
    wget.download(model_url, model_path)
SAMPLE_RATE = 16000
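# The Conformer checkpoint expects 16 kHz mono input; all audio below is
# resampled to this rate before transcription.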
# Load the pre-trained Hindi Conformer-CTC model.
# Note: the checkpoint name indicates a CTC model, so it is restored with the
# CTC class (EncDecCTCModelBPE); the RNNT class cannot load a CTC checkpoint.
model = nemo_asr.models.EncDecCTCModelBPE.restore_from(model_path)
model.change_decoding_strategy(None)  # reset to the default decoding config
model.eval()  # inference mode
def process_audio_file(file_path):
    # Load audio at its native sampling rate (librosa returns float32, mono by default)
    data, sr = librosa.load(file_path, sr=None)
    # Resample to the model's expected rate if necessary
    # (keyword arguments are required in librosa >= 0.10)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # Ensure a single mono channel
    data = librosa.to_mono(data)
    return data
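# process_audio_file yields a 1-D float32 array at SAMPLE_RATE, ready for soundfile.write.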
def transcribe(microphone_audio, uploaded_audio):
    # Warn if both inputs were provided; the microphone recording takes priority
    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You've uploaded an audio file and used the microphone. "
                       "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n")
        audio_file = microphone_audio
    elif not microphone_audio and not uploaded_audio:
        return "ERROR: You have to either use the microphone or upload an audio file."
    elif microphone_audio:
        audio_file = microphone_audio
    else:
        audio_file = uploaded_audio
    # Normalize the input to 16 kHz mono
    audio_data = process_audio_file(audio_file)
    with tempfile.TemporaryDirectory() as tmpdir:
        # Save the normalized audio to a temporary file for NeMo
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)
        # Transcribe audio
        transcriptions = model.transcribe([audio_path])
        # Some NeMo versions return a (best_hypotheses, all_hypotheses) tuple;
        # keep only the best hypotheses in that case
        if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
            transcriptions = transcriptions[0]
    return warn_output + transcriptions[0]
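# Example call outside the Gradio UI (hypothetical path, for illustration only):
#   print(transcribe("/path/to/recording.wav", None))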
iface = gr.Interface(
    fn=transcribe,
    inputs=[
        gr.Audio(source="microphone", type="filepath", label="Microphone"),
        gr.Audio(source="upload", type="filepath", label="Upload audio file"),
    ],
    outputs="text",
    title="NeMo Conformer CTC Large - Hindi",
    description="Demo for Hindi speech recognition using a Conformer-CTC model",
    allow_flagging="never",
)
iface.launch(enable_queue=True)