import os

# Force the Gradio version this Space was written against; the interface below uses the Gradio 3.x API
os.system("pip uninstall -y gradio")
os.system("pip install gradio==3.5")
import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
import wget
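# The acoustic model is Vakyansh's Hindi Conformer checkpoint, hosted on Google Cloud Storage.
# It is downloaded into the working directory and restored with NeMo below.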
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'

# Download the checkpoint only if it is not already present; wget.download would otherwise
# save a duplicate copy on every restart of the Space
if not os.path.exists("./Conformer-CTC-BPE-Large.nemo"):
    wget.download(model_url)
SAMPLE_RATE = 16000
# Load the pre-trained model. The .nemo filename indicates a Conformer CTC-BPE checkpoint,
# so the matching CTC class is used to restore it; default (greedy) CTC decoding is kept.
model = nemo_asr.models.EncDecCTCModelBPE.restore_from("./Conformer-CTC-BPE-Large.nemo")
model.eval()
def process_audio_file(file_path):
    # Load the audio at its native sample rate (librosa returns float32, mono by default)
    data, sr = librosa.load(file_path, sr=None)
    # Resample if necessary (librosa >= 0.10 requires keyword arguments here)
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)
    # Ensure a single (mono) channel
    data = librosa.to_mono(data)
    return data
def transcribe(microphone_audio, uploaded_audio):
    """Gradio callback: takes the microphone and upload inputs (file paths or None) and returns text."""
    # Warn when both inputs are provided; the microphone recording takes precedence
    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You have both recorded with the microphone and uploaded an audio file. "
                       "The microphone recording will be used and the uploaded file will be discarded.\n")
        audio_file = microphone_audio
    elif not microphone_audio and not uploaded_audio:
        return "ERROR: Please either record with the microphone or upload an audio file."
    elif microphone_audio:
        audio_file = microphone_audio
    else:
        audio_file = uploaded_audio

    # Resample/convert the input to 16 kHz mono
    audio_data = process_audio_file(audio_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Save the processed audio to a temporary WAV file for NeMo's file-based transcribe API
        audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(audio_path, audio_data, SAMPLE_RATE)

        # Transcribe audio
        transcriptions = model.transcribe([audio_path])

    # RNNT models return a (best_hypotheses, all_hypotheses) tuple; CTC models return a plain list
    if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
        transcriptions = transcriptions[0]

    return warn_output + transcriptions[0]
iface = gr.Interface(
    fn=transcribe,
    inputs=[gr.Audio(source="microphone", type="filepath", label="Record with microphone"),
            gr.Audio(source="upload", type="filepath", label="Upload an audio file")],
    outputs="text",
    title="NeMo Conformer CTC Large - Hindi",
    description="Demo for Hindi speech recognition using a Conformer CTC model",
    allow_flagging="never",
)

iface.launch(enable_queue=True)
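# Optional local sanity check (a minimal sketch, assuming a 16 kHz Hindi speech clip on disk;
# "sample_hi.wav" below is a hypothetical file name, not part of this Space):
#
#     print(transcribe(None, "sample_hi.wav"))
#
# This calls the Gradio callback directly, which is handy for debugging the NeMo model
# without launching the web server.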