File size: 4,648 Bytes
653cae7
 
 
d0f88d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
647230d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
710b824
647230d
 
 
 
 
 
d0f88d8
647230d
d0f88d8
 
 
647230d
 
 
 
 
 
d0f88d8
647230d
d0f88d8
 
 
 
 
67cc85c
d0f88d8
 
 
 
 
 
 
 
 
 
647230d
9f1b9c0
 
 
d0f88d8
647230d
 
 
 
 
 
 
 
 
 
 
 
 
d0f88d8
eed141f
d0f88d8
4fd2e4a
647230d
 
 
 
d0f88d8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# Force-downgrade Gradio before it is imported: the rest of this script uses
# the gradio 3.x API (gr.inputs.Audio, source=, optional=, enable_queue=),
# which was removed in later major versions. This runs at import time as a
# deliberate environment side effect (Hugging Face Spaces startup hack).
import os
import subprocess
import sys

# Use this interpreter's own pip (`sys.executable -m pip`) rather than a bare
# `pip` on PATH, which may belong to a different Python installation.
# check=False: best effort, same as the original os.system() calls which
# ignored the exit status.
subprocess.run([sys.executable, "-m", "pip", "uninstall", "-y", "gradio"], check=False)
subprocess.run([sys.executable, "-m", "pip", "install", "gradio==3.5"], check=False)
# import gradio as gr
# import torch
# import librosa
# import soundfile
# import nemo.collections.asr as nemo_asr
# import tempfile
# import os
# import uuid
# import wget
# model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
# wget.download(model_url)

# SAMPLE_RATE = 16000

# # Load pre-trained model
# model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("./Conformer-CTC-BPE-Large.nemo")
# model.change_decoding_strategy(None)
# model.eval()


# def process_audio_file(file_path):
#     print(file_path)
#     print(SAMPLE_RATE)
#     # Load audio file
#     data, sr = librosa.load(file_path, sr=SAMPLE_RATE)

#     # # Resample if necessary
#     # if sr != SAMPLE_RATE:
#     #     data = librosa.resample(data, sr, SAMPLE_RATE)

#     # Convert to mono channel
#     data = librosa.to_mono(data)
#     return data


# def transcribe(audio):
#     # Handle warning message
    
#     # Process audio file
#     sr, data = audio

#     if sr != SAMPLE_RATE:
#         data = librosa.resample(data, sr, SAMPLE_RATE)

#     with tempfile.TemporaryDirectory() as tmpdir:
#         # Save audio data to a temporary file
#         audio_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
#         soundfile.write(audio_path, data, SAMPLE_RATE)

#         # Transcribe audio
#         transcriptions = model.transcribe([audio_path])

#         # Extract best hypothesis if transcriptions form a tuple (from RNNT)
#         if isinstance(transcriptions, tuple) and len(transcriptions) == 2:
#             transcriptions = transcriptions[0]

#     return warn_output + transcriptions[0]


# iface = gr.Interface(
#     fn=transcribe,
#     inputs=gr.Audio(sources=["microphone"]),
#     outputs="textbox",
    
    
#     title="NeMo Conformer Transducer Large - English",
#     description="Demo for English speech recognition using Conformer Transducers",
#     allow_flagging='never',
# )
# iface.queue(max_size=10)
# iface.launch()




import gradio as gr
import torch
import librosa
import soundfile
import nemo.collections.asr as nemo_asr
import tempfile
import os
import uuid
import wget

# Pre-trained Hindi Conformer checkpoint (Vakyansh open models).
model_url = 'https://storage.googleapis.com/vakyansh-open-models/conformer_models/hindi/filtered_v1_ssl_2022-07-08_19-43-25/Conformer-CTC-BPE-Large.nemo'
# Download once. Unconditional wget.download() re-fetches the large checkpoint
# on every launch and, when the file already exists, saves a duplicate with a
# " (1)" suffix — which restore_from() below would never pick up.
if not os.path.exists('./Conformer-CTC-BPE-Large.nemo'):
    wget.download(model_url)

# Sample rate (Hz) the audio is resampled to before transcription.
SAMPLE_RATE = 16000

# Load pre-trained model
# NOTE(review): the checkpoint filename says "CTC-BPE" but it is restored via
# the RNNT (transducer) model class — presumably the checkpoint really is an
# RNNT model; confirm, since transcribe() relies on the RNNT-style tuple output.
model = nemo_asr.models.EncDecRNNTBPEModel.restore_from("./Conformer-CTC-BPE-Large.nemo")
model.change_decoding_strategy(None)  # keep the decoding strategy baked into the checkpoint
model.eval()  # inference mode (disables dropout etc.)


def process_audio_file(file_path):
    """Load an audio file as a mono waveform at ``SAMPLE_RATE``.

    Parameters
    ----------
    file_path : str
        Path to an audio file readable by librosa.

    Returns
    -------
    numpy.ndarray
        1-D float mono signal resampled to ``SAMPLE_RATE``.
    """
    # Load audio file (librosa's default target rate, 22050 Hz, applies here;
    # the explicit resample below brings it to SAMPLE_RATE).
    data, sr = librosa.load(file_path)

    # Resample if necessary. orig_sr/target_sr must be passed by keyword:
    # the positional form (data, sr, SAMPLE_RATE) was removed in librosa 0.10
    # and raises a TypeError there.
    if sr != SAMPLE_RATE:
        data = librosa.resample(data, orig_sr=sr, target_sr=SAMPLE_RATE)

    # Convert to mono channel
    data = librosa.to_mono(data)
    return data


def transcribe(microphone_audio, uploaded_audio):
    """Transcribe speech from a microphone recording or an uploaded file.

    Both arguments are file paths (or a falsy value when the input was not
    provided). The microphone recording takes precedence when both are given,
    in which case a warning is prepended to the returned transcript.

    Returns the transcript string, or an error message when neither input
    was supplied.
    """
    # Guard clause: nothing to transcribe.
    if not microphone_audio and not uploaded_audio:
        return "ERROR: You have to either use the microphone or upload an audio file"

    warn_output = ""
    if microphone_audio and uploaded_audio:
        warn_output = ("WARNING: You've uploaded an audio file and used the microphone. "
                       "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n")

    # Microphone wins over the upload whenever it is present.
    chosen_file = microphone_audio or uploaded_audio

    # Normalize to a mono waveform at the model's sample rate.
    waveform = process_audio_file(chosen_file)

    with tempfile.TemporaryDirectory() as tmpdir:
        # Write the processed audio to a uniquely-named temporary WAV file.
        wav_path = os.path.join(tmpdir, f'audio_{uuid.uuid4()}.wav')
        soundfile.write(wav_path, waveform, SAMPLE_RATE)

        results = model.transcribe([wav_path])

        # RNNT models return a (best_hypotheses, all_hypotheses) pair;
        # keep only the best hypotheses.
        if isinstance(results, tuple) and len(results) == 2:
            results = results[0]

    return warn_output + results[0]


# Build the web UI: two optional audio inputs (microphone + file upload);
# transcribe() arbitrates when both are provided. The gr.inputs.* classes and
# the source=/optional= keywords are the gradio 3.x API, matching the
# gradio==3.5 pin installed at the top of this script.
# NOTE(review): title/description say "English" while the downloaded checkpoint
# URL is a Hindi Conformer model — confirm which is intended.
iface = gr.Interface(
    fn=transcribe,
    inputs=[gr.inputs.Audio(source="microphone", type='filepath', optional=True),
        gr.inputs.Audio(source="upload", type='filepath', optional=True)],
    outputs="text",
    
    title="NeMo Conformer Transducer Large - English",
    description="Demo for English speech recognition using Conformer Transducers",
    allow_flagging='never',
)
# enable_queue=True is the gradio 3.x way to serialize requests.
iface.launch(enable_queue=True)