import os

import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
from pydub import AudioSegment

# Load the pretrained Kinyarwanda Conformer-CTC model.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(
    model_name="stt_rw_conformer_ctc_large"
)

# Build a lowercase word -> replacement mapping from the amasaku TSV
# (first column: source word, second column: replacement).
df = pd.read_csv("amasaku_data.tsv", sep="\t")
amasaku_mapping = {
    str(key).lower(): str(val).lower()
    for key, val in zip(df.iloc[:, 0], df.iloc[:, 1])
}


def transcribe(microphone, file):
    # Prefer the microphone recording; fall back to the uploaded file.
    file_name = microphone or file
    if not file_name:
        return ""

    # The model expects 16 kHz mono audio, so resample/downmix if needed
    # and transcribe the converted WAV rather than the original file.
    wav_path = file_name
    try:
        audio = AudioSegment.from_file(file_name)
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)
        wav_path = os.path.splitext(file_name)[0] + ".wav"
        audio.export(wav_path, format="wav")
    except Exception as e:
        print(e)

    transcription = asr_model.transcribe([wav_path])
    words = transcription[0].lower().split()
    # Replace any word that has an entry in the amasaku mapping.
    transcribed_with_amasaku = " ".join(
        amasaku_mapping.get(word, word) for word in words
    )
    return transcribed_with_amasaku.capitalize()


with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            microphone = gr.Audio(
                sources=["microphone"], type="filepath", label="Microphone"
            )
            uploaded_audio = gr.Audio(
                sources=["upload"], type="filepath", label="Upload Audio File"
            )
        with gr.Column():
            transcription = gr.Textbox(type="text", label="Transcription")
    with gr.Row():
        transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(transcribe, [microphone, uploaded_audio], transcription)

demo.launch()
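
# A minimal sketch of exercising transcribe() outside the UI, kept as
# comments so the script's behavior is unchanged. "sample.wav" is a
# hypothetical 16 kHz mono recording on disk, not a file shipped with
# this repo:
#
#   print(transcribe(None, "sample.wav"))
#
# Assumed dependencies: nemo_toolkit[asr], gradio, pandas, pydub, plus
# an ffmpeg binary on PATH so pydub can decode non-WAV uploads.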