import os

import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
from pydub import AudioSegment
# Pretrained Conformer-CTC ASR model (NVIDIA NeMo); "stt_rw_..." suggests
# Kinyarwanda speech-to-text — downloaded/cached at import time.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_rw_conformer_ctc_large")
# Word-substitution table built from the two first columns of the TSV,
# both sides lower-cased: column 0 -> column 1.
# NOTE(review): presumably maps raw ASR output words to preferred "amasaku"
# spellings — confirm against the data file.
df = pd.read_csv("amasaku_data.tsv",sep='\t')
amasaku_mapping = {str(key).lower():str(val).lower() for key,val in zip(df.iloc[:,0],df.iloc[:,1])}
def transcribe(file):
    """Transcribe an uploaded audio file and apply the amasaku word mapping.

    Args:
        file: Filesystem path to the uploaded audio (any format pydub/ffmpeg
            can read), as provided by ``gr.Audio(type="filepath")``.

    Returns:
        The capitalized transcription with each word replaced by its
        ``amasaku_mapping`` equivalent when one exists, or an error
        message string if audio conversion fails.
    """
    try:
        # Normalize to 16 kHz mono — the input format the ASR model expects.
        audio = AudioSegment.from_file(file).set_frame_rate(16000).set_channels(1)
        # splitext handles dots elsewhere in the path, unlike file.split(".")[0].
        new_file_name = os.path.splitext(file)[0] + ".wav"
        # BUG FIX: the original passed the *builtin* `format` function as a
        # positional argument; pydub needs the output format as a keyword.
        audio.export(new_file_name, format="wav")
    except Exception as e:
        # Report the failure to the UI instead of falling through and
        # crashing on an undefined `new_file_name` (original behavior).
        print(e)
        return f"Could not process audio file: {e}"
    transcription = asr_model.transcribe([new_file_name])
    words = transcription[0].lower().split()
    # Substitute each word with its amasaku equivalent when one exists.
    with_amasaku = " ".join(amasaku_mapping.get(word, word) for word in words)
    return with_amasaku.capitalize()
# --- Gradio UI ---------------------------------------------------------
# One row with the audio upload on the left and the transcription output
# on the right; a second row holds the button that runs `transcribe`.
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            uploaded_audio = gr.Audio(label="Upload Audio File", type="filepath")
        with gr.Column():
            transcription = gr.Textbox(type="text", label="Transcription")
    with gr.Row():
        transcribe_button = gr.Button("Transcribe")
    transcribe_button.click(
        fn=transcribe,
        inputs=[uploaded_audio],
        outputs=transcription,
    )

demo.launch()