|
import os

import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
from pydub import AudioSegment
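# Requires: nemo_toolkit[asr], gradio, pandas, pydub (pydub also needs ffmpeg
# on the PATH to decode non-WAV uploads).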
|
|
|
# Pretrained Kinyarwanda Conformer-CTC model from NVIDIA NeMo.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_rw_conformer_ctc_large")

# Map each source word (column 0) to its replacement (column 1), lowercased.
df = pd.read_csv("amasaku_data.tsv", sep="\t")
amasaku_mapping = {str(key).lower(): str(val).lower() for key, val in zip(df.iloc[:, 0], df.iloc[:, 1])}
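# For example, a hypothetical TSV row "foo<TAB>bar" would rewrite every
# transcribed "foo" as "bar" in the output below.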
|
|
|
def transcribe(file):
    if not file:
        return "No audio file provided."

    # The NeMo model expects 16 kHz mono audio, so convert the input to a
    # WAV file in that format before transcribing.
    wav_path = os.path.splitext(file)[0] + ".wav"
    try:
        audio = AudioSegment.from_file(file)
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)
        audio.export(wav_path, format="wav")
    except Exception as e:
        return f"Could not process audio file: {e}"

    # Transcribe the converted file, not the original upload.
    transcription = asr_model.transcribe([wav_path])

    # Replace each transcribed word with its amasaku equivalent, if any.
    words = transcription[0].lower().split()
    transcribed_with_amasaku = " ".join(amasaku_mapping.get(word, word) for word in words)
    return transcribed_with_amasaku.capitalize()
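# Usage sketch (hypothetical path): transcribe("clip.mp3") writes clip.wav at
# 16 kHz mono and returns the mapped, capitalized transcription.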
|
|
|
|
|
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            microphone = gr.Audio(sources="microphone", type="filepath", label="Microphone")
            uploaded_audio = gr.Audio(sources="upload", type="filepath", label="Upload Audio File")
        with gr.Column():
            transcription = gr.Textbox(type="text", label="Transcription")
    with gr.Row():
        transcribe_button = gr.Button("Transcribe")
|
|
|
    def handle_audio(microphone, uploaded_audio):
        # Prefer the microphone recording when both inputs are given.
        if microphone:
            return transcribe(microphone)
        elif uploaded_audio:
            return transcribe(uploaded_audio)
        return "No audio file provided."

    transcribe_button.click(handle_audio, [microphone, uploaded_audio], transcription)
|
|
|
|
|
demo.launch() |
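# launch() serves the app locally by default; launch(share=True) additionally
# creates a temporary public Gradio link.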