File size: 2,110 Bytes
580817b
 
f1ccfa6
681b48e
5f55f11
580817b
3d4d376
919ca0a
54e1781
88c40a0
3d4d376
a719fc8
580817b
fbab64a
 
3d4d376
681b48e
1440c86
 
a719fc8
00eb299
 
 
681b48e
00eb299
52c6777
 
 
 
00eb299
681b48e
 
bd275e2
c73c4f7
130a481
c73c4f7
1686f52
e505cae
8fb298d
580817b
 
 
c761a0e
580817b
c4af624
 
73486fc
e06917c
5f55f11
73486fc
e06917c
c4af624
1544677
580817b
 
 
 
c4af624
a719fc8
1544677
580817b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import nemo.collections.asr as nemo_asr
import gradio as gr
import pandas as pd 
from pydub import AudioSegment 
import os 

# Load the pretrained Kinyarwanda Conformer-CTC speech-to-text model from NGC.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_rw_conformer_ctc_large")
# Tab-separated lookup table; presumably column 0 = source word and
# column 1 = replacement word — TODO confirm against amasaku_data.tsv.
df = pd.read_csv("amasaku_data.tsv",sep='\t')

# Lower-cased word -> lower-cased replacement, used to post-process transcripts.
amasaku_mapping = {str(key).lower():str(val).lower() for key,val in zip(df.iloc[:,0],df.iloc[:,1])}

def transcribe(microphone, file):
    """Transcribe an audio clip and substitute words via the amasaku mapping.

    Parameters:
        microphone: filepath of a microphone recording, or None. Takes
            precedence over `file` when both are given.
        file: filepath of an uploaded audio file, or None.

    Returns:
        The capitalized transcription with each word replaced by its
        `amasaku_mapping` equivalent when one exists, or a short message
        when no audio was provided.
    """
    print("microphone: ", microphone)
    print("uploaded file: ", file)
    # Microphone input takes precedence over the uploaded file.
    file_name = microphone or file
    if not file_name:
        # Bug fix: the original left `file_name` unbound (NameError) when
        # neither input was provided.
        return "No audio provided."
    # os.path.splitext is robust to dots elsewhere in the path, unlike the
    # original `file_name.split(".")[0]`.
    wav_path = os.path.splitext(file_name)[0] + ".wav"
    try:
        # Normalize to 16 kHz mono WAV before feeding the CTC model.
        audio = AudioSegment.from_file(file_name)
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)
        audio.export(wav_path, format="wav")
    except Exception as e:
        # Best-effort conversion (as in the original): on failure, fall back
        # to transcribing the untouched input file.
        print(e)
        wav_path = file_name
    # Bug fix: transcribe the converted file that was actually selected.
    # The original passed `file`, which is None when only the microphone
    # was used, and ignored the 16 kHz/mono conversion above.
    transcription = asr_model.transcribe([wav_path])
    words = transcription[0].lower().split()
    # Replace each recognized word with its amasaku equivalent when present.
    transcribed_with_amasaku = " ".join(amasaku_mapping.get(w, w) for w in words)
    return transcribed_with_amasaku.capitalize()


with gr.Blocks() as demo:
    # Layout: audio inputs in the left column, transcript output on the right,
    # and the action button in its own row underneath.
    with gr.Row():
        with gr.Column():
            mic_input = gr.Audio(sources="microphone", type="filepath", label="Microphone")
            print("microphone source: ", mic_input)
            print(os.listdir("/tmp"))
            file_input = gr.Audio(label="Upload Audio File", type="filepath", sources="upload")
            print('upload path: ', file_input)
        with gr.Column():
            output_box = gr.Textbox(type="text", label="Transcription")
    with gr.Row():
        run_button = gr.Button("Transcribe")

    # Wire the button to the transcriber: both audio paths in, text out.
    run_button.click(transcribe, [mic_input, file_input], output_box)


demo.launch()