File size: 2,025 Bytes
580817b
 
f1ccfa6
681b48e
580817b
3d4d376
919ca0a
54e1781
88c40a0
3d4d376
a719fc8
580817b
fbab64a
 
3d4d376
681b48e
1440c86
 
a719fc8
 
681b48e
52c6777
 
 
 
 
8feb0c9
 
681b48e
 
bd275e2
c73c4f7
130a481
c73c4f7
1686f52
e505cae
8fb298d
580817b
 
 
c761a0e
580817b
c4af624
 
73486fc
e06917c
73486fc
e06917c
c4af624
1544677
580817b
 
 
 
c4af624
a719fc8
1544677
580817b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os

import gradio as gr
import nemo.collections.asr as nemo_asr
import pandas as pd
from pydub import AudioSegment

# Load the pretrained Kinyarwanda Conformer-CTC model once at startup.
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_rw_conformer_ctc_large")

# Word-replacement table: TSV whose first column is the word as transcribed
# and whose second column is the replacement to substitute. Both sides are
# lower-cased so lookups match the lower-cased transcription.
df = pd.read_csv("amasaku_data.tsv", sep='\t')
amasaku_mapping = {
    str(src).lower(): str(dst).lower()
    for src, dst in zip(df.iloc[:, 0], df.iloc[:, 1])
}

def transcribe(microphone, file):
    """Transcribe an audio file and apply the amasaku word mapping.

    Args:
        microphone: Filepath of a microphone recording. Takes precedence
            over ``file`` when both are provided.
        file: Filepath of an uploaded audio file.

    Returns:
        The transcription as a capitalized string, with every word replaced
        by its ``amasaku_mapping`` entry when one exists. Returns an empty
        string when no audio input was given.
    """
    if microphone:
        file = microphone
    if not file:
        # Neither input was provided — nothing to transcribe.
        return ""
    try:
        # Convert to the 16 kHz mono WAV the rest of the pipeline uses.
        # Best-effort: on failure we fall through and let the model try
        # the original file, preserving the original behavior.
        audio = AudioSegment.from_file(file)
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)
        # splitext keeps dots in directory names intact, unlike
        # file.split(".")[0] which truncates at the first dot anywhere.
        file = os.path.splitext(file)[0] + ".wav"
        audio.export(file, format="wav")
    except Exception as e:
        print("audio conversion failed:", e)
    # NOTE(review): assumes transcribe() returns a list of plain strings —
    # confirm against the installed NeMo version.
    transcription = asr_model.transcribe([file])
    words = transcription[0].lower().split()
    mapped = [amasaku_mapping.get(word, word) for word in words]
    return " ".join(mapped).capitalize()


# Build the Gradio UI: audio inputs (microphone + upload) on the left,
# the transcription textbox on the right, and one button wiring both
# inputs into transcribe().
with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            microphone = gr.Audio(sources="microphone", type="filepath", label="Microphone")
            uploaded_audio = gr.Audio(label="Upload Audio File", type="filepath", sources="upload")
        with gr.Column():
            transcription = gr.Textbox(type="text", label="Transcription")
    with gr.Row():
        transcribe_button = gr.Button("Transcribe")

    transcribe_button.click(
        transcribe,
        inputs=[microphone, uploaded_audio],
        outputs=transcription,
    )

demo.launch()