File size: 2,472 Bytes
580817b
 
f1ccfa6
681b48e
5f55f11
580817b
3d4d376
919ca0a
54e1781
88c40a0
3d4d376
49ec72e
580817b
fbab64a
 
3d4d376
681b48e
49ec72e
 
1857e8c
 
 
 
 
49ec72e
1857e8c
 
 
 
681b48e
00eb299
52c6777
 
 
 
00eb299
681b48e
 
bd275e2
c73c4f7
130a481
c73c4f7
1686f52
e505cae
8fb298d
580817b
 
 
c761a0e
580817b
c4af624
 
02c7af3
49ec72e
 
73486fc
e06917c
c4af624
1544677
580817b
 
1857e8c
 
 
 
 
 
 
 
580817b
 
1857e8c
02c7af3
1544677
580817b
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import nemo.collections.asr as nemo_asr
import gradio as gr
import pandas as pd 
from pydub import AudioSegment 
import os 

# Load a pretrained Kinyarwanda Conformer-CTC model (downloads on first run).
asr_model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="stt_rw_conformer_ctc_large")
# Two-column TSV: source word -> replacement ("amasaku") word.
df = pd.read_csv("amasaku_data.tsv",sep='\t')

# Lower-cased lookup table applied word-by-word to the transcription.
# NOTE(review): assumes column 0 is the key and column 1 the value — confirm against the TSV header.
amasaku_mapping = {str(key).lower():str(val).lower() for key,val in zip(df.iloc[:,0],df.iloc[:,1])}

def transcribe(file):
    """Transcribe an audio file and apply the amasaku word mapping.

    Args:
        file: Path to an audio file (any format pydub/ffmpeg can read),
            or a falsy value when no audio was provided.

    Returns:
        The capitalized transcription with each word replaced via
        ``amasaku_mapping`` where a mapping exists, or an error message
        when no file was given.
    """
    # Guard first: the previous version only bound file_name inside
    # `if file:`, so a falsy input raised UnboundLocalError below.
    if not file:
        return "No audio file provided."
    file_name = file

    # The CTC model expects 16 kHz mono; convert best-effort and fall
    # back to the original file if conversion fails (e.g. missing ffmpeg).
    try:
        audio = AudioSegment.from_file(file_name)
        if audio.frame_rate != 16000:
            audio = audio.set_frame_rate(16000)
        if audio.channels != 1:
            audio = audio.set_channels(1)
        # splitext is safe for paths with dots in directory names,
        # unlike split(".")[0].
        wav_path = os.path.splitext(file_name)[0] + ".wav"
        audio.export(wav_path, format="wav")
        # Transcribe the CONVERTED file — previously the original path
        # was transcribed and the conversion was silently discarded.
        file_name = wav_path
    except Exception as e:
        print(e)

    transcription = asr_model.transcribe([file_name])
    words = transcription[0].lower().split()
    # Substitute each word through the amasaku lookup, keeping unknown
    # words unchanged.
    mapped = " ".join(amasaku_mapping.get(word, word) for word in words)
    return mapped.capitalize()


with gr.Blocks() as demo:
    # Left column: two audio inputs (microphone takes priority);
    # right column: the transcription output.
    with gr.Row():
        with gr.Column():
            microphone = gr.Audio(sources="microphone", type="filepath", label="Microphone")
            uploaded_audio = gr.Audio(label="Upload Audio File", type="filepath", sources="upload")
        with gr.Column():
            transcription = gr.Textbox(type="text", label="Transcription")
    with gr.Row():
        transcribe_button = gr.Button("Transcribe")

    def handle_audio(microphone, uploaded_audio):
        """Route whichever audio source was provided to transcribe().

        The microphone recording wins when both inputs are present.
        """
        if microphone:
            return transcribe(microphone)
        elif uploaded_audio:
            return transcribe(uploaded_audio)
        else:
            return "No audio file provided."

    transcribe_button.click(
        handle_audio,
        [microphone, uploaded_audio],
        transcription,
    )


demo.launch()