File size: 5,242 Bytes
2c3f8ff
 
 
 
 
 
478bedf
2c3f8ff
478bedf
 
2904d5d
478bedf
 
649f719
 
 
 
 
a7bf230
649f719
478bedf
 
 
2c3f8ff
478bedf
2c3f8ff
 
 
649f719
 
 
 
 
a7bf230
478bedf
 
 
 
 
 
 
2c3f8ff
2904d5d
 
 
 
 
a7bf230
2904d5d
 
 
 
 
 
 
 
 
478bedf
 
 
2c3f8ff
dec6760
2c3f8ff
 
 
 
 
 
e60cafb
478bedf
 
 
 
 
dec6760
478bedf
 
 
 
 
 
dec6760
478bedf
 
2904d5d
 
 
 
dec6760
2904d5d
 
 
 
 
 
dec6760
2904d5d
 
 
478bedf
 
 
 
6a9d77a
dec6760
478bedf
dec6760
e60cafb
478bedf
 
 
 
 
 
dec6760
 
478bedf
dec6760
 
478bedf
 
2904d5d
 
 
 
dec6760
 
2904d5d
dec6760
 
2904d5d
 
478bedf
 
2904d5d
 
 
 
 
 
 
 
478bedf
 
2904d5d
6a9d77a
478bedf
2904d5d
6a9d77a
478bedf
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gradio as gr
from transformers import pipeline
import time

# p = pipeline("automatic-speech-recognition", model="/Users/mkesavan/aidev/speechAI-trials/xlsr-wave2vec/wav2vec2-large-xls-r-300m-tamil-colab/checkpoint-1600")

# ASR pipelines for Tamil, Arabic and English, shared by all transcribe_*
# handlers below. NOTE(review): loading three models at import time is
# memory-heavy — confirm the host has room for all of them.
p_ta = pipeline("automatic-speech-recognition", model="kmknair/wav2vec2-xlsr-tamil")
p_ar = pipeline("automatic-speech-recognition", model="kmknair/wav2vec2-xlsr-arabic")
# BUG FIX: the keyword was misspelled "mdoel", so the intended English model
# was never passed to pipeline() and the task's default model was used
# (or the stray kwarg was misrouted) instead.
p_en = pipeline("automatic-speech-recognition", model="patrickvonplaten/hubert-xlarge-ls960-ft-4-gram")


def transcribe_ta(audio_u, audio_m):
    """Transcribe Tamil speech from an uploaded file and/or a mic recording.

    Args:
        audio_u: filepath of an uploaded audio clip, or None.
        audio_m: filepath of a microphone recording, or None.

    Returns:
        The transcription(s), joined with a newline when both inputs are
        present; empty string when neither is given.
    """
    # Collect only the inputs actually provided. This fixes the old behavior
    # of unconditionally prefixing "\n" to the mic transcription, which left
    # a spurious leading newline when only the microphone clip was supplied.
    parts = [p_ta(a)["text"] for a in (audio_u, audio_m) if a is not None]
    return "\n".join(parts)

def transcribe_ta_stream(audio, state=""):
    """Streaming Tamil transcription: append each chunk's text to *state*.

    Returns the accumulated transcript twice — once for the textbox output
    and once as the new session state.
    """
    # Throttle so the model isn't invoked on every tiny audio chunk.
    time.sleep(2)
    chunk_text = p_ta(audio)["text"]
    accumulated = state + chunk_text + " "
    return accumulated, accumulated

def transcribe_ar(audio_u, audio_m):
    """Transcribe Arabic speech from an uploaded file and/or a mic recording.

    Args:
        audio_u: filepath of an uploaded audio clip, or None.
        audio_m: filepath of a microphone recording, or None.

    Returns:
        The transcription(s), joined with a newline when both inputs are
        present; empty string when neither is given.
    """
    # Collect only the inputs actually provided. This fixes the old behavior
    # of unconditionally prefixing "\n" to the mic transcription, which left
    # a spurious leading newline when only the microphone clip was supplied.
    parts = [p_ar(a)["text"] for a in (audio_u, audio_m) if a is not None]
    return "\n".join(parts)

def transcribe_ar_stream(audio, state=""):
    """Streaming Arabic transcription: append each chunk's text to *state*.

    Returns the accumulated transcript twice — once for the textbox output
    and once as the new session state.
    """
    # Throttle so the model isn't invoked on every tiny audio chunk.
    time.sleep(2)
    chunk_text = p_ar(audio)["text"]
    accumulated = state + chunk_text + " "
    return accumulated, accumulated

def transcribe_en(audio_u, audio_m):
    """Transcribe English speech from an uploaded file and/or a mic recording.

    Args:
        audio_u: filepath of an uploaded audio clip, or None.
        audio_m: filepath of a microphone recording, or None.

    Returns:
        The transcription(s), joined with a newline when both inputs are
        present; empty string when neither is given.
    """
    # Collect only the inputs actually provided. This fixes the old behavior
    # of unconditionally prefixing "\n" to the mic transcription, which left
    # a spurious leading newline when only the microphone clip was supplied.
    parts = [p_en(a)["text"] for a in (audio_u, audio_m) if a is not None]
    return "\n".join(parts)

def transcribe_en_stream(audio, state=""):
    """Streaming English transcription: append each chunk's text to *state*.

    Returns the accumulated transcript twice — once for the textbox output
    and once as the new session state.
    """
    # Throttle so the model isn't invoked on every tiny audio chunk.
    time.sleep(2)
    chunk_text = p_en(audio)["text"]
    accumulated = state + chunk_text + " "
    return accumulated, accumulated


# transcribe Tamil stream
ta_tr_stream_tab = gr.Interface(
    fn=transcribe_ta_stream,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True, label="தமிழ் பேச்சு"),
        "state"
    ],
    outputs=[
    "textbox",
    "state"
    ],
    description="ரெகாட் பட்டண் அமர்தி பேசவும், பேச்சு சொல் பகிர்ப்பு வலது பக்கதில் அச்சிடபடும்",
    live=True)
# Live (streaming) Arabic transcription tab: mic chunks in, running text out.
ar_tr_stream_tab = gr.Interface(
    fn=transcribe_ar_stream,
    live=True,
    description="Click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True, label="Arabic speech"),
        "state",
    ],
    outputs=["textbox", "state"],
)

# Live (streaming) English transcription tab: mic chunks in, running text out.
en_tr_stream_tab = gr.Interface(
    fn=transcribe_en_stream,
    live=True,
    description="Click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True, label="English speech"),
        "state",
    ],
    outputs=["textbox", "state"],
)


# One-shot Tamil transcription tab: file upload and/or full mic recording.
ta_tr_file_tab = gr.Interface(
    fn=transcribe_ta,
    outputs="text",
    description="ஒலி பதிப்பு சமர்ப்பிக்கவும், அல்லது ரெகாட் பட்டண் அமர்தி பேசவும், பேச்சு சொல் பகிர்ப்பு வலது பக்கதில் அச்சிடபடும்",
    inputs=[
        gr.Audio(type="filepath", label="தமிழ் ஒலி பதிப்பு சமர்ப்பித்தல்"),
        gr.Audio(source="microphone", type="filepath", label="தமிழ் பேச்சு"),
    ],
    # Bundled sample clips (upload slot only, mic slot left empty).
    examples=[["samples/ta/32862591.mp3", None], ["samples/ta/32862612.mp3", None]],
)

# One-shot Arabic transcription tab: file upload and/or full mic recording.
ar_tr_file_tab = gr.Interface(
    fn=transcribe_ar,
    outputs="text",
    description="Upload a file or, click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(type="filepath", label="Arabic file upload"),
        gr.Audio(source="microphone", type="filepath", label="Arabic speech"),
    ],
    # Bundled sample clips (upload slot only, mic slot left empty).
    examples=[["samples/ar/19706399.mp3", None], ["samples/ar/19985784.mp3", None]],
)

# One-shot English transcription tab: file upload and/or full mic recording.
en_tr_file_tab = gr.Interface(
    fn=transcribe_en,
    outputs="text",
    description="Upload a file or, click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(type="filepath", label="English file upload"),
        gr.Audio(source="microphone", type="filepath", label="English speech"),
    ],
    # Bundled sample clips (upload slot only, mic slot left empty).
    examples=[["samples/en/32941920.mp3", None], ["samples/en/32941921.mp3", None]],
)


# UI tab label paired with the interface it hosts, in display order.
_tab_specs = [
    ("Arabic Live Transcription", ar_tr_stream_tab),
    ("English Live Transcription", en_tr_stream_tab),
    ("தமிழ் நேரடி சொல் பகிர்ப்பு", ta_tr_stream_tab),
    ("Arabic File Transcription", ar_tr_file_tab),
    ("English File Transcription", en_tr_file_tab),
    ("தமிழ் ஒலி பதிப்பு சொல் பகிர்ப்பு", ta_tr_file_tab),
]
# TabbedInterface takes the interfaces and their labels as two parallel lists.
tabs = gr.TabbedInterface(
    [iface for _, iface in _tab_specs],
    [label for label, _ in _tab_specs],
)

# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    tabs.launch()