File size: 5,242 Bytes
2c3f8ff
 
 
 
 
 
478bedf
2c3f8ff
478bedf
 
2904d5d
478bedf
 
649f719
 
 
 
 
a7bf230
649f719
478bedf
 
 
2c3f8ff
478bedf
2c3f8ff
 
 
649f719
 
 
 
 
a7bf230
478bedf
 
 
 
 
 
 
2c3f8ff
2904d5d
 
 
 
 
a7bf230
2904d5d
 
 
 
 
 
 
 
 
478bedf
 
 
2c3f8ff
dec6760
2c3f8ff
 
 
 
 
 
e60cafb
478bedf
 
 
 
 
dec6760
478bedf
 
 
 
 
 
dec6760
478bedf
 
2904d5d
 
 
 
dec6760
2904d5d
 
 
 
 
 
dec6760
2904d5d
 
 
478bedf
 
 
 
6a9d77a
dec6760
478bedf
dec6760
e60cafb
478bedf
 
 
 
 
 
dec6760
 
478bedf
dec6760
 
478bedf
 
2904d5d
 
 
 
dec6760
 
2904d5d
dec6760
 
2904d5d
 
478bedf
 
2904d5d
 
 
 
 
 
 
 
478bedf
 
2904d5d
6a9d77a
478bedf
2904d5d
6a9d77a
478bedf
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import gradio as gr
from transformers import pipeline
import time

# p = pipeline("automatic-speech-recognition", model="/Users/mkesavan/aidev/speechAI-trials/xlsr-wave2vec/wav2vec2-large-xls-r-300m-tamil-colab/checkpoint-1600")

# ASR pipelines for Tamil, Arabic and English, shared by all transcribe_*
# handlers below. NOTE(review): loading three models at import time is
# memory-heavy — confirm the host has room for all of them.
p_ta = pipeline("automatic-speech-recognition", model="kmknair/wav2vec2-xlsr-tamil")
p_ar = pipeline("automatic-speech-recognition", model="kmknair/wav2vec2-xlsr-arabic")
# BUG FIX: the keyword was misspelled "mdoel", so the intended English model
# was never passed to pipeline() and the task's default model was used
# (or the stray kwarg was misrouted) instead.
p_en = pipeline("automatic-speech-recognition", model="patrickvonplaten/hubert-xlarge-ls960-ft-4-gram")


def transcribe_ta(audio_u, audio_m):
    """Transcribe Tamil speech from an uploaded file and/or a mic recording.

    Args:
        audio_u: filepath of an uploaded audio clip, or None.
        audio_m: filepath of a microphone recording, or None.

    Returns:
        The transcription(s), joined with a newline when both inputs are
        present; empty string when neither is given.
    """
    # Collect only the inputs actually provided. This fixes the old behavior
    # of unconditionally prefixing "\n" to the mic transcription, which left
    # a spurious leading newline when only the microphone clip was supplied.
    parts = [p_ta(a)["text"] for a in (audio_u, audio_m) if a is not None]
    return "\n".join(parts)

def transcribe_ta_stream(audio, state=""):
    """Streaming Tamil transcription: append each chunk's text to *state*.

    Returns the accumulated transcript twice — once for the textbox output
    and once as the new session state.
    """
    # Throttle so the model isn't invoked on every tiny audio chunk.
    time.sleep(2)
    chunk_text = p_ta(audio)["text"]
    accumulated = state + chunk_text + " "
    return accumulated, accumulated

def transcribe_ar(audio_u, audio_m):
    """Transcribe Arabic speech from an uploaded file and/or a mic recording.

    Args:
        audio_u: filepath of an uploaded audio clip, or None.
        audio_m: filepath of a microphone recording, or None.

    Returns:
        The transcription(s), joined with a newline when both inputs are
        present; empty string when neither is given.
    """
    # Collect only the inputs actually provided. This fixes the old behavior
    # of unconditionally prefixing "\n" to the mic transcription, which left
    # a spurious leading newline when only the microphone clip was supplied.
    parts = [p_ar(a)["text"] for a in (audio_u, audio_m) if a is not None]
    return "\n".join(parts)

def transcribe_ar_stream(audio, state=""):
    """Streaming Arabic transcription: append each chunk's text to *state*.

    Returns the accumulated transcript twice — once for the textbox output
    and once as the new session state.
    """
    # Throttle so the model isn't invoked on every tiny audio chunk.
    time.sleep(2)
    chunk_text = p_ar(audio)["text"]
    accumulated = state + chunk_text + " "
    return accumulated, accumulated

def transcribe_en(audio_u, audio_m):
    """Transcribe English speech from an uploaded file and/or a mic recording.

    Args:
        audio_u: filepath of an uploaded audio clip, or None.
        audio_m: filepath of a microphone recording, or None.

    Returns:
        The transcription(s), joined with a newline when both inputs are
        present; empty string when neither is given.
    """
    # Collect only the inputs actually provided. This fixes the old behavior
    # of unconditionally prefixing "\n" to the mic transcription, which left
    # a spurious leading newline when only the microphone clip was supplied.
    parts = [p_en(a)["text"] for a in (audio_u, audio_m) if a is not None]
    return "\n".join(parts)

def transcribe_en_stream(audio, state=""):
    """Streaming English transcription: append each chunk's text to *state*.

    Returns the accumulated transcript twice — once for the textbox output
    and once as the new session state.
    """
    # Throttle so the model isn't invoked on every tiny audio chunk.
    time.sleep(2)
    chunk_text = p_en(audio)["text"]
    accumulated = state + chunk_text + " "
    return accumulated, accumulated


# transcribe Tamil stream
ta_tr_stream_tab = gr.Interface(
    fn=transcribe_ta_stream,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True, label="தமிழ் பேச்சு"),
        "state"
    ],
    outputs=[
    "textbox",
    "state"
    ],
    description="ரெகாட் பட்டண் அமர்தி பேசவும், பேச்சு சொல் பகிர்ப்பு வலது பக்கதில் அச்சிடபடும்",
    live=True)
# Live (streaming) Arabic transcription tab: mic chunks in, running text out.
ar_tr_stream_tab = gr.Interface(
    fn=transcribe_ar_stream,
    live=True,
    description="Click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True, label="Arabic speech"),
        "state",
    ],
    outputs=["textbox", "state"],
)

# Live (streaming) English transcription tab: mic chunks in, running text out.
en_tr_stream_tab = gr.Interface(
    fn=transcribe_en_stream,
    live=True,
    description="Click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True, label="English speech"),
        "state",
    ],
    outputs=["textbox", "state"],
)


# One-shot Tamil transcription tab: file upload and/or full mic recording.
ta_tr_file_tab = gr.Interface(
    fn=transcribe_ta,
    outputs="text",
    description="ஒலி பதிப்பு சமர்ப்பிக்கவும், அல்லது ரெகாட் பட்டண் அமர்தி பேசவும், பேச்சு சொல் பகிர்ப்பு வலது பக்கதில் அச்சிடபடும்",
    inputs=[
        gr.Audio(type="filepath", label="தமிழ் ஒலி பதிப்பு சமர்ப்பித்தல்"),
        gr.Audio(source="microphone", type="filepath", label="தமிழ் பேச்சு"),
    ],
    # Bundled sample clips (upload slot only, mic slot left empty).
    examples=[["samples/ta/32862591.mp3", None], ["samples/ta/32862612.mp3", None]],
)

# One-shot Arabic transcription tab: file upload and/or full mic recording.
ar_tr_file_tab = gr.Interface(
    fn=transcribe_ar,
    outputs="text",
    description="Upload a file or, click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(type="filepath", label="Arabic file upload"),
        gr.Audio(source="microphone", type="filepath", label="Arabic speech"),
    ],
    # Bundled sample clips (upload slot only, mic slot left empty).
    examples=[["samples/ar/19706399.mp3", None], ["samples/ar/19985784.mp3", None]],
)

# One-shot English transcription tab: file upload and/or full mic recording.
en_tr_file_tab = gr.Interface(
    fn=transcribe_en,
    outputs="text",
    description="Upload a file or, click record from microphone and start talking, transcription shall appear to the right.",
    inputs=[
        gr.Audio(type="filepath", label="English file upload"),
        gr.Audio(source="microphone", type="filepath", label="English speech"),
    ],
    # Bundled sample clips (upload slot only, mic slot left empty).
    examples=[["samples/en/32941920.mp3", None], ["samples/en/32941921.mp3", None]],
)


# UI tab label paired with the interface it hosts, in display order.
_tab_specs = [
    ("Arabic Live Transcription", ar_tr_stream_tab),
    ("English Live Transcription", en_tr_stream_tab),
    ("தமிழ் நேரடி சொல் பகிர்ப்பு", ta_tr_stream_tab),
    ("Arabic File Transcription", ar_tr_file_tab),
    ("English File Transcription", en_tr_file_tab),
    ("தமிழ் ஒலி பதிப்பு சொல் பகிர்ப்பு", ta_tr_file_tab),
]
# TabbedInterface takes the interfaces and their labels as two parallel lists.
tabs = gr.TabbedInterface(
    [iface for _, iface in _tab_specs],
    [label for label, _ in _tab_specs],
)

# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    tabs.launch()