import time

import gradio as gr
from transformers import pipeline

# Local fine-tuned checkpoint (kept for reference):
# p = pipeline("automatic-speech-recognition", model="/Users/mkesavan/aidev/speechAI-trials/xlsr-wave2vec/wav2vec2-large-xls-r-300m-tamil-colab/checkpoint-1600")

# ASR pipelines: Tamil, Arabic, and English
p_ta = pipeline("automatic-speech-recognition", model="kmknair/wav2vec2-xlsr-tamil")
p_ar = pipeline("automatic-speech-recognition", model="kmknair/wav2vec2-xlsr-arabic")
p_en = pipeline("automatic-speech-recognition", model="patrickvonplaten/hubert-xlarge-ls960-ft-4-gram")


def transcribe_ta(audio_u, audio_m):
    """Transcribe an uploaded and/or microphone-recorded Tamil audio file."""
    text = ""
    if audio_u is not None:
        text += p_ta(audio_u)["text"]
    if audio_m is not None:
        text += p_ta(audio_m)["text"]
    return text


def transcribe_ta_stream(audio, state=""):
    """Transcribe a streamed Tamil audio chunk, accumulating text in `state`."""
    time.sleep(2)
    text = p_ta(audio)["text"]
    state += text + " "
    return state, state


def transcribe_ar(audio_u, audio_m):
    """Transcribe an uploaded and/or microphone-recorded Arabic audio file."""
    text = ""
    if audio_u is not None:
        text += p_ar(audio_u)["text"]
    if audio_m is not None:
        text += p_ar(audio_m)["text"]
    return text


def transcribe_ar_stream(audio, state=""):
    """Transcribe a streamed Arabic audio chunk, accumulating text in `state`."""
    time.sleep(2)
    text = p_ar(audio)["text"]
    state += text + " "
    return state, state


def transcribe_en(audio_u, audio_m):
    """Transcribe an uploaded and/or microphone-recorded English audio file."""
    text = ""
    if audio_u is not None:
        text += p_en(audio_u)["text"]
    if audio_m is not None:
        text += p_en(audio_m)["text"]
    return text


def transcribe_en_stream(audio, state=""):
    """Transcribe a streamed English audio chunk, accumulating text in `state`."""
    time.sleep(2)
    text = p_en(audio)["text"]
    state += text + " "
    return state, state


# transcribe Tamil stream
ta_tr_stream_tab = gr.Interface(
    fn=transcribe_ta_stream,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True)

# transcribe Arabic stream
ar_tr_stream_tab = gr.Interface(
    fn=transcribe_ar_stream,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True)

# transcribe English stream
en_tr_stream_tab = gr.Interface(
    fn=transcribe_en_stream,
    inputs=[
        gr.Audio(source="microphone", type="filepath", streaming=True),
        "state"
    ],
    outputs=[
        "textbox",
        "state"
    ],
    live=True)

# transcribe Tamil file
ta_tr_file_tab = gr.Interface(
    fn=transcribe_ta,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Audio(source="microphone", type="filepath")
    ],
    outputs="text")

# transcribe Arabic file
ar_tr_file_tab = gr.Interface(
    fn=transcribe_ar,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Audio(source="microphone", type="filepath")
    ],
    outputs="text")

# transcribe English file
en_tr_file_tab = gr.Interface(
    fn=transcribe_en,
    inputs=[
        gr.Audio(type="filepath"),
        gr.Audio(source="microphone", type="filepath")
    ],
    outputs="text")

tabs = gr.TabbedInterface(
    [
        ar_tr_stream_tab,
        en_tr_stream_tab,
        ta_tr_stream_tab,
        ar_tr_file_tab,
        en_tr_file_tab,
        ta_tr_file_tab
    ],
    [
        "Arabic Live Transcription",
        "English Live Transcription",
        "Tamil Live Transcription",
        "Arabic File Transcription",
        "English File Transcription",
        "Tamil File Transcription"
    ]
)

if __name__ == "__main__":
    tabs.launch()