import gradio as gr
import torch
from transformers import pipeline

# modelname = 'openai/whisper-large-v2'
modelname = 'openai/whisper-small'
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Chunk long audio into 30-second windows so files longer than
# Whisper's context window can still be transcribed.
pipe = pipeline('automatic-speech-recognition', model=modelname,
                chunk_length_s=30, device=device)


def speech_to_text(mic, upload, state):
    # state['active'] tracks which tab is selected: 0 = microphone, 1 = upload.
    if state['active'] == 1:
        audio = upload
    else:
        audio = mic
    if audio is None:
        yield None
        return
    prediction = pipe(audio, batch_size=8,
                      generate_kwargs={'task': 'transcribe'},
                      return_timestamps=True)
    yield prediction['text']


def tab_select(evt: gr.SelectData, state):
    # Remember which tab the user switched to in the per-session state.
    state['active'] = evt.index
    print('select {}'.format(evt.index))
    return state


with gr.Blocks(title='OpenAI Whisper Demo') as app:
    state = gr.State({'active': 0})
    gr.Markdown('''
    # OpenAI Whisper Demo
    ''')
    with gr.Row():
        with gr.Column():
            inputs = []
            with gr.Tab('microphone') as tab1:
                mic = gr.Audio(sources=['microphone'], type='filepath')
                inputs.append(mic)
                tab1.select(tab_select, inputs=state, outputs=state)
            with gr.Tab('upload') as tab2:
                upload = gr.Audio(sources=['upload'], type='filepath')
                inputs.append(upload)
                tab2.select(tab_select, inputs=state, outputs=state)
    with gr.Row():
        with gr.Column(min_width=160):
            clearBtn = gr.ClearButton()
        with gr.Column(min_width=160):
            btn = gr.Button(value='Submit')
    with gr.Row():
        with gr.Column():
            outputs = [gr.Textbox(label='output')]
    # Clear both audio inputs and the transcript on "Clear".
    clearBtn.add(inputs + outputs)
    btn.click(speech_to_text, inputs=inputs + [state], outputs=outputs,
              concurrency_limit=20)

app.launch()
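
# A minimal sanity check of the ASR pipeline outside the Gradio UI
# (a sketch, not part of the demo; assumes a local audio file
# 'sample.wav' exists):
#
#   result = pipe('sample.wav',
#                 generate_kwargs={'task': 'transcribe'},
#                 return_timestamps=True)
#   print(result['text'])    # full transcript as one string
#   print(result['chunks'])  # list of {'timestamp': (start, end), 'text': ...}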