import gradio as gr
import torch
from transformers import pipeline

# Use the GPU when available; otherwise fall back to CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# chunk_length_s=30 enables chunked long-form transcription, so audio
# longer than Whisper's 30-second context window is handled transparently.
pipe = pipeline(
    'automatic-speech-recognition',
    model='openai/whisper-large',
    chunk_length_s=30,
    device=device,
)


def speech_to_text(mic, upload, state):
    # The active tab id (kept in per-session state) selects the audio
    # source: 0 = microphone recording, 1 = uploaded file.
    audio = upload if state['active'] == 1 else mic
    if audio is None:
        yield None
        return
    prediction = pipe(
        audio,
        batch_size=8,
        generate_kwargs={'task': 'transcribe'},
        return_timestamps=True,
    )
    yield prediction['text']


def tab_select(evt: gr.SelectData, state):
    # Remember which tab is active so speech_to_text reads the right input.
    state['active'] = evt.target.id
    print('select {}'.format(evt.target.id))
    return state


with gr.Blocks(title='OpenAI Whisper Demo') as app:
    # Per-session state: the id of the currently selected input tab.
    state = gr.State({'active': 0})
    gr.Markdown('# OpenAI Whisper Demo')
    with gr.Row():
        with gr.Column():
            inputs = []
            with gr.Tab(label='microphone', id=0) as tab1:
                mic = gr.Audio(source='microphone', type='filepath')
                inputs.append(mic)
                tab1.select(tab_select, inputs=state, outputs=state)
            with gr.Tab(label='upload', id=1) as tab2:
                upload = gr.Audio(source='upload', type='filepath')
                inputs.append(upload)
                tab2.select(tab_select, inputs=state, outputs=state)
            with gr.Row():
                with gr.Column(min_width=160):
                    clear_btn = gr.ClearButton()
                with gr.Column(min_width=160):
                    btn = gr.Button(value='Submit')
    with gr.Row():
        with gr.Column():
            outputs = [gr.Textbox(label='output')]

    # Clear both audio inputs and the transcript on "Clear".
    clear_btn.add(inputs + outputs)
    btn.click(speech_to_text, inputs=inputs + [state], outputs=outputs)

app.queue(concurrency_count=20)
app.launch()
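# Running the demo, as a sketch (the install line and the app.py filename
# are assumptions, not pinned by this script). Note that the script targets
# the Gradio 3.x API: gr.Audio(source=...) and queue(concurrency_count=...)
# were removed in Gradio 4.
#
#   pip install "gradio<4" transformers torch
#   python app.py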