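# Gradio demo: speech-to-text with OpenAI Whisper, using the transformers
# automatic-speech-recognition pipeline. Audio comes from a microphone
# recording or an uploaded file, selected via two tabs.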
import gradio as gr
import torch
from transformers import pipeline

# Whisper checkpoint to load; swap in the commented-out 'openai/whisper-large-v2'
# for higher accuracy at the cost of speed and GPU memory.
# modelname = 'openai/whisper-large-v2'
modelname = 'openai/whisper-small'

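# Run inference on a GPU when one is available; otherwise fall back to the CPU.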
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
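# chunk_length_s=30 makes the pipeline split long recordings into 30-second
# windows, so audio longer than Whisper's native context can be transcribed.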
pipe = pipeline('automatic-speech-recognition', model=modelname, chunk_length_s=30, device=device)

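# Transcribe the audio from whichever input tab is active. Written as a
# generator (yield), which Gradio accepts as an event handler.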
def speech_to_text(mic, upload, state):
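	# state['active'] holds the index of the selected tab: 0 = microphone, 1 = upload.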
	if state['active'] == 1:
		audio = upload
	else:
		audio = mic

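	# Nothing recorded or uploaded on the active tab yet.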
	if audio is None:
		yield None
		return

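	# batch_size=8 decodes up to eight audio chunks in parallel;
	# return_timestamps=True adds segment timestamps alongside the text.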
	prediction = pipe(audio, batch_size=8, generate_kwargs={'task': 'transcribe'}, return_timestamps=True)
	yield prediction['text']

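# Track which input tab the user selected; evt.index is 0 for 'microphone'
# and 1 for 'upload'.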
def tab_select(evt: gr.SelectData, state):
	state['active'] = evt.index
	print(f'select {evt.index}')
	return state

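# Build the UI: two input tabs, Clear/Submit buttons, and a transcript textbox.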
with gr.Blocks(title='OpenAI Whisper Demo') as app:
	state = gr.State({ 'active': 0 })

	gr.Markdown('# OpenAI Whisper Demo')
	with gr.Row():
		with gr.Column():
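			# Collect both audio components so Submit and Clear can address them together.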
			inputs = []
			with gr.Tab('microphone') as tab1:
				mic = gr.Audio(sources=['microphone'], type='filepath')
				inputs.append(mic)
				tab1.select(tab_select, inputs=state, outputs=state)
			with gr.Tab('upload') as tab2:
				upload = gr.Audio(sources=['upload'], type='filepath')
				inputs.append(upload)
				tab2.select(tab_select, inputs=state, outputs=state)
	with gr.Row():
		with gr.Column(min_width=160):
			clear_btn = gr.ClearButton()
		with gr.Column(min_width=160):
			btn = gr.Button(value='Submit')
	with gr.Row():
		with gr.Column():
			outputs = [gr.Textbox(label='output')]
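	# The Clear button empties both audio inputs and the transcript box.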
	clear_btn.add(inputs + outputs)
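	# concurrency_limit=20 lets up to 20 transcription requests run at once for this event.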
	btn.click(speech_to_text, inputs=inputs + [state], outputs=outputs, concurrency_limit=20)
app.launch()