owaski committed on
Commit
ed9aac5
·
1 Parent(s): f42dcac
Files changed (1) hide show
  1. app.py +64 -3
app.py CHANGED
@@ -1,4 +1,65 @@
1
- import streamlit as st
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import librosa
3
+ import torch
4
 
5
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
6
+
7
# Load the Whisper processor and the matching conditional-generation model
# from the same checkpoint. WhisperForConditionalGeneration is the class
# imported at the top of the file.
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")

# Pin decoding to English transcription. get_decoder_prompt_ids is an instance
# method, so it must be called on the loaded processor, not on the class.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")
11
+
12
def process_audio(sampling_rate, waveform):
    """Convert raw audio samples into a mono 16 kHz tensor of at most 30 s.

    Args:
        sampling_rate: Sample rate of ``waveform`` in Hz.
        waveform: Array of audio samples, 1-D for mono or 2-D for
            multi-channel audio. Assumed to be int16 PCM laid out as
            (frames, channels), as the caller's comment suggests; confirm
            against the Gradio version in use.

    Returns:
        A 1-D ``torch.Tensor`` of floating-point samples at 16 kHz,
        truncated to the first 30 seconds.
    """
    target_rate = 16000
    max_seconds = 30

    # int16 samples span [-32768, 32767]; dividing by 2**15 maps them
    # onto [-1.0, 1.0).
    waveform = waveform / 32768.0

    # Down-mix multi-channel audio; librosa expects (channels, frames).
    if len(waveform.shape) > 1:
        waveform = librosa.to_mono(waveform.T)

    # Resample only when the input is not already at the target rate.
    if sampling_rate != target_rate:
        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=target_rate)

    # Keep at most the first 30 seconds of audio.
    waveform = waveform[:target_rate * max_seconds]

    return torch.tensor(waveform)
30
+
31
def predict(audio, mic_audio=None):
    """Transcribe an uploaded or recorded clip and return the text.

    Each argument is either ``None`` or a ``(sample_rate, samples)`` tuple,
    where ``samples`` holds the frames (optionally with a channel axis).
    The microphone recording takes precedence when both are supplied.
    """
    # Nothing was uploaded or recorded.
    if mic_audio is None and audio is None:
        return "(please provide audio)"

    # Prefer the microphone input over the uploaded file.
    chosen = mic_audio if mic_audio is not None else audio
    rate, samples = chosen

    prepared = process_audio(rate, samples)
    encoded = processor(prepared, sampling_rate=16000, return_tensors="pt")
    token_ids = model.generate(encoded.input_features, max_length=400)
    texts = processor.batch_decode(token_ids, skip_special_tokens=True)
    return texts[0]
45
+
46
+
47
# Page title shown above the demo.
title = "Demo for Whisper -> Something -> XLS-R"

# Usage instructions rendered under the title (HTML is allowed).
description = """
<b>How to use:</b> Upload an audio file or record using the microphone. The audio is converted to mono and resampled to 16 kHz before
being passed into the model. The output is the text transcription of the audio.
"""

# Build the UI with two optional audio inputs (upload and microphone) and a
# single text output, then start the app. The usage text is passed as
# `description`; the original passed an undefined `article` name.
gr.Interface(
    fn=predict,
    inputs=[
        gr.Audio(label="Upload Speech", source="upload", type="numpy"),
        gr.Audio(label="Record Speech", source="microphone", type="numpy"),
    ],
    outputs=[
        gr.Text(label="Transcription"),
    ],
    title=title,
    description=description,
).launch()