Commit b6907f5
Parent(s): ed9aac5
Trying a commit
app.py CHANGED
@@ -1,5 +1,5 @@
 import gradio as gr
-import librosa
+#import librosa
 import torch
 
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
@@ -9,24 +9,6 @@ model = SpeechT5ForSpeechToText.from_pretrained("openai/whisper-large")
 
 model.config.forced_decoder_ids = WhisperProcessor.get_decoder_prompt_ids(language="english", task="transcribe")
 
-def process_audio(sampling_rate, waveform):
-    # convert from int16 to floating point
-    waveform = waveform / 32678.0
-
-    # convert to mono if stereo
-    if len(waveform.shape) > 1:
-        waveform = librosa.to_mono(waveform.T)
-
-    # resample to 16 kHz if necessary
-    if sampling_rate != 16000:
-        waveform = librosa.resample(waveform, orig_sr=sampling_rate, target_sr=16000)
-
-    # limit to 30 seconds
-    waveform = waveform[:16000*30]
-
-    # make PyTorch tensor
-    waveform = torch.tensor(waveform)
-    return waveform
 
 def predict(audio, mic_audio=None):
     # audio = tuple (sample_rate, frames) or (sample_rate, (frames, channels))
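For reference, the preprocessing that the removed process_audio handled can be sketched without librosa, whose import this commit comments out. This is a minimal sketch, not code from the Space: the int16 conversion, mono mix-down, and 30-second cap carry over directly, while resampling is delegated to torchaudio as an assumed stand-in for librosa.resample. Note that the full-scale value for int16 audio is 32768 (the removed code divided by 32678).

import numpy as np
import torch
import torchaudio  # assumption: torchaudio is available to replace librosa.resample

def process_audio(sampling_rate: int, waveform: np.ndarray) -> torch.Tensor:
    # convert from int16 to floating point (int16 full scale is 32768)
    waveform = waveform.astype(np.float32) / 32768.0

    # convert to mono if stereo: average over the channel axis,
    # equivalent to librosa.to_mono(waveform.T) for (frames, channels) input
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    waveform = torch.from_numpy(waveform)

    # resample to 16 kHz if necessary
    if sampling_rate != 16000:
        waveform = torchaudio.functional.resample(waveform, orig_freq=sampling_rate, new_freq=16000)

    # limit to 30 seconds at 16 kHz, the maximum Whisper input length
    return waveform[:16000 * 30]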
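Also for context only: the unchanged lines around the hunk suggest a standard transformers Whisper setup. Below is a minimal sketch of that pattern, assuming the openai/whisper-large checkpoint named in the diff; get_decoder_prompt_ids is normally called on a loaded processor instance rather than on the class, and the transcribe helper here is hypothetical, standing in for the Space's predict function.

import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="english", task="transcribe")

def transcribe(waveform: torch.Tensor) -> str:
    # waveform: mono float audio at 16 kHz, e.g. the output of the process_audio sketch above
    inputs = processor(waveform.numpy(), sampling_rate=16000, return_tensors="pt")
    predicted_ids = model.generate(inputs.input_features)
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]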