Zeimoto committed on
Commit
d130ccc
·
1 Parent(s): d5a4a27

add openai whisper-large-v3

Browse files
Files changed (1) hide show
  1. app.py +35 -1
app.py CHANGED
@@ -1,10 +1,44 @@
1
  import streamlit as st
2
  from st_audiorec import st_audiorec
3
 
 
 
 
 
# x = st.slider('Select a value')
# st.write(x, 'squared is', x * x)

# Render the audio recorder component and keep whatever it hands back.
wav_audio_data = st_audiorec()

# Only offer playback once the recorder has actually returned something.
has_recording = wav_audio_data is not None
if has_recording:
    st.audio(wav_audio_data, format='audio/wav')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
  from st_audiorec import st_audiorec
3
 
4
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
5
+ from datasets import load_dataset
6
+ import torch
7
+
# x = st.slider('Select a value')
# st.write(x, 'squared is', x * x)

model_id = "openai/whisper-large-v3"


@st.cache_resource
def load_asr_pipeline(model_id):
    """Build the Whisper speech-recognition pipeline once and reuse it.

    Streamlit re-executes this script from top to bottom on every user
    interaction. Without caching, the checkpoint would be re-instantiated
    and moved to the device on each rerun; ``st.cache_resource`` keeps a
    single pipeline object alive across reruns.

    Args:
        model_id: Hugging Face Hub identifier of the speech seq2seq
            checkpoint to load.

    Returns:
        An "automatic-speech-recognition" pipeline using float16 on
        ``cuda:0`` when CUDA is available, otherwise float32 on CPU.
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=16,
        return_timestamps=True,
        torch_dtype=torch_dtype,
        device=device,
    )


wav_audio_data = st_audiorec()

if wav_audio_data is not None:
    # Play the captured recording back to the user.
    st.audio(wav_audio_data, format='audio/wav')

# NOTE(review): indentation was not recoverable from the diff view; this
# assumes the model/transcription code runs at module level rather than
# inside the `if` above -- confirm against the real file.
pipe = load_asr_pipeline(model_id)

# NOTE(review): this transcribes a fixed LibriSpeech validation sample, not
# `wav_audio_data`, and the text goes to the server's stdout rather than the
# page -- confirm whether the recording should be transcribed and shown.
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
sample = dataset[0]["audio"]
result = pipe(sample)
print(result["text"])