j-tobias committed
Commit 8cfce12 · Parent(s): 8414736

added Whisper Large

Files changed (4):
  1. __pycache__/processing.cpython-310.pyc +0 -0
  2. app.py +4 -2
  3. cards.txt +10 -2
  4. processing.py +13 -1
__pycache__/processing.cpython-310.pyc DELETED
Binary file (6.05 kB)
 
app.py CHANGED
@@ -25,7 +25,7 @@ login(hf_token)
 
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
-MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h"]
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
 DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
 
 # HELPER FUNCTIONS
@@ -59,7 +59,9 @@ with gr.Blocks() as demo:
 
 
     gr.Markdown('# <p style="text-align: center;">ASR Model Comparison 💬</p>')
-    gr.Markdown("-------")
+    gr.Markdown("""
+
+    """)
 
 
     gr.Markdown("""### Welcome to ASR Model Comparison Hub! 🎉
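The hunk above only touches the `MODEL_OPTIONS` list; the Gradio components that consume it sit outside the diff. As a minimal sketch of how such a list typically feeds a comparison UI, assuming the demo uses dropdowns (the component names `model_a`/`model_b` are hypothetical, not taken from app.py):

```python
import gradio as gr

MODEL_OPTIONS = [
    "openai/whisper-tiny.en",
    "facebook/s2t-medium-librispeech-asr",
    "facebook/wav2vec2-base-960h",
    "openai/whisper-large-v2",  # option added by this commit
]

with gr.Blocks() as demo:
    # Two dropdowns let the user pick a pair of ASR models to compare.
    model_a = gr.Dropdown(choices=MODEL_OPTIONS, label="Model A")
    model_b = gr.Dropdown(choices=MODEL_OPTIONS, label="Model B")
```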
cards.txt CHANGED
@@ -16,7 +16,7 @@
 - Model Paper: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171)
 - Training Data: [LibriSpeech ASR Corpus](https://www.openslr.org/12)
 @@
-####
+#### Wav2Vec Base 960h
 - ID: facebook/wav2vec2-base-960h
 - Hugging Face: [model](https://huggingface.co/facebook/wav2vec2-base-960h)
 - Creator: facebook
@@ -24,4 +24,12 @@
 - Model Size: 94.4 M Parameters
 - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
 - Training Data: ?
-@@
+@@
+#### Whisper Large v2
+- ID: openai/whisper-large-v2
+- Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
+- Creator: openai
+- Finetuned: No
+- Model Size: 1.54 B Parameters
+- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
+- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
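cards.txt separates model cards with `@@` lines and opens each card with a `####` title; the hunks above fill in the previously empty Wav2Vec title and append a card for the new Whisper checkpoint. A minimal sketch of a parser for that layout, assuming `@@` is the record separator and the `- ID:` bullet identifies each card (this helper is illustrative, not the app's actual code):

```python
def load_cards(path: str = "cards.txt") -> dict[str, str]:
    """Split cards.txt on '@@' separators and key each card by its model ID."""
    with open(path, encoding="utf-8") as f:
        blocks = f.read().split("@@")
    cards = {}
    for block in blocks:
        block = block.strip()
        # Each card carries a "- ID: <model_id>" bullet we can key on.
        for line in block.splitlines():
            if line.strip().startswith("- ID:"):
                cards[line.split("- ID:", 1)[1].strip()] = block
                break
    return cards
```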
processing.py CHANGED
@@ -219,7 +219,11 @@ def load_model(model_id:str):
         processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
     elif model_id == "facebook/wav2vec2-base-960h":
         processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+    elif model_id == "openai/whisper-large-v2":
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+        model.config.forced_decoder_ids = None
     else:
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -252,6 +256,14 @@ def model_compute(model, processor, sample, model_id):
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)
         return transcription[0].lower()
+    elif model_id == "openai/whisper-large-v2":
+        sample = sample["audio"]
+        input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        transcription = processor.tokenizer.normalize(transcription[0])
+        print("TRANSCRIPTION Whisper Large v2: ", transcription)
+        return transcription
     else:
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
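Taken together, the two processing.py hunks give the new checkpoint its own load and decode path: `model.config.forced_decoder_ids = None` clears the forced language/task tokens so `generate()` chooses them itself, and `processor.tokenizer.normalize()` applies Whisper's English text normalizer (lower-casing, punctuation removal) so the output lines up with the `.lower()`-ed transcripts of the CTC branch when computing error rates. A minimal end-to-end sketch of that path, assuming a sample in the `{"audio": {"array": ..., "sampling_rate": ...}}` layout the datasets provide; the debug `print` from the diff is omitted:

```python
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None  # let generate() pick language/task tokens

def transcribe(sample: dict) -> str:
    audio = sample["audio"]
    # Log-mel input features; Whisper expects 16 kHz audio.
    input_features = processor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
    ).input_features
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    # Normalize to match the lower-cased CTC outputs for WER comparison.
    return processor.tokenizer.normalize(text)
```

Note that at 1.54 B parameters this checkpoint is far heavier than the other three options, so transcription will be noticeably slower on a CPU-only Space.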