j-tobias committed · Commit 8cfce12 · 1 Parent(s): 8414736

added Whisper Large

Browse files:
- __pycache__/processing.cpython-310.pyc  +0 -0
- app.py  +4 -2
- cards.txt  +10 -2
- processing.py  +13 -1
__pycache__/processing.cpython-310.pyc  DELETED  (binary file, 6.05 kB)
app.py  CHANGED

@@ -25,7 +25,7 @@ login(hf_token)
 
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
-MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h"]
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
 DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
 
 # HELPER FUNCTIONS

@@ -59,7 +59,9 @@ with gr.Blocks() as demo:
 
 
     gr.Markdown('# <p style="text-align: center;">ASR Model Comparison</p>')
-    gr.Markdown("
+    gr.Markdown("""
+
+    """)
 
 
     gr.Markdown("""### Welcome to ASR Model Comparison Hub!
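The app.py hunks extend MODEL_OPTIONS with the new checkpoint and replace what looks like an unterminated `gr.Markdown("` call with an empty triple-quoted one. The component that consumes MODEL_OPTIONS sits outside the diff; as a minimal sketch, assuming the list feeds a standard Gradio dropdown (the component and variable names below are illustrative, not taken from app.py):

```python
import gradio as gr

MODEL_OPTIONS = [
    "openai/whisper-tiny.en",
    "facebook/s2t-medium-librispeech-asr",
    "facebook/wav2vec2-base-960h",
    "openai/whisper-large-v2",
]

with gr.Blocks() as demo:
    # Any entry appended to MODEL_OPTIONS shows up in the UI without further changes.
    model_choice = gr.Dropdown(choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="ASR model")

demo.launch()
```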
cards.txt  CHANGED

@@ -16,7 +16,7 @@
 - Model Paper: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171)
 - Training Data: [LibriSpeech ASR Corpus](https://www.openslr.org/12)
 @@
-####
+#### Wav2Vec Base 960h
 - ID: facebook/wav2vec2-base-960h
 - Hugging Face: [model](https://huggingface.co/facebook/wav2vec2-base-960h)
 - Creator: facebook

@@ -24,4 +24,12 @@
 - Model Size: 94.4 M Parameters
 - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
 - Training Data: ?
-@@
+@@
+#### Whisper Large v2
+- ID: openai/whisper-large-v2
+- Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
+- Creator: openai
+- Finetuned: No
+- Model Size: 1.54 B Parameters
+- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
+- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
processing.py  CHANGED

@@ -219,7 +219,11 @@ def load_model(model_id:str):
         processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
     elif model_id == "facebook/wav2vec2-base-960h":
         processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+    elif model_id == "openai/whisper-large-v2":
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+        model.config.forced_decoder_ids = None
     else:
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
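Lifted out of the diff, the new load_model branch is equivalent to the following self-contained snippet. The transformers imports are implied by the existing branches rather than shown in the hunk, so treat them as an assumption:

```python
# Assumed imports; the hunk itself does not show them.
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# The processor bundles the feature extractor (log-Mel spectrograms) and the tokenizer.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")

# ~1.54 B parameters per cards.txt, so expect a multi-gigabyte download.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# Clear the forced decoder ids so generate() is not pinned to a fixed
# language/task prefix and can infer them from the audio instead.
model.config.forced_decoder_ids = None
```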
@@ -252,6 +256,14 @@ def model_compute(model, processor, sample, model_id):
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)
         return transcription[0].lower()
+    elif model_id == "openai/whisper-large-v2":
+        sample = sample["audio"]
+        input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        transcription = processor.tokenizer.normalize(transcription[0])
+        print("TRANSCRIPTION Whisper Large v2: ", transcription)
+        return transcription
     else:
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
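Taken together with the loading branch, the new model_compute path is plain greedy generation followed by Whisper's English text normalizer. A runnable sketch under one stated assumption: the one-second sine wave below stands in for the dataset sample that the app would normally supply:

```python
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None

# Stand-in audio (assumption): 1 s of a 440 Hz tone at the 16 kHz rate Whisper expects.
sampling_rate = 16_000
t = np.arange(sampling_rate) / sampling_rate
audio = np.sin(2 * np.pi * 440 * t).astype(np.float32)

# Same pipeline as the diff: features -> generate -> decode -> normalize.
input_features = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription = processor.tokenizer.normalize(transcription[0])
print("TRANSCRIPTION Whisper Large v2:", transcription)
```

The normalize step presumably brings the hypothesis in line with lowercase, unpunctuated reference transcripts; the other branches call .lower() for the same effect.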