j-tobias committed · Commit 8cfce12 · 1 Parent(s): 8414736

added Whisper Large

Browse files:
- __pycache__/processing.cpython-310.pyc  +0 -0
- app.py  +4 -2
- cards.txt  +10 -2
- processing.py  +13 -1
__pycache__/processing.cpython-310.pyc  DELETED  (binary file, 6.05 kB)
app.py  CHANGED

@@ -25,7 +25,7 @@ login(hf_token)
 
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
-MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h"]
+MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
 DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
 
 # HELPER FUNCTIONS

@@ -59,7 +59,9 @@ with gr.Blocks() as demo:
 
 
     gr.Markdown('# <p style="text-align: center;">ASR Model Comparison</p>')
-    gr.Markdown("
+    gr.Markdown("""
+
+    """)
 
 
     gr.Markdown("""### Welcome to ASR Model Comparison Hub!
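The app.py hunks extend MODEL_OPTIONS with the new checkpoint and replace what looks like an unterminated `gr.Markdown("` call with an empty triple-quoted one. The component that consumes MODEL_OPTIONS sits outside the diff; as a minimal sketch, assuming the list feeds a standard Gradio dropdown (the component and variable names below are illustrative, not taken from app.py):

```python
import gradio as gr

MODEL_OPTIONS = [
    "openai/whisper-tiny.en",
    "facebook/s2t-medium-librispeech-asr",
    "facebook/wav2vec2-base-960h",
    "openai/whisper-large-v2",
]

with gr.Blocks() as demo:
    # Any entry appended to MODEL_OPTIONS shows up in the UI without further changes.
    model_choice = gr.Dropdown(choices=MODEL_OPTIONS, value=MODEL_OPTIONS[0], label="ASR model")

demo.launch()
```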
cards.txt  CHANGED

@@ -16,7 +16,7 @@
 - Model Paper: [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171)
 - Training Data: [LibriSpeech ASR Corpus](https://www.openslr.org/12)
 @@
-####
+#### Wav2Vec Base 960h
 - ID: facebook/wav2vec2-base-960h
 - Hugging Face: [model](https://huggingface.co/facebook/wav2vec2-base-960h)
 - Creator: facebook

@@ -24,4 +24,12 @@
 - Model Size: 94.4 M Parameters
 - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
 - Training Data: ?
-@@
+@@
+#### Whisper Large v2
+- ID: openai/whisper-large-v2
+- Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
+- Creator: openai
+- Finetuned: No
+- Model Size: 1.54 B Parameters
+- Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
+- Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
processing.py  CHANGED

@@ -219,7 +219,11 @@ def load_model(model_id:str):
         processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr")
     elif model_id == "facebook/wav2vec2-base-960h":
         processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
-        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+        model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
+    elif model_id == "openai/whisper-large-v2":
+        processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+        model.config.forced_decoder_ids = None
     else:
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
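Lifted out of the diff, the new load_model branch is equivalent to the following self-contained snippet. The transformers imports are implied by the existing branches rather than shown in the hunk, so treat them as an assumption:

```python
# Assumed imports; the hunk itself does not show them.
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# The processor bundles the feature extractor (log-Mel spectrograms) and the tokenizer.
processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")

# ~1.54 B parameters per cards.txt, so expect a multi-gigabyte download.
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")

# Clear the forced decoder ids so generate() is not pinned to a fixed
# language/task prefix and can infer them from the audio instead.
model.config.forced_decoder_ids = None
```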
@@ -252,6 +256,14 @@ def model_compute(model, processor, sample, model_id):
         predicted_ids = torch.argmax(logits, dim=-1)
         transcription = processor.batch_decode(predicted_ids)
         return transcription[0].lower()
+    elif model_id == "openai/whisper-large-v2":
+        sample = sample["audio"]
+        input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+        predicted_ids = model.generate(input_features)
+        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        transcription = processor.tokenizer.normalize(transcription[0])
+        print("TRANSCRIPTION Whisper Large v2: ", transcription)
+        return transcription
     else:
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
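Taken together with the loading branch, the new model_compute path is plain greedy generation followed by Whisper's English text normalizer. A runnable sketch under one stated assumption: the one-second sine wave below stands in for the dataset sample that the app would normally supply:

```python
import numpy as np
from transformers import WhisperForConditionalGeneration, WhisperProcessor

processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
model.config.forced_decoder_ids = None

# Stand-in audio (assumption): 1 s of a 440 Hz tone at the 16 kHz rate Whisper expects.
sampling_rate = 16_000
t = np.arange(sampling_rate) / sampling_rate
audio = np.sin(2 * np.pi * 440 * t).astype(np.float32)

# Same pipeline as the diff: features -> generate -> decode -> normalize.
input_features = processor(audio, sampling_rate=sampling_rate, return_tensors="pt").input_features
predicted_ids = model.generate(input_features)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
transcription = processor.tokenizer.normalize(transcription[0])
print("TRANSCRIPTION Whisper Large v2:", transcription)
```

The normalize step presumably brings the hypothesis in line with lowercase, unpunctuated reference transcripts; the other branches call .lower() for the same effect.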