Steveeeeeeen (HF staff) committed on
Commit 2cf0ee6 · verified · 1 Parent(s): 5bc0279

Update constants.py

Files changed (1)
  1. constants.py +124 -51
constants.py CHANGED
@@ -1,51 +1,124 @@
- from dataclasses import dataclass
-
- # These classes are for user facing column names, to avoid having to change them
- # all around the code when a modif is needed
- @dataclass
- class ColumnContent:
-     name: str
-     type: str
-
- def fields(raw_class):
-     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
- @dataclass(frozen=True)
- class AutoEvalColumn:  # Auto evals column
-     model = ColumnContent("Model", "markdown")
-     avg_wer = ColumnContent("Average WER ⬇️", "number")
-     rtf = ColumnContent("RTFx ⬆️", "number")
-     ami_wer = ColumnContent("AMI", "number")
-     e22_wer = ColumnContent("Earnings22", "number")
-     gs_wer = ColumnContent("Gigaspeech", "number")
-     lsc_wer = ColumnContent("LS Clean", "number")
-     lso_wer = ColumnContent("LS Other", "number")
-     ss_wer = ColumnContent("SPGISpeech", "number")
-     tl_wer = ColumnContent("Tedlium", "number")
-     vp_wer = ColumnContent("Voxpopuli", "number")
-
-
- def make_clickable_model(model_name):
-     model_name_list = model_name.split("/")
-     if model_name_list[0] == "trt-llm":
-         link = "https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper"
-     elif model_name_list[0] == "faster-whisper":
-         link = "https://github.com/guillaumekln/faster-whisper"
-     elif model_name_list[0] == "Whisper.cpp":
-         link = "https://github.com/ggerganov/whisper.cpp"
-     elif model_name_list[0] == "WhisperKit":
-         link = "https://github.com/argmaxinc/WhisperKit"
-     elif model_name_list[0] == "WhisperMLX":
-         link = "https://huggingface.co/collections/mlx-community/whisper-663256f9964fbb1177db93dc"
-     else:
-         link = f"https://huggingface.co/{model_name}"
-     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
- def styled_error(error):
-     return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
- def styled_warning(warn):
-     return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
- def styled_message(message):
-     return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
+ from pathlib import Path
+
+ # Directories where model requests are stored
+ DIR_OUTPUT_REQUESTS = Path("requested_models")
+ EVAL_REQUESTS_PATH = Path("eval_requests")
+
+ ##########################
+ #    Text definitions    #
+ ##########################
+
+ banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/asr_leaderboard.png"
+ BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
+
+ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Open Automatic Speech Recognition Leaderboard </h1> </body> </html>"
+
+ INTRODUCTION_TEXT = "📝 The 🤗 Open ASR Leaderboard ranks and evaluates speech recognition models \
+ on the Hugging Face Hub. \
+ \nWe report the Average [WER](https://huggingface.co/spaces/evaluate-metric/wer) (⬇️ the lower the better) and [RTFx](https://github.com/NVIDIA/DeepLearningExamples/blob/master/Kaldi/SpeechRecognition/README.md#metrics) (⬆️ the higher the better). Models are ranked based on their Average WER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated. \
+ \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨. \
+ \nThe leaderboard currently focuses on English speech recognition, and will be expanded to multilingual evaluation in later versions."
+
+ CITATION_TEXT = """@misc{open-asr-leaderboard,
+     title = {Open Automatic Speech Recognition Leaderboard},
+     author = {Srivastav, Vaibhav and Majumdar, Somshubra and Koluguri, Nithin and Moumen, Adel and Gandhi, Sanchit and others},
+     year = 2023,
+     publisher = {Hugging Face},
+     howpublished = "\\url{https://huggingface.co/spaces/hf-audio/open_asr_leaderboard}"
+ }
+ """
+
+ METRICS_TAB_TEXT = """
+ Here you will find details about the speech recognition metrics and datasets reported in our leaderboard.
+
+ ## Metrics
+
+ Models are evaluated jointly using the Word Error Rate (WER) and Inverse Real Time Factor (RTFx) metrics. The WER metric
+ is used to assess the accuracy of a system, and the RTFx its inference speed. Models are ranked in the leaderboard based
+ on their WER, lowest to highest.
+
+ Crucially, the WER and RTFx values are computed for the same inference run using a single script. The implication of this is two-fold:
+ 1. The WER and RTFx values are coupled: for a given WER, one can expect to achieve the corresponding RTFx. This allows a submitter to trade off lower WER for higher RTFx should they wish.
+ 2. The WER and RTFx values are averaged over all audio samples in the benchmark (on the order of thousands of samples).
+
+ For details on reproducing the benchmark numbers, refer to the [Open ASR GitHub repository](https://github.com/huggingface/open_asr_leaderboard#evaluate-a-model).
+
+ ### Word Error Rate (WER)
+
+ Word Error Rate is used to measure the **accuracy** of automatic speech recognition systems. It calculates the percentage
+ of words in the system's output that differ from the reference (correct) transcript. **A lower WER value indicates higher accuracy**.
+
+ Take the following example:
+
+ | Reference:  | the | cat | sat     | on  | the | mat |
+ |-------------|-----|-----|---------|-----|-----|-----|
+ | Prediction: | the | cat | **sit** | on  | the |     |
+ | Label:      | ✅  | ✅  | S       | ✅  | ✅  | D   |
+
+ Here, we have:
+ * 1 substitution ("sit" instead of "sat")
+ * 0 insertions
+ * 1 deletion ("mat" is missing)
+
+ This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
+ reference (N), which for this example is 6:
+
+ ```
+ WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
+ ```
+
+ Giving a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing are removed from the references and predictions. You can find the evaluation code on our [GitHub repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
+
+ ### Inverse Real Time Factor (RTFx)
+
+ Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes a
+ model to process a given amount of speech. It is defined as:
+ ```
+ RTFx = (number of seconds of audio inferred) / (compute time in seconds)
+ ```
+
+ Therefore, an RTFx of 1 means a system processes speech as fast as it is spoken, while an RTFx of 2 means it takes half the time.
+ Thus, **a higher RTFx value indicates lower latency**.
+
+ ## How to reproduce our results
+
+ The ASR Leaderboard is a continued effort to benchmark open-source and open-access speech recognition models where possible.
+ Along with the Leaderboard, we're open-sourcing the codebase used for running these evaluations.
+ For more details head over to our repo at: https://github.com/huggingface/open_asr_leaderboard
+
+ P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
+
+ ## Benchmark datasets
+
+ Evaluating Speech Recognition systems is a hard problem. We use the multi-dataset benchmarking strategy proposed in the
+ [ESB paper](https://arxiv.org/abs/2210.13352) to obtain robust evaluation scores for each model.
+
+ ESB is a benchmark for evaluating the performance of a single automatic speech recognition (ASR) system across a broad
+ set of speech datasets. It comprises eight English speech recognition datasets, capturing a broad range of domains,
+ acoustic conditions, speaker styles, and transcription requirements. As such, it gives a better indication of how
+ a model is likely to perform on downstream ASR compared to evaluating it on one dataset alone.
+
+ The ESB score is calculated as a macro-average of the WER scores across the ESB datasets. The models in the leaderboard
+ are ranked based on their average WER scores, from lowest to highest.
+
+ | Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
+ |-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
+ | [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
+ | [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
+ | [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
+ | [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
+ | [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech) | Financial meetings | Oratory, spontaneous | 4900 | 100 | 100 | Punctuated & Cased | User Agreement |
+ | [Earnings-22](https://huggingface.co/datasets/revdotcom/earnings22) | Financial meetings | Oratory, spontaneous | 105 | 5 | 5 | Punctuated & Cased | CC-BY-SA-4.0 |
+ | [AMI](https://huggingface.co/datasets/edinburghcstr/ami) | Meetings | Spontaneous | 78 | 9 | 9 | Punctuated & Cased | CC-BY-4.0 |
+
+ For more details on the individual datasets and how models are evaluated to give the ESB score, refer to the [ESB paper](https://arxiv.org/abs/2210.13352).
+ """
+
+ LEADERBOARD_CSS = """
+ #leaderboard-table th .header-content {
+     white-space: nowrap;
+ }
+ #whisper-backends-tab th .header-content {
+     white-space: nowrap;
+ }
+ """