Update constants.py
Browse files- constants.py +51 -124
constants.py
CHANGED
@@ -1,124 +1,51 @@
|
|
1 |
-
from
|
2 |
-
|
3 |
-
#
|
4 |
-
|
5 |
-
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
""
|
30 |
-
|
31 |
-
|
32 |
-
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
-
| Reference: | the | cat | sat | on | the | mat |
|
54 |
-
|-------------|-----|-----|---------|-----|-----|-----|
|
55 |
-
| Prediction: | the | cat | **sit** | on | the | |
|
56 |
-
| Label: | ✅ | ✅ | S | ✅ | ✅ | D |
|
57 |
-
|
58 |
-
Here, we have:
|
59 |
-
* 1 substitution ("sit" instead of "sat")
|
60 |
-
* 0 insertions
|
61 |
-
* 1 deletion ("mat" is missing)
|
62 |
-
|
63 |
-
This gives 2 errors in total. To get our word error rate, we divide the total number of errors (substitutions + insertions + deletions) by the total number of words in our
|
64 |
-
reference (N), which for this example is 6:
|
65 |
-
|
66 |
-
```
|
67 |
-
WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
|
68 |
-
```
|
69 |
-
|
70 |
-
This gives a WER of 0.33, or 33%. For a fair comparison, we calculate **zero-shot** (i.e. pre-trained models only) *normalised WER* for all the model checkpoints, meaning punctuation and casing are removed from the references and predictions. You can find the evaluation code on our [GitHub repository](https://github.com/huggingface/open_asr_leaderboard). To read more about how the WER is computed, refer to the [Audio Transformers Course](https://huggingface.co/learn/audio-course/chapter5/evaluation).
|
71 |
-
|
72 |
-
### Inverse Real Time Factor (RTFx)
|
73 |
-
|
74 |
-
Inverse Real Time Factor is a measure of the **latency** of automatic speech recognition systems, i.e. how long it takes a
|
75 |
-
model to process a given amount of speech. It is defined as:
|
76 |
-
```
|
77 |
-
RTFx = (number of seconds of audio inferred) / (compute time in seconds)
|
78 |
-
```
|
79 |
-
|
80 |
-
Therefore, an RTFx of 1 means a system processes speech as fast as it's spoken, while an RTFx of 2 means it takes half the time.
|
81 |
-
Thus, **a higher RTFx value indicates lower latency**.
|
82 |
-
|
83 |
-
## How to reproduce our results
|
84 |
-
|
85 |
-
The ASR Leaderboard will be a continued effort to benchmark open source/access speech recognition models where possible.
|
86 |
-
Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.
|
87 |
-
For more details head over to our repo at: https://github.com/huggingface/open_asr_leaderboard
|
88 |
-
|
89 |
-
P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
|
90 |
-
|
91 |
-
## Benchmark datasets
|
92 |
-
|
93 |
-
Evaluating Speech Recognition systems is a hard problem. We use the multi-dataset benchmarking strategy proposed in the
|
94 |
-
[ESB paper](https://arxiv.org/abs/2210.13352) to obtain robust evaluation scores for each model.
|
95 |
-
|
96 |
-
ESB is a benchmark for evaluating the performance of a single automatic speech recognition (ASR) system across a broad
|
97 |
-
set of speech datasets. It comprises eight English speech recognition datasets, capturing a broad range of domains,
|
98 |
-
acoustic conditions, speaker styles, and transcription requirements. As such, it gives a better indication of how
|
99 |
-
a model is likely to perform on downstream ASR compared to evaluating it on one dataset alone.
|
100 |
-
|
101 |
-
The ESB score is calculated as a macro-average of the WER scores across the ESB datasets. The models in the leaderboard
|
102 |
-
are ranked based on their average WER scores, from lowest to highest.
|
103 |
-
|
104 |
-
| Dataset | Domain | Speaking Style | Train (h) | Dev (h) | Test (h) | Transcriptions | License |
|
105 |
-
|-----------------------------------------------------------------------------------------|-----------------------------|-----------------------|-----------|---------|----------|--------------------|-----------------|
|
106 |
-
| [LibriSpeech](https://huggingface.co/datasets/librispeech_asr) | Audiobook | Narrated | 960 | 11 | 11 | Normalised | CC-BY-4.0 |
|
107 |
-
| [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) | European Parliament | Oratory | 523 | 5 | 5 | Punctuated | CC0 |
|
108 |
-
| [TED-LIUM](https://huggingface.co/datasets/LIUM/tedlium) | TED talks | Oratory | 454 | 2 | 3 | Normalised | CC-BY-NC-ND 3.0 |
|
109 |
-
| [GigaSpeech](https://huggingface.co/datasets/speechcolab/gigaspeech) | Audiobook, podcast, YouTube | Narrated, spontaneous | 2500 | 12 | 40 | Punctuated | apache-2.0 |
|
110 |
-
| [SPGISpeech](https://huggingface.co/datasets/kensho/spgispeech) | Financial meetings | Oratory, spontaneous | 4900 | 100 | 100 | Punctuated & Cased | User Agreement |
|
111 |
-
| [Earnings-22](https://huggingface.co/datasets/revdotcom/earnings22) | Financial meetings | Oratory, spontaneous | 105 | 5 | 5 | Punctuated & Cased | CC-BY-SA-4.0 |
|
112 |
-
| [AMI](https://huggingface.co/datasets/edinburghcstr/ami) | Meetings | Spontaneous | 78 | 9 | 9 | Punctuated & Cased | CC-BY-4.0 |
|
113 |
-
|
114 |
-
For more details on the individual datasets and how models are evaluated to give the ESB score, refer to the [ESB paper](https://arxiv.org/abs/2210.13352).
|
115 |
-
"""
|
116 |
-
|
117 |
-
LEADERBOARD_CSS = """
|
118 |
-
#leaderboard-table th .header-content {
|
119 |
-
white-space: nowrap;
|
120 |
-
}
|
121 |
-
#whisper-backends-tab th .header-content {
|
122 |
-
white-space: nowrap;
|
123 |
-
}
|
124 |
-
"""
|
|
|
1 |
+
from dataclasses import dataclass
|
2 |
+
|
3 |
+
# These classes are for user facing column names, to avoid having to change them
|
4 |
+
# all around the code when a modification is needed
|
5 |
+
@dataclass
|
6 |
+
class ColumnContent:
|
7 |
+
name: str
|
8 |
+
type: str
|
9 |
+
|
10 |
+
def fields(raw_class):
|
11 |
+
return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
|
12 |
+
|
13 |
+
@dataclass(frozen=True)
|
14 |
+
class AutoEvalColumn: # Auto evals column
|
15 |
+
model = ColumnContent("Model", "markdown")
|
16 |
+
avg_wer = ColumnContent("Average WER ⬇️", "number")
|
17 |
+
rtf = ColumnContent("RTFx ⬆️️", "number")
|
18 |
+
ami_wer = ColumnContent("AMI", "number")
|
19 |
+
e22_wer = ColumnContent("Earnings22", "number")
|
20 |
+
gs_wer = ColumnContent("Gigaspeech", "number")
|
21 |
+
lsc_wer = ColumnContent("LS Clean", "number")
|
22 |
+
lso_wer = ColumnContent("LS Other", "number")
|
23 |
+
ss_wer = ColumnContent("SPGISpeech", "number")
|
24 |
+
tl_wer = ColumnContent("Tedlium", "number")
|
25 |
+
vp_wer = ColumnContent("Voxpopuli", "number")
|
26 |
+
|
27 |
+
|
28 |
+
def make_clickable_model(model_name):
|
29 |
+
model_name_list = model_name.split("/")
|
30 |
+
if model_name_list[0] == "trt-llm":
|
31 |
+
link = "https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/whisper"
|
32 |
+
elif model_name_list[0] == "faster-whisper":
|
33 |
+
link = "https://github.com/guillaumekln/faster-whisper"
|
34 |
+
elif model_name_list[0] == "Whisper.cpp":
|
35 |
+
link = "https://github.com/ggerganov/whisper.cpp"
|
36 |
+
elif model_name_list[0] == "WhisperKit":
|
37 |
+
link = "https://github.com/argmaxinc/WhisperKit"
|
38 |
+
elif model_name_list[0] == "WhisperMLX":
|
39 |
+
link = "https://huggingface.co/collections/mlx-community/whisper-663256f9964fbb1177db93dc"
|
40 |
+
else:
|
41 |
+
link = f"https://huggingface.co/{model_name}"
|
42 |
+
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
43 |
+
|
44 |
+
def styled_error(error):
|
45 |
+
return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
|
46 |
+
|
47 |
+
def styled_warning(warn):
|
48 |
+
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
|
49 |
+
|
50 |
+
def styled_message(message):
|
51 |
+
return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|