rjzevallos committed
Commit 79ddbe0 · verified · 1 Parent(s): d979581

Update app.py

Files changed (1)
  1. app.py +10 -11
app.py CHANGED
@@ -6,11 +6,11 @@ LAST_UPDATED = "06/12/2024"
 # Static leaderboard data
 ####################################
 leaderboard_data = [
-    {'name': 'StyleTTS 2', 'STOI': 0.998, 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 2.42, 'SpeechBERT': 0, 'Logf0': 0},
-    {'name': 'Matxa-TTS', 'STOI': 0.996, 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50, 'SpeechBERT': 0, 'Logf0': 0},
-    {'name': 'Matxa-TTS-multiaccent', 'STOI': 0.996, 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98, 'SpeechBERT': 0, 'Logf0': 0},
-    {'name': 'StableTTS', 'STOI': 0.997, 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62, 'SpeechBERT': 0.7837, 'Logf0': 0.3831},
-    {'name': 'Vits 2', 'STOI': 0, 'PESQ': 0, 'WER': 0, 'UTMOS': 3.61, 'SpeechBERT': 0, 'Logf0': 0},
+    {'name': 'StyleTTS 2', 'PESQ': 3.921, 'WER': 0.162, 'UTMOS': 2.42, 'SpeechBERT': 0, 'Logf0': 0},
+    {'name': 'Matxa-TTS', 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50, 'SpeechBERT': 0, 'Logf0': 0},
+    {'name': 'Matxa-TTS-multiaccent', 'PESQ': 3.415, 'WER': 0.242, 'UTMOS': 2.98, 'SpeechBERT': 0, 'Logf0': 0},
+    {'name': 'StableTTS', 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62, 'SpeechBERT': 0.7837, 'Logf0': 0.3831},
+    {'name': 'Vits 2', 'PESQ': 0, 'WER': 0, 'UTMOS': 3.61, 'SpeechBERT': 0, 'Logf0': 0},
 ]
 
 
@@ -20,7 +20,6 @@ METRICS_TAB_TEXT = """
 Models in the leaderboard are evaluated using several key metrics:
 * **UTMOS** (UTokyo-SaruLab Mean Opinion Score),
 * **WER** (Word Error Rate),
-* **STOI** (Short-Time Objective Intelligibility),
 * **PESQ** (Perceptual Evaluation of Speech Quality).
 These metrics help evaluate both the accuracy and quality of the model.
 ### UTMOS (UTokyo-SaruLab Mean Opinion Score)[[Paper](https://arxiv.org/abs/2204.02152)]
@@ -38,8 +37,8 @@ The WER calculation is done as follows:
 ```
 WER = (S + I + D) / N = (1 + 0 + 1) / 6 = 0.333
 ```
-### STOI (Short-Time Objective Intelligibility)[[Paper](https://ieeexplore.ieee.org/abstract/document/5495701?casa_token=PLtqLc8KNAgAAAAA:FOLuZ4dgMYsnGb1dQHgqVOouQzRJ3vA5yqj-sbwf8gs9Q-AIDCLkMZzAgzRrAogwwxULK9zsYeE)]
-STOI measures the intelligibility of the synthesized speech signal compared to the original signal. **A higher STOI indicates better intelligibility**.
+Moreover, we calculate the WER using the STT_Ca_Citrinet_512 model. [[Link](https://langtech-bsc.gitbook.io/aina-kit/aina-hack/automatic-speech-recognition)]
+
 ### PESQ (Perceptual Evaluation of Speech Quality)[[Paper](https://ieeexplore.ieee.org/abstract/document/941023?casa_token=jdtHy84_KhQAAAAA:qHN3WbT6cNdufj6OOn_fn0Je0RedMv-WJCmhQ_3CWy4nMTuDvFMF3KstAmKqLx5suQwdPgGByoY)]
 PESQ is a perceptual metric that evaluates the quality of speech in a similar manner to how a human listener would. **A higher PESQ indicates better voice quality**.
 ## Benchmark Datasets
@@ -76,7 +75,7 @@ def get_leaderboard():
     for rank, model in enumerate(sorted_leaderboard):
         model['rank'] = rank + 1  # rank is the position in the list (1-indexed)
 
-    return [[model['rank'], model['name'], model['UTMOS'], model['WER'], model['STOI'], model['PESQ'], model['SpeechBERT'], model['Logf0']] for model in sorted_leaderboard]
+    return [[model['rank'], model['name'], model['UTMOS'], model['WER'], model['PESQ'], model['SpeechBERT'], model['Logf0']] for model in sorted_leaderboard]
 
 ####################################
 # Gradio interface
@@ -92,8 +91,8 @@ with gr.Blocks(theme=theme) as demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
             leaderboard_table = gr.DataFrame(
-                headers=["Rank", "Model", "UTMOS ⬆️", "WER ⬇️", "STOI", "PESQ", "SpeechBERT ⬆️", "Logf0 ⬆️"],
-                datatype=["str", "str", "str", "str", "str", "str", "str", "str"],
+                headers=["Rank", "Model", "UTMOS ⬆️", "WER ⬇️", "PESQ", "SpeechBERT ⬆️", "Logf0 ⬆️"],
+                datatype=["str", "str", "str", "str", "str", "str", "str"],
                 value=get_leaderboard()  # Loads the initial table data
             )
 
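For readers who want to reproduce the WER arithmetic quoted in the metrics text, here is a minimal, self-contained sketch of word-level WER via edit distance. It is illustrative only: per the note added in this commit, the leaderboard's hypotheses come from STT_Ca_Citrinet_512 transcriptions, and the example sentences below are invented to produce the same counts (S = 1, I = 0, D = 1, N = 6).

```python
# Illustrative word-level WER from the Levenshtein edit distance.
# WER = (S + I + D) / N, with N the number of words in the reference.

def wer(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    # dp[i][j]: minimum edits turning the first i reference words
    # into the first j hypothesis words.
    dp = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(1, len(ref) + 1):
        dp[i][0] = i  # i deletions
    for j in range(1, len(hyp) + 1):
        dp[0][j] = j  # j insertions
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            sub = dp[i - 1][j - 1] + (ref[i - 1] != hyp[j - 1])
            dp[i][j] = min(sub, dp[i - 1][j] + 1, dp[i][j - 1] + 1)
    return dp[-1][-1] / len(ref)

# Invented 6-word example: one substitution ("fox" -> "dog") and one
# deletion ("today") give (1 + 0 + 1) / 6 = 0.333, as in the text above.
print(round(wer("the quick brown fox jumps today",
                "the quick brown dog jumps"), 3))  # 0.333
```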
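Similarly, the metrics text describes PESQ without showing how it is computed. A common way to score a synthesized clip against its reference is the open-source `pesq` package; the sketch below is a hedged illustration under that assumption, not the leaderboard's actual pipeline (which this diff does not show). The file names are placeholders, and PESQ only accepts 8 kHz or 16 kHz input.

```python
# Hedged sketch using the PyPI `pesq` package; assumes 16 kHz mono WAVs.
import soundfile as sf
from pesq import pesq

ref, sr = sf.read("reference.wav")   # clean reference recording (placeholder path)
deg, _ = sf.read("synthesized.wav")  # TTS output to score (placeholder path)

score = pesq(sr, ref, deg, "wb")     # "wb" = wide-band mode for 16 kHz audio
print(f"PESQ: {score:.3f}")          # higher is better
```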
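Finally, a note on the mechanics of the app.py change: gr.DataFrame pairs headers, datatype, and each returned row positionally, so dropping the STOI column from the return list also requires dropping one entry from both lists, exactly as the last two hunks do. The sketch below reconstructs the surrounding function; the sort key is an assumption, since the hunk only shows sorted_leaderboard.

```python
# Two rows from the updated leaderboard_data, enough to run the function.
leaderboard_data = [
    {'name': 'Matxa-TTS', 'PESQ': 3.539, 'WER': 0.179, 'UTMOS': 3.50,
     'SpeechBERT': 0, 'Logf0': 0},
    {'name': 'StableTTS', 'PESQ': 3.643, 'WER': 0.164, 'UTMOS': 2.62,
     'SpeechBERT': 0.7837, 'Logf0': 0.3831},
]

def get_leaderboard():
    # ASSUMPTION: the diff never shows the sort key; UTMOS descending
    # (the column marked ⬆️ in the headers) is a plausible guess.
    sorted_leaderboard = sorted(leaderboard_data,
                                key=lambda m: m['UTMOS'], reverse=True)
    for rank, model in enumerate(sorted_leaderboard):
        model['rank'] = rank + 1  # 1-indexed rank
    # Seven values per row, matching the seven-entry headers/datatype
    # lists in the edited gr.DataFrame call.
    return [[m['rank'], m['name'], m['UTMOS'], m['WER'], m['PESQ'],
             m['SpeechBERT'], m['Logf0']] for m in sorted_leaderboard]

print(get_leaderboard())
# [[1, 'Matxa-TTS', 3.5, 0.179, 3.539, 0, 0],
#  [2, 'StableTTS', 2.62, 0.164, 3.643, 0.7837, 0.3831]]
```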