patrickvonplaten committed on
Commit
45e6546
·
1 Parent(s): a347f8f
Files changed (1) hide show
  1. app.py +10 -13
app.py CHANGED
@@ -1,14 +1,7 @@
1
- import requests
2
- import json
3
- import pandas as pd
4
- from tqdm.auto import tqdm
5
  import streamlit as st
6
  from pandas import read_csv
7
  import os
8
- from huggingface_hub import HfApi, hf_hub_download
9
- from huggingface_hub.repocard import metadata_load
10
  import jiwer
11
- import datetime
12
  from huggingface_hub import Repository
13
 
14
  REFERENCE_NAME = "references"
@@ -118,7 +111,11 @@ COLUMN_NAMES = {
118
 
119
  table = all_results
120
 
121
- tabel = table.select_dtypes(exclude=['object', 'string']) * 100
 
 
 
 
122
  table = table.round(2)
123
  table = table.rename(columns=COLUMN_NAMES)
124
 
@@ -128,20 +125,20 @@ st.markdown("# ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition")
128
  st.markdown(
129
  f"""
130
  This is the leaderboard of the End-to end Speech Challenge (ESC).
131
- Submitted systems are ranked by the **ESC Score** which is the average of
132
  all non-optional datasets: {', '.join(COLUMN_NAMES.values())}."""
133
  )
134
 
135
- #st.table(table)
136
  table
137
 
138
  # *Sanchit Gandhi, Patrick Von Platen, and, Alexander M Rush*
139
  st.markdown(
140
  """
141
  ESC was proposed in *ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition* by ...
142
- \n
143
- The abstract of the paper is as follows:
144
- \n
145
  *Speech recognition applications cover a range of different audio and text distributions, with different speaking styles, background noise, transcription punctuation and character casing. However, many speech recognition systems require dataset-specific tuning (audio filtering, punctuation removal and normalisation of casing), therefore assuming a-priori knowledge of both the audio and text distributions. This tuning requirement can lead to systems failing to generalise to other datasets and domains. To promote the development of multi-domain speech systems, we introduce the End-to end Speech Challenge (ESC) for evaluating the performance of a single automatic speech recognition (ASR) system across a broad set of speech datasets. Benchmarked systems must use the same data pre- and post-processing algorithm across datasets - assuming the audio and text data distributions are a-priori unknown. We compare a series of state-of-the-art (SoTA) end-to-end (E2E) systems on this benchmark, demonstrating how a single speechsystem can be applied and evaluated on a wide range of data distributions. We find E2E systems to be effective across datasets: in a fair comparison, E2E systems achieve within 2.6% of SoTA systems tuned to a specific dataset. Our analysis reveals that transcription artefacts, such as punctuation and casing, pose difficulties for ASR systems and should be included in evaluation. We believe E2E benchmarking over a range of datasets promotes the research of multi-domain speech recognition systems.*
146
  \n
147
  For more information, please see the official submission on [OpenReview.net](https://openreview.net/forum?id=9OL2fIfDLK).
 
 
 
 
 
1
  import streamlit as st
2
  from pandas import read_csv
3
  import os
 
 
4
  import jiwer
 
5
  from huggingface_hub import Repository
6
 
7
  REFERENCE_NAME = "references"
 
111
 
112
  table = all_results
113
 
114
+ esc_column = table.pop("esc-score")
115
+ name_column = table.pop("name")
116
+ table.insert(0, "esc-score", esc_column)
117
+ table = table.select_dtypes(exclude=['object', 'string']) * 100
118
+ table.insert(0, "name", name_column)
119
  table = table.round(2)
120
  table = table.rename(columns=COLUMN_NAMES)
121
 
 
125
  st.markdown(
126
  f"""
127
  This is the leaderboard of the End-to end Speech Challenge (ESC).
128
+ Submitted systems are ranked by the **ESC Score** which is the average of
129
  all non-optional datasets: {', '.join(COLUMN_NAMES.values())}."""
130
  )
131
 
132
+ # st.table(table)
133
  table
134
 
135
  # *Sanchit Gandhi, Patrick Von Platen, and, Alexander M Rush*
136
  st.markdown(
137
  """
138
  ESC was proposed in *ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition* by ...
139
+ \n
140
+ The abstract of the paper is as follows:
141
+ \n
142
  *Speech recognition applications cover a range of different audio and text distributions, with different speaking styles, background noise, transcription punctuation and character casing. However, many speech recognition systems require dataset-specific tuning (audio filtering, punctuation removal and normalisation of casing), therefore assuming a-priori knowledge of both the audio and text distributions. This tuning requirement can lead to systems failing to generalise to other datasets and domains. To promote the development of multi-domain speech systems, we introduce the End-to end Speech Challenge (ESC) for evaluating the performance of a single automatic speech recognition (ASR) system across a broad set of speech datasets. Benchmarked systems must use the same data pre- and post-processing algorithm across datasets - assuming the audio and text data distributions are a-priori unknown. We compare a series of state-of-the-art (SoTA) end-to-end (E2E) systems on this benchmark, demonstrating how a single speechsystem can be applied and evaluated on a wide range of data distributions. We find E2E systems to be effective across datasets: in a fair comparison, E2E systems achieve within 2.6% of SoTA systems tuned to a specific dataset. Our analysis reveals that transcription artefacts, such as punctuation and casing, pose difficulties for ASR systems and should be included in evaluation. We believe E2E benchmarking over a range of datasets promotes the research of multi-domain speech recognition systems.*
143
  \n
144
  For more information, please see the official submission on [OpenReview.net](https://openreview.net/forum?id=9OL2fIfDLK).