"""Streamlit leaderboard app for the ESC (End-to-end Speech Challenge) benchmark.

Clones the reference and submission repos from the Hugging Face Hub, scores
uploaded submissions with jiwer, and displays the ranked results table.
"""
import os
import zipfile

import jiwer
import pandas as pd
import streamlit as st
from huggingface_hub import Repository

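# Names of the Hub datasets holding the reference transcripts and the
# submissions; they double as the local clone directories.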
REFERENCE_NAME = "references"
SUBMISSION_NAME = "submissions"

# Plain string formatting is used rather than os.path.join, which is meant for
# filesystem paths and would produce backslashes on Windows.
REFERENCE_URL = f"https://huggingface.co/datasets/esc-bench/{REFERENCE_NAME}"
SUBMISSION_URL = f"https://huggingface.co/datasets/esc-bench/{SUBMISSION_NAME}"

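# The mandatory ESC test sets: the ESC score is the average WER over these.
# The optional sets may be reported but do not count towards the score.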
TEST_SETS = [
    "librispeech-clean",
    "librispeech-other",
    "common-voice-9",
    "vox-populi",
    "ted-lium",
    "giga-speech",
    "spgi-speech",
    "earnings-22",
    "ami",
]
EXPECTED_TEST_FILES = [f + ".txt" for f in TEST_SETS]
OPTIONAL_TEST_SETS = ["switch-board", "call-home", "chime-4"]

CSV_RESULTS_FILE = os.path.join(SUBMISSION_NAME, "results.csv")

# Hub token with write access to the submission repo, read from the environment.
HF_TOKEN = os.environ.get("HF_TOKEN")

def compute_wer(pred_file, ref_file):
    """Compute the word error rate (WER) of a prediction file against a reference file.

    Both files are expected to contain one utterance transcript per line, with
    predictions and references aligned line by line.
    """
    with open(pred_file, "r", encoding="utf-8") as pred, open(
        ref_file, "r", encoding="utf-8"
    ) as ref:
        pred_lines = [line.strip() for line in pred]
        ref_lines = [line.strip() for line in ref]

    # jiwer computes a corpus-level WER over the paired lists of sentences.
    wer = jiwer.wer(ref_lines, pred_lines)
    return wer

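# Example usage (hypothetical system name), matching the paths built below:
#   compute_wer("submissions/my-system/ami.txt", "references/ami.txt")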

# Clone the reference transcripts and the submissions dataset from the Hub.
# The submissions clone is pulled so the app sees any recently pushed entries.
reference_repo = Repository(
    local_dir=REFERENCE_NAME, clone_from=REFERENCE_URL, use_auth_token=HF_TOKEN
)
submission_repo = Repository(
    local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN
)
submission_repo.git_pull()

# Each submitted system lives in its own folder within the submissions repo.
all_submissions = [
    folder
    for folder in os.listdir(SUBMISSION_NAME)
    if os.path.isdir(os.path.join(SUBMISSION_NAME, folder)) and folder != ".git"
]

# Abbreviated column headers for the leaderboard display.
COLUMN_NAMES = {
    "librispeech-clean": "ls-clean",
    "librispeech-other": "ls-other",
    "common-voice-9": "cv9",
    "vox-populi": "vox",
    "ted-lium": "ted",
    "giga-speech": "giga",
    "spgi-speech": "spgi",
    "earnings-22": "e22",
    "ami": "ami",
    "chime-4": "chime",
    "switch-board": "swbd",
}

all_results = pd.read_csv(CSV_RESULTS_FILE)
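
# Build the display table: move the ESC score to the front, drop any stray
# string columns other than the system name, and round the WERs for display.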
table = all_results.copy()

esc_column = table.pop("esc-score")
name_column = table.pop("name")
table.insert(0, "esc-score", esc_column)

table = table.select_dtypes(exclude=["object", "string"])
table.insert(0, "name", name_column)
table = table.round(2)
table = table.rename(columns=COLUMN_NAMES)

# Rank systems by ESC score (lower WER is better) and number rows from 1.
table = table.sort_values(by="esc-score", ascending=True, ignore_index=True)
table.index = table.index + 1


st.markdown("# ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition")

st.markdown(
    f"""
This is the leaderboard of the End-to-end Speech Challenge (ESC).
Submitted systems are ranked by the **ESC Score**, the average WER over
all non-optional test sets: {', '.join(COLUMN_NAMES[t] for t in TEST_SETS)}."""
)

# Render the leaderboard (explicit call rather than Streamlit's bare-expression "magic").
st.dataframe(table)

st.markdown(
    """
ESC was proposed in *ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition* by ...
\n
The abstract of the paper is as follows:
\n
*Speech recognition applications cover a range of different audio and text distributions, with different speaking styles, background noise, transcription punctuation and character casing. However, many speech recognition systems require dataset-specific tuning (audio filtering, punctuation removal and normalisation of casing), therefore assuming a-priori knowledge of both the audio and text distributions. This tuning requirement can lead to systems failing to generalise to other datasets and domains. To promote the development of multi-domain speech systems, we introduce the End-to-end Speech Challenge (ESC) for evaluating the performance of a single automatic speech recognition (ASR) system across a broad set of speech datasets. Benchmarked systems must use the same data pre- and post-processing algorithm across datasets - assuming the audio and text data distributions are a-priori unknown. We compare a series of state-of-the-art (SoTA) end-to-end (E2E) systems on this benchmark, demonstrating how a single speech system can be applied and evaluated on a wide range of data distributions. We find E2E systems to be effective across datasets: in a fair comparison, E2E systems achieve within 2.6% of SoTA systems tuned to a specific dataset. Our analysis reveals that transcription artefacts, such as punctuation and casing, pose difficulties for ASR systems and should be included in evaluation. We believe E2E benchmarking over a range of datasets promotes the research of multi-domain speech recognition systems.*
\n
For more information, please see the official submission on [OpenReview.net](https://openreview.net/forum?id=9OL2fIfDLK).
"""
)

st.markdown("To submit to ESC, please follow the instructions below:")

st.markdown("TODO: Add instructions ...")

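# A submission zip is expected to unpack into a single folder named after the
# zip's basename, holding one transcript file per mandatory test set, e.g.
# (hypothetical name):
#   my-system.zip -> my-system/librispeech-clean.txt, ..., my-system/ami.txt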
with st.form(key="my_form"):
    uploaded_file = st.file_uploader("Choose a zip file")
    submit_button = st.form_submit_button(label="Submit")

if submit_button:
    if uploaded_file is None:
        raise ValueError("Please make sure to have uploaded a zip file.")

    # The submission is named after the zip file: "my-system.zip" -> "my-system".
    submission = uploaded_file.name.split(".zip")[0]
    with st.spinner(f"Uploading {submission}..."):
        # Extract the predictions into the local clone and push them to the Hub.
        with zipfile.ZipFile(uploaded_file, "r") as zip_ref:
            zip_ref.extractall(submission_repo.local_dir)
        submission_repo.push_to_hub()

    with st.spinner(f"Computing ESC Score for {submission}..."):
        results = {"name": submission}
        submitted_files = os.listdir(os.path.join(SUBMISSION_NAME, submission))

        # Keep only recognised test-set files and check that none are missing.
        submitted_files = [f for f in submitted_files if f in EXPECTED_TEST_FILES]
        if sorted(EXPECTED_TEST_FILES) != sorted(submitted_files):
            raise ValueError(
                f"{', '.join(submitted_files)} were submitted, but expected {', '.join(EXPECTED_TEST_FILES)}"
            )

        # Score each test set against its reference transcript.
        for file in submitted_files:
            ref_file = os.path.join(REFERENCE_NAME, file)
            pred_file = os.path.join(SUBMISSION_NAME, submission, file)

            wer = compute_wer(pred_file, ref_file)
            results[file.split(".")[0]] = str(wer)

        # The ESC score is the unweighted average WER over the mandatory test sets.
        wer_values = [float(results[t]) for t in TEST_SETS]
        all_wer = sum(wer_values) / len(wer_values)
        results["esc-score"] = all_wer

        # DataFrame.append was removed in pandas 2.0; concatenate the new row instead.
        all_results = pd.concat([all_results, pd.DataFrame([results])], ignore_index=True)

    # Persist the updated results; index=False avoids accumulating unnamed index
    # columns every time the CSV is rewritten and re-read.
    all_results.to_csv(CSV_RESULTS_FILE, index=False)
    commit_url = submission_repo.push_to_hub()

    st.success("Please refresh this space (CTRL+R) to see your result")