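"""Streamlit leaderboard app for the ESC benchmark.

The app clones the reference and submission dataset repos from the Hugging
Face Hub, scores any submissions that have not yet been evaluated (word error
rate per test set, averaged into an ESC score), pushes the updated results
back to the Hub, and renders the leaderboard table.
"""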
import os

import jiwer
import pandas as pd
import streamlit as st
from huggingface_hub import Repository
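
# Hub dataset repos: "references" holds the ground-truth transcriptions,
# "submissions" holds the user predictions plus the aggregated results.csv.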
REFERENCE_NAME = "references"
SUBMISSION_NAME = "submissions"
REFERENCE_URL = os.path.join("https://huggingface.co/datasets/esc-bench", REFERENCE_NAME)
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/esc-bench", SUBMISSION_NAME)
TEST_SETS = [
    "librispeech-clean",
    "librispeech-other",
    "common-voice-9",
    "vox-populi",
    "ted-lium",
    "giga-speech",
    "spgi-speech",
    "earnings-22",
    "ami",
]
EXPECTED_TEST_FILES = [f + ".txt" for f in TEST_SETS]
OPTIONAL_TEST_SETS = ["switch-board", "call-home", "chime-4"]
CSV_RESULTS_FILE = os.path.join(SUBMISSION_NAME, "results.csv")
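
# Hub auth token, read from the environment; used when cloning the repos and
# required for pushing updated results back to the submissions repo.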
HF_TOKEN = os.environ.get("HF_TOKEN")


def compute_wer(pred_file, ref_file):
    """Compute the word error rate (WER) between a prediction file and a
    reference file. Both files are expected to contain one transcription per
    line, aligned by line number."""
    with open(pred_file, "r", encoding="utf-8") as pred, open(ref_file, "r", encoding="utf-8") as ref:
        pred_lines = [line.strip() for line in pred.readlines()]
        ref_lines = [line.strip() for line in ref.readlines()]
    wer = jiwer.wer(ref_lines, pred_lines)
    return wer
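
# Sanity check for the metric (illustrative): one substitution in a two-word
# reference gives jiwer.wer(["hello world"], ["hello word"]) == 0.5.

# Clone the reference and submission repos locally. HF_TOKEN must grant write
# access to the submissions repo so updated results can be pushed back.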
reference_repo = Repository(local_dir="references", clone_from=REFERENCE_URL, use_auth_token=HF_TOKEN)
submission_repo = Repository(local_dir="submissions", clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
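
# A submission is a sub-folder of the submissions repo containing one
# predictions file per test set; anything already listed in results.csv
# has been evaluated and is skipped.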
all_submissions = [
    folder
    for folder in os.listdir(SUBMISSION_NAME)
    if os.path.isdir(os.path.join(SUBMISSION_NAME, folder)) and folder != ".git"
]
all_results = pd.read_csv(CSV_RESULTS_FILE)
evaluated_submissions = all_results["name"].values.tolist()
non_evaluated_submissions = set(all_submissions) - set(evaluated_submissions)
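
# Score each new submission: validate that exactly the expected files were
# submitted, compute per-dataset WER, and average the non-optional test sets
# into the ESC score.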
if len(non_evaluated_submissions) > 0:
    for submission in non_evaluated_submissions:
        print(f"Evaluate {submission}")
        results = {"name": submission}
        submitted_files = os.listdir(os.path.join(SUBMISSION_NAME, submission))
        submitted_files = [f for f in submitted_files if f in EXPECTED_TEST_FILES]
        if sorted(EXPECTED_TEST_FILES) != sorted(submitted_files):
            raise ValueError(
                f"{', '.join(submitted_files)} were submitted, but expected {', '.join(EXPECTED_TEST_FILES)}"
            )
        for file in submitted_files:
            ref_file = os.path.join(REFERENCE_NAME, file)
            pred_file = os.path.join(SUBMISSION_NAME, submission, file)
            wer = compute_wer(pred_file, ref_file)
            results[file.split(".")[0]] = wer
        wer_values = [results[t] for t in TEST_SETS]
        all_wer = sum(wer_values) / len(wer_values)
        results["esc-score"] = all_wer
        # DataFrame.append was removed in pandas 2.0; use concat instead
        all_results = pd.concat([all_results, pd.DataFrame([results])], ignore_index=True)
    # Save the updated results and push them to the submissions repo on the Hub
    all_results.to_csv(CSV_RESULTS_FILE, index=False)
    commit_url = submission_repo.push_to_hub()
    print(commit_url)
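
# Short column names used for display in the leaderboard table.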
COLUMN_NAMES = {
    "librispeech-clean": "ls-clean",
    "librispeech-other": "ls-other",
    "common-voice-9": "cv9",
    "vox-populi": "vox",
    "ted-lium": "ted",
    "giga-speech": "giga",
    "spgi-speech": "spgi",
    "earnings-22": "e22",
    "ami": "ami",
    "chime-4": "chime",
    "switch-board": "swbd",
}
table = all_results.round(4)
table = table.rename(columns=COLUMN_NAMES)
# Streamlit leaderboard UI
st.markdown("# ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition")
st.markdown(
f"""
This is the leaderboard of the End-to end Speech Challenge (ESC).
Submitted systems are ranked by the **ESC Score** which is the average of
all non-optional datasets: {', '.join(COLUMN_NAMES.values())}."""
)
st.table(table)
st.markdown(
    """
ESC was proposed in *ESC: A Benchmark For Multi-Domain End-to-End Speech Recognition*
by *Sanchit Gandhi, Patrick von Platen, and Alexander M. Rush*.
\n
The abstract of the paper is as follows:
\n
*Speech recognition applications cover a range of different audio and text distributions, with different speaking styles, background noise, transcription punctuation and character casing. However, many speech recognition systems require dataset-specific tuning (audio filtering, punctuation removal and normalisation of casing), therefore assuming a-priori knowledge of both the audio and text distributions. This tuning requirement can lead to systems failing to generalise to other datasets and domains. To promote the development of multi-domain speech systems, we introduce the End-to-end Speech Challenge (ESC) for evaluating the performance of a single automatic speech recognition (ASR) system across a broad set of speech datasets. Benchmarked systems must use the same data pre- and post-processing algorithm across datasets - assuming the audio and text data distributions are a-priori unknown. We compare a series of state-of-the-art (SoTA) end-to-end (E2E) systems on this benchmark, demonstrating how a single speech system can be applied and evaluated on a wide range of data distributions. We find E2E systems to be effective across datasets: in a fair comparison, E2E systems achieve within 2.6% of SoTA systems tuned to a specific dataset. Our analysis reveals that transcription artefacts, such as punctuation and casing, pose difficulties for ASR systems and should be included in evaluation. We believe E2E benchmarking over a range of datasets promotes the research of multi-domain speech recognition systems.*
\n
For more information, please see the official submission on [OpenReview.net](https://openreview.net/forum?id=9OL2fIfDLK).
"""
)
st.markdown("To submit to ESC, please click on the instructions below ↓")
st.markdown("TODO: Add instructions ...")
uploaded_file = st.file_uploader("Choose a file")
if st.button("Submit"):
    st.write("Computing scores ...")