j-tobias committed
Commit 752ce9b
1 Parent(s): bb8566e

added backend

Files changed (8)
  1. .gitignore → .codetogether.ignore +0 -0
  2. app.py +53 -32
  3. cards.txt +5 -0
  4. dataset.py +76 -19
  5. eval.py +22 -0
  6. model.py +104 -0
  7. test.v01.py +25 -0
  8. utils.py +23 -0
.gitignore → .codetogether.ignore RENAMED
File without changes
app.py CHANGED
@@ -1,20 +1,33 @@
  import gradio as gr # needs to be installed
  
  from dataset import Dataset
+ from model import Model
  
+ from utils import hf_login, compute_wer
  
+ hf_login()
  dataset = Dataset()
+ models = Model()
  
  
- def eval(data_subset:str, model_1:str, model_2:str, model_3:str)->str:
-
-     result = f"""# HELLO THERE
-     Data Subset: {data_subset}
-     Model 1: {model_1}
-     Model 2: {model_2}
-     Model_3: {model_3}
-     """
-     return result
+ def run_tests (dataset_choice:str, model:str):
+
+     MoDeL = Model()
+     MoDeL.select(model)
+     MoDeL.load()
+     DaTaSeT = Dataset(100)
+     DaTaSeT.load(dataset_choice)
+     references, predictions = MoDeL.process(DaTaSeT)
+     wer = compute_wer(references=references, predictions=predictions)
+     return wer
+
+
+ def eval(data_subset:str, model_1:str, model_2:str)->str:
+
+     wer_result_1 = run_tests(data_subset, model_1)
+     wer_result_2 = run_tests(data_subset, model_2)
+
+     return f"WER Score: {wer_result_1}", f"WER Score: {wer_result_2}"
  
  def get_card(selected_model:str)->str:
  
@@ -38,20 +51,18 @@ def is_own(data_subset:str):
  
  with gr.Blocks() as demo:
  
-     gr.Markdown("# VocalVenturer 💬")
+
+     gr.Markdown('# <p style="text-align: center;">VocalVenturer 💬</p>')
      gr.Markdown("-------")
-     gr.Markdown("""
-     Hello there, this is the VocalVenturer, this app is aimed at helping you making more informed model choices for ASR.
-     Please choose a Data Subset to evalutate the Models on. You also have the opportunity to record and transcibe an own sample.
-     The Models will be evaluated using the *WER* metric -> here is an amazing Leaderboard for it LINK""")
+     gr.Markdown("""#### Hello there, this is the VocalVenturer, this app is aimed at helping you making more informed model choices for ASR. Please choose a Data Subset to evalutate the Models on. You also have the opportunity to record and transcibe an own sample. The Models will be evaluated using the *WER* metric -> here is an amazing Leaderboard for it LINK""")
  
      with gr.Row():
          with gr.Column(scale=1):
              pass
-         with gr.Column(scale=3):
+         with gr.Column(scale=5):
              data_subset = gr.Radio(
-                 value="Libris",
-                 choices=["Libris","Common","own"],
+                 value="LibriSpeech Clean",
+                 choices=dataset.get_options(),
                  label="Data subset / Own Sample",
              )
          with gr.Column(scale=1):
@@ -68,41 +79,51 @@ with gr.Blocks() as demo:
  
          with gr.Column(scale=1):
              model_1 = gr.Dropdown(
-                 choices=["None","Model2","Model3"],
-                 label="Select Model 1"
+                 choices=models.get_options(),
+                 label="Select Model"
              )
              model_1_card = gr.Markdown("")
  
          with gr.Column(scale=1):
              model_2 = gr.Dropdown(
-                 choices=["None","Model2","Model3"],
-                 label="Select Model 2"
+                 choices=models.get_options(),
+                 label="Select Model"
             )
              model_2_card = gr.Markdown("")
  
-         with gr.Column(scale=1):
-             model_3 = gr.Dropdown(
-                 choices=["None","Model2","Model3"],
-                 label="Select Model 3"
-             )
-             model_3_card = gr.Markdown("")
+         # with gr.Column(scale=1):
+         #     model_3 = gr.Dropdown(
+         #         choices=models.get_options(),
+         #         label="Select Model"
+         #     )
+         #     model_3_card = gr.Markdown("")
  
      model_1.change(get_card, inputs=model_1, outputs=model_1_card)
      model_2.change(get_card, inputs=model_2, outputs=model_2_card)
-     model_3.change(get_card, inputs=model_3, outputs=model_3_card)
+     # model_3.change(get_card, inputs=model_3, outputs=model_3_card)
  
      eval_btn = gr.Button(
          value="Evaluate",
-         variant="primary"
+         variant="primary",
+         size="sm"
      )
-
  
  
+
+     gr.Markdown('## <p style="text-align: center;">Results</p>')
      gr.Markdown("-------")
-     gr.Markdown("### Results")
+
+
+     with gr.Row():
+
+         with gr.Column(scale=1):
+             results_model_1 = gr.Markdown("")
+
+         with gr.Column(scale=1):
+             results_model_2 = gr.Markdown("")
  
-     results = gr.Markdown("")
+     eval_btn.click(eval, [data_subset, model_1, model_2], [results_model_1, results_model_2])
  
-     eval_btn.click(eval, [data_subset, model_1, model_2, model_3], results)
+
  
  demo.launch(debug=True)
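As an aside on the wiring above: `eval` returns a tuple, and Gradio unpacks a tuple return value across the components listed in `outputs`. Below is a minimal, self-contained sketch of that pattern; the `score_models` function and its dummy WER strings are purely illustrative stand-ins for the app's real `run_tests`/`eval` pipeline.

```python
import gradio as gr

# Illustrative stand-in for the app's eval()/run_tests() scoring (not the real code).
def score_models(subset: str, model_a: str, model_b: str):
    # Each element of the returned tuple lands in one output component, in order.
    return f"WER Score: 12.3 ({model_a} on {subset})", f"WER Score: 15.7 ({model_b} on {subset})"

with gr.Blocks() as demo:
    subset = gr.Radio(choices=["LibriSpeech Clean", "LibriSpeech Other"], value="LibriSpeech Clean", label="Data subset")
    model_a = gr.Dropdown(choices=["openai/whisper-tiny.en"], label="Select Model")
    model_b = gr.Dropdown(choices=["facebook/s2t-medium-librispeech-asr"], label="Select Model")
    out_a, out_b = gr.Markdown(""), gr.Markdown("")
    # Two outputs listed, so the two-element tuple is split across them.
    gr.Button("Evaluate").click(score_models, [subset, model_a, model_b], [out_a, out_b])

demo.launch()
```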
cards.txt ADDED
@@ -0,0 +1,5 @@
+ #### Whisper Tiny (EN)
+
+ -
+ @@
+ @@
dataset.py CHANGED
@@ -1,36 +1,93 @@
- from huggingface_hub import login
  from datasets import load_dataset
  from datasets import Audio
  
- import json
-
- def get_credentials():
-     with open("credentials.json","r") as f:
-         credentials = json.load(f)
-     return credentials['token']
-
-
- class Dataset:
-
-     def __init__(self, n:int=100):
-
-         # # Log the client into HF
-         # login(token=get_credentials())
-
-         # # Load the Dataset in Streaming Mode
-         # self.librispeech_clean = load_dataset("librispeech_asr", "all", split="test.clean", streaming=True)
-         # self.librispeech_other = load_dataset("librispeech_asr", "all", split="test.other", streaming=True)
-
-         self.n = n
-
-         self.options = ['librisspeech_clean','librisspeech_other']
-
-     def get_option(self):
-         return self.options
-
- dataset = Dataset()
+
+ class Dataset:
+
+     def __init__(self, n:int = 100):
+
+         self.n = n
+         self.options = ['LibriSpeech Clean', 'LibriSpeech Other', 'Common Voice', 'VoxPopuli', 'TEDLIUM', 'GigaSpeech', 'SPGISpeech', 'AMI', 'OWN']
+         self.selected = None
+         self.dataset = None
+         self.text = None
+
+     def get_options(self):
+         return self.options
+
+     def _check_text(self):
+         sample = next(iter(self.dataset))
+         print(sample)
+         self._get_text(sample)
+
+     def _get_text(self, sample):
+         if "text" in sample:
+             self.text = "text"
+             return sample["text"]
+         elif "sentence" in sample:
+             self.text = "sentence"
+             return sample["sentence"]
+         elif "normalized_text" in sample:
+             self.text = "normalized_text"
+             return sample["normalized_text"]
+         elif "transcript" in sample:
+             self.text = "transcript"
+             return sample["transcript"]
+         else:
+             raise ValueError(f"Sample: {sample.keys()} has no transcript.")
+
+     def filter(self, input_column:str = None):
+
+         if input_column is None:
+             if self.text is not None:
+                 input_column = self.text
+             else:
+                 input_column = self._check_text()
+
+         def is_target_text_in_range(ref):
+             if ref.strip() == "ignore time segment in scoring":
+                 return False
+             else:
+                 return ref.strip() != ""
+
+         self.dataset = self.dataset.filter(is_target_text_in_range, input_columns=[input_column])
+         return self.dataset
+
+     def normalised(self, normalise):
+         self.dataset = self.dataset.map(normalise)
+
+     def _select(self, option:str):
+         if option not in self.options:
+             raise ValueError(f"This value is not an option, please see: {self.options}")
+         self.selected = option
+
+     def _preprocess(self):
+
+         self.dataset = self.dataset.take(self.n)
+         self.dataset = self.dataset.cast_column("audio", Audio(sampling_rate=16000))
+
+     def load(self, option:str = None):
+
+         self._select(option)
+
+         if option == "OWN":
+             pass
+         elif option == "LibriSpeech Clean":
+             self.dataset = load_dataset("librispeech_asr", "all", split="test.clean", streaming=True)
+         elif option == "LibriSpeech Other":
+             self.dataset = load_dataset("librispeech_asr", "all", split="test.other", streaming=True)
+         elif option == "Common Voice":
+             self.dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", revision="streaming", split="test", streaming=True, token=True, trust_remote_code=True)
+         elif option == "VoxPopuli":
+             self.dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
+         elif option == "TEDLIUM":
+             self.dataset = load_dataset("LIUM/tedlium", "release3", split="test", streaming=True, trust_remote_code=True)
+         elif option == "GigaSpeech":
+             self.dataset = load_dataset("speechcolab/gigaspeech", "xs", split="test", streaming=True, token=True, trust_remote_code=True)
+         elif option == "SPGISpeech":
+             self.dataset = load_dataset("kensho/spgispeech", "S", split="test", streaming=True, token=True, trust_remote_code=True)
+         elif option == "AMI":
+             self.dataset = load_dataset("edinburghcstr/ami", "ihm", split="test", streaming=True, trust_remote_code=True)
+
+         self._preprocess()
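For context, a minimal usage sketch of the new `Dataset` wrapper, assuming Hugging Face credentials and dataset access are already in place; the `"text"` column name is the one LibriSpeech uses.

```python
from dataset import Dataset

# Stream 100 LibriSpeech test-clean samples; _preprocess() resamples audio to 16 kHz.
ds = Dataset(n=100)
ds.load("LibriSpeech Clean")

# Drop empty references and "ignore time segment in scoring" segments.
ds.filter("text")  # LibriSpeech stores transcripts under "text"

sample = next(iter(ds.dataset))
print(sample["audio"]["sampling_rate"])  # 16000 after cast_column
print(ds._get_text(sample))              # resolves the transcript column and returns the reference text
```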
eval.py CHANGED
@@ -0,0 +1,22 @@
+ from dataset import Dataset
+ from model import Models
+
+
+ def data(dataset):
+     for i, item in enumerate(dataset):
+         yield {**item["audio"], "reference": item["norm_text"]}
+
+
+ def streamed_infernce(dataset, pipeline):
+
+
+     # placeholders for predictions and references
+     predictions = []
+     references = []
+
+     # run streamed inference
+     for out in pipeline(data(dataset), batch_size=16):
+         predictions.append(pipeline(out["text"]))
+         references.append(out["reference"][0])
+
+     return predictions, references
model.py CHANGED
@@ -0,0 +1,104 @@
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
+ from transformers import pipeline
+
+ from dataset import Dataset
+ from utils import data
+
+
+
+ class Model:
+
+
+     def __init__(self):
+
+         self.options = [
+             "openai/whisper-tiny.en",
+             "facebook/s2t-medium-librispeech-asr"
+         ]
+         self.selected = None
+         self.pipeline = None
+         self.normalize = None
+
+     def get_options(self):
+         return self.options
+
+     def load(self, option:str = None):
+
+         if option is None:
+             if self.selected is None:
+                 raise ValueError("No model selected. Please first select a model")
+             option = self.selected
+
+         if option not in self.options:
+             raise ValueError(f"Selected Option is not a valid value, see: {self.options}")
+
+         if option == "openai/whisper-tiny.en":
+             self.pipeline = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en", device=0)
+             self.normalize = self.pipeline.tokenizer.normalize
+
+         elif option == "facebook/s2t-medium-librispeech-asr":
+             self.model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
+             self.processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)
+
+
+     def select(self, option:str=None):
+         if option not in self.options:
+             raise ValueError(f"This value is not an option, please see: {self.options}")
+         self.selected = option
+
+     def process(self, dataset:Dataset):
+
+         if self.selected is None:
+             raise ValueError("No Model is yet selected. Please select a model first")
+
+         if self.selected == "openai/whisper-tiny.en":
+             references, predictions = self._process_openai_whisper_tiny_en(dataset)
+         elif self.selected == "facebook/s2t-medium-librispeech-asr":
+             references, predictions = self._process_facebook_s2t_medium(dataset)
+
+         return references, predictions
+
+     def _process_openai_whisper_tiny_en(self, DaTaSeT:Dataset):
+
+         def normalise(batch):
+             batch["norm_text"] = self.normalize(DaTaSeT._get_text(batch))
+             return batch
+
+         DaTaSeT.normalised(normalise)
+         dataset = DaTaSeT.filter("norm_text")
+
+         predictions = []
+         references = []
+
+         # run streamed inference
+         for out in self.pipeline(data(dataset), batch_size=16):
+             predictions.append(self.normalize(out["text"]))
+             references.append(out["reference"][0])
+
+         return references, predictions
+
+     def _process_facebook_s2t_medium(self, DaTaSeT:Dataset):
+
+
+
+         def map_to_pred(batch):
+             features = self.processor(batch["audio"]["array"], sampling_rate=16000, padding=True, return_tensors="pt")
+             input_features = features.input_features
+             attention_mask = features.attention_mask
+
+             gen_tokens = self.model.generate(input_features=input_features, attention_mask=attention_mask)
+             batch["transcription"] = self.processor.batch_decode(gen_tokens, skip_special_tokens=True)[0]
+             return batch
+
+         DaTaSeT.dataset = DaTaSeT.dataset.take(100)
+         result = DaTaSeT.dataset.map(map_to_pred, remove_columns=["audio"])
+
+         predictions = []
+         references = []
+
+         for sample in result:
+             predictions.append(sample['transcription'])
+             references.append(sample['text'])
+
+         return references, predictions
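One detail worth seeing in isolation: on the Whisper path, both predictions and references pass through the pipeline tokenizer's `normalize` before WER is computed, so casing and punctuation differences are not counted as errors. A small stand-alone sketch of that normalizer (the example strings are made up):

```python
from transformers import WhisperTokenizer

# The same English text normalizer that Model.load() grabs from the ASR pipeline's tokenizer.
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-tiny.en")

prediction = "Mr. Quilter is the apostle of the middle classes."
reference = "MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES"

# Both strings are lower-cased and stripped of punctuation before being compared for WER.
print(tokenizer.normalize(prediction))
print(tokenizer.normalize(reference))
```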
test.v01.py ADDED
@@ -0,0 +1,25 @@
+ from utils import hf_login, data, compute_wer
+ from dataset import Dataset
+ from model import Model
+
+ hf_login()
+
+
+
+
+
+
+ def run_tests (dataset_choice:str, model:str):
+
+     MoDeL = Model()
+     MoDeL.select(model)
+     MoDeL.load()
+     DaTaSeT = Dataset(100)
+     DaTaSeT.load(dataset_choice)
+     references, predictions = MoDeL.process(DaTaSeT)
+     wer = compute_wer(references=references, predictions=predictions)
+     return wer
+
+
+ print("WER:", run_tests(dataset_choice="GigaSpeech", model="facebook/s2t-medium-librispeech-asr"))
+
utils.py ADDED
@@ -0,0 +1,23 @@
+ from huggingface_hub import login
+ import json
+ import evaluate
+ import os
+
+ def hf_login():
+     hf_token = os.getenv("HF_TOKEN")
+     # if hf_token is None:
+     #     with open("credentials.json", "r") as f:
+     #         hf_token = json.load(f)["token"]
+     login(token=hf_token)
+
+ def data(dataset):
+     for i, item in enumerate(dataset):
+         yield {**item["audio"], "reference": item["norm_text"]}
+
+ def compute_wer(references, predictions):
+     wer_metric = evaluate.load("wer")
+     wer = wer_metric.compute(references=references, predictions=predictions)
+     wer = round(100 * wer, 2)
+     return wer
+
+
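A quick sanity check of `compute_wer`, assuming the `evaluate` package and its `jiwer` backend are installed:

```python
from utils import compute_wer

references = ["the cat sat on the mat", "hello world"]
predictions = ["the cat sat on a mat", "hello world"]

# 1 substitution over 8 reference words -> 12.5, since compute_wer returns a rounded percentage.
print(compute_wer(references=references, predictions=predictions))
```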