ctl commited on
Commit
254b055
·
1 Parent(s): 452250c

cer impl for colab

Browse files
Files changed (1) hide show
  1. cer_memory_efficient.py +263 -0
cer_memory_efficient.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 The HuggingFace Datasets Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Character Error Rate (CER) metric. """
16
+
17
+ import datasets
18
+
19
+ # Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
20
+ #
21
+ # Licensed under the Apache License, Version 2.0 (the "License");
22
+ # you may not use this file except in compliance with the License.
23
+ # You may obtain a copy of the License at
24
+ #
25
+ # http://www.apache.org/licenses/LICENSE-2.0
26
+ #
27
+ # Unless required by applicable law or agreed to in writing, software
28
+ # distributed under the License is distributed on an "AS IS" BASIS,
29
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
30
+ # See the License for the specific language governing permissions and
31
+ # limitations under the License.
32
+ """This module provides functions to calculate error rate at different levels.
33
+ e.g. wer for word-level, cer for char-level.
34
+ """
35
+ import numpy as np
36
+
37
+ # credit: https://github.com/PaddlePaddle/DeepSpeech/blob/d7e753546a813f7493c8834ca1a4b3f37a7ff139/deepspeech/utils/error_rate.py
38
+
39
+ def _levenshtein_distance(ref, hyp):
40
+ """Levenshtein distance is a string metric for measuring the difference
41
+ between two sequences. Informally, the levenshtein disctance is defined as
42
+ the minimum number of single-character edits (substitutions, insertions or
43
+ deletions) required to change one word into the other. We can naturally
44
+ extend the edits to word level when calculate levenshtein disctance for
45
+ two sentences.
46
+ """
47
+ m = len(ref)
48
+ n = len(hyp)
49
+
50
+ # special case
51
+ if ref == hyp:
52
+ return 0
53
+ if m == 0:
54
+ return n
55
+ if n == 0:
56
+ return m
57
+
58
+ if m < n:
59
+ ref, hyp = hyp, ref
60
+ m, n = n, m
61
+
62
+ # use O(min(m, n)) space
63
+ distance = np.zeros((2, n + 1), dtype=np.int32)
64
+
65
+ # initialize distance matrix
66
+ for j in range(n + 1):
67
+ distance[0][j] = j
68
+
69
+ # calculate levenshtein distance
70
+ for i in range(1, m + 1):
71
+ prev_row_idx = (i - 1) % 2
72
+ cur_row_idx = i % 2
73
+ distance[cur_row_idx][0] = i
74
+ for j in range(1, n + 1):
75
+ if ref[i - 1] == hyp[j - 1]:
76
+ distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
77
+ else:
78
+ s_num = distance[prev_row_idx][j - 1] + 1
79
+ i_num = distance[cur_row_idx][j - 1] + 1
80
+ d_num = distance[prev_row_idx][j] + 1
81
+ distance[cur_row_idx][j] = min(s_num, i_num, d_num)
82
+
83
+ return distance[m % 2][n]
84
+
85
+
86
def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
    """Compute the word-level Levenshtein distance between two sentences.

    Both sentences are split on ``delimiter`` and empty tokens (produced by
    leading, trailing or repeated delimiters) are discarded before the
    distance is computed.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: If true, both sentences are lower-cased first.
    :type ignore_case: bool
    :param delimiter: Delimiter of input sentences.
    :type delimiter: str
    :return: Levenshtein distance (as a float) and the number of words in
             the reference sentence.
    :rtype: tuple
    """
    # Plain truthiness instead of ``== True`` so any truthy flag is honoured.
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()

    ref_words = [word for word in reference.split(delimiter) if word]
    hyp_words = [word for word in hypothesis.split(delimiter) if word]

    edit_distance = _levenshtein_distance(ref_words, hyp_words)
    return float(edit_distance), len(ref_words)
110
+
111
+
112
def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
    """Compute the character-level Levenshtein distance between two sentences.

    Both sentences are split on the space character and empty pieces are
    dropped, which strips leading/trailing spaces and collapses runs of
    spaces. The pieces are then re-joined with a single space, or with
    nothing when ``remove_space`` is true.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: If true, both sentences are lower-cased first.
    :type ignore_case: bool
    :param remove_space: Whether to remove internal space characters.
    :type remove_space: bool
    :return: Levenshtein distance (as a float) and the length of the
             normalized reference sentence.
    :rtype: tuple
    """
    # Plain truthiness instead of ``== True`` so any truthy flag is honoured.
    if ignore_case:
        reference = reference.lower()
        hypothesis = hypothesis.lower()

    join_char = '' if remove_space else ' '

    # Split on ' ' only (not on all whitespace) to keep the original
    # normalization rules.
    reference = join_char.join(filter(None, reference.split(' ')))
    hypothesis = join_char.join(filter(None, hypothesis.split(' ')))

    edit_distance = _levenshtein_distance(reference, hypothesis)
    return float(edit_distance), len(reference)
140
+
141
+
142
def cer(reference, hypothesis, ignore_case=False, remove_space=True):
    """Calculate character error rate (CER).

    CER compares reference text and hypothesis text at character level and
    is defined as:

    .. math::
        CER = (Sc + Dc + Ic) / Nc

    where

    .. code-block:: text

        Sc is the number of characters substituted,
        Dc is the number of characters deleted,
        Ic is the number of characters inserted
        Nc is the number of characters in the reference

    The numerator is obtained as the Levenshtein distance computed by
    ``char_errors``. Note that leading and trailing space characters are
    stripped and consecutive spaces are collapsed before comparison; with
    the default ``remove_space=True`` all spaces are removed entirely.

    :param reference: The reference sentence.
    :type reference: str
    :param hypothesis: The hypothesis sentence.
    :type hypothesis: str
    :param ignore_case: If true, the comparison is case-insensitive.
    :type ignore_case: bool
    :param remove_space: Whether to remove internal space characters.
    :type remove_space: bool
    :return: ``(edit_distance, ref_len, cer)``: the edit distance, the
             length of the normalized reference, and their ratio.
    :rtype: tuple
    :raises ValueError: If the normalized reference length is zero.
    """
    edit_distance, ref_len = char_errors(
        reference,
        hypothesis,
        ignore_case=ignore_case,
        remove_space=remove_space,
    )

    if ref_len == 0:
        raise ValueError("Length of reference should be greater than 0.")

    # ``edit_distance`` is already a float, so this is true division; the
    # local is named ``error_rate`` to avoid shadowing this function.
    error_rate = edit_distance / ref_len
    return edit_distance, ref_len, error_rate
183
+
184
+
185
+
186
# BibTeX entry passed as ``citation`` to ``datasets.MetricInfo``.
_CITATION = """\
@inproceedings{inproceedings,
author = {Morris, Andrew and Maier, Viktoria and Green, Phil},
year = {2004},
month = {01},
pages = {},
title = {From WER and RIL to MER and WIL: improved evaluation measures for connected speech recognition.}
}
"""

# Human-readable description passed as ``description`` to
# ``datasets.MetricInfo`` and prepended to the metric class docstring.
# NOTE(review): this text describes *word* error rate, while the metric class
# in this file aggregates the character-level ``cer`` function; the wording
# looks carried over from a WER metric and should be confirmed / reworded.
_DESCRIPTION = """\
Word error rate (WER) is a common metric of the performance of an automatic speech recognition system.

The general difficulty of measuring performance lies in the fact that the recognized word sequence can have a different length from the reference word sequence (supposedly the correct one). The WER is derived from the Levenshtein distance, working at the word level instead of the phoneme level. The WER is a valuable tool for comparing different systems as well as for evaluating improvements within one system. This kind of measurement, however, provides no details on the nature of translation errors and further work is therefore required to identify the main source(s) of error and to focus any research effort.

This problem is solved by first aligning the recognized word sequence with the reference (spoken) word sequence using dynamic string alignment. Examination of this issue is seen through a theory called the power law that states the correlation between perplexity and word error rate.

Word error rate can then be computed as:

WER = (S + D + I) / N = (S + D + I) / (S + D + C)

where

S is the number of substitutions,
D is the number of deletions,
I is the number of insertions,
C is the number of correct words,
N is the number of words in the reference (N=S+D+C).

WER's output is always a number between 0 and 1. This value indicates the percentage of words that were incorrectly predicted. The lower the value, the better the
performance of the ASR system with a WER of 0 being a perfect score.
"""

# Usage text passed as ``inputs_description`` to ``datasets.MetricInfo``.
# NOTE(review): the example loads the "wer" metric and shows a WER score;
# it does not exercise the CER class defined in this file — confirm and update.
_KWARGS_DESCRIPTION = """
Computes WER score of transcribed segments against references.
Args:
references: list of references for each speech input.
predictions: list of transcribtions to score.
Returns:
(float): the word error rate

Examples:

>>> predictions = ["this is the prediction", "there is an other sample"]
>>> references = ["this is the reference", "there is another one"]
>>> wer = datasets.load_metric("wer")
>>> wer_score = wer.compute(predictions=predictions, references=references)
>>> print(wer_score)
0.5
"""
236
+
237
+
238
@datasets.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class CER(datasets.Metric):
    # Corpus-level character error rate: total edit distance over all
    # (prediction, reference) pairs divided by total reference length.
    # (Kept as a comment rather than a docstring so the decorator-built
    # class ``__doc__`` is unchanged.)

    def _info(self):
        """Return the metric metadata: texts, input features and links."""
        # Both inputs are plain strings, one per utterance.
        input_features = datasets.Features(
            {
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
            }
        )
        source_urls = ["https://github.com/jitsi/jiwer/"]
        background_urls = [
            "https://en.wikipedia.org/wiki/Word_error_rate",
        ]
        return datasets.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=input_features,
            codebase_urls=source_urls,
            reference_urls=background_urls,
        )

    def _compute(self, predictions, references):
        """Return summed edit distance divided by summed reference length."""
        distance_sum = 0
        length_sum = 0
        for hypothesis_text, reference_text in zip(predictions, references):
            # The per-sample rate (third element) is not needed for the
            # corpus-level aggregate.
            sample_distance, sample_length, _ = cer(reference_text, hypothesis_text)
            distance_sum += sample_distance
            length_sum += sample_length
        return distance_sum / length_sum