DoctorSlimm committed · Commit 0d3bd5e · 1 parent: a12a36a

omit eval long texts columns (comments)

Changed files:
- __pycache__/bangalore_score.cpython-39.pyc (+0 -0)
- bangalore_score.py (+147 -63)
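In short: before scoring, reference columns whose cells run longer than a max_chars threshold (50 by default) are treated as free-text comment columns and dropped from the comparison, since they skew the row-similarity metrics. A minimal sketch of that heuristic, with a made-up table (the column names and values below are illustrative only, not from this repo):

import pandas as pd

# hypothetical parsed reference table; only the 'Comments' column exceeds the threshold
df = pd.DataFrame({
    'Name': ['Item A', 'Item B'],
    'Price': ['10', 'Not Disclosed'],
    'Comments': ['a long free-form remark that easily runs past the fifty character cutoff', ''],
})

max_chars = 50  # same default as ref_omit_columns in the diff below
omit_columns = [
    col for col in df.columns
    if max(len(str(x)) for x in df[col]) > max_chars
]
print(omit_columns)                   # ['Comments'] -- flagged as a comment-like column
print(df.drop(columns=omit_columns))  # the table actually used for the eval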
__pycache__/bangalore_score.cpython-39.pyc
ADDED
Binary file (8.41 kB)
bangalore_score.py
CHANGED
@@ -85,32 +85,32 @@ class Bangalore_Score(evaluate.Metric):
 
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
         import nltk
         nltk.download('punkt')
 
+        import evaluate
         import pandas as pd
-        from datasets import Dataset
+        from datasets import Dataset
 
+        ### metrics ###
         # https://huggingface.co/evaluate-metric
-
-        evaluate.load('evaluate-metric/meteor')
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('evaluate-metric/
-        # evaluate.load('
-        # evaluate.load('
-        # evaluate.load('rouge')
+        ###############
+        # evaluate.load('evaluate-metric/meteor') # respect order (machine translation)
+        # evaluate.load('evaluate-metric/wer') # word error rate
+        # evaluate.load('evaluate-metric/exact_match') # exact match
+        # evaluate.load('evaluate-metric/character') # character error rate
+        # evaluate.load('evaluate-metric/ter') # translation error rate
+        # evaluate.load('bleu') # no respect order (machine translation)
+        # evaluate.load('rouge') # no respect order (machine translation)
         # evaluate.load('ncoop57/levenshtein_distance')
         pass
 
-
     def normalize_fn(
         self,
         example,
         text_field='text',
-        unk_token='Not Disclosed'
+        unk_token='Not Disclosed',
+        return_df=False
     ):
         """
         parse output text into headers, rows, and records
@@ -126,30 +126,38 @@ class Bangalore_Score(evaluate.Metric):
         | ... | ... | ... |
         """
         import pandas as pd
-        headers_text, records, rows_text = unk_token, [], unk_token
-        rows = dict(example)[text_field].strip().split('\n')
 
-
+        records = []
+        rows_text = unk_token
+        omit_columns = example.get('omit_columns', [])
+
+        text = dict(example)[text_field]
+        text = text.strip()
+        rows = text.split('\n')
+
+        # headers
         if len(rows) > 0:
-
-            headers_row =
-
-            headers_list =
-
+            headers_row = rows[0]
+            headers_row = headers_row.strip()
+            headers_row = headers_row.strip('|')
+            headers_list = headers_row.split('|')
+            headers_list = [c.strip() for c in headers_list]
 
-        #
+        # records / rows
         if len(rows) > 2:
-            data_rows =
-
-
+            data_rows = []
+            for row_text in rows[2:]:
+                row_text = row_text.strip()
+                row_text = row_text.strip('|')
+                row_values = row_text.split('|')
+                row_values = [v.strip() for v in row_values]
+                data_rows.append(row_values)
+
             for row in data_rows:
                 cleaned_row = []
                 for cell in row:
-                    # Remove leading and trailing whitespace
-                    cell = cell.strip()
-                    # Check if cell is empty or contains only whitespace
                     if not cell or cell.isspace():
-                        cell =
+                        cell = unk_token
                     cleaned_row.append(cell)
                 try:
                     if len(cleaned_row) == len(headers_list):
@@ -158,22 +166,71 @@ class Bangalore_Score(evaluate.Metric):
                 except Exception as e:
                     print(e)
 
-        # normalize
+        # normalize
         sorted_headers = sorted(set(list(headers_list)))
-        df = pd.DataFrame(records, columns=sorted_headers) #
-        df.fillna(unk_token, inplace=True)
-        df = df.sort_values(by=sorted_headers) #
+        df = pd.DataFrame(records, columns=sorted_headers) # normalize headers
+        df.fillna(unk_token, inplace=True) # fill NaNs
+        df = df.sort_values(by=sorted_headers) # normalize rows
+
+        # omit columns
+        if len(omit_columns) > 0:
+            omit_columns = [c.strip() for c in omit_columns]
+            omit_columns = [c for c in omit_columns if c in df.columns]
+            df = df.drop(columns=omit_columns)
+
+        # return df only
+        if return_df:
+            return df
+
         # csv
         csv_norm = df.to_csv(index=False, sep='|')
         csv_norm = csv_norm.replace('|', ' | ') # add spaces around pipes
         csv_norm = csv_norm.replace('\r', '') # remove carriage returns
-
-        # rows text
+        # only rows text
         rows_text = csv_norm.split('\n')[1:-1]
         rows_text = '\n'.join(rows_text).strip()
-        return {'rtext': rows_text}
 
-
+        if return_df:
+            return None
+        else:
+            return {'rtext': rows_text}
+
+    def ref_omit_columns(
+        self,
+        example,
+        max_chars=50,
+    ):
+        """
+        Detect columns to omit from eval
+        1. columns with text > max_chars are likely to be comments
+            * to be skipped in evaluation
+            * screws up metrics
+        :param example:
+        :param max_chars:
+        :param omit_columns:
+        :param text_field:
+        :return:
+            {
+                'omit_columns': <list of text columns>,
+            }
+        """
+        comments = []
+        df = self.normalize_fn(example, text_field='ref', return_df=True,)
+        if df is not None:
+            for colname in df.columns:
+                c_lens = [len(str(x)) for x in df[colname]]
+                if max(c_lens) > max_chars:
+                    comments.append(colname)
+        return {'omit_columns': comments}
+
+    def msr_fn(
+        self,
+        reference,
+        predictions,
+        metric,
+        metric_key,
+        max_score=True,
+    ):
         """
         MSR (Most Similar Row / Record)
         * computes metric for predictions
@@ -187,12 +244,20 @@ class Bangalore_Score(evaluate.Metric):
         """
         scores_list = []
         for ref, pred in zip([reference]*len(predictions), predictions):
-
+            score_dict = metric.compute(references=[ref], predictions=[pred])
+            if isinstance(score_dict, dict):
+                score = score_dict.get(metric_key, 0)
+            elif isinstance(score_dict, float):
+                score = score_dict
+            else:
+                score = 0
             scores_list.append(score)
+
         if max_score:
             best_score = max(scores_list)
         else:
             best_score = min(scores_list)
+
         best_pred = predictions[scores_list.index(best_score)]
         _predictions = []
         for pred in predictions:
@@ -200,18 +265,27 @@ class Bangalore_Score(evaluate.Metric):
                 _predictions.append(pred)
         return best_score, best_pred, _predictions
 
-    def amsr_fn(
+    def amsr_fn(
+        self,
+        example,
+        **kwargs
+    ):
+        """
+        Aggregate MSR (Most Similar Row / Record)
+        :param example:
+        :param kwargs:
+        :return:
+        """
         ref_text, pred_text = example['ref'].strip(), example['pred'].strip()
         ref_rows, pred_rows = ref_text.split('\n'), pred_text.split('\n')
-
-        # test msr
+        ### test msr
         msr_list = []
         for ref in ref_rows:
             if len(pred_rows) == 0:
                 msr_list.append(0)
                 continue
             score, best_pred, pred_rows = self.msr_fn(reference=ref, predictions=pred_rows, **kwargs)
-
+            ### meteor STILL too flexible...
             if False:
                 print(
                     '\n\n\n---'
@@ -225,13 +299,14 @@ class Bangalore_Score(evaluate.Metric):
             msr_list.append(score)
 
         aggregate_score = sum(msr_list) / len(msr_list)
-
-
-
-
-
-
-
+        if False:
+            print('ref_rows:')
+            for ref in ref_rows:
+                print(f'\t* {ref}')
+            print('\n\npred_rows:')
+            for pred in pred_text.split('\n'):
+                print(f'\t* {pred}')
+            print(f'\n\naggregate_score: {aggregate_score}')
         return {'amsr': aggregate_score}
 
     def _compute(
@@ -240,28 +315,37 @@ class Bangalore_Score(evaluate.Metric):
         references,
         metric,
         metric_key,
-
-        num_proc=None
+        best='max',
     ):
         """Returns the scores"""
         import json
         import evaluate
         import pandas as pd
-        from datasets import Dataset, DatasetDict
-
-
-
-
-        proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
-        proc_ds = proc_ds.map(self.normalize_fn, num_proc=num_proc, desc='normalizing')
-        predictions = proc_ds['predictions']['rtext']
-        references = proc_ds['references']['rtext']
+        from datasets import Dataset, DatasetDict, disable_caching
+
+        disable_caching()
+
+        # 1. parse predictions and references
         proc_ds = Dataset.from_dict({'pred': predictions, 'ref': references})
 
-
+        # 2. detect columns to omit from eval calculations (eg: comments)
+        proc_ds = proc_ds.map(self.ref_omit_columns, desc='omit_columns (ref)')
+
+        # 3. normalize predictions and references
+        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred'))
+        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref'))
+        eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
+
+        # 4. compute amsr for given metric
+        proc_ds = Dataset.from_dict(eval_data)
         proc_ds = proc_ds.map(
-            lambda x: self.amsr_fn(
-
+            lambda x: self.amsr_fn(
+                example=x,
+                metric=metric,
+                metric_key=metric_key,
+                max_score=True if best == 'max' else False
+            ),
+            desc=f'amsr ({metric_key})'
         )
 
         amsr_mean = sum(proc_ds['amsr']) / len(proc_ds['amsr'])
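Based on the updated _compute signature (predictions, references, metric, metric_key, best), a usage sketch; the evaluate.load repo id and the pipe-table strings are assumptions for illustration, only the keyword arguments come from the diff above:

import evaluate

# assumed repo id for this Space's metric; adjust to the actual id
bangalore = evaluate.load('DoctorSlimm/bangalore_score')
meteor = evaluate.load('meteor')  # inner row-level metric handed through to msr_fn

# illustrative markdown-style tables in the format normalize_fn expects
reference = (
    '| Name | Price |\n'
    '| --- | --- |\n'
    '| Item A | 10 |\n'
    '| Item B | Not Disclosed |'
)
prediction = (
    '| Name | Price |\n'
    '| --- | --- |\n'
    '| Item A | 10 |\n'
    '| Item B | 12 |'
)

result = bangalore.compute(
    predictions=[prediction],
    references=[reference],
    metric=meteor,          # passed through to msr_fn / amsr_fn
    metric_key='meteor',    # key read out of metric.compute()'s result dict
    best='max',             # higher row score = more similar
)
print(result)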