DoctorSlimm committed
Commit
0d3bd5e
1 Parent(s): a12a36a

omit eval long texts columns (comments)

__pycache__/bangalore_score.cpython-39.pyc ADDED
Binary file (8.41 kB)
bangalore_score.py CHANGED
@@ -85,32 +85,32 @@ class Bangalore_Score(evaluate.Metric):
 
     def _download_and_prepare(self, dl_manager):
         """Optional: download external resources useful to compute the scores"""
-        # TODO: Download external resources if needed
         import nltk
         nltk.download('punkt')
 
+        import evaluate
         import pandas as pd
-        from datasets import Dataset, DatasetDict
+        from datasets import Dataset
 
+        ### metrics ###
         # https://huggingface.co/evaluate-metric
-        import evaluate
-        evaluate.load('evaluate-metric/meteor')
-        # evaluate.load('evaluate-metric/meteor')
-        # evaluate.load('evaluate-metric/wer')
-        # evaluate.load('evaluate-metric/exact_match')
-        # evaluate.load('evaluate-metric/character')
-        # evaluate.load('evaluate-metric/ter')
-        # evaluate.load('bleu')
-        # evaluate.load('rouge')
+        ###############
+        # evaluate.load('evaluate-metric/meteor') # respect order (machine translation)
+        # evaluate.load('evaluate-metric/wer') # word error rate
+        # evaluate.load('evaluate-metric/exact_match') # exact match
+        # evaluate.load('evaluate-metric/character') # character error rate
+        # evaluate.load('evaluate-metric/ter') # translation error rate
+        # evaluate.load('bleu') # no respect order (machine translation)
+        # evaluate.load('rouge') # no respect order (machine translation)
         # evaluate.load('ncoop57/levenshtein_distance')
         pass
 
-
     def normalize_fn(
         self,
         example,
         text_field='text',
-        unk_token='Not Disclosed'
+        unk_token='Not Disclosed',
+        return_df=False
     ):
         """
         parse output text into headers, rows, and records
@@ -126,30 +126,38 @@ class Bangalore_Score(evaluate.Metric):
         | ... | ... | ... |
         """
         import pandas as pd
-        headers_text, records, rows_text = unk_token, [], unk_token
-        rows = dict(example)[text_field].strip().split('\n')
 
-        # parse headers
+        records = []
+        rows_text = unk_token
+        omit_columns = example.get('omit_columns', [])
+
+        text = dict(example)[text_field]
+        text = text.strip()
+        rows = text.split('\n')
+
+        # headers
         if len(rows) > 0:
-            # trailing pipes
-            headers_row = rows[0].strip().strip('|')
-            # split on pipes and remove whitespace
-            headers_list = [x.strip() for x in headers_row.split('|')]
-            headers_text = ' '.join(sorted(headers_list)) # join headers
+            headers_row = rows[0]
+            headers_row = headers_row.strip()
+            headers_row = headers_row.strip('|')
+            headers_list = headers_row.split('|')
+            headers_list = [c.strip() for c in headers_list]
 
-        # try parse records
+        # records / rows
         if len(rows) > 2:
-            data_rows = rows[2:]
-            # trailing pipes
-            data_rows = [x.strip('|').split('|') for x in data_rows]
+            data_rows = []
+            for row_text in rows[2:]:
+                row_text = row_text.strip()
+                row_text = row_text.strip('|')
+                row_values = row_text.split('|')
+                row_values = [v.strip() for v in row_values]
+                data_rows.append(row_values)
+
             for row in data_rows:
                 cleaned_row = []
                 for cell in row:
-                    # Remove leading and trailing whitespace
-                    cell = cell.strip()
-                    # Check if cell is empty or contains only whitespace
                     if not cell or cell.isspace():
-                        cell = 'Not Disclosed'
+                        cell = unk_token
                     cleaned_row.append(cell)
                 try:
                     if len(cleaned_row) == len(headers_list):
@@ -158,22 +166,71 @@ class Bangalore_Score(evaluate.Metric):
                 except Exception as e:
                     print(e)
 
-            # normalize rows (set column order with sorted headers)
+            # normalize
            sorted_headers = sorted(set(list(headers_list)))
-            df = pd.DataFrame(records, columns=sorted_headers) # create dataframe
-            df.fillna(unk_token, inplace=True)
-            df = df.sort_values(by=sorted_headers) # sort rows
+            df = pd.DataFrame(records, columns=sorted_headers) # normalize headers
+            df.fillna(unk_token, inplace=True) # fill NaNs
+            df = df.sort_values(by=sorted_headers) # normalize rows
+
+            # omit columns
+            if len(omit_columns) > 0:
+                omit_columns = [c.strip() for c in omit_columns]
+                omit_columns = [c for c in omit_columns if c in df.columns]
+                df = df.drop(columns=omit_columns)
+
+            # return df only
+            if return_df:
+                return df
+
             # csv
             csv_norm = df.to_csv(index=False, sep='|')
             csv_norm = csv_norm.replace('|', ' | ') # add spaces around pipes
             csv_norm = csv_norm.replace('\r', '') # remove carriage returns
-
-            # rows text
+            # only rows text
             rows_text = csv_norm.split('\n')[1:-1]
            rows_text = '\n'.join(rows_text).strip()
-            return {'rtext': rows_text}
 
-    def msr_fn(self, reference, predictions, metric, metric_key, max_score=True):
+        if return_df:
+            return None
+        else:
+            return {'rtext': rows_text}
+
+    def ref_omit_columns(
+        self,
+        example,
+        max_chars=50,
+    ):
+        """
+        Detect columns to omit from eval
+        1. columns with text > max_chars are likely to be comments
+           * to be skipped in evaluation
+           * screws up metrics
+        :param example:
+        :param max_chars:
+        :param omit_columns:
+        :param text_field:
+        :return:
+            {
+                'omit_columns': <list of text columns>,
+            }
+        """
+        comments = []
+        df = self.normalize_fn(example, text_field='ref', return_df=True)
+        if df is not None:
+            for colname in df.columns:
+                c_lens = [len(str(x)) for x in df[colname]]
+                if max(c_lens) > max_chars:
+                    comments.append(colname)
+        return {'omit_columns': comments}
+
+    def msr_fn(
+        self,
+        reference,
+        predictions,
+        metric,
+        metric_key,
+        max_score=True,
+    ):
         """
         MSR (Most Similar Row / Record)
         * computes metric for predictions
@@ -187,12 +244,20 @@ class Bangalore_Score(evaluate.Metric):
         """
         scores_list = []
         for ref, pred in zip([reference]*len(predictions), predictions):
-            score = metric.compute(references=[ref], predictions=[pred])[metric_key]
+            score_dict = metric.compute(references=[ref], predictions=[pred])
+            if isinstance(score_dict, dict):
+                score = score_dict.get(metric_key, 0)
+            elif isinstance(score_dict, float):
+                score = score_dict
+            else:
+                score = 0
             scores_list.append(score)
+
         if max_score:
             best_score = max(scores_list)
         else:
             best_score = min(scores_list)
+
         best_pred = predictions[scores_list.index(best_score)]
         _predictions = []
         for pred in predictions:
@@ -200,18 +265,27 @@ class Bangalore_Score(evaluate.Metric):
                 _predictions.append(pred)
         return best_score, best_pred, _predictions
 
-    def amsr_fn(self, example, **kwargs):
+    def amsr_fn(
+        self,
+        example,
+        **kwargs
+    ):
+        """
+        Aggregate MSR (Most Similar Row / Record)
+        :param example:
+        :param kwargs:
+        :return:
+        """
         ref_text, pred_text = example['ref'].strip(), example['pred'].strip()
         ref_rows, pred_rows = ref_text.split('\n'), pred_text.split('\n')
-
-        # test msr
+        ### test msr
         msr_list = []
         for ref in ref_rows:
             if len(pred_rows) == 0:
                 msr_list.append(0)
                 continue
             score, best_pred, pred_rows = self.msr_fn(reference=ref, predictions=pred_rows, **kwargs)
-            # meteor STILL too flexible...
+            ### meteor STILL too flexible...
             if False:
                 print(
                     '\n\n\n---'
@@ -225,13 +299,14 @@ class Bangalore_Score(evaluate.Metric):
             msr_list.append(score)
 
         aggregate_score = sum(msr_list) / len(msr_list)
-        # print('ref_rows:')
-        # for ref in ref_rows:
-        #     print(f'\t* {ref}')
-        # print('\n\npred_rows:')
-        # for pred in pred_text.split('\n'):
-        #     print(f'\t* {pred}')
-        # print(f'\n\naggregate_score: {aggregate_score}')
+        if False:
+            print('ref_rows:')
+            for ref in ref_rows:
+                print(f'\t* {ref}')
+            print('\n\npred_rows:')
+            for pred in pred_text.split('\n'):
+                print(f'\t* {pred}')
+            print(f'\n\naggregate_score: {aggregate_score}')
         return {'amsr': aggregate_score}
 
     def _compute(
@@ -240,28 +315,37 @@ class Bangalore_Score(evaluate.Metric):
         references,
         metric,
         metric_key,
-        max_score=True,
-        num_proc=None
+        best='max',
     ):
         """Returns the scores"""
         import json
         import evaluate
         import pandas as pd
-        from datasets import Dataset, DatasetDict
-
-        ### normalize ###
-        pred_ds = Dataset.from_dict({'text': predictions})
-        refs_ds = Dataset.from_dict({'text': references})
-        proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
-        proc_ds = proc_ds.map(self.normalize_fn, num_proc=num_proc, desc='normalizing')
-        predictions = proc_ds['predictions']['rtext']
-        references = proc_ds['references']['rtext']
+        from datasets import Dataset, DatasetDict, disable_caching
+
+        disable_caching()
+
+        # 1. parse predictions and references
         proc_ds = Dataset.from_dict({'pred': predictions, 'ref': references})
 
-        ### compute amsr ###
+        # 2. detect columns to omit from eval calculations (eg: comments)
+        proc_ds = proc_ds.map(self.ref_omit_columns, desc='omit_columns (ref)')
+
+        # 3. normalize predictions and references
+        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred'))
+        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref'))
+        eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
+
+        # 4. compute amsr for given metric
+        proc_ds = Dataset.from_dict(eval_data)
         proc_ds = proc_ds.map(
-            lambda x: self.amsr_fn(x, metric=metric, metric_key=metric_key, max_score=max_score),
-            num_proc=num_proc, desc='computing amsr'
+            lambda x: self.amsr_fn(
+                example=x,
+                metric=metric,
+                metric_key=metric_key,
+                max_score=True if best == 'max' else False
+            ),
+            desc=f'amsr ({metric_key})'
        )
 
        amsr_mean = sum(proc_ds['amsr']) / len(proc_ds['amsr'])
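
For readers checking the new `_compute` signature, the sketch below shows one plausible way to call the metric after this commit. It is a minimal, hedged example: the Hub path `DoctorSlimm/bangalore_score` is a hypothetical location for this bangalore_score.py (the diff does not say where the metric is published), METEOR is used as the row-level metric because `_download_and_prepare` references `evaluate-metric/meteor`, and the return format of `_compute` is not visible in this hunk, so the final print is only indicative.

import evaluate

# hypothetical Hub path for this metric space; adjust to wherever bangalore_score.py is published
bangalore = evaluate.load('DoctorSlimm/bangalore_score')

# row-level metric passed through to msr_fn / amsr_fn
meteor = evaluate.load('evaluate-metric/meteor')

reference = (
    '| Company | Revenue | Comments |\n'
    '| --- | --- | --- |\n'
    '| Acme Corp | 10M | Management expects strong growth next year driven by new products |\n'
)
prediction = (
    '| Company | Revenue | Comments |\n'
    '| --- | --- | --- |\n'
    '| Acme Corp | 10M | |\n'
)

# best='max' because a higher METEOR score means a more similar row
results = bangalore.compute(
    predictions=[prediction],
    references=[reference],
    metric=meteor,
    metric_key='meteor',
    best='max',
)
print(results)

The reference's Comments cell is longer than the `max_chars=50` default in `ref_omit_columns`, so that column should be flagged and dropped by `normalize_fn` for both the reference and the prediction before the per-row METEOR comparison, which is the behaviour this commit adds.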