DoctorSlimm committed on
Commit 001c9b9
1 Parent(s): 0d3bd5e
Files changed (1)
  1. bangalore_score.py +35 -7
bangalore_score.py CHANGED
@@ -105,19 +105,36 @@ class Bangalore_Score(evaluate.Metric):
         # evaluate.load('ncoop57/levenshtein_distance')
         pass
 
+    # todo: compartmentalize... and soften
+    #  * extract_records(text, return_df=False, omit_columns=[])
+    #  * preprocess_fn(example), omit, extract, ...
+
     def normalize_fn(
             self,
             example,
             text_field='text',
             unk_token='Not Disclosed',
+            omit_columns=True,
             return_df=False
     ):
         """
-        parse output text into headers, rows, and records
-        - parse row by row (incomplete rows)
+        Normalize Markdown Text String to rtext or DataFrame
+        * fill NaNs with unk_token
+        * assumes markdown table format
+        * assumes headers are present
+        * assumes headers are unique
+        * does not handle special tokens
+        Normalization:
+        * columns  -> pd.DataFrame(x, columns=sorted(x.columns))
+        * rows     -> pd.DataFrame.sort_values(by=sorted(x.columns))
+        * replaces -> NaNs with unk_token
+        * omit     -> columns with text > max_chars
         :param example:
+            {
+                'text': <markdown table>,
+                'omit_columns': <list of columns to omit>
+            }
         :return:
-        Note: this does not handle special tokens
         expected input format:
 
         | col1 | col2 | col3 |   <- start and trailing pipes required
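The new docstring pins down the normalization: columns reordered alphabetically, rows sorted by all columns, NaNs filled with `unk_token`. A minimal standalone sketch of just that step, using a hypothetical `normalize_table` helper (the real `normalize_fn` additionally parses the markdown and omits columns):

```python
# Hypothetical helper illustrating the column/row normalization described
# in the docstring. Assumes a pandas DataFrame with unique string headers.
import pandas as pd

def normalize_table(df: pd.DataFrame, unk_token: str = 'Not Disclosed') -> pd.DataFrame:
    cols = sorted(df.columns)          # columns in alphabetical order
    df = df[cols]                      # reorder columns
    df = df.fillna(unk_token)          # fill NaNs with unk_token
    df = df.sort_values(by=cols)       # sort rows by all columns
    return df.reset_index(drop=True)

df = pd.DataFrame({'b': ['2', None], 'a': ['y', 'x']})
print(normalize_table(df))
#    a              b
# 0  x  Not Disclosed
# 1  y              2
```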
@@ -129,7 +146,10 @@ class Bangalore_Score(evaluate.Metric):
 
         records = []
         rows_text = unk_token
-        omit_columns = example.get('omit_columns', [])
+        if omit_columns:
+            omit_columns = example.get('omit_columns', [])
+        else:
+            omit_columns = []
 
         text = dict(example)[text_field]
         text = text.strip()
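The new flag gates whether the per-example omit list is honored at all. Equivalent standalone logic, with a hypothetical helper name:

```python
# Hypothetical standalone version of the toggle added in this commit:
# when the flag is off, per-example omit lists are ignored entirely.
def resolve_omit_columns(example: dict, omit_columns: bool = True) -> list:
    return example.get('omit_columns', []) if omit_columns else []

print(resolve_omit_columns({'omit_columns': ['col3']}))         # ['col3']
print(resolve_omit_columns({'omit_columns': ['col3']}, False))  # []
```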
@@ -186,6 +206,8 @@ class Bangalore_Score(evaluate.Metric):
         csv_norm = df.to_csv(index=False, sep='|')
         csv_norm = csv_norm.replace('|', ' | ')  # add spaces around pipes
         csv_norm = csv_norm.replace('\r', '')    # remove carriage returns
+        csv_norm = csv_norm.replace('\t', ' ')   # remove tabs
+
         # only rows text
         rows_text = csv_norm.split('\n')[1:-1]
         rows_text = '\n'.join(rows_text).strip()
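This block serializes the normalized DataFrame to pipe-separated rows text; `split('\n')[1:-1]` works because `to_csv` ends with a trailing newline (last element is `''`) and index 0 is the header row. A self-contained repro of that step, including the tab replacement added in this commit:

```python
# Repro of the rows-text serialization step under the same assumptions.
import pandas as pd

df = pd.DataFrame({'col1': ['a'], 'col2': ['b']})
csv_norm = df.to_csv(index=False, sep='|')  # 'col1|col2\na|b\n'
csv_norm = csv_norm.replace('|', ' | ')     # pad pipes
csv_norm = csv_norm.replace('\r', '')       # drop carriage returns
csv_norm = csv_norm.replace('\t', ' ')      # drop tabs (new in this commit)
rows_text = '\n'.join(csv_norm.split('\n')[1:-1]).strip()
print(rows_text)                            # 'a | b'
```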
@@ -244,7 +266,12 @@ class Bangalore_Score(evaluate.Metric):
         """
         scores_list = []
         for ref, pred in zip([reference]*len(predictions), predictions):
-            score_dict = metric.compute(references=[ref], predictions=[pred])
+            score_dict = metric.compute(
+                references=[ref],
+                predictions=[pred],
+                # ignore_case=True,
+                # ignore_punctuation=True,
+            )
             if isinstance(score_dict, dict):
                 score = score_dict.get(metric_key, 0)
             elif isinstance(score_dict, float):
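The loop scores each prediction against the single reference one pair at a time, then unwraps either a dict or a bare float result. A runnable repro with the `exact_match` metric from `evaluate`, which also accepts the `ignore_case` / `ignore_punctuation` kwargs left commented out above:

```python
# Repro of the per-pair scoring loop with a concrete metric (exact_match).
import evaluate

metric = evaluate.load('exact_match')
reference, predictions = 'a | b', ['a | b', 'a | c']
scores = []
for ref, pred in zip([reference] * len(predictions), predictions):
    out = metric.compute(references=[ref], predictions=[pred])
    # exact_match returns a dict; some metrics return a bare float
    scores.append(out['exact_match'] if isinstance(out, dict) else out)
print(scores)  # [1.0, 0.0]
```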
@@ -316,6 +343,7 @@ class Bangalore_Score(evaluate.Metric):
             metric,
             metric_key,
             best='max',
+            omit_columns=True
     ):
         """Returns the scores"""
         import json
@@ -332,8 +360,8 @@ class Bangalore_Score(evaluate.Metric):
         proc_ds = proc_ds.map(self.ref_omit_columns, desc='omit_columns (ref)')
 
         # 3. normalize predictions and references
-        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred'))
-        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref'))
+        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred', omit_columns=omit_columns))
+        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref', omit_columns=omit_columns))
         eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
 
         # 4. compute amsr for given metric
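For reference, `datasets.Dataset.map` passes each example dict to a one-argument callable, so the new `omit_columns` flag has to be closed over via the lambda. A toy illustration with a stand-in `normalize_fn`:

```python
# Toy stand-in showing how extra kwargs thread through Dataset.map via a lambda.
from datasets import Dataset

def normalize_fn(example, text_field='text', omit_columns=True):
    # stand-in normalization: strip and lowercase the chosen field
    return {'rtext': example[text_field].strip().lower()}

ds = Dataset.from_dict({'pred': [' A '], 'ref': ['a']})
preds = ds.map(lambda x: normalize_fn(x, text_field='pred', omit_columns=False))
print(preds['rtext'])  # ['a']
```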
 
105
  # evaluate.load('ncoop57/levenshtein_distance')
106
  pass
107
 
108
+ # todo: compartmentalize... and soften
109
+ # * extract_records(text, return_df=False, omit_columns=[])
110
+ # * preprocess_fn(example), omit, extract, ...
111
+
112
  def normalize_fn(
113
  self,
114
  example,
115
  text_field='text',
116
  unk_token='Not Disclosed',
117
+ omit_columns=True,
118
  return_df=False
119
  ):
120
  """
121
+ Normalize Markdown Text String to rtext or DataFrame
122
+ * fill NaNs with unk_token
123
+ * assumes markdown table format
124
+ * assumes headers are present
125
+ * assumes headers are unique
126
+ * does not handle special tokens
127
+ Normalization:
128
+ * columns -> pd.DataFrame(x, columns=sorted(x.columns))
129
+ * rows -> pd.DataFrame.sort_values(by=sorted(x.columns))
130
+ * replaces -> NaNs with unk_token
131
+ * omit -> columns with text > max_chars
132
  :param example:
133
+ {
134
+ 'text': <markdown table>,
135
+ 'omit_columns': <list of columns to omit>
136
+ }
137
  :return:
 
138
  expected input format:
139
 
140
  | col1 | col2 | col3 | <- start and trailing pipes required
 
146
 
147
  records = []
148
  rows_text = unk_token
149
+ if omit_columns:
150
+ omit_columns = example.get('omit_columns', [])
151
+ else:
152
+ omit_columns = []
153
 
154
  text = dict(example)[text_field]
155
  text = text.strip()
 
206
  csv_norm = df.to_csv(index=False, sep='|')
207
  csv_norm = csv_norm.replace('|', ' | ') # add spaces around pipes
208
  csv_norm = csv_norm.replace('\r', '') # remove carriage returns
209
+ csv_norm = csv_norm.replace('\t', ' ') # remove tabs
210
+
211
  # only rows text
212
  rows_text = csv_norm.split('\n')[1:-1]
213
  rows_text = '\n'.join(rows_text).strip()
 
266
  """
267
  scores_list = []
268
  for ref, pred in zip([reference]*len(predictions), predictions):
269
+ score_dict = metric.compute(
270
+ references=[ref],
271
+ predictions=[pred],
272
+ # ignore_case=True,
273
+ # ignore_punctuation=True,
274
+ )
275
  if isinstance(score_dict, dict):
276
  score = score_dict.get(metric_key, 0)
277
  elif isinstance(score_dict, float):
 
343
  metric,
344
  metric_key,
345
  best='max',
346
+ omit_columns=True
347
  ):
348
  """Returns the scores"""
349
  import json
 
360
  proc_ds = proc_ds.map(self.ref_omit_columns, desc='omit_columns (ref)')
361
 
362
  # 3. normalize predictions and references
363
+ predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred', omit_columns=omit_columns))
364
+ references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref', omit_columns=omit_columns))
365
  eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
366
 
367
  # 4. compute amsr for given metric