DoctorSlimm committed on
Commit
cad74f0
·
1 Parent(s): 2f5d16f

Correct mismatched column names by using unique headers for the DataFrame

Browse files
Files changed (1) hide show
  1. bangalore_score.py +8 -10
bangalore_score.py CHANGED
@@ -126,9 +126,9 @@ class Bangalore_Score(evaluate.Metric):
126
 
127
  # parse headers
128
  if len(rows) > 0:
129
- headers_row = rows[0].strip().strip('|') # remove trailing pipes
130
  headers_list = [x.strip() for x in headers_row.split('|')] # split by pipes and remove whitespace
131
- headers_text = ' '.join(sorted(headers_list)) # join headers
132
 
133
  # try parse records
134
  if len(rows) > 2:
@@ -145,7 +145,7 @@ class Bangalore_Score(evaluate.Metric):
145
  pass
146
 
147
  # normalize rows (set column order with sorted headers)
148
- sorted_headers = sorted(headers_list)
149
  df = pd.DataFrame(records, columns=sorted_headers) # create dataframe
150
  df = df.sort_values(by=sorted_headers) # sort rows
151
  # csv
@@ -166,22 +166,21 @@ class Bangalore_Score(evaluate.Metric):
166
  'num_rows': [len(records)],
167
  }
168
 
169
- ### run computations ###
170
  pred_ds = Dataset.from_dict({'text': predictions})
171
  refs_ds = Dataset.from_dict({'text': references})
172
  proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
173
  proc_ds = proc_ds.map(normalize_fn, num_proc=num_proc)
174
 
 
 
 
 
175
  # exact_match (headers)
176
  exact_match_headers = evaluate.load('exact_match').compute(
177
  predictions=proc_ds['predictions']['headers_text'], references=proc_ds['references']['headers_text']
178
  )['exact_match']
179
 
180
- # charcut (data)
181
- # charcut = evaluate.load('BramVanroy/CharCut').compute(
182
- # predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
183
- # )['charcut_mt']
184
-
185
  # bleu (data)
186
  bleu = evaluate.load('bleu').compute(
187
  predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
@@ -197,7 +196,6 @@ class Bangalore_Score(evaluate.Metric):
197
  predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
198
  )['levenshtein_distance']
199
 
200
-
201
  # row accuracy (num rows)
202
  # row_accuracy = evaluate.load('accuracy').compute(
203
  # predictions=proc_ds['predictions']['num_rows'], references=proc_ds['references']['num_rows']
 
126
 
127
  # parse headers
128
  if len(rows) > 0:
129
+ headers_row = rows[0].strip().strip('|') # remove trailing pipes
130
  headers_list = [x.strip() for x in headers_row.split('|')] # split by pipes and remove whitespace
131
+ headers_text = ' '.join(sorted(headers_list)) # join headers
132
 
133
  # try parse records
134
  if len(rows) > 2:
 
145
  pass
146
 
147
  # normalize rows (set column order with sorted headers)
148
+ sorted_headers = sorted(set(list(headers_list)))
149
  df = pd.DataFrame(records, columns=sorted_headers) # create dataframe
150
  df = df.sort_values(by=sorted_headers) # sort rows
151
  # csv
 
166
  'num_rows': [len(records)],
167
  }
168
 
169
+ ### normalize ###
170
  pred_ds = Dataset.from_dict({'text': predictions})
171
  refs_ds = Dataset.from_dict({'text': references})
172
  proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
173
  proc_ds = proc_ds.map(normalize_fn, num_proc=num_proc)
174
 
175
+
176
+ ### compute metrics ###
177
+ # https://huggingface.co/evaluate-metric
178
+
179
  # exact_match (headers)
180
  exact_match_headers = evaluate.load('exact_match').compute(
181
  predictions=proc_ds['predictions']['headers_text'], references=proc_ds['references']['headers_text']
182
  )['exact_match']
183
 
 
 
 
 
 
184
  # bleu (data)
185
  bleu = evaluate.load('bleu').compute(
186
  predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
 
196
  predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
197
  )['levenshtein_distance']
198
 
 
199
  # row accuracy (num rows)
200
  # row_accuracy = evaluate.load('accuracy').compute(
201
  # predictions=proc_ds['predictions']['num_rows'], references=proc_ds['references']['num_rows']