Spaces:
Sleeping
Sleeping
DoctorSlimm
committed on
Commit
·
cad74f0
1
Parent(s):
2f5d16f
correct mismatch column names unique df
Browse files- bangalore_score.py +8 -10
bangalore_score.py
CHANGED
@@ -126,9 +126,9 @@ class Bangalore_Score(evaluate.Metric):
|
|
126 |
|
127 |
# parse headers
|
128 |
if len(rows) > 0:
|
129 |
-
headers_row = rows[0].strip().strip('|')
|
130 |
headers_list = [x.strip() for x in headers_row.split('|')] # split by pipes and remove whitespace
|
131 |
-
headers_text = ' '.join(sorted(headers_list))
|
132 |
|
133 |
# try parse records
|
134 |
if len(rows) > 2:
|
@@ -145,7 +145,7 @@ class Bangalore_Score(evaluate.Metric):
|
|
145 |
pass
|
146 |
|
147 |
# normalize rows (set column order with sorted headers)
|
148 |
-
sorted_headers = sorted(headers_list)
|
149 |
df = pd.DataFrame(records, columns=sorted_headers) # create dataframe
|
150 |
df = df.sort_values(by=sorted_headers) # sort rows
|
151 |
# csv
|
@@ -166,22 +166,21 @@ class Bangalore_Score(evaluate.Metric):
|
|
166 |
'num_rows': [len(records)],
|
167 |
}
|
168 |
|
169 |
-
###
|
170 |
pred_ds = Dataset.from_dict({'text': predictions})
|
171 |
refs_ds = Dataset.from_dict({'text': references})
|
172 |
proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
|
173 |
proc_ds = proc_ds.map(normalize_fn, num_proc=num_proc)
|
174 |
|
|
|
|
|
|
|
|
|
175 |
# exact_match (headers)
|
176 |
exact_match_headers = evaluate.load('exact_match').compute(
|
177 |
predictions=proc_ds['predictions']['headers_text'], references=proc_ds['references']['headers_text']
|
178 |
)['exact_match']
|
179 |
|
180 |
-
# charcut (data)
|
181 |
-
# charcut = evaluate.load('BramVanroy/CharCut').compute(
|
182 |
-
# predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
|
183 |
-
# )['charcut_mt']
|
184 |
-
|
185 |
# bleu (data)
|
186 |
bleu = evaluate.load('bleu').compute(
|
187 |
predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
|
@@ -197,7 +196,6 @@ class Bangalore_Score(evaluate.Metric):
|
|
197 |
predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
|
198 |
)['levenshtein_distance']
|
199 |
|
200 |
-
|
201 |
# row accuracy (num rows)
|
202 |
# row_accuracy = evaluate.load('accuracy').compute(
|
203 |
# predictions=proc_ds['predictions']['num_rows'], references=proc_ds['references']['num_rows']
|
|
|
126 |
|
127 |
# parse headers
|
128 |
if len(rows) > 0:
|
129 |
+
headers_row = rows[0].strip().strip('|') # remove trailing pipes
|
130 |
headers_list = [x.strip() for x in headers_row.split('|')] # split by pipes and remove whitespace
|
131 |
+
headers_text = ' '.join(sorted(headers_list)) # join headers
|
132 |
|
133 |
# try parse records
|
134 |
if len(rows) > 2:
|
|
|
145 |
pass
|
146 |
|
147 |
# normalize rows (set column order with sorted headers)
|
148 |
+
sorted_headers = sorted(set(list(headers_list)))
|
149 |
df = pd.DataFrame(records, columns=sorted_headers) # create dataframe
|
150 |
df = df.sort_values(by=sorted_headers) # sort rows
|
151 |
# csv
|
|
|
166 |
'num_rows': [len(records)],
|
167 |
}
|
168 |
|
169 |
+
### normalize ###
|
170 |
pred_ds = Dataset.from_dict({'text': predictions})
|
171 |
refs_ds = Dataset.from_dict({'text': references})
|
172 |
proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
|
173 |
proc_ds = proc_ds.map(normalize_fn, num_proc=num_proc)
|
174 |
|
175 |
+
|
176 |
+
### compute metrics ###
|
177 |
+
# https://huggingface.co/evaluate-metric
|
178 |
+
|
179 |
# exact_match (headers)
|
180 |
exact_match_headers = evaluate.load('exact_match').compute(
|
181 |
predictions=proc_ds['predictions']['headers_text'], references=proc_ds['references']['headers_text']
|
182 |
)['exact_match']
|
183 |
|
|
|
|
|
|
|
|
|
|
|
184 |
# bleu (data)
|
185 |
bleu = evaluate.load('bleu').compute(
|
186 |
predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
|
|
|
196 |
predictions=proc_ds['predictions']['rows_text'], references=proc_ds['references']['rows_text']
|
197 |
)['levenshtein_distance']
|
198 |
|
|
|
199 |
# row accuracy (num rows)
|
200 |
# row_accuracy = evaluate.load('accuracy').compute(
|
201 |
# predictions=proc_ds['predictions']['num_rows'], references=proc_ds['references']['num_rows']
|