DoctorSlimm committed · Commit 001c9b9 · 1 parent: 0d3bd5e

save

bangalore_score.py (+35 -7)
bangalore_score.py
CHANGED
@@ -105,19 +105,36 @@ class Bangalore_Score(evaluate.Metric):
         # evaluate.load('ncoop57/levenshtein_distance')
         pass
 
+    # todo: compartmentalize... and soften
+    # * extract_records(text, return_df=False, omit_columns=[])
+    # * preprocess_fn(example), omit, extract, ...
+
     def normalize_fn(
         self,
         example,
         text_field='text',
         unk_token='Not Disclosed',
+        omit_columns=True,
         return_df=False
     ):
         """
-
-
+        Normalize Markdown Text String to rtext or DataFrame
+        * fill NaNs with unk_token
+        * assumes markdown table format
+        * assumes headers are present
+        * assumes headers are unique
+        * does not handle special tokens
+        Normalization:
+        * columns -> pd.DataFrame(x, columns=sorted(x.columns))
+        * rows -> pd.DataFrame.sort_values(by=sorted(x.columns))
+        * replaces -> NaNs with unk_token
+        * omit -> columns with text > max_chars
         :param example:
+            {
+                'text': <markdown table>,
+                'omit_columns': <list of columns to omit>
+            }
         :return:
-        Note: this does not handle special tokens
         expected input format:
 
         | col1 | col2 | col3 | <- start and trailing pipes required
@@ -129,7 +146,10 @@ class Bangalore_Score(evaluate.Metric):
 
         records = []
         rows_text = unk_token
-
+        if omit_columns:
+            omit_columns = example.get('omit_columns', [])
+        else:
+            omit_columns = []
 
         text = dict(example)[text_field]
         text = text.strip()
@@ -186,6 +206,8 @@ class Bangalore_Score(evaluate.Metric):
         csv_norm = df.to_csv(index=False, sep='|')
         csv_norm = csv_norm.replace('|', ' | ')  # add spaces around pipes
         csv_norm = csv_norm.replace('\r', '')  # remove carriage returns
+        csv_norm = csv_norm.replace('\t', ' ')  # remove tabs
+
         # only rows text
         rows_text = csv_norm.split('\n')[1:-1]
         rows_text = '\n'.join(rows_text).strip()
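Reviewer note: the docstring added in the first hunk plus the to_csv cleanup in this hunk describe the whole normalization. As a reading aid, here is a minimal standalone sketch of it, assuming pandas; the function name normalize_markdown_table, the pipe-table parsing, and the separator-row filter are illustrative reconstructions rather than the module's actual code (the max_chars column omission is left out):

```python
import pandas as pd

def normalize_markdown_table(text, unk_token='Not Disclosed', omit_columns=()):
    # illustrative reconstruction of normalize_fn's steps, not the module's code
    # split the pipe table into lines and drop the | --- | --- | separator row
    lines = [ln for ln in text.strip().split('\n') if ln.strip()]
    lines = [ln for ln in lines if set(ln.replace('|', '').strip()) - set('-: ')]
    cells = [[c.strip() for c in ln.strip().strip('|').split('|')] for ln in lines]
    df = pd.DataFrame(cells[1:], columns=cells[0])
    df = df.drop(columns=[c for c in omit_columns if c in df.columns])
    df = df[sorted(df.columns)]                       # columns -> sorted(x.columns)
    df = df.sort_values(by=sorted(df.columns))        # rows -> sort_values(by=...)
    df = df.replace('', unk_token).fillna(unk_token)  # NaNs/empties -> unk_token
    csv_norm = df.to_csv(index=False, sep='|')
    csv_norm = csv_norm.replace('|', ' | ').replace('\r', '').replace('\t', ' ')
    return '\n'.join(csv_norm.split('\n')[1:-1]).strip()  # rows only, header dropped

print(normalize_markdown_table('| b | a |\n| --- | --- |\n| 2 | 1 |\n| 4 |  |'))
# Not Disclosed | 4
# 1 | 2
```

Sorting both the columns and the rows before serializing is what makes the comparison order-insensitive: two tables with the same records in a different layout normalize to the same rows text.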
@@ -244,7 +266,12 @@ class Bangalore_Score(evaluate.Metric):
         """
         scores_list = []
         for ref, pred in zip([reference]*len(predictions), predictions):
-            score_dict = metric.compute(
+            score_dict = metric.compute(
+                references=[ref],
+                predictions=[pred],
+                # ignore_case=True,
+                # ignore_punctuation=True,
+            )
             if isinstance(score_dict, dict):
                 score = score_dict.get(metric_key, 0)
             elif isinstance(score_dict, float):
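Reviewer note: reflowing compute() onto multiple lines leaves room for per-metric keyword arguments; ignore_case and ignore_punctuation (commented out above) are accepted by, for example, evaluate's exact_match metric. A hedged sketch of the surrounding best-of-n loop, with the wrapper name best_score as an illustrative assumption:

```python
import evaluate

def best_score(reference, predictions, metric, metric_key, best='max'):
    # sketch; the wrapper name and structure are illustrative
    scores_list = []
    for ref, pred in zip([reference] * len(predictions), predictions):
        score_dict = metric.compute(references=[ref], predictions=[pred])
        # evaluate metrics return either a dict keyed by metric name or a bare float
        if isinstance(score_dict, dict):
            score = score_dict.get(metric_key, 0)
        elif isinstance(score_dict, float):
            score = score_dict
        else:
            score = 0
        scores_list.append(score)
    return max(scores_list) if best == 'max' else min(scores_list)

metric = evaluate.load('exact_match')
print(best_score('1 | 2', ['1 | 2', '1|2'], metric, 'exact_match'))  # 1.0
```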
@@ -316,6 +343,7 @@ class Bangalore_Score(evaluate.Metric):
         metric,
         metric_key,
         best='max',
+        omit_columns=True
     ):
         """Returns the scores"""
         import json
@@ -332,8 +360,8 @@ class Bangalore_Score(evaluate.Metric):
         proc_ds = proc_ds.map(self.ref_omit_columns, desc='omit_columns (ref)')
 
         # 3. normalize predictions and references
-        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred'))
-        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref'))
+        predictions_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='pred', omit_columns=omit_columns))
+        references_ds = proc_ds.map(lambda x: self.normalize_fn(x, text_field='ref', omit_columns=omit_columns))
         eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
 
         # 4. compute amsr for given metric
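Reviewer note: end to end, step 3 maps the normalizer over both text fields and gathers the normalized row text. A small sketch with datasets, reusing normalize_markdown_table from the earlier sketch and assuming, as the predictions_ds['rtext'] access implies, that normalize_fn returns its result under an 'rtext' key:

```python
from datasets import Dataset

def normalize_fn(example, text_field='text', omit_columns=True):
    # per-example column omission comes from an optional 'omit_columns' field
    cols = example.get('omit_columns', []) if omit_columns else []
    # 'rtext' output key assumed from the diff's predictions_ds['rtext'] access
    return {'rtext': normalize_markdown_table(example[text_field], omit_columns=cols)}

proc_ds = Dataset.from_dict({
    'pred': ['| a | b |\n| --- | --- |\n| 1 | 2 |'],
    'ref':  ['| b | a |\n| --- | --- |\n| 2 | 1 |'],
})
predictions_ds = proc_ds.map(lambda x: normalize_fn(x, text_field='pred'))
references_ds = proc_ds.map(lambda x: normalize_fn(x, text_field='ref'))
eval_data = {'pred': predictions_ds['rtext'], 'ref': references_ds['rtext']}
print(eval_data)  # {'pred': ['1 | 2'], 'ref': ['1 | 2']}, column order no longer matters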
|