DoctorSlimm committed on
Commit
8b5d1da
·
1 Parent(s): 4b03595
__pycache__/kaushiks_criteria.cpython-39.pyc ADDED
Binary file (3.13 kB). View file
 
kaushiks_criteria.py CHANGED
@@ -97,12 +97,84 @@ class kaushiks_criteria(evaluate.Metric):
97
  def _download_and_prepare(self, dl_manager):
98
  """Optional: download external resources useful to compute the scores"""
99
  # TODO: Download external resources if needed
 
 
100
  pass
101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  def _compute(self, predictions, references):
103
- """Returns the scores"""
104
- # TODO: Compute the different scores of the module
105
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  return {
107
- "accuracy": accuracy,
 
 
108
  }
 
97
  def _download_and_prepare(self, dl_manager):
98
  """Optional: download external resources useful to compute the scores"""
99
  # TODO: Download external resources if needed
100
+ import evaluate
101
+ evaluate.load('exact_match')
102
  pass
103
 
104
def normalize_fn(self, example, text_field='text'):
    """Summarize the layout of a pipe-delimited table held in ``example``.

    Expected input format::

        | col1 | col2 | col3 |   <- leading and trailing pipes required
        | ---- | ---- | ---- |   <- separator row (kept verbatim, not validated)
        | val1 | val2 | val3 |
        | ...  | ...  | ...  |

    The text is parsed line by line, so incomplete rows are tolerated.
    Note: this does not handle special tokens.

    Args:
        example: mapping (or anything ``dict()`` accepts) holding the table
            text under ``text_field``.
        text_field: key of the table text inside ``example``.

    Returns:
        A dict of three strings:
            ``headers``    -- the first line, whitespace-stripped.
            ``sep_row``    -- the second line, whitespace-stripped
                              ("" when absent).
            ``row_counts`` -- comma-separated cell count of every line after
                              the second ("" when there are none).
    """
    # str.split always yields at least one element, so lines[0] is safe
    # even for empty text.
    lines = dict(example)[text_field].strip().split('\n')

    headers = lines[0].strip()
    sep_row = lines[1].strip() if len(lines) > 1 else ""

    # Strip surrounding whitespace (trailing spaces, '\r') before the outer
    # pipes; otherwise the closing pipe survives and adds a phantom cell.
    row_counts = [
        str(len(line.strip().strip('|').split('|')))
        for line in lines[2:]
    ]

    return {
        'headers': headers,
        'sep_row': sep_row,
        # Delimit the counts: joining with '' made [1, 11] and [11, 1]
        # indistinguishable ("111"), producing false exact matches.
        'row_counts': ','.join(row_counts),
    }
141
+
142
  def _compute(self, predictions, references):
143
+ """
144
+ compute the quality of the output format with respect to the reference format
145
+ * column names match
146
+ * column order matches
147
+ * total row count
148
+ * number of cells in each row
149
+ :param predictions:
150
+ :param references:
151
+ :return:
152
+ """
153
+ pred_ds = Dataset.from_dict({'text': predictions})
154
+ refs_ds = Dataset.from_dict({'text': references})
155
+ proc_ds = DatasetDict({'predictions': pred_ds, 'references': refs_ds})
156
+ proc_ds = proc_ds.map(self.normalize_fn, num_proc=num_proc)
157
+
158
+ # compare headers
159
+ exact_match = evaluate.load('exact_match')
160
+ exact_match_headers = exact_match.compute(
161
+ predictions=proc_ds['predictions']['headers'],
162
+ references=proc_ds['references']['headers']
163
+ )['exact_match']
164
+
165
+ # compare separator row
166
+ exact_match_sep_row = exact_match.compute(
167
+ predictions=proc_ds['predictions']['sep_row'],
168
+ references=proc_ds['references']['sep_row']
169
+ )['exact_match']
170
+
171
+ # compare row counts
172
+ exact_match_row_counts = exact_match.compute(
173
+ predictions=proc_ds['predictions']['row_counts'],
174
+ references=proc_ds['references']['row_counts']
175
+ )['exact_match']
176
  return {
177
+ 'exact_match_headers': exact_match_headers,
178
+ 'exact_match_sep_row': exact_match_sep_row,
179
+ 'exact_match_row_counts': exact_match_row_counts,
180
  }