fix indentations
app.py
CHANGED
@@ -191,43 +191,42 @@ def get_predictions_labels(model, dataloader):
 
 
 def load_data():
+    df = pd.DataFrame(columns=['Model', 'Dataset', 'SacreBLEU', 'ROUGE-2', 'METEOR', 'BERTScore', 'Accuracy', 'Weighted F1', 'Macro F1'])
+    for ds in all_datasets:
+        split = get_split(ds)
+        model = AutoModelForSeq2SeqLM.from_pretrained(get_model(ds))
+        tokenizer = AutoTokenizer.from_pretrained(get_tokenizer(ds))
 
+        processed_dataset = split.map(
+            preprocess_function,
+            batched=True,
+            remove_columns=split.column_names
+        )
+        processed_dataset.set_format('torch')
 
+        dataloader = DataLoader(processed_dataset, batch_size=4)
 
+        predictions, labels = get_predictions_labels(model, dataloader)
 
+        predicted_feedback = extract_feedback(predictions)
         predicted_labels = extract_labels(predictions)
 
         reference_feedback = [x.split('Feedback:', 1)[1].strip() for x in labels]
+        reference_labels = [x.split('Feedback:', 1)[0].strip() for x in labels]
 
+        rouge_score = rouge.compute(predictions=predicted_feedback, references=reference_feedback)['rouge2']
+        bleu_score = sacrebleu.compute(predictions=predicted_feedback, references=[[x] for x in reference_feedback])['score']
+        meteor_score = meteor.compute(predictions=predicted_feedback, references=reference_feedback)['meteor']
+        bert_score = bertscore.compute(predictions=predicted_feedback, references=reference_feedback, lang='de', model_type='bert-base-multilingual-cased', rescale_with_baseline=True)
 
+        reference_labels_np = np.array(reference_labels)
+        accuracy_value = accuracy_score(reference_labels_np, predicted_labels)
+        f1_weighted_value = f1_score(reference_labels_np, predicted_labels, average='weighted')
+        f1_macro_value = f1_score(reference_labels_np, predicted_labels, average='macro', labels=['Incorrect', 'Partially correct', 'Correct'])
 
+        new_row = pd.DataFrame([{"Model": get_model(ds), "Dataset": ds, "SacreBLEU": bleu_score, "ROUGE-2": rouge_score, "METEOR": meteor_score, "BERTScore": bert_score, "Accuracy": accuracy_value, "Weighted F1": f1_weighted_value, "Macro F1": f1_macro_value}])
 
+        df = pd.concat([df, new_row], ignore_index=True)
     return df
 
 dataframe = load_data()
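For context, `get_predictions_labels(model, dataloader)` is defined above this hunk and is not part of the change. A minimal sketch of what such a helper typically looks like for a seq2seq model, assuming the batches carry `input_ids`, `attention_mask`, and `labels` keys and that decoding uses the module-level `tokenizer` that `load_data()` reassigns per dataset (the Space's actual implementation may differ):

# Hypothetical sketch of get_predictions_labels; the real function is defined
# above this hunk and is not shown in the diff.
import torch

def get_predictions_labels(model, dataloader):
    predictions, labels = [], []
    model.eval()
    with torch.no_grad():
        for batch in dataloader:
            generated = model.generate(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
            )
            # Replace any -100 loss-masking padding before decoding (assumed here).
            label_ids = batch['labels'].masked_fill(
                batch['labels'] == -100, tokenizer.pad_token_id
            )
            predictions += tokenizer.batch_decode(generated, skip_special_tokens=True)
            labels += tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return predictions, labels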
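`extract_feedback` and `extract_labels` are likewise defined outside the hunk. Given that the references are split on the 'Feedback:' marker into a label prefix and a feedback suffix, plausible counterparts for the model outputs could look like this (a sketch under that assumption, not the Space's actual code):

# Hypothetical helpers mirroring the 'Feedback:' split applied to the references;
# the real implementations live elsewhere in app.py.
def extract_labels(predictions):
    # Text before 'Feedback:' is the verification label, e.g. 'Correct'.
    return [x.split('Feedback:', 1)[0].strip() for x in predictions]

def extract_feedback(predictions):
    # Text after 'Feedback:' is the generated feedback; fall back to the
    # whole string if the model omitted the marker.
    return [x.split('Feedback:', 1)[1].strip() if 'Feedback:' in x else x.strip()
            for x in predictions]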
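The hunk also assumes module-level metric objects named `rouge`, `sacrebleu`, `meteor`, and `bertscore`. Their `.compute(...)` signatures match Hugging Face's `evaluate` library, so the setup elsewhere in app.py presumably resembles:

# Presumed metric setup matching the .compute() calls in load_data();
# the actual loading code is outside this hunk.
import evaluate

rouge = evaluate.load('rouge')          # .compute(...)['rouge2']
sacrebleu = evaluate.load('sacrebleu')  # .compute(...)['score']; references as list-of-lists
meteor = evaluate.load('meteor')        # .compute(...)['meteor']
bertscore = evaluate.load('bertscore')  # .compute(...) returns per-example precision/recall/f1 lists

Note that unlike the other three scores, `bertscore.compute` returns per-example lists rather than a single float, so the 'BERTScore' cell stores that whole result as committed.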