ans_f1 = 72.7162551492616 correct = 559 eval_loss = -8.96630859375 incorrect = 51 similar = 154