import gradio as gr
import pandas as pd
import os
import re
from datetime import datetime

LEADERBOARD_FILE = "leaderboard.csv"
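
# The CSV layouts below are inferred from the columns this script references; they
# are a sketch of what is expected, not a guaranteed schema. The prediction file
# needs 'question_id' and 'predicted_answer' columns, ground_truth.csv needs
# 'question_id' and 'Answer' columns, and the prediction filename is assumed to
# follow "<prefix>_<modelname>.csv" so the model name can be parsed from it.
#
# Illustrative example (hypothetical file "predictions_mymodel.csv"):
#   question_id,predicted_answer      question_id,Answer
#   1,"B"                             1,B
#   2,"(c)"                           2,C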


def clean_answer(answer):
    """Normalize a raw predicted answer to a single letter A-D, or None."""
    if pd.isna(answer):
        return None
    answer = str(answer)
    # Keep only the letters A-D (either case), then take the first one.
    clean = re.sub(r'[^A-Da-d]', '', answer)
    if clean:
        first_letter = clean[0].upper()
        if first_letter in ['A', 'B', 'C', 'D']:
            return first_letter
    return None
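
# A few concrete examples of the normalization above (hand-checked against the regex):
#   clean_answer("b)")   -> "B"
#   clean_answer(" D ")  -> "D"
#   clean_answer("42")   -> None   # no A-D letter present
#   clean_answer(None)   -> None   # missing value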


def update_leaderboard(results):
    """Append a row for this evaluation run to the leaderboard CSV."""
    new_entry = {
        "Model Name": results['model_name'],
        "Overall Accuracy": f"{results['overall_accuracy']:.2%}",
        "Valid Accuracy": f"{results['valid_accuracy']:.2%}",
        "Correct Predictions": results['correct_predictions'],
        "Total Questions": results['total_questions'],
        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }

    leaderboard_df = pd.DataFrame([new_entry])
    if os.path.exists(LEADERBOARD_FILE):
        existing_df = pd.read_csv(LEADERBOARD_FILE)
        leaderboard_df = pd.concat([existing_df, leaderboard_df], ignore_index=True)
    leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
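
# Illustrative leaderboard.csv row after one run (values are made up):
#   Model Name,Overall Accuracy,Valid Accuracy,Correct Predictions,Total Questions,Timestamp
#   mymodel,75.00%,78.95%,75,100,2024-01-01 12:00:00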


def evaluate_predictions(prediction_file):
    """Score an uploaded prediction CSV against ground_truth.csv and update the leaderboard."""
    ground_truth_file = "ground_truth.csv"
    if not prediction_file:
        return "Prediction file not uploaded", None

    if not os.path.exists(ground_truth_file):
        return "Ground truth file not found", None

    try:
        # gr.File may pass a tempfile-like object or a plain path depending on the Gradio version.
        prediction_path = prediction_file.name if hasattr(prediction_file, "name") else prediction_file
        predictions_df = pd.read_csv(prediction_path)
        ground_truth_df = pd.read_csv(ground_truth_file)

        # Derive the model name from a filename of the form "<prefix>_<modelname>.csv".
        filename = os.path.basename(prediction_path)
        model_name = filename.split('_')[1].split('.')[0] if "_" in filename else "unknown_model"

        # Keep only questions present in both files, then normalize the predictions.
        merged_df = pd.merge(predictions_df, ground_truth_df, on='question_id', how='inner')
        if merged_df.empty:
            return "No overlapping question_id values between predictions and ground truth", None

        merged_df['pred_answer'] = merged_df['predicted_answer'].apply(clean_answer)
        correct_predictions = (merged_df['pred_answer'] == merged_df['Answer']).sum()
        total_predictions = len(merged_df)
        overall_accuracy = correct_predictions / total_predictions

        # Accuracy restricted to rows whose prediction parsed to a valid A-D letter.
        valid_df = merged_df[merged_df['pred_answer'].notna()]
        valid_accuracy = (
            (valid_df['pred_answer'] == valid_df['Answer']).mean() if len(valid_df) else 0.0
        )

        results = {
            'model_name': model_name,
            'overall_accuracy': overall_accuracy,
            'valid_accuracy': valid_accuracy,
            'correct_predictions': correct_predictions,
            'total_questions': total_predictions,
        }

        update_leaderboard(results)

        return "Evaluation completed successfully! Leaderboard updated.", LEADERBOARD_FILE
    except Exception as e:
        return f"Error: {str(e)}", None


def display_leaderboard():
    if not os.path.exists(LEADERBOARD_FILE):
        return "Leaderboard is empty."
    leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
    # Note: DataFrame.to_markdown requires the optional 'tabulate' package.
    return leaderboard_df.to_markdown(index=False)


demo = gr.Blocks()

with demo:
    gr.Markdown("# Prediction Evaluation Tool with Leaderboard")

    with gr.Tab("Evaluate"):
        file_input = gr.File(label="Upload Prediction CSV")
        eval_status = gr.Textbox(label="Evaluation Status")
        eval_results_file = gr.File(label="Download Evaluation Results")
        eval_button = gr.Button("Evaluate")
        eval_button.click(
            evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
        )

    with gr.Tab("Leaderboard"):
        leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
        refresh_button = gr.Button("Refresh Leaderboard")
        refresh_button.click(display_leaderboard, outputs=leaderboard_text)


if __name__ == "__main__":
    demo.launch()