SondosMB committed
Commit 5d5c6ec · verified · 1 Parent(s): e4f66e8

Update app.py

Files changed (1):
  1. app.py +47 -43
app.py CHANGED
@@ -4,9 +4,6 @@ import os
 import re
 from datetime import datetime
 
-# Leaderboard Data (example CSV file for leaderboard)
-LEADERBOARD_FILE = "leaderboard.csv"
-
 def clean_answer(answer):
     if pd.isna(answer):
         return None
@@ -18,21 +15,38 @@ def clean_answer(answer):
         return first_letter
     return None
 
-def update_leaderboard(results):
-    # Append results to leaderboard file
-    new_entry = {
-        "Model Name": results['model_name'],
-        "Overall Accuracy": f"{results['overall_accuracy']:.2%}",
-        "Valid Accuracy": f"{results['valid_accuracy']:.2%}",
-        "Correct Predictions": results['correct_predictions'],
-        "Total Questions": results['total_questions'],
-        "Timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-    }
-    leaderboard_df = pd.DataFrame([new_entry])
-    if os.path.exists(LEADERBOARD_FILE):
-        existing_df = pd.read_csv(LEADERBOARD_FILE)
-        leaderboard_df = pd.concat([existing_df, leaderboard_df], ignore_index=True)
-    leaderboard_df.to_csv(LEADERBOARD_FILE, index=False)
+def write_evaluation_results(results, output_file):
+    os.makedirs(os.path.dirname(output_file) if os.path.dirname(output_file) else '.', exist_ok=True)
+    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+
+    output_text = [
+        f"Evaluation Results for Model: {results['model_name']}",
+        f"Timestamp: {timestamp}",
+        "-" * 50,
+        f"Overall Accuracy (including invalid): {results['overall_accuracy']:.2%}",
+        f"Accuracy (valid predictions only): {results['valid_accuracy']:.2%}",
+        f"Total Questions: {results['total_questions']}",
+        f"Valid Predictions: {results['valid_predictions']}",
+        f"Invalid/Malformed Predictions: {results['invalid_predictions']}",
+        f"Correct Predictions: {results['correct_predictions']}",
+        "\nPerformance by Field:",
+        "-" * 50
+    ]
+
+    for field, metrics in results['field_performance'].items():
+        field_results = [
+            f"\nField: {field}",
+            f"Accuracy (including invalid): {metrics['accuracy']:.2%}",
+            f"Accuracy (valid only): {metrics['valid_accuracy']:.2%}",
+            f"Correct: {metrics['correct']}/{metrics['total']}",
+            f"Invalid predictions: {metrics['invalid']}"
+        ]
+        output_text.extend(field_results)
+
+    with open(output_file, 'w') as f:
+        f.write('\n'.join(output_text))
+    print('\n'.join(output_text))
+    print(f"\nResults have been saved to: {output_file}")
 
 def evaluate_predictions(prediction_file):
     ground_truth_file = "ground_truth.csv"  # Specify the path to the ground truth file
@@ -70,7 +84,6 @@ def evaluate_predictions(prediction_file):
     total_predictions = len(merged_df)
     total_valid_predictions = len(valid_predictions)
 
-    # Ensure no division by zero
     overall_accuracy = correct_predictions / total_predictions if total_predictions > 0 else 0
     valid_accuracy = (
         correct_predictions / total_valid_predictions
@@ -114,30 +127,21 @@ def evaluate_predictions(prediction_file):
     except Exception as e:
         return f"Error during evaluation: {str(e)}", None
 
-
-# Gradio Interface with Leaderboard
-def display_leaderboard():
-    if not os.path.exists(LEADERBOARD_FILE):
-        return "Leaderboard is empty."
-    leaderboard_df = pd.read_csv(LEADERBOARD_FILE)
-    return leaderboard_df.to_markdown(index=False)
-
-demo = gr.Blocks()
-
-with demo:
-    gr.Markdown("# Prediction Evaluation Tool with Leaderboard")
-    with gr.Tab("Evaluate"):
-        file_input = gr.File(label="Upload Prediction CSV")
-        eval_status = gr.Textbox(label="Evaluation Status")
-        eval_results_file = gr.File(label="Download Evaluation Results")
-        eval_button = gr.Button("Evaluate")
-        eval_button.click(
-            evaluate_predictions, inputs=file_input, outputs=[eval_status, eval_results_file]
-        )
-    with gr.Tab("Leaderboard"):
-        leaderboard_text = gr.Textbox(label="Leaderboard", interactive=False)
-        refresh_button = gr.Button("Refresh Leaderboard")
-        refresh_button.click(display_leaderboard, outputs=leaderboard_text)
+# Gradio Interface
+description = "Upload a prediction CSV file to evaluate predictions against the ground truth stored in the system."
+
+demo = gr.Interface(
+    fn=evaluate_predictions,
+    inputs=[
+        gr.File(label="Upload Prediction CSV")
+    ],
+    outputs=[
+        gr.Textbox(label="Evaluation Status"),
+        gr.File(label="Download Evaluation Results")
+    ],
+    title="Prediction Evaluation Tool",
+    description=description
+)
 
 if __name__ == "__main__":
     demo.launch()
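
A minimal usage sketch of the new write_evaluation_results helper outside the Gradio app, assuming app.py is importable and that pandas and gradio are installed. The dict keys mirror exactly what the function reads in the diff above; the concrete values, the "demo-model" name, the "History" field, and the results.txt output path are illustrative only, not part of this commit.

# Illustrative sketch only: keys below mirror what write_evaluation_results
# reads in the diff above; values, model name, field name, and output path
# are hypothetical.
from app import write_evaluation_results  # assumes app.py is on the import path

example_results = {
    "model_name": "demo-model",
    "overall_accuracy": 0.75,   # fractions, rendered as percentages via :.2%
    "valid_accuracy": 0.80,
    "total_questions": 100,
    "valid_predictions": 94,
    "invalid_predictions": 6,
    "correct_predictions": 75,
    "field_performance": {
        "History": {            # hypothetical field name
            "accuracy": 0.70,
            "valid_accuracy": 0.74,
            "correct": 35,
            "total": 50,
            "invalid": 3,
        },
    },
}

write_evaluation_results(example_results, "results.txt")

On the interface side, the new gr.Interface wiring implies that evaluate_predictions returns a two-element tuple: a status string for the Textbox output and a results file path (or None, as in the except branch) for the downloadable File output.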