Update app.py
app.py CHANGED
@@ -37,6 +37,70 @@ columns = ["Model Configuration", "tinyArc", "tinyHellaswag", "tinyMMLU", "tinyT
 # Convert to DataFrame
 df_full = pd.DataFrame(data_full, columns=columns)
 
+
+def plot_average_scores():
+    df_full["Average Score"] = df_full.iloc[:, 1:].mean(axis=1)
+    df_avg_sorted = df_full.sort_values(by="Average Score", ascending=False)
+
+    plt.figure(figsize=(12, 8))
+    plt.barh(df_avg_sorted["Model Configuration"], df_avg_sorted["Average Score"])
+    plt.title("Average Performance of Models Across Tasks", fontsize=16)
+    plt.xlabel("Average Score", fontsize=14)
+    plt.ylabel("Model Configuration", fontsize=14)
+    plt.gca().invert_yaxis()
+    plt.grid(axis='x', linestyle='--', alpha=0.7)
+    plt.tight_layout()
+    plt.savefig("average_performance.png")
+    return "average_performance.png"
+
+def plot_task_performance():
+    df_full_melted = df_full.melt(id_vars="Model Configuration", var_name="Task", value_name="Score")
+
+    plt.figure(figsize=(14, 10))
+    for model in df_full["Model Configuration"]:
+        model_data = df_full_melted[df_full_melted["Model Configuration"] == model]
+        plt.plot(model_data["Task"], model_data["Score"], marker="o", label=model)
+
+    plt.title("Performance of All Models Across Tasks", fontsize=16)
+    plt.xlabel("Task", fontsize=14)
+    plt.ylabel("Score", fontsize=14)
+    plt.xticks(rotation=45)
+    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=9)
+    plt.grid(axis='y', linestyle='--', alpha=0.7)
+    plt.tight_layout()
+    plt.savefig("task_performance.png")
+    return "task_performance.png"
+
+def plot_task_specific_top_models():
+    top_models = df_full.iloc[:, :-1].set_index("Model Configuration").idxmax()
+    top_scores = df_full.iloc[:, :-1].set_index("Model Configuration").max()
+
+    results = pd.DataFrame({"Top Model": top_models, "Score": top_scores}).reset_index().rename(columns={"index": "Task"})
+
+    plt.figure(figsize=(12, 6))
+    plt.bar(results["Task"], results["Score"])
+    plt.title("Task-Specific Top Models", fontsize=16)
+    plt.xlabel("Task", fontsize=14)
+    plt.ylabel("Score", fontsize=14)
+    plt.grid(axis="y", linestyle="--", alpha=0.7)
+    plt.tight_layout()
+    plt.savefig("task_specific_top_models.png")
+    return "task_specific_top_models.png"
+
+def top_3_models_per_task():
+    top_3_data = {
+        task: df_full.nlargest(3, task)[["Model Configuration", task]].values.tolist()
+        for task in df_full.columns[1:-1]
+    }
+    top_3_results = pd.DataFrame({
+        task: {
+            "Top 3 Models": [entry[0] for entry in top_3_data[task]],
+            "Scores": [entry[1] for entry in top_3_data[task]],
+        }
+        for task in top_3_data
+    }).T.rename_axis("Task").reset_index()
+    return top_3_results
+
 def summary_statistics():
     stats = df_full.iloc[:, 1:].describe().T  # Summary stats for each task
     stats['Std Dev'] = df_full.iloc[:, 1:].std(axis=0)
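A note on the additions above: plot_task_specific_top_models and top_3_models_per_task slice off the last column (iloc[:, :-1] and columns[1:-1]) on the assumption that the derived "Average Score" column sits at the end of df_full; that column only exists after plot_average_scores has run, so clicking those buttons first silently drops the last task from the analysis instead. A minimal defensive sketch, assuming the column names used in this commit (the task_frame helper is illustrative, not part of the diff):

import pandas as pd

def task_frame(df: pd.DataFrame) -> pd.DataFrame:
    # Drop the derived column only if present; errors="ignore" makes this
    # a no-op before plot_average_scores() has ever run.
    return (
        df.drop(columns=["Average Score"], errors="ignore")
          .set_index("Model Configuration")
    )

# Inside plot_task_specific_top_models() this would replace both
# iloc[:, :-1] slices:
#     scores = task_frame(df_full)
#     top_models = scores.idxmax()
#     top_scores = scores.max()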
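For reference, the nlargest pattern in top_3_models_per_task yields, per task, a list of [model, score] pairs in descending score order. A tiny self-contained run, with made-up scores purely for illustration:

import pandas as pd

toy = pd.DataFrame({
    "Model Configuration": ["A", "B", "C", "D"],
    "tinyArc":  [0.61, 0.58, 0.64, 0.55],
    "tinyMMLU": [0.42, 0.47, 0.45, 0.40],
})

top_3 = {
    task: toy.nlargest(3, task)[["Model Configuration", task]].values.tolist()
    for task in toy.columns[1:]  # no trailing "Average Score" column here, so no -1
}
print(top_3["tinyArc"])  # [['C', 0.64], ['A', 0.61], ['B', 0.58]]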
@@ -68,8 +132,28 @@ def plot_heatmap():
     return "performance_heatmap.png"
 
 with gr.Blocks() as demo:
-    gr.Markdown("#
+    gr.Markdown("# Model Performance Analysis")
 
+    with gr.Row():
+        btn1 = gr.Button("Show Average Performance")
+        img1 = gr.Image(type="filepath")
+        btn1.click(plot_average_scores, outputs=img1)
+
+    with gr.Row():
+        btn2 = gr.Button("Show Task Performance")
+        img2 = gr.Image(type="filepath")
+        btn2.click(plot_task_performance, outputs=img2)
+
+    with gr.Row():
+        btn3 = gr.Button("Task-Specific Top Models")
+        img3 = gr.Image(type="filepath")
+        btn3.click(plot_task_specific_top_models, outputs=img3)
+
+    with gr.Row():
+        btn4 = gr.Button("Top 3 Models Per Task")
+        output4 = gr.Dataframe()
+        btn4.click(top_3_models_per_task, outputs=output4)
+
     with gr.Row():
         btn1 = gr.Button("Show Summary Statistics")
         stats_output = gr.Dataframe()
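All four new rows follow the same button-to-output wiring. Reusing the name btn1 for the pre-existing summary-statistics button is harmless, since Gradio binds each handler at the moment .click() is called, though a distinct name would read better. A stripped-down, runnable sketch of the pattern with illustrative names: forcing the non-interactive Agg backend avoids display errors on a headless Space, and closing each figure after saving keeps repeated clicks from accumulating open figures, which the functions in this commit do not yet do.

import gradio as gr
import matplotlib
matplotlib.use("Agg")  # headless backend for server-side rendering
import matplotlib.pyplot as plt

def demo_plot():
    plt.figure(figsize=(6, 4))
    plt.bar(["task1", "task2"], [0.5, 0.7])
    plt.tight_layout()
    plt.savefig("demo.png")
    plt.close()  # release the figure between clicks
    return "demo.png"

with gr.Blocks() as demo:
    btn = gr.Button("Plot")
    img = gr.Image(type="filepath")
    btn.click(demo_plot, outputs=img)

if __name__ == "__main__":
    demo.launch()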