glitchbench committed on
Commit
da7ea76
1 Parent(s): cbd4a8c

Upload 2 files

Files changed (2)
  1. app.py +69 -0
  2. raw_outputs.pkl +3 -0
app.py ADDED
@@ -0,0 +1,69 @@
+import gradio as gr
+import pandas as pd
+
+
+def load_and_process_data(file_path):
+    # Load the leaderboard data
+    df = pd.read_pickle(file_path)
+
+    # Group by 'lmm' and 'question' to calculate mean accuracy
+    accuracy_df = (
+        df.groupby(["lmm", "question"])["accepted_by_judge"].mean().reset_index()
+    )
+    accuracy_df = accuracy_df.rename(columns={"accepted_by_judge": "accuracy"})
+    accuracy_df["accuracy"] = (accuracy_df["accuracy"] * 100).round(1)
+
+    # Group by 'lmm' to count the distinct images evaluated per model
+    image_count_df = df.groupby("lmm")["image"].nunique().reset_index()
+    image_count_df = image_count_df.rename(columns={"image": "Total Images"})
+
+    return accuracy_df, image_count_df
+
+
+def expand_and_format_df(accuracy_df, image_count_df):
+    # Pivot so each question becomes a column, then add a per-model average
+    expanded_df = accuracy_df.pivot(index="lmm", columns="question", values="accuracy")
+    expanded_df["Average"] = expanded_df.mean(axis=1).round(1)
+    expanded_df = expanded_df.sort_values(by="Average", ascending=False).reset_index()
+    expanded_df.columns.name = None
+
+    # Merge in the 'Total Images' column
+    final_df = pd.merge(expanded_df, image_count_df, on="lmm")
+
+    return final_df.rename(columns={"lmm": "Model"})
+
+
+def map_model_names(df, name_dict):
+    # Map model names using the provided dictionary
+    df["Model"] = df["Model"].map(name_dict)
+    return df
+
+
+# Dictionary for renaming models
+name_dict = {
+    "gpt4v": "GPT-4V(ision)",
+    "llava": "LLaVA-1.5-13B",
+    "llava-7b": "LLaVA-1.5-7B",
+    "Long-SPHINX": "Long-SPHINX",
+    "SPHINX": "SPHINX",
+    "OtterHD": "OtterHD",
+    "minigpt4v2": "MiniGPT4v2",
+    "InstructBLIP-13B": "InstructBLIP-13B",
+    "InstructBLIP": "InstructBLIP-7B",
+    "qwen": "Qwen-VL-Chat",
+    "fuyu-8b": "Fuyu-8B",
+}
+
+# Processing steps
+accuracy_df, image_count_df = load_and_process_data("raw_outputs.pkl")
+final_df = expand_and_format_df(accuracy_df, image_count_df)
+final_df = map_model_names(final_df, name_dict)
+
+
+# Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("# GlitchBench Leaderboard")
+    with gr.Row():
+        gr.Dataframe(final_df)
+
+demo.launch()
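
Since raw_outputs.pkl is tracked with Git LFS (see below) and its contents are not visible in this diff, here is a minimal sketch of how the pipeline above could be smoke-tested locally. It assumes only what the code itself implies: one row per (model, question, image) judgment with columns lmm, question, accepted_by_judge, and image. The toy rows and the fake_outputs.pkl filename are made up for illustration.

# Minimal sketch: exercise the leaderboard pipeline with synthetic data.
# Assumes the functions defined in app.py above are in scope.
import pandas as pd

fake = pd.DataFrame(
    {
        "lmm": ["gpt4v", "gpt4v", "llava", "llava"],
        "question": ["q1", "q2", "q1", "q2"],
        "accepted_by_judge": [True, False, True, True],
        "image": ["img_001", "img_002", "img_001", "img_002"],
    }
)
fake.to_pickle("fake_outputs.pkl")  # hypothetical filename, not the real LFS file

acc_df, img_df = load_and_process_data("fake_outputs.pkl")
print(expand_and_format_df(acc_df, img_df))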
raw_outputs.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1777b3c9404d0d8ebbe286fd42114767e70f19c428af736c64273bc414af25e
+ size 22207169
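
The pickle itself is not committed directly; the three lines above are a Git LFS pointer recording the LFS spec version, the SHA-256 of the real file, and its size in bytes (~22 MB). As a rough sketch, a downloaded copy could be checked against that oid with hashlib; the local file path here is an assumption.

# Sketch: verify a locally downloaded raw_outputs.pkl against the SHA-256
# recorded in the LFS pointer above. The file path is assumed.
import hashlib

EXPECTED = "b1777b3c9404d0d8ebbe286fd42114767e70f19c428af736c64273bc414af25e"

sha = hashlib.sha256()
with open("raw_outputs.pkl", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

print("OK" if sha.hexdigest() == EXPECTED else "checksum mismatch")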