TaiMingLu committed
Commit 34660db · 1 Parent(s): cbda60c

Add static model performance leaderboard for world-in-world

Files changed (2):
  1. app.py +70 -180
  2. src/leaderboard/read_evals.py +2 -0
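
The rewritten app.py no longer syncs evaluation results from the Hub; it serves the fixed table shown in the diff below. As a rough, hypothetical illustration of working with that table, here is a short pandas sketch. It assumes the STATIC_DATA/COLUMNS names introduced in this commit and that the Space's dependencies (gradio, gradio_leaderboard, pandas) are installed; importing app.py builds the Blocks but does not launch them, since launching sits behind the __main__ guard.

    import pandas as pd

    from app import LEADERBOARD_DF  # built once at import time from STATIC_DATA

    # The metric columns mix floats with "XXX"/"–" placeholders, so coerce them
    # to numbers first; unparseable entries become NaN and are skipped by max().
    acc = pd.to_numeric(LEADERBOARD_DF["Acc. ↑"], errors="coerce")

    # Best accuracy per model category in the static table.
    print(LEADERBOARD_DF.assign(acc=acc).groupby("Model Type")["acc"].max())
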
app.py CHANGED
@@ -1,204 +1,94 @@
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd
- from apscheduler.schedulers.background import BackgroundScheduler
- from huggingface_hub import snapshot_download

- from src.about import (
-     CITATION_BUTTON_LABEL,
-     CITATION_BUTTON_TEXT,
-     EVALUATION_QUEUE_TEXT,
-     INTRODUCTION_TEXT,
-     LLM_BENCHMARKS_TEXT,
-     TITLE,
- )
- from src.display.css_html_js import custom_css
- from src.display.utils import (
-     BENCHMARK_COLS,
-     COLS,
-     EVAL_COLS,
-     EVAL_TYPES,
-     AutoEvalColumn,
-     ModelType,
-     fields,
-     WeightType,
-     Precision
- )
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
- from src.submission.submit import add_new_eval
-
-
- def restart_space():
-     API.restart_space(repo_id=REPO_ID)
-
- ### Space initialisation
- try:
-     print(EVAL_REQUESTS_PATH)
-     snapshot_download(
-         repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
- try:
-     print(EVAL_RESULTS_PATH)
-     snapshot_download(
-         repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-     )
- except Exception:
-     restart_space()
-
-
- LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
- (
-     finished_eval_queue_df,
-     running_eval_queue_df,
-     pending_eval_queue_df,
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

  def init_leaderboard(dataframe):
      if dataframe is None or dataframe.empty:
          raise ValueError("Leaderboard DataFrame is empty or None.")
      return Leaderboard(
          value=dataframe,
-         datatype=[c.type for c in fields(AutoEvalColumn)],
          select_columns=SelectColumns(
-             default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
-             cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
              label="Select Columns to Display:",
          ),
-         search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
-         hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
          filter_columns=[
-             ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
-             ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
-             ColumnFilter(
-                 AutoEvalColumn.params.name,
-                 type="slider",
-                 min=0.01,
-                 max=150,
-                 label="Select the number of parameters (B)",
-             ),
-             ColumnFilter(
-                 AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
-             ),
          ],
          bool_checkboxgroup_label="Hide models",
          interactive=False,
      )

-
- demo = gr.Blocks(css=custom_css)
  with demo:
-     gr.HTML(TITLE)
-     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
-         with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
              leaderboard = init_leaderboard(LEADERBOARD_DF)

-         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-         with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-             with gr.Column():
-                 with gr.Row():
-                     gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                 with gr.Column():
-                     with gr.Accordion(
-                         f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             finished_eval_table = gr.components.Dataframe(
-                                 value=finished_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-                     with gr.Accordion(
-                         f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             running_eval_table = gr.components.Dataframe(
-                                 value=running_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-
-                     with gr.Accordion(
-                         f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                         open=False,
-                     ):
-                         with gr.Row():
-                             pending_eval_table = gr.components.Dataframe(
-                                 value=pending_eval_queue_df,
-                                 headers=EVAL_COLS,
-                                 datatype=EVAL_TYPES,
-                                 row_count=5,
-                             )
-             with gr.Row():
-                 gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-             with gr.Row():
-                 with gr.Column():
-                     model_name_textbox = gr.Textbox(label="Model name")
-                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                     model_type = gr.Dropdown(
-                         choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                         label="Model type",
-                         multiselect=False,
-                         value=None,
-                         interactive=True,
-                     )
-
-                 with gr.Column():
-                     precision = gr.Dropdown(
-                         choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                         label="Precision",
-                         multiselect=False,
-                         value="float16",
-                         interactive=True,
-                     )
-                     weight_type = gr.Dropdown(
-                         choices=[i.value.name for i in WeightType],
-                         label="Weights type",
-                         multiselect=False,
-                         value="Original",
-                         interactive=True,
-                     )
-                     base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-             submit_button = gr.Button("Submit Eval")
-             submission_result = gr.Markdown()
-             submit_button.click(
-                 add_new_eval,
-                 [
-                     model_name_textbox,
-                     base_model_name_textbox,
-                     revision_name_textbox,
-                     precision,
-                     weight_type,
-                     model_type,
-                 ],
-                 submission_result,
-             )
-
-     with gr.Row():
-         with gr.Accordion("📙 Citation", open=False):
-             citation_button = gr.Textbox(
-                 value=CITATION_BUTTON_TEXT,
-                 label=CITATION_BUTTON_LABEL,
-                 lines=20,
-                 elem_id="citation-button",
-                 show_copy_button=True,
-             )
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=1800)
- scheduler.start()
- demo.queue(default_concurrency_limit=40).launch()
 
  import gradio as gr
  from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
  import pandas as pd

+ # Static data
+ STATIC_DATA = [
+     ["VLM", "w/o WM", "–", "RGB", "72B", 50.27, 6.24],
+     ["Image Gen.", "PathDreamer [36]", "Viewpoint", "RGB-D; Sem; Pano", "0.69B", 56.99, 5.28],
+     ["Image Gen.", "SE3DS [11]", "Viewpoint", "RGB-D; Pano", "1.1B", 57.53, 5.29],
+     ["Video Gen.", "NWM [25]", "Trajectory", "RGB", "1B", 57.35, 5.68],
+     ["Video Gen.", "SVD [6]", "Image", "RGB", "1.5B", 57.71, 5.29],
+     ["Video Gen.", "LTX-Video [5]", "Text", "RGB", "2B", 56.08, 5.37],
+     ["Video Gen.", "Hunyuan [4]", "Text", "RGB", "13B", 57.71, 5.21],
+     ["Video Gen.", "Wan2.1 [23]", "Text", "RGB", "14B", 58.26, 5.24],
+     ["Video Gen.", "Cosmos [1]", "Text", "RGB", "2B", 52.27, 5.898],
+     ["Video Gen.", "Runway", "Text", "–", "–", "–", "–"],
+     ["Video Gen. Post-Train", "SVD† [6]", "Action", "RGB; Pano", "1.5B", 60.98, 5.02],
+     ["Video Gen. Post-Train", "LTX† [5]", "Action", "RGB; Pano", "2B", 57.53, 5.49],
+     ["Video Gen. Post-Train", "WAN2.1† [23]", "Action", "RGB; Pano", "14B", "XXX", "XXX"],
+     ["Video Gen. Post-Train", "Cosmos† [1]", "Action", "RGB; Pano", "2B", 60.25, 5.08],
+ ]
+
+ COLUMNS = ["Model Type", "Method", "Control Type", "Input Type", "#Param.", "Acc. ↑", "Mean Traj. ↓"]
+ LEADERBOARD_DF = pd.DataFrame(STATIC_DATA, columns=COLUMNS)
+
+ # Custom CSS (simplified)
+ custom_css = """
+ /* Add any custom styling here */
+ .gradio-container {
+     max-width: 1200px !important;
+ }
+ """

  def init_leaderboard(dataframe):
      if dataframe is None or dataframe.empty:
          raise ValueError("Leaderboard DataFrame is empty or None.")
      return Leaderboard(
          value=dataframe,
+         datatype=["str", "str", "str", "str", "str", "number", "number"],
          select_columns=SelectColumns(
+             default_selection=COLUMNS,
+             cant_deselect=["Model Type", "Method", "Acc. ↑"],
              label="Select Columns to Display:",
          ),
+         search_columns=["Model Type", "Method"],
+         hide_columns=[],
          filter_columns=[
+             ColumnFilter("Model Type", type="checkboxgroup", label="Model types"),
+             ColumnFilter("Control Type", type="checkboxgroup", label="Control types"),
+             ColumnFilter("Input Type", type="checkboxgroup", label="Input types"),
          ],
          bool_checkboxgroup_label="Hide models",
          interactive=False,
      )

+ demo = gr.Blocks(css=custom_css, title="Model Performance Leaderboard")
  with demo:
+     gr.HTML("<h1 style='text-align: center'>🏆 Model Performance Leaderboard</h1>")
+     gr.Markdown("""
+ **Performance comparison across vision-language models, image generation, and video generation models.**
+
+ 📊 **Metrics:** Acc. ↑ (Accuracy - higher is better) | Mean Traj. ↓ (Mean Trajectory error - lower is better)
+ """, elem_classes="markdown-text")

      with gr.Tabs(elem_classes="tab-buttons") as tabs:
+         with gr.TabItem("🏅 Leaderboard", elem_id="leaderboard-tab", id=0):
              leaderboard = init_leaderboard(LEADERBOARD_DF)

+         with gr.TabItem("📝 About", elem_id="about-tab", id=1):
+             gr.Markdown("""
+ # About This Leaderboard
+
+ This leaderboard showcases performance metrics across different types of AI models:
+
+ ## Model Categories
+ - **VLM**: Vision-Language Models
+ - **Image Gen.**: Image Generation Models
+ - **Video Gen.**: Video Generation Models
+ - **Video Gen. Post-Train**: Post-training specialized Video Generation Models
+
+ ## Metrics Explained
+ - **Acc. ↑**: Accuracy score (higher values indicate better performance)
+ - **Mean Traj. ↓**: Mean trajectory error (lower values indicate better performance)
+
+ ## Notes
+ - † indicates post-training specialized models
+ - XXX indicates results pending/unavailable
+ - – indicates not applicable or not available
+
+ *Results may vary across different evaluation settings and benchmarks.*
+ """, elem_classes="markdown-text")
+
+ if __name__ == "__main__":
+     demo.launch()
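
Two practical notes on the new version. First, because rows such as Runway and WAN2.1† carry "–"/"XXX" placeholders, the two metric columns end up with object dtype rather than float, which can make the "number" columns sort less predictably. A possible follow-up, not part of this commit and sketched here only as an assumption, would be to coerce them right after LEADERBOARD_DF is built:

    # Hypothetical cleanup (not in this commit): turn placeholder strings in the
    # metric columns into NaN so they behave as numbers in the Leaderboard.
    for col in ["Acc. ↑", "Mean Traj. ↓"]:
        LEADERBOARD_DF[col] = pd.to_numeric(LEADERBOARD_DF[col], errors="coerce")

Second, with the __main__ guard added at the end of the file, running python app.py launches the demo locally without the scheduler, queue repositories, or Hub token that the previous version required.
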
src/leaderboard/read_evals.py CHANGED
@@ -1,3 +1,5 @@
  import glob
  import json
  import math

+ # src/leaderboard/read_evals.py
+
  import glob
  import json
  import math