kexinhuang12345 committed on
Commit
6d97820
1 Parent(s): 1c85aff
app.py CHANGED
@@ -11,17 +11,23 @@ from src.about import (
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
  TITLE,
 
14
  )
15
  from src.display.css_html_js import custom_css
16
  from src.display.utils import (
17
  BENCHMARK_COLS,
18
  COLS,
 
19
  EVAL_COLS,
20
  EVAL_TYPES,
21
  NUMERIC_INTERVALS,
22
  TYPES,
23
- AutoEvalColumn,
 
24
  ModelType,
25
  fields,
26
  WeightType,
27
  Precision
@@ -50,44 +56,32 @@ except Exception:
50
  restart_space()
51
 
52
 
53
- raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
54
  leaderboard_df = original_df.copy()
55
 
56
- (
57
- finished_eval_queue_df,
58
- running_eval_queue_df,
59
- pending_eval_queue_df,
60
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
61
-
62
-
63
  # Searching and filtering
64
  def update_table(
65
  hidden_df: pd.DataFrame,
66
  columns: list,
67
- type_query: list,
68
- precision_query: str,
69
- size_query: list,
70
- show_deleted: bool,
71
  query: str,
72
  ):
73
- filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
74
- filtered_df = filter_queries(query, filtered_df)
75
  df = select_columns(filtered_df, columns)
76
  return df
77
 
78
 
79
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
80
- return df[(df[AutoEvalColumn.model.name].str.contains(query, case=False))]
81
 
82
 
83
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
84
  always_here_cols = [
85
- AutoEvalColumn.model_type_symbol.name,
86
- AutoEvalColumn.model.name,
87
  ]
88
  # We use COLS to maintain sorting
89
  filtered_df = df[
90
- always_here_cols + [c for c in COLS if c in df.columns and c in columns]
91
  ]
92
  return filtered_df
93
 
@@ -105,40 +99,39 @@ def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
105
  if len(final_df) > 0:
106
  filtered_df = pd.concat(final_df)
107
  filtered_df = filtered_df.drop_duplicates(
108
- subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
109
  )
110
 
111
  return filtered_df
112
 
113
 
114
  def filter_models(
115
- df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
116
  ) -> pd.DataFrame:
117
  # Show all models
118
  if show_deleted:
119
  filtered_df = df
120
  else: # Show only still on the hub models
121
- filtered_df = df[df[AutoEvalColumn.still_on_hub.name] == True]
122
 
123
- type_emoji = [t[0] for t in type_query]
124
- filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
125
- filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
126
 
127
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
128
- params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
129
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
130
  filtered_df = filtered_df.loc[mask]
131
 
132
  return filtered_df
133
 
134
-
135
  demo = gr.Blocks(css=custom_css)
136
  with demo:
137
  gr.HTML(TITLE)
138
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
139
 
140
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
141
- with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
142
  with gr.Row():
143
  with gr.Column():
144
  with gr.Row():
@@ -151,52 +144,27 @@ with demo:
151
  shown_columns = gr.CheckboxGroup(
152
  choices=[
153
  c.name
154
- for c in fields(AutoEvalColumn)
155
  if not c.hidden and not c.never_hidden
156
  ],
157
  value=[
158
  c.name
159
- for c in fields(AutoEvalColumn)
160
  if c.displayed_by_default and not c.hidden and not c.never_hidden
161
  ],
162
  label="Select columns to show",
163
  elem_id="column-select",
164
  interactive=True,
165
  )
166
- with gr.Row():
167
- deleted_models_visibility = gr.Checkbox(
168
- value=False, label="Show gated/private/deleted models", interactive=True
169
- )
170
- with gr.Column(min_width=320):
171
- #with gr.Box(elem_id="box-filter"):
172
- filter_columns_type = gr.CheckboxGroup(
173
- label="Model types",
174
- choices=[t.to_str() for t in ModelType],
175
- value=[t.to_str() for t in ModelType],
176
- interactive=True,
177
- elem_id="filter-columns-type",
178
- )
179
- filter_columns_precision = gr.CheckboxGroup(
180
- label="Precision",
181
- choices=[i.value.name for i in Precision],
182
- value=[i.value.name for i in Precision],
183
- interactive=True,
184
- elem_id="filter-columns-precision",
185
- )
186
- filter_columns_size = gr.CheckboxGroup(
187
- label="Model sizes (in billions of parameters)",
188
- choices=list(NUMERIC_INTERVALS.keys()),
189
- value=list(NUMERIC_INTERVALS.keys()),
190
- interactive=True,
191
- elem_id="filter-columns-size",
192
- )
193
-
194
  leaderboard_table = gr.components.Dataframe(
195
  value=leaderboard_df[
196
- [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
197
  + shown_columns.value
198
  ],
199
- headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value,
200
  datatype=TYPES,
201
  elem_id="leaderboard-table",
202
  interactive=False,
@@ -205,8 +173,8 @@ with demo:
205
 
206
  # Dummy leaderboard for handling the case when the user uses backspace key
207
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
208
- value=original_df[COLS],
209
- headers=COLS,
210
  datatype=TYPES,
211
  visible=False,
212
  )
@@ -215,116 +183,86 @@ with demo:
215
  [
216
  hidden_leaderboard_table_for_search,
217
  shown_columns,
218
- filter_columns_type,
219
- filter_columns_precision,
220
- filter_columns_size,
221
- deleted_models_visibility,
222
  search_bar,
223
  ],
224
  leaderboard_table,
225
  )
226
- for selector in [shown_columns, filter_columns_type, filter_columns_precision, filter_columns_size, deleted_models_visibility]:
227
  selector.change(
228
  update_table,
229
  [
230
  hidden_leaderboard_table_for_search,
231
  shown_columns,
232
- filter_columns_type,
233
- filter_columns_precision,
234
- filter_columns_size,
235
- deleted_models_visibility,
236
  search_bar,
237
  ],
238
  leaderboard_table,
239
  queue=True,
240
  )
241
 
242
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
243
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
244
-
245
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
246
  with gr.Column():
247
  with gr.Row():
248
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
249
 
250
- with gr.Column():
251
- with gr.Accordion(
252
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
253
- open=False,
254
- ):
255
- with gr.Row():
256
- finished_eval_table = gr.components.Dataframe(
257
- value=finished_eval_queue_df,
258
- headers=EVAL_COLS,
259
- datatype=EVAL_TYPES,
260
- row_count=5,
261
- )
262
- with gr.Accordion(
263
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
264
- open=False,
265
- ):
266
- with gr.Row():
267
- running_eval_table = gr.components.Dataframe(
268
- value=running_eval_queue_df,
269
- headers=EVAL_COLS,
270
- datatype=EVAL_TYPES,
271
- row_count=5,
272
- )
273
-
274
- with gr.Accordion(
275
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
276
- open=False,
277
- ):
278
- with gr.Row():
279
- pending_eval_table = gr.components.Dataframe(
280
- value=pending_eval_queue_df,
281
- headers=EVAL_COLS,
282
- datatype=EVAL_TYPES,
283
- row_count=5,
284
- )
285
  with gr.Row():
286
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
287
 
288
  with gr.Row():
289
  with gr.Column():
290
  model_name_textbox = gr.Textbox(label="Model name")
291
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
292
- model_type = gr.Dropdown(
293
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
294
- label="Model type",
 
295
  multiselect=False,
296
  value=None,
297
  interactive=True,
298
  )
299
-
300
- with gr.Column():
301
- precision = gr.Dropdown(
302
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
303
- label="Precision",
304
  multiselect=False,
305
- value="float16",
306
  interactive=True,
307
  )
308
- weight_type = gr.Dropdown(
309
- choices=[i.value.name for i in WeightType],
310
- label="Weights type",
311
  multiselect=False,
312
- value="Original",
313
  interactive=True,
314
  )
315
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
316
 
317
  submit_button = gr.Button("Submit Eval")
318
  submission_result = gr.Markdown()
319
  submit_button.click(
320
  add_new_eval,
321
  [
322
  model_name_textbox,
323
- base_model_name_textbox,
324
- revision_name_textbox,
325
- precision,
326
- weight_type,
327
- model_type,
328
  ],
329
  submission_result,
330
  )
 
11
  INTRODUCTION_TEXT,
12
  LLM_BENCHMARKS_TEXT,
13
  TITLE,
14
+ nc_tasks
15
  )
16
  from src.display.css_html_js import custom_css
17
  from src.display.utils import (
18
  BENCHMARK_COLS,
19
  COLS,
20
+ COLS_NC,
21
  EVAL_COLS,
22
  EVAL_TYPES,
23
  NUMERIC_INTERVALS,
24
  TYPES,
25
+ AutoEvalColumn_NodeClassification,
26
+ #AutoEvalColumn,
27
  ModelType,
28
+ TASK_LIST,
29
+ OFFICIAL,
30
+ HONOR,
31
  fields,
32
  WeightType,
33
  Precision
 
56
  restart_space()
57
 
58
 
59
+ original_df = get_leaderboard_df(EVAL_REQUESTS_PATH, nc_tasks)
60
  leaderboard_df = original_df.copy()
61
 
 
 
 
 
 
 
 
62
  # Searching and filtering
63
  def update_table(
64
  hidden_df: pd.DataFrame,
65
  columns: list,
66
  query: str,
67
  ):
68
+ #filtered_df = filter_models(hidden_df, size_query, show_deleted)
69
+ filtered_df = filter_queries(query, hidden_df)
70
  df = select_columns(filtered_df, columns)
71
  return df
72
 
73
 
74
  def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
75
+ return df[(df[AutoEvalColumn_NodeClassification.model.name].str.contains(query, case=False))]
76
 
77
 
78
  def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
79
  always_here_cols = [
80
+ "Model"
 
81
  ]
82
  # We use COLS to maintain sorting
83
  filtered_df = df[
84
+ always_here_cols + [c for c in COLS_NC if c in df.columns and c in columns]
85
  ]
86
  return filtered_df
87
 
 
99
  if len(final_df) > 0:
100
  filtered_df = pd.concat(final_df)
101
  filtered_df = filtered_df.drop_duplicates(
102
+ subset=[AutoEvalColumn_NodeClassification.model.name]
103
  )
104
 
105
  return filtered_df
106
 
107
 
108
  def filter_models(
109
+ df: pd.DataFrame, size_query: list, show_deleted: bool
110
  ) -> pd.DataFrame:
111
  # Show all models
112
  if show_deleted:
113
  filtered_df = df
114
  else: # Show only still on the hub models
115
+ filtered_df = df[df[AutoEvalColumn_NodeClassification.still_on_hub.name] == True]
116
 
117
+ #type_emoji = [t[0] for t in type_query]
118
+ #filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
119
+ #filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
120
 
121
  numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
122
+ params_column = pd.to_numeric(df[AutoEvalColumn_NodeClassification.params.name], errors="coerce")
123
  mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
124
  filtered_df = filtered_df.loc[mask]
125
 
126
  return filtered_df
127
 
 
128
  demo = gr.Blocks(css=custom_css)
129
  with demo:
130
  gr.HTML(TITLE)
131
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
132
 
133
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
134
+ with gr.TabItem("🏅 Node Classification Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
135
  with gr.Row():
136
  with gr.Column():
137
  with gr.Row():
 
144
  shown_columns = gr.CheckboxGroup(
145
  choices=[
146
  c.name
147
+ for c in fields(AutoEvalColumn_NodeClassification)
148
  if not c.hidden and not c.never_hidden
149
  ],
150
  value=[
151
  c.name
152
+ for c in fields(AutoEvalColumn_NodeClassification)
153
  if c.displayed_by_default and not c.hidden and not c.never_hidden
154
  ],
155
  label="Select columns to show",
156
  elem_id="column-select",
157
  interactive=True,
158
  )
159
+
160
+ print(leaderboard_df)
161
+ print(fields(AutoEvalColumn_NodeClassification))
 
162
  leaderboard_table = gr.components.Dataframe(
163
  value=leaderboard_df[
164
+ [c.name for c in fields(AutoEvalColumn_NodeClassification) if c.never_hidden]
165
  + shown_columns.value
166
  ],
167
+ headers=[c.name for c in fields(AutoEvalColumn_NodeClassification) if c.never_hidden] + shown_columns.value,
168
  datatype=TYPES,
169
  elem_id="leaderboard-table",
170
  interactive=False,
 
173
 
174
  # Dummy leaderboard for handling the case when the user uses backspace key
175
  hidden_leaderboard_table_for_search = gr.components.Dataframe(
176
+ value=original_df[COLS_NC],
177
+ headers=COLS_NC,
178
  datatype=TYPES,
179
  visible=False,
180
  )
 
183
  [
184
  hidden_leaderboard_table_for_search,
185
  shown_columns,
186
  search_bar,
187
  ],
188
  leaderboard_table,
189
  )
190
+ for selector in [shown_columns]:
191
  selector.change(
192
  update_table,
193
  [
194
  hidden_leaderboard_table_for_search,
195
  shown_columns,
196
  search_bar,
197
  ],
198
  leaderboard_table,
199
  queue=True,
200
  )
201
 
 
 
 
202
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
203
  with gr.Column():
204
  with gr.Row():
205
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
206
 
207
  with gr.Row():
208
  gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
209
 
210
  with gr.Row():
211
  with gr.Column():
212
+ author_name_textbox = gr.Textbox(label="Your name")
213
+ email_textbox = gr.Textbox(label="Your email")
214
+ relbench_version_textbox = gr.Textbox(label="RelBench version")
215
+
216
  model_name_textbox = gr.Textbox(label="Model name")
217
+
218
+ '''
219
+ dataset_name_textbox = gr.Dropdown(
220
+ choices=[t.value.name for t in TASK_LIST],
221
+ label="Task name (e.g. rel-amazon-user-churn)",
222
  multiselect=False,
223
  value=None,
224
  interactive=True,
225
  )
226
+ '''
227
+
228
+ official_or_not = gr.Dropdown(
229
+ choices=[i.value.name for i in OFFICIAL],
230
+ label="Is it an official submission?",
231
  multiselect=False,
232
+ value=None,
233
  interactive=True,
234
  )
235
+ paper_url_textbox = gr.Textbox(label="Paper URL Link")
236
+ github_url_textbox = gr.Textbox(label="GitHub URL Link")
237
+
238
+ with gr.Column():
239
+ test_performance = gr.TextArea(label="Test set performance, use {task: [mean,std]} format e.g. {'rel-amazon/user-churn': [0.352,0.023], 'rel-amazon/user-ltv': [0.304,0.022], ...}")
240
+ valid_performance = gr.TextArea(label="Validation set performance, use {task: [mean,std]} format e.g. {'rel-amazon/user-churn': [0.352,0.023], 'rel-amazon/user-ltv': [0.304,0.022], ...}")
241
+ parameters_textbox = gr.Textbox(label="Number of parameters")
242
+ honor_code = gr.Dropdown(
243
+ choices=[i.value.name for i in HONOR],
244
+ label="Click here to agree to the honor code",
245
  multiselect=False,
246
+ value=None,
247
  interactive=True,
248
  )
 
249
 
250
  submit_button = gr.Button("Submit Eval")
251
  submission_result = gr.Markdown()
252
  submit_button.click(
253
  add_new_eval,
254
  [
255
+ author_name_textbox,
256
+ email_textbox,
257
+ relbench_version_textbox,
258
  model_name_textbox,
259
+ official_or_not,
260
+ test_performance,
261
+ valid_performance,
262
+ paper_url_textbox,
263
+ github_url_textbox,
264
+ parameters_textbox,
265
+ honor_code,
266
  ],
267
  submission_result,
268
  )
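Note: the submission form above collects the test/validation scores as a string in the {task: [mean, std]} format shown in the TextArea labels, and the submission instructions compute the parameter count as sum(p.numel() for p in model.parameters()). A small illustrative sketch of preparing those two inputs (placeholder model and numbers, not real leaderboard results):

```python
# Illustrative only: placeholder model and scores, not real leaderboard results.
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 1))

# Value for the "Number of parameters" textbox.
num_params = sum(p.numel() for p in model.parameters())

# Value for the "Test set performance" textarea: {task: [mean, std]} over 5 seeds.
test_performance = {
    "rel-amazon/user-churn": [0.352, 0.023],
    "rel-amazon/user-ltv": [0.304, 0.022],
}
print(num_params, str(test_performance))
```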
src/about.py CHANGED
@@ -18,55 +18,57 @@ class Tasks(Enum):
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
20
 
21
-
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
32
  LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
34
-
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
37
-
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
-
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
-
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
-
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
72
  """
 
18
  NUM_FEWSHOT = 0 # Change with your few shot
19
  # ---------------------------------------------------
20
 
21
+ class nc_tasks(Enum):
22
+ task0 = Task("rel-amazon/user-churn", "auroc", "user-churn")
23
+ task1 = Task("rel-amazon/item-churn", "auroc", "item-churn")
24
+ task2 = Task("rel-avito/user-clicks", "auroc", "user-clicks")
25
+ task3 = Task("rel-avito/user-visits", "auroc", "user-visits")
26
+ task4 = Task("rel-hm/user-churn", "auroc", "hm-user-churn")
27
+ task5 = Task("rel-stack/user-badge", "auroc", "user-badge")
28
+ task6 = Task("rel-stack/user-engagement", "auroc", "user-engagement")
29
+ task7 = Task("rel-f1/driver-dnf", "auroc", "driver-dnf")
30
+ task8 = Task("rel-f1/driver-top3", "auroc", "driver-top3")
31
+ task9 = Task("rel-trial/study-outcome", "auroc", "study-outcome")
32
+ task10 = Task("rel-event/user-repeat", "auroc", "user-repeat")
33
+ task11 = Task("rel-event/user-ignore", "auroc", "user-ignore")
34
 
35
  # Your leaderboard name
36
+ TITLE = """<p align="center"><img src="https://relbench.stanford.edu/img/logo.png" alt="logo" width="400px" /></p>"""
37
 
38
  # What does your leaderboard evaluate?
39
  INTRODUCTION_TEXT = """
40
+ Relational Deep Learning is a new approach for end-to-end representation learning on data spread across multiple tables, such as in a relational database (see our vision paper). RelBench is the accompanying benchmark, which seeks to facilitate efficient, robust, and reproducible research in this direction. It comprises a collection of realistic, large-scale, and diverse datasets structured as relational tables, along with machine learning tasks defined on them. It provides full support for data downloading, task specification, and standardized evaluation in an ML-framework-agnostic manner. Additionally, there is seamless integration with PyTorch Geometric to load the data as a graph and train GNN models, and with PyTorch Frame to encode the various types of table columns. Finally, there is a leaderboard for tracking progress.
41
  """
42
 
43
  # Which evaluations are you running? how can people reproduce what you have?
44
  LLM_BENCHMARKS_TEXT = f"""
45
+ ## Overview of RelBench
46
  """
47
 
48
  EVALUATION_QUEUE_TEXT = """
49
+ ## Instructions for submitting your model
50
+
51
+ Once you have developed your model and obtained results, you can submit your test results to our leaderboards. For each dataset, we require you to submit the following information.
52
+
53
+ - **Your name**: Primary contact's name
54
+ - **Your email**: Primary contact's email
55
+ - **RelBench version**: The RelBench version used to conduct the experiments.
56
+ - **Model name**: The name of the method. This is a unique identifier for the model, so please make it distinct from any existing model names; a submission with the same model name will override the previous one.
57
+ - **Task name**: The name of the RelBench task that you use to evaluate the method. Choose from the dropdown menu.
58
+ - **Is it an official submission**: Whether the implementation is official (implementation by authors who proposed the method) or unofficial (re-implementation of the method by non-authors).
59
+ - **Test performance**: Raw test performance output by the RelBench model evaluators, where the average and unbiased standard deviation must be taken over 5 different random seeds. You can either not fix the random seeds at all or use random seeds 0 through 4. We strongly discourage tuning the random seeds.
60
+ - **Validation performance**: Validation performance of the model that is used to report the test performance above.
61
+ - **Paper URL Link**: The original paper describing the method (an arXiv link is recommended; the paper need not be peer-reviewed). If your method has any original component (e.g., even just combining existing methods XXX and YYY), you have to write a technical report describing it (e.g., how exactly you combined XXX and YYY).
62
+ - **GitHub URL Link**: The GitHub repository or directory containing all code needed to reproduce the result. A placeholder repository is not allowed.
63
+ - **Number of Parameters**: The number of parameters of your model, which can be calculated as `sum(p.numel() for p in model.parameters())`. If you use multi-stage training (e.g., node2vec followed by an MLP), please sum the parameters of all stages (both the node2vec and MLP parameters).
64
+ - **Honor code**: Please acknowledge that your submission adheres to all ethical policies and that your result is reproducible.
 
65
  """
66
 
67
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
68
  CITATION_BUTTON_TEXT = r"""
69
+ @article{relbench,
70
+ title={Relational Deep Learning: Graph Representation Learning on Relational Tables},
71
+ author={Matthias Fey and Weihua Hu and Kexin Huang and Jan Eric Lenssen and Rishabh Ranjan and Joshua Robinson and Rex Ying and Jiaxuan You and Jure Leskovec},
72
+ year={2023}
73
+ }
74
  """
src/display/utils.py CHANGED
@@ -3,7 +3,7 @@ from enum import Enum
3
 
4
  import pandas as pd
5
 
6
- from src.about import Tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -43,6 +43,21 @@ auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sh
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
 
46
  ## For the queue columns in the submission tab
47
  @dataclass(frozen=True)
48
  class EvalQueueColumn: # Queue column
@@ -83,6 +98,58 @@ class ModelType(Enum):
83
  return ModelType.IFT
84
  return ModelType.Unknown
85
 
 
86
  class WeightType(Enum):
87
  Adapter = ModelDetails("Adapter")
88
  Original = ModelDetails("Original")
@@ -114,6 +181,7 @@ class Precision(Enum):
114
 
115
  # Column selection
116
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 
117
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
118
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
119
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
3
 
4
  import pandas as pd
5
 
6
+ from src.about import Tasks, nc_tasks
7
 
8
  def fields(raw_class):
9
  return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
 
43
  # We use make dataclass to dynamically fill the scores from Tasks
44
  AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
45
 
46
+
47
+ auto_eval_column_dict_nc = []
48
+ auto_eval_column_dict_nc.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
49
+ auto_eval_column_dict_nc.append(["average_rank", ColumnContent, ColumnContent("Average Rank⬆️", "number", True)])
50
+ for task in nc_tasks:
51
+ auto_eval_column_dict_nc.append(['_'.join(task.value.col_name.split('-')), ColumnContent, ColumnContent(task.value.col_name, "number", True)])
52
+ auto_eval_column_dict_nc.append(["author", ColumnContent, ColumnContent("Author", "markdown", True, never_hidden=False)])
53
+ auto_eval_column_dict_nc.append(["email", ColumnContent, ColumnContent("Email", "markdown", True, never_hidden=False)])
54
+ auto_eval_column_dict_nc.append(["Paper_URL", ColumnContent, ColumnContent("Paper URL", "markdown", True, never_hidden=False)])
55
+ auto_eval_column_dict_nc.append(["Github_URL", ColumnContent, ColumnContent("Github URL", "markdown", True, never_hidden=False)])
56
+ auto_eval_column_dict_nc.append(["Time", ColumnContent, ColumnContent("Time", "markdown", True, never_hidden=False)])
57
+ auto_eval_column_dict_nc.append(["num_of_Params", ColumnContent, ColumnContent("# of Params", "markdown", True, never_hidden=False)])
58
+
59
+ AutoEvalColumn_NodeClassification = make_dataclass("AutoEvalColumn_NodeClassification", auto_eval_column_dict_nc, frozen=True)
60
+
61
  ## For the queue columns in the submission tab
62
  @dataclass(frozen=True)
63
  class EvalQueueColumn: # Queue column
 
98
  return ModelType.IFT
99
  return ModelType.Unknown
100
 
101
+ class OFFICIAL(Enum):
102
+ official = ModelDetails("Official")
103
+ unofficial = ModelDetails("Unofficial")
104
+
105
+ class HONOR(Enum):
106
+ yes = ModelDetails("Yes")
107
+ no = ModelDetails("No")
108
+
109
+ class TASK_LIST(Enum):
110
+ amazon_user_churn = ModelDetails("rel-amazon-user-churn")
111
+ amazon_item_churn = ModelDetails("rel-amazon-item-churn")
112
+ amazon_user_ltv = ModelDetails("rel-amazon-user-ltv")
113
+ amazon_item_ltv = ModelDetails("rel-amazon-item-ltv")
114
+ amazon_user_item_purchase = ModelDetails("rel-amazon-user-item-purchase")
115
+ amazon_user_item_rate = ModelDetails("rel-amazon-user-item-rate")
116
+ amazon_user_item_review = ModelDetails("rel-amazon-user-item-review")
117
+
118
+ # rel-stack
119
+ stack_user_engagement = ModelDetails("rel-stack-user-engagement")
120
+ stack_user_badge = ModelDetails("rel-stack-user-badge")
121
+ stack_post_votes = ModelDetails("rel-stack-post-votes")
122
+ stack_user_post_comment = ModelDetails("rel-stack-user-post-comment")
123
+ stack_user_post_related = ModelDetails("rel-stack-user-post-related")
124
+
125
+ # rel-trial
126
+ trial_study_outcome = ModelDetails("rel-trial-study-outcome")
127
+ trial_study_adverse = ModelDetails("rel-trial-study-adverse")
128
+ trial_site_success = ModelDetails("rel-trial-site-success")
129
+ trial_condition_sponsor_run = ModelDetails("rel-trial-condition-sponsor-run")
130
+ trial_site_sponsor_run = ModelDetails("rel-trial-site-sponsor-run")
131
+
132
+ # rel-f1
133
+ f1_driver_position = ModelDetails("rel-f1-driver-position")
134
+ f1_driver_dnf = ModelDetails("rel-f1-driver-dnf")
135
+ f1_driver_top3 = ModelDetails("rel-f1-driver-top3")
136
+
137
+ # rel-hm
138
+ hm_user_churn = ModelDetails("rel-hm-user-churn")
139
+ hm_item_sales = ModelDetails("rel-hm-item-sales")
140
+ hm_user_item_purchase = ModelDetails("rel-hm-user-item-purchase")
141
+
142
+ # rel-event
143
+ event_user_repeat = ModelDetails("rel-event-user-repeat")
144
+ event_user_ignore = ModelDetails("rel-event-user-ignore")
145
+ event_user_attendance = ModelDetails("rel-event-user-attendance")
146
+
147
+ # rel-avito
148
+ avito_user_visits = ModelDetails("rel-avito-user-visits")
149
+ avito_user_clicks = ModelDetails("rel-avito-user-clicks")
150
+ avito_ads_clicks = ModelDetails("rel-avito-ads-clicks")
151
+ avito_user_ad_visit = ModelDetails("rel-avito-user-ad-visit")
152
+
153
  class WeightType(Enum):
154
  Adapter = ModelDetails("Adapter")
155
  Original = ModelDetails("Original")
 
181
 
182
  # Column selection
183
  COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
184
+ COLS_NC = [c.name for c in fields(AutoEvalColumn_NodeClassification) if not c.hidden]
185
  TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
186
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
187
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
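Note: AutoEvalColumn_NodeClassification above is built dynamically from (attribute name, type, ColumnContent) triples via make_dataclass, and the fields() helper simply walks the resulting class attributes. ColumnContent's definition is not part of this diff, so the sketch below assumes a simplified stand-in with the same constructor shape; it is an illustration of the pattern, not the Space's actual module.

```python
# Minimal sketch of the dynamic column dataclass pattern used in src/display/utils.py.
# ColumnContent here is an assumed, simplified stand-in.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:  # assumed shape: (name, type, displayed_by_default, hidden, never_hidden)
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Same trick as the module's helper: every non-dunder class attribute is a column.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

cols = [
    ["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)],
    ["user_churn", ColumnContent, ColumnContent("user-churn", "number", True)],
]
AutoEvalColumnNC = make_dataclass("AutoEvalColumnNC", cols, frozen=True)
print([c.name for c in fields(AutoEvalColumnNC)])  # -> ['Model', 'user-churn']
```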
src/envs.py CHANGED
@@ -6,7 +6,7 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "relbench" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
  # ----------------------------------
11
 
12
  REPO_ID = f"{OWNER}/leaderboard"
src/populate.py CHANGED
@@ -1,6 +1,6 @@
1
  import json
2
  import os
3
-
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
@@ -8,19 +8,55 @@ from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
 
11
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
  """Creates a dataframe from all the individual experiment results"""
13
  raw_data = get_raw_eval_results(results_path, requests_path)
14
  all_data_json = [v.to_dict() for v in raw_data]
15
 
16
  df = pd.DataFrame.from_records(all_data_json)
17
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
18
- df = df[cols].round(decimals=2)
19
 
20
  # filter out if any of the benchmarks have not been produced
21
- df = df[has_no_nan_values(df, benchmark_cols)]
22
  return raw_data, df
23
 
 
24
 
25
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
26
  """Creates the different dataframes for the evaluation queues requestes"""
 
1
  import json
2
  import os
3
+ from ast import literal_eval
4
  import pandas as pd
5
 
6
  from src.display.formatting import has_no_nan_values, make_clickable_model
 
8
  from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
+ '''
12
  def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
13
  """Creates a dataframe from all the individual experiment results"""
14
  raw_data = get_raw_eval_results(results_path, requests_path)
15
  all_data_json = [v.to_dict() for v in raw_data]
16
 
17
  df = pd.DataFrame.from_records(all_data_json)
18
+ #df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
19
+ #df = df[cols].round(decimals=2)
20
 
21
  # filter out if any of the benchmarks have not been produced
22
+ #df = df[has_no_nan_values(df, benchmark_cols)]
23
  return raw_data, df
24
+ '''
25
+
26
+ def get_leaderboard_df(EVAL_REQUESTS_PATH, tasks) -> pd.DataFrame:
27
+
28
+ model_result_filepaths = []
29
+ for root,_, files in os.walk(EVAL_REQUESTS_PATH):
30
+ if len(files) == 0 or any([not f.endswith(".json") for f in files]):
31
+ continue
32
+ for file in files:
33
+ model_result_filepaths.append(os.path.join(root, file))
34
+
35
+ model_res = []
36
+ for model in model_result_filepaths:
37
+ import json
38
+ with open(model) as f:
39
+ model_res.append(json.load(f))
40
+
41
+ for model in model_res:
42
+ model["test"] = literal_eval(model["test"])
43
+ model["valid"] = literal_eval(model["valid"])
44
+ model["params"] = int(model["params"])
45
+ model['submitted_time'] = model['submitted_time'].split('T')[0]
46
+ #model['paper_url'] = '[Link](' + model['paper_url'] + ')'
47
+ #model['github_url'] = '[Link](' + model['github_url'] + ')'
48
+
49
+ name2short_name = {task.value.benchmark: task.value.col_name for task in tasks}
50
+ for model in model_res:
51
+ model.update({name2short_name[i]: str(model['test'][i][0])[:4] + '±' + str(model['test'][i][1])[:4] if i in model['test'] else '-' for i in name2short_name})
52
 
53
+ columns_to_show = ['model', 'author', 'email', 'paper_url', 'github_url', 'submitted_time', 'params'] + list(name2short_name.values())
54
+ df_res = pd.DataFrame([{col: model[col] for col in columns_to_show} for model in model_res])
55
+ ranks = df_res[list(name2short_name.values())].rank()
56
+ df_res.rename(columns={'model': 'Model', 'author': 'Author', 'email': 'Email', 'paper_url': 'Paper URL', 'github_url': 'Github URL', 'submitted_time': 'Time', 'params': '# of Params'}, inplace=True)
57
+ df_res['Average Rank⬆️'] = ranks.mean(axis=1)
58
+ df_res.sort_values(by='Average Rank⬆️', ascending=True, inplace=True)
59
+ return df_res
60
 
61
  def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
62
  """Creates the different dataframes for the evaluation queues requestes"""
src/submission/check_validity.py CHANGED
@@ -88,7 +88,7 @@ def already_submitted_models(requested_models_dir: str) -> set[str]:
88
  continue
89
  with open(os.path.join(root, file), "r") as f:
90
  info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
 
93
  # Select organisation
94
  if info["model"].count("/") == 0 or "submitted_time" not in info:
 
88
  continue
89
  with open(os.path.join(root, file), "r") as f:
90
  info = json.load(f)
91
+ file_names.append(f"{info['model']}")
92
 
93
  # Select organisation
94
  if info["model"].count("/") == 0 or "submitted_time" not in info:
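Note: with the change above, a prior submission is identified by the model name alone (no revision or precision suffix). A minimal sketch of building that dedup set under the new scheme (function name and directory layout are illustrative, not the module's exact code):

```python
# Illustrative sketch: collect the set of already-submitted model names
# from the request JSON files, keyed only by info["model"].
import json
import os

def already_submitted_model_names(requested_models_dir: str) -> set[str]:
    names = set()
    for root, _, files in os.walk(requested_models_dir):
        for file in files:
            if not file.endswith(".json"):
                continue
            with open(os.path.join(root, file), "r") as f:
                info = json.load(f)
            names.add(f"{info['model']}")  # key is just the model name now
    return names
```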
src/submission/submit.py CHANGED
@@ -14,93 +14,77 @@ from src.submission.check_validity import (
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
16
 
17
  def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
  ):
25
  global REQUESTED_MODELS
26
  global USERS_TO_SUBMISSION_DATES
27
  if not REQUESTED_MODELS:
28
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
 
30
- user_name = ""
31
  model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
 
36
- precision = precision.split(" ")[0]
37
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
 
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
 
75
  # Seems good, creating the eval
76
  print("Adding new eval")
77
 
78
  eval_entry = {
79
  "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
  "status": "PENDING",
85
  "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
  "params": model_size,
89
- "license": license,
90
  "private": False,
91
  }
92
 
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
 
97
  print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
  os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
 
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
 
105
  print("Uploading eval file")
106
  API.upload_file(
 
14
  REQUESTED_MODELS = None
15
  USERS_TO_SUBMISSION_DATES = None
16
 
17
+ class CustomJSONEncoder(json.JSONEncoder):
18
+ def default(self, obj):
19
+ try:
20
+ return super().default(obj)
21
+ except TypeError:
22
+ return str(obj) # Convert non-serializable object to string
23
+
24
+ def add_new_eval_json(eval_entry, out_path):
25
+ with open(out_path, "w") as f:
26
+ f.write(json.dumps(eval_entry, cls=CustomJSONEncoder))
27
+
28
  def add_new_eval(
29
+ author,
30
+ email,
31
+ relbench_version,
32
+ model,
33
+ official_or_not,
34
+ test_performance,
35
+ valid_performance,
36
+ paper_url,
37
+ github_url,
38
+ parameters,
39
+ honor_code,
40
  ):
41
  global REQUESTED_MODELS
42
  global USERS_TO_SUBMISSION_DATES
43
  if not REQUESTED_MODELS:
44
  REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
45
 
 
46
  model_path = model
47
 
48
+ #precision = precision.split(" ")[0]
49
  current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
50
 
51
+ model_size = parameters
 
 
52
 
53
  # Seems good, creating the eval
54
  print("Adding new eval")
55
 
56
  eval_entry = {
57
  "model": model,
58
+ "author": author,
59
+ "email": email,
60
+ "relbench_version": relbench_version,
61
+ "official_or_not": official_or_not,
62
+ "test": test_performance,
63
+ "valid": valid_performance,
64
+ "paper_url": paper_url,
65
+ "github_url": github_url,
66
+ "honor_code": honor_code,
67
  "status": "PENDING",
68
  "submitted_time": current_time,
69
  "params": model_size,
 
70
  "private": False,
71
  }
72
 
73
+ # TODO: Check for duplicate submission
74
+ #if f"{model}_{author}_{precision}" in REQUESTED_MODELS:
75
+ # return styled_warning("This model has been already submitted.")
76
 
77
  print("Creating eval file")
78
+ OUT_DIR = f"{EVAL_REQUESTS_PATH}/{model}"
79
  os.makedirs(OUT_DIR, exist_ok=True)
80
+ out_path = f"{OUT_DIR}/{model_path}_eval_request_False.json"
81
 
82
+ print(eval_entry)
83
+
84
+ #with open(out_path, "w") as f:
85
+ # f.write(json.dumps(eval_entry))
86
+
87
+ add_new_eval_json(eval_entry, out_path)
88
 
89
  print("Uploading eval file")
90
  API.upload_file(
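Note: the CustomJSONEncoder introduced above falls back to str() for anything the standard encoder cannot serialize, so non-JSON values in the eval entry (for example a datetime) are stored as strings. A standalone check of that pattern, with an illustrative entry and output path rather than the Space's real request file:

```python
# Standalone check of the fallback-to-str() encoder pattern used in submit.py;
# the entry and output path here are illustrative.
import json
from datetime import datetime, timezone

class CustomJSONEncoder(json.JSONEncoder):
    def default(self, obj):
        try:
            return super().default(obj)
        except TypeError:
            return str(obj)  # convert non-serializable objects to strings

eval_entry = {
    "model": "my-model",
    "submitted_time": datetime.now(timezone.utc),  # not JSON-serializable by default
    "params": 123456,
}
with open("example_eval_request.json", "w") as f:
    f.write(json.dumps(eval_entry, cls=CustomJSONEncoder))
print(json.dumps(eval_entry, cls=CustomJSONEncoder))
```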