CPunisher committed on
Commit
948d4dc
·
1 Parent(s): eacf49c

Other than data

Browse files
Files changed (3) hide show
  1. app.py +4 -86
  2. src/about.py +26 -29
  3. src/display/css_html_js.py +3 -0
app.py CHANGED
@@ -92,102 +92,20 @@ def init_leaderboard(dataframe):
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
  leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
 
109
- with gr.Column():
110
- with gr.Accordion(
111
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
112
- open=False,
113
- ):
114
- with gr.Row():
115
- finished_eval_table = gr.components.Dataframe(
116
- value=finished_eval_queue_df,
117
- headers=EVAL_COLS,
118
- datatype=EVAL_TYPES,
119
- row_count=5,
120
- )
121
- with gr.Accordion(
122
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
123
- open=False,
124
- ):
125
- with gr.Row():
126
- running_eval_table = gr.components.Dataframe(
127
- value=running_eval_queue_df,
128
- headers=EVAL_COLS,
129
- datatype=EVAL_TYPES,
130
- row_count=5,
131
- )
132
-
133
- with gr.Accordion(
134
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
135
- open=False,
136
- ):
137
- with gr.Row():
138
- pending_eval_table = gr.components.Dataframe(
139
- value=pending_eval_queue_df,
140
- headers=EVAL_COLS,
141
- datatype=EVAL_TYPES,
142
- row_count=5,
143
- )
144
- with gr.Row():
145
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
146
-
147
- with gr.Row():
148
- with gr.Column():
149
- model_name_textbox = gr.Textbox(label="Model name")
150
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
151
- model_type = gr.Dropdown(
152
- choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
153
- label="Model type",
154
- multiselect=False,
155
- value=None,
156
- interactive=True,
157
- )
158
-
159
- with gr.Column():
160
- precision = gr.Dropdown(
161
- choices=[i.value.name for i in Precision if i != Precision.Unknown],
162
- label="Precision",
163
- multiselect=False,
164
- value="float16",
165
- interactive=True,
166
- )
167
- weight_type = gr.Dropdown(
168
- choices=[i.value.name for i in WeightType],
169
- label="Weights type",
170
- multiselect=False,
171
- value="Original",
172
- interactive=True,
173
- )
174
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
175
-
176
- submit_button = gr.Button("Submit Eval")
177
- submission_result = gr.Markdown()
178
- submit_button.click(
179
- add_new_eval,
180
- [
181
- model_name_textbox,
182
- base_model_name_textbox,
183
- revision_name_textbox,
184
- precision,
185
- weight_type,
186
- model_type,
187
- ],
188
- submission_result,
189
- )
190
-
191
  with gr.Row():
192
  with gr.Accordion("📙 Citation", open=False):
193
  citation_button = gr.Textbox(
 
92
  demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
+ gr.HTML(INTRODUCTION_TEXT, elem_classes="markdown-text")
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
  leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
+ # with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
+ # gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
103
 
104
+ with gr.TabItem("🚀 Submission", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
  gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  with gr.Row():
110
  with gr.Accordion("📙 Citation", open=False):
111
  citation_button = gr.Textbox(
src/about.py CHANGED
@@ -21,11 +21,25 @@ NUM_FEWSHOT = 0 # Change with your few shot
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  """
30
 
31
  # Which evaluations are you running? how can people reproduce what you have?
@@ -38,35 +52,18 @@ To reproduce our results, here is the commands you can run:
38
  """
39
 
40
  EVALUATION_QUEUE_TEXT = """
41
- ## Some good practices before submitting a model
42
-
43
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
44
- ```python
45
- from transformers import AutoConfig, AutoModel, AutoTokenizer
46
- config = AutoConfig.from_pretrained("your model name", revision=revision)
47
- model = AutoModel.from_pretrained("your model name", revision=revision)
48
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
49
- ```
50
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
51
-
52
- Note: make sure your model is public!
53
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
54
-
55
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
56
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
57
-
58
- ### 3) Make sure your model has an open license!
59
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
60
-
61
- ### 4) Fill up your model card
62
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
63
-
64
- ## In case of model failure
65
- If your model is displayed in the `FAILED` category, its execution stopped.
66
- Make sure you have followed the above steps first.
67
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
68
  """
69
 
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
 
 
 
 
 
 
 
 
 
72
  """
 
21
 
22
 
23
  # Your leaderboard name
24
+ TITLE = """<h1 align="center" id="space-title">JavaBench Leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
+ <p>
29
+ A Benchmark of Object-Oriented Code Generation for Evaluating Large Language Models
30
+ </p>
31
+
32
+ <p class="shields">
33
+ <a href="https://arxiv.org/abs/2406.12902">
34
+ <img src="https://img.shields.io/badge/arXiv-2406.12902-b31b1b.svg" />
35
+ </a>
36
+ <a href="https://github.com/java-bench/JavaBench">
37
+ <img src="https://img.shields.io/badge/Github-JavaBench-white.svg" />
38
+ </a>
39
+ <a href="https://huggingface.co/spaces/CPunisher/JavaBench">
40
+ <img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-JavaBench-ffc107?color=ffc107&logoColor=white" />
41
+ </a>
42
+ </p>
43
  """
44
 
45
  # Which evaluations are you running? how can people reproduce what you have?
 
52
  """
53
 
54
  EVALUATION_QUEUE_TEXT = """
55
+ Thank you for your interest in JavaBench. We warmly welcome researchers to submit additional benchmarking results, as we believe that collaborative efforts can significantly advance the study of Large Language Models and software engineering. For submission guidelines, please refer to our [Github Repo](https://github.com/java-bench/JavaBench?tab=readme-ov-file#usage).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  """
57
 
58
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
59
  CITATION_BUTTON_TEXT = r"""
60
+ @misc{cao2024aibeatundergraduatesentrylevel,
61
+ title={Can AI Beat Undergraduates in Entry-level Java Assignments? Benchmarking Large Language Models on JavaBench},
62
+ author={Jialun Cao and Zhiyong Chen and Jiarong Wu and Shing-chi Cheung and Chang Xu},
63
+ year={2024},
64
+ eprint={2406.12902},
65
+ archivePrefix={arXiv},
66
+ primaryClass={cs.LG},
67
+ url={https://arxiv.org/abs/2406.12902},
68
+ }
69
  """
src/display/css_html_js.py CHANGED
@@ -1,4 +1,7 @@
1
  custom_css = """
 
 
 
2
 
3
  .markdown-text {
4
  font-size: 16px !important;
 
1
  custom_css = """
2
+ .shields a {
3
+ display: inline-block;
4
+ }
5
 
6
  .markdown-text {
7
  font-size: 16px !important;