n0w0f committed on
Commit
32d70d7
·
1 Parent(s): a0639a7

feat: init submission, feedback and display

README.md CHANGED
@@ -1,14 +1,14 @@
1
  ---
2
- title: Eval Cards Gallery
3
- emoji: πŸƒ
4
- colorFrom: indigo
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: Registry to collect eval-cards on benchmarking efforts
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Eval Cards
3
+ emoji: πŸ†
4
+ colorFrom: green
5
+ colorTo: purple
6
  sdk: gradio
7
  sdk_version: 5.20.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
+ short_description: Registry of eval-cards from different benchmarks
12
  ---
13
 
14
+ For more details, refer to https://github.com/lamalab-org/eval-cards
app.py ADDED
@@ -0,0 +1,433 @@
1
+ import datetime
2
+ import os
3
+ import re
4
+ from pathlib import Path
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import requests
9
+ import yaml
10
+
11
+ # Constants
12
+ EVAL_CARDS_DIR = "eval_cards"
13
+ TEMPLATE_PATH = "template.yaml"
14
+ DEFAULT_MODEL = "anthropic/claude-3-haiku-20240307"  # Currently unused; LLM feedback below calls Groq's API directly
15
+
16
+ # Ensure the eval cards directory exists
17
+ os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
18
+
19
+ # Copy the template to the appropriate location
20
+ with open("template.yaml", "w") as f:
21
+ with open("yaml_template.yaml", "r") as template_file:
22
+ f.write(template_file.read())
23
+
24
+ def load_template():
25
+ """Load the YAML template"""
26
+ with open(TEMPLATE_PATH, "r") as file:
27
+ return file.read()
28
+
29
+ def yaml_to_dict(yaml_str):
30
+ """Convert YAML string to Python dictionary"""
31
+ try:
32
+ return yaml.safe_load(yaml_str)
33
+ except yaml.YAMLError as e:
34
+ return {"error": str(e)}
35
+
36
+ def compute_coverage_score(eval_data):
37
+ """
38
+ Compute a coverage score for the eval card
39
+ Returns a score from 0-100 and a breakdown of coverage by section
40
+ """
41
+ sections = {
42
+ "metadata": 5,
43
+ "evaluation_design": 15,
44
+ "estimand": 20,
45
+ "estimator": 25,
46
+ "estimate": 10,
47
+ "results_communication": 10,
48
+ "known_issues_and_limitations": 10,
49
+ "version_and_maintenance": 5,
50
+ "citation_and_usage": 5,
51
+ }
52
+
53
+ scores = {}
54
+ total_score = 0
55
+
56
+ def count_filled_fields(data, prefix=""):
57
+ if isinstance(data, dict):
58
+ filled = 0
59
+ total = 0
60
+ for key, value in data.items():
61
+ if isinstance(value, (dict, list)):
62
+ sub_filled, sub_total = count_filled_fields(value, f"{prefix}.{key}" if prefix else key)
63
+ filled += sub_filled
64
+ total += sub_total
65
+ else:
66
+ total += 1
67
+ if value and not (isinstance(value, str) and value.strip() in ["", "[]", "{}"]):
68
+ filled += 1
69
+ return filled, total
70
+ elif isinstance(data, list):
71
+ if not data:
72
+ return 0, 1
73
+ filled = 0
74
+ total = 0
75
+ for item in data:
76
+ sub_filled, sub_total = count_filled_fields(item)
77
+ filled += sub_filled
78
+ total += sub_total
79
+ return filled, total
80
+ else:
81
+ return 1 if data else 0, 1
82
+
83
+ # Compute scores for each section
84
+ for section, weight in sections.items():
85
+ if section in eval_data:
86
+ filled, total = count_filled_fields(eval_data[section])
87
+ completion_rate = filled / total if total > 0 else 0
88
+ scores[section] = {
89
+ "score": round(completion_rate * weight, 2),
90
+ "max_score": weight,
91
+ "completion_rate": round(completion_rate * 100, 2),
92
+ "fields_filled": filled,
93
+ "fields_total": total
94
+ }
95
+ total_score += scores[section]["score"]
96
+ else:
97
+ scores[section] = {
98
+ "score": 0,
99
+ "max_score": weight,
100
+ "completion_rate": 0,
101
+ "fields_filled": 0,
102
+ "fields_total": 0
103
+ }
104
+
105
+ return round(total_score, 2), scores
106
+
107
+ def get_llm_feedback(yaml_content, api_token=None):
108
+ """
109
+ Get feedback on the eval card from Groq's LLM
110
+ """
111
+ if not api_token:
112
+ return "API token is required for LLM feedback."
113
+
114
+ try:
115
+ headers = {
116
+ "Content-Type": "application/json",
117
+ "Authorization": f"Bearer {api_token}"
118
+ }
119
+
120
+ prompt = f"""
121
+ I'm reviewing an Evaluation Card in YAML format. Please analyze it for completeness,
122
+ consistency, and clarity. Provide specific recommendations for improvement.
123
+
124
+ Focus on:
125
+ 1. Sections that need more detail
126
+ 2. Inconsistencies or contradictions
127
+ 3. Clarity of language and explanations
128
+ 4. Alignment with best practices for ML evaluation
129
+
130
+ Here's the YAML content:
131
+
132
+ ```yaml
133
+ {yaml_content}
134
+ ```
135
+
136
+ Provide your feedback in a structured format with specific, actionable recommendations.
137
+ """
138
+
139
+ payload = {
140
+ "model": "llama-3.3-70b-versatile", # or another groq supported model
141
+ "messages": [
142
+ {"role": "user", "content": prompt}
143
+ ]
144
+ }
145
+
146
+ response = requests.post(
147
+ "https://api.groq.com/openai/v1/chat/completions",
148
+ headers=headers,
149
+ json=payload
150
+ )
151
+
152
+ if response.status_code == 200:
153
+ return response.json()["choices"][0]["message"]["content"]
154
+ else:
155
+ return f"Error getting Groq LLM feedback: {response.status_code} - {response.text}"
156
+
157
+ except Exception as e:
158
+ return f"Error getting Groq LLM feedback: {str(e)}"
159
+
160
+
161
+ def save_eval_card(yaml_content, filename=None):
162
+ """Save an eval card to the repository"""
163
+ try:
164
+ # Parse YAML to validate it
165
+ eval_data = yaml.safe_load(yaml_content)
166
+
167
+ # Generate filename if not provided
168
+ if not filename:
169
+ eval_name = eval_data.get("title", "Unnamed Evaluation")
170
+ # Clean filename
171
+ filename = re.sub(r'[^\w\-_]', '_', eval_name)
172
+ filename = f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
173
+
174
+ # Save file
175
+ file_path = os.path.join(EVAL_CARDS_DIR, filename)
176
+ with open(file_path, "w") as file:
177
+ file.write(yaml_content)
178
+
179
+ return True, file_path
180
+ except Exception as e:
181
+ return False, str(e)
182
+
183
+ def load_all_eval_cards():
184
+ """Load all eval cards from the repository"""
185
+ eval_cards = []
186
+
187
+ for filename in os.listdir(EVAL_CARDS_DIR):
188
+ if filename.endswith(".yaml"):
189
+ file_path = os.path.join(EVAL_CARDS_DIR, filename)
190
+ try:
191
+ with open(file_path, "r") as file:
192
+ yaml_content = file.read()
193
+ eval_data = yaml.safe_load(yaml_content)
194
+
195
+ # Compute coverage score
196
+ score, score_details = compute_coverage_score(eval_data)
197
+
198
+ # Extract key metadata
199
+ eval_cards.append({
200
+ "filename": filename,
201
+ "title": eval_data.get("title", "Unnamed Evaluation"),
202
+ "summary": eval_data.get("summary", ""),
203
+ "authors": ", ".join(eval_data.get("metadata", {}).get("authors", [])),
204
+ "creation_date": eval_data.get("metadata", {}).get("creation_date", ""),
205
+ "coverage_score": score,
206
+ "score_details": score_details,
207
+ "yaml_content": yaml_content,
208
+ "data": eval_data
209
+ })
210
+ except Exception as e:
211
+ print(f"Error loading {filename}: {str(e)}")
212
+
213
+ return eval_cards
214
+
215
+ def format_eval_card_as_html(eval_card):
216
+ """Format an eval card as HTML for display"""
217
+ html = f"""
218
+ <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
219
+ <h3>{eval_card['title']}</h3>
220
+ <p>{eval_card['summary']}</p>
221
+ <p><strong>Authors:</strong> {eval_card['authors']}</p>
222
+ <p><strong>Created:</strong> {eval_card['creation_date']}</p>
223
+ <p><strong>Coverage Score:</strong> {eval_card['coverage_score']}%</p>
224
+
225
+ <h4>Coverage by Section:</h4>
226
+ <table style="width: 100%; border-collapse: collapse;">
227
+ <tr>
228
+ <th style="text-align: left; padding: 5px; border-bottom: 1px solid #ddd;">Section</th>
229
+ <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Score</th>
230
+ <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
231
+ </tr>
232
+ """
233
+
234
+ for section, details in eval_card['score_details'].items():
235
+ html += f"""
236
+ <tr>
237
+ <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
238
+ <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['score']}/{details['max_score']}</td>
239
+ <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['completion_rate']}%</td>
240
+ </tr>
241
+ """
242
+
243
+ html += f"""
244
+ </table>
245
+ <div style="margin-top: 10px;">
246
+ <a href="#" onclick="viewYaml(this)" data-filename="{eval_card['filename']}" style="text-decoration: none; color: #3273dc;">View YAML</a>
247
+ </div>
248
+ </div>
249
+ """
250
+
251
+ return html
252
+
253
+ def create_eval_cards_table(eval_cards):
254
+ """Create an HTML table of eval cards"""
255
+ if not eval_cards:
256
+ return "<p>No evaluation cards found.</p>"
257
+
258
+ # Sort by coverage score (highest first)
259
+ eval_cards.sort(key=lambda x: x['coverage_score'], reverse=True)
260
+
261
+ html = ""
262
+ for eval_card in eval_cards:
263
+ html += format_eval_card_as_html(eval_card)
264
+
265
+ return html
266
+
267
+ def upload_file(file):
268
+ """Process an uploaded YAML file"""
269
+ if file is None:
270
+ return "No file uploaded", None
271
+
272
+ try:
273
+ # gr.File passes a temp-file path by default; also handle raw bytes for safety
+ yaml_content = file.decode("utf-8") if isinstance(file, (bytes, bytearray)) else Path(file).read_text()
274
+ # Validate YAML
275
+ eval_data = yaml.safe_load(yaml_content)
276
+ return yaml_content, eval_data
277
+ except Exception as e:
278
+ return f"Error processing file: {str(e)}", None
279
+
280
+ def get_feedback(yaml_content, api_token):
281
+ """Get LLM feedback on the eval card"""
282
+ if not yaml_content:
283
+ return "Please upload or paste a YAML file first."
284
+
285
+ if not api_token:
286
+ return "Please provide an API token for the LLM service."
287
+
288
+ feedback = get_llm_feedback(yaml_content, api_token)
289
+ return feedback
290
+
291
+ def submit_eval_card(yaml_content):
292
+ """Submit an eval card to the repository"""
293
+ if not yaml_content:
294
+ return "Please upload or paste a YAML file first.", None, None
295
+
296
+ try:
297
+ # Validate YAML
298
+ eval_data = yaml.safe_load(yaml_content)
299
+
300
+ # Compute coverage score
301
+ score, score_details = compute_coverage_score(eval_data)
302
+
303
+ # Save eval card
304
+ success, file_path = save_eval_card(yaml_content)
305
+
306
+ if success:
307
+ return f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details
308
+ else:
309
+ return f"Error saving evaluation card: {file_path}", None, None
310
+
311
+ except Exception as e:
312
+ return f"Error processing evaluation card: {str(e)}", None, None
313
+
314
+ def refresh_gallery():
315
+ """Refresh the gallery of eval cards"""
316
+ eval_cards = load_all_eval_cards()
317
+ html = create_eval_cards_table(eval_cards)
318
+
319
+ # Convert data to pandas DataFrame for table view
320
+ table_data = []
321
+ for card in eval_cards:
322
+ table_data.append({
323
+ "Title": card["title"],
324
+ "Authors": card["authors"],
325
+ "Creation Date": card["creation_date"],
326
+ "Coverage Score": f"{card['coverage_score']}%"
327
+ })
328
+
329
+ df = pd.DataFrame(table_data)
330
+
331
+ return html, df if not df.empty else None
332
+
333
+ def handle_upload_tab(file_obj, yaml_text):
334
+ """Handle upload tab actions - either use uploaded file or pasted text"""
335
+ if file_obj is not None:
336
+ yaml_content, eval_data = upload_file(file_obj)
337
+ return yaml_content
338
+ else:
339
+ return yaml_text
340
+
341
+ # Create the Gradio interface
342
+ with gr.Blocks(title="Evaluation Card Repository") as app:
343
+ with gr.Row():
344
+ with gr.Column(scale=2):
345
+ gr.Markdown("# Evaluation Card Repository")
346
+ gr.Markdown("""
347
+ This application allows you to upload, validate, and explore ML evaluation cards.
348
+
349
+ Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
350
+ """)
351
+
352
+ with gr.Tabs():
353
+ with gr.TabItem("Upload & Review"):
354
+ with gr.Row():
355
+ with gr.Column():
356
+ file_upload = gr.File(label="Upload YAML File", file_types=[".yaml", ".yml"])
357
+
358
+ with gr.Accordion("Or paste YAML content", open=False):
359
+ yaml_input = gr.TextArea(label="YAML Content", placeholder="Paste your YAML content here...", lines=10)
360
+
361
+ load_template_btn = gr.Button("Load Template")
362
+
363
+ api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
364
+
365
+ with gr.Row():
366
+ get_feedback_btn = gr.Button("Get LLM Feedback")
367
+ submit_btn = gr.Button("Submit Evaluation Card", variant="primary")
368
+
369
+ with gr.Column():
370
+ yaml_display = gr.TextArea(label="Current YAML", lines=20)
371
+
372
+ with gr.Accordion("LLM Feedback", open=True):
373
+ feedback_display = gr.Markdown()
374
+
375
+ with gr.Accordion("Submission Result", open=True):
376
+ result_display = gr.Markdown()
377
+ coverage_score = gr.Number(label="Coverage Score", visible=False)
378
+ coverage_details = gr.JSON(label="Coverage Details", visible=False)
379
+
380
+ with gr.TabItem("Gallery"):
381
+ refresh_btn = gr.Button("Refresh Gallery")
382
+
383
+ with gr.Tabs():
384
+ with gr.TabItem("Card View"):
385
+ gallery_html = gr.HTML()
386
+
387
+ with gr.TabItem("Table View"):
388
+ gallery_table = gr.DataFrame()
389
+
390
+ # Set up event handlers
391
+ load_template_btn.click(
392
+ fn=load_template,
393
+ outputs=[yaml_display]
394
+ )
395
+
396
+ file_upload.change(
397
+ fn=handle_upload_tab,
398
+ inputs=[file_upload, yaml_input],
399
+ outputs=[yaml_display]
400
+ )
401
+
402
+ yaml_input.change(
403
+ fn=lambda x: x,
404
+ inputs=[yaml_input],
405
+ outputs=[yaml_display]
406
+ )
407
+
408
+ get_feedback_btn.click(
409
+ fn=get_feedback,
410
+ inputs=[yaml_display, api_token],
411
+ outputs=[feedback_display]
412
+ )
413
+
414
+ submit_btn.click(
415
+ fn=submit_eval_card,
416
+ inputs=[yaml_display],
417
+ outputs=[result_display, coverage_score, coverage_details]
418
+ )
419
+
420
+ refresh_btn.click(
421
+ fn=refresh_gallery,
422
+ outputs=[gallery_html, gallery_table]
423
+ )
424
+
425
+ # Initialize the gallery on app start
426
+ app.load(
427
+ fn=refresh_gallery,
428
+ outputs=[gallery_html, gallery_table]
429
+ )
430
+
431
+ # Launch the app
432
+ if __name__ == "__main__":
433
+ app.launch()
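
To make the weighting behind `compute_coverage_score` concrete: each section's completion rate is multiplied by its weight and the products are summed. The sketch below mirrors that scheme; the `filled_fraction` helper and the toy card are illustrative simplifications, not code from this commit.

```python
# Sketch of the weighting scheme used by compute_coverage_score in app.py.
# The weights mirror three of the section weights defined above; the toy card
# and the simplified filled_fraction helper are hypothetical, for illustration only.
weights = {"metadata": 5, "estimand": 20, "estimator": 25}

toy_card = {
    "metadata": {"authors": ["A. Author"], "creation_date": ""},      # 1 of 2 fields filled
    "estimand": {"target_construct": {"primary_capability": "QA"}},   # fully filled
    # "estimator" is missing entirely -> contributes 0 of its 25 points
}

def filled_fraction(node):
    """Fraction of leaf fields that are non-empty (simplified stand-in for count_filled_fields)."""
    if isinstance(node, dict):
        children = list(node.values())
    elif isinstance(node, list):
        children = node
    else:
        return 1.0 if node else 0.0
    if not children:
        return 0.0
    return sum(filled_fraction(child) for child in children) / len(children)

score = sum(filled_fraction(toy_card.get(section, {})) * weight
            for section, weight in weights.items())
print(round(score, 2))  # 0.5 * 5 + 1.0 * 20 + 0.0 * 25 = 22.5
```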
eval_cards/ChemBench_20250312_170522.yaml ADDED
@@ -0,0 +1,411 @@
1
+ title: "ChemBench"
2
+
3
+ summary: >
4
+ ChemBench was developed as a comprehensive benchmarking suite for the performance of LLMs in chemistry.
5
+ It features a curated set of more than 2,700 question-answer pairs, classified by whether they probe the knowledge, intuition,
6
+ and reasoning abilities of LLMs. ChemBench goes beyond simple MCQ evaluation, supports floating point
7
+ answers (including scientific notation), and prompts models in a way that closely matches how they were trained.
8
+
9
+ metadata:
10
+ authors:
11
+ - Adrian Mirza
12
+ - Nawaf Alampara
13
+ - Sreekanth Kunchapu
14
+ - MartiΓ±o RΓ­os-GarcΓ­a
15
+ - Benedict Emoekabu
16
+ - Aswanth Krishnan
17
+ - Tanya Gupta
18
+ - Mara Schilling-Wilhelmi
19
+ - Macjonathan Okereke
20
+ - Anagha Aneesh
21
+ - Mehrdad Asgari
22
+ - Juliane Eberhardt
23
+ - Amir Mohammad Elahi
24
+ - Hani M. Elbeheiry
25
+ - MarΓ­a Victoria Gil
26
+ - Christina Glaubitz
27
+ - Maximilian Greiner
28
+ - Caroline T. Holick
29
+ - Tim Hoffmann
30
+ - Abdelrahman Ibrahim
31
+ - Lea C. Klepsch
32
+ - Yannik KΓΆster
33
+ - Fabian Alexander Kreth
34
+ - Jakob Meyer
35
+ - Santiago Miret
36
+ - Jan Matthias Peschel
37
+ - Michael Ringleb
38
+ - Nicole Roesner
39
+ - Johanna Schreiber
40
+ - Ulrich S. Schubert
41
+ - Leanne M. Stafast
42
+ - Dinga Wonanke
43
+ - Michael Pieler
44
+ - Philippe Schwaller
45
+ - Kevin Maik Jablonka
46
+ maintainers:
47
+ - Adrian Mirza
48
+ - Nawaf Alampara
49
+ - MartiΓ±o RΓ­os-GarcΓ­a
50
+ - Kevin Maik Jablonka
51
+ creation_date: "2023-05-15"
52
+ last_review_date: "2024-11-01"
53
+ next_review_date: "YTBD"
54
+ version_compatibility:
55
+ - "v0.3.0"
56
+
57
+ evaluation_design:
58
+ motivation:
59
+ scientific_needs: >
60
+ ChemBench is one of the pioneering benchmarks for evaluating the performance of LLMs specifically in chemistry.
61
+ Prior selection of LLMs on chemistry tasks has been based on their performance on general benchmarks like Big Bench.
62
+ approach_justification: >
63
+ ChemBench comprehensively evaluates almost all the leading models on a wide range of chemistry topics,
64
+ allowing topic-specific leaders identification. It also probes safety knowledge of LLMs and evaluates
65
+ measures of alignment with human intuitions.
66
+ expected_benefits: >
67
+ Provides comparison metrics for LLM training on chemistry-specific tasks and evaluates performance
68
+ across different chemistry topics.
69
+ tradeoffs: >
70
+ Current LLMs lack human intuitions. ChemBench currently does not support evaluation of open-ended chemistry tasks.
71
+
72
+ type_and_structure:
73
+ type: "Benchmark"
74
+ structure: >
75
+ End-to-end automation, careful validation by experts, and usability with black box systems.
76
+ The benchmark covers a diverse set of topics and skills (reasoning, calculation, knowledge, and intuition)
77
+ across a range of difficulty levels.
78
+ timeline: ""
79
+ key_design_decisions:
80
+ - Benchmark approach for scalability and easier accessibility
81
+ - End-to-end automation for frequent model evaluation
82
+ - Careful validation by experts to minimize incorrect or unanswerable questions
83
+ - Support for models with special treatment of molecules
84
+ - Usability with black box systems without access to weights or logits
85
+ - Probing capabilities beyond MCQs to reflect real-world chemistry
86
+ - Coverage of diverse topics and skills
87
+ - Range of difficulty levels to measure improvement
88
+ - Impossible to completely solve with current models
89
+ design_process:
90
+ stakeholder_consultation: "ChemBench is internally used by some of the leading AI labs"
91
+ pilot_studies:
92
+ - "LLM ChemBench results were compared against humans using a subset of ChemBench"
93
+ validation_approaches:
94
+ - "Codebase tested with unit tests covering parsing modules, metrics modules, and extraction modules"
95
+ - "Questions verified manually by experts through GitHub pull requests"
96
+ - "Automated checks via GitHub Actions for schemas, LATEX templating, and formatting"
97
+ - "Leaderboard verification of complete corpus evaluation"
98
+
99
+ stakeholders_and_resources:
100
+ target_users:
101
+ - "General audience developing or evaluating ML models"
102
+ - "Researchers developing chemistry datasets"
103
+ required_expertise:
104
+ - "Basic knowledge of using benchmarks (simple how-to guide provided)"
105
+ resource_requirements:
106
+ - "API keys for closed-source models"
107
+ - "GPUs for fast local benchmarking (CPU also possible but slower)"
108
+ cost_considerations: "Nil"
109
+
110
+ estimand:
111
+ target_construct:
112
+ primary_capability: "Capabilities of models to answer chemistry questions"
113
+ measurement_type: "Pragmatic"
114
+ relationship_to_applications: >
115
+ ChemBench score can be considered a comparative metric to measure gains in LLM training.
116
+ Shows positive correlation to performance on tasks like data extraction.
117
+ theoretical_framework: >
118
+ Assumes the corpus is not being used for training during model development.
119
+ Findings on capabilities are based on performance in answering questions that rely on
120
+ reasoning, calculation, knowledge, and intuition for humans to solve.
121
+
122
+ scope_and_limitations:
123
+ coverage: >
124
+ Over 2,700 question-answer pairs classified to probe knowledge, intuition, and reasoning.
125
+ Covers subjects within Chemistry taught at undergraduate and postgraduate level courses.
126
+ excluded_capabilities:
127
+ - "Property prediction capabilities"
128
+ - "Data extraction capabilities"
129
+ - "Embedding meaningfulness"
130
+ - "Agentic capabilities"
131
+ known_blind_spots:
132
+ - "Questions considered answered correctly only if final answer is correct"
133
+ - "Partial scoring and open-ended evaluation not covered"
134
+ theoretical_limitations:
135
+ - "Questions treated with equal weights, no clear approach for weighing tasks"
136
+ - "Reliability and correlation between log probabilities and model responses not known"
137
+
138
+ assessment_components:
139
+ test_set:
140
+ data_sources:
141
+ - "Curated questions from existing exams or exercise sheets"
142
+ - "Programmatically created questions"
143
+ sampling_methodology: "Each model evaluated on all questions"
144
+ known_biases:
145
+ - "Questions mainly curated from the background of the developers"
146
+ approach_to_duplicates: >
147
+ Each question-answer pair hashed to create unique IDs, filtering to keep unique questions based on UUIDs.
148
+ data_quality: >
149
+ Guidelines followed by reviewers: originality, clarity, factual correctness, and avoiding ambiguity.
150
+
151
+ estimator:
152
+ evaluation_protocol:
153
+ methodology: >
154
+ Distinct prompt templates for completion and instruction-tuned models. Multistep parsing workflow
155
+ based on regular expressions with LLM extraction as fallback. Comprehensive refusal detection combining
156
+ regular expression-based detection and a fine-tuned BERT model.
157
+ control_measures:
158
+ - "Model-specific prompt templates"
159
+ - "Consistent parsing workflow"
160
+ - "Refusal detection and retry mechanism"
161
+ handling_random_components: "Refusal detection and retry mechanism for up to n times"
162
+ reproducibility_requirements: >
163
+ Storage of model timestamp, time, and version of the dataset used for benchmarking.
164
+
165
+ metrics:
166
+ primary_metrics:
167
+ - "Fraction of correctly answered questions"
168
+ aggregation_methodology: "Final score is mean of scores across all questions from all topics"
169
+ task_weightings:
170
+ approach: "All questions treated equally to avoid ambiguity"
171
+ note: "Questions classified into three difficulty levels manually by experts for further analysis"
172
+ performance_bounds:
173
+ scoring: "No partial scoring - all questions measured as correct/incorrect"
174
+ connection_to_outcomes: "Scores reflect how well the model is trained on chemistry"
175
+
176
+ metric_details:
177
+ - name: "Fraction Correct"
178
+ definition: >
179
+ Proportion of correct answers out of total questions. For MCQs, uses Hamming Loss;
180
+ for numerics, uses Mean Absolute Error with 1% threshold.
181
+ implementation: >
182
+ (1/n) * (sum(1-HammingLoss_i for i in MCQ) + sum(indicator(MAE_j < 0.01*|Target_j|) for j in Numeric))
183
+ edge_cases:
184
+ - "Perfect score: 1 when all questions answered correctly"
185
+ - "Complete failure: 0 when all questions answered incorrectly"
186
+ statistical_properties:
187
+ - "Simplicity: Easy to calculate and interpret"
188
+ - "Range: Always bounded between [0, 1]"
189
+ - "Binary nature: Each question contributes either 0 or 1"
190
+ failure_modes:
191
+ - "Masking: High overall accuracy can hide poor performance on specific question types"
192
+ - "Insensitivity to confidence: Doesn't account for prediction confidence"
193
+ - "Equal weighting: Assigns equal importance regardless of difficulty"
194
+ - "Heterogeneous data: Combining different question types with different evaluation criteria"
195
+ - "Threshold sensitivity: Results highly dependent on chosen thresholds"
196
+ - "Near-zero targets: For small target values, 1% threshold becomes extremely stringent"
197
+
198
+ - name: "Hamming Loss"
199
+ definition: >
200
+ Measures fraction of labels incorrectly predicted for MCQs.
201
+ (1/L) * sum(indicator(y_i,l != y_hat_i,l) for l in 1 to L)
202
+ implementation: "For single-answer MCQ, 0 if answer correct, 1 if incorrect"
203
+ statistical_properties:
204
+ - "Linearity: Scales linearly with misclassifications"
205
+ - "Range: Always bounded between [0, 1]"
206
+ - "Symmetry: Treats false positives and negatives equally"
207
+ failure_modes:
208
+ - "Equal weighting: Assigns equal importance regardless of difficulty"
209
+ - "Lack of severity grading: All errors weighted equally"
210
+ - "Multi-label complexity: May not capture label dependencies"
211
+ - "Simplistic for complex MCQs: Doesn't account for partial correctness"
212
+
213
+ technical_framework:
214
+ implementation_requirements:
215
+ - "Installing ChemBench package"
216
+ - "API keys for closed-source models"
217
+ - "GPUs for fast benchmarking (CPU also possible)"
218
+ time_constraints: "Complete benchmarking requires around 2 hours"
219
+ dependencies:
220
+ - "tenacity==8.3.0"
221
+ - "langchain>=0.1.5"
222
+ - "fastcore>=1.5.29"
223
+ - "scikit-learn>=1.4.0"
224
+ - "loguru>=0.7.2"
225
+ - "litellm>=1.59.1"
226
+ - "backoff>=2.2.1"
227
+ - "tqdm>=4.66.1"
228
+ - "pint>=0.23"
229
+ - "pandas>=2.2.0"
230
+ - "python-dotenv>=1.0.1"
231
+ - "fire>=0.5.0"
232
+ - "datasets"
233
+ - "torch"
234
+ - "transformers"
235
+ - "langchain-community>=0.0.17"
236
+ - "pillow"
237
+
238
+ constraints_and_rules:
239
+ allowed_resources:
240
+ - "Models not trained on the ChemBench corpus (not tested)"
241
+ permitted_approaches:
242
+ - "Tools or other agentic setups"
243
+ - "No constraints on model parameters or computational constraints"
244
+ - "No constraints on temperature or decoding strategies"
245
+ - "No constraints on architecture or post-training approaches"
246
+ optimization_constraints:
247
+ - "Prompts not optimized unless part of modeling"
248
+ ethical_boundaries:
249
+ - "Models not trained on the ChemBench corpus (not tested)"
250
+
251
+ estimate:
252
+ required_reporting:
253
+ essential_metrics:
254
+ - "all_correct (binary score of 0/1 for each question)"
255
+ - "Fraction correct (final score computed across all questions)"
256
+ - "Refusal detections and LLM parsing flags"
257
+ results_disaggregation: >
258
+ Individual scoring and relative position available for Topics:
259
+ Analytical Chemistry, Materials Science, Technical Chemistry, General Chemistry,
260
+ Physical Chemistry, Toxicity and Safety, Inorganic Chemistry, Organic Chemistry,
261
+ and Human Preference. Separate scores for easy/hard tasks, reasoning tasks,
262
+ computation tasks, knowledge tasks, human preference alignment, and comparison
263
+ against human chemists.
264
+ uncertainty_quantification: >
265
+ ChemBench has a unique way to obtain confidence of model predictions using prompting,
266
+ but this is a separate analysis not part of benchmark metrics.
267
+ performance_variation: "Currently not done"
268
+ resource_usage_reporting: "Currently tracks number of parameters if available"
269
+
270
+ reproducibility_information:
271
+ documentation_requirements:
272
+ - "model_name"
273
+ - "model_timestamp"
274
+ - "model_description"
275
+ - "date_published (optional)"
276
+ - "open_weights (optional)"
277
+ - "open_dataset (optional)"
278
+ - "nr_of_parameters (optional)"
279
+ - "github (optional)"
280
+ - "paper (optional)"
281
+ - "api_endpoint (optional)"
282
+ - "nr_of_tokens (optional)"
283
+ - "architecture (optional)"
284
+ - "mixture_of_experts (optional)"
285
+ - "model_alignment (optional)"
286
+ - "reinforcement_learning_from_human_feedback (optional)"
287
+ - "domain_specific_pretraining (optional)"
288
+ - "domain_specific_finetuning (optional)"
289
+ - "tool_use (optional)"
290
+ - "tool_type (optional)"
291
+ - "temperature (optional)"
292
+ - "epochs (optional)"
293
+ - "reasoning_model (optional)"
294
+ - "reasoning_type (optional)"
295
+ environment_specifications: >
296
+ Benchmarking performed using latest version of ChemBench pipeline and ChemBench Dataset.
297
+ randomization_handling: >
298
+ Temperature or other randomization or seeding expected in model description.
299
+ output_standardization: >
300
+ Outputs prompted to be given in ChemBench parsing compatible format.
301
+
302
+ results_communication:
303
+ visualization:
304
+ recommended_plots:
305
+ - "Spider chart showing model performance on different topics against baseline and other leading models"
306
+ - "Reliability and distribution of confidence estimates, showing confidence calibration"
307
+ standardized_formats:
308
+ - "Latest results maintained in ChemBench-Leaderboard"
309
+ - "Refusals counted as incorrect"
310
+ - "Baseline model as defined in paper"
311
+ - "Final answer based on ChemBench pipeline, not log probabilities"
312
+
313
+ leaderboard_guidelines:
314
+ submission_process: "Detailed in Huggingface Space documentation"
315
+ required_metadata:
316
+ - "Model details as specified in documentation requirements"
317
+
318
+ known_issues_and_limitations:
319
+ validity_concerns:
320
+ construct_validity: >
321
+ Even though ChemBench goes beyond MCQ-only benchmarks by including numeric questions,
322
+ evaluation on open-ended tasks is not included. Partial scoring and task weighing not supported.
323
+ gaming_possibilities: "Possibility to host ChemBench as a challenge"
324
+ stability_considerations: >
325
+ Refusal detection and retry mechanism implemented to tackle LLM refusals,
326
+ combining regex-based detection and fine-tuned BERT model.
327
+ temporal_validity: >
328
+ Questions based on scientific principles won't lose validity,
329
+ but may appear in training corpora over time.
330
+
331
+ practical_limitations:
332
+ resource_constraints: "Based on the model being benchmarked"
333
+ scalability_issues: "Based on the model being benchmarked"
334
+ cost_factors: "Based on the model being benchmarked"
335
+ time_boundaries: "Benchmark might lose validity as questions leak to training corpora"
336
+
337
+ bias_and_fairness:
338
+ known_biases:
339
+ - "Biases from human curation process"
340
+ representation_issues: "Certain areas of chemistry not evaluated"
341
+ potential_impacts: "Certain areas of chemistry not evaluated"
342
+ mitigation_approaches: "Curation by team of more than 10 people to balance biases"
343
+
344
+ version_and_maintenance:
345
+ version_information:
346
+ version:
347
+ results: "v1.0.4"
348
+ dataset: "v1.0.0"
349
+ code: "v0.3.0"
350
+ release_date: "2024-11-01"
351
+ change_history: "Tracked in GitHub repository changelog"
352
+ update_plans: "Discussed in GitHub repository discussions"
353
+
354
+ maintenance_protocol:
355
+ update_frequency: "Ad hoc after release"
356
+ deprecation_policy: >
357
+ Based on major issues with questions. Questions removed and dataset version updated.
358
+ Major updates lead to rerunning models for updated Leaderboard.
359
+ issue_reporting: "Issues tracked in GitHub repository"
360
+ community_involvement: >
361
+ Maintainers active in solving user issues on GitHub.
362
+ Proposal for forum in Mat Sci Community Disclosure.
363
+ Discussions available on GitHub and Huggingface.
364
+ criteria_for_updates:
365
+ - "Codebase updated for new features or bug fixes"
366
+ - "Dataset updated when questions added or removed"
367
+ - "Leaderboard updated for new models or dataset updates"
368
+ breaking_change_policy: >
369
+ All models in leaderboard rerun with new updates.
370
+ Update of arXiv paper released. Proposal to release a commit.
371
+ backwards_compatibility: >
372
+ Pydantic base classes for task and report stable for compatibility.
373
+ Major changes to tasks and report backward compatible.
374
+ migration_guides: "Released in documentation as needed"
375
+
376
+ citation_and_usage:
377
+ citation_information:
378
+ recommended_citation: >
379
+ @misc{mirza2024largelanguagemodelssuperhuman,
380
+ title={Are large language models superhuman chemists?},
381
+ author={Adrian Mirza and Nawaf Alampara and Sreekanth Kunchapu and Benedict Emoekabu and Aswanth Krishnan and Mara Wilhelmi and Macjonathan Okereke and Juliane Eberhardt and Amir Mohammad Elahi and Maximilian Greiner and Caroline T. Holick and Tanya Gupta and Mehrdad Asgari and Christina Glaubitz and Lea C. Klepsch and Yannik KΓΆster and Jakob Meyer and Santiago Miret and Tim Hoffmann and Fabian Alexander Kreth and Michael Ringleb and Nicole Roesner and Ulrich S. Schubert and Leanne M. Stafast and Dinga Wonanke and Michael Pieler and Philippe Schwaller and Kevin Maik Jablonka},
382
+ year={2024},
383
+ eprint={2404.01475},
384
+ archivePrefix={arXiv},
385
+ primaryClass={cs.LG},
386
+ url={https://arxiv.org/abs/2404.01475},
387
+ }
388
+ related_publications:
389
+ - "Are large language models superhuman chemists? (https://arxiv.org/abs/2404.01475)"
390
+ - "Probing the limitations of multimodal language models for chemistry and materials research (https://arxiv.org/pdf/2411.16955)"
391
+ licensing_details: "MIT License"
392
+
393
+ usage_guidelines:
394
+ recommended_applications:
395
+ - "Evaluation of LLM capabilities in chemistry"
396
+ inappropriate_uses:
397
+ - "Training models with the ChemBench dataset"
398
+ implementation_best_practices: >
399
+ Results obtained with ChemBench pipeline and latest dataset at time of benchmarking considered valid practice.
400
+ ethical_considerations: "ChemBench dataset not meant for training"
401
+
402
+ additional_notes:
403
+ related_evaluations:
404
+ - "ChemBench extension for multimodal models (https://arxiv.org/pdf/2411.16955)"
405
+ - "MatText for bottlenecks of finetuned LLMs on property prediction (https://arxiv.org/abs/2406.17295)"
406
+ - "MaScQA for investigating materials science knowledge of LLMs (https://pubs.rsc.org/en/content/articlelanding/2024/dd/d3dd00188a)"
407
+ - "Measuring Capabilities of Language Models for Biology Research (https://arxiv.org/abs/2407.10362)"
408
+ future_directions: >
409
+ Sensitivity to prompting, improving performance with prompt optimization.
410
+ Mechanistic interpretability. Benchmarking agents on ChemBench.
411
+ Effect of grounding and post-training approaches.
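
The `metric_details` block above states the scoring rules in pseudocode. As a worked example, the sketch below applies those rules as written in the card (exact match for single-answer MCQs, a 1% relative error threshold for numeric answers); it is illustrative only, is not the ChemBench implementation, and the sample questions and answers are invented.

```python
# Worked example of the scoring rules described in metric_details above.
# Follows the formulas as stated in the card, not the actual ChemBench codebase.

def mcq_score(true_answer: str, predicted_answer: str) -> float:
    """Single-answer MCQ: 1 - HammingLoss collapses to 1 if correct, 0 otherwise."""
    return 1.0 if predicted_answer == true_answer else 0.0

def numeric_score(target: float, prediction: float, rel_tol: float = 0.01) -> float:
    """Numeric answer counts as correct if the absolute error is below 1% of |target|."""
    return 1.0 if abs(prediction - target) < rel_tol * abs(target) else 0.0

scores = [
    mcq_score("B", "B"),                                  # correct MCQ -> 1.0
    mcq_score("C", "A"),                                  # wrong MCQ   -> 0.0
    numeric_score(target=6.022e23, prediction=6.05e23),   # within 1%   -> 1.0
]

fraction_correct = sum(scores) / len(scores)
print(round(fraction_correct, 3))  # 0.667
```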
requirement.txt ADDED
@@ -0,0 +1,4 @@
1
+ gradio>=4.0.0
2
+ pyyaml>=6.0
3
+ pandas>=2.0.0
4
+ requests>=2.31.0
script.js ADDED
@@ -0,0 +1,90 @@
1
+ // Function to view YAML file
2
+ function viewYaml(element) {
3
+ const filename = element.getAttribute('data-filename');
4
+
5
+ // Make an AJAX request to fetch the YAML content
6
+ fetch(`/get_yaml?filename=${encodeURIComponent(filename)}`)
7
+ .then(response => response.text())
8
+ .then(yamlContent => {
9
+ // Display the YAML content in a modal
10
+ const modal = document.createElement('div');
11
+ modal.style.position = 'fixed';
12
+ modal.style.top = '0';
13
+ modal.style.left = '0';
14
+ modal.style.width = '100%';
15
+ modal.style.height = '100%';
16
+ modal.style.backgroundColor = 'rgba(0, 0, 0, 0.5)';
17
+ modal.style.zIndex = '1000';
18
+ modal.style.display = 'flex';
19
+ modal.style.justifyContent = 'center';
20
+ modal.style.alignItems = 'center';
21
+
22
+ const modalContent = document.createElement('div');
23
+ modalContent.style.backgroundColor = 'white';
24
+ modalContent.style.padding = '20px';
25
+ modalContent.style.borderRadius = '5px';
26
+ modalContent.style.maxWidth = '80%';
27
+ modalContent.style.maxHeight = '80%';
28
+ modalContent.style.overflow = 'auto';
29
+
30
+ const closeButton = document.createElement('button');
31
+ closeButton.textContent = 'Close';
32
+ closeButton.style.marginBottom = '10px';
33
+ closeButton.style.padding = '5px 10px';
34
+ closeButton.style.cursor = 'pointer';
35
+ closeButton.onclick = () => {
36
+ document.body.removeChild(modal);
37
+ };
38
+
39
+ const yamlPre = document.createElement('pre');
40
+ yamlPre.textContent = yamlContent;
41
+ yamlPre.style.whiteSpace = 'pre-wrap';
42
+ yamlPre.style.wordBreak = 'break-word';
43
+
44
+ modalContent.appendChild(closeButton);
45
+ modalContent.appendChild(yamlPre);
46
+ modal.appendChild(modalContent);
47
+
48
+ document.body.appendChild(modal);
49
+ })
50
+ .catch(error => {
51
+ console.error('Error fetching YAML content:', error);
52
+ alert('Error fetching YAML content: ' + error.message);
53
+ });
54
+ }
55
+
56
+ // Function to visualize coverage scores
57
+ function visualizeCoverage(scoreDetails) {
58
+ const chartContainer = document.getElementById('coverage-chart');
59
+
60
+ // Create a bar chart using a visualization library
61
+ // This is just a placeholder - you would use a library like Chart.js
62
+
63
+ let html = `<div style="margin-top: 20px;">
64
+ <h3>Coverage by Section</h3>
65
+ <div style="display: flex; flex-direction: column; gap: 5px;">`;
66
+
67
+ for (const [section, details] of Object.entries(scoreDetails)) {
68
+ const percentage = details.completion_rate;
69
+ html += `
70
+ <div>
71
+ <div style="display: flex; justify-content: space-between; margin-bottom: 2px;">
72
+ <span>${section}</span>
73
+ <span>${percentage}%</span>
74
+ </div>
75
+ <div style="width: 100%; background-color: #eee; height: 10px; border-radius: 5px;">
76
+ <div style="width: ${percentage}%; background-color: #3273dc; height: 10px; border-radius: 5px;"></div>
77
+ </div>
78
+ </div>`;
79
+ }
80
+
81
+ html += '</div></div>';
82
+
83
+ chartContainer.innerHTML = html;
84
+ }
85
+
86
+ // Initialize any client-side functionality when the document loads
87
+ document.addEventListener('DOMContentLoaded', function() {
88
+ // This could be used to initialize charts or other client-side features
89
+ console.log('Client-side JavaScript initialized');
90
+ });
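
Note that `viewYaml` above fetches `/get_yaml?filename=...`, an endpoint that app.py in this commit does not define. One possible way to provide it, sketched below under the assumption that the Gradio Blocks app is mounted on a FastAPI application, is to add the route there; the module layout, the `api` object, and the `get_yaml` handler are hypothetical and not part of the commit.

```python
# Hypothetical companion to script.js: expose the /get_yaml endpoint it expects.
# Not part of this commit; the route, names, and mounting approach are assumptions.
import os

import gradio as gr
from fastapi import FastAPI, HTTPException
from fastapi.responses import PlainTextResponse

from app import EVAL_CARDS_DIR, app as blocks_app  # the gr.Blocks instance defined in app.py

api = FastAPI()

@api.get("/get_yaml", response_class=PlainTextResponse)
def get_yaml(filename: str):
    # Only serve files that actually live inside the eval cards directory.
    safe_name = os.path.basename(filename)
    path = os.path.join(EVAL_CARDS_DIR, safe_name)
    if not os.path.isfile(path):
        raise HTTPException(status_code=404, detail="Eval card not found")
    with open(path, "r") as fh:
        return fh.read()

# Serve the Gradio UI at the root of the same FastAPI application
# (run with: uvicorn serve:api, assuming this file is saved as serve.py).
api = gr.mount_gradio_app(api, blocks_app, path="/")
```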
style.css ADDED
@@ -0,0 +1,12 @@
1
+ /* General styles */
2
+ body {
3
+ font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, Cantarell, 'Open Sans', 'Helvetica Neue', sans-serif;
4
+ }
5
+
6
+ /* Eval card gallery styles */
7
+ .eval-card {
8
+ border: 1px solid #ddd;
9
+ border-radius: 5px;
10
+ padding: 20px;
11
+ margin-bottom: 20px;
12
+ background-color: white;
+ }
template.yaml ADDED
File without changes
yaml_template.yaml ADDED
File without changes