n0w0f committed on
Commit
4c9761f
·
1 Parent(s): a464556

chore: update, cards

Files changed (5)
  1. app.py +189 -148
  2. eval_cards/ChemBench_20250312_170522.yaml +0 -411
  3. script.js +85 -1
  4. template.yaml +188 -0
  5. yaml_template.yaml +188 -0
app.py CHANGED
@@ -1,17 +1,15 @@
1
  import datetime
2
  import os
3
  import re
4
- from pathlib import Path
5
 
6
  import gradio as gr
7
  import pandas as pd
8
- import requests
9
  import yaml
10
 
11
  # Constants
12
  EVAL_CARDS_DIR = "eval_cards"
13
  TEMPLATE_PATH = "template.yaml"
14
- DEFAULT_MODEL = "anthropic/claude-3-haiku-20240307" # Or any other model available on HF
15
 
16
  # Ensure the eval cards directory exists
17
  os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
@@ -21,11 +19,13 @@ with open("template.yaml", "w") as f:
21
  with open("yaml_template.yaml", "r") as template_file:
22
  f.write(template_file.read())
23
 
 
24
  def load_template():
25
  """Load the YAML template"""
26
  with open(TEMPLATE_PATH, "r") as file:
27
  return file.read()
28
 
 
29
  def yaml_to_dict(yaml_str):
30
  """Convert YAML string to Python dictionary"""
31
  try:
@@ -33,6 +33,7 @@ def yaml_to_dict(yaml_str):
33
  except yaml.YAMLError as e:
34
  return {"error": str(e)}
35
 
 
36
  def compute_coverage_score(eval_data):
37
  """
38
  Compute a coverage score for the eval card
@@ -49,22 +50,26 @@ def compute_coverage_score(eval_data):
49
  "version_and_maintenance": 5,
50
  "citation_and_usage": 5,
51
  }
52
-
53
  scores = {}
54
  total_score = 0
55
-
56
  def count_filled_fields(data, prefix=""):
57
  if isinstance(data, dict):
58
  filled = 0
59
  total = 0
60
  for key, value in data.items():
61
  if isinstance(value, (dict, list)):
62
- sub_filled, sub_total = count_filled_fields(value, f"{prefix}.{key}" if prefix else key)
 
 
63
  filled += sub_filled
64
  total += sub_total
65
  else:
66
  total += 1
67
- if value and not (isinstance(value, str) and value.strip() in ["", "[]", "{}"]):
 
 
68
  filled += 1
69
  return filled, total
70
  elif isinstance(data, list):
@@ -79,7 +84,7 @@ def compute_coverage_score(eval_data):
79
  return filled, total
80
  else:
81
  return 1 if data else 0, 1
82
-
83
  # Compute scores for each section
84
  for section, weight in sections.items():
85
  if section in eval_data:
@@ -90,7 +95,7 @@ def compute_coverage_score(eval_data):
90
  "max_score": weight,
91
  "completion_rate": round(completion_rate * 100, 2),
92
  "fields_filled": filled,
93
- "fields_total": total
94
  }
95
  total_score += scores[section]["score"]
96
  else:
@@ -99,10 +104,11 @@ def compute_coverage_score(eval_data):
99
  "max_score": weight,
100
  "completion_rate": 0,
101
  "fields_filled": 0,
102
- "fields_total": 0
103
  }
104
-
105
- return round(total_score, 2), scores
 
106
 
107
  def get_llm_feedback(yaml_content, api_token=None):
108
  """
@@ -110,22 +116,23 @@ def get_llm_feedback(yaml_content, api_token=None):
110
  Uses GROQ_API_KEY from environment variables if no token is provided
111
  """
112
  import os
 
113
  import requests
114
  from dotenv import load_dotenv
115
-
116
  # Load environment variables from .env file if it exists
117
  load_dotenv()
118
-
119
  # Use provided token or get from environment
120
  api_token = api_token or os.environ.get("GROQ_API_KEY")
121
-
122
  if not api_token:
123
  return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."
124
 
125
  try:
126
  headers = {
127
  "Content-Type": "application/json",
128
- "Authorization": f"Bearer {api_token}"
129
  }
130
 
131
  prompt = f"""
@@ -148,16 +155,14 @@ def get_llm_feedback(yaml_content, api_token=None):
148
  """
149
 
150
  payload = {
151
- "model": "llama-3.3-70b-versatile", # or another groq supported model
152
- "messages": [
153
- {"role": "user", "content": prompt}
154
- ]
155
  }
156
 
157
  response = requests.post(
158
  "https://api.groq.com/openai/v1/chat/completions",
159
  headers=headers,
160
- json=payload
161
  )
162
 
163
  if response.status_code == 200:
@@ -169,32 +174,38 @@ def get_llm_feedback(yaml_content, api_token=None):
169
  return f"Error getting Groq LLM feedback: {str(e)}"
170
 
171
 
172
- def save_eval_card(yaml_content, filename=None):
173
- """Save an eval card to the repository"""
174
  try:
175
- # Parse YAML to validate it
176
  eval_data = yaml.safe_load(yaml_content)
177
-
178
- # Generate filename if not provided
179
- if not filename:
180
- eval_name = eval_data.get("title", "Unnamed Evaluation")
181
- # Clean filename
182
- filename = re.sub(r'[^\w\-_]', '_', eval_name)
183
- filename = f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
184
-
185
- # Save file
186
  file_path = os.path.join(EVAL_CARDS_DIR, filename)
 
187
  with open(file_path, "w") as file:
188
  file.write(yaml_content)
189
-
190
- return True, file_path
191
  except Exception as e:
192
- return False, str(e)
 
193
 
194
  def load_all_eval_cards():
195
  """Load all eval cards from the repository"""
196
  eval_cards = []
197
-
198
  for filename in os.listdir(EVAL_CARDS_DIR):
199
  if filename.endswith(".yaml"):
200
  file_path = os.path.join(EVAL_CARDS_DIR, filename)
@@ -202,36 +213,48 @@ def load_all_eval_cards():
202
  with open(file_path, "r") as file:
203
  yaml_content = file.read()
204
  eval_data = yaml.safe_load(yaml_content)
205
-
206
  # Compute coverage score
207
  score, score_details = compute_coverage_score(eval_data)
208
-
209
  # Extract key metadata
210
- eval_cards.append({
211
- "filename": filename,
212
- "title": eval_data.get("title", "Unnamed Evaluation"),
213
- "summary": eval_data.get("summary", ""),
214
- "authors": ", ".join(eval_data.get("metadata", {}).get("authors", [])),
215
- "creation_date": eval_data.get("metadata", {}).get("creation_date", ""),
216
- "coverage_score": score,
217
- "score_details": score_details,
218
- "yaml_content": yaml_content,
219
- "data": eval_data
220
- })
221
  except Exception as e:
222
  print(f"Error loading {filename}: {str(e)}")
223
-
224
  return eval_cards
225
 
 
226
  def format_eval_card_as_html(eval_card):
227
  """Format an eval card as HTML for display"""
228
  html = f"""
229
  <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
230
- <h3>{eval_card['title']}</h3>
231
- <p>{eval_card['summary']}</p>
232
- <p><strong>Authors:</strong> {eval_card['authors']}</p>
233
- <p><strong>Created:</strong> {eval_card['creation_date']}</p>
234
- <p><strong>Coverage Score:</strong> {eval_card['coverage_score']}%</p>
235
 
236
  <h4>Coverage by Section:</h4>
237
  <table style="width: 100%; border-collapse: collapse;">
@@ -241,45 +264,47 @@ def format_eval_card_as_html(eval_card):
241
  <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
242
  </tr>
243
  """
244
-
245
- for section, details in eval_card['score_details'].items():
246
  html += f"""
247
  <tr>
248
  <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
249
- <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['score']}/{details['max_score']}</td>
250
- <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details['completion_rate']}%</td>
251
  </tr>
252
  """
253
-
254
  html += """
255
  </table>
256
- <div style="margin-top: 10px;">
257
- <a href="#" onclick="viewYaml(this)" data-filename="{eval_card['filename']}" style="text-decoration: none; color: #3273dc;">View YAML</a>
258
  </div>
259
  </div>
260
  """
261
-
262
  return html
263
 
 
264
  def create_eval_cards_table(eval_cards):
265
  """Create an HTML table of eval cards"""
266
  if not eval_cards:
267
  return "<p>No evaluation cards found.</p>"
268
-
269
  # Sort by coverage score (highest first)
270
- eval_cards.sort(key=lambda x: x['coverage_score'], reverse=True)
271
-
272
  html = ""
273
  for eval_card in eval_cards:
274
  html += format_eval_card_as_html(eval_card)
275
-
276
  return html
277
 
 
278
  def upload_file(file):
279
  """Process an uploaded YAML file"""
280
  if file is None:
281
  return "No file uploaded", None
282
-
283
  try:
284
  yaml_content = file.decode("utf-8")
285
  # Validate YAML
@@ -288,62 +313,74 @@ def upload_file(file):
288
  except Exception as e:
289
  return f"Error processing file: {str(e)}", None
290
 
 
291
  def get_feedback(yaml_content):
292
  """Get LLM feedback on the eval card"""
293
  if not yaml_content:
294
  return "Please upload or paste a YAML file first."
295
-
296
  # Use provided token or get from environment
297
  api_token = os.environ.get("GROQ_API_KEY")
298
-
299
  if not api_token:
300
- return "Please provide an API token or set the GROQ_API_KEY environment variable."
301
-
 
 
302
  feedback = get_llm_feedback(yaml_content, api_token)
303
  return feedback
304
 
305
- def submit_eval_card(yaml_content):
 
306
  """Submit an eval card to the repository"""
307
  if not yaml_content:
308
  return "Please upload or paste a YAML file first.", None, None
309
-
310
  try:
311
  # Validate YAML
312
  eval_data = yaml.safe_load(yaml_content)
313
-
314
  # Compute coverage score
315
  score, score_details = compute_coverage_score(eval_data)
316
-
317
- # Save eval card
318
- success, file_path = save_eval_card(yaml_content)
319
-
320
- if success:
321
- return f"Evaluation card saved successfully! Coverage score: {score}%", score, score_details
322
  else:
323
- return f"Error saving evaluation card: {file_path}", None, None
324
-
325
  except Exception as e:
326
  return f"Error processing evaluation card: {str(e)}", None, None
327
 
 
328
  def refresh_gallery():
329
  """Refresh the gallery of eval cards"""
330
  eval_cards = load_all_eval_cards()
331
  html = create_eval_cards_table(eval_cards)
332
-
333
  # Convert data to pandas DataFrame for table view
334
  table_data = []
335
  for card in eval_cards:
336
- table_data.append({
337
- "Title": card["title"],
338
- "Authors": card["authors"],
339
- "Creation Date": card["creation_date"],
340
- "Coverage Score": f"{card['coverage_score']}%"
341
- })
342
-
 
 
343
  df = pd.DataFrame(table_data)
344
-
345
  return html, df if not df.empty else None
346
 
 
347
  def handle_upload_tab(file_obj, yaml_text):
348
  """Handle upload tab actions - either use uploaded file or pasted text"""
349
  if file_obj is not None:
@@ -352,96 +389,100 @@ def handle_upload_tab(file_obj, yaml_text):
352
  else:
353
  return yaml_text
354
 
 
355
  # Create the Gradio interface
356
- with gr.Blocks(title="Evaluation Card Repository") as app:
357
  with gr.Row():
358
  with gr.Column(scale=2):
359
- gr.Markdown("# Evaluation Card Repository")
360
- gr.Markdown("""
361
- This application allows you to upload, validate, and explore ML evaluation cards.
362
-
363
- Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
364
  """)
365
-
366
  with gr.Tabs():
367
  with gr.TabItem("Upload & Review"):
368
  with gr.Row():
369
  with gr.Column():
370
- file_upload = gr.File(label="Upload YAML File", file_types=[".yaml", ".yml"])
371
-
 
 
372
  with gr.Accordion("Or paste YAML content", open=False):
373
- yaml_input = gr.TextArea(label="YAML Content", placeholder="Paste your YAML content here...", lines=10)
374
-
375
  load_template_btn = gr.Button("Load Template")
376
-
377
  # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
378
-
379
  with gr.Row():
380
  get_feedback_btn = gr.Button("Get LLM Feedback")
381
- submit_btn = gr.Button("Submit Evaluation Card", variant="primary")
382
-
 
 
383
  with gr.Column():
384
  yaml_display = gr.TextArea(label="Current YAML", lines=20)
385
-
386
  with gr.Accordion("LLM Feedback", open=True):
387
  feedback_display = gr.Markdown()
388
-
389
  with gr.Accordion("Submission Result", open=True):
390
  result_display = gr.Markdown()
391
- coverage_score = gr.Number(label="Coverage Score", visible=False)
392
- coverage_details = gr.JSON(label="Coverage Details", visible=False)
393
-
394
  with gr.TabItem("Gallery"):
395
  refresh_btn = gr.Button("Refresh Gallery")
396
-
397
  with gr.Tabs():
398
  with gr.TabItem("Card View"):
399
  gallery_html = gr.HTML()
400
-
401
  with gr.TabItem("Table View"):
402
  gallery_table = gr.DataFrame()
403
-
404
  # Set up event handlers
405
- load_template_btn.click(
406
- fn=load_template,
407
- outputs=[yaml_display]
408
- )
409
-
410
  file_upload.change(
411
- fn=handle_upload_tab,
412
- inputs=[file_upload, yaml_input],
413
- outputs=[yaml_display]
414
- )
415
-
416
- yaml_input.change(
417
- fn=lambda x: x,
418
- inputs=[yaml_input],
419
- outputs=[yaml_display]
420
  )
421
-
 
 
422
  get_feedback_btn.click(
423
- fn=get_feedback,
424
- inputs=[yaml_display],
425
- outputs=[feedback_display]
426
  )
427
-
428
  submit_btn.click(
429
  fn=submit_eval_card,
430
- inputs=[yaml_display],
431
- outputs=[result_display, coverage_score, coverage_details]
432
- )
433
-
434
- refresh_btn.click(
435
- fn=refresh_gallery,
436
- outputs=[gallery_html, gallery_table]
437
  )
438
-
 
 
439
  # Initialize the gallery on app start
440
- app.load(
441
- fn=refresh_gallery,
442
- outputs=[gallery_html, gallery_table]
443
- )
444
 
445
  # Launch the app
446
  if __name__ == "__main__":
447
- app.launch()
 
1
  import datetime
2
  import os
3
  import re
 
4
 
5
  import gradio as gr
6
  import pandas as pd
 
7
  import yaml
8
 
9
  # Constants
10
  EVAL_CARDS_DIR = "eval_cards"
11
  TEMPLATE_PATH = "template.yaml"
12
+
13
 
14
  # Ensure the eval cards directory exists
15
  os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
 
19
  with open("yaml_template.yaml", "r") as template_file:
20
  f.write(template_file.read())
21
 
22
+
23
  def load_template():
24
  """Load the YAML template"""
25
  with open(TEMPLATE_PATH, "r") as file:
26
  return file.read()
27
 
28
+
29
  def yaml_to_dict(yaml_str):
30
  """Convert YAML string to Python dictionary"""
31
  try:
 
33
  except yaml.YAMLError as e:
34
  return {"error": str(e)}
35
 
36
+
37
  def compute_coverage_score(eval_data):
38
  """
39
  Compute a coverage score for the eval card
 
50
  "version_and_maintenance": 5,
51
  "citation_and_usage": 5,
52
  }
53
+
54
  scores = {}
55
  total_score = 0
56
+
57
  def count_filled_fields(data, prefix=""):
58
  if isinstance(data, dict):
59
  filled = 0
60
  total = 0
61
  for key, value in data.items():
62
  if isinstance(value, (dict, list)):
63
+ sub_filled, sub_total = count_filled_fields(
64
+ value, f"{prefix}.{key}" if prefix else key
65
+ )
66
  filled += sub_filled
67
  total += sub_total
68
  else:
69
  total += 1
70
+ if value and not (
71
+ isinstance(value, str) and value.strip() in ["", "[]", "{}"]
72
+ ):
73
  filled += 1
74
  return filled, total
75
  elif isinstance(data, list):
 
84
  return filled, total
85
  else:
86
  return 1 if data else 0, 1
87
+
88
  # Compute scores for each section
89
  for section, weight in sections.items():
90
  if section in eval_data:
 
95
  "max_score": weight,
96
  "completion_rate": round(completion_rate * 100, 2),
97
  "fields_filled": filled,
98
+ "fields_total": total,
99
  }
100
  total_score += scores[section]["score"]
101
  else:
 
104
  "max_score": weight,
105
  "completion_rate": 0,
106
  "fields_filled": 0,
107
+ "fields_total": 0,
108
  }
109
+
110
+ return min(round(total_score, 2), 100), scores
111
+
112
 
113
  def get_llm_feedback(yaml_content, api_token=None):
114
  """
 
116
  Uses GROQ_API_KEY from environment variables if no token is provided
117
  """
118
  import os
119
+
120
  import requests
121
  from dotenv import load_dotenv
122
+
123
  # Load environment variables from .env file if it exists
124
  load_dotenv()
125
+
126
  # Use provided token or get from environment
127
  api_token = api_token or os.environ.get("GROQ_API_KEY")
128
+
129
  if not api_token:
130
  return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."
131
 
132
  try:
133
  headers = {
134
  "Content-Type": "application/json",
135
+ "Authorization": f"Bearer {api_token}",
136
  }
137
 
138
  prompt = f"""
 
155
  """
156
 
157
  payload = {
158
+ "model": "llama-3.3-70b-versatile", # or another groq supported model
159
+ "messages": [{"role": "user", "content": prompt}],
 
 
160
  }
161
 
162
  response = requests.post(
163
  "https://api.groq.com/openai/v1/chat/completions",
164
  headers=headers,
165
+ json=payload,
166
  )
167
 
168
  if response.status_code == 200:
 
174
  return f"Error getting Groq LLM feedback: {str(e)}"
175
 
176
 
177
+ def save_eval_card(yaml_content, paper_url="", repo_url=""):
178
+ """Save an eval card with additional metadata"""
179
  try:
 
180
  eval_data = yaml.safe_load(yaml_content)
181
+
182
+ # Add paper and repository links to metadata
183
+ if paper_url:
184
+ eval_data["metadata"]["paper_link"] = paper_url
185
+ if repo_url:
186
+ eval_data["metadata"]["repository_link"] = repo_url
187
+
188
+ # Update the YAML content with the new metadata
189
+ yaml_content = yaml.dump(eval_data)
190
+
191
+ filename = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed"))
192
+ filename = (
193
+ f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
194
+ )
195
  file_path = os.path.join(EVAL_CARDS_DIR, filename)
196
+
197
  with open(file_path, "w") as file:
198
  file.write(yaml_content)
199
+
200
+ return f"Evaluation card saved successfully as {filename}", file_path
201
  except Exception as e:
202
+ return f"Error saving evaluation card: {str(e)}", None
203
+
204
 
205
  def load_all_eval_cards():
206
  """Load all eval cards from the repository"""
207
  eval_cards = []
208
+
209
  for filename in os.listdir(EVAL_CARDS_DIR):
210
  if filename.endswith(".yaml"):
211
  file_path = os.path.join(EVAL_CARDS_DIR, filename)
 
213
  with open(file_path, "r") as file:
214
  yaml_content = file.read()
215
  eval_data = yaml.safe_load(yaml_content)
216
+
217
  # Compute coverage score
218
  score, score_details = compute_coverage_score(eval_data)
219
+
220
  # Extract key metadata
221
+ eval_cards.append(
222
+ {
223
+ "filename": filename,
224
+ "title": eval_data.get("title", "Unnamed Evaluation"),
225
+ "summary": eval_data.get("summary", ""),
226
+ "authors": ", ".join(
227
+ eval_data.get("metadata", {}).get("authors", [])
228
+ ),
229
+ "creation_date": eval_data.get("metadata", {}).get(
230
+ "creation_date", ""
231
+ ),
232
+ "coverage_score": score,
233
+ "score_details": score_details,
234
+ "yaml_content": yaml_content,
235
+ "data": eval_data,
236
+ }
237
+ )
238
  except Exception as e:
239
  print(f"Error loading {filename}: {str(e)}")
240
+
241
  return eval_cards
242
 
243
+
244
  def format_eval_card_as_html(eval_card):
245
  """Format an eval card as HTML for display"""
246
  html = f"""
247
  <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
248
+ <h3>{eval_card["title"]}</h3>
249
+ <p>{eval_card["summary"]}</p>
250
+ <p><strong>Authors:</strong> {eval_card["authors"]}</p>
251
+ <p><strong>Created:</strong> {eval_card["creation_date"]}</p>
252
+
253
+ <!-- Add repository and paper links if available -->
254
+ {f'<p><strong>Repository:</strong> <a href="{eval_card["data"]["metadata"].get("repository_link", "")}" target="_blank">{eval_card["data"]["metadata"].get("repository_link", "")}</a></p>' if eval_card["data"]["metadata"].get("repository_link") else ""}
255
+ {f'<p><strong>Paper:</strong> <a href="{eval_card["data"]["metadata"].get("paper_link", "")}" target="_blank">{eval_card["data"]["metadata"].get("paper_link", "")}</a></p>' if eval_card["data"]["metadata"].get("paper_link") else ""}
256
+
257
+ <p><strong>Coverage Score:</strong> {eval_card["coverage_score"]}%</p>
258
 
259
  <h4>Coverage by Section:</h4>
260
  <table style="width: 100%; border-collapse: collapse;">
 
264
  <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
265
  </tr>
266
  """
267
+
268
+ for section, details in eval_card["score_details"].items():
269
  html += f"""
270
  <tr>
271
  <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
272
+ <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["score"]}/{details["max_score"]}</td>
273
+ <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["completion_rate"]}%</td>
274
  </tr>
275
  """
276
+
277
  html += """
278
  </table>
279
+ <div style="margin-top: 15px;">
280
+ <!-- Additional actions can go here -->
281
  </div>
282
  </div>
283
  """
284
+
285
  return html
286
 
287
+
288
  def create_eval_cards_table(eval_cards):
289
  """Create an HTML table of eval cards"""
290
  if not eval_cards:
291
  return "<p>No evaluation cards found.</p>"
292
+
293
  # Sort by coverage score (highest first)
294
+ eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True)
295
+
296
  html = ""
297
  for eval_card in eval_cards:
298
  html += format_eval_card_as_html(eval_card)
299
+
300
  return html
301
 
302
+
303
  def upload_file(file):
304
  """Process an uploaded YAML file"""
305
  if file is None:
306
  return "No file uploaded", None
307
+
308
  try:
309
  yaml_content = file.decode("utf-8")
310
  # Validate YAML
 
313
  except Exception as e:
314
  return f"Error processing file: {str(e)}", None
315
 
316
+
317
  def get_feedback(yaml_content):
318
  """Get LLM feedback on the eval card"""
319
  if not yaml_content:
320
  return "Please upload or paste a YAML file first."
321
+
322
  # Use provided token or get from environment
323
  api_token = os.environ.get("GROQ_API_KEY")
324
+
325
  if not api_token:
326
+ return (
327
+ "Please provide an API token or set the GROQ_API_KEY environment variable."
328
+ )
329
+
330
  feedback = get_llm_feedback(yaml_content, api_token)
331
  return feedback
332
 
333
+
334
+ def submit_eval_card(yaml_content, paper_url="", repo_url=""):
335
  """Submit an eval card to the repository"""
336
  if not yaml_content:
337
  return "Please upload or paste a YAML file first.", None, None
338
+
339
  try:
340
  # Validate YAML
341
  eval_data = yaml.safe_load(yaml_content)
342
+
343
  # Compute coverage score
344
  score, score_details = compute_coverage_score(eval_data)
345
+
346
+ # Save eval card with URLs
347
+ result, file_path = save_eval_card(yaml_content, paper_url, repo_url)
348
+
349
+ if file_path:
350
+ return (
351
+ f"Evaluation card saved successfully! Coverage score: {score}%",
352
+ score,
353
+ score_details,
354
+ )
355
  else:
356
+ return f"Error saving evaluation card: {result}", None, None
357
+
358
  except Exception as e:
359
  return f"Error processing evaluation card: {str(e)}", None, None
360
 
361
+
362
  def refresh_gallery():
363
  """Refresh the gallery of eval cards"""
364
  eval_cards = load_all_eval_cards()
365
  html = create_eval_cards_table(eval_cards)
366
+
367
  # Convert data to pandas DataFrame for table view
368
  table_data = []
369
  for card in eval_cards:
370
+ table_data.append(
371
+ {
372
+ "Title": card["title"],
373
+ "Authors": card["authors"][5],
374
+ "Creation Date": card["creation_date"],
375
+ "Coverage Score": f"{card['coverage_score']}%",
376
+ }
377
+ )
378
+
379
  df = pd.DataFrame(table_data)
380
+
381
  return html, df if not df.empty else None
382
 
383
+
384
  def handle_upload_tab(file_obj, yaml_text):
385
  """Handle upload tab actions - either use uploaded file or pasted text"""
386
  if file_obj is not None:
 
389
  else:
390
  return yaml_text
391
 
392
+
393
  # Create the Gradio interface
394
+ with gr.Blocks(title="Evaluation Cards Gallery") as app:
395
  with gr.Row():
396
  with gr.Column(scale=2):
397
+ gr.Markdown("# Evaluation Cards for Machine Learning in Materials Science. ")
398
+ gr.Markdown("""
399
+ Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery.
400
+ Check out the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information.
 
401
  """)
402
+
403
  with gr.Tabs():
404
  with gr.TabItem("Upload & Review"):
405
  with gr.Row():
406
  with gr.Column():
407
+ file_upload = gr.File(
408
+ label="Upload YAML File", file_types=[".yaml", ".yml"]
409
+ )
410
+
411
  with gr.Accordion("Or paste YAML content", open=False):
412
+ yaml_input = gr.TextArea(
413
+ label="YAML Content",
414
+ placeholder="Paste your YAML content here...",
415
+ lines=10,
416
+ )
417
+ paper_url_input = gr.Textbox(
418
+ label="Paper URL (Optional)",
419
+ placeholder="https://arxiv.org/abs/...",
420
+ )
421
+
422
+ repo_url_input = gr.Textbox(
423
+ label="Repository URL (Optional)",
424
+ placeholder="https://github.com/...",
425
+ )
426
+
427
  load_template_btn = gr.Button("Load Template")
428
+
429
  # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
430
+
431
  with gr.Row():
432
  get_feedback_btn = gr.Button("Get LLM Feedback")
433
+ submit_btn = gr.Button(
434
+ "Submit Evaluation Card", variant="primary"
435
+ )
436
+
437
  with gr.Column():
438
  yaml_display = gr.TextArea(label="Current YAML", lines=20)
439
+
440
  with gr.Accordion("LLM Feedback", open=True):
441
  feedback_display = gr.Markdown()
442
+
443
  with gr.Accordion("Submission Result", open=True):
444
  result_display = gr.Markdown()
445
+ coverage_score = gr.Number(
446
+ label="Coverage Score", visible=False
447
+ )
448
+ coverage_details = gr.JSON(
449
+ label="Coverage Details", visible=False
450
+ )
451
+
452
  with gr.TabItem("Gallery"):
453
  refresh_btn = gr.Button("Refresh Gallery")
454
+
455
  with gr.Tabs():
456
  with gr.TabItem("Card View"):
457
  gallery_html = gr.HTML()
458
+
459
  with gr.TabItem("Table View"):
460
  gallery_table = gr.DataFrame()
461
+
462
  # Set up event handlers
463
+ load_template_btn.click(fn=load_template, outputs=[yaml_display])
464
+
 
 
 
465
  file_upload.change(
466
+ fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display]
467
  )
468
+
469
+ yaml_input.change(fn=lambda x: x, inputs=[yaml_input], outputs=[yaml_display])
470
+
471
  get_feedback_btn.click(
472
+ fn=get_feedback, inputs=[yaml_display], outputs=[feedback_display]
 
 
473
  )
474
+
475
  submit_btn.click(
476
  fn=submit_eval_card,
477
+ inputs=[yaml_display, paper_url_input, repo_url_input],
478
+ outputs=[result_display, coverage_score, coverage_details],
479
  )
480
+
481
+ refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table])
482
+
483
  # Initialize the gallery on app start
484
+ app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table])
 
 
 
485
 
486
  # Launch the app
487
  if __name__ == "__main__":
488
+ app.launch()
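
As a quick sanity check of the updated scorer, a minimal sketch of calling compute_coverage_score on a toy card. This assumes the Space is cloned locally and app.py is importable from the repository root (importing app.py builds the Gradio UI but does not launch it); the toy card below is purely illustrative.

# coverage_check.py - illustrative sketch, not part of this commit
import yaml

from app import compute_coverage_score  # assumes app.py is on sys.path

# A deliberately sparse card: only one weighted section is filled.
toy_card = {
    "title": "Toy Eval",
    "citation_and_usage": {
        "citation_information": {
            "recommended_citation": "Doe et al., 2025",
            "licensing_details": "MIT",
        },
    },
}

score, details = compute_coverage_score(toy_card)
print(score)               # small value: only citation_and_usage (weight 5) contributes
print(yaml.dump(details))  # per-section breakdown: score, max_score, completion_rate, fields filled/total

On a fully filled card the section weights sum to 100, which is why the total returned above is capped at 100.
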
eval_cards/ChemBench_20250312_170522.yaml DELETED
@@ -1,411 +0,0 @@
1
- title: "ChemBench"
2
-
3
- summary: >
4
- ChemBench was developed as a comprehensive benchmarking suite for the performance of LLMs in chemistry.
5
- It features a curation of more than 2,700 question-answer pairs classified to probe knowledge, intuition
6
- and reasoning abilities of LLMs. ChemBench goes beyond simple MCQ evaluation, supports floating point
7
- answers (also in scientific notation), and prompts models closely to how they were trained.
8
-
9
- metadata:
10
- authors:
11
- - Adrian Mirza
12
- - Nawaf Alampara
13
- - Sreekanth Kunchapu
14
- - Martiño Ríos-García
15
- - Benedict Emoekabu
16
- - Aswanth Krishnan
17
- - Tanya Gupta
18
- - Mara Schilling-Wilhelmi
19
- - Macjonathan Okereke
20
- - Anagha Aneesh
21
- - Mehrdad Asgari
22
- - Juliane Eberhardt
23
- - Amir Mohammad Elahi
24
- - Hani M. Elbeheiry
25
- - María Victoria Gil
26
- - Christina Glaubitz
27
- - Maximilian Greiner
28
- - Caroline T. Holick
29
- - Tim Hoffmann
30
- - Abdelrahman Ibrahim
31
- - Lea C. Klepsch
32
- - Yannik Köster
33
- - Fabian Alexander Kreth
34
- - Jakob Meyer
35
- - Santiago Miret
36
- - Jan Matthias Peschel
37
- - Michael Ringleb
38
- - Nicole Roesner
39
- - Johanna Schreiber
40
- - Ulrich S. Schubert
41
- - Leanne M. Stafast
42
- - Dinga Wonanke
43
- - Michael Pieler
44
- - Philippe Schwaller
45
- - Kevin Maik Jablonka
46
- maintainers:
47
- - Adrian Mirza
48
- - Nawaf Alampara
49
- - Martiño Ríos-García
50
- - Kevin Maik Jablonka
51
- creation_date: "2023-05-15"
52
- last_review_date: "2024-11-01"
53
- next_review_date: "YTBD"
54
- version_compatibility:
55
- - "v0.3.0"
56
-
57
- evaluation_design:
58
- motivation:
59
- scientific_needs: >
60
- ChemBench is one of the pioneering benchmarks to evaluate performance of LLMs in chemistry specifically.
61
- Prior selection of LLMs on chemistry tasks has been based on their performance on general benchmarks like Big Bench.
62
- approach_justification: >
63
- ChemBench comprehensively evaluates almost all the leading models on a wide range of chemistry topics,
64
- allowing topic-specific leaders identification. It also probes safety knowledge of LLMs and evaluates
65
- measures of alignment with human intuitions.
66
- expected_benefits: >
67
- Provides comparison metrics for LLM training on chemistry-specific tasks and evaluates performance
68
- across different chemistry topics.
69
- tradeoffs: >
70
- Current LLMs lack human intuitions. ChemBench currently does not support evaluation of open-ended chemistry tasks.
71
-
72
- type_and_structure:
73
- type: "Benchmark"
74
- structure: >
75
- End-to-end automation, careful validation by experts, and usability with black box systems.
76
- The benchmark covers a diverse set of topics and skills (reasoning, calculation, knowledge, and intuition)
77
- across a range of difficulty levels.
78
- timeline: ""
79
- key_design_decisions:
80
- - Benchmark approach for scalability and easier accessibility
81
- - End-to-end automation for frequent model evaluation
82
- - Careful validation by experts to minimize incorrect or unanswerable questions
83
- - Support for models with special treatment of molecules
84
- - Usability with black box systems without access to weights or logits
85
- - Probing capabilities beyond MCQs to reflect real-world chemistry
86
- - Coverage of diverse topics and skills
87
- - Range of difficulty levels to measure improvement
88
- - Impossible to completely solve with current models
89
- design_process:
90
- stakeholder_consultation: "ChemBench is internally used by some of the leading AI labs"
91
- pilot_studies:
92
- - "LLM ChemBench results were compared against humans using a subset of ChemBench"
93
- validation_approaches:
94
- - "Codebase tested with unit tests covering parsing modules, metrics modules, and extraction modules"
95
- - "Questions verified manually by experts through GitHub pull requests"
96
- - "Automated checks via GitHub Actions for schemas, LATEX templating, and formatting"
97
- - "Leaderboard verification of complete corpus evaluation"
98
-
99
- stakeholders_and_resources:
100
- target_users:
101
- - "General audience developing or evaluating ML models"
102
- - "Researchers developing chemistry datasets"
103
- required_expertise:
104
- - "Basic knowledge of using benchmarks (simple how-to guide provided)"
105
- resource_requirements:
106
- - "API keys for closed-source models"
107
- - "GPUs for fast local benchmarking (CPU also possible but slower)"
108
- cost_considerations: "Nil"
109
-
110
- estimand:
111
- target_construct:
112
- primary_capability: "Capabilities of models to answer chemistry questions"
113
- measurement_type: "Pragmatic"
114
- relationship_to_applications: >
115
- ChemBench score can be considered a comparative metric to measure gains in LLM training.
116
- Shows positive correlation to performance on tasks like data extraction.
117
- theoretical_framework: >
118
- Assumes the corpus is not being used for training during model development.
119
- Findings on capabilities are based on performance in answering questions that rely on
120
- reasoning, calculation, knowledge, and intuition for humans to solve.
121
-
122
- scope_and_limitations:
123
- coverage: >
124
- Over 2,700 question-answer pairs classified to probe knowledge, intuition, and reasoning.
125
- Covers subjects within Chemistry taught at undergraduate and postgraduate level courses.
126
- excluded_capabilities:
127
- - "Property prediction capabilities"
128
- - "Data extraction capabilities"
129
- - "Embedding meaningfulness"
130
- - "Agentic capabilities"
131
- known_blind_spots:
132
- - "Questions considered answered correctly only if final answer is correct"
133
- - "Partial scoring and open-ended evaluation not covered"
134
- theoretical_limitations:
135
- - "Questions treated with equal weights, no clear approach for weighing tasks"
136
- - "Reliability and correlation between log probabilities and model responses not known"
137
-
138
- assessment_components:
139
- test_set:
140
- data_sources:
141
- - "Curated questions from existing exams or exercise sheets"
142
- - "Programmatically created questions"
143
- sampling_methodology: "Each model evaluated on all questions"
144
- known_biases:
145
- - "Questions mainly curated from the background of the developers"
146
- approach_to_duplicates: >
147
- Each question-answer pair hashed to create unique IDs, filtering to keep unique questions based on UUIDs.
148
- data_quality: >
149
- Guidelines followed by reviewers: originality, clarity, factual correctness, and avoiding ambiguity.
150
-
151
- estimator:
152
- evaluation_protocol:
153
- methodology: >
154
- Distinct prompt templates for completion and instruction-tuned models. Multistep parsing workflow
155
- based on regular expressions with LLM extraction as fallback. Comprehensive refusal detection combining
156
- regular expression-based detection and a fine-tuned BERT model.
157
- control_measures:
158
- - "Model-specific prompt templates"
159
- - "Consistent parsing workflow"
160
- - "Refusal detection and retry mechanism"
161
- handling_random_components: "Refusal detection and retry mechanism for up to n times"
162
- reproducibility_requirements: >
163
- Storage of model timestamp, time, and version of the dataset used for benchmarking.
164
-
165
- metrics:
166
- primary_metrics:
167
- - "Fraction of correctly answered questions"
168
- aggregation_methodology: "Final score is mean of scores across all questions from all topics"
169
- task_weightings:
170
- approach: "All questions treated equally to avoid ambiguity"
171
- note: "Questions classified into three difficulty levels manually by experts for further analysis"
172
- performance_bounds:
173
- scoring: "No partial scoring - all questions measured as correct/incorrect"
174
- connection_to_outcomes: "Scores reflect how well the model is trained on chemistry"
175
-
176
- metric_details:
177
- - name: "Fraction Correct"
178
- definition: >
179
- Proportion of correct answers out of total questions. For MCQs, uses Hamming Loss;
180
- for numerics, uses Mean Absolute Error with 1% threshold.
181
- implementation: >
182
- (1/n) * (sum(1-HammingLoss_i for i in MCQ) + sum(indicator(MAE_j < 0.01*|Target_j|) for j in Numeric))
183
- edge_cases:
184
- - "Perfect score: 1 when all questions answered correctly"
185
- - "Complete failure: 0 when all questions answered incorrectly"
186
- statistical_properties:
187
- - "Simplicity: Easy to calculate and interpret"
188
- - "Range: Always bounded between [0, 1]"
189
- - "Binary nature: Each question contributes either 0 or 1"
190
- failure_modes:
191
- - "Masking: High overall accuracy can hide poor performance on specific question types"
192
- - "Insensitivity to confidence: Doesn't account for prediction confidence"
193
- - "Equal weighting: Assigns equal importance regardless of difficulty"
194
- - "Heterogeneous data: Combining different question types with different evaluation criteria"
195
- - "Threshold sensitivity: Results highly dependent on chosen thresholds"
196
- - "Near-zero targets: For small target values, 1% threshold becomes extremely stringent"
197
-
198
- - name: "Hamming Loss"
199
- definition: >
200
- Measures fraction of labels incorrectly predicted for MCQs.
201
- (1/L) * sum(indicator(y_i,l != y_hat_i,l) for l in 1 to L)
202
- implementation: "For single-answer MCQ, 0 if answer correct, 1 if incorrect"
203
- statistical_properties:
204
- - "Linearity: Scales linearly with misclassifications"
205
- - "Range: Always bounded between [0, 1]"
206
- - "Symmetry: Treats false positives and negatives equally"
207
- failure_modes:
208
- - "Equal weighting: Assigns equal importance regardless of difficulty"
209
- - "Lack of severity grading: All errors weighted equally"
210
- - "Multi-label complexity: May not capture label dependencies"
211
- - "Simplistic for complex MCQs: Doesn't account for partial correctness"
212
-
213
- technical_framework:
214
- implementation_requirements:
215
- - "Installing ChemBench package"
216
- - "API keys for closed-source models"
217
- - "GPUs for fast benchmarking (CPU also possible)"
218
- time_constraints: "Complete benchmarking requires around 2 hours"
219
- dependencies:
220
- - "tenacity==8.3.0"
221
- - "langchain>=0.1.5"
222
- - "fastcore>=1.5.29"
223
- - "scikit-learn>=1.4.0"
224
- - "loguru>=0.7.2"
225
- - "litellm>=1.59.1"
226
- - "backoff>=2.2.1"
227
- - "tqdm>=4.66.1"
228
- - "pint>=0.23"
229
- - "pandas>=2.2.0"
230
- - "python-dotenv>=1.0.1"
231
- - "fire>=0.5.0"
232
- - "datasets"
233
- - "torch"
234
- - "transformers"
235
- - "langchain-community>=0.0.17"
236
- - "pillow"
237
-
238
- constraints_and_rules:
239
- allowed_resources:
240
- - "Models not trained on the ChemBench corpus (not tested)"
241
- permitted_approaches:
242
- - "Tools or other agentic setups"
243
- - "No constraints on model parameters or computational constraints"
244
- - "No constraints on temperature or decoding strategies"
245
- - "No constraints on architecture or post-training approaches"
246
- optimization_constraints:
247
- - "Prompts not optimized unless part of modeling"
248
- ethical_boundaries:
249
- - "Models not trained on the ChemBench corpus (not tested)"
250
-
251
- estimate:
252
- required_reporting:
253
- essential_metrics:
254
- - "all_correct (binary score of 0/1 for each question)"
255
- - "Fraction correct (final score computed across all questions)"
256
- - "Refusal detections and LLM parsing flags"
257
- results_disaggregation: >
258
- Individual scoring and relative position available for Topics:
259
- Analytical Chemistry, Materials Science, Technical Chemistry, General Chemistry,
260
- Physical Chemistry, Toxicity and Safety, Inorganic Chemistry, Organic Chemistry,
261
- and Human Preference. Separate scores for easy/hard tasks, reasoning tasks,
262
- computation tasks, knowledge tasks, human preference alignment, and comparison
263
- against human chemists.
264
- uncertainty_quantification: >
265
- ChemBench has a unique way to obtain confidence of model predictions using prompting,
266
- but this is a separate analysis not part of benchmark metrics.
267
- performance_variation: "Currently not done"
268
- resource_usage_reporting: "Currently tracks number of parameters if available"
269
-
270
- reproducibility_information:
271
- documentation_requirements:
272
- - "model_name"
273
- - "model_timestamp"
274
- - "model_description"
275
- - "date_published (optional)"
276
- - "open_weights (optional)"
277
- - "open_dataset (optional)"
278
- - "nr_of_parameters (optional)"
279
- - "github (optional)"
280
- - "paper (optional)"
281
- - "api_endpoint (optional)"
282
- - "nr_of_tokens (optional)"
283
- - "architecture (optional)"
284
- - "mixture_of_experts (optional)"
285
- - "model_alignment (optional)"
286
- - "reinforcement_learning_from_human_feedback (optional)"
287
- - "domain_specific_pretraining (optional)"
288
- - "domain_specific_finetuning (optional)"
289
- - "tool_use (optional)"
290
- - "tool_type (optional)"
291
- - "temperature (optional)"
292
- - "epochs (optional)"
293
- - "reasoning_model (optional)"
294
- - "reasoning_type (optional)"
295
- environment_specifications: >
296
- Benchmarking performed using latest version of ChemBench pipeline and ChemBench Dataset.
297
- randomization_handling: >
298
- Temperature or other randomization or seeding expected in model description.
299
- output_standardization: >
300
- Outputs prompted to be given in ChemBench parsing compatible format.
301
-
302
- results_communication:
303
- visualization:
304
- recommended_plots:
305
- - "Spider chart showing model performance on different topics against baseline and other leading models"
306
- - "Reliability and distribution of confidence estimates, showing confidence calibration"
307
- standardized_formats:
308
- - "Latest results maintained in ChemBench-Leaderboard"
309
- - "Refusals counted as incorrect"
310
- - "Baseline model as defined in paper"
311
- - "Final answer based on ChemBench pipeline, not log probabilities"
312
-
313
- leaderboard_guidelines:
314
- submission_process: "Detailed in Huggingface Space documentation"
315
- required_metadata:
316
- - "Model details as specified in documentation requirements"
317
-
318
- known_issues_and_limitations:
319
- validity_concerns:
320
- construct_validity: >
321
- Even though ChemBench goes beyond MCQ-only benchmarks by including numeric questions,
322
- evaluation on open-ended tasks is not included. Partial scoring and task weighing not supported.
323
- gaming_possibilities: "Possibility to host ChemBench as a challenge"
324
- stability_considerations: >
325
- Refusal detection and retry mechanism implemented to tackle LLM refusals,
326
- combining regex-based detection and fine-tuned BERT model.
327
- temporal_validity: >
328
- Questions based on scientific principles won't lose validity,
329
- but may appear in training corpora over time.
330
-
331
- practical_limitations:
332
- resource_constraints: "Based on the model being benchmarked"
333
- scalability_issues: "Based on the model being benchmarked"
334
- cost_factors: "Based on the model being benchmarked"
335
- time_boundaries: "Benchmark might lose validity as questions leak to training corpora"
336
-
337
- bias_and_fairness:
338
- known_biases:
339
- - "Biases from human curation process"
340
- representation_issues: "Certain areas of chemistry not evaluated"
341
- potential_impacts: "Certain areas of chemistry not evaluated"
342
- mitigation_approaches: "Curation by team of more than 10 people to balance biases"
343
-
344
- version_and_maintenance:
345
- version_information:
346
- version:
347
- results: "v1.0.4"
348
- dataset: "v1.0.0"
349
- code: "v0.3.0"
350
- release_date: "2024-11-01"
351
- change_history: "Tracked in GitHub repository changelog"
352
- update_plans: "Discussed in GitHub repository discussions"
353
-
354
- maintenance_protocol:
355
- update_frequency: "Ad hoc after release"
356
- deprecation_policy: >
357
- Based on major issues with questions. Questions removed and dataset version updated.
358
- Major updates lead to rerunning models for updated Leaderboard.
359
- issue_reporting: "Issues tracked in GitHub repository"
360
- community_involvement: >
361
- Maintainers active in solving user issues on GitHub.
362
- Proposal for forum in Mat Sci Community Disclosure.
363
- Discussions available on GitHub and Huggingface.
364
- criteria_for_updates:
365
- - "Codebase updated for new features or bug fixes"
366
- - "Dataset updated when questions added or removed"
367
- - "Leaderboard updated for new models or dataset updates"
368
- breaking_change_policy: >
369
- All models in leaderboard rerun with new updates.
370
- Update of arXiv paper released. Proposal to release a commit.
371
- backwards_compatibility: >
372
- Pydantic base classes for task and report stable for compatibility.
373
- Major changes to tasks and report backward compatible.
374
- migration_guides: "Released in documentation as needed"
375
-
376
- citation_and_usage:
377
- citation_information:
378
- recommended_citation: >
379
- @misc{mirza2024largelanguagemodelssuperhuman,
380
- title={Are large language models superhuman chemists?},
381
- author={Adrian Mirza and Nawaf Alampara and Sreekanth Kunchapu and Benedict Emoekabu and Aswanth Krishnan and Mara Wilhelmi and Macjonathan Okereke and Juliane Eberhardt and Amir Mohammad Elahi and Maximilian Greiner and Caroline T. Holick and Tanya Gupta and Mehrdad Asgari and Christina Glaubitz and Lea C. Klepsch and Yannik Köster and Jakob Meyer and Santiago Miret and Tim Hoffmann and Fabian Alexander Kreth and Michael Ringleb and Nicole Roesner and Ulrich S. Schubert and Leanne M. Stafast and Dinga Wonanke and Michael Pieler and Philippe Schwaller and Kevin Maik Jablonka},
382
- year={2024},
383
- eprint={2404.01475},
384
- archivePrefix={arXiv},
385
- primaryClass={cs.LG},
386
- url={https://arxiv.org/abs/2404.01475},
387
- }
388
- related_publications:
389
- - "Are large language models superhuman chemists? (https://arxiv.org/abs/2404.01475)"
390
- - "Probing the limitations of multimodal language models for chemistry and materials research (https://arxiv.org/pdf/2411.16955)"
391
- licensing_details: "MIT License"
392
-
393
- usage_guidelines:
394
- recommended_applications:
395
- - "Evaluation of LLM capabilities in chemistry"
396
- inappropriate_uses:
397
- - "Training models with the ChemBench dataset"
398
- implementation_best_practices: >
399
- Results obtained with ChemBench pipeline and latest dataset at time of benchmarking considered valid practice.
400
- ethical_considerations: "ChemBench dataset not meant for training"
401
-
402
- additional_notes:
403
- related_evaluations:
404
- - "ChemBench extension for multimodal models (https://arxiv.org/pdf/2411.16955)"
405
- - "MatText for bottlenecks of finetuned LLMs on property prediction (https://arxiv.org/abs/2406.17295)"
406
- - "MaScQA for investigating materials science knowledge of LLMs (https://pubs.rsc.org/en/content/articlelanding/2024/dd/d3dd00188a)"
407
- - "Measuring Capabilities of Language Models for Biology Research (https://arxiv.org/abs/2407.10362)"
408
- future_directions: >
409
- Sensitivity to prompting, improving performance with prompt optimization.
410
- Mechanistic interpretability. Benchmarking agents on ChemBench.
411
- Effect of grounding and post-training approaches.
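
For reference, the Fraction Correct formula quoted in the metric_details above combines (1 - Hamming loss) terms for MCQs with a 1%-of-target error indicator for numeric answers. A toy reading of that formula follows (illustrative only, not the ChemBench implementation; the binary encoding of MCQ options is an assumption of this sketch).

# fraction_correct_sketch.py - toy illustration of the formula in the card above
def hamming_loss(y_true, y_pred):
    # (1/L) * sum of 0/1 mismatch indicators over the L answer options
    return sum(t != p for t, p in zip(y_true, y_pred)) / len(y_true)

# One MCQ (options A-D encoded as a binary vector) answered correctly,
# one numeric question missing the 1% relative-error threshold.
mcq_term = 1 - hamming_loss([1, 0, 0, 0], [1, 0, 0, 0])  # correct choice -> 1.0
target, prediction = 3.14, 3.30
numeric_term = 1.0 if abs(prediction - target) < 0.01 * abs(target) else 0.0  # -> 0.0

fraction_correct = (mcq_term + numeric_term) / 2  # (1/n) * sum of per-question terms
print(fraction_correct)  # 0.5 on this toy pair
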
script.js CHANGED
@@ -87,4 +87,88 @@ function visualizeCoverage(scoreDetails) {
87
  document.addEventListener('DOMContentLoaded', function() {
88
  // This could be used to initialize charts or other client-side features
89
  console.log('Client-side JavaScript initialized');
90
- });
87
  document.addEventListener('DOMContentLoaded', function() {
88
  // This could be used to initialize charts or other client-side features
89
  console.log('Client-side JavaScript initialized');
90
+ });
91
+
92
+
93
+ // Add this to script.js
94
+
95
+ // Function to show the YAML content in a modal
96
+ function viewYAML(filename) {
97
+ // Use Gradio's client.query to get the YAML content
98
+ gradioApp().querySelector('#file_action_component').querySelector('textarea').value = filename;
99
+ gradioApp().querySelector('#file_action_type_component').querySelector('textarea').value = 'view';
100
+
101
+ // Trigger the event
102
+ const viewButton = gradioApp().querySelector('#trigger_file_action');
103
+ viewButton.click();
104
+
105
+ // The result will show up in the modal that's created by the event handler
106
+ }
107
+
108
+ // Function to download a file
109
+ function downloadFile(filename, format) {
110
+ let actionType;
111
+ if (format === 'yaml') actionType = 'download_yaml';
112
+ else if (format === 'markdown') actionType = 'download_md';
113
+ else if (format === 'latex') actionType = 'download_latex';
114
+ else return;
115
+
116
+ // Set the filename and action type
117
+ gradioApp().querySelector('#file_action_component').querySelector('textarea').value = filename;
118
+ gradioApp().querySelector('#file_action_type_component').querySelector('textarea').value = actionType;
119
+
120
+ // Trigger the event
121
+ const downloadButton = gradioApp().querySelector('#trigger_file_action');
122
+ downloadButton.click();
123
+
124
+ // The download will be handled by the event response
125
+ }
126
+
127
+ // Helper function to get the Gradio app element
128
+ function gradioApp() {
129
+ return document.getElementsByTagName('gradio-app')[0].shadowRoot || document;
130
+ }
131
+
132
+ // Function to create and display a modal with content
133
+ function showModal(content) {
134
+ // Create the modal elements
135
+ const modal = document.createElement('div');
136
+ modal.style.position = 'fixed';
137
+ modal.style.top = '0';
138
+ modal.style.left = '0';
139
+ modal.style.width = '100%';
140
+ modal.style.height = '100%';
141
+ modal.style.backgroundColor = 'rgba(0, 0, 0, 0.5)';
142
+ modal.style.zIndex = '1000';
143
+ modal.style.display = 'flex';
144
+ modal.style.justifyContent = 'center';
145
+ modal.style.alignItems = 'center';
146
+
147
+ const modalContent = document.createElement('div');
148
+ modalContent.style.backgroundColor = 'white';
149
+ modalContent.style.padding = '20px';
150
+ modalContent.style.borderRadius = '5px';
151
+ modalContent.style.maxWidth = '80%';
152
+ modalContent.style.maxHeight = '80%';
153
+ modalContent.style.overflow = 'auto';
154
+
155
+ const closeButton = document.createElement('button');
156
+ closeButton.textContent = 'Close';
157
+ closeButton.style.marginBottom = '10px';
158
+ closeButton.style.padding = '5px 10px';
159
+ closeButton.style.cursor = 'pointer';
160
+ closeButton.onclick = () => {
161
+ document.body.removeChild(modal);
162
+ };
163
+
164
+ const contentPre = document.createElement('pre');
165
+ contentPre.textContent = content;
166
+ contentPre.style.whiteSpace = 'pre-wrap';
167
+ contentPre.style.wordBreak = 'break-word';
168
+
169
+ modalContent.appendChild(closeButton);
170
+ modalContent.appendChild(contentPre);
171
+ modal.appendChild(modalContent);
172
+
173
+ document.body.appendChild(modal);
174
+ }
template.yaml CHANGED
@@ -0,0 +1,188 @@
1
+ # Evaluation Card Template
2
+ title: "[Evaluation Name]"
3
+
4
+ summary: >
5
+ Brief description of the evaluation approach, its purpose, and scope.
6
+
7
+ metadata:
8
+ authors: []
9
+ maintainers: []
10
+ creation_date: ""
11
+ last_review_date: ""
12
+ next_review_date: ""
13
+ version_compatibility: []
14
+ repository_link: "" # Link to the code repository
15
+ paper_link: "" # Link to the research paper
16
+
17
+ evaluation_design:
18
+ motivation:
19
+ scientific_needs: ""
20
+ approach_justification: ""
21
+ expected_benefits: ""
22
+ tradeoffs: ""
23
+
24
+ type_and_structure:
25
+ type: "" # benchmark, challenge, red teaming, deployment study, structured test
26
+ structure: ""
27
+ timeline: ""
28
+ key_design_decisions: []
29
+ design_process:
30
+ stakeholder_consultation: ""
31
+ pilot_studies: []
32
+ validation_approaches: []
33
+
34
+ stakeholders_and_resources:
35
+ target_users: []
36
+ required_expertise: []
37
+ resource_requirements: []
38
+ cost_considerations: ""
39
+
40
+ estimand:
41
+ target_construct:
42
+ primary_capability: ""
43
+ measurement_type: "" # representational or pragmatic
44
+ relationship_to_applications: ""
45
+ theoretical_framework: ""
46
+
47
+ scope_and_limitations:
48
+ coverage: ""
49
+ excluded_capabilities: []
50
+ known_blind_spots: []
51
+ theoretical_limitations: []
52
+
53
+ assessment_components:
54
+ test_set:
55
+ data_sources: []
56
+ sampling_methodology: ""
57
+ known_biases: []
58
+ approach_to_duplicates: ""
59
+ data_quality: ""
60
+
61
+ challenge:
62
+ design_principles: []
63
+ task_selection_criteria: []
64
+ difficulty_progression: ""
65
+ time_constraints: ""
66
+
67
+ red_teaming:
68
+ probing_methodology: ""
69
+ coverage_strategy: ""
70
+ adversarial_approach: ""
71
+ safety_considerations: ""
72
+
73
+ deployment_study:
74
+ environment_characteristics: ""
75
+ integration_points: []
76
+ success_criteria: []
77
+ monitoring_approach: ""
78
+
79
+ estimator:
80
+ evaluation_protocol:
81
+ methodology: ""
82
+ control_measures: []
83
+ handling_random_components: ""
84
+ reproducibility_requirements: ""
85
+
86
+ metrics:
87
+ primary_metrics: []
88
+ aggregation_methodology: ""
89
+ task_weightings: {}
90
+ performance_bounds: {}
91
+ connection_to_outcomes: ""
92
+
93
+ metric_details:
94
+ - name: ""
95
+ definition: ""
96
+ implementation: ""
97
+ edge_cases: []
98
+ statistical_properties: ""
99
+ baseline_values: {}
100
+ failure_modes: []
101
+
102
+ technical_framework:
103
+ implementation_requirements: []
104
+ time_constraints: ""
105
+ dependencies: []
106
+ authentication_needs: ""
107
+
108
+ constraints_and_rules:
109
+ allowed_resources: []
110
+ permitted_approaches: []
111
+ optimization_constraints: []
112
+ ethical_boundaries: []
113
+
114
+ estimate:
115
+ required_reporting:
116
+ essential_metrics: []
117
+ results_disaggregation: ""
118
+ uncertainty_quantification: ""
119
+ performance_variation: ""
120
+ resource_usage_reporting: ""
121
+
122
+ reproducibility_information:
123
+ documentation_requirements: []
124
+ environment_specifications: ""
125
+ randomization_handling: ""
126
+ output_standardization: ""
127
+
128
+ results_communication:
129
+ visualization:
130
+ recommended_plots: []
131
+ standardized_formats: []
132
+ key_comparisons: []
133
+
134
+ leaderboard_guidelines:
135
+ submission_process: ""
136
+ required_metadata: []
137
+
138
+ known_issues_and_limitations:
139
+ validity_concerns:
140
+ construct_validity: ""
141
+ gaming_possibilities: ""
142
+ stability_considerations: ""
143
+ temporal_validity: ""
144
+
145
+ practical_limitations:
146
+ resource_constraints: ""
147
+ scalability_issues: ""
148
+ cost_factors: ""
149
+ time_boundaries: ""
150
+
151
+ bias_and_fairness:
152
+ known_biases: []
153
+ representation_issues: ""
154
+ potential_impacts: ""
155
+ mitigation_approaches: []
156
+
157
+ version_and_maintenance:
158
+ version_information:
159
+ version: ""
160
+ release_date: ""
161
+ change_history: []
162
+ update_plans: ""
163
+
164
+ maintenance_protocol:
165
+ update_frequency: ""
166
+ deprecation_policy: ""
167
+ issue_reporting: ""
168
+ community_involvement: ""
169
+ criteria_for_updates: []
170
+ breaking_change_policy: ""
171
+ backwards_compatibility: ""
172
+ migration_guides: ""
173
+
174
+ citation_and_usage:
175
+ citation_information:
176
+ recommended_citation: ""
177
+ related_publications: []
178
+ licensing_details: ""
179
+
180
+ usage_guidelines:
181
+ recommended_applications: []
182
+ inappropriate_uses: []
183
+ implementation_best_practices: ""
184
+ ethical_considerations: ""
185
+
186
+ additional_notes:
187
+ related_evaluations: []
188
+ future_directions: ""
yaml_template.yaml CHANGED
@@ -0,0 +1,188 @@
1
+ # Evaluation Card Template
2
+ title: "[Evaluation Name]"
3
+
4
+ summary: >
5
+ Brief description of the evaluation approach, its purpose, and scope.
6
+
7
+ metadata:
8
+ authors: []
9
+ maintainers: []
10
+ creation_date: ""
11
+ last_review_date: ""
12
+ next_review_date: ""
13
+ version_compatibility: []
14
+ repository_link: "" # Link to the code repository
15
+ paper_link: "" # Link to the research paper
16
+
17
+ evaluation_design:
18
+ motivation:
19
+ scientific_needs: ""
20
+ approach_justification: ""
21
+ expected_benefits: ""
22
+ tradeoffs: ""
23
+
24
+ type_and_structure:
25
+ type: "" # benchmark, challenge, red teaming, deployment study, structured test
26
+ structure: ""
27
+ timeline: ""
28
+ key_design_decisions: []
29
+ design_process:
30
+ stakeholder_consultation: ""
31
+ pilot_studies: []
32
+ validation_approaches: []
33
+
34
+ stakeholders_and_resources:
35
+ target_users: []
36
+ required_expertise: []
37
+ resource_requirements: []
38
+ cost_considerations: ""
39
+
40
+ estimand:
41
+ target_construct:
42
+ primary_capability: ""
43
+ measurement_type: "" # representational or pragmatic
44
+ relationship_to_applications: ""
45
+ theoretical_framework: ""
46
+
47
+ scope_and_limitations:
48
+ coverage: ""
49
+ excluded_capabilities: []
50
+ known_blind_spots: []
51
+ theoretical_limitations: []
52
+
53
+ assessment_components:
54
+ test_set:
55
+ data_sources: []
56
+ sampling_methodology: ""
57
+ known_biases: []
58
+ approach_to_duplicates: ""
59
+ data_quality: ""
60
+
61
+ challenge:
62
+ design_principles: []
63
+ task_selection_criteria: []
64
+ difficulty_progression: ""
65
+ time_constraints: ""
66
+
67
+ red_teaming:
68
+ probing_methodology: ""
69
+ coverage_strategy: ""
70
+ adversarial_approach: ""
71
+ safety_considerations: ""
72
+
73
+ deployment_study:
74
+ environment_characteristics: ""
75
+ integration_points: []
76
+ success_criteria: []
77
+ monitoring_approach: ""
78
+
79
+ estimator:
80
+ evaluation_protocol:
81
+ methodology: ""
82
+ control_measures: []
83
+ handling_random_components: ""
84
+ reproducibility_requirements: ""
85
+
86
+ metrics:
87
+ primary_metrics: []
88
+ aggregation_methodology: ""
89
+ task_weightings: {}
90
+ performance_bounds: {}
91
+ connection_to_outcomes: ""
92
+
93
+ metric_details:
94
+ - name: ""
95
+ definition: ""
96
+ implementation: ""
97
+ edge_cases: []
98
+ statistical_properties: ""
99
+ baseline_values: {}
100
+ failure_modes: []
101
+
102
+ technical_framework:
103
+ implementation_requirements: []
104
+ time_constraints: ""
105
+ dependencies: []
106
+ authentication_needs: ""
107
+
108
+ constraints_and_rules:
109
+ allowed_resources: []
110
+ permitted_approaches: []
111
+ optimization_constraints: []
112
+ ethical_boundaries: []
113
+
114
+ estimate:
115
+ required_reporting:
116
+ essential_metrics: []
117
+ results_disaggregation: ""
118
+ uncertainty_quantification: ""
119
+ performance_variation: ""
120
+ resource_usage_reporting: ""
121
+
122
+ reproducibility_information:
123
+ documentation_requirements: []
124
+ environment_specifications: ""
125
+ randomization_handling: ""
126
+ output_standardization: ""
127
+
128
+ results_communication:
129
+ visualization:
130
+ recommended_plots: []
131
+ standardized_formats: []
132
+ key_comparisons: []
133
+
134
+ leaderboard_guidelines:
135
+ submission_process: ""
136
+ required_metadata: []
137
+
138
+ known_issues_and_limitations:
139
+ validity_concerns:
140
+ construct_validity: ""
141
+ gaming_possibilities: ""
142
+ stability_considerations: ""
143
+ temporal_validity: ""
144
+
145
+ practical_limitations:
146
+ resource_constraints: ""
147
+ scalability_issues: ""
148
+ cost_factors: ""
149
+ time_boundaries: ""
150
+
151
+ bias_and_fairness:
152
+ known_biases: []
153
+ representation_issues: ""
154
+ potential_impacts: ""
155
+ mitigation_approaches: []
156
+
157
+ version_and_maintenance:
158
+ version_information:
159
+ version: ""
160
+ release_date: ""
161
+ change_history: []
162
+ update_plans: ""
163
+
164
+ maintenance_protocol:
165
+ update_frequency: ""
166
+ deprecation_policy: ""
167
+ issue_reporting: ""
168
+ community_involvement: ""
169
+ criteria_for_updates: []
170
+ breaking_change_policy: ""
171
+ backwards_compatibility: ""
172
+ migration_guides: ""
173
+
174
+ citation_and_usage:
175
+ citation_information:
176
+ recommended_citation: ""
177
+ related_publications: []
178
+ licensing_details: ""
179
+
180
+ usage_guidelines:
181
+ recommended_applications: []
182
+ inappropriate_uses: []
183
+ implementation_best_practices: ""
184
+ ethical_considerations: ""
185
+
186
+ additional_notes:
187
+ related_evaluations: []
188
+ future_directions: ""