chore: update, cards
Changed files:
- app.py +189 -148
- eval_cards/ChemBench_20250312_170522.yaml +0 -411
- script.js +85 -1
- template.yaml +188 -0
- yaml_template.yaml +188 -0

app.py
CHANGED

@@ -1,17 +1,15 @@
 import datetime
 import os
 import re
-from pathlib import Path
 
 import gradio as gr
 import pandas as pd
-import requests
 import yaml
 
 # Constants
 EVAL_CARDS_DIR = "eval_cards"
 TEMPLATE_PATH = "template.yaml"
-
+
 
 # Ensure the eval cards directory exists
 os.makedirs(EVAL_CARDS_DIR, exist_ok=True)
@@ -21,11 +19,13 @@ with open("template.yaml", "w") as f:
     with open("yaml_template.yaml", "r") as template_file:
         f.write(template_file.read())
 
+
 def load_template():
     """Load the YAML template"""
     with open(TEMPLATE_PATH, "r") as file:
         return file.read()
 
+
 def yaml_to_dict(yaml_str):
     """Convert YAML string to Python dictionary"""
     try:
@@ -33,6 +33,7 @@ def yaml_to_dict(yaml_str):
     except yaml.YAMLError as e:
         return {"error": str(e)}
 
+
 def compute_coverage_score(eval_data):
     """
     Compute a coverage score for the eval card
@@ -49,22 +50,26 @@ def compute_coverage_score(eval_data):
         "version_and_maintenance": 5,
         "citation_and_usage": 5,
     }
-
+
     scores = {}
     total_score = 0
-
+
     def count_filled_fields(data, prefix=""):
         if isinstance(data, dict):
            filled = 0
            total = 0
            for key, value in data.items():
                if isinstance(value, (dict, list)):
-                    sub_filled, sub_total = count_filled_fields(
+                    sub_filled, sub_total = count_filled_fields(
+                        value, f"{prefix}.{key}" if prefix else key
+                    )
                    filled += sub_filled
                    total += sub_total
                else:
                    total += 1
-                    if value and not (
+                    if value and not (
+                        isinstance(value, str) and value.strip() in ["", "[]", "{}"]
+                    ):
                        filled += 1
            return filled, total
        elif isinstance(data, list):
@@ -79,7 +84,7 @@ def compute_coverage_score(eval_data):
            return filled, total
        else:
            return 1 if data else 0, 1
-
+
     # Compute scores for each section
     for section, weight in sections.items():
        if section in eval_data:
@@ -90,7 +95,7 @@ def compute_coverage_score(eval_data):
                "max_score": weight,
                "completion_rate": round(completion_rate * 100, 2),
                "fields_filled": filled,
-                "fields_total": total
+                "fields_total": total,
            }
            total_score += scores[section]["score"]
        else:
@@ -99,10 +104,11 @@ def compute_coverage_score(eval_data):
                "max_score": weight,
                "completion_rate": 0,
                "fields_filled": 0,
-                "fields_total": 0
+                "fields_total": 0,
            }
-
-    return round(total_score, 2), scores
+
+    return max(round(total_score, 2), 100), scores
+
 
 def get_llm_feedback(yaml_content, api_token=None):
     """
@@ -110,22 +116,23 @@ def get_llm_feedback(yaml_content, api_token=None):
     Uses GROQ_API_KEY from environment variables if no token is provided
     """
     import os
+
     import requests
     from dotenv import load_dotenv
-
+
     # Load environment variables from .env file if it exists
     load_dotenv()
-
+
     # Use provided token or get from environment
     api_token = api_token or os.environ.get("GROQ_API_KEY")
-
+
     if not api_token:
        return "API token is required for LLM feedback. Please set the GROQ_API_KEY environment variable or provide a token."
 
     try:
        headers = {
            "Content-Type": "application/json",
-            "Authorization": f"Bearer {api_token}"
+            "Authorization": f"Bearer {api_token}",
        }
 
        prompt = f"""
@@ -148,16 +155,14 @@ def get_llm_feedback(yaml_content, api_token=None):
        """
 
        payload = {
-            "model": "llama-3.3-70b-versatile",
-            "messages": [
-                {"role": "user", "content": prompt}
-            ]
+            "model": "llama-3.3-70b-versatile",  # or another groq supported model
+            "messages": [{"role": "user", "content": prompt}],
        }
 
        response = requests.post(
            "https://api.groq.com/openai/v1/chat/completions",
            headers=headers,
-            json=payload
+            json=payload,
        )
 
        if response.status_code == 200:
@@ -169,32 +174,38 @@ def get_llm_feedback(yaml_content, api_token=None):
        return f"Error getting Groq LLM feedback: {str(e)}"
 
 
-def save_eval_card(yaml_content,
-    """Save an eval card
+def save_eval_card(yaml_content, paper_url="", repo_url=""):
+    """Save an eval card with additional metadata"""
     try:
-        # Parse YAML to validate it
        eval_data = yaml.safe_load(yaml_content)
-
+
+        # Add paper and repository links to metadata
+        if paper_url:
+            eval_data["metadata"]["paper_link"] = paper_url
+        if repo_url:
+            eval_data["metadata"]["repository_link"] = repo_url
+
+        # Update the YAML content with the new metadata
+        yaml_content = yaml.dump(eval_data)
+
+        filename = re.sub(r"[^\w\-_]", "_", eval_data.get("title", "Unnamed"))
+        filename = (
+            f"{filename}_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.yaml"
+        )
        file_path = os.path.join(EVAL_CARDS_DIR, filename)
+
        with open(file_path, "w") as file:
            file.write(yaml_content)
-
-        return
+
+        return f"Evaluation card saved successfully as {filename}", file_path
     except Exception as e:
-        return
+        return f"Error saving evaluation card: {str(e)}", None
+
 
 def load_all_eval_cards():
     """Load all eval cards from the repository"""
     eval_cards = []
-
+
     for filename in os.listdir(EVAL_CARDS_DIR):
        if filename.endswith(".yaml"):
            file_path = os.path.join(EVAL_CARDS_DIR, filename)
@@ -202,36 +213,48 @@ def load_all_eval_cards():
                with open(file_path, "r") as file:
                    yaml_content = file.read()
                eval_data = yaml.safe_load(yaml_content)
-
+
                # Compute coverage score
                score, score_details = compute_coverage_score(eval_data)
-
+
                # Extract key metadata
-                eval_cards.append(
+                eval_cards.append(
+                    {
+                        "filename": filename,
+                        "title": eval_data.get("title", "Unnamed Evaluation"),
+                        "summary": eval_data.get("summary", ""),
+                        "authors": ", ".join(
+                            eval_data.get("metadata", {}).get("authors", [])
+                        ),
+                        "creation_date": eval_data.get("metadata", {}).get(
+                            "creation_date", ""
+                        ),
+                        "coverage_score": score,
+                        "score_details": score_details,
+                        "yaml_content": yaml_content,
+                        "data": eval_data,
+                    }
+                )
            except Exception as e:
                print(f"Error loading {filename}: {str(e)}")
-
+
     return eval_cards
 
+
 def format_eval_card_as_html(eval_card):
     """Format an eval card as HTML for display"""
     html = f"""
     <div style="border: 1px solid #ddd; padding: 15px; margin-bottom: 20px; border-radius: 5px;">
-        <h3>{eval_card[
-        <p>{eval_card[
-        <p><strong>Authors:</strong> {eval_card[
-        <p><strong>Created:</strong> {eval_card[
+        <h3>{eval_card["title"]}</h3>
+        <p>{eval_card["summary"]}</p>
+        <p><strong>Authors:</strong> {eval_card["authors"]}</p>
+        <p><strong>Created:</strong> {eval_card["creation_date"]}</p>
+
+        <!-- Add repository and paper links if available -->
+        {f'<p><strong>Repository:</strong> <a href="{eval_card["data"]["metadata"].get("repository_link", "")}" target="_blank">{eval_card["data"]["metadata"].get("repository_link", "")}</a></p>' if eval_card["data"]["metadata"].get("repository_link") else ""}
+        {f'<p><strong>Paper:</strong> <a href="{eval_card["data"]["metadata"].get("paper_link", "")}" target="_blank">{eval_card["data"]["metadata"].get("paper_link", "")}</a></p>' if eval_card["data"]["metadata"].get("paper_link") else ""}
+
+        <p><strong>Coverage Score:</strong> {eval_card["coverage_score"]}%</p>
 
        <h4>Coverage by Section:</h4>
        <table style="width: 100%; border-collapse: collapse;">
@@ -241,45 +264,47 @@ def format_eval_card_as_html(eval_card):
            <th style="text-align: right; padding: 5px; border-bottom: 1px solid #ddd;">Completion</th>
        </tr>
     """
-
-    for section, details in eval_card[
+
+    for section, details in eval_card["score_details"].items():
        html += f"""
        <tr>
            <td style="padding: 5px; border-bottom: 1px solid #eee;">{section}</td>
-            <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details[
-            <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details[
+            <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["score"]}/{details["max_score"]}</td>
+            <td style="text-align: right; padding: 5px; border-bottom: 1px solid #eee;">{details["completion_rate"]}%</td>
        </tr>
     """
-
+
     html += """
        </table>
-        <div style="margin-top:
+        <div style="margin-top: 15px;">
+            <!-- Additional actions can go here -->
        </div>
     </div>
     """
-
+
     return html
 
+
 def create_eval_cards_table(eval_cards):
     """Create an HTML table of eval cards"""
     if not eval_cards:
        return "<p>No evaluation cards found.</p>"
-
+
     # Sort by coverage score (highest first)
-    eval_cards.sort(key=lambda x: x[
+    eval_cards.sort(key=lambda x: x["coverage_score"], reverse=True)
+
     html = ""
     for eval_card in eval_cards:
        html += format_eval_card_as_html(eval_card)
-
+
     return html
 
+
 def upload_file(file):
     """Process an uploaded YAML file"""
     if file is None:
        return "No file uploaded", None
-
+
     try:
        yaml_content = file.decode("utf-8")
        # Validate YAML
@@ -288,62 +313,74 @@ def upload_file(file):
     except Exception as e:
        return f"Error processing file: {str(e)}", None
 
+
 def get_feedback(yaml_content):
     """Get LLM feedback on the eval card"""
     if not yaml_content:
        return "Please upload or paste a YAML file first."
-
+
     # Use provided token or get from environment
     api_token = os.environ.get("GROQ_API_KEY")
-
+
     if not api_token:
-        return
+        return (
+            "Please provide an API token or set the GROQ_API_KEY environment variable."
+        )
+
     feedback = get_llm_feedback(yaml_content, api_token)
     return feedback
 
+
+def submit_eval_card(yaml_content, paper_url="", repo_url=""):
     """Submit an eval card to the repository"""
     if not yaml_content:
        return "Please upload or paste a YAML file first.", None, None
-
+
     try:
        # Validate YAML
        eval_data = yaml.safe_load(yaml_content)
-
+
        # Compute coverage score
        score, score_details = compute_coverage_score(eval_data)
-
-        # Save eval card
-        if
-            return
+
+        # Save eval card with URLs
+        result, file_path = save_eval_card(yaml_content, paper_url, repo_url)
+
+        if file_path:
+            return (
+                f"Evaluation card saved successfully! Coverage score: {score}%",
+                score,
+                score_details,
+            )
        else:
-            return f"Error saving evaluation card: {
-
+            return f"Error saving evaluation card: {result}", None, None
+
     except Exception as e:
        return f"Error processing evaluation card: {str(e)}", None, None
 
+
 def refresh_gallery():
     """Refresh the gallery of eval cards"""
     eval_cards = load_all_eval_cards()
     html = create_eval_cards_table(eval_cards)
-
+
     # Convert data to pandas DataFrame for table view
     table_data = []
     for card in eval_cards:
-        table_data.append(
+        table_data.append(
+            {
+                "Title": card["title"],
+                "Authors": card["authors"][5],
+                "Creation Date": card["creation_date"],
+                "Coverage Score": f"{card['coverage_score']}%",
+            }
+        )
+
     df = pd.DataFrame(table_data)
-
+
     return html, df if not df.empty else None
 
+
 def handle_upload_tab(file_obj, yaml_text):
     """Handle upload tab actions - either use uploaded file or pasted text"""
     if file_obj is not None:
@@ -352,96 +389,100 @@ def handle_upload_tab(file_obj, yaml_text):
     else:
        return yaml_text
 
+
 # Create the Gradio interface
-with gr.Blocks(title="Evaluation
+with gr.Blocks(title="Evaluation Cards Gallery") as app:
     with gr.Row():
        with gr.Column(scale=2):
-            gr.Markdown("# Evaluation
-            gr.Markdown("""
-            Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the repository.
+            gr.Markdown("# Evaluation Cards for Machine Learning in Materials Science. ")
+            gr.Markdown("""
+            Upload your evaluation card in YAML format, get feedback from an LLM, and submit it to the gallery.
+            checkout the [GitHub repository](https://github.com/lamalab-org/eval-cards) for more information.
            """)
-
+
     with gr.Tabs():
        with gr.TabItem("Upload & Review"):
            with gr.Row():
                with gr.Column():
-                    file_upload = gr.File(
+                    file_upload = gr.File(
+                        label="Upload YAML File", file_types=[".yaml", ".yml"]
+                    )
+
                    with gr.Accordion("Or paste YAML content", open=False):
-                        yaml_input = gr.TextArea(
+                        yaml_input = gr.TextArea(
+                            label="YAML Content",
+                            placeholder="Paste your YAML content here...",
+                            lines=10,
+                        )
+                    paper_url_input = gr.Textbox(
+                        label="Paper URL (Optional)",
+                        placeholder="https://arxiv.org/abs/...",
+                    )
+
+                    repo_url_input = gr.Textbox(
+                        label="Repository URL (Optional)",
+                        placeholder="https://github.com/...",
+                    )
+
                    load_template_btn = gr.Button("Load Template")
-
+
                    # api_token = gr.Textbox(label="API Token (for LLM feedback)", type="password")
-
+
                    with gr.Row():
                        get_feedback_btn = gr.Button("Get LLM Feedback")
-                        submit_btn = gr.Button(
+                        submit_btn = gr.Button(
+                            "Submit Evaluation Card", variant="primary"
+                        )
+
                with gr.Column():
                    yaml_display = gr.TextArea(label="Current YAML", lines=20)
-
+
                    with gr.Accordion("LLM Feedback", open=True):
                        feedback_display = gr.Markdown()
-
+
                    with gr.Accordion("Submission Result", open=True):
                        result_display = gr.Markdown()
-                        coverage_score = gr.Number(
+                        coverage_score = gr.Number(
+                            label="Coverage Score", visible=False
+                        )
+                        coverage_details = gr.JSON(
+                            label="Coverage Details", visible=False
+                        )
+
        with gr.TabItem("Gallery"):
            refresh_btn = gr.Button("Refresh Gallery")
-
+
            with gr.Tabs():
                with gr.TabItem("Card View"):
                    gallery_html = gr.HTML()
-
+
                with gr.TabItem("Table View"):
                    gallery_table = gr.DataFrame()
-
+
     # Set up event handlers
-    load_template_btn.click(
-        outputs=[yaml_display]
-    )
-
+    load_template_btn.click(fn=load_template, outputs=[yaml_display])
+
     file_upload.change(
-        fn=handle_upload_tab,
-        inputs=[file_upload, yaml_input],
-        outputs=[yaml_display]
+        fn=handle_upload_tab, inputs=[file_upload, yaml_input], outputs=[yaml_display]
     )
-
-    yaml_input.change(
-        fn=lambda x: x,
-        inputs=[yaml_input],
-        outputs=[yaml_display]
-    )
-
+
+    yaml_input.change(fn=lambda x: x, inputs=[yaml_input], outputs=[yaml_display])
+
     get_feedback_btn.click(
-        fn=get_feedback,
-        inputs=[yaml_display],
-        outputs=[feedback_display]
+        fn=get_feedback, inputs=[yaml_display], outputs=[feedback_display]
     )
-
+
     submit_btn.click(
        fn=submit_eval_card,
-        inputs=[yaml_display],
-        outputs=[result_display, coverage_score, coverage_details]
-    )
-
-    refresh_btn.click(
-        fn=refresh_gallery,
-        outputs=[gallery_html, gallery_table]
+        inputs=[yaml_display, paper_url_input, repo_url_input],
+        outputs=[result_display, coverage_score, coverage_details],
     )
-
+
+    refresh_btn.click(fn=refresh_gallery, outputs=[gallery_html, gallery_table])
+
     # Initialize the gallery on app start
-    app.load(
-        fn=refresh_gallery,
-        outputs=[gallery_html, gallery_table]
-    )
+    app.load(fn=refresh_gallery, outputs=[gallery_html, gallery_table])
 
 # Launch the app
 if __name__ == "__main__":
-    app.launch()
+    app.launch()

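For orientation, the sketch below (not part of the commit) shows how the updated helpers can be exercised outside the Gradio UI. It assumes the module above is importable as app from the same directory, that yaml_template.yaml exists (the module copies it to template.yaml at import time), and that gradio, pandas, and PyYAML are installed; the demo card is a made-up fragment, and the paper/repository URLs are just the ChemBench links from this repo used as example values.

# Sketch only: call compute_coverage_score() and save_eval_card() directly.
import yaml

import app  # the module changed in the diff above

demo_card = yaml.safe_load(
    """
title: "Demo Eval"
summary: "Toy card used only to exercise the scoring helpers."
metadata:
  authors:
    - Jane Doe
  creation_date: "2025-03-12"
citation_and_usage:
  citation_information:
    recommended_citation: ""
"""
)

score, details = app.compute_coverage_score(demo_card)
print("total:", score)
for section, info in details.items():
    # each section reports how many leaf fields were filled in
    print(section, f'{info["fields_filled"]}/{info["fields_total"]} fields filled')

# save_eval_card() now also records the optional paper/repository links in the card metadata
message, path = app.save_eval_card(
    yaml.dump(demo_card),
    paper_url="https://arxiv.org/abs/2404.01475",
    repo_url="https://github.com/lamalab-org/eval-cards",
)
print(message, "->", path)
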
eval_cards/ChemBench_20250312_170522.yaml
DELETED
@@ -1,411 +0,0 @@
|
|
1 |
-
title: "ChemBench"
|
2 |
-
|
3 |
-
summary: >
|
4 |
-
ChemBench was developed as a comprehensive benchmarking suite for the performance of LLMs in chemistry.
|
5 |
-
It features a curation of more than 2,700 question-answer pairs classified to probe knowledge, intuition
|
6 |
-
and reasoning abilities of LLMs. ChemBench goes beyond simple MCQ evaluation, supports floating point
|
7 |
-
answers (also in scientific notation), and prompts models closely to how they were trained.
|
8 |
-
|
9 |
-
metadata:
|
10 |
-
authors:
|
11 |
-
- Adrian Mirza
|
12 |
-
- Nawaf Alampara
|
13 |
-
- Sreekanth Kunchapu
|
14 |
-
- Martiño Ríos-García
|
15 |
-
- Benedict Emoekabu
|
16 |
-
- Aswanth Krishnan
|
17 |
-
- Tanya Gupta
|
18 |
-
- Mara Schilling-Wilhelmi
|
19 |
-
- Macjonathan Okereke
|
20 |
-
- Anagha Aneesh
|
21 |
-
- Mehrdad Asgari
|
22 |
-
- Juliane Eberhardt
|
23 |
-
- Amir Mohammad Elahi
|
24 |
-
- Hani M. Elbeheiry
|
25 |
-
- María Victoria Gil
|
26 |
-
- Christina Glaubitz
|
27 |
-
- Maximilian Greiner
|
28 |
-
- Caroline T. Holick
|
29 |
-
- Tim Hoffmann
|
30 |
-
- Abdelrahman Ibrahim
|
31 |
-
- Lea C. Klepsch
|
32 |
-
- Yannik Köster
|
33 |
-
- Fabian Alexander Kreth
|
34 |
-
- Jakob Meyer
|
35 |
-
- Santiago Miret
|
36 |
-
- Jan Matthias Peschel
|
37 |
-
- Michael Ringleb
|
38 |
-
- Nicole Roesner
|
39 |
-
- Johanna Schreiber
|
40 |
-
- Ulrich S. Schubert
|
41 |
-
- Leanne M. Stafast
|
42 |
-
- Dinga Wonanke
|
43 |
-
- Michael Pieler
|
44 |
-
- Philippe Schwaller
|
45 |
-
- Kevin Maik Jablonka
|
46 |
-
maintainers:
|
47 |
-
- Adrian Mirza
|
48 |
-
- Nawaf Alampara
|
49 |
-
- Martiño Ríos-García
|
50 |
-
- Kevin Maik Jablonka
|
51 |
-
creation_date: "2023-05-15"
|
52 |
-
last_review_date: "2024-11-01"
|
53 |
-
next_review_date: "YTBD"
|
54 |
-
version_compatibility:
|
55 |
-
- "v0.3.0"
|
56 |
-
|
57 |
-
evaluation_design:
|
58 |
-
motivation:
|
59 |
-
scientific_needs: >
|
60 |
-
ChemBench is one of the pioneering benchmarks to evaluate performance of LLMs in chemistry specifically.
|
61 |
-
Prior selection of LLMs on chemistry tasks has been based on their performance on general benchmarks like Big Bench.
|
62 |
-
approach_justification: >
|
63 |
-
ChemBench comprehensively evaluates almost all the leading models on a wide range of chemistry topics,
|
64 |
-
allowing topic-specific leaders identification. It also probes safety knowledge of LLMs and evaluates
|
65 |
-
measures of alignment with human intuitions.
|
66 |
-
expected_benefits: >
|
67 |
-
Provides comparison metrics for LLM training on chemistry-specific tasks and evaluates performance
|
68 |
-
across different chemistry topics.
|
69 |
-
tradeoffs: >
|
70 |
-
Current LLMs lack human intuitions. ChemBench currently does not support evaluation of open-ended chemistry tasks.
|
71 |
-
|
72 |
-
type_and_structure:
|
73 |
-
type: "Benchmark"
|
74 |
-
structure: >
|
75 |
-
End-to-end automation, careful validation by experts, and usability with black box systems.
|
76 |
-
The benchmark covers a diverse set of topics and skills (reasoning, calculation, knowledge, and intuition)
|
77 |
-
across a range of difficulty levels.
|
78 |
-
timeline: ""
|
79 |
-
key_design_decisions:
|
80 |
-
- Benchmark approach for scalability and easier accessibility
|
81 |
-
- End-to-end automation for frequent model evaluation
|
82 |
-
- Careful validation by experts to minimize incorrect or unanswerable questions
|
83 |
-
- Support for models with special treatment of molecules
|
84 |
-
- Usability with black box systems without access to weights or logits
|
85 |
-
- Probing capabilities beyond MCQs to reflect real-world chemistry
|
86 |
-
- Coverage of diverse topics and skills
|
87 |
-
- Range of difficulty levels to measure improvement
|
88 |
-
- Impossible to completely solve with current models
|
89 |
-
design_process:
|
90 |
-
stakeholder_consultation: "ChemBench is internally used by some of the leading AI labs"
|
91 |
-
pilot_studies:
|
92 |
-
- "LLM ChemBench results were compared against humans using a subset of ChemBench"
|
93 |
-
validation_approaches:
|
94 |
-
- "Codebase tested with unit tests covering parsing modules, metrics modules, and extraction modules"
|
95 |
-
- "Questions verified manually by experts through GitHub pull requests"
|
96 |
-
- "Automated checks via GitHub Actions for schemas, LATEX templating, and formatting"
|
97 |
-
- "Leaderboard verification of complete corpus evaluation"
|
98 |
-
|
99 |
-
stakeholders_and_resources:
|
100 |
-
target_users:
|
101 |
-
- "General audience developing or evaluating ML models"
|
102 |
-
- "Researchers developing chemistry datasets"
|
103 |
-
required_expertise:
|
104 |
-
- "Basic knowledge of using benchmarks (simple how-to guide provided)"
|
105 |
-
resource_requirements:
|
106 |
-
- "API keys for closed-source models"
|
107 |
-
- "GPUs for fast local benchmarking (CPU also possible but slower)"
|
108 |
-
cost_considerations: "Nil"
|
109 |
-
|
110 |
-
estimand:
|
111 |
-
target_construct:
|
112 |
-
primary_capability: "Capabilities of models to answer chemistry questions"
|
113 |
-
measurement_type: "Pragmatic"
|
114 |
-
relationship_to_applications: >
|
115 |
-
ChemBench score can be considered a comparative metric to measure gains in LLM training.
|
116 |
-
Shows positive correlation to performance on tasks like data extraction.
|
117 |
-
theoretical_framework: >
|
118 |
-
Assumes the corpus is not being used for training during model development.
|
119 |
-
Findings on capabilities are based on performance in answering questions that rely on
|
120 |
-
reasoning, calculation, knowledge, and intuition for humans to solve.
|
121 |
-
|
122 |
-
scope_and_limitations:
|
123 |
-
coverage: >
|
124 |
-
Over 2,700 question-answer pairs classified to probe knowledge, intuition, and reasoning.
|
125 |
-
Covers subjects within Chemistry taught at undergraduate and postgraduate level courses.
|
126 |
-
excluded_capabilities:
|
127 |
-
- "Property prediction capabilities"
|
128 |
-
- "Data extraction capabilities"
|
129 |
-
- "Embedding meaningfulness"
|
130 |
-
- "Agentic capabilities"
|
131 |
-
known_blind_spots:
|
132 |
-
- "Questions considered answered correctly only if final answer is correct"
|
133 |
-
- "Partial scoring and open-ended evaluation not covered"
|
134 |
-
theoretical_limitations:
|
135 |
-
- "Questions treated with equal weights, no clear approach for weighing tasks"
|
136 |
-
- "Reliability and correlation between log probabilities and model responses not known"
|
137 |
-
|
138 |
-
assessment_components:
|
139 |
-
test_set:
|
140 |
-
data_sources:
|
141 |
-
- "Curated questions from existing exams or exercise sheets"
|
142 |
-
- "Programmatically created questions"
|
143 |
-
sampling_methodology: "Each model evaluated on all questions"
|
144 |
-
known_biases:
|
145 |
-
- "Questions mainly curated from the background of the developers"
|
146 |
-
approach_to_duplicates: >
|
147 |
-
Each question-answer pair hashed to create unique IDs, filtering to keep unique questions based on UUIDs.
|
148 |
-
data_quality: >
|
149 |
-
Guidelines followed by reviewers: originality, clarity, factual correctness, and avoiding ambiguity.
|
150 |
-
|
151 |
-
estimator:
|
152 |
-
evaluation_protocol:
|
153 |
-
methodology: >
|
154 |
-
Distinct prompt templates for completion and instruction-tuned models. Multistep parsing workflow
|
155 |
-
based on regular expressions with LLM extraction as fallback. Comprehensive refusal detection combining
|
156 |
-
regular expression-based detection and a fine-tuned BERT model.
|
157 |
-
control_measures:
|
158 |
-
- "Model-specific prompt templates"
|
159 |
-
- "Consistent parsing workflow"
|
160 |
-
- "Refusal detection and retry mechanism"
|
161 |
-
handling_random_components: "Refusal detection and retry mechanism for up to n times"
|
162 |
-
reproducibility_requirements: >
|
163 |
-
Storage of model timestamp, time, and version of the dataset used for benchmarking.
|
164 |
-
|
165 |
-
metrics:
|
166 |
-
primary_metrics:
|
167 |
-
- "Fraction of correctly answered questions"
|
168 |
-
aggregation_methodology: "Final score is mean of scores across all questions from all topics"
|
169 |
-
task_weightings:
|
170 |
-
approach: "All questions treated equally to avoid ambiguity"
|
171 |
-
note: "Questions classified into three difficulty levels manually by experts for further analysis"
|
172 |
-
performance_bounds:
|
173 |
-
scoring: "No partial scoring - all questions measured as correct/incorrect"
|
174 |
-
connection_to_outcomes: "Scores reflect how well the model is trained on chemistry"
|
175 |
-
|
176 |
-
metric_details:
|
177 |
-
- name: "Fraction Correct"
|
178 |
-
definition: >
|
179 |
-
Proportion of correct answers out of total questions. For MCQs, uses Hamming Loss;
|
180 |
-
for numerics, uses Mean Absolute Error with 1% threshold.
|
181 |
-
implementation: >
|
182 |
-
(1/n) * (sum(1-HammingLoss_i for i in MCQ) + sum(indicator(MAE_j < 0.01*|Target_j|) for j in Numeric))
|
183 |
-
edge_cases:
|
184 |
-
- "Perfect score: 1 when all questions answered correctly"
|
185 |
-
- "Complete failure: 0 when all questions answered incorrectly"
|
186 |
-
statistical_properties:
|
187 |
-
- "Simplicity: Easy to calculate and interpret"
|
188 |
-
- "Range: Always bounded between [0, 1]"
|
189 |
-
- "Binary nature: Each question contributes either 0 or 1"
|
190 |
-
failure_modes:
|
191 |
-
- "Masking: High overall accuracy can hide poor performance on specific question types"
|
192 |
-
- "Insensitivity to confidence: Doesn't account for prediction confidence"
|
193 |
-
- "Equal weighting: Assigns equal importance regardless of difficulty"
|
194 |
-
- "Heterogeneous data: Combining different question types with different evaluation criteria"
|
195 |
-
- "Threshold sensitivity: Results highly dependent on chosen thresholds"
|
196 |
-
- "Near-zero targets: For small target values, 1% threshold becomes extremely stringent"
|
197 |
-
|
198 |
-
- name: "Hamming Loss"
|
199 |
-
definition: >
|
200 |
-
Measures fraction of labels incorrectly predicted for MCQs.
|
201 |
-
(1/L) * sum(indicator(y_i,l != y_hat_i,l) for l in 1 to L)
|
202 |
-
implementation: "For single-answer MCQ, 0 if answer correct, 1 if incorrect"
|
203 |
-
statistical_properties:
|
204 |
-
- "Linearity: Scales linearly with misclassifications"
|
205 |
-
- "Range: Always bounded between [0, 1]"
|
206 |
-
- "Symmetry: Treats false positives and negatives equally"
|
207 |
-
failure_modes:
|
208 |
-
- "Equal weighting: Assigns equal importance regardless of difficulty"
|
209 |
-
- "Lack of severity grading: All errors weighted equally"
|
210 |
-
- "Multi-label complexity: May not capture label dependencies"
|
211 |
-
- "Simplistic for complex MCQs: Doesn't account for partial correctness"
|
212 |
-
|
213 |
-
technical_framework:
|
214 |
-
implementation_requirements:
|
215 |
-
- "Installing ChemBench package"
|
216 |
-
- "API keys for closed-source models"
|
217 |
-
- "GPUs for fast benchmarking (CPU also possible)"
|
218 |
-
time_constraints: "Complete benchmarking requires around 2 hours"
|
219 |
-
dependencies:
|
220 |
-
- "tenacity==8.3.0"
|
221 |
-
- "langchain>=0.1.5"
|
222 |
-
- "fastcore>=1.5.29"
|
223 |
-
- "scikit-learn>=1.4.0"
|
224 |
-
- "loguru>=0.7.2"
|
225 |
-
- "litellm>=1.59.1"
|
226 |
-
- "backoff>=2.2.1"
|
227 |
-
- "tqdm>=4.66.1"
|
228 |
-
- "pint>=0.23"
|
229 |
-
- "pandas>=2.2.0"
|
230 |
-
- "python-dotenv>=1.0.1"
|
231 |
-
- "fire>=0.5.0"
|
232 |
-
- "datasets"
|
233 |
-
- "torch"
|
234 |
-
- "transformers"
|
235 |
-
- "langchain-community>=0.0.17"
|
236 |
-
- "pillow"
|
237 |
-
|
238 |
-
constraints_and_rules:
|
239 |
-
allowed_resources:
|
240 |
-
- "Models not trained on the ChemBench corpus (not tested)"
|
241 |
-
permitted_approaches:
|
242 |
-
- "Tools or other agentic setups"
|
243 |
-
- "No constraints on model parameters or computational constraints"
|
244 |
-
- "No constraints on temperature or decoding strategies"
|
245 |
-
- "No constraints on architecture or post-training approaches"
|
246 |
-
optimization_constraints:
|
247 |
-
- "Prompts not optimized unless part of modeling"
|
248 |
-
ethical_boundaries:
|
249 |
-
- "Models not trained on the ChemBench corpus (not tested)"
|
250 |
-
|
251 |
-
estimate:
|
252 |
-
required_reporting:
|
253 |
-
essential_metrics:
|
254 |
-
- "all_correct (binary score of 0/1 for each question)"
|
255 |
-
- "Fraction correct (final score computed across all questions)"
|
256 |
-
- "Refusal detections and LLM parsing flags"
|
257 |
-
results_disaggregation: >
|
258 |
-
Individual scoring and relative position available for Topics:
|
259 |
-
Analytical Chemistry, Materials Science, Technical Chemistry, General Chemistry,
|
260 |
-
Physical Chemistry, Toxicity and Safety, Inorganic Chemistry, Organic Chemistry,
|
261 |
-
and Human Preference. Separate scores for easy/hard tasks, reasoning tasks,
|
262 |
-
computation tasks, knowledge tasks, human preference alignment, and comparison
|
263 |
-
against human chemists.
|
264 |
-
uncertainty_quantification: >
|
265 |
-
ChemBench has a unique way to obtain confidence of model predictions using prompting,
|
266 |
-
but this is a separate analysis not part of benchmark metrics.
|
267 |
-
performance_variation: "Currently not done"
|
268 |
-
resource_usage_reporting: "Currently tracks number of parameters if available"
|
269 |
-
|
270 |
-
reproducibility_information:
|
271 |
-
documentation_requirements:
|
272 |
-
- "model_name"
|
273 |
-
- "model_timestamp"
|
274 |
-
- "model_description"
|
275 |
-
- "date_published (optional)"
|
276 |
-
- "open_weights (optional)"
|
277 |
-
- "open_dataset (optional)"
|
278 |
-
- "nr_of_parameters (optional)"
|
279 |
-
- "github (optional)"
|
280 |
-
- "paper (optional)"
|
281 |
-
- "api_endpoint (optional)"
|
282 |
-
- "nr_of_tokens (optional)"
|
283 |
-
- "architecture (optional)"
|
284 |
-
- "mixture_of_experts (optional)"
|
285 |
-
- "model_alignment (optional)"
|
286 |
-
- "reinforcement_learning_from_human_feedback (optional)"
|
287 |
-
- "domain_specific_pretraining (optional)"
|
288 |
-
- "domain_specific_finetuning (optional)"
|
289 |
-
- "tool_use (optional)"
|
290 |
-
- "tool_type (optional)"
|
291 |
-
- "temperature (optional)"
|
292 |
-
- "epochs (optional)"
|
293 |
-
- "reasoning_model (optional)"
|
294 |
-
- "reasoning_type (optional)"
|
295 |
-
environment_specifications: >
|
296 |
-
Benchmarking performed using latest version of ChemBench pipeline and ChemBench Dataset.
|
297 |
-
randomization_handling: >
|
298 |
-
Temperature or other randomization or seeding expected in model description.
|
299 |
-
output_standardization: >
|
300 |
-
Outputs prompted to be given in ChemBench parsing compatible format.
|
301 |
-
|
302 |
-
results_communication:
|
303 |
-
visualization:
|
304 |
-
recommended_plots:
|
305 |
-
- "Spider chart showing model performance on different topics against baseline and other leading models"
|
306 |
-
- "Reliability and distribution of confidence estimates, showing confidence calibration"
|
307 |
-
standardized_formats:
|
308 |
-
- "Latest results maintained in ChemBench-Leaderboard"
|
309 |
-
- "Refusals counted as incorrect"
|
310 |
-
- "Baseline model as defined in paper"
|
311 |
-
- "Final answer based on ChemBench pipeline, not log probabilities"
|
312 |
-
|
313 |
-
leaderboard_guidelines:
|
314 |
-
submission_process: "Detailed in Huggingface Space documentation"
|
315 |
-
required_metadata:
|
316 |
-
- "Model details as specified in documentation requirements"
|
317 |
-
|
318 |
-
known_issues_and_limitations:
|
319 |
-
validity_concerns:
|
320 |
-
construct_validity: >
|
321 |
-
Even though ChemBench goes beyond MCQ-only benchmarks by including numeric questions,
|
322 |
-
evaluation on open-ended tasks is not included. Partial scoring and task weighing not supported.
|
323 |
-
gaming_possibilities: "Possibility to host ChemBench as a challenge"
|
324 |
-
stability_considerations: >
|
325 |
-
Refusal detection and retry mechanism implemented to tackle LLM refusals,
|
326 |
-
combining regex-based detection and fine-tuned BERT model.
|
327 |
-
temporal_validity: >
|
328 |
-
Questions based on scientific principles won't lose validity,
|
329 |
-
but may appear in training corpora over time.
|
330 |
-
|
331 |
-
practical_limitations:
|
332 |
-
resource_constraints: "Based on the model being benchmarked"
|
333 |
-
scalability_issues: "Based on the model being benchmarked"
|
334 |
-
cost_factors: "Based on the model being benchmarked"
|
335 |
-
time_boundaries: "Benchmark might lose validity as questions leak to training corpora"
|
336 |
-
|
337 |
-
bias_and_fairness:
|
338 |
-
known_biases:
|
339 |
-
- "Biases from human curation process"
|
340 |
-
representation_issues: "Certain areas of chemistry not evaluated"
|
341 |
-
potential_impacts: "Certain areas of chemistry not evaluated"
|
342 |
-
mitigation_approaches: "Curation by team of more than 10 people to balance biases"
|
343 |
-
|
344 |
-
version_and_maintenance:
|
345 |
-
version_information:
|
346 |
-
version:
|
347 |
-
results: "v1.0.4"
|
348 |
-
dataset: "v1.0.0"
|
349 |
-
code: "v0.3.0"
|
350 |
-
release_date: "2024-11-01"
|
351 |
-
change_history: "Tracked in GitHub repository changelog"
|
352 |
-
update_plans: "Discussed in GitHub repository discussions"
|
353 |
-
|
354 |
-
maintenance_protocol:
|
355 |
-
update_frequency: "Ad hoc after release"
|
356 |
-
deprecation_policy: >
|
357 |
-
Based on major issues with questions. Questions removed and dataset version updated.
|
358 |
-
Major updates lead to rerunning models for updated Leaderboard.
|
359 |
-
issue_reporting: "Issues tracked in GitHub repository"
|
360 |
-
community_involvement: >
|
361 |
-
Maintainers active in solving user issues on GitHub.
|
362 |
-
Proposal for forum in Mat Sci Community Disclosure.
|
363 |
-
Discussions available on GitHub and Huggingface.
|
364 |
-
criteria_for_updates:
|
365 |
-
- "Codebase updated for new features or bug fixes"
|
366 |
-
- "Dataset updated when questions added or removed"
|
367 |
-
- "Leaderboard updated for new models or dataset updates"
|
368 |
-
breaking_change_policy: >
|
369 |
-
All models in leaderboard rerun with new updates.
|
370 |
-
Update of arXiv paper released. Proposal to release a commit.
|
371 |
-
backwards_compatibility: >
|
372 |
-
Pydantic base classes for task and report stable for compatibility.
|
373 |
-
Major changes to tasks and report backward compatible.
|
374 |
-
migration_guides: "Released in documentation as needed"
|
375 |
-
|
376 |
-
citation_and_usage:
|
377 |
-
citation_information:
|
378 |
-
recommended_citation: >
|
379 |
-
@misc{mirza2024largelanguagemodelssuperhuman,
|
380 |
-
title={Are large language models superhuman chemists?},
|
381 |
-
author={Adrian Mirza and Nawaf Alampara and Sreekanth Kunchapu and Benedict Emoekabu and Aswanth Krishnan and Mara Wilhelmi and Macjonathan Okereke and Juliane Eberhardt and Amir Mohammad Elahi and Maximilian Greiner and Caroline T. Holick and Tanya Gupta and Mehrdad Asgari and Christina Glaubitz and Lea C. Klepsch and Yannik Köster and Jakob Meyer and Santiago Miret and Tim Hoffmann and Fabian Alexander Kreth and Michael Ringleb and Nicole Roesner and Ulrich S. Schubert and Leanne M. Stafast and Dinga Wonanke and Michael Pieler and Philippe Schwaller and Kevin Maik Jablonka},
|
382 |
-
year={2024},
|
383 |
-
eprint={2404.01475},
|
384 |
-
archivePrefix={arXiv},
|
385 |
-
primaryClass={cs.LG},
|
386 |
-
url={https://arxiv.org/abs/2404.01475},
|
387 |
-
}
|
388 |
-
related_publications:
|
389 |
-
- "Are large language models superhuman chemists? (https://arxiv.org/abs/2404.01475)"
|
390 |
-
- "Probing the limitations of multimodal language models for chemistry and materials research (https://arxiv.org/pdf/2411.16955)"
|
391 |
-
licensing_details: "MIT License"
|
392 |
-
|
393 |
-
usage_guidelines:
|
394 |
-
recommended_applications:
|
395 |
-
- "Evaluation of LLM capabilities in chemistry"
|
396 |
-
inappropriate_uses:
|
397 |
-
- "Training models with the ChemBench dataset"
|
398 |
-
implementation_best_practices: >
|
399 |
-
Results obtained with ChemBench pipeline and latest dataset at time of benchmarking considered valid practice.
|
400 |
-
ethical_considerations: "ChemBench dataset not meant for training"
|
401 |
-
|
402 |
-
additional_notes:
|
403 |
-
related_evaluations:
|
404 |
-
- "ChemBench extension for multimodal models (https://arxiv.org/pdf/2411.16955)"
|
405 |
-
- "MatText for bottlenecks of finetuned LLMs on property prediction (https://arxiv.org/abs/2406.17295)"
|
406 |
-
- "MaScQA for investigating materials science knowledge of LLMs (https://pubs.rsc.org/en/content/articlelanding/2024/dd/d3dd00188a)"
|
407 |
-
- "Measuring Capabilities of Language Models for Biology Research (https://arxiv.org/abs/2407.10362)"
|
408 |
-
future_directions: >
|
409 |
-
Sensitivity to prompting, improving performance with prompt optimization.
|
410 |
-
Mechanistic interpretability. Benchmarking agents on ChemBench.
|
411 |
-
Effect of grounding and post-training approaches.
|
script.js
CHANGED

@@ -87,4 +87,88 @@ function visualizeCoverage(scoreDetails) {
 document.addEventListener('DOMContentLoaded', function() {
     // This could be used to initialize charts or other client-side features
     console.log('Client-side JavaScript initialized');
-});
+});
+
+
+// Add this to script.js
+
+// Function to show the YAML content in a modal
+function viewYAML(filename) {
+    // Use Gradio's client.query to get the YAML content
+    gradioApp().querySelector('#file_action_component').querySelector('textarea').value = filename;
+    gradioApp().querySelector('#file_action_type_component').querySelector('textarea').value = 'view';
+
+    // Trigger the event
+    const viewButton = gradioApp().querySelector('#trigger_file_action');
+    viewButton.click();
+
+    // The result will show up in the modal that's created by the event handler
+}
+
+// Function to download a file
+function downloadFile(filename, format) {
+    let actionType;
+    if (format === 'yaml') actionType = 'download_yaml';
+    else if (format === 'markdown') actionType = 'download_md';
+    else if (format === 'latex') actionType = 'download_latex';
+    else return;
+
+    // Set the filename and action type
+    gradioApp().querySelector('#file_action_component').querySelector('textarea').value = filename;
+    gradioApp().querySelector('#file_action_type_component').querySelector('textarea').value = actionType;
+
+    // Trigger the event
+    const downloadButton = gradioApp().querySelector('#trigger_file_action');
+    downloadButton.click();
+
+    // The download will be handled by the event response
+}
+
+// Helper function to get the Gradio app element
+function gradioApp() {
+    return document.getElementsByTagName('gradio-app')[0].shadowRoot || document;
+}
+
+// Function to create and display a modal with content
+function showModal(content) {
+    // Create the modal elements
+    const modal = document.createElement('div');
+    modal.style.position = 'fixed';
+    modal.style.top = '0';
+    modal.style.left = '0';
+    modal.style.width = '100%';
+    modal.style.height = '100%';
+    modal.style.backgroundColor = 'rgba(0, 0, 0, 0.5)';
+    modal.style.zIndex = '1000';
+    modal.style.display = 'flex';
+    modal.style.justifyContent = 'center';
+    modal.style.alignItems = 'center';
+
+    const modalContent = document.createElement('div');
+    modalContent.style.backgroundColor = 'white';
+    modalContent.style.padding = '20px';
+    modalContent.style.borderRadius = '5px';
+    modalContent.style.maxWidth = '80%';
+    modalContent.style.maxHeight = '80%';
+    modalContent.style.overflow = 'auto';
+
+    const closeButton = document.createElement('button');
+    closeButton.textContent = 'Close';
+    closeButton.style.marginBottom = '10px';
+    closeButton.style.padding = '5px 10px';
+    closeButton.style.cursor = 'pointer';
+    closeButton.onclick = () => {
+        document.body.removeChild(modal);
+    };
+
+    const contentPre = document.createElement('pre');
+    contentPre.textContent = content;
+    contentPre.style.whiteSpace = 'pre-wrap';
+    contentPre.style.wordBreak = 'break-word';
+
+    modalContent.appendChild(closeButton);
+    modalContent.appendChild(contentPre);
+    modal.appendChild(modalContent);
+
+    document.body.appendChild(modal);
+}

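Note that the new viewYAML and downloadFile helpers query elements with the ids file_action_component, file_action_type_component, and trigger_file_action, which app.py does not create in this commit. A hypothetical wiring on the Python side could look roughly like the sketch below; the component names simply mirror the JS selectors, the handler body is a placeholder, and script.js would still have to be injected into the page (for example via the js/head arguments of gr.Blocks in recent Gradio releases).

# Hypothetical Gradio counterpart to the selectors used in script.js above.
import gradio as gr

def handle_file_action(filename: str, action_type: str) -> str:
    # Placeholder: a real handler would read eval_cards/<filename> for "view"
    # or convert the card for the download_* actions.
    return f"{action_type} requested for {filename}"

with gr.Blocks() as demo:
    # Hidden fields that the JavaScript fills in before clicking the hidden button
    file_action = gr.Textbox(visible=False, elem_id="file_action_component")
    file_action_type = gr.Textbox(visible=False, elem_id="file_action_type_component")
    trigger = gr.Button("Run file action", visible=False, elem_id="trigger_file_action")
    result = gr.Textbox(label="File action result")

    trigger.click(
        fn=handle_file_action,
        inputs=[file_action, file_action_type],
        outputs=[result],
    )
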
template.yaml
CHANGED
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
+# Evaluation Card Template
+title: "[Evaluation Name]"
+
+summary: >
+  Brief description of the evaluation approach, its purpose, and scope.
+
+metadata:
+  authors: []
+  maintainers: []
+  creation_date: ""
+  last_review_date: ""
+  next_review_date: ""
+  version_compatibility: []
+  repository_link: "" # Link to the code repository
+  paper_link: "" # Link to the research paper
+
+evaluation_design:
+  motivation:
+    scientific_needs: ""
+    approach_justification: ""
+    expected_benefits: ""
+    tradeoffs: ""
+
+  type_and_structure:
+    type: "" # benchmark, challenge, red teaming, deployment study, structured test
+    structure: ""
+    timeline: ""
+    key_design_decisions: []
+    design_process:
+      stakeholder_consultation: ""
+      pilot_studies: []
+      validation_approaches: []
+
+  stakeholders_and_resources:
+    target_users: []
+    required_expertise: []
+    resource_requirements: []
+    cost_considerations: ""
+
+estimand:
+  target_construct:
+    primary_capability: ""
+    measurement_type: "" # representational or pragmatic
+    relationship_to_applications: ""
+    theoretical_framework: ""
+
+  scope_and_limitations:
+    coverage: ""
+    excluded_capabilities: []
+    known_blind_spots: []
+    theoretical_limitations: []
+
+  assessment_components:
+    test_set:
+      data_sources: []
+      sampling_methodology: ""
+      known_biases: []
+      approach_to_duplicates: ""
+      data_quality: ""
+
+    challenge:
+      design_principles: []
+      task_selection_criteria: []
+      difficulty_progression: ""
+      time_constraints: ""
+
+    red_teaming:
+      probing_methodology: ""
+      coverage_strategy: ""
+      adversarial_approach: ""
+      safety_considerations: ""
+
+    deployment_study:
+      environment_characteristics: ""
+      integration_points: []
+      success_criteria: []
+      monitoring_approach: ""
+
+estimator:
+  evaluation_protocol:
+    methodology: ""
+    control_measures: []
+    handling_random_components: ""
+    reproducibility_requirements: ""
+
+  metrics:
+    primary_metrics: []
+    aggregation_methodology: ""
+    task_weightings: {}
+    performance_bounds: {}
+    connection_to_outcomes: ""
+
+  metric_details:
+    - name: ""
+      definition: ""
+      implementation: ""
+      edge_cases: []
+      statistical_properties: ""
+      baseline_values: {}
+      failure_modes: []
+
+  technical_framework:
+    implementation_requirements: []
+    time_constraints: ""
+    dependencies: []
+    authentication_needs: ""
+
+  constraints_and_rules:
+    allowed_resources: []
+    permitted_approaches: []
+    optimization_constraints: []
+    ethical_boundaries: []
+
+estimate:
+  required_reporting:
+    essential_metrics: []
+    results_disaggregation: ""
+    uncertainty_quantification: ""
+    performance_variation: ""
+    resource_usage_reporting: ""
+
+  reproducibility_information:
+    documentation_requirements: []
+    environment_specifications: ""
+    randomization_handling: ""
+    output_standardization: ""
+
+results_communication:
+  visualization:
+    recommended_plots: []
+    standardized_formats: []
+    key_comparisons: []
+
+  leaderboard_guidelines:
+    submission_process: ""
+    required_metadata: []
+
+known_issues_and_limitations:
+  validity_concerns:
+    construct_validity: ""
+    gaming_possibilities: ""
+    stability_considerations: ""
+    temporal_validity: ""
+
+  practical_limitations:
+    resource_constraints: ""
+    scalability_issues: ""
+    cost_factors: ""
+    time_boundaries: ""
+
+  bias_and_fairness:
+    known_biases: []
+    representation_issues: ""
+    potential_impacts: ""
+    mitigation_approaches: []
+
+version_and_maintenance:
+  version_information:
+    version: ""
+    release_date: ""
+    change_history: []
+    update_plans: ""
+
+  maintenance_protocol:
+    update_frequency: ""
+    deprecation_policy: ""
+    issue_reporting: ""
+    community_involvement: ""
+    criteria_for_updates: []
+    breaking_change_policy: ""
+    backwards_compatibility: ""
+    migration_guides: ""
+
+citation_and_usage:
+  citation_information:
+    recommended_citation: ""
+    related_publications: []
+    licensing_details: ""
+
+  usage_guidelines:
+    recommended_applications: []
+    inappropriate_uses: []
+    implementation_best_practices: ""
+    ethical_considerations: ""
+
+additional_notes:
+  related_evaluations: []
+  future_directions: ""
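The template above is a blank scaffold: every leaf is an empty string, list, or mapping. As a rough illustration (not part of this commit), a filled-in card could be sanity-checked against the template's top-level sections with a small Python helper; the function name check_card_sections and the file paths used below are hypothetical.

# Hypothetical helper: report which top-level template sections a card is missing.
# Assumes both files are valid YAML; paths are illustrative, not part of this commit.
import yaml

def check_card_sections(card_path: str, template_path: str = "template.yaml") -> list[str]:
    with open(template_path, "r") as f:
        template = yaml.safe_load(f)
    with open(card_path, "r") as f:
        card = yaml.safe_load(f) or {}
    # Any top-level key present in the template but absent from the card is flagged.
    return [key for key in template if key not in card]

# Example usage (hypothetical file name):
# missing = check_card_sections("eval_cards/my_eval.yaml")
# print("Missing sections:", missing)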
yaml_template.yaml
CHANGED
@@ -0,0 +1,188 @@
+# Evaluation Card Template
+title: "[Evaluation Name]"
+
+summary: >
+  Brief description of the evaluation approach, its purpose, and scope.
+
+metadata:
+  authors: []
+  maintainers: []
+  creation_date: ""
+  last_review_date: ""
+  next_review_date: ""
+  version_compatibility: []
+  repository_link: "" # Link to the code repository
+  paper_link: "" # Link to the research paper
+
+evaluation_design:
+  motivation:
+    scientific_needs: ""
+    approach_justification: ""
+    expected_benefits: ""
+    tradeoffs: ""
+
+  type_and_structure:
+    type: "" # benchmark, challenge, red teaming, deployment study, structured test
+    structure: ""
+    timeline: ""
+    key_design_decisions: []
+    design_process:
+      stakeholder_consultation: ""
+      pilot_studies: []
+      validation_approaches: []
+
+  stakeholders_and_resources:
+    target_users: []
+    required_expertise: []
+    resource_requirements: []
+    cost_considerations: ""
+
+estimand:
+  target_construct:
+    primary_capability: ""
+    measurement_type: "" # representational or pragmatic
+    relationship_to_applications: ""
+    theoretical_framework: ""
+
+  scope_and_limitations:
+    coverage: ""
+    excluded_capabilities: []
+    known_blind_spots: []
+    theoretical_limitations: []
+
+  assessment_components:
+    test_set:
+      data_sources: []
+      sampling_methodology: ""
+      known_biases: []
+      approach_to_duplicates: ""
+      data_quality: ""
+
+    challenge:
+      design_principles: []
+      task_selection_criteria: []
+      difficulty_progression: ""
+      time_constraints: ""
+
+    red_teaming:
+      probing_methodology: ""
+      coverage_strategy: ""
+      adversarial_approach: ""
+      safety_considerations: ""
+
+    deployment_study:
+      environment_characteristics: ""
+      integration_points: []
+      success_criteria: []
+      monitoring_approach: ""
+
+estimator:
+  evaluation_protocol:
+    methodology: ""
+    control_measures: []
+    handling_random_components: ""
+    reproducibility_requirements: ""
+
+  metrics:
+    primary_metrics: []
+    aggregation_methodology: ""
+    task_weightings: {}
+    performance_bounds: {}
+    connection_to_outcomes: ""
+
+  metric_details:
+    - name: ""
+      definition: ""
+      implementation: ""
+      edge_cases: []
+      statistical_properties: ""
+      baseline_values: {}
+      failure_modes: []
+
+  technical_framework:
+    implementation_requirements: []
+    time_constraints: ""
+    dependencies: []
+    authentication_needs: ""
+
+  constraints_and_rules:
+    allowed_resources: []
+    permitted_approaches: []
+    optimization_constraints: []
+    ethical_boundaries: []
+
+estimate:
+  required_reporting:
+    essential_metrics: []
+    results_disaggregation: ""
+    uncertainty_quantification: ""
+    performance_variation: ""
+    resource_usage_reporting: ""
+
+  reproducibility_information:
+    documentation_requirements: []
+    environment_specifications: ""
+    randomization_handling: ""
+    output_standardization: ""
+
+results_communication:
+  visualization:
+    recommended_plots: []
+    standardized_formats: []
+    key_comparisons: []
+
+  leaderboard_guidelines:
+    submission_process: ""
+    required_metadata: []
+
+known_issues_and_limitations:
+  validity_concerns:
+    construct_validity: ""
+    gaming_possibilities: ""
+    stability_considerations: ""
+    temporal_validity: ""
+
+  practical_limitations:
+    resource_constraints: ""
+    scalability_issues: ""
+    cost_factors: ""
+    time_boundaries: ""
+
+  bias_and_fairness:
+    known_biases: []
+    representation_issues: ""
+    potential_impacts: ""
+    mitigation_approaches: []
+
+version_and_maintenance:
+  version_information:
+    version: ""
+    release_date: ""
+    change_history: []
+    update_plans: ""
+
+  maintenance_protocol:
+    update_frequency: ""
+    deprecation_policy: ""
+    issue_reporting: ""
+    community_involvement: ""
+    criteria_for_updates: []
+    breaking_change_policy: ""
+    backwards_compatibility: ""
+    migration_guides: ""
+
+citation_and_usage:
+  citation_information:
+    recommended_citation: ""
+    related_publications: []
+    licensing_details: ""
+
+  usage_guidelines:
+    recommended_applications: []
+    inappropriate_uses: []
+    implementation_best_practices: ""
+    ethical_considerations: ""
+
+additional_notes:
+  related_evaluations: []
+  future_directions: ""
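yaml_template.yaml is added with the same 188 lines as template.yaml, so the two copies can drift apart if only one of them is edited later. A minimal consistency check is sketched below; the script is illustrative only and not part of this commit.

# Illustrative check that the two template copies have not drifted apart.
import filecmp
import sys

if not filecmp.cmp("template.yaml", "yaml_template.yaml", shallow=False):
    sys.exit("template.yaml and yaml_template.yaml differ; keep them in sync.")
print("Template copies are identical.")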