Upload 2 files
- app.py +433 -0
- questions.yaml +260 -0
app.py
ADDED
@@ -0,0 +1,433 @@
import gradio as gr
import yaml
import json
import os
from typing import Dict, List, Any, Tuple
from datetime import datetime

class AIEvaluationForm:
    def __init__(self, template_file: str = "questions.yaml"):
        """Initialize the evaluation form with questions from YAML file"""
        self.template_file = template_file
        self.template = self.load_template()
        self.components = {}

    def load_template(self) -> Dict:
        """Load evaluation template from YAML file"""
        try:
            with open(self.template_file, 'r', encoding='utf-8') as f:
                return yaml.safe_load(f)
        except FileNotFoundError:
            raise FileNotFoundError(f"Template file '{self.template_file}' not found. Please ensure the file exists.")
        except yaml.YAMLError as e:
            raise ValueError(f"Error parsing YAML file: {e}")

    def create_system_info_section(self) -> Tuple[List, Dict]:
        """Create the system information section"""
        components = {}

        with gr.Group():
            gr.Markdown("## 📋 AI System Information")
            gr.Markdown("*Please provide basic information about the AI system being evaluated.*")

            components['name'] = gr.Textbox(
                label="AI System Name",
                placeholder="e.g., GPT-4, BERT, StarCoder2",
                info="The official name of your AI system"
            )

            components['provider'] = gr.Textbox(
                label="Provider/Organization",
                placeholder="e.g., OpenAI, Google, BigCode",
                info="The organization that developed the system"
            )

            components['url'] = gr.Textbox(
                label="System URL",
                placeholder="e.g., https://huggingface.co/model-name",
                info="URL to the model, paper, or documentation"
            )

            components['type'] = gr.Dropdown(
                choices=[
                    "Large Language Model",
                    "Computer Vision Model",
                    "Multimodal Model",
                    "Speech/Audio Model",
                    "Reinforcement Learning Agent",
                    "Other"
                ],
                label="System Type",
                value="Large Language Model",
                info="Primary category of the AI system"
            )

            components['modalities'] = gr.CheckboxGroup(
                choices=[
                    "Text-to-Text",
                    "Text-to-Image",
                    "Image-to-Text",
                    "Image-to-Image",
                    "Audio",
                    "Video",
                    "Multimodal"
                ],
                label="Modalities (select all that apply)",
                value=["Text-to-Text"],
                info="Input/output modalities supported by the system"
            )

        return list(components.values()), components

    def create_evaluation_sections(self) -> Tuple[List, Dict]:
        """Create dynamic evaluation sections from template"""
        all_components = []
        section_components = {}

        for section_name, section_data in self.template.items():
            with gr.Group():
                gr.Markdown(f"## {section_name}")

                section_components[section_name] = {}

                for subsection_name, subsection_data in section_data.items():
                    with gr.Accordion(subsection_name, open=False):
                        # Explainer text
                        gr.Markdown(f"**Explainer:** {subsection_data['explainer']}")

                        # Overall status
                        status_component = gr.Radio(
                            choices=["Yes", "No", "N/A"],
                            label="Overall Status",
                            value="N/A",
                            info="Does this subsection apply to your system and have you conducted these evaluations?"
                        )

                        # Sources/Evidence
                        sources_component = gr.Textbox(
                            label="Sources & Evidence",
                            placeholder="Enter sources, papers, benchmarks, or evidence (one per line)\nExample:\nhttps://arxiv.org/abs/2402.19173\nBOLD Bias Benchmark\nInternal evaluation report",
                            lines=4,
                            info="Provide references to evaluations, papers, benchmarks, or internal reports"
                        )

                        # Individual questions
                        gr.Markdown("**Detailed Questions:**")
                        question_components = {}

                        # IMPORTANT: Add components in the correct order - status, sources, then questions
                        all_components.extend([status_component, sources_component])

                        for question in subsection_data['questions']:
                            question_component = gr.Checkbox(
                                label=question,
                                value=False,
                                info="Check if this evaluation has been performed"
                            )
                            question_components[question] = question_component
                            all_components.append(question_component)

                        section_components[section_name][subsection_name] = {
                            'status': status_component,
                            'sources': sources_component,
                            'questions': question_components
                        }

        return all_components, section_components

    def parse_sources(self, sources_text: str) -> List[Dict]:
        """Parse sources text into structured format"""
        sources = []

        # Handle case where sources_text might not be a string
        if not isinstance(sources_text, str):
            return sources

        if not sources_text.strip():
            return sources

        for line in sources_text.strip().split('\n'):
            line = line.strip()
            if not line:
                continue

            # Determine source type based on content
            if line.startswith('http'):
                source_type = "🌐"
                name = line.split('/')[-1] if '/' in line else line
            elif 'internal' in line.lower() or 'proprietary' in line.lower():
                source_type = "🏢"
                name = line
            else:
                source_type = "📄"
                name = line

            sources.append({
                "type": source_type,
                "detail": line,
                "name": name
            })

        return sources

    def generate_scorecard(self, *args) -> Tuple[Dict, str]:
        """Generate scorecard JSON from form inputs"""
        # Debug: Print argument types and counts
        print(f"Total arguments received: {len(args)}")
        for i, arg in enumerate(args[:10]):  # Print first 10 for debugging
            print(f"Arg {i}: {type(arg)} = {arg}")

        # Extract system info (first 5 arguments)
        name, provider, url, sys_type, modalities = args[:5]
        remaining_args = list(args[5:])

        # Build metadata
        metadata = {
            "Name": name or "Unknown",
            "Provider": provider or "Unknown",
            "URL": url or "",
            "Type": sys_type or "Unknown",
            "Modalities": modalities or []
        }

        # Build scores
        scores = {}
        arg_index = 0

        for section_name, section_data in self.template.items():
            scores[section_name] = {}

            for subsection_name, subsection_data in section_data.items():
                # Get status and sources (next 2 arguments)
                if arg_index < len(remaining_args):
                    status = remaining_args[arg_index]
                    print(f"Status for {section_name}/{subsection_name}: {type(status)} = {status}")
                else:
                    status = "N/A"

                if arg_index + 1 < len(remaining_args):
                    sources_text = remaining_args[arg_index + 1]
                    print(f"Sources for {section_name}/{subsection_name}: {type(sources_text)} = {sources_text}")
                else:
                    sources_text = ""

                # Ensure sources_text is a string
                if not isinstance(sources_text, str):
                    sources_text = str(sources_text) if sources_text is not None else ""

                # Parse sources
                sources = self.parse_sources(sources_text)

                # Get question responses
                questions_dict = {}
                question_start_index = arg_index + 2
                num_questions = len(subsection_data['questions'])

                for i, question in enumerate(subsection_data['questions']):
                    q_index = question_start_index + i
                    if q_index < len(remaining_args):
                        questions_dict[question] = remaining_args[q_index]
                    else:
                        questions_dict[question] = False

                # Store subsection data
                scores[section_name][subsection_name] = {
                    "status": status,
                    "sources": sources,
                    "questions": questions_dict
                }

                # Move to next subsection (2 for status/sources + number of questions)
                arg_index += 2 + num_questions

        # Create final scorecard
        scorecard = {
            "metadata": metadata,
            "scores": scores
        }

        # Generate filename
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        safe_name = (name or "ai_system").replace(' ', '_').lower()
        filename = f"{safe_name}_scorecard_{timestamp}.json"

        return scorecard, filename

    def create_interface(self):
        """Create the complete Gradio interface"""
        with gr.Blocks(
            title="AI System Evaluation Scorecard",
            # theme=gr.themes.Soft(),
            css="""
            .gradio-container {
                max-width: 1200px !important;
            }
            .accordion-header {
                background-color: #f0f0f0 !important;
            }
            """
        ) as demo:

            # Header
            gr.Markdown("""
            # 🔍 AI System Evaluation Scorecard

            This comprehensive evaluation form helps you assess AI systems across multiple dimensions including bias,
            cultural sensitivity, environmental impact, privacy, and more. Complete the sections relevant to your system
            to generate a detailed scorecard.

            ---
            """)

            # System information section
            system_inputs, system_components = self.create_system_info_section()

            # Evaluation sections
            eval_inputs, eval_components = self.create_evaluation_sections()
            self.components = {**system_components, **eval_components}

            # Generate button and outputs
            with gr.Group():
                gr.Markdown("## 📊 Generate Scorecard")

                with gr.Row():
                    generate_btn = gr.Button(
                        "🚀 Generate Evaluation Scorecard",
                        variant="primary",
                        size="lg",
                        scale=2
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear Form",
                        variant="secondary",
                        scale=1
                    )

            # Outputs
            with gr.Group():
                gr.Markdown("### 📋 Generated Scorecard")

                with gr.Row():
                    json_output = gr.JSON(
                        label="Scorecard JSON",
                        show_label=True
                    )

                with gr.Row():
                    download_file = gr.File(
                        label="Download Scorecard",
                        visible=False
                    )
                    download_btn = gr.Button(
                        "💾 Download JSON",
                        visible=False,
                        variant="secondary"
                    )

            # Event handlers
            all_inputs = system_inputs + eval_inputs

            def generate_with_progress(*args, progress=gr.Progress()):
                """Generate scorecard with progress indication"""
                # gr.Progress must be injected via the default-parameter
                # convention; a standalone instance is never tracked by Gradio.
                progress(0.3, desc="Processing inputs...")
                scorecard, filename = self.generate_scorecard(*args)

                progress(0.7, desc="Generating JSON...")
                json_content = json.dumps(scorecard, indent=2)

                progress(1.0, desc="Complete!")

                # Save to temporary file for download
                with open(filename, 'w') as f:
                    f.write(json_content)

                return (
                    scorecard,  # JSON display
                    gr.File(value=filename, visible=True),  # File for download
                    gr.Button(visible=True)  # Show download button
                )

            def clear_form():
                """Clear all form inputs"""
                return [None] * len(all_inputs)

            # Wire up events
            generate_btn.click(
                fn=generate_with_progress,
                inputs=all_inputs,
                outputs=[json_output, download_file, download_btn],
                show_progress="full"
            )

            clear_btn.click(
                fn=clear_form,
                outputs=all_inputs
            )

            # Add example data button
            with gr.Group():
                gr.Markdown("### 📚 Quick Start")
                example_btn = gr.Button("📝 Load Example Data", variant="secondary")

            def load_example():
                """Load example data for a StarCoder2-like system"""
                example_data = [
                    "StarCoder2",  # name
                    "BigCode",  # provider
                    "https://huggingface.co/bigcode/starcoder2-15b",  # url
                    "Large Language Model",  # type
                    ["Text-to-Text"]  # modalities
                ]
                # Add default values for evaluation sections (all N/A initially)
                remaining_defaults = []
                for section_name, section_data in self.template.items():
                    for subsection_name, subsection_data in section_data.items():
                        remaining_defaults.extend([
                            "N/A",  # status
                            "",  # sources
                            *([False] * len(subsection_data['questions']))  # questions
                        ])

                return example_data + remaining_defaults

            example_btn.click(
                fn=load_example,
                outputs=all_inputs
            )

        return demo

def main():
    """Main function to run the application"""
    try:
        # Create the evaluation form
        eval_form = AIEvaluationForm("questions.yaml")

        # Create and launch the interface
        demo = eval_form.create_interface()

        print("🚀 Launching AI Evaluation Scorecard...")
        print(f"📁 Loading questions from: {eval_form.template_file}")
        print(f"📊 Found {len(eval_form.template)} evaluation categories")

        # Count total questions
        total_questions = sum(
            len(subsection['questions'])
            for section in eval_form.template.values()
            for subsection in section.values()
        )
        print(f"❓ Total evaluation questions: {total_questions}")

        demo.launch(ssr_mode=False)

    except FileNotFoundError as e:
        print(f"❌ Error: {e}")
        print("Please ensure 'questions.yaml' exists in the current directory.")
    except Exception as e:
        print(f"❌ Unexpected error: {e}")

if __name__ == "__main__":
    main()
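
For orientation, a scorecard returned by generate_scorecard has roughly the following shape, shown here as a Python literal with illustrative values only (the keys mirror the code above; the 🌐 icon and the "2402.19173" name are what parse_sources would produce for the arXiv URL used in the sources placeholder):

    example_scorecard = {
        "metadata": {
            "Name": "StarCoder2",
            "Provider": "BigCode",
            "URL": "https://huggingface.co/bigcode/starcoder2-15b",
            "Type": "Large Language Model",
            "Modalities": ["Text-to-Text"],
        },
        "scores": {
            "1. Bias, Stereotypes, and Representational Harms Evaluation": {
                "1.1 Bias Detection Overview": {
                    "status": "Yes",
                    "sources": [
                        {"type": "🌐",
                         "detail": "https://arxiv.org/abs/2402.19173",
                         "name": "2402.19173"},
                    ],
                    "questions": {
                        "Have bias evaluations been run with human participants?": True,
                    },
                },
            },
        },
    }
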
questions.yaml
ADDED
@@ -0,0 +1,260 @@
# AI System Evaluation Template
# This template contains all evaluation categories and questions for comprehensive AI system assessment

"1. Bias, Stereotypes, and Representational Harms Evaluation":
  "1.1 Bias Detection Overview":
    explainer: "Has the AI system been comprehensively evaluated across multiple stages of the system development chain using diverse evaluation techniques?"
    questions:
      - "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)"
      - "Have intrinsic properties of the AI system been evaluated for bias (e.g., embedding analysis)"
      - "Have extrinsic bias evaluations been run (e.g., downstream task performance)"
      - "Have evaluations been run across all applicable modalities"
      - "Have bias evaluations been run that take the form of automatic quantitative evaluation"
      - "Have bias evaluations been run with human participants?"

  "1.2 Protected Classes and Intersectional Measures":
    explainer: "Does the evaluation include a sufficiently broad range of protected classes that are disproportionately subject to harm by in-scope uses of the system, and the intersections of these classes?"
    questions:
      - "Do evaluations cover all applicable legal protected categories for in-scope uses of the system?"
      - "Do evaluations cover additional subgroups that are likely to be harmed based on other personal characteristics"
      - "Evaluation of how different aspects of identity interact and compound in AI system behavior"
      - "Evaluation of AI system biases for legal protected categories and additional relevant subgroups"

  "1.3 Measurement of Stereotypes and Harmful Associations":
    explainer: "Has the AI system been evaluated for the presence of harmful associations and stereotypes in its outputs?"
    questions:
      - "Measurement of known stereotypes in AI system outputs"
      - "Measurement of other negative associations and assumptions regarding specific groups"
      - "Measurement of stereotypes and negative associations across in-scope contexts"

  "1.4 Bias Evaluation Transparency and Documentation":
    explainer: "Are the AI system's bias evaluations clearly documented for easy reproduction and interpretation?"
    questions:
      - "Sufficient documentation of evaluation methods (including code and datasets) to replicate findings"
      - "Sufficient documentation of evaluation results (including intermediary statistics) to support comparison to other AI systems"
      - "Documentation of bias mitigation measures, including their secondary impacts"
      - "Documentation of bias monitoring approaches post-release/deployment if applicable"

"2. Cultural Values and Sensitive Content Evaluation":
  "2.1 Cultural Variation Overview":
    explainer: "Has the AI system been comprehensively evaluated for cultural variation across multiple stages of the system development chain using diverse evaluation techniques?"
    questions:
      - "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)"
      - "Have intrinsic properties of the AI system been evaluated for cultural variation (e.g., embedding analysis)"
      - "Have extrinsic cultural variation evaluations been run (e.g., downstream task performance)"
      - "Have evaluations been run across all applicable modalities"
      - "Have cultural variation evaluations been run that take the form of automatic quantitative evaluation"
      - "Have cultural variation evaluations been run with human participants?"

  "2.2 Cultural Diversity and Representation":
    explainer: "Has the AI system been evaluated for its respect towards cultural values and norms across in-scope uses and contexts? Does the evaluation examine cultural diversity both across and within different regions and communities?"
    questions:
      - "Use of evaluation methods developed in the cultural contexts in scope"
      - "Respect of indigenous sovereignty, protected rights, and cultural norms in AI system-generated content"
      - "Evaluation of cultural variation across geographic dimensions"
      - "Evaluation of cultural variation representing communities' perspectives within geographical contexts"
      - "Analysis of how cultural context affects AI system performance"

  "2.3 Generated Sensitive Content across Cultural Contexts":
    explainer: "Has the AI system been evaluated for the potential negative impacts and implications of its generated content across different cultural contexts? Has the system been evaluated for its handling of hate speech, harmful content, and culturally sensitive material?"
    questions:
      - "Has the AI system been evaluated for its likelihood of facilitating generation of threatening or violent content"
      - "Has the AI system been evaluated for its likelihood of facilitating generation of targeted harassment or discrimination"
      - "Has the AI system been evaluated for its likelihood of facilitating generation of hate speech"
      - "Has the AI system been evaluated for its likelihood of exposing its direct users to content embedding values and assumptions not reflective of their cultural context"
      - "Has the AI system been evaluated for its likelihood of exposing its direct users to inappropriate content for their use context"
      - "Has the AI system been evaluated for its likelihood of exposing its direct users to content with negative psychological impacts"
      - "Has the evaluation of the AI system's behaviors explicitly considered cultural variation in their definition"

  "2.4 Cultural Variation Transparency and Documentation":
    explainer: "Are the cultural limitations of the evaluation methods clearly documented? Has a comprehensive, culturally-informed evaluation methodology been implemented?"
    questions:
      - "Documentation of cultural contexts considered during development"
      - "Documentation of the range of cultural contexts covered by evaluations"
      - "Sufficient documentation of evaluation method to understand the scope of the findings"
      - "Construct validity, documentation of strengths, weaknesses, and assumptions"
      - "Domain shift between evaluation development and AI system development settings"
      - "Sufficient documentation of evaluation methods to replicate findings"
      - "Sufficient documentation of evaluation results to support comparison"
      - "Documentation of psychological impact on evaluators reviewing harmful content"
      - "Documentation of measures to protect evaluator well-being"

"3. Disparate Performance Evaluation":
  "3.1 Disparate Performance Overview":
    explainer: "Has the AI system been comprehensively evaluated for disparity in performance across groups in specific task and deployment contexts?"
    questions:
      - "Have development choices and intrinsic properties of the AI system been evaluated for their contribution to disparate performance?"
      - "Have extrinsic disparate performance evaluations been run"
      - "Have evaluations been run across all applicable modalities"
      - "Have disparate performance evaluations been run that take the form of automatic quantitative evaluation"
      - "Have disparate performance evaluations been run with human participants"

  "3.2 Identifying Target Groups for Disparate Performance Evaluation":
    explainer: "Has the evaluation identified subgroups more likely to be harmed by disparate performance in context by considering the scope of the AI system's application and its relationship to existing systemic issues?"
    questions:
      - "Identification of mandated target groups based on legal nondiscrimination frameworks"
      - "Identification of further target groups that are likely to be harmed by disparate performance"
      - "Assessment of systemic barriers in dataset collection methods for different groups"
      - "Consideration of historical disparities in the task in which the AI system is deployed"
      - "Identification of both implicit and explicit markers for the target groups"

  "3.3 Subgroup Performance Analysis":
    explainer: "Has the AI system been evaluated for disparate performance across different subpopulations for specific in-scope applications of the AI system?"
    questions:
      - "Non-aggregated evaluation results across subpopulations, including feature importance and consistency analysis"
      - "Metrics to measure performance in decision-making tasks"
      - "Metrics to measure disparate performance in other tasks including generative tasks"
      - "Worst-case subgroup performance analysis, including performance on rare or underrepresented cases"
      - "Intersectional analysis examining performance across combinations of subgroups"
      - "Do evaluations of disparate performance account for implicit social group markers"

  "3.4 Disparate Performance Evaluation Transparency and Documentation":
    explainer: "Are the disparate performance evaluations clearly documented for easy reproduction and interpretation?"
    questions:
      - "Sufficient documentation of evaluation method to understand the scope of the findings"
      - "Documentation of strengths, weaknesses, and assumptions about the context"
      - "Documentation of domain shift between evaluation and deployment settings"
      - "Sufficient documentation of evaluation methods to replicate findings"
      - "Sufficient documentation of evaluation results to support comparison"
      - "Documentation of disparate performance mitigation measures"
      - "Documentation of disparate performance monitoring approaches"

"4. Environmental Costs and Carbon Emissions Evaluation":
  "4.1 Environmental Costs Overview":
    explainer: "Has the AI system been comprehensively evaluated across multiple stages of the system development chain using diverse evaluation techniques?"
    questions:
      - "Evaluations of different processes within development and deployment"
      - "Have evaluations been run across all applicable modalities?"
      - "Have evaluations been run on standardized benchmarks or metrics?"
      - "Have evaluations taken into account community feedback from regions affected by data center power consumption?"
      - "Do evaluations consider the full supply chain including environmental impact of hardware components and data centers used?"

  "4.2 Energy Cost and Environmental Impact of Development":
    explainer: "Has the AI system been comprehensively evaluated for its carbon footprint and broader environmental impact?"
    questions:
      - "Accounting of FLOPS across development stages"
      - "Evaluation of energy consumption using standardized tracking tools"
      - "Evaluation of carbon impact accounting for regional energy sources"
      - "Evaluation of hardware lifecycle environmental impact"

  "4.3 Energy Cost and Environmental Impact of Deployment":
    explainer: "Has the AI system been evaluated for its hardware resource usage and efficiency?"
    questions:
      - "Evaluation of inference FLOPS for the system"
      - "Evaluation of inference energy consumption on the most common deployment setting"
      - "Evaluation of inference energy consumption on multiple deployment settings"
      - "Evaluation of task-specific energy consumption variations"
      - "Evaluation of carbon impact for deployment infrastructure"
      - "Evaluation of hardware lifecycle environmental impact for deployment"

  "4.4 Environmental Costs Transparency and Documentation":
    explainer: "Are the limitations of the evaluation methods clearly documented? Has a comprehensive environmental evaluation methodology been implemented?"
    questions:
      - "Documentation about equipment and infrastructure specifications"
      - "Sufficient documentation of evaluation methods including components covered"
      - "Sufficient documentation of evaluation methods to replicate findings"
      - "Sufficient documentation of evaluation results for comparison"

"5. Privacy and Data Protection Evaluation":
  "5.1 Privacy and Data Protection Overview":
    explainer: "Has the AI system been comprehensively evaluated for privacy across multiple stages of the system development chain using diverse evaluation techniques?"
    questions:
      - "Evaluations at various stages (data collection, preprocessing, AI system architecture, training, deployment)"
      - "Have intrinsic properties of the AI system been evaluated for privacy vulnerabilities"
      - "Have extrinsic privacy evaluations been run"
      - "Have evaluations been run across all applicable modalities"
      - "Have privacy evaluations been run that take the form of automatic quantitative evaluation"
      - "Have privacy evaluations been run with human participants?"

  "5.2 Privacy, Likeness, and Publicity Harms":
    explainer: "Has the AI system been evaluated for risks to personal integrity, privacy, and control of one's likeness?"
    questions:
      - "Has the AI system been evaluated for its likelihood of revealing personal information from its training data?"
      - "Has the AI system been evaluated for its likelihood of facilitating generation of content impersonating an individual?"
      - "Has the AI system been evaluated for its likelihood of providing made up or confabulated personal information about individuals?"

  "5.3 Intellectual Property and Information Security":
    explainer: "Has the AI system been evaluated for its likelihood of reproducing sensitive information or information with attached property rights?"
    questions:
      - "Has the AI system been evaluated for its likelihood of reproducing other categories of information from its training data"
      - "Has the system been evaluated for other information security risks for in-scope uses"

  "5.4 Privacy Evaluation Transparency and Documentation":
    explainer: "Are the privacy evaluations clearly documented to enable understanding of privacy risks, limitations, and reproducibility of findings?"
    questions:
      - "Documentation of the categories of training data that present information risk"
      - "Documentation of evaluation methods to replicate findings"
      - "Documentation of evaluation results to support comparison"
      - "Documentation of evaluation limitations"
      - "Documentation of deployment considerations"

"6. Financial Costs Evaluation":
  "6.1 Financial Costs Overview":
    explainer: "Has the AI system been comprehensively evaluated for system costs across multiple stages of development and deployment?"
    questions:
      - "Evaluation of costs at various stages"
      - "Have costs been evaluated for different system components"
      - "Have cost evaluations been run across all applicable modalities"
      - "Have cost evaluations included both direct and indirect expenses"
      - "Have cost projections been validated against actual expenses"

  "6.2 Development and Training Costs":
    explainer: "Has the AI system been evaluated for costs associated with development and training phases?"
    questions:
      - "Assessment of research and development labor costs"
      - "Evaluation of data collection and preprocessing costs"
      - "Assessment of training infrastructure costs"
      - "Assessment of costs associated with different training approaches"
      - "Evaluation of model architecture and size impact on costs"

  "6.3 Deployment and Operation Costs":
    explainer: "Has the AI system been evaluated for ongoing deployment and operational costs?"
    questions:
      - "Assessment of inference and serving costs"
      - "Evaluation of storage and hosting expenses"
      - "Assessment of scaling costs based on usage patterns"
      - "Evaluation of costs specific to different deployment contexts"
      - "Assessment of costs for model updates or fine-tuning by end users"

  "6.4 Financial Cost Documentation and Transparency":
    explainer: "Are the financial cost evaluations clearly documented to enable understanding and planning?"
    questions:
      - "Sufficient documentation of cost evaluation methodology and assumptions"
      - "Sufficient documentation of cost breakdowns and metrics"
      - "Documentation of cost variations across different usage scenarios"
      - "Documentation of long-term cost projections and risk factors"

"7. Data and Content Moderation Labor Evaluation":
  "7.1 Labor Evaluation Overview":
    explainer: "Has the AI system been comprehensively evaluated for labor practices across different stages of AI system development and deployment?"
    questions:
      - "Evaluation of labor practices at various stages"
      - "Have labor conditions been evaluated for different worker categories"
      - "Have labor evaluations been run across all applicable task types"
      - "Have labor practices been evaluated against established industry standards"
      - "Have labor evaluations included both direct employees and contracted workers"
      - "Have evaluations considered different regional and jurisdictional contexts"

  "7.2 Working Conditions and Compensation":
    explainer: "Has the AI system been evaluated for its labor practices, compensation structures, and working conditions?"
    questions:
      - "Assessment of compensation relative to local living wages and industry standards"
      - "Assessment of job security and employment classification"
      - "Evaluation of workplace safety, worker protections and rights"
      - "Assessment of worker autonomy and task assignment practices"
      - "Evaluation of power dynamics and worker feedback mechanisms"

  "7.3 Worker Wellbeing and Support":
    explainer: "Has the AI system been evaluated for its support of worker wellbeing, particularly for those exposed to challenging content?"
    questions:
      - "Assessment of psychological support systems, trauma resources, and other long-term mental health monitoring"
      - "Evaluation of training and preparation for difficult content"
      - "Evaluation of cultural and linguistic support for diverse workforces"

  "7.4 Labor Practice Documentation and Transparency":
    explainer: "Are the labor evaluations clearly documented to enable understanding and accountability?"
    questions:
      - "Documentation of labor evaluation methodology and frameworks used"
      - "Documentation of worker demographics and task distribution"
      - "Documentation of support systems, worker protections"
      - "Documentation of incident reporting and resolution procedures"
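
A quick standalone check that questions.yaml matches the structure app.py expects (section → subsection → {explainer, questions}) takes a few lines of Python; this is a local sanity-check sketch, not part of the Space:

    import yaml

    with open("questions.yaml", encoding="utf-8") as f:
        template = yaml.safe_load(f)

    for section, subsections in template.items():
        for name, sub in subsections.items():
            # app.py reads exactly these two keys per subsection
            assert isinstance(sub["explainer"], str), f"bad explainer in {name}"
            assert isinstance(sub["questions"], list), f"bad questions in {name}"
        total = sum(len(sub["questions"]) for sub in subsections.values())
        print(f"{section}: {total} questions")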