kaikaidai commited on
Commit
7db401b
·
verified ·
1 Parent(s): 2642da4

Synced repo using 'sync_with_huggingface' Github Action

Browse files
.env.example ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ TOGETHER_API_KEY=your_together_api_key_here
2
+ ANTHROPIC_API_KEY=your_anthropic_api_key_here
3
+ OPENAI_API_KEY=your_openai_api_key_here
app.py CHANGED
@@ -1,468 +1,58 @@
1
- import json
2
- import re
3
- from datetime import datetime
4
- import hashlib
5
- import gradio as gr
6
-
7
- from gen_api_answer import (
8
- atla_parse_model_response,
9
- get_atla_response
10
- )
11
-
12
- from prompts import (
13
- ATLA_PROMPT,
14
- ATLA_PROMPT_WITH_REFERENCE
15
- )
16
-
17
- from random_sample_generation import (
18
- get_random_human_ai_pair,
19
- get_random_human_ai_ground_truth_pair,
20
- generate_ai_response
21
- )
22
-
23
- from utils import Vote
24
-
25
- from prompts import (
26
- DEFAULT_EVAL_PROMPT,
27
- DEFAULT_EVAL_PROMPT_EDITABLE,
28
- FIXED_EVAL_SUFFIX,
29
- DEFAULT_EVAL_CRITERIA
30
- )
31
-
32
- from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS
33
-
34
-
35
- # Load the model_data from JSONL
36
- def load_model_data():
37
- model_data = {}
38
- try:
39
- with open("data/models.jsonl", "r") as f:
40
- for line in f:
41
- model = json.loads(line)
42
- model_data[model["name"]] = {
43
- "organization": model["organization"],
44
- "license": model["license"],
45
- "api_model": model["api_model"],
46
- }
47
- except FileNotFoundError:
48
- print("Warning: models.jsonl not found")
49
- return {}
50
- return model_data
51
-
52
-
53
- model_data = load_model_data()
54
-
55
- def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
56
- prompt_value = prompt.value if hasattr(prompt, 'value') else prompt
57
-
58
- vote = Vote(
59
- timestamp=datetime.now().isoformat(),
60
- prompt=prompt_value,
61
- response_a=response_a,
62
- response_b=response_b,
63
- model_a=model_a,
64
- model_b=model_b,
65
- winner=winner,
66
- judge_id=judge_id,
67
- )
68
- add_vote(vote, db)
69
-
70
-
71
- def parse_variables(prompt):
72
- # Extract variables enclosed in double curly braces
73
- variables = re.findall(r"{{(.*?)}}", prompt)
74
- # Remove duplicates while preserving order
75
- seen = set()
76
- variables = [
77
- x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
78
- ]
79
- return variables
80
-
81
-
82
- def get_final_prompt(eval_prompt, variable_values):
83
- # Replace variables in the eval prompt with their values
84
- for var, val in variable_values.items():
85
- eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
86
- return eval_prompt
87
-
88
-
89
-
90
- def get_ip(request: gr.Request) -> str:
91
- """Get and hash the IP address from the request."""
92
- if "cf-connecting-ip" in request.headers:
93
- ip = request.headers["cf-connecting-ip"]
94
- elif "x-forwarded-for" in request.headers:
95
- ip = request.headers["x-forwarded-for"]
96
- if "," in ip:
97
- ip = ip.split(",")[0]
98
- else:
99
- ip = request.client.host
100
-
101
- # Hash the IP address for privacy
102
- return hashlib.sha256(ip.encode()).hexdigest()[:16]
103
-
104
-
105
- def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
106
- """Generate appropriate message based on vote and model rankings.
107
- Returns (title, message) tuple."""
108
- # Get current rankings
109
- voting_data = get_current_votes()
110
- leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
111
- rankings = get_model_rankings(leaderboard)
112
- pos_a = rankings.get(model_a, 0)
113
- pos_b = rankings.get(model_b, 0)
114
-
115
- if choice == "Tie":
116
- return "It's a tie!", "Keep voting responsibly 🤗"
117
-
118
- # Check if vote aligns with leaderboard
119
- if (choice == "A" and pos_a < pos_b) or (choice == "B" and pos_b < pos_a):
120
- return "The favourite wins!", "Keep voting responsibly 🤗"
121
- else:
122
- return "The underdog wins!", "Keep voting responsibly 🤗"
123
-
124
-
125
- def populate_random_example(request: gr.Request, compatible_mode: bool):
126
- """Generate a random human-AI conversation example and reset judge outputs."""
127
- if compatible_mode:
128
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
129
- else:
130
- human_msg, ai_msg = get_random_human_ai_pair()
131
- ground_truth_msg = ""
132
-
133
- return [
134
- gr.update(value=human_msg),
135
- gr.update(value=ai_msg),
136
- gr.update(value="🎲", variant="secondary"),
137
- gr.update(value=""), # Clear score
138
- gr.update(value=""), # Clear critique
139
- gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
140
- ]
141
-
142
-
143
- with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
144
- gr.Markdown(MAIN_TITLE)
145
- gr.Markdown(HOW_IT_WORKS)
146
-
147
- # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
148
- eval_prompt = gr.Textbox(
149
- value=DEFAULT_EVAL_PROMPT,
150
- visible=False
151
- )
152
-
153
- with gr.Tabs():
154
- with gr.TabItem("Playground"):
155
- with gr.Row():
156
- # Left side - Input section
157
- with gr.Column(scale=1):
158
- with gr.Group():
159
- human_input = gr.TextArea(
160
- label="👩 User Input",
161
- lines=5,
162
- placeholder="Enter the human message here..."
163
- )
164
- with gr.Row():
165
- generate_btn = gr.Button(
166
- "Generate AI Response",
167
- size="sm",
168
- interactive=False
169
- )
170
-
171
- ai_response = gr.TextArea(
172
- label="🤖 AI Response",
173
- lines=10,
174
- placeholder="Enter the AI response here..."
175
- )
176
-
177
- # Ground truth response (initially hidden)
178
- ground_truth = gr.TextArea(
179
- label="🎯 Ground truth response",
180
- lines=10,
181
- placeholder="Enter the ground truth response here...",
182
- visible=False
183
- )
184
-
185
- with gr.Row():
186
- random_btn = gr.Button("🎲", scale=2)
187
- send_btn = gr.Button(
188
- value="Run evaluation",
189
- variant="primary",
190
- size="lg",
191
- scale=8
192
- )
193
-
194
- # Right side - Model outputs
195
- with gr.Column(scale=1):
196
- gr.Markdown("### 👩‍⚖️ Selene-Mini Evaluation")
197
- with gr.Group():
198
- with gr.Row():
199
- score = gr.Textbox(label="Score", lines=1, interactive=False)
200
- critique = gr.TextArea(label="Critique", lines=12, interactive=False)
201
-
202
- gr.Markdown("<br>")
203
-
204
 
205
- # Replace the "Edit Judge Prompt" Accordion section with:
206
- with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
207
- gr.Markdown("<br>")
208
- use_reference_toggle = gr.Checkbox(
209
- label="Use a reference response",
210
- value=False
211
- )
212
-
213
- # Hide the default prompt editor
214
- with gr.Column(visible=False) as default_prompt_editor:
215
- eval_prompt_editable = gr.TextArea(
216
- value=DEFAULT_EVAL_PROMPT_EDITABLE,
217
- label="Evaluation Criteria",
218
- lines=12
219
- )
220
-
221
- with gr.Row(visible=False) as edit_buttons_row:
222
- cancel_prompt_btn = gr.Button("Cancel")
223
- save_prompt_btn = gr.Button("Save", variant="primary")
224
- gr.Markdown("*The sample being evaluated is always appended as:*")
225
- gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
 
227
- # Show the compatible mode editor
228
- with gr.Column(visible=True) as compatible_prompt_editor:
229
- eval_criteria_text = gr.TextArea(
230
- label="Evaluation Criteria",
231
- lines=12,
232
- value=DEFAULT_EVAL_CRITERIA,
233
- placeholder="Enter the complete evaluation criteria and scoring rubric..."
234
- )
235
-
236
- # Define state variables for model tracking
237
- model_a_state = gr.State()
238
- model_b_state = gr.State()
239
- final_prompt_state = gr.State()
240
- eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
241
- is_editing = gr.State(False) # Track editing state
242
- compatible_mode_state = gr.State(False) # Track compatible mode state
243
-
244
- # Update model names after responses are generated
245
- def update_model_names(model_a, model_b):
246
- return gr.update(value=f"*Model: {model_a}*"), gr.update(
247
- value=f"*Model: {model_b}*"
248
- )
249
-
250
- # Store the last submitted prompt and variables for comparison
251
- last_submission = gr.State({})
252
-
253
- # Add handlers for save/cancel buttons
254
- def save_prompt(new_prompt, previous_prompt):
255
- return [
256
- gr.update(value=new_prompt), # Update the prompt
257
- new_prompt, # Update the previous prompt state
258
- gr.update(visible=False) # Hide the buttons
259
- ]
260
-
261
- def cancel_prompt(previous_prompt):
262
- return [
263
- gr.update(value=previous_prompt), # Revert to previous prompt
264
- previous_prompt, # Keep the previous prompt state
265
- gr.update(visible=False) # Hide the buttons
266
- ]
267
-
268
- def show_edit_buttons(current_value, previous_value):
269
- # Show buttons only if the current value differs from the previous value
270
- return gr.update(visible=current_value != previous_value)
271
-
272
- # Add handlers for save/cancel buttons and prompt changes
273
- save_prompt_btn.click(
274
- fn=save_prompt,
275
- inputs=[eval_prompt_editable, eval_prompt_previous],
276
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
277
- )
278
-
279
- cancel_prompt_btn.click(
280
- fn=cancel_prompt,
281
- inputs=[eval_prompt_previous],
282
- outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
283
- )
284
-
285
- eval_prompt_editable.change(
286
- fn=show_edit_buttons,
287
- inputs=[eval_prompt_editable, eval_prompt_previous],
288
- outputs=edit_buttons_row
289
- )
290
-
291
- # Function to toggle visibility based on compatible mode
292
- def toggle_use_reference(checked):
293
- if checked:
294
- human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
295
- return {
296
- ground_truth: gr.update(visible=True, value=ground_truth_msg),
297
- human_input: gr.update(value=human_msg),
298
- ai_response: gr.update(value=ai_msg),
299
- score: gr.update(value=""),
300
- critique: gr.update(value=""),
301
- random_btn: gr.update(value="🎲", variant="secondary"),
302
- }
303
- else:
304
- return {
305
- ground_truth: gr.update(visible=False)
306
- }
307
-
308
- # Update the change handler to include all necessary outputs
309
- use_reference_toggle.change(
310
- fn=toggle_use_reference,
311
- inputs=[use_reference_toggle],
312
- outputs=[
313
- ground_truth,
314
- human_input,
315
- ai_response,
316
- score,
317
- critique,
318
- random_btn,
319
- ]
320
- )
321
-
322
- # Add a new state variable to track first game
323
- first_game_state = gr.State(True) # Initialize as True
324
-
325
- # Update the submit function to parse the evaluation criteria
326
- def submit_and_store(
327
- use_reference,
328
- eval_criteria_text,
329
- human_input,
330
- ai_response,
331
- ground_truth_input,
332
- ):
333
- # Build prompt data dictionary
334
- prompt_data = {
335
- 'human_input': human_input,
336
- 'ai_response': ai_response,
337
- 'ground_truth_input': ground_truth_input if use_reference else '',
338
- 'eval_criteria': eval_criteria_text,
339
- }
340
-
341
- # Get base prompt based on whether reference is used
342
- base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
343
-
344
- # Format the prompt
345
- final_prompt = base_prompt.format(
346
- human_input=prompt_data['human_input'],
347
- ai_response=prompt_data['ai_response'],
348
- ground_truth_input=prompt_data['ground_truth_input'],
349
- eval_criteria=prompt_data['eval_criteria']
350
- )
351
-
352
- # Get response from Atla
353
- response = get_atla_response(
354
- model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
355
- prompt=final_prompt,
356
- max_tokens=500,
357
- temperature=0.01
358
- )
359
-
360
- # Parse the response
361
- score, critique = atla_parse_model_response(response)
362
-
363
- return [
364
- score,
365
- critique,
366
- gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
367
- gr.update(value="🎲"),
368
- ]
369
-
370
- # Update the click handler to use False for is_first_game after first submission
371
- def create_submit_handler():
372
- first_game = True
373
-
374
- def handler(*args):
375
- nonlocal first_game
376
- result = submit_and_store(*args)
377
- first_game = False # Set to False after first submission
378
- return result
379
-
380
- return handler
381
-
382
- # Update the send_btn click handler
383
- send_btn.click(
384
- fn=submit_and_store,
385
- inputs=[
386
- use_reference_toggle,
387
- eval_criteria_text,
388
- human_input,
389
- ai_response,
390
- ground_truth,
391
- ],
392
- outputs=[
393
- score,
394
- critique,
395
- send_btn,
396
- random_btn,
397
- ],
398
- )
399
-
400
- # Add random button handler
401
- random_btn.click(
402
- fn=populate_random_example,
403
- inputs=[use_reference_toggle],
404
- outputs=[
405
- human_input,
406
- ai_response,
407
- random_btn,
408
- score,
409
- critique,
410
- ground_truth,
411
- ]
412
- )
413
-
414
- # Add input change handlers
415
- def handle_input_change():
416
- """Reset UI state when inputs are changed"""
417
- return [
418
- gr.update(value="Run evaluation", variant="primary"), # send_btn
419
- gr.update(value="🎲", variant="secondary"), # random_btn
420
- ]
421
-
422
- # Update the change handlers for inputs
423
- human_input.change(
424
- fn=handle_input_change,
425
- inputs=[],
426
- outputs=[send_btn, random_btn]
427
- )
428
-
429
- ai_response.change(
430
- fn=handle_input_change,
431
- inputs=[],
432
- outputs=[send_btn, random_btn]
433
- )
434
 
435
- generate_btn.click(
436
- fn=lambda msg: (
437
- generate_ai_response(msg)[0], # Only take the response text
438
- gr.update(
439
- value="Generate AI Response", # Keep the label
440
- interactive=False # Disable the button
441
- )
442
- ),
443
- inputs=[human_input],
444
- outputs=[ai_response, generate_btn]
445
- )
446
 
447
- human_input.change(
448
- fn=lambda x: gr.update(interactive=bool(x.strip())),
449
- inputs=[human_input],
450
- outputs=[generate_btn]
451
- )
452
 
453
- # Update the demo.load to include the random example population
454
- demo.load(
455
- fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
456
- inputs=[],
457
- outputs=[
458
- human_input,
459
- ai_response,
460
- random_btn,
461
- score,
462
- critique,
463
- ground_truth,
464
- ]
465
- )
466
 
467
  if __name__ == "__main__":
468
- demo.launch()
 
1
+ # sandbox_runner.py
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ import gradio as gr
4
+ from data_handler import upload_test_data
5
+ from criteria_handler import select_evaluation_criteria
6
+ from model_handler import select_evaluators
7
+ from score_handler import handle_analysis
8
+ from random_sample_tab import random_sample_tab
9
+
10
+ def run_sandbox():
11
+ with gr.Blocks(css="""
12
+ .truncate_cells table {
13
+ table-layout: fixed !important;
14
+ width: 100% !important;
15
+ }
16
+ .truncate_cells table td,
17
+ .truncate_cells table th {
18
+ white-space: nowrap !important;
19
+ overflow: hidden !important;
20
+ text-overflow: ellipsis !important;
21
+ max-width: 200px !important;
22
+ text-align: left !important;
23
+ vertical-align: top !important;
24
+ }
25
+ """) as demo:
26
+ gr.Markdown("# Selene Playground")
27
+ gr.Markdown("Try running evals with Selene and Selene-Mini in this playground! The Selene-Mini model card can be found [here](https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B)")
28
+ with gr.Tabs():
29
+ # Random samples tab
30
+ random_sample_tab()
31
+
32
+ # Sandbox tab
33
+ with gr.TabItem("Custom dataset"):
34
+ # Initialize state object to track the DataFrame
35
+ df_state = gr.State(value=None)
36
+ # Initialize state object to track the prompt
37
+ prompt_state = gr.State(value=None)
38
+ # Initialize the evaluation_complete flag
39
+ evaluation_complete = gr.State(value=None)
40
+
41
+ # Data upload
42
+ data_upload_group, df_state = upload_test_data(df_state)
43
 
44
+ # Criteria selection
45
+ criteria_group, df_state, prompt_state, save_prompt_button = \
46
+ select_evaluation_criteria(data_upload_group, df_state, prompt_state)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ # Models selection
49
+ model_selection_group, df_state, analyze_results_button = \
50
+ select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button)
 
 
 
 
 
 
 
 
51
 
52
+ # Result analysis
53
+ handle_analysis(df_state, model_selection_group, analyze_results_button)
 
 
 
54
 
55
+ demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
56
 
57
  if __name__ == "__main__":
58
+ run_sandbox()
common.py CHANGED
@@ -1,20 +1,18 @@
1
  # Page Headers
2
- MAIN_TITLE = "# Selene 1 Mini Playground"
3
 
4
  # How it works section
5
  HOW_IT_WORKS = """
6
- Try running evals with Selene-Mini in this playground! Our HF model card can be found [here](https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B).
 
7
  """
8
 
9
  BATTLE_RULES = """
10
- ## 🤺 Choose the winner
11
- 1. Define your scoring criteria in the **Evaluator Prompt**
12
- 2. Add a test case to the **Sample to evaluate**
13
- 3. Test the evaluators & vote for the model that best aligns with your judgement!
14
- \n
15
- Variables defined in your prompt with {{double curly braces}} map to input fields under **Sample to evaluate**.
16
-
17
- <br>
18
  """
19
 
20
  # CSS Styles
@@ -40,13 +38,40 @@ CSS_STYLES = """
40
 
41
  # Default Eval Prompt
42
  EVAL_DESCRIPTION = """
43
- ## 📝 Tips
44
  **Precise evaluation criteria leads to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
45
  - Evaluation criteria
46
  - Scoring rubric
47
- - Examples (Optional)
 
 
 
 
48
  """
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  # Voting Section Header
51
  VOTING_HEADER = """
52
  # Start Voting Now
@@ -54,7 +79,7 @@ VOTING_HEADER = """
54
 
55
  # Acknowledgements
56
  ACKNOWLEDGEMENTS = """
57
- <br><br>
58
  # Acknowledgements
59
 
60
  We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.
@@ -68,50 +93,55 @@ POLICY_CONTENT = """
68
 
69
  Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
70
  <br><br>
71
- # [Our Mission](https://www.atla-ai.com/company)
72
 
73
- By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
74
- Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
75
  <br><br>
76
  # Judge Arena Policy
77
 
78
  ## Overview
79
 
80
- Judge Arena is an open-source platform dedicated to determining which models make the best judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair and open environment :)
81
 
82
  ## Transparency
83
 
84
  - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
85
- - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
86
  - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
87
 
88
  ## Model Inclusion Criteria
89
 
90
  Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
91
 
92
- - **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
93
- - **Promptable:** The model must be promptable to be evaluate in different scoring formats, for different criteria.
94
  - **Accessibility**:
95
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
96
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
97
 
98
  ## Leaderboard Management
99
 
100
- - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1200, and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
101
  - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
102
  - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
103
 
104
- *This policy might be updated to reflect changes in our practices or in response to community feedback.*
105
- <br><br>
106
  # FAQ
107
 
108
  **Isn't this the same as Chatbot Arena?**
109
 
110
  We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
111
 
 
 
 
 
 
 
112
  **Why should I trust this leaderboard?**
113
 
114
- We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena). Check out our [blog](https://www.atla-ai.com/blog) to stay up to date as we analyse the results from the leaderboard.
115
 
116
  **Who funds this effort?**
117
 
@@ -122,5 +152,4 @@ Atla currently funds this out of our own pocket. We are looking for API credits
122
  We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
123
  <br><br>
124
  # Get in touch
125
- We’d love to hear your feedback! For general feature requests or to submit / suggest new models to add to the arena, please open up a discussion in the [community](https://huggingface.co/spaces/AtlaAI/judge-arena/discussions) tab. You can also contact us directly on [X](https://x.com/Atla_AI) or [Discord](https://discord.gg/yNpUAMqs).
126
- \nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
 
1
  # Page Headers
2
+ MAIN_TITLE = "# Judge Arena - Free LLM Evals to test your GenAI application"
3
 
4
  # How it works section
5
  HOW_IT_WORKS = """
6
+ - **Run any form of evaluation:** from simple hallucination detection to qualitative interpretations
7
+ - **Evaluate anything:** coding, analysis, creative writing, math, or general knowledge
8
  """
9
 
10
  BATTLE_RULES = """
11
+ ## 🤺 Battle Rules:
12
+ - Both AIs stay anonymous - if either reveals its identity, the duel is void
13
+ - Choose the LLM judge that most aligns with your judgement
14
+ - If both score the same - choose the critique that you prefer more!
15
+ <br><br>
 
 
 
16
  """
17
 
18
  # CSS Styles
 
38
 
39
  # Default Eval Prompt
40
  EVAL_DESCRIPTION = """
41
+ ## 📝 Instructions
42
  **Precise evaluation criteria leads to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
43
  - Evaluation criteria
44
  - Scoring rubric
45
+ - (Optional) Examples\n
46
+
47
+ **Any variables you define in your prompt using {{double curly braces}} will automatically map to the corresponding input fields under "Sample to evaluate" section on the right.**
48
+
49
+ <br><br>
50
  """
51
 
52
+ DEFAULT_EVAL_PROMPT = """You are assessing a chat bot response to a user's input based on [INSERT CRITERIA]
53
+
54
+ Score:
55
+ A score of 1 means that the response's answer meets all of the evaluation criteria.
56
+ A score of 0 means that the response's answer does not meet all of the evaluation criteria.
57
+
58
+ Here is the data:
59
+ [BEGIN DATA]
60
+ ***
61
+ [User Query]: {{input}}
62
+ ***
63
+ [Response]: {{response}}
64
+ ***
65
+ [END DATA]"""
66
+
67
+ # Default Variable Values
68
+ DEFAULT_INPUT = """Which of these animals is least likely to be found in a rainforest?"
69
+ A) Jaguar
70
+ B) Toucan
71
+ C) Polar Bear
72
+ D) Sloth"""
73
+ DEFAULT_RESPONSE = "C) Polar Bear"
74
+
75
  # Voting Section Header
76
  VOTING_HEADER = """
77
  # Start Voting Now
 
79
 
80
  # Acknowledgements
81
  ACKNOWLEDGEMENTS = """
82
+ <br><br><br>
83
  # Acknowledgements
84
 
85
  We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.
 
93
 
94
  Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
95
  <br><br>
96
+ # Our Mission
97
 
98
+ By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate. We have written more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
 
99
  <br><br>
100
  # Judge Arena Policy
101
 
102
  ## Overview
103
 
104
+ Judge Arena is an open-source platform dedicated to improving the standard of evaluation of generative AI models in their role as judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair, open, and collaborative environment :)
105
 
106
  ## Transparency
107
 
108
  - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
109
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented. We'd like to ensure that our ranking system is understandable and reproducible by others!
110
  - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
111
 
112
  ## Model Inclusion Criteria
113
 
114
  Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
115
 
116
+ - **Judge Capability**: The model should possess the ability to score AND critique responses, content, or other models' outputs effectively.
117
+ - **Adaptable:** The model must be prompt-able to be evaluate in different scoring formats, for different criteria.
118
  - **Accessibility**:
119
  - **Public API Access**: Models accessible through public APIs without restrictive barriers.
120
  - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
121
 
122
  ## Leaderboard Management
123
 
124
+ - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1500 (as is used by the International Chess Federation), and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
125
  - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
126
  - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
127
 
128
+ This policy might be updated to reflect changes in our practices or in response to community feedback.
129
+
130
  # FAQ
131
 
132
  **Isn't this the same as Chatbot Arena?**
133
 
134
  We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
135
 
136
+ **What are the Evaluator Prompt Templates based on?**
137
+
138
+ As a quick start, we've set up templates that cover the most popular evaluation metrics out there on LLM evaluation / monitoring tools, often known as 'base metrics'. The data samples used in these were randomly picked from popular datasets from academia - [ARC](https://huggingface.co/datasets/allenai/ai2_arc), [Preference Collection](https://huggingface.co/datasets/prometheus-eval/Preference-Collection), [RewardBench](https://huggingface.co/datasets/allenai/reward-bench), [RAGTruth](https://arxiv.org/abs/2401.00396).
139
+
140
+ These templates are designed as a starting point to showcase how to interact with the Judge Arena, especially for those less familiar with using LLM judges.
141
+
142
  **Why should I trust this leaderboard?**
143
 
144
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena).
145
 
146
  **Who funds this effort?**
147
 
 
152
  We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
153
  <br><br>
154
  # Get in touch
155
+ Feel free to email us at [[email protected]](mailto:support@atla-ai.com) or leave feedback on our [Github](https://github.com/atla-ai/judge-arena)!"""
 
criteria_handler.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # criteria_handler.py
2
+
3
+ import gradio as gr
4
+ import re
5
+ from eval_criteria_library import EXAMPLE_METRICS
6
+
7
+ SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
8
+
9
+ EVALUATION_TEMPLATE = '''You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
10
+
11
+ Here are some rules of the evaluation:
12
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
13
+
14
+ Your reply should strictly follow this format:
15
+ Your output format should strictly adhere to JSON as follows: {% raw %}{"feedback": "<write feedback>", "result": <numerical score>}{% endraw %}. Ensure the output is valid JSON, without additional formatting or explanations.
16
+
17
+ Here is the data.
18
+
19
+ {% if model_context is defined and model_context %}Context:
20
+ ```
21
+ {{ model_context }}
22
+ ```
23
+
24
+ {% endif %}Instruction:
25
+ ```
26
+ {{ model_input }}
27
+ ```
28
+
29
+ Response:
30
+ ```
31
+ {{ model_output }}
32
+ ```
33
+
34
+ Score Rubrics:
35
+ {{ evaluation_criteria }}
36
+
37
+ {% if expected_model_output is defined and expected_model_output %}Reference answer:
38
+ {{ expected_model_output }}{% endif %}'''
39
+
40
+ def select_evaluation_criteria(data_upload_group, df_state, prompt_state):
41
+ with gr.Group(visible=True) as criteria_group:
42
+ select_eval_criteria_button = gr.Button("Select Evaluation Criteria", visible=False)
43
+
44
+ criteria_dropdown = gr.Dropdown(
45
+ choices=list(EXAMPLE_METRICS.keys()),
46
+ label="Choose Evaluation Criteria",
47
+ value=list(EXAMPLE_METRICS.keys())[0],
48
+ visible=False
49
+ )
50
+
51
+ with gr.Row(visible=False) as mapping_row:
52
+ with gr.Column():
53
+ # Left column - Evaluation Criteria Editor
54
+ prompt_editor = gr.Textbox(
55
+ label="Evaluation Criteria",
56
+ lines=15,
57
+ visible=False,
58
+ placeholder="Enter the evaluation criteria/rubric here..."
59
+ )
60
+ with gr.Column():
61
+ # Right column - Required and Optional Variable Mapping
62
+ # Required mappings
63
+ input_mapping = gr.Dropdown(
64
+ choices=[],
65
+ label="Map 'model_input' to column (Required)",
66
+ interactive=True,
67
+ visible=False
68
+ )
69
+ output_mapping = gr.Dropdown(
70
+ choices=[],
71
+ label="Map 'model_output' to column (Required)",
72
+ interactive=True,
73
+ visible=False
74
+ )
75
+ # Optional mappings
76
+ context_mapping = gr.Dropdown(
77
+ choices=[],
78
+ label="Map 'model_context' to column (Optional)",
79
+ interactive=True,
80
+ visible=False
81
+ )
82
+ expected_output_mapping = gr.Dropdown(
83
+ choices=[],
84
+ label="Map 'expected_model_output' to column (Optional)",
85
+ interactive=True,
86
+ visible=False
87
+ )
88
+ # We'll place the "Back to Data" and "Select Evaluators" within the same row:
89
+ with gr.Row(visible=False) as nav_row:
90
+ back_to_data_button = gr.Button("← Back to Data", visible=False)
91
+ save_prompt_button = gr.Button("Select Evaluators", visible=False)
92
+
93
+ def update_column_choices(df_state):
94
+ df = df_state.value
95
+ columns = df.columns.tolist() if df is not None else []
96
+ return {
97
+ input_mapping: gr.update(choices=columns, visible=True),
98
+ output_mapping: gr.update(choices=columns, visible=True),
99
+ context_mapping: gr.update(choices=['None'] + columns, visible=True),
100
+ expected_output_mapping: gr.update(choices=['None'] + columns, visible=True)
101
+ }
102
+
103
+ def update_prompt(selected_criteria, df_state):
104
+ if selected_criteria in EXAMPLE_METRICS:
105
+ evaluation_criteria = EXAMPLE_METRICS[selected_criteria]['prompt']
106
+ else:
107
+ evaluation_criteria = ""
108
+ updates = {prompt_editor: gr.update(value=evaluation_criteria, visible=True)}
109
+ updates.update(update_column_choices(df_state))
110
+ return updates
111
+
112
+ def show_criteria_selection():
113
+ default_criterion = list(EXAMPLE_METRICS.keys())[0]
114
+ evaluation_criteria = EXAMPLE_METRICS[default_criterion]['prompt']
115
+ updates = {
116
+ select_eval_criteria_button: gr.update(visible=False),
117
+ criteria_dropdown: gr.update(visible=True),
118
+ prompt_editor: gr.update(value=evaluation_criteria, visible=True),
119
+ data_upload_group: gr.update(visible=False),
120
+ mapping_row: gr.update(visible=True),
121
+ # Show the nav row and buttons
122
+ nav_row: gr.update(visible=True),
123
+ back_to_data_button: gr.update(visible=True),
124
+ save_prompt_button: gr.update(visible=True),
125
+ }
126
+ updates.update(update_column_choices(df_state))
127
+ return updates
128
+
129
+ def save_prompt(evaluation_criteria, input_col, output_col, context_col, expected_output_col):
130
+ # Use the actual Jinja template with proper Jinja syntax and raw JSON
131
+ template = EVALUATION_TEMPLATE
132
+
133
+ # Create mapping dictionary
134
+ mapping_dict = {
135
+ 'model_input': input_col,
136
+ 'model_output': output_col,
137
+ 'evaluation_criteria': evaluation_criteria
138
+ }
139
+
140
+ # Add optional mappings if selected
141
+ if context_col != 'None':
142
+ mapping_dict['model_context'] = context_col
143
+ if expected_output_col != 'None':
144
+ mapping_dict['expected_model_output'] = expected_output_col
145
+
146
+ prompt_state.value = {
147
+ 'template': template,
148
+ 'mappings': mapping_dict
149
+ }
150
+
151
+ # Update event handlers
152
+ select_eval_criteria_button.click(
153
+ fn=show_criteria_selection,
154
+ inputs=[],
155
+ outputs=[
156
+
157
+ select_eval_criteria_button,
158
+ criteria_dropdown,
159
+ prompt_editor,
160
+
161
+ data_upload_group,
162
+ mapping_row,
163
+ nav_row,
164
+ back_to_data_button,
165
+ save_prompt_button
166
+ ,
167
+ input_mapping, output_mapping, context_mapping, expected_output_mapping
168
+ ]
169
+ )
170
+
171
+ criteria_dropdown.change(
172
+ fn=update_prompt,
173
+ inputs=[criteria_dropdown, df_state],
174
+ outputs=[prompt_editor, input_mapping, output_mapping, context_mapping, expected_output_mapping]
175
+ )
176
+
177
+ def make_select_button_visible(df_value):
178
+ if df_value is not None:
179
+ return gr.update(visible=True)
180
+ else:
181
+ return gr.update(visible=False)
182
+
183
+ df_state.change(
184
+ fn=make_select_button_visible,
185
+ inputs=df_state,
186
+ outputs=select_eval_criteria_button
187
+ )
188
+
189
+ save_prompt_button.click(
190
+ fn=save_prompt,
191
+ inputs=[
192
+ prompt_editor, input_mapping, output_mapping,
193
+ context_mapping, expected_output_mapping
194
+ ],
195
+ outputs=[]
196
+ )
197
+
198
+ # BACK BUTTON: Hide the criteria UI, show the data upload UI
199
+ def back_to_data():
200
+ return {
201
+ # show data upload group again
202
+ data_upload_group: gr.update(visible=True),
203
+ # hide the criteria group
204
+ criteria_dropdown: gr.update(visible=False),
205
+ prompt_editor: gr.update(visible=False),
206
+ mapping_row: gr.update(visible=False),
207
+ nav_row: gr.update(visible=False),
208
+ # make "Select Evaluation Criteria" button visible again
209
+ select_eval_criteria_button: gr.update(visible=True),
210
+ }
211
+
212
+ back_to_data_button.click(
213
+ fn=back_to_data,
214
+ inputs=[],
215
+ outputs=[
216
+ data_upload_group,
217
+ criteria_dropdown,
218
+ prompt_editor,
219
+ mapping_row,
220
+ nav_row,
221
+ select_eval_criteria_button
222
+ ]
223
+ )
224
+
225
+ # Return both the criteria rule group, the df_state, prompt_state, save_prompt_button
226
+ return criteria_group, df_state, prompt_state, save_prompt_button
data/models.jsonl ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {"name": "Meta Llama 3.1 70B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
+ {"name": "Meta Llama 3.1 405B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-20240229"}
13
+ {"name": "Meta Llama 3.1 8B Instruct Turbo", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
+ {"name": "Qwen 2.5 72B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
+ {"name": "Qwen 2.5 7B Instruct Turbo", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
17
+ {"name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene"}
data_handler.py ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # data_handler.py
2
+
3
+ import gradio as gr
4
+ import pandas as pd
5
+ import json
6
+ def upload_test_data(df_state):
7
+ with gr.Group() as data_upload_group:
8
+ file_upload = gr.File(
9
+ label="Upload JSON with test data incl. true labels as integers or floats",
10
+ file_types=[".json"],
11
+ )
12
+ import_button = gr.Button("Import Data", visible=False)
13
+ # Show exactly 5 rows, no scrolling
14
+ df_display = gr.Dataframe(
15
+ visible=False,
16
+ elem_classes=["truncate_cells"],
17
+ label="Uploaded Data"
18
+ )
19
+ error_display = gr.Textbox(visible=False)
20
+
21
+ def display_file_info(file):
22
+ if file is not None:
23
+ return {
24
+ import_button: gr.update(visible=True),
25
+ error_display: gr.update(visible=False) # Hide previous errors
26
+ }
27
+ else:
28
+ return {
29
+ import_button: gr.update(visible=False),
30
+ df_display: gr.update(visible=False),
31
+ error_display: gr.update(visible=False) # Hide previous errors
32
+ }
33
+
34
+ def import_data(file):
35
+ if file is not None:
36
+ try:
37
+ loaded_json = json.load(open(file.name))
38
+
39
+ # Handle various common JSON structures
40
+ if isinstance(loaded_json, list):
41
+ # Top-level list
42
+ df = pd.json_normalize(loaded_json, sep=".")
43
+ elif isinstance(loaded_json, dict):
44
+ # Dictionary could contain a "data" key or not
45
+ if "data" in loaded_json and isinstance(loaded_json["data"], list):
46
+ df = pd.json_normalize(loaded_json["data"], sep=".")
47
+ else:
48
+ # Flatten the top-level dictionary
49
+ df = pd.json_normalize(loaded_json, sep=".")
50
+ else:
51
+ raise ValueError("Unsupported JSON structure. Please provide a list or object.")
52
+
53
+ df_state.value = df
54
+
55
+ return {
56
+ df_display: gr.update(value=df_state.value, visible=True),
57
+ import_button: gr.update(visible=False),
58
+ df_state: df_state,
59
+ error_display: gr.update(visible=False) # Hide previous errors
60
+ }
61
+ except json.JSONDecodeError:
62
+ return {
63
+ df_display: gr.update(visible=False),
64
+ error_display: gr.update(
65
+ value="**Error:** Invalid JSON file. Please upload a valid JSON file.",
66
+ visible=True
67
+ ),
68
+ import_button: gr.update(visible=True),
69
+ df_state: None
70
+ }
71
+ except Exception as e:
72
+ return {
73
+ df_display: gr.update(visible=False),
74
+ error_display: gr.update(value=f"**Error:** {str(e)}", visible=True),
75
+ import_button: gr.update(visible=True),
76
+ df_state: None
77
+ }
78
+ else:
79
+ return {
80
+ df_display: gr.update(visible=False),
81
+ import_button: gr.update(visible=True),
82
+ df_state: None
83
+ }
84
+
85
+ file_upload.change(
86
+ fn=display_file_info,
87
+ inputs=file_upload,
88
+ outputs=[import_button, df_display, error_display]
89
+ )
90
+ import_button.click(
91
+ fn=import_data,
92
+ inputs=file_upload,
93
+ outputs=[df_display, import_button, df_state, error_display]
94
+ )
95
+
96
+ return data_upload_group, df_state
eval_criteria_library.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ EXAMPLE_METRICS = {
2
+ "Custom": {
3
+ "prompt":
4
+ """Evaluate a chat bot response to a user's input based on [INSERT CRITERIA OR SELECT EXAMPLE FROM LIST]
5
+
6
+ 0: The response's answer does not meet all of the evaluation criteria.
7
+ 1: The response's answer meets all of the evaluation criteria.""",
8
+ },
9
+ "Relevance": {
10
+ "prompt": """Evaluate how well the response fulfill the requirements of the instruction by providing relevant information. This includes responding in accordance with the explicit and implicit purpose of given instruction.
11
+
12
+ 1: The response is completely unrelated to the instruction, or the model entirely misunderstands the instruction.
13
+ 2: Most of the key points in the response are irrelevant to the instruction, and the response misses major requirements of the instruction.
14
+ 3: Some major points in the response contain irrelevant information or miss some requirements of the instruction.
15
+ 4: The response is relevant to the instruction but misses minor requirements of the instruction.
16
+ 5: The response is perfectly relevant to the instruction, and the model fulfills all of the requirements of the instruction.""",
17
+ },
18
+ "Correctness": {
19
+ "prompt": """Evaluate whether the information provided in the response is correct given the reference response. Ignore differences in punctuation and phrasing between the student answer and true answer. It is okay if the student answer contains more information than the true answer, as long as it does not contain any conflicting statements.
20
+
21
+ 0: The response is not factually accurate when compared against the reference response or includes conflicting statements.
22
+ 1: The response is supported by the reference response and does not contain conflicting statements.""",
23
+ },
24
+ "Helpfulness": {
25
+ "prompt": """Evaluate how helpful the response is to address the user query.
26
+
27
+ 1: The response is not at all useful, failing to address the instruction or provide any valuable information.
28
+ 2: The response has minimal usefulness, addressing the instruction only superficially or providing mostly irrelevant information.
29
+ 3: The response is moderately useful, addressing some aspects of the instruction effectively but lacking in others.
30
+ 4: The response is very useful, effectively addressing most aspects of the instruction and providing valuable information.
31
+ 5: The response is exceptionally useful, fully addressing the instruction and providing highly valuable information.""",
32
+ },
33
+ "Faithfulness": {
34
+ "prompt": """Evaluate how well the statements in the response are directly supported by the context given in the related passages.
35
+
36
+ 1: The response contains statements that directly contradict the context or are entirely unsupported by it.
37
+ 2: The response includes some information from the context, but contains significant ungrounded claims or misinterpretations.
38
+ 3: The response is mostly grounded in the context, with only minor unsupported claims or misinterpretations.
39
+ 4: The response closely aligns with the context, with only rare and minor deviations.
40
+ 5: The response is fully grounded in the context, with all statements accurately reflecting the provided information.""",
41
+ },
42
+ "Logical coherence": {
43
+ "prompt": """Evaluate how logically accurate and correct the response is for the instruction given.
44
+
45
+ 1: The logic of the model’s response is completely incoherent.
46
+ 2: The model’s response contains major logical inconsistencies or errors.
47
+ 3: The model’s response contains some logical inconsistencies or errors, but they are not significant."
48
+ 4: The model’s response is logically sound, but it is slightly flawed in some aspect.
49
+ 5: The model’s response is logically flawless.""",
50
+ },
51
+ "Conciseness": {
52
+ "prompt": """Evaluate how concise the response is presented to the user without any unncecessary information.
53
+
54
+ 1: The response is highly redundant or contains a lot of unnecessary information, requiring a complete rewrite for optimal clarity and efficiency.
55
+ 2: The response lacks conciseness and needs a substantial rewrite for better optimization.
56
+ 3: The response is somewhat concise but includes unnecessary information, requiring
57
+ some edits for improved optimization.
58
+ 4: The response is mostly concise but could benefit from minor edits for better optimization.
59
+ 5: The response is optimally concise and does not contain any unnecessary information, requiring no further optimization.""",
60
+ },
61
+ }
get_llm_answer.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # get_llm_answer.py
2
+
3
+ from openai import OpenAI
4
+ import anthropic
5
+ from together import Together
6
+ import json
7
+ import re
8
+ from atla import Atla
9
+
10
+ from dotenv import load_dotenv
11
+ load_dotenv()
12
+
13
+ # Initialize clients
14
+ anthropic_client = anthropic.Anthropic()
15
+ openai_client = OpenAI()
16
+ together_client = Together()
17
+ atla_client = Atla()
18
+
19
+ SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
20
+
21
+ def get_openai_response(model_name, prompt):
22
+ """Get response from OpenAI API"""
23
+ try:
24
+ response = openai_client.chat.completions.create(
25
+ model=model_name,
26
+ messages=[
27
+ {"role": "system", "content": SYSTEM_PROMPT},
28
+ {"role": "user", "content": prompt},
29
+ ],
30
+ )
31
+ return response.choices[0].message.content
32
+ except Exception as e:
33
+ return f"Error with OpenAI model {model_name}: {str(e)}"
34
+
35
+
36
+ def get_anthropic_response(model_name, prompt):
37
+ """Get response from Anthropic API"""
38
+ try:
39
+ response = anthropic_client.messages.create(
40
+ model=model_name,
41
+ max_tokens=1000,
42
+ temperature=0,
43
+ system=SYSTEM_PROMPT,
44
+ messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
45
+ )
46
+ return response.content[0].text
47
+ except Exception as e:
48
+ return f"Error with Anthropic model {model_name}: {str(e)}"
49
+
50
+
51
+ def get_together_response(model_name, prompt):
52
+ """Get response from Together API"""
53
+ try:
54
+ response = together_client.chat.completions.create(
55
+ model=model_name,
56
+ messages=[
57
+ {"role": "system", "content": SYSTEM_PROMPT},
58
+ {"role": "user", "content": prompt},
59
+ ],
60
+ stream=False,
61
+ )
62
+ return response.choices[0].message.content
63
+ except Exception as e:
64
+ return f"Error with Together model {model_name}: {str(e)}"
65
+
66
+
67
+ def get_atla_response(model_name, model_input, model_output, model_context, expected_output, evaluation_criteria):
68
+ """Get response from Atla API"""
69
+ try:
70
+ response = atla_client.evaluation.create(
71
+ model_id=model_name,
72
+ model_input=model_input,
73
+ model_output=model_output,
74
+ model_context=model_context,
75
+ expected_model_output=expected_output,
76
+ evaluation_criteria=evaluation_criteria,
77
+ )
78
+ # Return the score and critique directly from the evaluation result
79
+ return {
80
+ "score": response.result.evaluation.score,
81
+ "critique": response.result.evaluation.critique
82
+ }
83
+ except Exception as e:
84
+ return f"Error with Atla model {model_name}: {str(e)}"
85
+
86
+
87
+ def get_model_response(model_name, model_info, prompt=None, **kwargs):
88
+ """Get response from appropriate API based on model organization"""
89
+ if not model_info:
90
+ return "Model not found or unsupported."
91
+
92
+ api_model = model_info["api_model"]
93
+ organization = model_info["organization"]
94
+
95
+ try:
96
+ if organization == "Atla":
97
+ return get_atla_response(
98
+ api_model,
99
+ kwargs.get('model_input'),
100
+ kwargs.get('model_output'),
101
+ kwargs.get('model_context'),
102
+ kwargs.get('expected_output'),
103
+ kwargs.get('evaluation_criteria')
104
+ )
105
+ elif organization == "OpenAI":
106
+ return get_openai_response(api_model, prompt)
107
+ elif organization == "Anthropic":
108
+ return get_anthropic_response(api_model, prompt)
109
+ else:
110
+ # All other organizations use Together API
111
+ return get_together_response(api_model, prompt)
112
+ except Exception as e:
113
+ return f"Error with {organization} model {model_name}: {str(e)}"
114
+
115
+
116
+ def parse_model_response(response):
117
+ try:
118
+ # Debug print
119
+ print(f"Raw model response: {response}")
120
+
121
+ # First try to parse the entire response as JSON
122
+ try:
123
+ data = json.loads(response)
124
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
125
+ except json.JSONDecodeError:
126
+ # If that fails (typically for smaller models), try to find JSON within the response
127
+ json_match = re.search(r"{.*}", response)
128
+ if json_match:
129
+ data = json.loads(json_match.group(0))
130
+ return str(data.get("result", "N/A")), data.get("feedback", "N/A")
131
+ else:
132
+ return "Error", f"Failed to parse response: {response}"
133
+
134
+ except Exception as e:
135
+ # Debug print for error case
136
+ print(f"Failed to parse response: {str(e)}")
137
+ return "Error", f"Failed to parse response: {response}"
model_handler.py ADDED
@@ -0,0 +1,221 @@
1
+ # model_handler.py
2
+
3
+ import gradio as gr
4
+ import json
5
+ import os
6
+ import re
7
+ from get_llm_answer import get_model_response, parse_model_response, get_atla_response
8
+ from jinja2 import Template
9
+
10
+ def select_evaluators(criteria_group, df_state, prompt_state, save_prompt_button):
11
+ with gr.Group(visible=True) as model_selection_group:
12
+ select_evaluators_button = gr.Button("Select Evaluators", visible=False)
13
+
14
+ # Load the model_data from JSONL
15
+ def load_model_data():
16
+ model_data = {}
17
+ try:
18
+ script_dir = os.path.dirname(__file__)
19
+ file_path = os.path.join(script_dir, "models.jsonl")
20
+ with open(file_path, "r") as f:
21
+ for line in f:
22
+ model = json.loads(line)
23
+ model_data[model["name"]] = {
24
+ "organization": model["organization"],
25
+ "license": model["license"],
26
+ "api_model": model["api_model"],
27
+ }
28
+ except FileNotFoundError:
29
+ print("Warning: models.jsonl not found")
30
+ return {}
31
+ return model_data
32
+
33
+ model_data = load_model_data()
34
+ model_choices = list(model_data.keys())
35
+
36
+ with gr.Row(visible=False) as evaluator_row:
37
+ judge_a_dropdown = gr.Dropdown(
38
+ choices=["Selene"], label="Judge A", value="Selene", interactive=False
39
+ )
40
+ judge_b_dropdown = gr.Dropdown(
41
+ choices=model_choices, label="Judge B", value="Claude 3.5 Sonnet"
42
+ )
43
+
44
+ loading_spinner = gr.Markdown("Evaluation in progress...", visible=False)
45
+
46
+ evaluation_result_df = gr.Dataframe(
47
+ visible=False,
48
+ label="Evaluation Results",
49
+ elem_classes=["truncate_cells"]
50
+ )
51
+
52
+ with gr.Row(visible=False) as evaluation_nav_row:
53
+ back_to_criteria_button = gr.Button("← Back to Criteria", visible=False)
54
+ run_evaluation_button = gr.Button("Run Evaluation", visible=False)
55
+ analyze_results_button = gr.Button("Analyze Results", visible=False)
56
+
57
+ def show_evaluator_selection(current_df):
58
+ updates = {
59
+ criteria_group: gr.update(visible=False),
60
+ save_prompt_button: gr.update(visible=False),
61
+ evaluator_row: gr.update(visible=True),
62
+ evaluation_nav_row: gr.update(visible=True),
63
+ run_evaluation_button: gr.update(visible=True),
64
+ back_to_criteria_button: gr.update(visible=True),
65
+ analyze_results_button: gr.update(visible=False),
66
+ evaluation_result_df: gr.update(visible=False),
67
+ }
68
+ if (
69
+ current_df.value is not None
70
+ and hasattr(current_df.value, "attrs")
71
+ and current_df.value.attrs.get("eval_done")
72
+ ):
73
+ updates[loading_spinner] = gr.update(value="### Evaluation Complete", visible=True)
74
+ updates[evaluation_result_df] = gr.update(value=current_df.value, visible=True)
75
+ updates[analyze_results_button] = gr.update(visible=True)
76
+
77
+ return updates
78
+
79
+ save_prompt_button.click(
80
+ fn=show_evaluator_selection,
81
+ inputs=[df_state],
82
+ outputs=[
83
+ save_prompt_button,
84
+ criteria_group,
85
+ evaluator_row,
86
+ evaluation_nav_row,
87
+ run_evaluation_button,
88
+ back_to_criteria_button,
89
+ loading_spinner,
90
+ analyze_results_button,
91
+ evaluation_result_df,
92
+ ],
93
+ )
94
+
95
+ def back_to_criteria():
96
+ return {
97
+ save_prompt_button: gr.update(visible=True),
98
+ criteria_group: gr.update(visible=True),
99
+ evaluator_row: gr.update(visible=False),
100
+ evaluation_nav_row: gr.update(visible=False),
101
+ run_evaluation_button: gr.update(visible=False),
102
+ loading_spinner: gr.update(visible=False),
103
+ analyze_results_button: gr.update(visible=False),
104
+ evaluation_result_df: gr.update(visible=False),
105
+ }
106
+
107
+ back_to_criteria_button.click(
108
+ fn=back_to_criteria,
109
+ inputs=[],
110
+ outputs=[
111
+ save_prompt_button,
112
+ criteria_group,
113
+ evaluator_row,
114
+ evaluation_nav_row,
115
+ run_evaluation_button,
116
+ loading_spinner,
117
+ analyze_results_button,
118
+ evaluation_result_df
119
+ ],
120
+ )
121
+
122
+ # Run evaluation
123
+ def run_evaluation(judge_a, judge_b):
124
+ # 1) Immediately hide old results and disable navigation while running
125
+ yield {
126
+ loading_spinner: gr.update(value="Evaluation in progress...", visible=True),
127
+ evaluation_result_df: gr.update(visible=False),
128
+ analyze_results_button: gr.update(visible=False),
129
+ run_evaluation_button: gr.update(interactive=False),
130
+ back_to_criteria_button: gr.update(interactive=False),
131
+ }
132
+
133
+ # Perform the actual evaluation
134
+ template_str = prompt_state.value['template']
135
+ mappings = prompt_state.value['mappings']
136
+ evaluation_criteria = mappings.get('evaluation_criteria')
137
+
138
+ template = Template(template_str)
139
+
140
+ for index, row in df_state.value.iterrows():
141
+ context = {}
142
+ model_context = None
143
+ expected_output = None
144
+
145
+ for key, column in mappings.items():
146
+ if key == 'evaluation_criteria':
147
+ continue
148
+ elif column and column != 'None':
149
+ context[key] = str(row[column])
150
+ if column == 'model_context':
151
+ model_context = str(row[column])
152
+ elif column == 'expected_model_output':
153
+ expected_output = str(row[column])
154
+
155
+ context['evaluation_criteria'] = evaluation_criteria
156
+
157
+ # Render the template for Judge B
158
+ current_prompt = template.render(**context)
159
+ print(f"\nDEBUG - Final Prompt sent to Model B:\n{current_prompt}\n")
160
+ response_a = get_atla_response(
161
+ "atla-selene",
162
+ model_input=context.get('model_input'),
163
+ model_output=context.get('model_output'),
164
+ model_context=model_context,
165
+ expected_output=expected_output,
166
+ evaluation_criteria=evaluation_criteria
167
+ )
168
+ response_b = get_model_response(
169
+ judge_b,
170
+ model_data.get(judge_b),
171
+ current_prompt
172
+ )
173
+
174
+ # Parse ATLA response
175
+ if isinstance(response_a, dict):
176
+ score_a, critique_a = response_a['score'], response_a['critique']
177
+ else:
178
+ score_a, critique_a = "Error", response_a
179
+
180
+ score_b, critique_b = parse_model_response(response_b)
181
+
182
+ # Convert model name to snake case for column names
183
+ model_b_snake = judge_b.lower().replace(' ', '_').replace('-', '_').replace('.', '_')
184
+
185
+ df_state.value.loc[index, 'score_selene'] = score_a
186
+ df_state.value.loc[index, 'critique_selene'] = critique_a
187
+ df_state.value.loc[index, f'score_{model_b_snake}'] = score_b
188
+ df_state.value.loc[index, f'critique_{model_b_snake}'] = critique_b
189
+
190
+ import time
191
+ time.sleep(2) # simulating time-consuming operations
192
+
193
+ # 2) Hide spinner
194
+ yield {loading_spinner: gr.update(visible=False)}
195
+
196
+ # 3) Show final results and re-enable buttons
197
+ yield {
198
+ loading_spinner: gr.update(value="### Evaluation Complete", visible=True),
199
+ evaluation_result_df: gr.update(value=df_state.value, visible=True),
200
+ analyze_results_button: gr.update(visible=True),
201
+ run_evaluation_button: gr.update(interactive=True),
202
+ back_to_criteria_button: gr.update(interactive=True),
203
+ }
204
+
205
+ if hasattr(df_state.value, "attrs"):
206
+ df_state.value.attrs["eval_done"] = True
207
+
208
+ # Include back_to_criteria_button & run_evaluation_button in outputs so we can update them
209
+ run_evaluation_button.click(
210
+ fn=run_evaluation,
211
+ inputs=[judge_a_dropdown, judge_b_dropdown],
212
+ outputs=[
213
+ loading_spinner,
214
+ evaluation_result_df,
215
+ analyze_results_button,
216
+ run_evaluation_button,
217
+ back_to_criteria_button,
218
+ ],
219
+ )
220
+
221
+ return model_selection_group, df_state, analyze_results_button
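The evaluation loop above writes each judge's results into columns derived from its display name. A standalone sketch of just that naming rule (the helper name is made up):

```python
def judge_column_names(judge_display_name: str) -> tuple[str, str]:
    """Mirror the snake_case rule used when writing results back into the DataFrame."""
    snake = judge_display_name.lower().replace(' ', '_').replace('-', '_').replace('.', '_')
    return f"score_{snake}", f"critique_{snake}"

print(judge_column_names("Claude 3.5 Sonnet"))
# -> ('score_claude_3_5_sonnet', 'critique_claude_3_5_sonnet')
print(judge_column_names("Meta Llama 3.1 70B Instruct"))
# -> ('score_meta_llama_3_1_70b_instruct', 'critique_meta_llama_3_1_70b_instruct')
```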
models.jsonl ADDED
@@ -0,0 +1,19 @@
1
+ {"name": "Meta Llama 3.1 70B Instruct", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"}
2
+ {"name": "Meta Llama 3.1 405B Instruct", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"}
3
+ {"name": "Gemma 2 27B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-27b-it"}
4
+ {"name": "Gemma 2 9B", "organization": "Google", "license": "Open Source", "api_model": "google/gemma-2-9b-it"}
5
+ {"name": "Qwen 2 Instruct (72B)", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2-72B-Instruct"}
6
+ {"name": "Mistral (7B) Instruct v0.3", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.3"}
7
+ {"name": "GPT-4o", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4o"}
8
+ {"name": "GPT-4 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-4-turbo"}
9
+ {"name": "GPT-3.5 Turbo", "organization": "OpenAI", "license": "Proprietary", "api_model": "gpt-3.5-turbo"}
10
+ {"name": "Claude 3 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-haiku-20240307"}
11
+ {"name": "Claude 3 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-sonnet-20240229"}
12
+ {"name": "Claude 3 Opus", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-opus-latest"}
13
+ {"name": "Meta Llama 3.1 8B Instruct", "organization": "Meta", "license": "Open Source", "api_model": "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"}
14
+ {"name": "Qwen 2.5 72B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
15
+ {"name": "Qwen 2.5 7B Instruct", "organization": "Alibaba", "license": "Open Source", "api_model": "Qwen/Qwen2.5-7B-Instruct-Turbo"}
16
+ {"name": "Mistral (7B) Instruct v0.1", "organization": "Mistral AI", "license": "Open Source", "api_model": "mistralai/Mistral-7B-Instruct-v0.1"}
17
+ {"name": "Claude 3.5 Sonnet", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-sonnet-latest"}
18
+ {"name": "Claude 3.5 Haiku", "organization": "Anthropic", "license": "Proprietary", "api_model": "claude-3-5-haiku-latest"}
19
+ {"name": "Atla Selene", "organization": "Atla", "license": "Proprietary", "api_model": "atla-selene"}
random_sample/__init__.py ADDED
@@ -0,0 +1 @@
1
+ # This file can be empty - it just marks the directory as a Python package
random_sample/arena_interface.py ADDED
@@ -0,0 +1,405 @@
1
+ import json
2
+ import re
3
+ import gradio as gr
4
+
5
+ from dotenv import load_dotenv
6
+ load_dotenv()
7
+
8
+ from .gen_api_answer import (
9
+ get_atla_response,
10
+ get_selene_mini_response,
11
+ parse_selene_mini_response
12
+ )
13
+
14
+ from .prompts import (
15
+ DEFAULT_EVAL_CRITERIA,
16
+ DEFAULT_EVAL_PROMPT,
17
+ DEFAULT_EVAL_PROMPT_EDITABLE,
18
+ ATLA_PROMPT,
19
+ ATLA_PROMPT_WITH_REFERENCE
20
+ )
21
+
22
+ from .random_sample_generation import (
23
+ get_random_human_ai_pair,
24
+ get_random_human_ai_ground_truth_pair,
25
+ generate_ai_response
26
+ )
27
+
28
+ from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS
29
+
30
+ def parse_variables(prompt):
31
+ # Extract variables enclosed in double curly braces
32
+ variables = re.findall(r"{{(.*?)}}", prompt)
33
+ # Remove duplicates while preserving order
34
+ seen = set()
35
+ variables = [
36
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
37
+ ]
38
+ return variables
39
+
40
+
41
+ def get_final_prompt(eval_prompt, variable_values):
42
+ # Replace variables in the eval prompt with their values
43
+ for var, val in variable_values.items():
44
+ eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
45
+ return eval_prompt
46
+
47
+
48
+ def populate_random_example(request: gr.Request, compatible_mode: bool):
49
+ """Generate a random human-AI conversation example and reset judge outputs."""
50
+ if compatible_mode:
51
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
52
+ else:
53
+ human_msg, ai_msg = get_random_human_ai_pair()
54
+ ground_truth_msg = ""
55
+
56
+ return [
57
+ gr.update(value=human_msg),
58
+ gr.update(value=ai_msg),
59
+ gr.update(value="🎲", variant="secondary"),
60
+ gr.update(value=""), # Clear score
61
+ gr.update(value=""), # Clear critique
62
+ gr.update(value=ground_truth_msg, visible=compatible_mode), # Set ground truth and visibility
63
+ ]
64
+
65
+
66
+ def create_arena_interface():
67
+ with gr.Blocks(theme="default", css=CSS_STYLES) as interface:
68
+ # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
69
+ eval_prompt = gr.Textbox(
70
+ value=DEFAULT_EVAL_PROMPT,
71
+ visible=False
72
+ )
73
+ with gr.Row():
74
+ # Add model selector dropdown at the top
75
+ model_selector = gr.Dropdown(
76
+ choices=["Selene", "Selene Mini"],
77
+ value="Selene",
78
+ label="Choose your Atla Model",
79
+ interactive=True
80
+ )
81
+
82
+ with gr.Row():
83
+ # Left side - Input section
84
+ with gr.Column(scale=1):
85
+ with gr.Group():
86
+ human_input = gr.TextArea(
87
+ label="👩 User Input",
88
+ lines=5,
89
+ placeholder="Enter the human message here..."
90
+ )
91
+ with gr.Row():
92
+ generate_btn = gr.Button(
93
+ "Generate AI Response",
94
+ size="sm",
95
+ interactive=False
96
+ )
97
+
98
+ ai_response = gr.TextArea(
99
+ label="🤖 AI Response",
100
+ lines=10,
101
+ placeholder="Enter the AI response here..."
102
+ )
103
+
104
+ # Ground truth response (initially hidden)
105
+ ground_truth = gr.TextArea(
106
+ label="🎯 Ground truth response",
107
+ lines=10,
108
+ placeholder="Enter the ground truth response here...",
109
+ visible=False
110
+ )
111
+
112
+ with gr.Row():
113
+ random_btn = gr.Button("🎲", scale=2)
114
+ send_btn = gr.Button(
115
+ value="Run evaluation",
116
+ variant="primary",
117
+ size="lg",
118
+ scale=8
119
+ )
120
+
121
+ # Right side - Model outputs
122
+ with gr.Column(scale=1):
123
+ gr.Markdown("## 👩‍⚖️ Selene-Mini Evaluation")
124
+ with gr.Group():
125
+ with gr.Row():
126
+ score = gr.Textbox(label="Score", lines=1, interactive=False)
127
+ critique = gr.TextArea(label="Critique", lines=12, interactive=False)
128
+
129
+ gr.Markdown("<br>")
130
+
131
+
132
+ # Replace the "Edit Judge Prompt" Accordion section with:
133
+ with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
134
+ gr.Markdown("<br>")
135
+ use_reference_toggle = gr.Checkbox(
136
+ label="Use a reference response",
137
+ value=False
138
+ )
139
+
140
+ # Hide the default prompt editor
141
+ with gr.Column(visible=False) as default_prompt_editor:
142
+ eval_prompt_editable = gr.TextArea(
143
+ value=DEFAULT_EVAL_PROMPT_EDITABLE,
144
+ label="Evaluation Criteria",
145
+ lines=12
146
+ )
147
+
148
+ with gr.Row(visible=False) as edit_buttons_row:
149
+ cancel_prompt_btn = gr.Button("Cancel")
150
+ save_prompt_btn = gr.Button("Save", variant="primary")
151
+
152
+ # Show the compatible mode editor
153
+ with gr.Column(visible=True) as compatible_prompt_editor:
154
+ eval_criteria_text = gr.TextArea(
155
+ label="Evaluation Criteria",
156
+ lines=12,
157
+ value=DEFAULT_EVAL_CRITERIA,
158
+ placeholder="Enter the complete evaluation criteria and scoring rubric..."
159
+ )
160
+ with gr.Row(visible=False) as compatible_edit_buttons_row:
161
+ compatible_cancel_btn = gr.Button("Cancel")
162
+ compatible_save_btn = gr.Button("Save", variant="primary")
163
+
164
+ eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE) # Initialize with default value
165
+ is_editing = gr.State(False) # Track editing state
166
+ compatible_mode_state = gr.State(False) # Track compatible mode state
167
+
168
+ # Update model names after responses are generated
169
+ def update_model_names(model_a, model_b):
170
+ return gr.update(value=f"*Model: {model_a}*"), gr.update(
171
+ value=f"*Model: {model_b}*"
172
+ )
173
+
174
+ # Store the last submitted prompt and variables for comparison
175
+ last_submission = gr.State({})
176
+
177
+ # Update the save/cancel buttons section in the compatible prompt editor
178
+ def save_criteria(new_criteria, previous_criteria):
179
+ return [
180
+ gr.update(value=new_criteria), # Update the criteria
181
+ new_criteria, # Update the previous criteria state
182
+ gr.update(visible=False) # Hide the buttons
183
+ ]
184
+
185
+ def cancel_criteria(previous_criteria):
186
+ return [
187
+ gr.update(value=previous_criteria), # Revert to previous criteria
188
+ previous_criteria, # Keep the previous criteria state
189
+ gr.update(visible=False) # Hide the buttons
190
+ ]
191
+
192
+ def show_criteria_edit_buttons(current_value, previous_value):
193
+ # Show buttons only if the current value differs from the previous value
194
+ return gr.update(visible=current_value != previous_value)
195
+
196
+ # Add handlers for save/cancel buttons and criteria changes
197
+ compatible_save_btn.click(
198
+ fn=save_criteria,
199
+ inputs=[eval_criteria_text, eval_prompt_previous],
200
+ outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
201
+ )
202
+
203
+ compatible_cancel_btn.click(
204
+ fn=cancel_criteria,
205
+ inputs=[eval_prompt_previous],
206
+ outputs=[eval_criteria_text, eval_prompt_previous, compatible_edit_buttons_row]
207
+ )
208
+
209
+ eval_criteria_text.change(
210
+ fn=show_criteria_edit_buttons,
211
+ inputs=[eval_criteria_text, eval_prompt_previous],
212
+ outputs=compatible_edit_buttons_row
213
+ )
214
+
215
+ # Function to toggle visibility based on compatible mode
216
+ def toggle_use_reference(checked):
217
+ if checked:
218
+ human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
219
+ return {
220
+ ground_truth: gr.update(visible=True, value=ground_truth_msg),
221
+ human_input: gr.update(value=human_msg),
222
+ ai_response: gr.update(value=ai_msg),
223
+ score: gr.update(value=""),
224
+ critique: gr.update(value=""),
225
+ random_btn: gr.update(value="🎲", variant="secondary"),
226
+ }
227
+ else:
228
+ return {
229
+ ground_truth: gr.update(visible=False)
230
+ }
231
+
232
+ # Update the change handler to include all necessary outputs
233
+ use_reference_toggle.change(
234
+ fn=toggle_use_reference,
235
+ inputs=[use_reference_toggle],
236
+ outputs=[
237
+ ground_truth,
238
+ human_input,
239
+ ai_response,
240
+ score,
241
+ critique,
242
+ random_btn,
243
+ ]
244
+ )
245
+
246
+ # Add a new state variable to track first game
247
+ first_game_state = gr.State(True) # Initialize as True
248
+
249
+ # Update the submit function to handle both models
250
+ def submit_and_store(
251
+ model_choice,
252
+ use_reference,
253
+ eval_criteria_text,
254
+ human_input,
255
+ ai_response,
256
+ ground_truth,
257
+ ):
258
+ if model_choice == "Selene Mini":
259
+ # Prepare prompt based on reference mode
260
+ prompt_template = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT
261
+ prompt = prompt_template.format(
262
+ human_input=human_input,
263
+ ai_response=ai_response,
264
+ eval_criteria=eval_criteria_text,
265
+ ground_truth=ground_truth if use_reference else ""
266
+ )
267
+
268
+ print("\n=== Debug: Prompt being sent to Selene Mini ===")
269
+ print(prompt)
270
+ print("============================================\n")
271
+
272
+ # Get and parse response
273
+ raw_response = get_selene_mini_response(
274
+ model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
275
+ prompt=prompt,
276
+ max_tokens=500,
277
+ temperature=0.01
278
+ )
279
+ response = parse_selene_mini_response(raw_response)
280
+ else:
281
+ # Selene API logic
282
+ prompt_data = {
283
+ 'human_input': human_input,
284
+ 'ai_response': ai_response,
285
+ 'ground_truth': ground_truth if use_reference else None,
286
+ 'eval_criteria': eval_criteria_text,
287
+ }
288
+
289
+ print("\n=== Debug: Prompt data being sent to Selene API ===")
290
+ print(json.dumps(prompt_data, indent=2))
291
+ print("============================================\n")
292
+
293
+ response = get_atla_response(
294
+ model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
295
+ prompt=prompt_data,
296
+ max_tokens=500,
297
+ temperature=0.01
298
+ )
299
+
300
+ # Response now contains score and critique directly
301
+ if isinstance(response, dict) and 'score' in response and 'critique' in response:
302
+ score = str(response['score'])
303
+ critique = response['critique']
304
+ else:
305
+ score = "Error"
306
+ critique = str(response)
307
+
308
+ return [
309
+ score,
310
+ critique,
311
+ gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
312
+ gr.update(value="🎲"),
313
+ ]
314
+
315
+ # Update the send_btn click handler with new input
316
+ send_btn.click(
317
+ fn=submit_and_store,
318
+ inputs=[
319
+ model_selector,
320
+ use_reference_toggle,
321
+ eval_criteria_text,
322
+ human_input,
323
+ ai_response,
324
+ ground_truth,
325
+ ],
326
+ outputs=[
327
+ score,
328
+ critique,
329
+ send_btn,
330
+ random_btn,
331
+ ],
332
+ )
333
+
334
+ # Add random button handler
335
+ random_btn.click(
336
+ fn=populate_random_example,
337
+ inputs=[use_reference_toggle],
338
+ outputs=[
339
+ human_input,
340
+ ai_response,
341
+ random_btn,
342
+ score,
343
+ critique,
344
+ ground_truth,
345
+ ]
346
+ )
347
+
348
+ # Add input change handlers
349
+ def handle_input_change():
350
+ """Reset UI state when inputs are changed"""
351
+ return [
352
+ gr.update(value="Run evaluation", variant="primary"), # send_btn
353
+ gr.update(value="🎲", variant="secondary"), # random_btn
354
+ ]
355
+
356
+ # Update the change handlers for inputs
357
+ human_input.change(
358
+ fn=handle_input_change,
359
+ inputs=[],
360
+ outputs=[send_btn, random_btn]
361
+ )
362
+
363
+ ai_response.change(
364
+ fn=handle_input_change,
365
+ inputs=[],
366
+ outputs=[send_btn, random_btn]
367
+ )
368
+
369
+ generate_btn.click(
370
+ fn=lambda msg: (
371
+ generate_ai_response(msg)[0], # Only take the response text
372
+ gr.update(
373
+ value="Generate AI Response", # Keep the label
374
+ interactive=False # Disable the button
375
+ )
376
+ ),
377
+ inputs=[human_input],
378
+ outputs=[ai_response, generate_btn]
379
+ )
380
+
381
+ human_input.change(
382
+ fn=lambda x: gr.update(interactive=bool(x.strip())),
383
+ inputs=[human_input],
384
+ outputs=[generate_btn]
385
+ )
386
+
387
+ # Update the demo.load to include the random example population
388
+ interface.load(
389
+ fn=lambda: populate_random_example(None, False), # Pass False for initial compatible_mode
390
+ inputs=[],
391
+ outputs=[
392
+ human_input,
393
+ ai_response,
394
+ random_btn,
395
+ score,
396
+ critique,
397
+ ground_truth,
398
+ ]
399
+ )
400
+
401
+ return interface
402
+
403
+ if __name__ == "__main__":
404
+ demo = create_arena_interface()
405
+ demo.launch()
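The `{{variable}}` handling at the top of this file can be sanity-checked in isolation; the sketch below reuses `parse_variables` and `get_final_prompt` as defined above with a made-up template:

```python
import re

def parse_variables(prompt):
    # Extract unique {{variable}} names, preserving first-seen order
    seen = set()
    return [v.strip() for v in re.findall(r"{{(.*?)}}", prompt)
            if not (v.strip() in seen or seen.add(v.strip()))]

def get_final_prompt(eval_prompt, variable_values):
    for var, val in variable_values.items():
        eval_prompt = eval_prompt.replace("{{" + var + "}}", val)
    return eval_prompt

template = "[User Query]: {{human_input}}\n[AI Response]: {{ai_response}}"
print(parse_variables(template))  # ['human_input', 'ai_response']
print(get_final_prompt(template, {
    "human_input": "How do muscles grow?",
    "ai_response": "Through hypertrophy driven by resistance training.",
}))
```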
random_sample/common.py ADDED
@@ -0,0 +1,126 @@
1
+ # Page Headers
2
+ MAIN_TITLE = "# Selene-Mini"
3
+
4
+ # How it works section
5
+ HOW_IT_WORKS = """
6
+ Try running evals with Selene-Mini in this playground! Our HF model card can be found [here](https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B).
7
+ """
8
+
9
+ BATTLE_RULES = """
10
+ ## 🤺 Choose the winner
11
+ 1. Define your scoring criteria in the **Evaluator Prompt**
12
+ 2. Add a test case to the **Sample to evaluate**
13
+ 3. Test the evaluators & vote for the model that best aligns with your judgement!
14
+ \n
15
+ Variables defined in your prompt with {{double curly braces}} map to input fields under **Sample to evaluate**.
16
+
17
+ <br>
18
+ """
19
+
20
+ # CSS Styles
21
+ CSS_STYLES = """
22
+ .prompt-row {
23
+ align-items: flex-start !important;
24
+ }
25
+ .send-button-row {
26
+ display: flex;
27
+ justify-content: flex-end;
28
+ margin-top: 8px;
29
+ }
30
+ /* Style for metric buttons */
31
+ .metric-button-active {
32
+ background-color: #2B3A55 !important;
33
+ color: white !important;
34
+ }
35
+ /* Add this to ensure proper button spacing */
36
+ .metric-buttons-row {
37
+ gap: 8px;
38
+ }
39
+ """
40
+
41
+ # Default Eval Prompt
42
+ EVAL_DESCRIPTION = """
43
+ ## 📝 Tips
44
+ **Precise evaluation criteria lead to more consistent and reliable judgments.** A good evaluation prompt should include the following elements:
45
+ - Evaluation criteria
46
+ - Scoring rubric
47
+ - Examples (Optional)
48
+ """
49
+
50
+ # Voting Section Header
51
+ VOTING_HEADER = """
52
+ # Start Voting Now
53
+ """
54
+
55
+ # Acknowledgements
56
+ ACKNOWLEDGEMENTS = """
57
+ <br><br>
58
+ # Acknowledgements
59
+
60
+ We thank [LMSYS Org](https://lmsys.org/) for their hard work on the Chatbot Arena and fully credit them for the inspiration to build this.
61
+
62
+ We thank [Clementine Fourrier](https://huggingface.co/clefourrier) and Hugging Face for their guidance and partnership in setting this up.
63
+ """
64
+
65
+ # Policy Content
66
+ POLICY_CONTENT = """
67
+ # About Atla
68
+
69
+ Atla is an applied research organization that trains models as evaluators to capture human preferences. We're a team of researchers, engineers, and operational leaders, with experience spanning a variety of disciplines, all working together to build reliable and understandable AI systems. Our research is informed by our experiences conducting AI safety research at the UK AI Task Force, OpenAI and the Stanford Existential Risks Initiative.
70
+ <br><br>
71
+ # [Our Mission](https://www.atla-ai.com/company)
72
+
73
+ By creating advanced evaluation models, we enable AI developers to identify and fix risks, leading to safer, more reliable AI that can be trusted and widely used. Our aim is to surpass the current state-of-the-art evaluation methods by training models specifically for evaluation. AIs will probably become very powerful, and perform tasks that are difficult for us to verify. We want to enable humans to oversee AI systems that are solving tasks too difficult for humans to evaluate.
74
+ Read more about [our approach to scalable oversight](https://www.atla-ai.com/post/scaling-alignment) on our blog.
75
+ <br><br>
76
+ # Judge Arena Policy
77
+
78
+ ## Overview
79
+
80
+ Judge Arena is an open-source platform dedicated to determining which models make the best judges. Users can run evals and assess anonymized responses from two competing model judges, choosing the better judgement or declaring a tie. This policy outlines our commitments to maintain a fair and open environment :)
81
+
82
+ ## Transparency
83
+
84
+ - **Open-Source**: Judge Arena's code is open-source and available on GitHub. We encourage contributions from the community and anyone can replicate or modify the platform to suit their needs. We use proprietary model provider APIs where provided and Together AI's API to serve leading open-source models.
85
+ - **Methodology**: All processes related to model evaluation, rating calculations, and model selection are openly documented.
86
+ - **Data Sharing**: Periodically, we'll share 20% of the collected evaluation data with the community. The data collected from Judge Arena is restricted to an anonymized user ID, the final prompt sent, the model responses, the user vote, and the timestamp.
87
+
88
+ ## Model Inclusion Criteria
89
+
90
+ Judge Arena is specifically designed to assess AI models that function as evaluators (a.k.a judges). This includes but is not limited to powerful general-purpose models and the latest language models designed for evaluation tasks. Models are eligible for inclusion if they meet the following criteria:
91
+
92
+ - **Judge Capability**: The model should possess the ability to score AND critique other models' outputs effectively.
93
+ - **Promptable:** The model must be promptable to evaluate in different scoring formats and against different criteria.
94
+ - **Accessibility**:
95
+ - **Public API Access**: Models accessible through public APIs without restrictive barriers.
96
+ - **Open-Source Models**: Models with publicly available weights that can be downloaded and run by the community.
97
+
98
+ ## Leaderboard Management
99
+
100
+ - **ELO Ranking System**: Models are ranked on a public leaderboard based on aggregated user evaluations. We use an ELO rating system to rank AI judges on the public leaderboard. Each model begins with an initial rating of 1200, and we use a K-factor of 32 to determine the maximum rating adjustment after each evaluation.
101
+ - **Minimum Period**: Listed models remain accessible on Judge Arena for a minimum period of two weeks so they can be comprehensively evaluated.
102
+ - **Deprecation Policy**: Models may be removed from the leaderboard if they become inaccessible or are no longer publicly available.
103
+
104
+ *This policy might be updated to reflect changes in our practices or in response to community feedback.*
105
+ <br><br>
106
+ # FAQ
107
+
108
+ **Isn't this the same as Chatbot Arena?**
109
+
110
+ We are big fans of what the LMSYS team have done with Chatbot Arena and fully credit them for the inspiration to develop this. We were looking for a dynamic leaderboard that graded on AI judge capabilities and didn't manage to find one, so we created Judge Arena. This UI is designed especially for evals; to match the format of the model-based eval prompts that you would use in your LLM evaluation / monitoring tool.
111
+
112
+ **Why should I trust this leaderboard?**
113
+
114
+ We have listed out our efforts to be fully transparent in the policies above. All of the code for this leaderboard is open-source and can be found on our [Github](https://github.com/atla-ai/judge-arena). Check out our [blog](https://www.atla-ai.com/blog) to stay up to date as we analyse the results from the leaderboard.
115
+
116
+ **Who funds this effort?**
117
+
118
+ Atla currently funds this out of our own pocket. We are looking for API credits (with no strings attached) to support this effort - please get in touch if you or someone you know might be able to help.
119
+
120
+ **What is Atla working on?**
121
+
122
+ We are training a general-purpose evaluator that you will soon be able to run in this Judge Arena. Our next step will be to open-source a powerful model that the community can use to run fast and accurate evaluations.
123
+ <br><br>
124
+ # Get in touch
125
+ We’d love to hear your feedback! For general feature requests or to submit / suggest new models to add to the arena, please open up a discussion in the [community](https://huggingface.co/spaces/AtlaAI/judge-arena/discussions) tab. You can also contact us directly on [X](https://x.com/Atla_AI) or [Discord](https://discord.gg/yNpUAMqs).
126
+ \nPlease file any issues on our [Github](https://github.com/atla-ai/judge-arena)."""
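The leaderboard policy above cites an Elo system with a 1200 starting rating and a K-factor of 32. The actual update code is not part of this diff, so the following is only an illustrative single-update sketch under those parameters:

```python
def elo_update(rating_a: float, rating_b: float, score_a: float, k: float = 32.0):
    """One Elo update; score_a is 1.0 for an A win, 0.5 for a tie, 0.0 for a loss."""
    expected_a = 1.0 / (1.0 + 10 ** ((rating_b - rating_a) / 400.0))
    delta = k * (score_a - expected_a)
    return rating_a + delta, rating_b - delta

# Two judges starting at the default 1200 rating; A wins the comparison.
print(elo_update(1200, 1200, 1.0))  # (1216.0, 1184.0)
```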
random_sample/gen_api_answer.py ADDED
@@ -0,0 +1,148 @@
1
+ from openai import OpenAI
2
+ import anthropic
3
+ from together import Together
4
+ import os
5
+ from atla import Atla
6
+ from dotenv import load_dotenv
7
+ from .prompts import (
8
+ JUDGE_SYSTEM_PROMPT
9
+ )
10
+ from transformers import AutoTokenizer
11
+ import requests
12
+ import json
13
+ import re
14
+
15
+ load_dotenv()
16
+
17
+ # Initialize clients
18
+ anthropic_client = anthropic.Anthropic()
19
+ openai_client = OpenAI()
20
+ together_client = Together()
21
+ hf_api_key = os.getenv("HF_API_KEY")
22
+
23
+ atla_client = Atla()
24
+
25
+ def get_openai_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
26
+ """Get response from OpenAI API"""
27
+ try:
28
+ response = openai_client.chat.completions.create(
29
+ model=model_name,
30
+ messages=[
31
+ {"role": "system", "content": system_prompt},
32
+ {"role": "user", "content": prompt},
33
+ ],
34
+ max_completion_tokens=max_tokens,
35
+ temperature=temperature,
36
+ )
37
+ return response.choices[0].message.content
38
+ except Exception as e:
39
+ return f"Error with OpenAI model {model_name}: {str(e)}"
40
+
41
+ def get_anthropic_response(model_name, prompt, system_prompt=JUDGE_SYSTEM_PROMPT, max_tokens=500, temperature=0):
42
+ """Get response from Anthropic API"""
43
+ try:
44
+ response = anthropic_client.messages.create(
45
+ model=model_name,
46
+ max_tokens=max_tokens,
47
+ temperature=temperature,
48
+ system=system_prompt,
49
+ messages=[{"role": "user", "content": [{"type": "text", "text": prompt}]}],
50
+ )
51
+ return response.content[0].text
52
+ except Exception as e:
53
+ return f"Error with Anthropic model {model_name}: {str(e)}"
54
+
55
+
56
+ def get_atla_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
57
+ """Get response from Atla API"""
58
+ try:
59
+ # Extract components from the prompt data
60
+ model_input = prompt.get('human_input', '')
61
+ model_output = prompt.get('ai_response', '')
62
+ expected_output = prompt.get('ground_truth')
63
+ evaluation_criteria = prompt.get('eval_criteria', '')
64
+
65
+ response = atla_client.evaluation.create(
66
+ model_id="atla-selene",
67
+ model_input=model_input,
68
+ model_output=model_output,
69
+ expected_model_output=expected_output if expected_output else None,
70
+ evaluation_criteria=evaluation_criteria,
71
+ )
72
+
73
+ # Return the score and critique directly
74
+ return {
75
+ "score": response.result.evaluation.score,
76
+ "critique": response.result.evaluation.critique
77
+ }
78
+ except Exception as e:
79
+ return f"Error with Atla model {model_name}: {str(e)}"
80
+
81
+ def get_selene_mini_response(model_name, prompt, system_prompt=None, max_tokens=500, temperature=0.01):
82
+ """Get response from HF endpoint for Atla model"""
83
+ try:
84
+ headers = {
85
+ "Accept": "application/json",
86
+ "Authorization": f"Bearer {hf_api_key}",
87
+ "Content-Type": "application/json"
88
+ }
89
+
90
+ # Create messages list for chat template
91
+ messages = []
92
+ if system_prompt:
93
+ messages.append({"role": "system", "content": system_prompt})
94
+ messages.append({"role": "user", "content": prompt})
95
+
96
+ # Apply chat template
97
+ model_id = "AtlaAI/Selene-1-Mini-Llama-3.1-8B"
98
+ tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_api_key)
99
+ formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
100
+
101
+ payload = {
102
+ "inputs": formatted_prompt,
103
+ "parameters": {
104
+ "max_new_tokens": max_tokens,
105
+ "return_full_text": False,
106
+ "temperature": temperature,
107
+ "seed": 42,
108
+ "add_generation_prompt": True
109
+ }
110
+ }
111
+
112
+ response = requests.post(
113
+ "https://bkp9p28gri93egqh.us-east-1.aws.endpoints.huggingface.cloud",
114
+ headers=headers,
115
+ json=payload
116
+ )
117
+ return response.json()[0]["generated_text"]
118
+ except Exception as e:
119
+ return f"Error with Atla model {model_name}: {str(e)}"
120
+
121
+ def parse_selene_mini_response(response_text):
122
+ """Parse the response from Selene Mini to extract score and critique"""
123
+ try:
124
+ # Clean up the response text
125
+ response_text = response_text.strip()
126
+
127
+ # More flexible regex patterns
128
+ reasoning_pattern = r'\*\*Reasoning:?\*\*\s*(.*?)(?=\*\*Result|$)'
129
+ result_pattern = r'\*\*Result:?\*\*\s*(\d+)'
130
+
131
+ reasoning_match = re.search(reasoning_pattern, response_text, re.DOTALL | re.IGNORECASE)
132
+ result_match = re.search(result_pattern, response_text, re.IGNORECASE)
133
+
134
+ if reasoning_match and result_match:
135
+ critique = reasoning_match.group(1).strip()
136
+ score = result_match.group(1)
137
+ return {"score": score, "critique": critique}
138
+ else:
139
+ # If we can't parse it properly, let's return the raw response as critique
140
+ return {
141
+ "score": "Error",
142
+ "critique": f"Failed to parse response. Raw response:\n{response_text}"
143
+ }
144
+ except Exception as e:
145
+ return {
146
+ "score": "Error",
147
+ "critique": f"Error parsing response: {str(e)}\nRaw response:\n{response_text}"
148
+ }
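`parse_selene_mini_response` above hinges on two regexes for the `**Reasoning:** / **Result:**` format. A quick standalone check with an invented completion (patterns copied from the function):

```python
import re

sample = """**Reasoning:** The response addresses the question but omits key caveats.

**Result:** 3"""

reasoning = re.search(r'\*\*Reasoning:?\*\*\s*(.*?)(?=\*\*Result|$)', sample, re.DOTALL | re.IGNORECASE)
result = re.search(r'\*\*Result:?\*\*\s*(\d+)', sample, re.IGNORECASE)
print({"score": result.group(1), "critique": reasoning.group(1).strip()})
# {'score': '3', 'critique': 'The response addresses the question but omits key caveats.'}
```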
random_sample/prompts.py ADDED
@@ -0,0 +1,94 @@
1
+ # Default values for compatible mode
2
+ DEFAULT_EVAL_CRITERIA = """Does the model provide relevant and useful responses to the user's needs or questions?
3
+
4
+ Scoring Rubric:
5
+ Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
6
+ Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
7
+ Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
8
+ Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
9
+ Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
10
+
11
+ # Default Eval Prompt
12
+ DEFAULT_EVAL_PROMPT = """Does the model provide relevant and useful responses to the user's needs or questions?
13
+
14
+ Scoring Rubric:
15
+ Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
16
+ Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
17
+ Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
18
+ Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
19
+ Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries.
20
+
21
+ [User Query]: {{input}}
22
+
23
+ [AI Response]: {{response}}"""
24
+
25
+ # Split the eval prompt into editable and fixed parts
26
+ DEFAULT_EVAL_PROMPT_EDITABLE = """Does the model provide relevant and useful responses to the user's needs or questions?
27
+
28
+ Scoring Rubric:
29
+ Score 1: The model's responses are irrelevant or unhelpful to the user's needs or queries.
30
+ Score 2: The model sometimes provides helpful information, but often fails to address the user's actual needs or questions.
31
+ Score 3: The model generally provides helpful responses that address the user's needs, though it may occasionally miss the mark.
32
+ Score 4: The model regularly provides helpful responses that are well-aligned with the user's inquiries, with only rare inaccuracies.
33
+ Score 5: The model consistently offers highly relevant and useful responses that perfectly cater to the user's needs and inquiries."""
34
+
35
+ # Fixed suffix that will always be appended
36
+ FIXED_EVAL_SUFFIX = """
37
+ [User Query]: {{human_input}}
38
+
39
+ [AI Response]: {{ai_response}}"""
40
+
41
+ ATLA_PROMPT = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score integer, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
42
+ Here are some rules of the evaluation:
43
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
44
+
45
+ Your reply should strictly follow this format:
46
+ **Reasoning:** <Your feedback>
47
+
48
+ **Result:** <Your score>
49
+
50
+ Here is the data:
51
+
52
+ Instruction:
53
+ ```
54
+ {human_input}
55
+ ```
56
+
57
+ Response:
58
+ ```
59
+ {ai_response}
60
+ ```
61
+
62
+ Score Rubrics:
63
+ {eval_criteria}"""
64
+
65
+ ATLA_PROMPT_WITH_REFERENCE = """You are tasked with evaluating a response based on a given instruction (which may contain an Input) and a scoring rubric and reference answer that serve as the evaluation standard. Provide a comprehensive feedback on the response quality strictly adhering to the scoring rubric, without any general evaluation. Follow this with a score, referring to the scoring rubric. Avoid generating any additional opening, closing, or explanations.
66
+
67
+ Here are some rules of the evaluation:
68
+ (1) You should prioritize evaluating whether the response satisfies the provided rubric. The basis of your score should depend exactly on the rubric. However, the response does not need to explicitly address points raised in the rubric. Rather, evaluate the response based on the criteria outlined in the rubric.
69
+
70
+ Your reply should strictly follow this format:
71
+ **Reasoning:** <Your feedback>
72
+
73
+ **Result:** <Your score>
74
+
75
+ Here is the data:
76
+
77
+ Instruction:
78
+ ```
79
+ {human_input}
80
+ ```
81
+
82
+ Response:
83
+ ```
84
+ {ai_response}
85
+ ```
86
+
87
+ Score Rubrics:
88
+ {eval_criteria}
89
+
90
+ Reference answer:
91
+ {ground_truth}"""
92
+
93
+ # Judge system prompt for non-Prometheus models
94
+ JUDGE_SYSTEM_PROMPT = """Please act as an impartial judge and evaluate based on the user's instruction. Your output format should strictly adhere to JSON as follows: {"feedback": "<write feedback>", "result": <numerical score>}. Ensure the output is valid JSON, without additional formatting or explanations."""
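`ATLA_PROMPT` above is filled with `str.format` before being sent to Selene Mini (see `submit_and_store` in `arena_interface.py`). A small sketch, assuming the `random_sample` package is importable and using invented values:

```python
from random_sample.prompts import ATLA_PROMPT

prompt = ATLA_PROMPT.format(
    human_input="How do muscles grow?",
    ai_response="Muscles grow through hypertrophy driven by resistance training.",
    eval_criteria="Score 1-5 for relevance and accuracy.",
)
print(prompt.splitlines()[0])  # "You are tasked with evaluating a response ..."
```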
random_sample/random_sample_generation.py ADDED
@@ -0,0 +1,183 @@
1
+ from openai import OpenAI
2
+ import anthropic
3
+ import json
4
+ import re
5
+ import random
6
+ import os
7
+ from .gen_api_answer import get_openai_response, get_anthropic_response
8
+
9
+ # Initialize clients
10
+ anthropic_client = anthropic.Anthropic()
11
+ openai_client = OpenAI()
12
+
13
+ GOOD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
14
+ BAD_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
15
+ AMBIGUOUS_SYSTEM_PROMPT = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes. The response should mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The AI response generated should be a few sentences long. Format your output as JSON: {"human": "<human message>", "ai": <AI assistant response>}. Ensure the output is valid JSON, without additional formatting or explanations."""
16
+
17
+ GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response generated should be a few sentences long and contain accurate information. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
18
+ BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and contain incorrect information, logical fallacies, or misleading explanations. It should sound plausible but be fundamentally wrong. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
19
+ AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH = """You are an assistant that generates random conversations between a human and an AI assistant for testing purposes, along with an ideal reference answer. The AI response should be a few sentences long and mix correct and incorrect information - it should contain some accurate points but also include nuanced, questionable claims or exaggerations. The ground truth response should be a perfect, comprehensive answer that would score 5/5. Format your output as JSON: {"human": "<human message>", "ai": "<AI assistant response>", "ground_truth": "<perfect reference answer>"}. Ensure the output is valid JSON, without additional formatting or explanations."""
20
+
21
+ GENERATION_PROMPT = """Please generate a random human message and an AI response in the format of a QA dataset. The human input should not be a one-word answer question like "What is the capital of France?". The AI response generated should be a few sentences long."""
22
+ GENERATION_PROMPT_WITH_GROUND_TRUTH = """Please generate:
23
+ 1. A random human message (not a simple one-word answer question)
24
+ 2. An AI response (a few sentences long)
25
+ 3. A perfect reference answer that would score 5/5 on all criteria (e.g., concise, helpful, and accurate)
26
+
27
+ Format as JSON with "human", "ai", and "ground_truth" fields."""
28
+
29
+ RESPONSE_GENERATION_SYSTEM_PROMPT = "You are an assistant that generates random responses to human messages for testing purposes. Generate bad responses (with a mix of correct and incorrect information) 60% of the time and good responses 40% of the time. Do not say which type of response you are generating, just generate the response."
30
+
31
+ def get_random_human_ai_pair():
32
+ # Select system prompt with specified probabilities
33
+ system_prompt = random.choices(
34
+ [GOOD_SYSTEM_PROMPT, BAD_SYSTEM_PROMPT, AMBIGUOUS_SYSTEM_PROMPT],
35
+ weights=[0.2, 0.2, 0.6] # 20% good, 20% bad, 60% ambiguous
36
+ )[0]
37
+
38
+ # Log which type of response is being generated
39
+ prompt_type = {
40
+ GOOD_SYSTEM_PROMPT: "good",
41
+ BAD_SYSTEM_PROMPT: "bad",
42
+ AMBIGUOUS_SYSTEM_PROMPT: "ambiguous"
43
+ }[system_prompt]
44
+ print(f"Generating {prompt_type} response")
45
+
46
+ # Randomly choose between GPT-3.5 and Claude with 50/50 weights
47
+ model_choice = random.choices([
48
+ ("gpt-3.5-turbo", get_openai_response),
49
+ ("claude-3-5-haiku-latest", get_anthropic_response)
50
+ ], weights=[0.5, 0.5])[0]
51
+ model_name, api_func = model_choice
52
+
53
+ # Generate response using selected model
54
+ response = api_func(
55
+ model_name=model_name,
56
+ prompt=GENERATION_PROMPT,
57
+ system_prompt=system_prompt,
58
+ max_tokens=500,
59
+ temperature=1
60
+ )
61
+
62
+ # Define default messages
63
+ default_human = "How do muscles grow?"
64
+ default_ai = """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis."""
65
+
66
+ try:
67
+ # Clean the response by replacing newlines with spaces
68
+ cleaned_response = response.replace('\n', ' ').replace('\r', '')
69
+ data = json.loads(cleaned_response)
70
+
71
+ # Extract messages with fallbacks
72
+ human_message = data.get("human", default_human)
73
+ ai_message = data.get("ai", default_ai)
74
+
75
+ # Debug logging
76
+ print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...'")
77
+
78
+ except Exception as e:
79
+ print(f"Failed to parse response: {str(e)}\n {response}")
80
+ human_message = default_human
81
+ ai_message = default_ai
82
+
83
+ return human_message, ai_message
84
+
85
+ def get_random_human_ai_ground_truth_pair():
86
+ # Select system prompt with specified probabilities
87
+ system_prompts = {
88
+ "good": GOOD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
89
+ "bad": BAD_SYSTEM_PROMPT_WITH_GROUND_TRUTH,
90
+ "ambiguous": AMBIGUOUS_SYSTEM_PROMPT_WITH_GROUND_TRUTH
91
+ }
92
+
93
+ prompt_type = random.choices(
94
+ ["good", "bad", "ambiguous"],
95
+ weights=[0.2, 0.2, 0.6] # 20% good, 20% bad, 60% ambiguous
96
+ )[0]
97
+
98
+ system_prompt = system_prompts[prompt_type]
99
+ print(f"Generating {prompt_type} response with ground truth")
100
+
101
+ # Randomly choose between GPT-3.5 and Claude with 50/50 weights
102
+ model_choice = random.choices([
103
+ ("gpt-3.5-turbo", get_openai_response),
104
+ ("claude-3-5-haiku-latest", get_anthropic_response)
105
+ ], weights=[0.5, 0.5])[0]
106
+ model_name, api_func = model_choice
107
+
108
+ # Define default messages
109
+ defaults = {
110
+ "human": "How do muscles grow?",
111
+ "ai": """Muscles grow through a process called skeletal muscle hypertrophy, which adds more myosin filaments to each muscle fiber, making the engine of the cell bigger and stronger over time. This is achieved through increased muscle tension and physical stress, breaking down muscle fiber. Muscle growth is also a direct consequence of resistance training and nutrition. People build muscle at different rates depending on their age, sex, and genetics, but muscle development significantly increases if exercise is done correctly and the body stores more protein through a process called protein synthesis.""",
112
+ "ground_truth": """Muscle growth (hypertrophy) occurs through a complex biological process involving several key mechanisms:
113
+
114
+ 1. Mechanical Tension: Resistance training creates mechanical tension in muscle fibers, triggering molecular and cellular responses that promote growth.
115
+
116
+ 2. Metabolic Stress: The depletion of energy resources and accumulation of metabolic byproducts during exercise contributes to muscle growth signaling.
117
+
118
+ 3. Muscle Damage: Exercise-induced micro-damage to muscle fibers activates satellite cells, which help repair and build new muscle tissue.
119
+
120
+ 4. Protein Synthesis: After exercise, increased protein synthesis rates exceed protein breakdown, leading to net muscle protein accretion.
121
+
122
+ 5. Hormonal Response: Exercise triggers the release of growth-promoting hormones like testosterone, growth hormone, and IGF-1.
123
+
124
+ 6. Recovery: Adequate rest between training sessions allows for repair and growth, supported by proper nutrition, particularly protein intake (1.6-2.2g/kg/day).
125
+
126
+ This process is influenced by factors including genetics, age, sex, nutrition, sleep quality, and training variables. Optimal muscle growth requires a structured resistance training program, adequate protein intake, sufficient calories, and proper recovery."""
127
+ }
128
+
129
+ # Generate response using selected model
130
+ response = api_func(
131
+ model_name=model_name,
132
+ prompt=GENERATION_PROMPT_WITH_GROUND_TRUTH,
133
+ system_prompt=system_prompt,
134
+ max_tokens=1000, # Increased token limit to accommodate ground truth
135
+ temperature=1
136
+ )
137
+
138
+ # Parse the response to get all three components
139
+ try:
140
+ # Clean the response by replacing newlines with spaces
141
+ cleaned_response = response.replace('\n', ' ').replace('\r', '')
142
+ data = json.loads(cleaned_response)
143
+
144
+ # Extract messages with fallbacks
145
+ human_message = data.get("human", defaults["human"])
146
+ ai_message = data.get("ai", defaults["ai"])
147
+ ground_truth = data.get("ground_truth", defaults["ground_truth"])
148
+
149
+ # Debug logging
150
+ print(f"Parsed response: human='{human_message}', ai='{ai_message[:50]}...', ground_truth='{ground_truth[:50]}...'")
151
+
152
+ except Exception as e:
153
+ print(f"Failed to parse response: {str(e)}\n {response}")
154
+ human_message = defaults["human"]
155
+ ai_message = defaults["ai"]
156
+ ground_truth = defaults["ground_truth"]
157
+
158
+ return human_message, ai_message, ground_truth
159
+
160
+ def generate_ai_response(human_msg):
161
+ """Generate AI response using GPT-3.5-turbo"""
162
+ if not human_msg.strip():
163
+ return "", False
164
+
165
+ try:
166
+ response = get_openai_response(
167
+ "gpt-3.5-turbo",
168
+ human_msg,
169
+ system_prompt=RESPONSE_GENERATION_SYSTEM_PROMPT,
170
+ max_tokens=1000,
171
+ temperature=1
172
+ )
173
+ # Extract just the response content since we don't need JSON format here
174
+ if isinstance(response, str):
175
+ # Clean up any JSON formatting if present
176
+ try:
177
+ data = json.loads(response)
178
+ response = data.get("content", response)
179
+ except json.JSONDecodeError:
180
+ pass
181
+ return response, False # Return response and button interactive state
182
+ except Exception as e:
183
+ return f"Error generating response: {str(e)}", False
random_sample_tab.py ADDED
@@ -0,0 +1,6 @@
1
+ import gradio as gr
2
+ from random_sample.arena_interface import create_arena_interface
3
+
4
+ def random_sample_tab():
5
+ with gr.TabItem("Random samples"):
6
+ return create_arena_interface()
requirements.txt CHANGED
@@ -1,8 +1,13 @@
1
  pymongo
2
  gradio
3
  python-dotenv
4
  openai
5
  anthropic
6
  together
7
- cohere
8
- transformers
1
+ # Core dependencies
2
  pymongo
3
  gradio
4
  python-dotenv
5
  openai
6
  anthropic
7
  together
8
+ atla
9
+ transformers
10
+
11
+ # Development dependencies
12
+ black
13
+ pytest
score_handler.py ADDED
@@ -0,0 +1,244 @@
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import numpy as np
4
+ import tempfile
5
+ import os
6
+
7
+ def handle_analysis(df_state, model_selection_group, analyze_results_button):
8
+ with gr.Group(visible=False) as analysis_group:
9
+ gr.Markdown("## Analysis")
10
+
11
+ # Dropdown to select the evaluation metric (Accuracy or Pearson Correlation)
12
+ accuracy_measurement_dropdown = gr.Dropdown(
13
+ choices=['Accuracy', 'Pearson Correlation'],
14
+ label='Select Evaluation Metric'
15
+ )
16
+
17
+ # Only a ground-truth column selector is needed; both judge score columns are detected automatically
18
+ with gr.Row():
19
+ ground_truth_dropdown = gr.Dropdown(
20
+ choices=[],
21
+ label='Select True Label Column'
22
+ )
23
+
24
+ # Define two side-by-side boxes for results
25
+ with gr.Row():
26
+ judge_a_result = gr.Textbox(
27
+ label="Judge A Results",
28
+ lines=10,
29
+ interactive=False,
30
+ visible=False
31
+ )
32
+ judge_b_result = gr.Textbox(
33
+ label="Judge B Results",
34
+ lines=10,
35
+ interactive=False,
36
+ visible=False
37
+ )
38
+
39
+ # JSON download output, shown below the result textboxes
40
+ json_output = gr.File(label="Results .json", interactive=False, visible=False)
41
+
42
+ # Row of navigation and action buttons
43
+ with gr.Row():
44
+ back_to_results_button = gr.Button("← Back to Results")
45
+ calculate_button = gr.Button("Calculate")
46
+ download_button = gr.Button("Download Results as JSON")
47
+
48
+ # Show analysis group
49
+ def show_analysis_group():
50
+ df = df_state.value
51
+ if df is not None:
52
+ columns = df.columns.tolist()
53
+ else:
54
+ columns = []
55
+ # Now we only update ground_truth_dropdown
56
+ return (
57
+ gr.update(visible=True), # analysis_group
58
+ gr.update(visible=False), # model_selection_group
59
+ gr.update(choices=columns), # ground_truth_dropdown
60
+ )
61
+
62
+ analyze_results_button.click(
63
+ fn=show_analysis_group,
64
+ inputs=[],
65
+ outputs=[
66
+ analysis_group,
67
+ model_selection_group,
68
+ ground_truth_dropdown # only this one
69
+ ]
70
+ )
71
+
72
+ def back_to_results():
73
+ return (
74
+ gr.update(visible=False), # Hide analysis_group
75
+ gr.update(visible=True), # Show model_selection_group
76
+ )
77
+
78
+ back_to_results_button.click(
79
+ fn=back_to_results,
80
+ inputs=[],
81
+ outputs=[analysis_group, model_selection_group]
82
+ )
83
+
84
+ def calculate_multiple_accuracies(measurement, ground_truth_col, df_state):
85
+ # Guard against a missing DataFrame before touching its columns
86
+ df = df_state.value
87
+ if df is None:
88
+ return (
89
+ gr.update(value="No DataFrame available.", visible=True),
90
+ gr.update(value="No DataFrame available.", visible=True)
91
+ )
92
+
93
+ # Selene scores are stored in 'score_selene'; the other judge's scores sit in the remaining score_* column
94
+ col2_name = "score_selene"
95
+ score_columns = [col for col in df.columns if col.startswith('score_') and col != 'score_selene']
96
+ col3_name = score_columns[0] if score_columns else None
97
+
98
+ # Check if user-supplied ground_truth_col is valid
99
+ missing_columns = [col for col in [ground_truth_col, col2_name, col3_name] if col not in df.columns]
100
+ if missing_columns:
101
+ msg = f"Selected columns not found in DataFrame: {', '.join(str(col) for col in missing_columns)}."
102
+ return (
103
+ gr.update(value=msg, visible=True),
104
+ gr.update(value=msg, visible=True)
105
+ )
106
+
107
+ # Compare ground_truth_col with score_selene
108
+ result1 = calculate_accuracy(
109
+ measurement, ground_truth_col, col2_name,
110
+ df_state, compare_to_ground_truth=True
111
+ )
112
+ text_a = f"Comparison: '{ground_truth_col}' vs. 'Selene'\n{result1}"
113
+
114
+ # Compare ground_truth_col with the other model's score
115
+ result2 = calculate_accuracy(
116
+ measurement, ground_truth_col, col3_name,
117
+ df_state, compare_to_ground_truth=True
118
+ )
119
+ # Extract model name from column name for display
120
+ model_name = col3_name.replace('score_', '').replace('_', ' ').title()
121
+ text_b = f"Comparison: '{ground_truth_col}' vs. '{model_name}'\n{result2}"
122
+
123
+ return (
124
+ gr.update(value=text_a, visible=True),
125
+ gr.update(value=text_b, visible=True)
126
+ )
127
+
128
+ # calculate_button takes the metric, the ground-truth column, and the DataFrame state as inputs
129
+ calculate_button.click(
130
+ fn=calculate_multiple_accuracies,
131
+ inputs=[
132
+ accuracy_measurement_dropdown,
133
+ ground_truth_dropdown,
134
+ df_state
135
+ ],
136
+ outputs=[judge_a_result, judge_b_result]
137
+ )
138
+
139
+ def create_json_download(df_state):
140
+ if df_state.value is None:
141
+ return gr.update(value=None, visible=True)
142
+
143
+ json_str = df_state.value.to_json(orient='records', indent=2)
144
+ temp_dir = tempfile.gettempdir()
145
+ file_path = os.path.join(temp_dir, 'atla_custom_eval_results.json')
146
+ with open(file_path, 'w', encoding='utf-8') as f:
147
+ f.write(json_str)
148
+ return gr.update(value=file_path, visible=True)
149
+
150
+ download_button.click(
151
+ fn=create_json_download,
152
+ inputs=[df_state],
153
+ outputs=[json_output]
154
+ )
155
+
156
+ # Helper functions
157
+
158
+ def calculate_accuracy(measurement, col1, col2, df_state, compare_to_ground_truth=False):
159
+ df = df_state.value
160
+ # Shared metric computation used by both judge comparisons
161
+ if df is None:
162
+ return "No DataFrame available."
163
+ if col1 not in df.columns or col2 not in df.columns:
164
+ return "Selected columns not found in DataFrame."
165
+
166
+ results_df = pd.DataFrame()
167
+ if compare_to_ground_truth:
168
+ results_df['ground_truth'] = df[col1]
169
+ results_df['predicted'] = df[col2]
170
+ else:
171
+ results_df['extracted_winner'] = df[col1]
172
+ results_df['truth_result'] = df[col2]
173
+
174
+ if measurement == 'Accuracy':
175
+ result = process_pairwise_accuracy(results_df, compare_to_ground_truth)
176
+ output_text = (
177
+ f"Overall Accuracy: {result['overall_accuracy']}\n"
178
+ f"Number of NaNs: {result['num_extracted_nan']}"
179
+ )
180
+ elif measurement == 'Pearson Correlation':
181
+ result = process_single_rating_pearson_correlation(results_df, compare_to_ground_truth)
182
+ output_text = (
183
+ f"Pearson Correlation: {result['overall_pearson_correlation']}\n"
184
+ f"Number of NaNs: {result['num_extracted_nan']}"
185
+ )
186
+ else:
187
+ output_text = "Unknown measurement selected."
188
+
189
+ return output_text
190
+
191
+ def process_pairwise_accuracy(results_df: pd.DataFrame, compare_to_ground_truth=False) -> dict:
192
+ # Compute 'results' column based on whether comparing to ground truth
193
+ if compare_to_ground_truth:
194
+ # Convert both columns to float so the equality check compares like types
195
+ results_df['ground_truth'] = results_df['ground_truth'].apply(convert_to_float_or_nan)
196
+ results_df['predicted'] = results_df['predicted'].apply(convert_to_float_or_nan)
197
+
198
+ results_df['results'] = results_df['ground_truth'] == results_df['predicted']
199
+ num_extracted_nan = int(results_df['predicted'].isna().sum())
200
+ else:
201
+ results_df['results'] = results_df['extracted_winner'] == results_df['truth_result']
202
+ num_extracted_nan = int(results_df['extracted_winner'].isna().sum())
203
+
204
+ overall_accuracy = results_df['results'].mean()
205
+
206
+ return {
207
+ "overall_accuracy": overall_accuracy,
208
+ "num_extracted_nan": num_extracted_nan,
209
+ }
210
+
211
+ def process_single_rating_pearson_correlation(
212
+ results_df: pd.DataFrame, compare_to_ground_truth=False
213
+ ) -> dict:
214
+ if compare_to_ground_truth:
215
+ pred_col = 'predicted'
216
+ truth_col = 'ground_truth'
217
+ else:
218
+ pred_col = 'extracted_winner'
219
+ truth_col = 'truth_result'
220
+
221
+ results_df[pred_col] = results_df[pred_col].apply(convert_to_float_or_nan)
222
+ results_df[truth_col] = results_df[truth_col].apply(convert_to_float_or_nan)
223
+
224
+ numerical_results = results_df.dropna(subset=[pred_col, truth_col])
225
+
226
+ if len(numerical_results) == 0:
227
+ pearson_corr = np.nan
228
+ else:
229
+ pearson_corr = numerical_results[pred_col].corr(numerical_results[truth_col])
230
+
231
+ num_extracted_nan = int(results_df[pred_col].isna().sum())
232
+
233
+ return {
234
+ "overall_pearson_correlation": pearson_corr if not pd.isna(pearson_corr) else 0.0,
235
+ "num_extracted_nan": num_extracted_nan,
236
+ }
237
+
238
+ def convert_to_float_or_nan(extracted_input):
239
+ if extracted_input is None or pd.isna(extracted_input):
240
+ return np.nan
241
+ try:
242
+ return float(extracted_input)
243
+ except (ValueError, TypeError):
244
+ return np.nan
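A small worked example of the metric helpers above, assuming they are importable at module level from score_handler (indentation is not visible in this diff). types.SimpleNamespace stands in for the Gradio state object, and the column names and values are invented for illustration.

import pandas as pd
from types import SimpleNamespace
from score_handler import calculate_accuracy  # assumed module-level import

df = pd.DataFrame({
    "truth": [1, 0, 1, 1],
    "score_selene": [1, 0, 0, 1],
    "score_gpt_4o": [1, 0, 0, "N/A"],  # the non-numeric entry becomes NaN
})
df_state = SimpleNamespace(value=df)  # mimics gr.State: only .value is accessed

print(calculate_accuracy("Accuracy", "truth", "score_selene", df_state, compare_to_ground_truth=True))
# Overall Accuracy: 0.75
# Number of NaNs: 0

print(calculate_accuracy("Pearson Correlation", "truth", "score_gpt_4o", df_state, compare_to_ground_truth=True))
# Pearson Correlation: ~0.5 (the "N/A" row is dropped and counted as a NaN)
# Number of NaNs: 1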
ui_components.py ADDED
@@ -0,0 +1,4 @@
1
+ # ui_components.py
2
+ import gradio as gr
3
+
4
+ save_prompt_button = gr.Button("Save Prompt", visible=False)
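The save_prompt_button above is created at import time, outside any Blocks context; one plausible way to use it (assumed, not shown in this commit) is to place it into a layout later with .render() and attach its handler there. The prompt_box component and the lambda are illustrative only.

import gradio as gr
from ui_components import save_prompt_button

with gr.Blocks() as demo:
    prompt_box = gr.Textbox(label="Evaluation prompt")  # illustrative component
    save_prompt_button.render()  # mounts the pre-built (hidden) button at this point in the layout
    save_prompt_button.click(fn=lambda p: print("Saved prompt:", p), inputs=[prompt_box], outputs=[])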
utils.py CHANGED
@@ -2,6 +2,16 @@ from dataclasses import dataclass
2
  from datetime import datetime
3
  import logging
4
 
5
 
6
  def get_logger(sink_name: str = "core_utils") -> logging.Logger:
7
  logging.basicConfig(
 
2
  from datetime import datetime
3
  import logging
4
 
5
+ def parse_variables(prompt):
6
+ import re
7
+ # Extract variables enclosed in double curly braces
8
+ variables = re.findall(r"{{(.*?)}}", prompt)
9
+ # Remove duplicates while preserving order
10
+ seen = set()
11
+ variables = [
12
+ x.strip() for x in variables if not (x.strip() in seen or seen.add(x.strip()))
13
+ ]
14
+ return variables
15
 
16
  def get_logger(sink_name: str = "core_utils") -> logging.Logger:
17
  logging.basicConfig(
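A quick illustration of parse_variables on a templated eval prompt; the template string is an example, not one of the app's built-in prompts.

from utils import parse_variables

template = "Rate the {{response}} against the {{ground_truth}}. Flag any {{response}} that ignores the rubric."
print(parse_variables(template))
# ['response', 'ground_truth']  -- duplicates dropped, first-seen order kept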