kaikaidai committed on
Commit
ff974dc
·
verified ·
1 Parent(s): 8b46047

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +468 -4
app.py CHANGED
@@ -1,7 +1,471 @@
 
 
 
 
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from datetime import datetime
4
+ import hashlib
5
  import gradio as gr
6
 
7
+ from dotenv import load_dotenv
8
+ load_dotenv()
9
 
10
+ from gen_api_answer import (
11
+ atla_parse_model_response,
12
+ get_atla_response
13
+ )
14
+
15
+ from prompts import (
16
+ ATLA_PROMPT,
17
+ ATLA_PROMPT_WITH_REFERENCE
18
+ )
19
+
20
+ from random_sample_generation import (
21
+ get_random_human_ai_pair,
22
+ get_random_human_ai_ground_truth_pair,
23
+ generate_ai_response
24
+ )
25
+
26
+ from utils import Vote
27
+
28
+ from prompts import (
29
+ DEFAULT_EVAL_PROMPT,
30
+ DEFAULT_EVAL_PROMPT_EDITABLE,
31
+ FIXED_EVAL_SUFFIX,
32
+ DEFAULT_EVAL_CRITERIA
33
+ )
34
+
35
+ from common import CSS_STYLES, MAIN_TITLE, HOW_IT_WORKS
36
+
37
+
38
# Load the model_data from JSONL
def load_model_data():
    """Load model metadata from ``data/models.jsonl``.

    Returns:
        dict: mapping of model name -> {"organization", "license", "api_model"}.
        Returns an empty dict when the file is missing.
    """
    model_data = {}
    try:
        # Explicit encoding so the file parses identically on every platform.
        with open("data/models.jsonl", "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline) —
                    # json.loads("") would otherwise raise JSONDecodeError.
                    continue
                model = json.loads(line)
                model_data[model["name"]] = {
                    "organization": model["organization"],
                    "license": model["license"],
                    "api_model": model["api_model"],
                }
    except FileNotFoundError:
        print("Warning: models.jsonl not found")
        return {}
    return model_data


model_data = load_model_data()
57
+
58
def store_vote_data(prompt, response_a, response_b, model_a, model_b, winner, judge_id):
    """Persist a single vote record.

    ``prompt`` may be a plain string or a Gradio component; a component is
    unwrapped via its ``.value`` attribute before storing.
    """
    # Unwrap a component-like object into its plain string value.
    if hasattr(prompt, 'value'):
        prompt_value = prompt.value
    else:
        prompt_value = prompt

    # NOTE(review): `add_vote` and `db` are not defined or imported anywhere
    # in this file — presumably a missing import; verify before deploying.
    add_vote(
        Vote(
            timestamp=datetime.now().isoformat(),
            prompt=prompt_value,
            response_a=response_a,
            response_b=response_b,
            model_a=model_a,
            model_b=model_b,
            winner=winner,
            judge_id=judge_id,
        ),
        db,
    )
72
+
73
+
74
def parse_variables(prompt):
    """Return the unique ``{{variable}}`` names in *prompt*, in first-seen order."""
    # Collect everything enclosed in double curly braces.
    matches = re.findall(r"{{(.*?)}}", prompt)

    # Deduplicate after stripping whitespace, preserving first-seen order.
    ordered = []
    seen = set()
    for raw in matches:
        name = raw.strip()
        if name not in seen:
            seen.add(name)
            ordered.append(name)
    return ordered
83
+
84
+
85
def get_final_prompt(eval_prompt, variable_values):
    """Substitute each ``{{var}}`` placeholder in *eval_prompt* with its value."""
    result = eval_prompt
    for name, value in variable_values.items():
        result = result.replace("{{" + name + "}}", value)
    return result
90
+
91
+
92
+
93
def get_ip(request: gr.Request) -> str:
    """Get and hash the IP address from the request.

    Prefers Cloudflare's ``cf-connecting-ip`` header, then the first (client)
    entry of ``x-forwarded-for``, falling back to the socket peer address.
    The address is SHA-256 hashed and truncated to 16 hex chars for privacy.
    """
    if "cf-connecting-ip" in request.headers:
        ip = request.headers["cf-connecting-ip"]
    elif "x-forwarded-for" in request.headers:
        ip = request.headers["x-forwarded-for"]
        if "," in ip:
            # XFF is "client, proxy1, proxy2" — keep only the client entry.
            ip = ip.split(",")[0]
    else:
        ip = request.client.host

    # Fix: strip surrounding whitespace (XFF entries are comma+space
    # separated), otherwise the same client hashes to different ids
    # depending on whether it arrived via a proxy.
    ip = ip.strip()

    # Hash the IP address for privacy
    return hashlib.sha256(ip.encode()).hexdigest()[:16]
106
+
107
+
108
def get_vote_message(choice: str, model_a: str, model_b: str) -> tuple[str, str]:
    """Generate appropriate message based on vote and model rankings.

    Returns a ``(title, message)`` tuple for the post-vote notification.
    """
    # Rankings are fetched up front (even for ties), matching prior behavior.
    # NOTE(review): get_current_votes / get_leaderboard / get_model_rankings
    # are not defined or imported in this file — confirm the missing import.
    voting_data = get_current_votes()
    leaderboard = get_leaderboard(model_data, voting_data, show_preliminary=True)
    rankings = get_model_rankings(leaderboard)
    rank_a = rankings.get(model_a, 0)
    rank_b = rankings.get(model_b, 0)

    message = "Keep voting responsibly 🤗"
    if choice == "Tie":
        title = "It's a tie!"
    elif choice == "A" and rank_a < rank_b:
        # Lower rank number means a higher leaderboard position.
        title = "The favourite wins!"
    elif choice == "B" and rank_b < rank_a:
        title = "The favourite wins!"
    else:
        title = "The underdog wins!"
    return title, message
126
+
127
+
128
def populate_random_example(request: gr.Request, compatible_mode: bool):
    """Generate a random human-AI conversation example and reset judge outputs."""
    if compatible_mode:
        # Compatible mode also supplies a ground-truth reference response.
        human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
    else:
        human_msg, ai_msg = get_random_human_ai_pair()
        ground_truth_msg = ""

    updates = [
        gr.update(value=human_msg),
        gr.update(value=ai_msg),
        gr.update(value="🎲", variant="secondary"),  # reset the dice button
        gr.update(value=""),  # clear the score box
        gr.update(value=""),  # clear the critique box
        # Ground truth is populated and shown only in compatible mode.
        gr.update(value=ground_truth_msg, visible=compatible_mode),
    ]
    return updates
144
+
145
+
146
# --- UI definition -----------------------------------------------------------
# Single-tab "Playground": the user enters (or randomly samples) a human/AI
# exchange, optionally a reference answer, and has the Selene-Mini judge
# score it against editable evaluation criteria.
# NOTE(review): indentation below is reconstructed from a diff rendering —
# verify nesting (especially the Accordion placement) against the live app.
with gr.Blocks(theme="default", css=CSS_STYLES) as demo:
    gr.Markdown(MAIN_TITLE)
    gr.Markdown(HOW_IT_WORKS)

    # Hidden eval prompt that will always contain DEFAULT_EVAL_PROMPT
    eval_prompt = gr.Textbox(
        value=DEFAULT_EVAL_PROMPT,
        visible=False
    )

    with gr.Tabs():
        with gr.TabItem("Playground"):
            with gr.Row():
                # Left side - Input section
                with gr.Column(scale=1):
                    with gr.Group():
                        human_input = gr.TextArea(
                            label="👩 User Input",
                            lines=5,
                            placeholder="Enter the human message here..."
                        )
                        with gr.Row():
                            # Disabled until human_input is non-empty (see the
                            # human_input.change handler near the bottom).
                            generate_btn = gr.Button(
                                "Generate AI Response",
                                size="sm",
                                interactive=False
                            )

                        ai_response = gr.TextArea(
                            label="🤖 AI Response",
                            lines=10,
                            placeholder="Enter the AI response here..."
                        )

                        # Ground truth response (initially hidden)
                        ground_truth = gr.TextArea(
                            label="🎯 Ground truth response",
                            lines=10,
                            placeholder="Enter the ground truth response here...",
                            visible=False
                        )

                    with gr.Row():
                        random_btn = gr.Button("🎲", scale=2)
                        send_btn = gr.Button(
                            value="Run evaluation",
                            variant="primary",
                            size="lg",
                            scale=8
                        )

                # Right side - Model outputs
                with gr.Column(scale=1):
                    gr.Markdown("### 👩‍⚖️ Selene-Mini Evaluation")
                    with gr.Group():
                        with gr.Row():
                            score = gr.Textbox(label="Score", lines=1, interactive=False)
                        critique = gr.TextArea(label="Critique", lines=12, interactive=False)

                    gr.Markdown("<br>")

                    # Replace the "Edit Judge Prompt" Accordion section with:
                    with gr.Accordion("📝 Edit Judge Prompt", open=False) as prompt_accordion:
                        gr.Markdown("<br>")
                        use_reference_toggle = gr.Checkbox(
                            label="Use a reference response",
                            value=False
                        )

                        # Hide the default prompt editor
                        with gr.Column(visible=False) as default_prompt_editor:
                            eval_prompt_editable = gr.TextArea(
                                value=DEFAULT_EVAL_PROMPT_EDITABLE,
                                label="Evaluation Criteria",
                                lines=12
                            )

                            # Save/Cancel only appear while the prompt differs
                            # from its last saved value (see show_edit_buttons).
                            with gr.Row(visible=False) as edit_buttons_row:
                                cancel_prompt_btn = gr.Button("Cancel")
                                save_prompt_btn = gr.Button("Save", variant="primary")
                            gr.Markdown("*The sample being evaluated is always appended as:*")
                            gr.Markdown(f"```{FIXED_EVAL_SUFFIX}")

                        # Show the compatible mode editor
                        with gr.Column(visible=True) as compatible_prompt_editor:
                            eval_criteria_text = gr.TextArea(
                                label="Evaluation Criteria",
                                lines=12,
                                value=DEFAULT_EVAL_CRITERIA,
                                placeholder="Enter the complete evaluation criteria and scoring rubric..."
                            )

    # Define state variables for model tracking
    # NOTE(review): model_a_state / model_b_state / final_prompt_state /
    # is_editing / compatible_mode_state are created but never wired to any
    # handler below — likely leftovers from a previous version.
    model_a_state = gr.State()
    model_b_state = gr.State()
    final_prompt_state = gr.State()
    eval_prompt_previous = gr.State(value=DEFAULT_EVAL_PROMPT_EDITABLE)  # Initialize with default value
    is_editing = gr.State(False)  # Track editing state
    compatible_mode_state = gr.State(False)  # Track compatible mode state

    # Update model names after responses are generated
    # NOTE(review): not referenced by any event handler in this file.
    def update_model_names(model_a, model_b):
        return gr.update(value=f"*Model: {model_a}*"), gr.update(
            value=f"*Model: {model_b}*"
        )

    # Store the last submitted prompt and variables for comparison
    # NOTE(review): also unused below.
    last_submission = gr.State({})

    # Add handlers for save/cancel buttons
    def save_prompt(new_prompt, previous_prompt):
        """Commit the edited prompt and hide the Save/Cancel row."""
        return [
            gr.update(value=new_prompt),  # Update the prompt
            new_prompt,  # Update the previous prompt state
            gr.update(visible=False)  # Hide the buttons
        ]

    def cancel_prompt(previous_prompt):
        """Discard edits, restoring the last saved prompt."""
        return [
            gr.update(value=previous_prompt),  # Revert to previous prompt
            previous_prompt,  # Keep the previous prompt state
            gr.update(visible=False)  # Hide the buttons
        ]

    def show_edit_buttons(current_value, previous_value):
        # Show buttons only if the current value differs from the previous value
        return gr.update(visible=current_value != previous_value)

    # Add handlers for save/cancel buttons and prompt changes
    save_prompt_btn.click(
        fn=save_prompt,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    cancel_prompt_btn.click(
        fn=cancel_prompt,
        inputs=[eval_prompt_previous],
        outputs=[eval_prompt_editable, eval_prompt_previous, edit_buttons_row]
    )

    eval_prompt_editable.change(
        fn=show_edit_buttons,
        inputs=[eval_prompt_editable, eval_prompt_previous],
        outputs=edit_buttons_row
    )

    # Function to toggle visibility based on compatible mode
    def toggle_use_reference(checked):
        """Show/hide the ground-truth box; when enabling, load a fresh example."""
        if checked:
            human_msg, ai_msg, ground_truth_msg = get_random_human_ai_ground_truth_pair()
            return {
                ground_truth: gr.update(visible=True, value=ground_truth_msg),
                human_input: gr.update(value=human_msg),
                ai_response: gr.update(value=ai_msg),
                score: gr.update(value=""),
                critique: gr.update(value=""),
                random_btn: gr.update(value="🎲", variant="secondary"),
            }
        else:
            # When disabling, only hide the ground truth; leave other fields as-is.
            return {
                ground_truth: gr.update(visible=False)
            }

    # Update the change handler to include all necessary outputs
    use_reference_toggle.change(
        fn=toggle_use_reference,
        inputs=[use_reference_toggle],
        outputs=[
            ground_truth,
            human_input,
            ai_response,
            score,
            critique,
            random_btn,
        ]
    )

    # Add a new state variable to track first game
    # NOTE(review): never read by any handler below.
    first_game_state = gr.State(True)  # Initialize as True

    # Update the submit function to parse the evaluation criteria
    def submit_and_store(
        use_reference,
        eval_criteria_text,
        human_input,
        ai_response,
        ground_truth_input,
    ):
        """Run one judge evaluation and return UI updates (score, critique, buttons)."""
        # Build prompt data dictionary
        prompt_data = {
            'human_input': human_input,
            'ai_response': ai_response,
            'ground_truth_input': ground_truth_input if use_reference else '',
            'eval_criteria': eval_criteria_text,
        }

        # Get base prompt based on whether reference is used
        base_prompt = ATLA_PROMPT_WITH_REFERENCE if use_reference else ATLA_PROMPT

        # Format the prompt
        # NOTE(review): str.format will raise KeyError/ValueError if the user's
        # criteria or inputs contain literal curly braces — consider escaping.
        final_prompt = base_prompt.format(
            human_input=prompt_data['human_input'],
            ai_response=prompt_data['ai_response'],
            ground_truth_input=prompt_data['ground_truth_input'],
            eval_criteria=prompt_data['eval_criteria']
        )

        # Get response from Atla
        response = get_atla_response(
            model_name="AtlaAI/Selene-1-Mini-Llama-3.1-8B",
            prompt=final_prompt,
            max_tokens=500,
            temperature=0.01
        )

        # Parse the response
        score, critique = atla_parse_model_response(response)

        return [
            score,
            critique,
            gr.update(value="Regenerate evaluation", variant="secondary", interactive=True),
            gr.update(value="🎲"),
        ]

    # Update the click handler to use False for is_first_game after first submission
    # NOTE(review): defined but never called — send_btn is wired directly to
    # submit_and_store below, so this closure (and its first_game flag) is dead code.
    def create_submit_handler():
        first_game = True

        def handler(*args):
            nonlocal first_game
            result = submit_and_store(*args)
            first_game = False  # Set to False after first submission
            return result

        return handler

    # Update the send_btn click handler
    send_btn.click(
        fn=submit_and_store,
        inputs=[
            use_reference_toggle,
            eval_criteria_text,
            human_input,
            ai_response,
            ground_truth,
        ],
        outputs=[
            score,
            critique,
            send_btn,
            random_btn,
        ],
    )

    # Add random button handler
    random_btn.click(
        fn=populate_random_example,
        inputs=[use_reference_toggle],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )

    # Add input change handlers
    def handle_input_change():
        """Reset UI state when inputs are changed"""
        return [
            gr.update(value="Run evaluation", variant="primary"),  # send_btn
            gr.update(value="🎲", variant="secondary"),  # random_btn
        ]

    # Update the change handlers for inputs
    human_input.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    ai_response.change(
        fn=handle_input_change,
        inputs=[],
        outputs=[send_btn, random_btn]
    )

    generate_btn.click(
        fn=lambda msg: (
            generate_ai_response(msg)[0],  # Only take the response text
            gr.update(
                value="Generate AI Response",  # Keep the label
                interactive=False  # Disable the button
            )
        ),
        inputs=[human_input],
        outputs=[ai_response, generate_btn]
    )

    # Re-enable "Generate AI Response" only while there is non-blank user input.
    human_input.change(
        fn=lambda x: gr.update(interactive=bool(x.strip())),
        inputs=[human_input],
        outputs=[generate_btn]
    )

    # Update the demo.load to include the random example population
    demo.load(
        fn=lambda: populate_random_example(None, False),  # Pass False for initial compatible_mode
        inputs=[],
        outputs=[
            human_input,
            ai_response,
            random_btn,
            score,
            critique,
            ground_truth,
        ]
    )
469
+
470
# Script entry point: launch the Gradio server only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    demo.launch()